#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Statistical guardrails against spurious results.
When an agent searches over many strategies, the danger is **overfitting and
data-snooping**: a good in-sample Sharpe may be luck or leakage. These guards make
honest evaluation cheap — a permutation test (is the edge real?) and the
probabilistic / deflated Sharpe ratio (does it survive the number of trials?).
All functions are data-agnostic and operate on the existing maillons / plain
metrics; nothing here reads real data or stores results.
"""
# Built-in
from __future__ import annotations
from typing import Any
# Third-party
import numpy as np
from numpy.typing import NDArray
from scipy.stats import norm
# Local
from fynance.core import PriceSeries
__all__ = [
'permutation_test',
'probabilistic_sharpe_ratio',
'deflated_sharpe_ratio',
]
# Euler-Mascheroni constant (for the expected maximum of N Gaussians).
_EULER = 0.5772156649015329
def _to_array(data: Any) -> NDArray[np.float64]:
""" Coerce a PriceSeries / array-like to a 1-D float64 array. """
if isinstance(data, PriceSeries):
return data.to_numpy()
return np.asarray(data, dtype=np.float64).reshape(-1)
[docs]
def permutation_test(
strategy: Any,
data: Any,
*,
metric: str = "sharpe",
n_permutations: int = 200,
seed: int = 0,
) -> dict[str, float]:
""" Permutation test for a spurious edge.
Runs the strategy on the real series, then on ``n_permutations`` shuffles of
the asset's returns (which destroys any temporal structure). If the strategy
scores as well on shuffled data as on the real data, its edge is not real.
The price series must be strictly positive: the null is built on log-returns
(``np.diff(np.log(prices))``), so a non-positive price would yield ``nan`` /
``-inf`` returns.
Each run is seeded with a **distinct** seed derived from ``seed`` (the
observed run and every permutation), so a stochastic strategy does not see
the *same* model RNG draws on every path — which would bias the null
variance. The whole test stays reproducible for a fixed ``seed``.
Parameters
----------
strategy : fynance.strategy.Strategy
The strategy to evaluate.
data : PriceSeries or array-like
Strictly positive price series (``np.log`` is taken).
metric : str
Metric key from the run summary (default ``"sharpe"``).
n_permutations : int
Number of shuffles forming the null distribution.
seed : int
Master seed for the shuffles and the runs (per-run seeds are derived
from it, so the test is fully reproducible).
Returns
-------
dict
``observed``, ``p_value``, ``null_mean``, ``null_std``. The p-value is the
(smoothed) fraction of shuffles scoring at least the observed metric.
"""
# Imported here to avoid a runner <-> guards import cycle at module load.
from fynance.research.runner import run_experiment
prices = _to_array(data)
log_ret = np.diff(np.log(prices))
s0 = float(prices[0])
# Draw distinct per-run seeds from the master seed so stochastic strategies
# do not replay identical RNG draws on every path (which biases the null).
seed_rng = np.random.default_rng(seed)
run_seeds = seed_rng.integers(0, 2**31 - 1, size=n_permutations + 1)
observed = run_experiment(strategy, prices, name="observed",
seed=int(run_seeds[0])).metrics[metric]
rng = np.random.default_rng(seed)
null = np.empty(n_permutations, dtype=np.float64)
for i in range(n_permutations):
shuffled = rng.permutation(log_ret)
path = s0 * np.exp(np.concatenate([[0.0], np.cumsum(shuffled)]))
null[i] = run_experiment(strategy, path, name="perm",
seed=int(run_seeds[i + 1])).metrics[metric]
# Smoothed p-value (never exactly 0): (#{null >= observed} + 1) / (n + 1).
p_value = float((np.sum(null >= observed) + 1) / (n_permutations + 1))
return {
"observed": float(observed),
"p_value": p_value,
"null_mean": float(null.mean()),
"null_std": float(null.std()),
}
[docs]
def probabilistic_sharpe_ratio(
sr: float,
n_obs: int,
*,
sr_benchmark: float = 0.0,
skew: float = 0.0,
kurt: float = 3.0,
) -> float:
""" Probabilistic Sharpe ratio (PSR).
The probability that the true Sharpe exceeds ``sr_benchmark`` given the
estimate ``sr`` from ``n_obs`` observations, correcting for the returns'
skewness and kurtosis (Bailey & López de Prado).
Parameters
----------
sr : float
Observed (non-annualized) Sharpe ratio.
n_obs : int
Number of return observations.
sr_benchmark : float
Benchmark Sharpe to beat.
skew : float
Skewness of the returns.
kurt : float
Kurtosis of the returns (3 for a normal distribution).
Returns
-------
float
PSR in ``[0, 1]``.
"""
if n_obs <= 1:
return float("nan")
denom = np.sqrt(1.0 - skew * sr + (kurt - 1.0) / 4.0 * sr**2)
z = (sr - sr_benchmark) * np.sqrt(n_obs - 1) / denom
return float(norm.cdf(z))
[docs]
def deflated_sharpe_ratio(
sr: float,
n_obs: int,
n_trials: int,
*,
skew: float = 0.0,
kurt: float = 3.0,
sr_variance: float = 1.0,
) -> float:
""" Deflated Sharpe ratio (DSR).
The PSR against a benchmark set to the **expected maximum** Sharpe of
``n_trials`` independent strategies — i.e. the probability the edge survives
the multiple testing implied by trying ``n_trials`` strategies.
Parameters
----------
sr : float
Observed **per-observation** (non-annualized) Sharpe ratio of the
selected strategy. An annualized Sharpe must be divided by
``sqrt(period)`` first, or the DSR saturates to ~1.
n_obs : int
Number of return observations.
n_trials : int
Number of strategy configurations tried.
skew, kurt : float
Skewness / kurtosis of the selected strategy's returns.
sr_variance : float
Variance of the (per-observation) Sharpe estimates **across the
trials**. The default ``1.0`` is a conservative placeholder — it
overstates the expected-maximum benchmark and so understates the DSR;
pass the empirical variance of the trial Sharpes whenever available
(as :meth:`fynance.research.Ledger.deflated_sharpe` does).
Returns
-------
float
DSR in ``[0, 1]``. Low values flag a likely overfit selection.
"""
n = max(int(n_trials), 1)
if n == 1:
sr_star = 0.0
else:
# Expected max of N i.i.d. standard normals, scaled by the SR dispersion.
gauss_max = ((1 - _EULER) * norm.ppf(1 - 1.0 / n)
+ _EULER * norm.ppf(1 - 1.0 / (n * np.e)))
sr_star = float(np.sqrt(sr_variance) * gauss_max)
return probabilistic_sharpe_ratio(
sr, n_obs, sr_benchmark=sr_star, skew=skew, kurt=kurt
)