#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Statistical guardrails against spurious results.
When an agent searches over many strategies, the danger is **overfitting and
data-snooping**: a good in-sample Sharpe may be luck or leakage. These guards make
honest evaluation cheap — a permutation test (is the edge real?) and the
probabilistic / deflated Sharpe ratio (does it survive the number of trials?).
All functions are data-agnostic and operate on the existing maillons / plain
metrics; nothing here reads real data or stores results.
"""
# Built-in
from __future__ import annotations
from typing import Any
# Third-party
import numpy as np
from numpy.typing import NDArray
from scipy.stats import norm
# Local
from fynance.core import PriceSeries
# Names exported by ``from <module> import *``.
__all__ = [
    "permutation_test",
    "probabilistic_sharpe_ratio",
    "deflated_sharpe_ratio",
]

# Euler-Mascheroni constant, used when approximating the expected maximum of
# N Gaussian draws.
_EULER = 0.5772156649015329
def _to_array(data: Any) -> NDArray[np.float64]:
    """ Convert the input into the array form used by the guards.

    A ``PriceSeries`` is unwrapped through its own ``to_numpy()`` method and
    the result is returned untouched; any other input is converted to
    ``float64`` and flattened to one dimension.
    """
    if not isinstance(data, PriceSeries):
        values = np.asarray(data, dtype=np.float64)
        return values.reshape(-1)
    return data.to_numpy()
# Permutation test
def permutation_test(
    strategy: Any,
    data: Any,
    *,
    metric: str = "sharpe",
    n_permutations: int = 200,
    seed: int = 0,
) -> dict[str, float]:
    """ Test whether a strategy's score could be a fluke of return ordering.

    The strategy is scored once on the real price path, then once on each of
    ``n_permutations`` synthetic paths rebuilt from a random re-ordering of
    the log-returns (same starting price, same set of returns, no temporal
    structure). A strategy that scores as well on the shuffled paths as on
    the real one has no genuine edge.

    Parameters
    ----------
    strategy : fynance.strategy.Strategy
        Strategy under evaluation.
    data : PriceSeries or array-like
        Price series.
    metric : str
        Key looked up in the ``metrics`` of each run (default ``"sharpe"``).
    n_permutations : int
        How many shuffled paths make up the null distribution.
    seed : int
        Seeds the shuffling generator and is also passed to every run.

    Returns
    -------
    dict
        Keys ``observed``, ``p_value``, ``null_mean`` and ``null_std``. The
        p-value is the smoothed share of shuffled runs whose metric is at
        least the observed one.

    """
    # Deferred import: avoids a runner <-> guards cycle at module load time.
    from fynance.research.runner import run_experiment

    def _score(series: Any, label: str) -> Any:
        # One experiment run, reduced to the requested metric.
        result = run_experiment(strategy, series, name=label, seed=seed)
        return result.metrics[metric]

    prices = _to_array(data)
    log_returns = np.diff(np.log(prices))
    start_price = float(prices[0])

    observed = _score(prices, "observed")

    generator = np.random.default_rng(seed)
    null_scores = np.empty(n_permutations, dtype=np.float64)
    for k in range(n_permutations):
        cumulative = np.cumsum(generator.permutation(log_returns))
        # Prepend a zero log-return so every path starts at the real price.
        synthetic = start_price * np.exp(np.concatenate([[0.0], cumulative]))
        null_scores[k] = _score(synthetic, "perm")

    # Add-one smoothing keeps the p-value strictly positive:
    # (#{null >= observed} + 1) / (n + 1).
    n_at_least = (null_scores >= observed).sum()
    p_value = float((n_at_least + 1) / (n_permutations + 1))

    summary = {
        "observed": float(observed),
        "p_value": p_value,
        "null_mean": float(np.mean(null_scores)),
        "null_std": float(np.std(null_scores)),
    }
    return summary
# Probabilistic Sharpe ratio
def probabilistic_sharpe_ratio(
    sr: float,
    n_obs: int,
    *,
    sr_benchmark: float = 0.0,
    skew: float = 0.0,
    kurt: float = 3.0,
) -> float:
    """ Probabilistic Sharpe ratio (PSR).

    The probability that the true Sharpe exceeds ``sr_benchmark`` given the
    estimate ``sr`` from ``n_obs`` observations, correcting for the returns'
    skewness and kurtosis (Bailey & López de Prado).

    Parameters
    ----------
    sr : float
        Observed (non-annualized) Sharpe ratio.
    n_obs : int
        Number of return observations.
    sr_benchmark : float
        Benchmark Sharpe to beat.
    skew : float
        Skewness of the returns.
    kurt : float
        Kurtosis of the returns (3 for a normal distribution).

    Returns
    -------
    float
        PSR in ``[0, 1]``, or ``nan`` when it is undefined: ``n_obs <= 1``, or
        the variance term ``1 - skew * sr + (kurt - 1) / 4 * sr**2`` is not
        strictly positive (inconsistent moment inputs).

    """
    if n_obs <= 1:
        return float("nan")

    variance_term = 1.0 - skew * sr + (kurt - 1.0) / 4.0 * sr**2
    # A zero or negative term would otherwise yield a division by zero (a
    # spurious PSR of exactly 0 or 1) or the square root of a negative number.
    # ``not x > 0`` also catches NaN inputs.
    if not variance_term > 0.0:
        return float("nan")

    z = (sr - sr_benchmark) * np.sqrt(n_obs - 1) / np.sqrt(variance_term)
    return float(norm.cdf(z))
# Deflated Sharpe ratio
def deflated_sharpe_ratio(
    sr: float,
    n_obs: int,
    n_trials: int,
    *,
    skew: float = 0.0,
    kurt: float = 3.0,
    sr_variance: float = 1.0,
) -> float:
    """ Deflated Sharpe ratio (DSR).

    A probabilistic Sharpe ratio whose benchmark is the **expected maximum**
    Sharpe among ``n_trials`` independent strategies, i.e. the probability
    that the selected edge survives the multiple testing implied by having
    tried ``n_trials`` configurations.

    Parameters
    ----------
    sr : float
        Observed (non-annualized) Sharpe ratio of the selected strategy.
    n_obs : int
        Number of return observations.
    n_trials : int
        Number of strategy configurations tried. Values below one are
        treated as one, in which case the benchmark is zero.
    skew, kurt : float
        Skewness / kurtosis of the selected strategy's returns.
    sr_variance : float
        Variance of the Sharpe estimates **across the trials**.

    Returns
    -------
    float
        DSR in ``[0, 1]``. Low values flag a likely overfit selection.

    """
    trials = max(int(n_trials), 1)

    # With a single trial there is no selection effect to deflate.
    benchmark = 0.0
    if trials != 1:
        # Approximate expected maximum of ``trials`` i.i.d. standard normals,
        # then scale it by the dispersion of the Sharpe estimates.
        inner_quantile = norm.ppf(1 - 1.0 / trials)
        outer_quantile = norm.ppf(1 - 1.0 / (trials * np.e))
        expected_max = ((1 - _EULER) * inner_quantile
                        + _EULER * outer_quantile)
        benchmark = float(np.sqrt(sr_variance) * expected_max)

    return probabilistic_sharpe_ratio(
        sr, n_obs, sr_benchmark=benchmark, skew=skew, kurt=kurt
    )