Source code for fynance.research.guards

#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" Statistical guardrails against spurious results.

When an agent searches over many strategies, the danger is **overfitting and
data-snooping**: a good in-sample Sharpe may be luck or leakage. These guards make
honest evaluation cheap — a permutation test (is the edge real?) and the
probabilistic / deflated Sharpe ratio (does it survive the number of trials?).

All functions are data-agnostic and operate on the existing maillons / plain
metrics; nothing here reads real data or stores results.

"""

# Built-in
from __future__ import annotations

from typing import Any

# Third-party
import numpy as np
from numpy.typing import NDArray
from scipy.stats import norm

# Local
from fynance.core import PriceSeries

__all__ = [
    'permutation_test',
    'probabilistic_sharpe_ratio',
    'deflated_sharpe_ratio',
]

# Euler-Mascheroni constant (for the expected maximum of N Gaussians).
_EULER = 0.5772156649015329


def _to_array(data: Any) -> NDArray[np.float64]:
    """ Return ``data`` as a NumPy array.

    A ``PriceSeries`` is converted through its own ``to_numpy`` method and the
    result is returned untouched (presumably already 1-D float64 -- confirm
    against ``PriceSeries.to_numpy``). Any other input is cast to ``float64``
    and flattened to one dimension.

    Parameters
    ----------
    data : PriceSeries or array-like
        Values to convert.

    Returns
    -------
    numpy.ndarray
        The converted values.

    """
    if not isinstance(data, PriceSeries):
        as_float = np.asarray(data, dtype=np.float64)
        return as_float.reshape(-1)

    return data.to_numpy()


def permutation_test(
    strategy: Any,
    data: Any,
    *,
    metric: str = "sharpe",
    n_permutations: int = 200,
    seed: int = 0,
) -> dict[str, float]:
    """ Permutation test for a spurious edge.

    Runs the strategy on the real series, then on ``n_permutations`` shuffles
    of the asset's log-returns (which destroys any temporal structure). Each
    shuffled path is rebuilt from the first real price, so it starts at the
    same level as the real series. If the strategy scores as well on shuffled
    data as on the real data, its edge is not real.

    Parameters
    ----------
    strategy : fynance.strategy.Strategy
        The strategy to evaluate.
    data : PriceSeries or array-like
        Price series.
    metric : str
        Metric key from the run summary (default ``"sharpe"``).
    n_permutations : int
        Number of shuffles forming the null distribution.
    seed : int
        Seed for the shuffles and the runs (the same seed is passed to every
        run).

    Returns
    -------
    dict
        ``observed``, ``p_value``, ``null_mean``, ``null_std``. The p-value is
        the (smoothed) fraction of shuffles scoring at least the observed
        metric. It is ``nan`` when the observed metric itself is ``nan``, and
        shuffles whose metric is ``nan`` are counted as scoring at least the
        observed value, so an undefined score can never make the edge look
        more significant.

    """
    # Imported here to avoid a runner <-> guards import cycle at module load.
    from fynance.research.runner import run_experiment

    prices = _to_array(data)
    log_ret = np.diff(np.log(prices))
    s0 = float(prices[0])

    observed = float(
        run_experiment(strategy, prices, name="observed", seed=seed).metrics[metric]
    )

    rng = np.random.default_rng(seed)
    null = np.empty(n_permutations, dtype=np.float64)
    for i in range(n_permutations):
        shuffled = rng.permutation(log_ret)
        # Rebuild a price path from the shuffled log-returns, anchored at s0.
        path = s0 * np.exp(np.concatenate([[0.0], np.cumsum(shuffled)]))
        null[i] = run_experiment(strategy, path, name="perm", seed=seed).metrics[metric]

    if np.isnan(observed):
        # Every comparison against NaN is False, which would otherwise yield
        # the smallest possible p-value for an undefined metric.
        p_value = float("nan")
    else:
        # ``~(null < observed)`` equals ``null >= observed`` for real numbers
        # but also counts NaN null scores as exceedances (conservative).
        n_exceed = int(np.sum(~(null < observed)))
        # Smoothed p-value (never exactly 0): (#{null >= observed} + 1) / (n + 1).
        p_value = float((n_exceed + 1) / (n_permutations + 1))

    return {
        "observed": observed,
        "p_value": p_value,
        "null_mean": float(null.mean()),
        "null_std": float(null.std()),
    }


def probabilistic_sharpe_ratio(
    sr: float,
    n_obs: int,
    *,
    sr_benchmark: float = 0.0,
    skew: float = 0.0,
    kurt: float = 3.0,
) -> float:
    """ Probabilistic Sharpe ratio (PSR).

    Probability that the true Sharpe ratio is above ``sr_benchmark``, given an
    estimate ``sr`` computed from ``n_obs`` observations. The estimation error
    of the Sharpe ratio is adjusted for the skewness and kurtosis of the
    returns (Bailey & López de Prado).

    Parameters
    ----------
    sr : float
        Observed (non-annualized) Sharpe ratio.
    n_obs : int
        Number of return observations.
    sr_benchmark : float
        Benchmark Sharpe to beat.
    skew : float
        Skewness of the returns.
    kurt : float
        Kurtosis of the returns (3 for a normal distribution).

    Returns
    -------
    float
        PSR in ``[0, 1]``, or ``nan`` when ``n_obs <= 1``.

    """
    if n_obs <= 1:
        # No degrees of freedom left to estimate the Sharpe ratio's error.
        return float("nan")

    # Variance factor of the Sharpe estimator under non-normal returns.
    variance_term = 1.0 - skew * sr + (kurt - 1.0) / 4.0 * sr**2
    excess = sr - sr_benchmark
    z_score = excess * np.sqrt(n_obs - 1) / np.sqrt(variance_term)

    return float(norm.cdf(z_score))


def deflated_sharpe_ratio(
    sr: float,
    n_obs: int,
    n_trials: int,
    *,
    skew: float = 0.0,
    kurt: float = 3.0,
    sr_variance: float = 1.0,
) -> float:
    """ Deflated Sharpe ratio (DSR).

    Evaluates the PSR with the benchmark raised to the **expected maximum**
    Sharpe among ``n_trials`` independent strategies, i.e. the probability
    that the edge survives the multiple testing implied by having tried
    ``n_trials`` strategies.

    Parameters
    ----------
    sr : float
        Observed (non-annualized) Sharpe ratio of the selected strategy.
    n_obs : int
        Number of return observations.
    n_trials : int
        Number of strategy configurations tried. Values below 1 are treated
        as 1, in which case the benchmark is 0.
    skew, kurt : float
        Skewness / kurtosis of the selected strategy's returns.
    sr_variance : float
        Variance of the Sharpe estimates **across the trials**.

    Returns
    -------
    float
        DSR in ``[0, 1]``. Low values flag a likely overfit selection.

    """
    trials = max(int(n_trials), 1)

    # With a single trial there is no selection effect to deflate.
    benchmark = 0.0
    if trials != 1:
        # Expected max of N i.i.d. standard normals, scaled by the SR dispersion.
        q_low = norm.ppf(1 - 1.0 / trials)
        q_high = norm.ppf(1 - 1.0 / (trials * np.e))
        expected_max = (1 - _EULER) * q_low + _EULER * q_high
        benchmark = float(np.sqrt(sr_variance) * expected_max)

    return probabilistic_sharpe_ratio(
        sr, n_obs, sr_benchmark=benchmark, skew=skew, kurt=kurt
    )