Source code for fynance.research.guards

#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" Statistical guardrails against spurious results.

When an agent searches over many strategies, the danger is **overfitting and
data-snooping**: a good in-sample Sharpe may be luck or leakage. These guards make
honest evaluation cheap — a permutation test (is the edge real?) and the
probabilistic / deflated Sharpe ratio (does it survive the number of trials?).

All functions are data-agnostic and operate on the existing maillons / plain
metrics; nothing here reads real data or stores results.

"""

# Built-in
from __future__ import annotations

from typing import Any

# Third-party
import numpy as np
from numpy.typing import NDArray
from scipy.stats import norm

# Local
from fynance.core import PriceSeries

# Public names exported by ``from fynance.research.guards import *``.
__all__ = [
    'permutation_test',
    'probabilistic_sharpe_ratio',
    'deflated_sharpe_ratio',
]

# Euler-Mascheroni constant (gamma). It weights the two normal quantiles in
# the approximation of the expected maximum of N Gaussians used by
# ``deflated_sharpe_ratio``.
_EULER = 0.5772156649015329


def _to_array(data: Any) -> NDArray[np.float64]:
    """ Return the numeric values behind ``data`` as a NumPy array.

    A ``PriceSeries`` is converted through its own ``to_numpy`` method and
    returned as is; any other input is cast to ``float64`` and flattened to
    one dimension.

    """
    if not isinstance(data, PriceSeries):
        values = np.asarray(data, dtype=np.float64)

        return values.reshape(-1)

    return data.to_numpy()



[docs]
def permutation_test(
    strategy: Any,
    data: Any,
    *,
    metric: str = "sharpe",
    n_permutations: int = 200,
    seed: int = 0,
) -> dict[str, float]:
    """ Permutation test for a spurious edge.

    Runs the strategy on the real series, then on ``n_permutations`` shuffles of
    the asset's returns (which destroys any temporal structure). If the strategy
    scores as well on shuffled data as on the real data, its edge is not real.

    The price series must be strictly positive and finite: the null is built on
    log-returns (``np.diff(np.log(prices))``), so a non-positive price would
    yield ``nan`` / ``-inf`` returns. Invalid inputs are rejected up front with
    a ``ValueError`` instead of silently producing a meaningless p-value.

    Each run is seeded with a **distinct** seed derived from ``seed`` (the
    observed run and every permutation), so a stochastic strategy does not see
    the *same* model RNG draws on every path — which would bias the null
    variance. The whole test stays reproducible for a fixed ``seed``.

    Parameters
    ----------
    strategy : fynance.strategy.Strategy
        The strategy to evaluate.
    data : PriceSeries or array-like
        Strictly positive price series (``np.log`` is taken), with at least
        two points so that at least one return exists.
    metric : str
        Metric key from the run summary (default ``"sharpe"``).
    n_permutations : int
        Number of shuffles forming the null distribution (at least 1).
    seed : int
        Master seed for the shuffles and the runs (per-run seeds are derived
        from it, so the test is fully reproducible).

    Returns
    -------
    dict
        ``observed``, ``p_value``, ``null_mean``, ``null_std``. The p-value is the
        (smoothed) fraction of shuffles scoring at least the observed metric.
        A shuffle whose metric is ``nan`` is counted as scoring at least the
        observed metric (conservative), and the p-value is ``nan`` when the
        observed metric itself is ``nan``. ``null_mean`` / ``null_std`` are
        plain (not nan-aware) statistics of the null scores.

    Raises
    ------
    ValueError
        If ``n_permutations`` is smaller than 1, if the series has fewer than
        two prices, or if any price is non-finite or not strictly positive.

    """
    # Validate before doing any work: an empty null distribution has no
    # meaningful mean / std / p-value, and a negative count would otherwise
    # fail later with an unrelated IndexError.
    if n_permutations < 1:
        raise ValueError(
            f"n_permutations must be at least 1, got {n_permutations}"
        )

    prices = _to_array(data)
    if prices.size < 2:
        raise ValueError(
            "data must contain at least two prices to form a return"
        )

    if not np.all(np.isfinite(prices) & (prices > 0.0)):
        raise ValueError(
            "data must contain only finite, strictly positive prices"
        )

    # Imported here to avoid a runner <-> guards import cycle at module load.
    from fynance.research.runner import run_experiment

    log_ret = np.diff(np.log(prices))
    s0 = float(prices[0])

    # Draw distinct per-run seeds from the master seed so stochastic strategies
    # do not replay identical RNG draws on every path (which biases the null).
    seed_rng = np.random.default_rng(seed)
    run_seeds = seed_rng.integers(0, 2**31 - 1, size=n_permutations + 1)

    observed = float(
        run_experiment(strategy, prices, name="observed",
                       seed=int(run_seeds[0])).metrics[metric]
    )

    # Generator driving the shuffles; seeded from the master seed so the
    # permutations are reproducible.
    rng = np.random.default_rng(seed)
    null = np.empty(n_permutations, dtype=np.float64)
    for i in range(n_permutations):
        shuffled = rng.permutation(log_ret)
        # Rebuild a price path that starts at the same level as the real one.
        path = s0 * np.exp(np.concatenate([[0.0], np.cumsum(shuffled)]))
        null[i] = run_experiment(strategy, path, name="perm",
                                 seed=int(run_seeds[i + 1])).metrics[metric]

    if np.isnan(observed):
        # Every comparison against NaN is False, which would otherwise report
        # the smallest possible p-value for an undefined observed metric.
        p_value = float("nan")
    else:
        # Smoothed p-value (never exactly 0): (#{null >= observed} + 1) / (n + 1).
        # ``~(null < observed)`` equals ``null >= observed`` for real numbers
        # but also counts NaN null scores, so they cannot lower the p-value.
        n_exceed = int(np.count_nonzero(~(null < observed)))
        p_value = float((n_exceed + 1) / (n_permutations + 1))

    return {
        "observed": observed,
        "p_value": p_value,
        "null_mean": float(null.mean()),
        "null_std": float(null.std()),
    }




[docs]
def probabilistic_sharpe_ratio(
    sr: float,
    n_obs: int,
    *,
    sr_benchmark: float = 0.0,
    skew: float = 0.0,
    kurt: float = 3.0,
) -> float:
    """ Probabilistic Sharpe ratio (PSR).

    The probability that the true Sharpe exceeds ``sr_benchmark`` given the
    estimate ``sr`` from ``n_obs`` observations, correcting for the returns'
    skewness and kurtosis (Bailey & López de Prado).

    Parameters
    ----------
    sr : float
        Observed (non-annualized) Sharpe ratio.
    n_obs : int
        Number of return observations.
    sr_benchmark : float
        Benchmark Sharpe to beat.
    skew : float
        Skewness of the returns.
    kurt : float
        Kurtosis of the returns (3 for a normal distribution).

    Returns
    -------
    float
        PSR in ``[0, 1]``, or ``nan`` when it is undefined: ``n_obs <= 1``, or
        the term ``1 - skew * sr + (kurt - 1) / 4 * sr**2`` is not strictly
        positive (which happens for inconsistent ``skew`` / ``kurt`` inputs).

    """
    if n_obs <= 1:
        return float("nan")

    # Term under the square root in the PSR denominator.
    radicand = 1.0 - skew * sr + (kurt - 1.0) / 4.0 * sr**2

    # ``not (x > 0)`` also catches NaN. Without this guard a negative term
    # triggers a NumPy RuntimeWarning and a zero term divides by zero, which
    # saturates the result to 0.0 or 1.0 instead of flagging it as undefined.
    if not radicand > 0.0:
        return float("nan")

    z = (sr - sr_benchmark) * np.sqrt(n_obs - 1) / np.sqrt(radicand)

    return float(norm.cdf(z))




[docs]
def deflated_sharpe_ratio(
    sr: float,
    n_obs: int,
    n_trials: int,
    *,
    skew: float = 0.0,
    kurt: float = 3.0,
    sr_variance: float = 1.0,
) -> float:
    """ Deflated Sharpe ratio (DSR).

    A probabilistic Sharpe ratio whose benchmark is the **expected maximum**
    Sharpe among ``n_trials`` independent strategies: it answers whether the
    selected edge still stands once the number of configurations tried is
    accounted for.

    Parameters
    ----------
    sr : float
        **Per-observation** (non-annualized) Sharpe ratio of the selected
        strategy. Divide an annualized Sharpe by ``sqrt(period)`` beforehand,
        otherwise the DSR saturates to ~1.
    n_obs : int
        Number of return observations.
    n_trials : int
        Number of strategy configurations tried. Values below 1 are treated
        as a single trial, for which the benchmark is 0.
    skew, kurt : float
        Skewness / kurtosis of the selected strategy's returns.
    sr_variance : float
        Variance of the (per-observation) Sharpe estimates **across the
        trials**. The default ``1.0`` is a conservative placeholder: it
        inflates the expected-maximum benchmark and therefore lowers the DSR.
        Supply the empirical variance of the trial Sharpes when it is known
        (as :meth:`fynance.research.Ledger.deflated_sharpe` does).

    Returns
    -------
    float
        DSR in ``[0, 1]``. Low values flag a likely overfit selection.

    """
    trials = max(int(n_trials), 1)

    # With a single trial there is no selection effect to deflate.
    benchmark = 0.0
    if trials > 1:
        # Expected maximum of N i.i.d. standard normals: a blend of two normal
        # quantiles weighted by the Euler-Mascheroni constant.
        q_first = norm.ppf(1 - 1.0 / trials)
        q_second = norm.ppf(1 - 1.0 / (trials * np.e))
        expected_max = (1 - _EULER) * q_first + _EULER * q_second

        # Scale by the dispersion of the Sharpe estimates across trials.
        benchmark = float(np.sqrt(sr_variance) * expected_max)

    return probabilistic_sharpe_ratio(
        sr,
        n_obs,
        sr_benchmark=benchmark,
        skew=skew,
        kurt=kurt,
    )