# Source code for fynance.data.split

#!/usr/bin/env python3
# coding: utf-8

""" Strictly time-ordered splits for ML evaluation.

No shuffling, ever. Provides a simple train/test split with an optional embargo
and a purged walk-forward window generator. These are pure index generators,
decoupled from any model (mirroring the walk-forward semantics of
:class:`fynance.models.rolling._RollingBasis`).

"""

from __future__ import annotations

# Built-in packages
from typing import Iterator

# Third-party packages
import numpy as np
from numpy.typing import NDArray

__all__ = ['train_test_split', 'walk_forward']



# [docs]
def train_test_split(
    n: int,
    test_size: float | int,
    gap: int = 0,
) -> tuple[NDArray[np.int64], NDArray[np.int64]]:
    """ Time-ordered train/test index split.

    The last ``n_test`` observations form the test set; the train set is the
    leading block that ends ``gap`` observations before the test set starts.

    Parameters
    ----------
    n : int
        Number of observations.
    test_size : float or int
        Trailing test set size. A value strictly inside ``(0, 1)`` is read as a
        **fraction** of ``n`` (e.g. ``0.2`` -> ``round(0.2 * n)``); any other
        value -- including the bounds ``0.0`` and ``1.0`` -- is read as an
        **absolute count** (``int(test_size)``). In particular ``1.0`` means a
        single observation (count ``1``), not the whole series, and ``0.0``
        means an empty test set; pass a fraction strictly between the bounds to
        get a proportional split.
    gap : int
        Embargo: observations dropped between train end and test start. Must
        be non-negative.

    Returns
    -------
    (train_idx, test_idx) : tuple of numpy.ndarray
        ``train_idx`` is ``[0, n - n_test - gap)`` and ``test_idx`` is
        ``[n - n_test, n)``, so ``test_idx`` is strictly after ``train_idx``
        (no leakage).

    Raises
    ------
    ValueError
        If ``test_size`` is negative (a negative integer would yield
        out-of-bounds train indices and a negative fraction would silently
        produce an empty test set), if ``gap`` is negative (the train block
        would then extend into -- or past -- the test block), if the resulting
        test count exceeds ``n``, or if the train set would be empty.

    """
    if test_size < 0:
        raise ValueError(f"test_size must be >= 0, got {test_size}")

    # A negative gap would push the train end beyond the test start, i.e. the
    # two index sets would overlap and could even exceed ``n``.
    if gap < 0:
        raise ValueError(f"gap must be >= 0, got {gap}")

    # Only values strictly inside (0, 1) are fractions; everything else is an
    # absolute count (see the ``test_size`` parameter description).
    n_test = int(round(n * test_size)) if 0 < test_size < 1 else int(test_size)

    if n_test > n:
        raise ValueError(f"test_size ({n_test}) exceeds n ({n})")

    # First index of the test block.
    split = n - n_test

    if split - gap <= 0:
        raise ValueError("train set is empty; reduce test_size/gap")

    train_idx = np.arange(0, split - gap, dtype=np.int64)
    test_idx = np.arange(split, n, dtype=np.int64)

    return train_idx, test_idx




# [docs]
def walk_forward(
    n: int,
    train: int,
    test: int,
    step: int | None = None,
    purge: int = 0,
) -> Iterator[tuple[NDArray[np.int64], NDArray[np.int64]]]:
    """ Generate purged walk-forward windows.

    Each window trains on ``[t-train : t-purge]`` and tests on ``[t : t+test]``.
    ``t`` starts at ``train`` and advances by ``step`` for as long as the test
    window still fits inside ``n`` observations.

    Arguments are validated when the function is called, not when the
    returned iterator is first advanced, so a bad configuration fails at the
    call site.

    Parameters
    ----------
    n : int
        Number of observations.
    train, test : int
        Train and test window lengths. Both must be strictly positive.
    step : int, optional
        Roll step (defaults to ``test``, i.e. non-overlapping test windows).
    purge : int
        Observations removed at the train/test boundary (embargo). Must
        satisfy ``0 <= purge < train``.

    Returns
    -------
    iterator of (train_idx, test_idx) : tuple of numpy.ndarray
        Index arrays with ``test_idx`` strictly after ``train_idx``. The
        iterator is empty when no test window fits (``train + test > n``).

    Raises
    ------
    ValueError
        If ``train <= 0`` or ``purge >= train``: either would yield empty train
        windows (``[t-train : t-purge]`` becomes empty), which silently breaks a
        downstream ``fit`` with an opaque error instead of failing here. If
        ``test <= 0``, which would yield empty test windows. If ``purge < 0``,
        which would extend the train window past ``t`` into the test window
        (look-ahead leakage). Also if ``step <= 0``, which would never advance
        ``t`` and loop forever.

    """
    if train <= 0:
        raise ValueError(f"train must be > 0, got {train}")

    if test <= 0:
        raise ValueError(
            f"test must be > 0, got {test} "
            "(otherwise every test window is empty)"
        )

    if purge < 0:
        raise ValueError(
            f"purge must be >= 0, got {purge} "
            "(otherwise the train window overlaps the test window)"
        )

    if purge >= train:
        raise ValueError(
            f"purge must be < train, got purge={purge}, train={train} "
            "(otherwise every train window is empty)"
        )

    if step is None:
        step = test

    if step <= 0:
        raise ValueError(
            f"step must be > 0, got {step} (otherwise t never advances "
            "and the window generator loops forever)"
        )

    def _windows() -> Iterator[tuple[NDArray[np.int64], NDArray[np.int64]]]:
        # Kept as an inner generator so the checks above run at call time
        # rather than being deferred to the first ``next()``.
        t = train

        while t + test <= n:
            # ``t >= train`` always holds (t starts at ``train`` and step > 0),
            # so ``t - train`` is never negative and needs no clamping.
            train_idx = np.arange(t - train, t - purge, dtype=np.int64)
            test_idx = np.arange(t, t + test, dtype=np.int64)

            yield train_idx, test_idx

            t += step

    return _windows()