Source code for fynance.data.split
#!/usr/bin/env python3
# coding: utf-8
""" Strictly time-ordered splits for ML evaluation.
No shuffling, ever. Provides a simple train/test split with an optional embargo
and a purged walk-forward window generator. These are pure index generators,
decoupled from any model (mirroring the walk-forward semantics of
:class:`fynance.models.rolling._RollingBasis`).
"""
from __future__ import annotations
# Built-in packages
from typing import Iterator
# Third-party packages
import numpy as np
from numpy.typing import NDArray
__all__ = ['train_test_split', 'walk_forward']
[docs]
def train_test_split(
    n: int,
    test_size: float | int,
    gap: int = 0,
) -> tuple[NDArray[np.int64], NDArray[np.int64]]:
    """ Time-ordered train/test index split.

    The trailing ``n_test`` positions of ``range(n)`` form the test set; the
    train set is every position before it, minus the last ``gap`` positions
    (the embargo). Nothing is shuffled.

    Parameters
    ----------
    n : int
        Number of observations.
    test_size : float or int
        Trailing test set size. A value strictly inside ``(0, 1)`` is read as a
        **fraction** of ``n`` (e.g. ``0.2`` -> ``round(0.2 * n)``); any other
        value -- including the bounds ``0.0`` and ``1.0`` -- is read as an
        **absolute count** (``int(test_size)``). In particular ``1.0`` means a
        single observation (count ``1``), not the whole series, and ``0.0``
        means an empty test set; pass a fraction strictly between the bounds to
        get a proportional split.
    gap : int
        Embargo: observations dropped between train end and test start.
        Must be non-negative.

    Returns
    -------
    (train_idx, test_idx) : tuple of numpy.ndarray
        ``int64`` index arrays. ``train_idx`` is ``[0, n - n_test - gap)`` and
        ``test_idx`` is ``[n - n_test, n)``, so ``test_idx`` is strictly after
        ``train_idx`` (no leakage).

    Raises
    ------
    ValueError
        If ``test_size`` is negative (a negative integer would yield
        out-of-bounds train indices and a negative fraction would silently
        produce an empty test set), if ``gap`` is negative (the train range
        would then run into -- and possibly past -- the test range), if the
        resulting test count exceeds ``n``, or if the train set would be
        empty.

    """
    if test_size < 0:
        raise ValueError(f"test_size must be >= 0, got {test_size}")

    if gap < 0:
        # A negative embargo pushes the train end beyond ``split``, so the
        # train indices would overlap the test indices (look-ahead leakage)
        # and could even exceed ``n - 1``.
        raise ValueError(f"gap must be >= 0, got {gap}")

    # Strictly inside (0, 1) -> fraction of n; anything else -> absolute count.
    n_test = int(round(n * test_size)) if 0 < test_size < 1 else int(test_size)

    if n_test > n:
        raise ValueError(f"test_size ({n_test}) exceeds n ({n})")

    # First test position; train stops ``gap`` observations before it.
    split = n - n_test

    if split - gap <= 0:
        raise ValueError("train set is empty; reduce test_size/gap")

    train_idx = np.arange(0, split - gap, dtype=np.int64)
    test_idx = np.arange(split, n, dtype=np.int64)

    return train_idx, test_idx
[docs]
def walk_forward(
    n: int,
    train: int,
    test: int,
    step: int | None = None,
    purge: int = 0,
) -> Iterator[tuple[NDArray[np.int64], NDArray[np.int64]]]:
    """ Generate purged walk-forward windows.

    Each window trains on ``[t-train : t-purge]`` and tests on ``[t : t+test]``.
    ``t`` starts at ``train`` and advances by ``step`` for as long as a full
    test window fits (``t + test <= n``); trailing observations that cannot
    fill a complete test window are not yielded.

    Parameters
    ----------
    n : int
        Number of observations.
    train, test : int
        Train and test window lengths. Both must be strictly positive.
    step : int, optional
        Roll step (defaults to ``test``, i.e. non-overlapping test windows).
    purge : int
        Observations removed at the train/test boundary (embargo). Must
        satisfy ``0 <= purge < train``.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        ``int64`` index arrays with ``test_idx`` strictly after ``train_idx``.

    Raises
    ------
    ValueError
        If ``train <= 0`` or ``purge >= train``: either would yield empty train
        windows (``[t-train : t-purge]`` becomes empty), which silently breaks a
        downstream ``fit`` with an opaque error instead of failing here. If
        ``test <= 0``, which would yield empty test windows. If ``purge < 0``,
        which would extend the train window into the test window (look-ahead
        leakage). Also if ``step <= 0``, which would never advance ``t`` and
        loop forever.

    Notes
    -----
    This is a generator function, so the argument checks run when iteration
    starts (first ``next()``), not when ``walk_forward`` is called.

    """
    if train <= 0:
        raise ValueError(f"train must be > 0, got {train}")

    if test <= 0:
        # Checked before ``step`` so that ``test=0`` with the default step
        # reports the real culprit instead of "step must be > 0".
        raise ValueError(f"test must be > 0, got {test}")

    if purge < 0:
        # ``t - purge`` would exceed ``t``: train indices would include test
        # observations.
        raise ValueError(f"purge must be >= 0, got {purge}")

    if purge >= train:
        raise ValueError(
            f"purge must be < train, got purge={purge}, train={train} "
            "(otherwise every train window is empty)"
        )

    if step is None:
        step = test

    if step <= 0:
        raise ValueError(
            f"step must be > 0, got {step} (otherwise t never advances "
            "and the window generator loops forever)"
        )

    # ``t`` is the first test position of the current window. It starts at
    # ``train`` and only grows, so ``t - train`` is never negative.
    t = train
    while t + test <= n:
        train_idx = np.arange(t - train, t - purge, dtype=np.int64)
        test_idx = np.arange(t, t + test, dtype=np.int64)

        yield train_idx, test_idx

        t += step