Source code for fynance.data.split
#!/usr/bin/env python3
# coding: utf-8
""" Strictly time-ordered splits for ML evaluation.
No shuffling, ever. Provides a simple train/test split with an optional embargo
and a purged walk-forward window generator. These are pure index generators,
decoupled from any model (mirroring the walk-forward semantics of
:class:`fynance.models.rolling._RollingBasis`).
"""
from __future__ import annotations
# Built-in packages
from typing import Iterator
# Third-party packages
import numpy as np
from numpy.typing import NDArray
# Public API: names exported by ``from <module> import *``
__all__ = ['train_test_split', 'walk_forward']
[docs]
def train_test_split(
    n: int,
    test_size: float | int,
    gap: int = 0,
) -> tuple[NDArray[np.int64], NDArray[np.int64]]:
    """ Time-ordered train/test index split.

    The last ``n_test`` observations form the test set; the train set is
    everything before it, minus the ``gap`` observations immediately
    preceding the test set.

    Parameters
    ----------
    n : int
        Number of observations.
    test_size : float or int
        Fraction (``0 < x < 1``) or absolute count of the trailing test set.
        A fraction is converted to a count with ``round(n * test_size)``.
    gap : int
        Embargo: observations dropped between train end and test start.
        Must be non-negative.

    Returns
    -------
    (train_idx, test_idx) : tuple of numpy.ndarray
        ``test_idx`` is strictly after ``train_idx`` (no leakage).

    Raises
    ------
    ValueError
        If ``gap`` is negative, if the resulting test set is empty, or if
        the resulting train set is empty.

    """
    # A negative gap would extend the train set into the test set (leakage).
    if gap < 0:
        raise ValueError("gap must be non-negative")

    n_test = int(round(n * test_size)) if 0 < test_size < 1 else int(test_size)

    # Reject zero/negative counts (including fractions that round to 0):
    # a negative count would otherwise push train indices past ``n``.
    if n_test <= 0:
        raise ValueError("test set is empty; increase test_size")

    split = n - n_test

    if split - gap <= 0:
        raise ValueError("train set is empty; reduce test_size/gap")

    train_idx = np.arange(0, split - gap, dtype=np.int64)
    test_idx = np.arange(split, n, dtype=np.int64)

    return train_idx, test_idx
[docs]
def walk_forward(
    n: int,
    train: int,
    test: int,
    step: int | None = None,
    purge: int = 0,
) -> Iterator[tuple[NDArray[np.int64], NDArray[np.int64]]]:
    """ Generate purged walk-forward windows.

    Each window trains on ``[t-train : t-purge]`` and tests on ``[t : t+test]``,
    with ``t`` starting at ``train`` and advancing by ``step`` while the test
    window still fits inside ``n`` observations.

    Parameters
    ----------
    n : int
        Number of observations.
    train, test : int
        Train and test window lengths. Must be strictly positive.
    step : int, optional
        Roll step (defaults to ``test``, i.e. non-overlapping test windows).
        Must be strictly positive.
    purge : int
        Observations removed at the train/test boundary (embargo). Must
        satisfy ``0 <= purge < train``.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        Index arrays with ``test_idx`` strictly after ``train_idx``.

    Raises
    ------
    ValueError
        If ``train``, ``test`` or ``step`` is not strictly positive, if
        ``purge`` is negative, or if ``purge`` leaves the train window
        empty. As this is a generator, the error is raised when iteration
        starts, not when the function is called.

    """
    if train <= 0 or test <= 0:
        raise ValueError("train and test must be strictly positive")

    if step is None:
        step = test

    # A non-positive step never advances the window: the loop would not end.
    if step <= 0:
        raise ValueError("step must be strictly positive")

    # A negative purge would let the train window run into the test window.
    if purge < 0:
        raise ValueError("purge must be non-negative")

    if purge >= train:
        raise ValueError("train set is empty; reduce purge")

    t = train
    while t + test <= n:
        # ``t >= train`` always holds here, so ``t - train`` is never negative.
        train_idx = np.arange(t - train, t - purge, dtype=np.int64)
        test_idx = np.arange(t, t + test, dtype=np.int64)

        yield train_idx, test_idx

        t += step