# Source code for fynance.models.transformer
#!/usr/bin/env python3
# coding: utf-8
""" Transformer model for financial sequences.

Defines :class:`Transformer`, a causal Transformer encoder on
:class:`~fynance.models._base.BaseNeuralNet`. It reuses the attention
building blocks from :mod:`fynance.models.attention` and applies a
**causal mask** so position ``t`` attends only to ``≤ t`` — preserving
the library's no-lookahead invariant for time-series prediction.

Main entry points
-----------------
- :class:`PositionalEncoding` — sinusoidal absolute positional encoding.
- :class:`Transformer` — stacked causal Transformer encoder blocks.

References
----------
.. [1] Vaswani, A. et al. (2017). Attention Is All You Need.

"""
from __future__ import annotations
# Built-in packages
import math
# Third-party packages
import polars as pl
import torch
import torch.nn as nn
from numpy.typing import NDArray
# Local packages
from fynance.models._base import BaseNeuralNet
from fynance.models.attention import MultiHeadAttention
__all__ = ['PositionalEncoding', 'Transformer']
[docs]
class PositionalEncoding(nn.Module):
    r""" Sinusoidal absolute positional encoding (Vaswani et al., 2017).

    Adds a fixed position-dependent signal to the input embeddings so the
    attention layers can use the order of the sequence. The table is
    computed once at construction for ``max_len`` positions and stored
    as the (non-trainable) buffer ``pe`` of shape ``(max_len, d_model)``.

    Parameters
    ----------
    d_model : int
        Embedding dimension.
    max_len : int, optional
        Maximum supported sequence length. Default ``5000``.

    """

    def __init__(self, d_model: int, max_len: int = 5000):
        """ Precompute the ``(max_len, d_model)`` encoding table. """
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(max_len).unsqueeze(1).float()
        # Geometric progression of frequencies, one per (sin, cos) pair.
        div = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        angles = position * div  # (max_len, ceil(d_model / 2))

        # Even columns carry the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd ``d_model`` there is one fewer cosine column than
        # sine columns, so the last frequency is dropped for the cosine.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer (not a parameter): it is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """ Add positional encoding to ``x`` of shape ``(B, T, d_model)``.

        Parameters
        ----------
        x : torch.Tensor
            Embedded sequence, shape ``(B, T, d_model)``.

        Returns
        -------
        torch.Tensor
            ``x`` plus the encoding of positions ``0 .. T - 1``, same
            shape as ``x``.

        Raises
        ------
        ValueError
            If ``T`` exceeds the ``max_len`` the table was built for.

        """
        seq_len = x.size(1)
        max_len = self.pe.size(0)

        # Without this guard a too-long sequence either fails with an
        # opaque broadcasting error or, when the truncated table has a
        # single row, is silently broadcast over every position.
        if seq_len > max_len:
            raise ValueError(
                f'sequence length {seq_len} exceeds the maximum '
                f'supported length max_len={max_len}'
            )

        return x + self.pe[:seq_len]
class _TransformerBlock(nn.Module):
    """ Single encoder layer: masked self-attention, then feed-forward.

    Parameters
    ----------
    d_model : int
        Width of the token representation entering and leaving the layer.
    num_heads : int
        Number of heads handed to the attention module.
    dim_ff : int
        Hidden width of the two-layer feed-forward network.
    drop : float
        Dropout probability, used in the attention module, inside the
        feed-forward network and on its output.

    """

    def __init__(self, d_model, num_heads, dim_ff, drop):
        super().__init__()
        # No residual / layer-norm is wrapped around the attention here:
        # MultiHeadAttention is expected to apply both itself.
        self.attn = MultiHeadAttention(d_model, num_heads, dropout=drop)

        # Position-wise network: expand to dim_ff, then project back.
        feed_forward_layers = [
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(dim_ff, d_model),
        ]
        self.ff = nn.Sequential(*feed_forward_layers)

        self.norm = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(drop)

    def forward(self, x, mask):
        """ Run the layer on ``x`` under the attention ``mask``. """
        # The attention module returns a pair; only the first item
        # (the attended sequence) is used.
        attended, _ = self.attn(x, mask=mask)
        # Residual connection around the feed-forward sublayer, followed
        # by layer normalisation.
        update = self.drop(self.ff(attended))
        return self.norm(attended + update)
[docs]
class Transformer(BaseNeuralNet):
    r""" Causal Transformer encoder for sequential financial data.

    Projects the ``N`` input features to ``d_model``, adds sinusoidal
    positional encoding, applies ``num_layers`` causal self-attention
    blocks, and reads out to ``M`` outputs. A lower-triangular mask makes
    every block **strictly causal** (no lookahead): the output at ``t``
    depends only on inputs up to ``t``.

    Configure the optimizer with :meth:`BaseNeuralNet.set_optimizer`
    (e.g. with :class:`fynance.models.loss.SharpeLoss`).

    Parameters
    ----------
    X, y : array-like or int
        - If array-like, respectively the input and output data.
        - If an integer, respectively the input and output dimension.
    d_model : int, optional
        Embedding / model dimension (divisible by ``num_heads``). Default 32.
    num_heads : int, optional
        Number of attention heads. Default 4.
    num_layers : int, optional
        Number of stacked encoder blocks. Default 2.
    dim_ff : int, optional
        Hidden size of the position-wise feed-forward sublayer. Default 64.
    drop : float, optional
        Dropout probability. Default 0.
    x_type, y_type : optional
        Forwarded unchanged to :meth:`set_data` when ``X`` and ``y`` are
        not both integers. Default None.
    max_len : int, optional
        Number of positions in the positional-encoding table; it must be
        at least the longest window ``L`` passed to :meth:`forward`.
        Default ``5000``.

    See Also
    --------
    fynance.models.attention.MultiHeadAttention, fynance.models.tcn.TemporalConvNet

    Examples
    --------
    >>> import torch
    >>> from fynance.models.transformer import Transformer
    >>> _ = torch.manual_seed(0)
    >>> X = torch.randn(40, 3)
    >>> y = torch.randn(40, 1)
    >>> model = Transformer(X, y, d_model=16, num_heads=2, num_layers=2)
    >>> model(X).shape
    torch.Size([40, 1])

    """

    def __init__(
        self,
        X: NDArray | torch.Tensor | pl.DataFrame | int,
        y: NDArray | torch.Tensor | pl.DataFrame | int,
        d_model: int = 32,
        num_heads: int = 4,
        num_layers: int = 2,
        dim_ff: int = 64,
        drop: float = 0.,
        x_type=None,
        y_type=None,
        max_len: int = 5000,
    ):
        """ Initialize object. """
        BaseNeuralNet.__init__(self)

        if isinstance(X, int) and isinstance(y, int):
            # Only the dimensions are known; no data is attached.
            self.N, self.M = X, y
        else:
            # Presumably sets ``self.N`` / ``self.M`` from the data, since
            # both are read just below -- confirm against BaseNeuralNet.
            self.set_data(X=X, y=y, x_type=x_type, y_type=y_type)  # type: ignore[arg-type]

        self.input_proj = nn.Linear(self.N, d_model)
        # ``max_len`` is exposed so that windows longer than the encoder's
        # default table can be handled without editing the class.
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.blocks = nn.ModuleList([
            _TransformerBlock(d_model, num_heads, dim_ff, drop)
            for _ in range(num_layers)
        ])
        self.output_proj = nn.Linear(d_model, self.M)

    def forward(self, x):
        """ Forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input window, shape ``(L, N)``.

        Returns
        -------
        torch.Tensor
            Per-step output, shape ``(L, M)``.

        """
        length = x.size(0)

        # The window is treated as a batch of one sequence.
        z = self.input_proj(x).unsqueeze(0)  # (1, L, d_model)
        z = self.pos_encoder(z)

        # Lower-triangular causal mask: position t may attend to <= t only.
        mask = torch.tril(torch.ones(length, length, device=x.device))

        for block in self.blocks:
            z = block(z, mask)

        # Drop the batch axis again before the read-out.
        return self.output_proj(z.squeeze(0))  # (L, M)