# Source code for fynance.models.lstm

#!/usr/bin/env python3
# coding: utf-8

""" Long Short-Term Memory (LSTM) model.

Defines :class:`LSTMCell`, a composable LSTM building block, and
:class:`LongShortTermMemory`, a full LSTM model with output projection.
The internal :class:`_LSTMCell` holds the four LSTM gates (forget,
input/update, candidate, output) and is the common base for both.

The distinction mirrors PyTorch's own ``torch.nn.LSTMCell`` vs
``torch.nn.LSTM``: :class:`LSTMCell` is the raw cell (useful for
composing larger architectures such as TCN or Transformer encoders),
while :class:`LongShortTermMemory` wraps it with an output projection
and training helpers.

Main entry points
-----------------
- :class:`LSTMCell` — composable LSTM cell without output projection.
- :class:`LongShortTermMemory` — LSTM model ready for walk-forward
  training via :meth:`~fynance.models._base.BaseNeuralNet.set_optimizer`.

References
----------
.. [1] Hochreiter, S. & Schmidhuber, J. (1997). Long Short-Term Memory.
       Neural Computation, 9(8), 1735–1780.

"""

from __future__ import annotations

# Third-party packages
import torch
from torch import nn

# Local packages
from fynance.models._recurrent_base import _OutputLayerMixin, _RecurrentBase

__all__ = ['LSTMCell', 'LongShortTermMemory']


class _LSTMCell(_RecurrentBase):
    """ LSTM cell: the four gates, without any output projection.

    One step of the Long Short-Term Memory recurrence (Hochreiter &
    Schmidhuber, 1997). From an input ``X`` and the previous states
    ``(H, C)`` the cell evaluates the forget gate ``G_f``, the candidate
    memory ``C_tild``, the update (input) gate ``G_i`` and the output gate
    ``G_o``, and returns the new hidden state and the new cell state. No
    output layer is applied; :class:`LongShortTermMemory` is the variant
    with an output projection and training helpers.

    Parameters
    ----------
    X, y : array-like or int
        - If it's an array-like, respectively inputs and outputs data.
        - If it's an integer, respectively dimension of inputs and outputs.
    drop : float, optional
        Probability of an element to be zeroed.
    x_type, y_type : optional
        Forwarded unchanged to the recurrent base class.
    bias : bool, optional
        Whether the four gate layers learn an additive bias, default is
        ``True``. Also forwarded to the recurrent base class.
    hidden_activation, memory_activation : torch.nn.Module, optional
        Activation functions for respectively hidden and memory state,
        default both are Tanh function.
    hidden_state_size, memory_state_size : int, optional
        Size of respectively hidden and memory states. Default hidden
        state is the same size as input; default memory state is the
        same size as hidden state.
    forget_activation, update_activation, output_activation :
    torch.nn.Module, optional
        Activation functions for respectively forget, update and output
        gate, default are Sigmoid function for all three.

    Attributes
    ----------
    C : int
        Size of the memory (cell) state.
    W_f, W_i, W_o, W_c : torch.nn.Linear
        Respectively forget, update and output gate weights and weight to
        compute the candidate value for cell memory.
    f_f, f_i, f_o, f_c : torch.nn.Module
        Respectively activation function for forget, update and output gate
        and activation function to compute the candidate value for cell
        memory.
    f_h : torch.nn.Module
        Activation applied to the new cell state before the output gate.

    Notes
    -----
    Every gate layer maps ``N + H`` input features to ``C`` output
    features, so the hidden state returned by :meth:`forward` has
    ``memory_state_size`` columns. NOTE(review): feeding that state back
    in as ``H`` therefore only lines up with the gate input width when
    ``memory_state_size`` equals ``hidden_state_size`` (the default).

    See Also
    --------
    LongShortTermMemory,
    fynance.models.gru._GRUCell

    """

    def __init__(
        self, X, y=None, drop=None, x_type=None, y_type=None, bias=True,
        hidden_activation=nn.Tanh, hidden_state_size=None,
        memory_activation=nn.Tanh, memory_state_size=None,
        forget_activation=nn.Sigmoid, update_activation=nn.Sigmoid,
        output_activation=nn.Sigmoid,
    ):
        # The base class is called first: ``self.N``, ``self.H`` and
        # ``self.drop`` are read below and are not defined in this class.
        base_options = dict(
            drop=drop,
            x_type=x_type,
            y_type=y_type,
            bias=bias,
            hidden_activation=hidden_activation,
            hidden_state_size=hidden_state_size,
        )
        _RecurrentBase.__init__(self, X, y, **base_options)

        # Memory state falls back to the hidden state size.
        if memory_state_size is not None:
            self.C = memory_state_size
        else:
            self.C = self.H

        # All gates read the concatenation of input and hidden state.
        # Layers are created in a fixed order (forget, update, candidate,
        # output) so parameter registration order stays stable.
        gate_inputs = self.N + self.H

        # Forget gate
        self.W_f = nn.Linear(gate_inputs, self.C, bias=bias)
        self.f_f = forget_activation()

        # Update gate
        self.W_i = nn.Linear(gate_inputs, self.C, bias=bias)
        self.f_i = update_activation()

        # Candidate value
        self.W_c = nn.Linear(gate_inputs, self.C, bias=bias)
        self.f_c = memory_activation()

        # Output gate
        self.W_o = nn.Linear(gate_inputs, self.C, bias=bias)
        self.f_o = output_activation()

        # Hidden activation (applied to cell state before output gate)
        self.f_h = hidden_activation()

    def forward(self, X, H, C):
        """ Run one LSTM step.

        Parameters
        ----------
        X, H, C : torch.Tensor
            Respectively input data, hidden state and memory state. ``X``
            and ``H`` are concatenated along ``dim=1``.

        Returns
        -------
        torch.Tensor
            Updated hidden state.
        torch.Tensor
            Updated memory state.

        """
        joint = torch.cat((X, H), dim=1)

        # ``self.drop`` is invoked once per gate, in this exact order
        # (forget, candidate, update, output), rather than once on the
        # shared input.
        forget_gate = self.f_f(self.W_f(self.drop(joint)))
        candidate = self.f_c(self.W_c(self.drop(joint)))
        update_gate = self.f_i(self.W_i(self.drop(joint)))

        # Keep part of the old memory, write part of the candidate.
        new_memory = forget_gate * C + update_gate * candidate

        output_gate = self.f_o(self.W_o(self.drop(joint)))
        new_hidden = output_gate * self.f_h(new_memory)

        return new_hidden, new_memory



# Public API: ``LSTMCell`` is exported through ``__all__``.
class LSTMCell(_LSTMCell):
    """ LSTM cell — public composable building block.

    Exposes the four LSTM gates (forget, input, candidate, output) of
    :class:`_LSTMCell` with no output projection layer, so the cell can
    be embedded in larger architectures (TCN, Transformers,
    encoder-decoders). The training and inference entry points are
    disabled on purpose; :class:`LongShortTermMemory` is the standalone
    trainable model with an output projection.

    Parameters
    ----------
    X : int or array-like
        Input dimension (int) or input data. When passing an int, ``y``
        may be omitted.
    y : array-like or int, optional
        Output data or output dimension. Not required when using the
        cell as a building block.
    hidden_state_size : int, optional
        Size of the hidden state. Defaults to the input size.
    memory_state_size : int, optional
        Size of the cell state. Defaults to hidden state size.
    drop : float, optional
        Dropout probability applied before each gate.
    hidden_activation, memory_activation : torch.nn.Module, optional
        Activations for hidden and cell state (default: Tanh for both).
    forget_activation, update_activation, output_activation :
    torch.nn.Module, optional
        Gate activations (default: Sigmoid for all three).

    Examples
    --------
    >>> import torch
    >>> from fynance.models.lstm import LSTMCell
    >>> cell = LSTMCell(8, hidden_state_size=16)
    >>> H = torch.zeros(4, 16)
    >>> C = torch.zeros(4, 16)
    >>> X = torch.randn(4, 8)
    >>> H_new, C_new = cell(X, H, C)
    >>> H_new.shape
    torch.Size([4, 16])

    See Also
    --------
    LongShortTermMemory : full model with output projection and training.
    fynance.models.gru.GRUCell : GRU variant.

    """

    # Single source for the error raised by the entry points the bare
    # cell does not support.
    _NO_OUTPUT_PROJECTION = (
        "LSTMCell is a composable building block with no output projection. "
        "Use LongShortTermMemory for a standalone trainable model."
    )

    def train_on(self, *args, **kwargs):
        """ Always raise: the bare cell cannot be trained on its own.

        Raises
        ------
        NotImplementedError
            On every call, whatever the arguments.

        """
        raise NotImplementedError(self._NO_OUTPUT_PROJECTION)

    def predict(self, *args, **kwargs):
        """ Always raise: the bare cell has no output to predict.

        Raises
        ------
        NotImplementedError
            On every call, whatever the arguments.

        """
        raise NotImplementedError(self._NO_OUTPUT_PROJECTION)




# Public API: ``LongShortTermMemory`` is exported through ``__all__``.
class LongShortTermMemory(_OutputLayerMixin, LSTMCell):
    """ Long Short-Term Memory cell with output projection.

    LSTM four-gate architecture (:class:`_LSTMCell`) followed by a
    forward output projection. The caller supplies the hidden state ``H``
    and cell state ``C``; :meth:`forward` returns updated ``(Y, H, C)``.
    Like the other gated cells in this package, **each of the ``T`` rows
    of ``X`` is processed independently** — the cell does *not* loop over
    a time axis or thread ``H`` / ``C`` across rows on its own, so it is a
    *stateless* gated feed-forward cell. To model temporal dependencies
    the caller must thread ``H`` and ``C`` across successive steps
    explicitly. For built-in, causal sequence modeling prefer
    :class:`~fynance.models.tcn.TemporalConvNet` or
    :class:`~fynance.models.transformer.Transformer`.

    Parameters
    ----------
    X, y : array-like or int
        - If it's an array-like, respectively inputs and outputs data.
        - If it's an integer, respectively dimension of inputs and outputs.
    drop : float, optional
        Probability of an element to be zeroed.
    x_type, y_type : optional
        Forwarded unchanged to :class:`LSTMCell`.
    bias : bool, optional
        If ``True`` (default), the linear layers learn an additive bias.
    forward_activation : torch.nn.Module, optional
        Output activation, default is Identity (unconstrained regression
        output; pass ``nn.Softmax`` for a probability-simplex output).
    hidden_activation, memory_activation : torch.nn.Module, optional
        Activation functions for respectively hidden and memory state,
        default both are Tanh function.
    hidden_state_size, memory_state_size : int, optional
        Size of respectively hidden and memory states. Default hidden
        state is the same size as input; default memory state is the
        same size as hidden state.
    forget_activation, update_activation, output_activation :
    torch.nn.Module, optional
        Activation functions for respectively forget, update and output
        gate, default are Sigmoid function for all three.

    Attributes
    ----------
    criterion : torch.nn.modules.loss
        A loss function.
    optimizer : torch.optim
        An optimizer algorithm.
    W_f, W_i, W_o, W_c, W_y : torch.nn.Linear
        Respectively forget, update and output gate weights, weight to
        compute the candidate value for cell memory and forward weight.
    f_f, f_i, f_o, f_c, f_y : torch.nn.Module
        Respectively activation function for forget, update and output gate,
        activation function to compute the candidate value for cell memory
        and forward activation function.

    See Also
    --------
    fynance.models.rnn.RecurrentNeuralNetwork,
    fynance.models.gru.GatedRecurrentUnit

    """

    def __init__(
        self, X, y, drop=None, x_type=None, y_type=None, bias=True,
        forward_activation=nn.Identity, hidden_activation=nn.Tanh,
        hidden_state_size=None, memory_activation=nn.Tanh,
        memory_state_size=None, forget_activation=nn.Sigmoid,
        update_activation=nn.Sigmoid, output_activation=nn.Sigmoid,
    ):
        # Build the gates first, then let the mixin add the output layer.
        LSTMCell.__init__(
            self,
            X,
            y,
            drop=drop,
            x_type=x_type,
            y_type=y_type,
            bias=bias,
            hidden_activation=hidden_activation,
            hidden_state_size=hidden_state_size,
            memory_activation=memory_activation,
            memory_state_size=memory_state_size,
            forget_activation=forget_activation,
            update_activation=update_activation,
            output_activation=output_activation,
        )

        _OutputLayerMixin.__init__(self, forward_activation=forward_activation)

    def forward(self, X, H, C):
        """ Forward method.

        Parameters
        ----------
        X, H, C : torch.Tensor
            Respectively input data, hidden state and memory state.

        Returns
        -------
        torch.Tensor
            Output data.
        torch.Tensor
            Hidden state.
        torch.Tensor
            Memory state.

        """
        # Gate update from the parent cell, then project the new hidden
        # state through the output layer.
        H, C = super().forward(X, H, C)
        Y = self.f_y(self.W_y(self.drop(H)))

        return Y, H, C

    def fit(self, X, y, epochs: int = 1, x_type=None, y_type=None):
        """ Fit the model on ``(X, y)`` for ``epochs`` full-batch steps.

        Conforms to the :class:`~fynance.core.protocols.SignalModel`
        contract. The hidden state ``H`` (from ``_init_state``) and the
        zero cell state ``C`` are built once and threaded across epochs
        (detached between steps). An optimizer must have been registered
        with :meth:`~fynance.models._base.BaseNeuralNet.set_optimizer`.

        Parameters
        ----------
        X, y : array-like
            Input and output data (numpy / torch / polars), shapes
            ``(T, N)`` and ``(T, M)``.
        epochs : int
            Number of full-batch training steps.
        x_type, y_type : torch.dtype, optional
            Target dtypes forwarded to
            :meth:`~fynance.models._base.BaseNeuralNet.set_data`.

        Returns
        -------
        LongShortTermMemory
            ``self``, to allow chaining.

        """
        self.set_data(X, y, x_type=x_type, y_type=y_type)
        H = self._init_state(self.X)
        C = self._init_cell_state(self.X)

        for _ in range(epochs):
            # The per-step loss is not needed here; only states carry over.
            _, H, C = self.train_on(self.X, self.y, H, C)

        return self

    def _init_cell_state(self, X: torch.Tensor) -> torch.Tensor:
        """ Build a zero cell state matching ``X`` (rows, dtype, device).

        The dtype and device come from the first model parameter when
        there is one, and from ``X`` otherwise.

        """
        first_param = next(self.parameters(), None)

        if first_param is None:
            device, dtype = X.device, X.dtype

        else:
            device, dtype = first_param.device, first_param.dtype

        return torch.zeros(X.shape[0], self.C, dtype=dtype, device=device)

    @torch.enable_grad()
    def train_on(  # type: ignore[override]
        self,
        X: torch.Tensor,
        y: torch.Tensor,
        H: torch.Tensor,
        C: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """ Trains the neural network model for one optimization step.

        Parameters
        ----------
        X, y, H, C : torch.Tensor
            Respectively inputs, outputs, states and cell memory to train
            model.

        Returns
        -------
        torch.Tensor
            Loss value returned by ``self.criterion``.
        torch.Tensor
            Updated states of the model, detached from the graph.
        torch.Tensor
            Cell memory of the model, detached from the graph.

        """
        self.train()
        self.optimizer.zero_grad()  # type: ignore[attr-defined]
        outputs, H, C = self(X, H, C)
        loss = self.criterion(outputs, y)
        loss.backward()
        self.optimizer.step()  # type: ignore[attr-defined]

        if self.lr_scheduler:
            self.lr_scheduler.step()

        # Detach so the next step does not backpropagate through this one.
        return loss, H.detach(), C.detach()

    @torch.no_grad()
    def predict(  # type: ignore[override]
        self,
        X,
        H: torch.Tensor | None = None,
        C: torch.Tensor | None = None,
    ):
        """ Predicts outputs of neural network model.

        Two calling conventions are supported:

        - ``predict(X)`` — conforms to the
          :class:`~fynance.core.protocols.SignalModel` contract: ``X``
          may be array-like (coerced to a tensor), the hidden state and
          cell state are initialized internally, and **only** the
          prediction tensor ``Y`` is returned.
        - ``predict(X, H, C)`` — explicit-state form: the updated states
          are threaded back, returning the ``(Y, H, C)`` tuple. ``C`` is
          zero-initialized when omitted.

        In both cases ``X`` (and any supplied state) is moved to the
        model's device. The model is switched to evaluation mode for the
        call and its previous train/eval mode is restored afterwards.

        Parameters
        ----------
        X : array-like or torch.Tensor
            Inputs to compute prediction.
        H : torch.Tensor, optional
            States of the model. If ``None`` (default), an initial state
            is built and only the prediction is returned.
        C : torch.Tensor, optional
            Cell memory of the model. Zero-initialized when ``None``.

        Returns
        -------
        torch.Tensor
            Outputs prediction (when ``H`` is ``None``).
        tuple of torch.Tensor
            ``(Y, H, C)`` outputs prediction and updated states (when
            ``H`` is provided).

        """
        # Decided before H is replaced by a default state below.
        return_state = H is not None

        if not isinstance(X, torch.Tensor):
            X = self._set_data(X)

        # A model without parameters has no device to align with; inputs
        # are then left where they are.
        first_param = next(self.parameters(), None)

        if first_param is not None:
            device = first_param.device
            X = X.to(device)

            if H is not None:
                H = H.to(device)

            if C is not None:
                C = C.to(device)

        if H is None:
            H = self._init_state(X)

        if C is None:
            C = self._init_cell_state(X)

        was_training = self.training
        self.eval()
        try:
            Y, H, C = self(X, H, C)

        finally:
            # Restore the caller's mode even if the forward pass raises.
            self.train(was_training)

        if return_state:

            return Y.detach(), H.detach(), C.detach()

        return Y.detach()