# Source code for fynance.models.attention

#!/usr/bin/env python3
# coding: utf-8

""" Attention mechanisms for sequential financial data.

PyTorch implementations of the building blocks introduced by
Vaswani et al. (2017): scaled dot-product attention and multi-head
attention. Used to build Transformer-style architectures for return
series, order books and other sequential financial signals where
long-range dependencies matter.

Main entry points
-----------------
- :class:`ScaledDotProductAttention` — single-head scaled attention.
- :class:`MultiHeadAttention` — parallel attention heads with
  learnable projections.

References
----------
.. [1] Vaswani, A. et al. (2017). Attention Is All You Need.

"""

import math

from torch import nn

__all__ = ['ScaledDotProductAttention', 'MultiHeadAttention']



# [docs]
class ScaledDotProductAttention(nn.Module):
    r""" Scaled Dot-Product Attention.

    Computes :math:`\text{Attention}(Q, K, V) =
    \text{softmax}\!\left(\frac{QK^T}{\sqrt{d_k}}\right)V`.

    Parameters
    ----------
    dropout : float, optional
        Dropout probability applied to the attention weights, default 0.

    Attributes
    ----------
    dropout : torch.nn.Dropout
        Dropout layer applied to the attention weights.
    softmax : torch.nn.Softmax
        Softmax taken over the last (key) axis of the scores.

    Notes
    -----
    A query position whose keys are *all* masked out receives zero
    attention weights (and therefore a zero output row) instead of the
    ``NaN`` that a softmax over a row of ``-inf`` would produce.

    References
    ----------
    Vaswani et al., "Attention is All You Need", arXiv 2017.

    """

    def __init__(self, dropout=0.0):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, Q, K, V, mask=None):
        """ Compute attention.

        Parameters
        ----------
        Q : torch.Tensor
            Queries, shape ``(B, ..., T, d_k)``.
        K : torch.Tensor
            Keys, shape ``(B, ..., S, d_k)``.
        V : torch.Tensor
            Values, shape ``(B, ..., S, d_v)``.
        mask : torch.Tensor, optional
            Mask broadcastable to ``(B, ..., T, S)``.  Positions where
            ``mask == 0`` get a zero attention weight; the remaining
            weights of each query row sum to one.  A row that is entirely
            masked gets all-zero weights.

        Returns
        -------
        torch.Tensor
            Output of shape ``(B, ..., T, d_v)``.
        torch.Tensor
            Attention weights (after dropout) of shape ``(B, ..., T, S)``.

        """
        d_k = Q.size(-1)
        scores = Q @ K.transpose(-2, -1) / math.sqrt(d_k)

        dead_rows = None
        if mask is not None:
            blocked = mask == 0
            # Rows with no visible key at all.  Filling them with -inf would
            # make softmax return NaN (forward and backward), so they are
            # left untouched here and zeroed after the softmax instead.
            dead_rows = blocked.all(dim=-1, keepdim=True)
            scores = scores.masked_fill(blocked & ~dead_rows, float('-inf'))

        attn = self.softmax(scores)
        if dead_rows is not None:
            attn = attn.masked_fill(dead_rows, 0.0)

        attn = self.dropout(attn)
        return attn @ V, attn





# [docs]
class MultiHeadAttention(nn.Module):
    r""" Multi-Head Self-Attention.

    Building block of the Transformer architecture (Vaswani et al.,
    2017). Each attention head learns to attend to a different subspace
    of the input — useful when several types of dependency coexist in
    a sequence, e.g. short-term and long-term momentum.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads, and :class:`ScaledDotProductAttention` is
    applied to all heads in parallel. Outputs of the heads are
    concatenated and projected back through ``w_o``; a residual
    connection plus layer norm stabilize training.

    For finance-specific use, this layer is typically stacked with a
    feed-forward sublayer to form a Transformer encoder block applied
    to a return / order-book sequence.

    Parameters
    ----------
    d_model : int
        Model dimension (must be divisible by ``num_heads``).
    num_heads : int
        Number of attention heads.
    dropout : float, optional
        Dropout on attention weights and output projection, default 0.

    Raises
    ------
    ValueError
        If ``d_model`` is not divisible by ``num_heads``.

    Examples
    --------
    >>> import torch
    >>> mha = MultiHeadAttention(64, 4)
    >>> x = torch.randn(2, 10, 64)
    >>> out, attn = mha(x)
    >>> out.shape
    torch.Size([2, 10, 64])
    >>> attn.shape
    torch.Size([2, 4, 10, 10])
    >>> causal = torch.tril(torch.ones(10, 10)).expand(2, 10, 10)
    >>> out, attn = mha(x, mask=causal)
    >>> attn.shape
    torch.Size([2, 4, 10, 10])

    """

    def __init__(self, d_model, num_heads, dropout=0.0):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f'd_model ({d_model}) must be divisible by num_heads ({num_heads})'
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head feature size

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.attention = ScaledDotProductAttention(dropout=dropout)
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, proj, B, T):
        """ Reshape ``(B, T, d_model)`` into ``(B, num_heads, T, d_k)``. """
        return proj.view(B, T, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, x, mask=None):
        """ Forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input of shape ``(B, T, d_model)``.
        mask : torch.Tensor, optional
            Attention mask of shape ``(B, 1, T, T)`` or ``(B, T, T)``;
            positions where ``mask == 0`` are not attended to.  A 3-D
            mask is shared by all heads.  Masks of any other rank are
            passed on unchanged and must broadcast against
            ``(B, num_heads, T, T)``.

        Returns
        -------
        torch.Tensor
            Output of shape ``(B, T, d_model)``.
        torch.Tensor
            Per-head attention weights of shape ``(B, num_heads, T, T)``.

        """
        B, T, _ = x.shape

        if mask is not None and mask.dim() == 3:
            # (B, T, T) -> (B, 1, T, T).  Without the singleton head axis,
            # broadcasting against the (B, num_heads, T, T) scores would
            # line the batch axis up with the head axis.
            mask = mask.unsqueeze(1)

        Q = self._split_heads(self.w_q(x), B, T)
        K = self._split_heads(self.w_k(x), B, T)
        V = self._split_heads(self.w_v(x), B, T)

        out, attn = self.attention(Q, K, V, mask=mask)

        # Merge the heads back: (B, num_heads, T, d_k) -> (B, T, d_model).
        out = out.transpose(1, 2).contiguous().view(B, T, self.d_model)
        out = self.w_o(out)

        # Residual connection followed by layer normalization.
        return self.norm(x + self.dropout(out)), attn