Source code for deeptab.base_models.autoint

import numpy as np
import torch.nn as nn
import torch.nn.init as nn_init

from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer
from ..configs.autoint_config import DefaultAutoIntConfig
from .utils.basemodel import BaseModel


[docs]class AutoInt(BaseModel): """ AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks. This model uses multi-head self-attention layers to learn feature interactions for tabular data. It supports key-value compression for memory efficiency and is compatible with embedding-based feature encodings. Parameters ---------- feature_information : tuple A tuple containing information about numerical features, categorical features, and any additional embeddings. Expected format: `(num_feature_info, cat_feature_info, embedding_feature_info)`. num_classes : int, default=1 Number of output classes. For regression, this should be set to `1`. config : DefaultAutoIntConfig, optional Configuration object containing hyperparameters such as `d_model`, `n_heads`, `n_layers`, dropout rates, and compression settings. **kwargs : dict Additional arguments passed to the `BaseModel`. Attributes ---------- embedding_layer : EmbeddingLayer Module that processes numerical and categorical features into embeddings. kv_compression : float or None The proportion of key-value compression. If `None`, no compression is applied. kv_compression_sharing : str or None Defines how key-value compression is shared across layers. Options: - `"layerwise"`: One shared compression layer for all layers. - `"headwise"`: Separate key compression per head. - `"key-value"`: Separate compression layers for `k` and `v`. shared_kv_compression : nn.Linear or None Shared key-value compression layer, used when `kv_compression_sharing="layerwise"`. layers : nn.ModuleList A list of transformer-based attention layers, each consisting of: - `attention`: Multi-head self-attention module. - `linear`: Fully connected layer for projection. - `norm0`: Layer normalization. last_norm : nn.LayerNorm or None Final normalization layer applied before output if `prenormalization` is enabled. head : nn.Linear Output layer mapping from the processed feature representation to the final predictions. """ def __init__( self, feature_information: tuple, # (num_feature_info, cat_feature_info, embedding_feature_info) num_classes=1, config: DefaultAutoIntConfig = DefaultAutoIntConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["feature_information"]) self.returns_ensemble = False # Embedding layer self.embedding_layer = EmbeddingLayer(*feature_information, config=config) n_inputs = np.sum([len(info) for info in feature_information]) # Key-Value Compression self.kv_compression = config.kv_compression self.kv_compression_sharing = config.kv_compression_sharing def make_kv_compression(): compression = nn.Linear( n_inputs, int(n_inputs * config.kv_compression), bias=False, ) nn_init.xavier_uniform_(compression.weight) return compression self.shared_kv_compression = ( make_kv_compression() if self.kv_compression and self.kv_compression_sharing == "layerwise" else None ) # Transformer-based Interaction Layers self.layers = nn.ModuleList() for layer_idx in range(config.n_layers): layer = nn.ModuleDict( { "attention": nn.MultiheadAttention( embed_dim=config.d_model, num_heads=config.n_heads, dropout=config.attn_dropout, batch_first=True, ), "linear": nn.Linear(config.d_model, config.d_model, bias=False), "norm0": nn.LayerNorm(config.d_model), } ) if self.kv_compression and self.shared_kv_compression is None: layer["key_compression"] = make_kv_compression() if self.kv_compression_sharing == "headwise": layer["value_compression"] = make_kv_compression() else: assert self.kv_compression_sharing == "key-value" # noqa: S101 self.layers.append(layer) # Final Normalization & Output Head self.last_norm = nn.LayerNorm(config.d_model) if getattr(config, "prenorm", False) else None self.head = nn.Linear(config.d_model * n_inputs, num_classes) def _get_kv_compressions(self, layer): """ Returns the correct key-value compression layers based on the sharing strategy. Parameters ---------- layer : nn.ModuleDict The transformer layer containing possible key-value compression modules. Returns ------- tuple of (nn.Linear or None, nn.Linear or None) The key compression and value compression layers, or `(None, None)` if no compression is applied. """ return ( (self.shared_kv_compression, self.shared_kv_compression) if self.shared_kv_compression is not None else ( (layer["key_compression"], layer["value_compression"]) if "key_compression" in layer and "value_compression" in layer else ( (layer["key_compression"], layer["key_compression"]) if "key_compression" in layer else (None, None) ) ) )
[docs] def forward(self, *data): """ Forward pass of the AutoInt model. Parameters ---------- *data : tuple Input tuple of tensors containing numerical features, categorical features, and embeddings. Returns ------- Tensor The output predictions of the model. """ x = self.embedding_layer(*data) # Shape: (N, J, d_model) for layer in self.layers: x_residual = x # Store original input for residual connection # Apply normalization before attention if prenormalization is enabled x_residual = layer["norm0"](x_residual) # type: ignore[index] # Retrieve key-value compression layers key_compression, value_compression = self._get_kv_compressions(layer) # Multihead Attention x_residual, _ = layer["attention"](x_residual, x_residual, x_residual) # type: ignore[index] # Apply residual connection x = x + x_residual # Apply the linear transformation x_residual = layer["linear"](x) # type: ignore[index] x = x + x_residual # Second residual connection if self.last_norm: x = self.last_norm(x) # Final normalization if prenormalization is used x = x.flatten(1) # Flatten from (N, J, d_model) to (N, J * d_model) return self.head(x) # Final prediction