"""GRU sequence regressors, with an optional Hugging Face Hub wrapper.

``RNNSeqRegressor`` is the plain ``nn.Module``; ``RNNSeqRegressorHub`` has the
same layers (and therefore the same state-dict keys) and additionally
implements the ``PyTorchModelHubMixin`` save/load hooks.
"""

import os
import json
import warnings

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from huggingface_hub import PyTorchModelHubMixin
from huggingface_hub import snapshot_download

# Model configuration constants
D_FEATURES = 5120  # Hidden dimension size for DeepSeek-R1-Distill-Qwen-32B


def _regress_sequence(rnn, head, x_pad, lengths):
    """Run ``rnn`` over a padded batch and apply ``head`` to every step.

    Args:
        rnn: Recurrent module created with ``batch_first=True``.
        head: Module mapping each hidden vector to a single value.
        x_pad: Padded input batch, batch dimension first.
        lengths: Tensor with the true length of every sequence in the batch.

    Returns:
        Tensor of per-step predictions with the trailing size-1 dimension
        squeezed away, i.e. ``(B, L_max)`` where ``L_max`` is the padded
        length produced by ``pad_packed_sequence``.
    """
    # Lengths are moved to the CPU before packing; sequences do not need to be
    # pre-sorted because enforce_sorted=False.
    packed_in = pack_padded_sequence(
        x_pad, lengths.cpu(), batch_first=True, enforce_sorted=False
    )
    packed_out, _ = rnn(packed_in)  # packed output
    h_seq, _ = pad_packed_sequence(packed_out, batch_first=True)  # (B, L_max, hidden)
    return head(h_seq).squeeze(-1)  # (B, L_max)


class RNNSeqRegressor(nn.Module):
    """GRU over ``D_FEATURES``-wide inputs followed by a scalar linear head."""

    def __init__(self, hidden=128, num_layers=1):
        """
        Args:
            hidden: GRU hidden size.
            num_layers: Number of stacked GRU layers.
        """
        super().__init__()
        self.rnn = nn.GRU(
            input_size=D_FEATURES,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.head = nn.Linear(hidden, 1)  # scalar per step

    def forward(self, x_pad, lengths):
        """Return one scalar prediction per timestep.

        Args:
            x_pad: Padded input sequences, batch dimension first.
            lengths: Tensor with the actual length of each sequence.

        Returns:
            Predictions of shape ``(B, L_max)``.
        """
        return _regress_sequence(self.rnn, self.head, x_pad, lengths)


class RNNSeqRegressorHub(nn.Module, PyTorchModelHubMixin):
    """
    GRU-based sequence regressor with Hugging Face Hub integration.

    This model processes sequences of features and outputs a scalar prediction
    for each timestep, designed for thinking progress regression tasks.
    """

    def __init__(self, hidden=5120, num_layers=1, **kwargs):
        """
        Args:
            hidden: GRU hidden size.
            num_layers: Number of stacked GRU layers.
            **kwargs: Accepted and ignored, so that extra entries stored in
                ``config.json`` (``d_features``, ``model_type``,
                ``architecture``) can be passed straight to the constructor.
        """
        super().__init__()
        self.hidden = hidden
        self.num_layers = num_layers
        # The input width is always the module constant; a "d_features" value
        # arriving through **kwargs is not used.
        self.d_features = D_FEATURES

        self.rnn = nn.GRU(
            input_size=D_FEATURES,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.head = nn.Linear(hidden, 1)  # scalar per step

    def forward(self, x_pad, lengths):
        """
        Forward pass through the model.

        Args:
            x_pad: Padded input sequences of shape
                (batch_size, max_length, d_features)
            lengths: Actual lengths of sequences in the batch

        Returns:
            predictions: Scalar predictions for each timestep, shape
                (batch_size, L_max), where L_max is the padded length
                produced by ``pad_packed_sequence``.
        """
        return _regress_sequence(self.rnn, self.head, x_pad, lengths)

    def _save_pretrained(self, save_directory):
        """
        Save model configuration and weights.

        Writes ``config.json`` and ``pytorch_model.bin`` into
        ``save_directory``.
        """
        # Save configuration
        config = {
            "hidden": self.hidden,
            "num_layers": self.num_layers,
            "d_features": self.d_features,
            "model_type": "rnn_seq_regressor",
            "architecture": "GRU",
        }
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        # Save weights
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

    @classmethod
    def _from_pretrained(
        cls,
        model_id,
        revision=None,
        cache_dir=None,
        force_download=False,
        proxies=None,
        resume_download=False,
        local_files_only=False,
        token=None,
        map_location="cpu",
        strict=True,
        **model_kwargs,
    ):
        """
        Load model from a local directory or a pretrained repo on the
        Hugging Face Hub.

        Args:
            model_id: Path of a local directory containing the saved files, or
                a Hub repo id.
            revision, cache_dir, force_download, proxies, resume_download,
            local_files_only, token: Forwarded to ``snapshot_download`` when
                ``model_id`` is not a local directory.
            map_location: Forwarded to ``torch.load``.
            strict: Forwarded to ``load_state_dict``.
            **model_kwargs: Constructor arguments; values found in
                ``config.json`` take precedence over these.

        Returns:
            The instantiated model. If no ``pytorch_model.bin`` is found the
            model keeps its freshly initialised weights and a
            ``RuntimeWarning`` is emitted.
        """
        if os.path.isdir(model_id):
            # A directory written by ``_save_pretrained`` is not a Hub repo id;
            # read it directly instead of asking the Hub for a snapshot.
            snapshot_path = model_id
        else:
            # Ensure we have a local directory with the model snapshot
            snapshot_path = snapshot_download(
                repo_id=model_id,
                revision=revision,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
            )

        # Load config
        config_path = os.path.join(snapshot_path, "config.json")
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                config = json.load(f)
            model_kwargs.update(config)

        # Create model instance
        model = cls(**model_kwargs)

        # Load weights
        weights_path = os.path.join(snapshot_path, "pytorch_model.bin")
        if os.path.exists(weights_path):
            # NOTE(security): torch.load unpickles the checkpoint. Unless the
            # installed PyTorch restricts unpickling by default, a malicious
            # file can execute arbitrary code -- only load repos you trust, or
            # consider passing weights_only=True where supported.
            state_dict = torch.load(weights_path, map_location=map_location)
            model.load_state_dict(state_dict, strict=strict)
        else:
            warnings.warn(
                f"No 'pytorch_model.bin' found in {snapshot_path!r}; "
                "returning a model with randomly initialised weights.",
                RuntimeWarning,
                stacklevel=2,
            )

        return model