import os
import json
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from huggingface_hub import PyTorchModelHubMixin
from huggingface_hub import snapshot_download

# Model configuration constants
# Hidden dimension size for DeepSeek-R1-Distill-Qwen-32B; used as the GRU
# ``input_size`` by the model classes in this file.
D_FEATURES = 5120

class RNNSeqRegressor(nn.Module):
    """
    GRU-based sequence regressor producing one scalar per timestep.

    Args:
        hidden: Hidden size of the GRU (and input size of the linear head).
        num_layers: Number of stacked GRU layers.
        d_features: Per-timestep input feature size. ``None`` (the default)
            falls back to the module-level ``D_FEATURES`` constant.
    """

    def __init__(self, hidden=128, num_layers=1, d_features=None):
        super().__init__()
        # Resolve the default lazily so the module constant stays the single
        # source of truth when the caller does not override it.
        if d_features is None:
            d_features = D_FEATURES
        self.rnn = nn.GRU(
            input_size=d_features,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.head = nn.Linear(hidden, 1)   # scalar per step

    def forward(self, x_pad, lengths):
        """
        Run the GRU over a padded batch and project each step to a scalar.

        Args:
            x_pad: Padded input of shape (B, L_max, d_features).
            lengths: Valid length of each sequence; a tensor or any sequence
                of ints accepted by ``torch.as_tensor``.

        Returns:
            Tensor of shape (B, L_max), where L_max is ``x_pad.size(1)``.
            Steps beyond a sequence's length see a zero GRU output, so they
            carry only the head's bias.
        """
        # pack_padded_sequence requires the lengths to live on the CPU.
        cpu_lengths = torch.as_tensor(lengths).cpu()
        packed = pack_padded_sequence(
            x_pad, cpu_lengths,
            batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed)           # packed output
        # total_length keeps the time dimension equal to the input's even
        # when x_pad is padded wider than the longest sequence in the batch.
        h_seq, _ = pad_packed_sequence(
            packed_out, batch_first=True, total_length=x_pad.size(1)
        )                                          # (B, L_max, hidden)
        return self.head(h_seq).squeeze(-1)        # (B, L_max)


class RNNSeqRegressorHub(nn.Module, PyTorchModelHubMixin):
    """
    GRU-based sequence regressor with Hugging Face Hub integration.

    This model processes sequences of features and outputs a scalar prediction
    for each timestep, designed for thinking progress regression tasks.

    Args:
        hidden: Hidden size of the GRU (and input size of the linear head).
        num_layers: Number of stacked GRU layers.
        d_features: Per-timestep input feature size. ``None`` (the default)
            falls back to the module-level ``D_FEATURES`` constant.
        **kwargs: Extra keys are accepted and ignored, so a saved config
            (which also carries ``model_type`` / ``architecture``) can be
            splatted straight into the constructor.
    """

    def __init__(self, hidden=5120, num_layers=1, d_features=None, **kwargs):
        super().__init__()
        # Honour the value stored in a saved config instead of silently
        # discarding it through **kwargs.
        if d_features is None:
            d_features = D_FEATURES
        self.hidden = hidden
        self.num_layers = num_layers
        self.d_features = d_features

        self.rnn = nn.GRU(
            input_size=d_features,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.head = nn.Linear(hidden, 1)   # scalar per step

    def forward(self, x_pad, lengths):
        """
        Forward pass through the model.

        Args:
            x_pad: Padded input sequences of shape (batch_size, max_length, d_features)
            lengths: Actual lengths of sequences in the batch; a tensor or any
                sequence of ints accepted by ``torch.as_tensor``

        Returns:
            predictions: Scalar predictions for each timestep (batch_size, max_length),
                where max_length is ``x_pad.size(1)``
        """
        # pack_padded_sequence requires the lengths to live on the CPU.
        cpu_lengths = torch.as_tensor(lengths).cpu()
        packed = pack_padded_sequence(
            x_pad, cpu_lengths,
            batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed)           # packed output
        # total_length keeps the time dimension equal to the input's even
        # when x_pad is padded wider than the longest sequence in the batch.
        h_seq, _ = pad_packed_sequence(
            packed_out, batch_first=True, total_length=x_pad.size(1)
        )                                          # (B, L_max, hidden)
        return self.head(h_seq).squeeze(-1)        # (B, L_max)

    def _save_pretrained(self, save_directory):
        """
        Save model configuration and weights.

        Writes ``config.json`` (constructor arguments plus descriptive
        metadata) and ``pytorch_model.bin`` (the state dict) into
        ``save_directory``.
        """
        # Save configuration
        config = {
            "hidden": self.hidden,
            "num_layers": self.num_layers,
            "d_features": self.d_features,
            "model_type": "rnn_seq_regressor",
            "architecture": "GRU"
        }

        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

    @classmethod
    def _from_pretrained(
        cls,
        model_id,
        revision=None,
        cache_dir=None,
        force_download=False,
        proxies=None,
        resume_download=False,
        local_files_only=False,
        token=None,
        map_location="cpu",
        strict=True,
        **model_kwargs,
    ):
        """
        Load model from a local directory or a pretrained repo on Hugging Face Hub.

        Args:
            model_id: Path to an existing local directory, or a Hub repo id.
            revision, cache_dir, force_download, proxies, resume_download,
            local_files_only, token: Forwarded to ``snapshot_download`` when
                ``model_id`` is not a local directory.
            map_location: Passed to ``torch.load``.
            strict: Passed to ``load_state_dict``.
            **model_kwargs: Constructor arguments. Values found in the
                snapshot's ``config.json`` take precedence over these.

        Returns:
            A model instance. If ``pytorch_model.bin`` is absent the instance
            keeps its freshly initialised weights and a warning is emitted.
        """
        import warnings

        # A directory written by `_save_pretrained` can be loaded directly;
        # only non-local ids need to be resolved through the Hub.
        if os.path.isdir(model_id):
            snapshot_path = model_id
        else:
            snapshot_path = snapshot_download(
                repo_id=model_id,
                revision=revision,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
            )

        # Load config; stored values override caller-supplied kwargs so the
        # architecture matches the saved weights.
        config_path = os.path.join(snapshot_path, "config.json")
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                config = json.load(f)
            model_kwargs.update(config)

        # Create model instance
        model = cls(**model_kwargs)

        # Load weights
        weights_path = os.path.join(snapshot_path, "pytorch_model.bin")
        if os.path.exists(weights_path):
            # NOTE(security): torch.load unpickles the file. Only load
            # checkpoints from trusted sources, or pass weights_only=True on
            # PyTorch versions that support it.
            state_dict = torch.load(weights_path, map_location=map_location)
            model.load_state_dict(state_dict, strict=strict)
        else:
            warnings.warn(
                f"No 'pytorch_model.bin' found in '{snapshot_path}'; "
                "returning a model with randomly initialised weights.",
                stacklevel=2,
            )

        return model