In [1]:
try:
    !pip uninstall -qy geometricvocab geofractal
except:
    pass

!pip install -q git+https://github.com/AbstractEyes/geofractal.git

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for geofractal (pyproject.toml) ... [?25l[?25hdone
  Building wheel for geometricvocab (pyproject.toml) ... [?25l[?25hdone


In [2]:
# Cell: The Galaxy Brain Collective - Complete Setup
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, T5Tokenizer, T5EncoderModel
from datasets import load_dataset
from torch.utils.data import DataLoader
from geofractal.router.head import build_standard_head, HeadConfig
from tqdm.auto import tqdm
import re
import sympy
from sympy.parsing.sympy_parser import parse_expr

# Use the GPU when one is visible; every module below is moved to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# ============================================================================
# FROZEN SEMANTIC BACKBONES
# ============================================================================

# MathBERT: put in eval mode and exclude every parameter from autograd.
mathbert_tokenizer = BertTokenizer.from_pretrained("tbs17/MathBERT")
mathbert = BertModel.from_pretrained("tbs17/MathBERT").to(device)
mathbert.eval()
mathbert.requires_grad_(False)

# T5-base encoder: frozen the same way.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
t5 = T5EncoderModel.from_pretrained("t5-base").to(device)
t5.eval()
t5.requires_grad_(False)

print(f"MathBERT hidden: {mathbert.config.hidden_size}")
print(f"T5 hidden: {t5.config.d_model}")

Device: cuda


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

MathBERT hidden: 768
T5 hidden: 768


In [11]:
# Cell: Deterministic Computation Streams
# ============================================================================
# DETERMINISTIC STREAMS - feature extraction is fixed (no learned parameters);
# only each stream's final linear projection is trained
# ============================================================================
class EigenspectrumStream(nn.Module):
    """
    Turns the covariance eigenspectrum of a batch of embeddings into features.

    The spectrum is computed without learned parameters; the only trainable
    part is the final linear projection to `output_dim`.
    """
    def __init__(self, output_dim, max_features=64):
        super().__init__()
        self.max_features = max_features
        self.projection = nn.Linear(max_features, output_dim)

    def forward(self, embeddings):
        """Map [B, S, D] embeddings to [B, S, output_dim] spectrum features."""
        batch, seq_len, dim = embeddings.shape

        # Per-example feature covariance over the sequence axis -> [B, D, D]
        deviations = embeddings - embeddings.mean(dim=1, keepdim=True)
        covariance = torch.bmm(deviations.transpose(1, 2), deviations) / seq_len

        # eigvalsh returns eigenvalues in ascending order; rescale by their sum
        spectrum = torch.linalg.eigvalsh(covariance)  # [B, D]
        spectrum = spectrum / (spectrum.sum(dim=-1, keepdim=True) + 1e-8)

        missing = self.max_features - dim
        if missing > 0:
            # Fewer than max_features eigenvalues: left-pad with zeros
            zeros = torch.zeros(batch, missing, device=embeddings.device)
            spectrum = torch.cat([zeros, spectrum], dim=-1)
        else:
            # Keep the max_features largest eigenvalues (the tail)
            spectrum = spectrum[:, -self.max_features:]

        # Same feature vector at every sequence position, then project
        tiled = spectrum.unsqueeze(1).expand(-1, seq_len, -1)
        return self.projection(tiled)


class CayleyMengerStream(nn.Module):
    """
    Distance-geometry features from a subsample of sequence positions.

    The eigenvalues of the squared pairwise-distance matrix are used as a
    stand-in for Cayley-Menger determinants. Only the final linear
    projection holds trainable parameters.
    """
    def __init__(self, output_dim, num_features=64):
        super().__init__()
        self.num_features = num_features
        self.projection = nn.Linear(num_features, output_dim)

    def forward(self, embeddings):
        """Map [B, S, D] embeddings to [B, S, output_dim] distance features."""
        batch, seq_len, _ = embeddings.shape

        # Pick up to num_features evenly spaced positions along the sequence
        n_points = min(self.num_features, seq_len)
        positions = torch.linspace(0, seq_len - 1, n_points).long().to(embeddings.device)
        points = embeddings[:, positions, :]  # [B, k, D]

        # Spectrum of the squared Euclidean distance matrix -> [B, k]
        sq_dists = torch.cdist(points, points) ** 2
        spectrum = torch.linalg.eigvalsh(sq_dists)

        # Scale by the sum of absolute values (the eigenvalues are mixed-sign)
        spectrum = spectrum / (spectrum.abs().sum(dim=-1, keepdim=True) + 1e-8)

        # Right-pad with zeros when the sequence is shorter than num_features
        shortfall = self.num_features - n_points
        if shortfall > 0:
            zeros = torch.zeros(batch, shortfall, device=embeddings.device)
            spectrum = torch.cat([spectrum, zeros], dim=-1)

        # Same feature vector at every sequence position, then project
        tiled = spectrum.unsqueeze(1).expand(-1, seq_len, -1)
        return self.projection(tiled)


class SymbolicCalculatorStream(nn.Module):
    """
    Extracts numbers from text and computes summary features.

    Extraction and statistics are deterministic; the only trainable part is
    the linear projection from the 12 statistics to `output_dim`.
    """
    # A comma-grouped number ("1,200", "3,000,000.5") or a plain
    # integer/decimal, each with an optional leading minus sign. The
    # lookahead keeps "1,2000" from being read as a grouped number.
    _NUMBER_RE = re.compile(r'-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d*)?')

    def __init__(self, output_dim, max_numbers=10):
        super().__init__()
        self.max_numbers = max_numbers
        self.num_features = 12
        self.projection = nn.Linear(self.num_features, output_dim)

    def extract_numbers(self, text):
        """Return up to `max_numbers` floats found in `text`, in order of appearance.

        Thousands separators are removed, so "1,200" yields 1200.0 rather than
        1.0 and 200.0. A hyphen directly before a digit is read as a minus sign.
        """
        numbers = [float(m.replace(',', '')) for m in self._NUMBER_RE.findall(text)]
        return numbers[:self.max_numbers]

    def compute_features(self, numbers):
        """Reduce a list of floats to a float32 vector of `num_features` statistics.

        Order: count, sum, product, mean, std, min, max, range, number of
        positives, number of negatives, number of zeros, mean absolute value.
        An empty list gives an all-zero vector.
        """
        if len(numbers) == 0:
            return torch.zeros(self.num_features)

        t = torch.tensor(numbers, dtype=torch.float32)

        features = [
            len(numbers),
            t.sum().item(),
            # Product is reported only for fewer than 10 numbers, else 0
            t.prod().item() if len(t) < 10 else 0,
            t.mean().item(),
            # std is undefined for a single value, so report 0 there
            t.std().item() if len(t) > 1 else 0,
            t.min().item(),
            t.max().item(),
            (t.max() - t.min()).item(),
            (t > 0).sum().item(),
            (t < 0).sum().item(),
            (t == 0).sum().item(),
            t.abs().mean().item(),
        ]

        return torch.tensor(features, dtype=torch.float32)

    def forward(self, texts, seq_len):
        """Project per-text number statistics to [len(texts), seq_len, output_dim].

        The same feature vector is repeated at every sequence position. An
        empty `texts` list yields an empty batch instead of raising.
        """
        rows = [self.compute_features(self.extract_numbers(text)) for text in texts]
        if rows:
            features = torch.stack(rows)
        else:
            # torch.stack rejects an empty list; build the empty batch directly
            features = torch.zeros(0, self.num_features)

        # Features are built on CPU; move them next to the projection weights
        features = features.to(next(self.parameters()).device)
        features_seq = features.unsqueeze(1).expand(-1, seq_len, -1)
        return self.projection(features_seq)


class FractalDimensionStream(nn.Module):
    """
    Correlation-dimension style features of the embedding point cloud.

    For each of `num_scales` radii it measures the fraction of point pairs
    closer than that radius, plus its log. Only the final linear projection
    holds trainable parameters.
    """
    def __init__(self, output_dim, num_scales=8):
        super().__init__()
        self.num_scales = num_scales
        self.projection = nn.Linear(num_scales * 2, output_dim)

    def forward(self, embeddings):
        """Map [B, S, D] embeddings to [B, S, output_dim] multi-scale features."""
        _, seq_len, _ = embeddings.shape

        pair_dists = torch.cdist(embeddings, embeddings)  # [B, S, S]

        # Radii log-spaced from 1e-2 to 1e1
        radii = torch.logspace(-2, 1, self.num_scales).to(embeddings.device)

        # Fraction of ordered pairs (self-pairs included) inside each radius
        within = (pair_dists.unsqueeze(-1) < radii).float()          # [B, S, S, K]
        fraction = within.sum(dim=(1, 2)) / (seq_len * seq_len)      # [B, K]

        # Interleave (fraction, log fraction) per radius -> [B, K * 2]
        stats = torch.stack([fraction, torch.log(fraction + 1e-8)], dim=-1).flatten(1)

        # Same feature vector at every sequence position, then project
        tiled = stats.unsqueeze(1).expand(-1, seq_len, -1)
        return self.projection(tiled)


# Completion banner, then a one-line summary per stream class.
print("✓ Fixed deterministic streams defined")

print("\n".join([
    "Deterministic streams defined:",
    "  - EigenspectrumStream: covariance geometry",
    "  - CayleyMengerStream: distance geometry",
    "  - SymbolicCalculatorStream: actual arithmetic",
    "  - FractalDimensionStream: complexity measure",
]))

✓ Fixed deterministic streams defined
Deterministic streams defined:
  - EigenspectrumStream: covariance geometry
  - CayleyMengerStream: distance geometry
  - SymbolicCalculatorStream: actual arithmetic
  - FractalDimensionStream: complexity measure


In [7]:
# Cell: Patch the combiner issue
from geofractal.router.head.components import LearnableWeightCombiner, GatedCombiner, HeadConfig
from geofractal.router.head.builder import HeadBuilder, ComposedHead
from geofractal.router.head.components import (
    CantorAttention, TopKRouter, ConstitutiveAnchorBank,
    FingerprintGate, FFNRefinement
)

def fixed_build(self) -> ComposedHead:
    """Drop-in replacement for ``HeadBuilder.build``.

    Every component except the combiner is resolved through the builder's
    own ``_build_component`` helper. The combiner is either the preset
    instance or is constructed from ``self.config`` alone (no extra kwargs).
    """
    resolve = self._build_component

    # Dict values are evaluated top to bottom, so components are constructed
    # in the order: attention, router, anchors, gate, combiner, refinement.
    parts = {
        "attention": resolve(
            self._attention_cls, self._attention_instance,
            self._attention_kwargs, CantorAttention,
        ),
        "router": resolve(
            self._router_cls, self._router_instance,
            self._router_kwargs, TopKRouter,
        ),
        "anchors": resolve(
            self._anchor_cls, self._anchor_instance,
            self._anchor_kwargs, ConstitutiveAnchorBank,
        ),
        "gate": resolve(
            self._gate_cls, self._gate_instance,
            self._gate_kwargs, FingerprintGate,
        ),
        # FIX: the combiner class receives only the config
        "combiner": (
            self._combiner_instance
            if self._combiner_instance is not None
            else (self._combiner_cls or LearnableWeightCombiner)(self.config)
        ),
        "refinement": resolve(
            self._refinement_cls, self._refinement_instance,
            self._refinement_kwargs, FFNRefinement,
        ),
    }

    return ComposedHead(config=self.config, **parts)

# Monkeypatch: from here on, HeadBuilder.build resolves to fixed_build for
# every HeadBuilder instance in this kernel.
HeadBuilder.build = fixed_build
print("✓ HeadBuilder.build patched")

✓ HeadBuilder.build patched


In [13]:
# Cell: Build the Galaxy Brain Collective
# ============================================================================
# ROUTING HEADS & FUSION
# ============================================================================
# Cell: Rebuild with fixed streams
ROUTE_DIM = 512

# Linear adapters from the 768-wide backbone hidden states to the routing width
proj_mathbert = nn.Linear(768, ROUTE_DIM).to(device)
proj_t5 = nn.Linear(768, ROUTE_DIM).to(device)

# Deterministic feature streams (each ends in a trainable projection)
eigen_stream = EigenspectrumStream(output_dim=ROUTE_DIM, max_features=64).to(device)
cayley_stream = CayleyMengerStream(output_dim=ROUTE_DIM, num_features=64).to(device)
symbolic_stream = SymbolicCalculatorStream(output_dim=ROUTE_DIM).to(device)
fractal_stream = FractalDimensionStream(output_dim=ROUTE_DIM, num_scales=8).to(device)

# One routing head per stream, all built from the same config.
# The generator is consumed left to right, so heads are created in this order.
head_config = HeadConfig(feature_dim=ROUTE_DIM, fingerprint_dim=64, num_anchors=16, num_routes=4)
(
    head_mathbert, head_t5, head_eigen,
    head_cayley, head_symbolic, head_fractal,
) = (build_standard_head(head_config).to(device) for _ in range(6))

# Fusion MLP over the six pooled stream vectors concatenated together
fusion = nn.Sequential(
    nn.Linear(ROUTE_DIM * 6, ROUTE_DIM * 2),
    nn.LayerNorm(ROUTE_DIM * 2),
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(ROUTE_DIM * 2, ROUTE_DIM),
    nn.LayerNorm(ROUTE_DIM),
    nn.GELU(),
).to(device)

# Collective classifier on the fused vector
NUM_BUCKETS = 20
classifier = nn.Sequential(
    nn.Linear(ROUTE_DIM, 256),
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(256, NUM_BUCKETS),
).to(device)

# Per-stream linear classifiers (auxiliary supervision), created in stream order
(
    classifier_mathbert, classifier_t5, classifier_eigen,
    classifier_cayley, classifier_symbolic, classifier_fractal,
) = (nn.Linear(ROUTE_DIM, NUM_BUCKETS).to(device) for _ in range(6))

# Flat parameter list for the optimizer; module order fixes parameter order
trainable = [
    param
    for module in (
        proj_mathbert, proj_t5,
        eigen_stream, cayley_stream, symbolic_stream, fractal_stream,
        head_mathbert, head_t5, head_eigen, head_cayley, head_symbolic, head_fractal,
        fusion, classifier,
        classifier_mathbert, classifier_t5, classifier_eigen,
        classifier_cayley, classifier_symbolic, classifier_fractal,
    )
    for param in module.parameters()
]

optimizer = torch.optim.AdamW(trainable, lr=2e-4, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

print(f"✓ Rebuilt Galaxy Brain Collective")
print(f"  Trainable params: {sum(p.numel() for p in trainable):,}")

✓ Rebuilt Galaxy Brain Collective
  Trainable params: 27,328,884


In [14]:
# Cell: Load and preprocess GSM8K
# Load the "main" configuration of GSM8K; `dataset` is indexed by split name below.
dataset = load_dataset("openai/gsm8k", "main")

print(f"Train: {len(dataset['train'])}")
print(f"Test: {len(dataset['test'])}")

# Peek at the first training example: the question is cut to 200 characters and
# only the last 100 characters of the worked solution are shown.
sample = dataset['train'][0]
print(f"\nQuestion: {sample['question'][:200]}...")
print(f"Answer: {sample['answer'][-100:]}")

def extract_final_answer(answer_text):
    """Extract the numeric answer from GSM8K format.

    GSM8K solutions end with ``#### <number>``. Thousands separators are
    accepted and stripped, so "#### 1,250" yields 1250.0 rather than 1.0.

    Returns:
        The answer as a float, or None when no ``####`` marker followed by a
        number is found.
    """
    match = re.search(r'####\s*(-?\d[\d,]*\.?\d*)', answer_text)
    if match is None:
        return None
    # Commas are grouping separators only; drop them before conversion
    return float(match.group(1).replace(',', ''))

# Numeric final answers of the training split (unparseable ones are dropped)
answers = [
    value
    for value in (extract_final_answer(ex['answer']) for ex in dataset['train'])
    if value is not None
]

import numpy as np
# NUM_BUCKETS + 1 evenly spaced percentile edges (0th through 100th)
percentiles = np.percentile(answers, np.linspace(0, 100, NUM_BUCKETS + 1))
print(f"\nAnswer range: {min(answers)} to {max(answers)}")
print(f"Bucket boundaries: {percentiles[:5]}... {percentiles[-5:]}")

def answer_to_bucket(answer):
    """Return the index of the first bucket whose upper edge is >= `answer`.

    Bucket edges come from the module-level `percentiles` array. Values above
    every edge fall into the last bucket, NUM_BUCKETS - 1.
    """
    upper_edges = percentiles[1:]
    return next(
        (idx for idx, edge in enumerate(upper_edges) if answer <= edge),
        NUM_BUCKETS - 1,
    )

Train: 7473
Test: 1319

Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?...
Answer: =24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

Answer range: -47.0 to 192000000.0
Bucket boundaries: [-47.   3.   5.   7.  10.]... [2.5000e+02 4.2000e+02 9.0000e+02 2.9304e+03 1.9200e+08]


In [15]:
# Cell: Forward pass
def forward_galaxy_brain(questions, answers_text):
    """Full forward pass through all 6 streams.

    Args:
        questions: list of question strings (one per batch element).
        answers_text: accepted so call sites can pass the whole batch; it is
            not read here.

    Returns:
        (logits, ind_logits): logits of the fused classifier, and a dict of
        per-stream logits keyed by 'mathbert', 't5', 'eigen', 'cayley',
        'symbolic', 'fractal'.
    """
    def _encode(tokenizer, backbone):
        # Tokenize the batch and run one frozen backbone without autograd.
        enc = tokenizer(questions, return_tensors="pt",
                        padding=True, truncation=True, max_length=256)
        with torch.no_grad():
            return backbone(
                enc.input_ids.to(device),
                attention_mask=enc.attention_mask.to(device)
            ).last_hidden_state

    # === ENCODE WITH SEMANTIC BACKBONES ===
    hidden_mathbert = _encode(mathbert_tokenizer, mathbert)
    hidden_t5 = _encode(t5_tokenizer, t5)

    # === PROJECT TO COMMON SPACE ===
    proj_mb = proj_mathbert(hidden_mathbert)  # [B, S, ROUTE_DIM]
    proj_t = proj_t5(hidden_t5)               # [B, S', ROUTE_DIM]

    # The two tokenizers produce different lengths; cut both to the shorter one
    S_min = min(proj_mb.shape[1], proj_t.shape[1])
    proj_mb = proj_mb[:, :S_min, :]
    proj_t = proj_t[:, :S_min, :]

    # === DETERMINISTIC STREAMS ===
    # Geometric streams read the projected MathBERT sequence; the symbolic
    # stream reads the raw question text.
    eigen_feat = eigen_stream(proj_mb)
    cayley_feat = cayley_stream(proj_mb)
    symbolic_feat = symbolic_stream(questions, S_min)
    fractal_feat = fractal_stream(proj_mb)

    # === ROUTE THROUGH HEADS ===
    routed_mathbert = head_mathbert(proj_mb)
    routed_t5 = head_t5(proj_t)
    routed_eigen = head_eigen(eigen_feat)
    routed_cayley = head_cayley(cayley_feat)
    routed_symbolic = head_symbolic(symbolic_feat)
    routed_fractal = head_fractal(fractal_feat)

    # === POOL ===
    # Sequence position 0 of every stream (the [CLS] slot for MathBERT)
    pooled_mathbert = routed_mathbert[:, 0]
    pooled_t5 = routed_t5[:, 0]
    pooled_eigen = routed_eigen[:, 0]
    pooled_cayley = routed_cayley[:, 0]
    pooled_symbolic = routed_symbolic[:, 0]
    pooled_fractal = routed_fractal[:, 0]

    # === FUSE ===
    fused = fusion(torch.cat([
        pooled_mathbert, pooled_t5, pooled_eigen,
        pooled_cayley, pooled_symbolic, pooled_fractal
    ], dim=-1))

    # === CLASSIFY ===
    logits = classifier(fused)

    # Individual predictions
    ind_logits = {
        'mathbert': classifier_mathbert(pooled_mathbert),
        't5': classifier_t5(pooled_t5),
        'eigen': classifier_eigen(pooled_eigen),
        'cayley': classifier_cayley(pooled_cayley),
        'symbolic': classifier_symbolic(pooled_symbolic),
        'fractal': classifier_fractal(pooled_fractal),
    }

    return logits, ind_logits

# Smoke test: push one hand-written example through the full forward pass and
# print the collective output shape and the per-stream keys.
test_q = ["John has 5 apples and buys 3 more. How many apples does John have?"]
test_a = ["John starts with 5 apples. He buys 3 more. 5 + 3 = 8. #### 8"]
logits, ind_logits = forward_galaxy_brain(test_q, test_a)
print(f"Collective output: {logits.shape}")
print(f"Individual outputs: {[k for k in ind_logits.keys()]}")

Collective output: torch.Size([1, 20])
Individual outputs: ['mathbert', 't5', 'eigen', 'cayley', 'symbolic', 'fractal']


In [16]:
# Cell: Create HuggingFace repo and setup checkpointing
from huggingface_hub import HfApi, create_repo, upload_file, upload_folder
import json
import os

# Hub client reused by the upload calls in later cells
api = HfApi()

# exist_ok=True is passed, so an already-existing repo is not an error here.
# Any exception is printed and execution continues.
# NOTE(review): a failure at this point presumably means an auth or network
# problem, in which case the later uploads would fail too — confirm.
try:
    create_repo(
        repo_id="AbstractPhil/math_collective",
        repo_type="model",
        exist_ok=True,
        private=False,
    )
    print("✓ Created repo: AbstractPhil/math_collective")
except Exception as e:
    print(f"Repo exists or error: {e}")

# Local directory that save_checkpoint writes to by default
os.makedirs("/content/math_collective_checkpoints", exist_ok=True)

✓ Created repo: AbstractPhil/math_collective


In [17]:
# Cell: Model saving/loading utilities
import torch
from datetime import datetime

def save_checkpoint(
    epoch,
    metrics,
    path="/content/math_collective_checkpoints",
    push_to_hub=True
):
    """Save all trainable components, the optimizer state and the run config.

    Args:
        epoch: epoch number; used in the filename ``checkpoint_epoch_{epoch}.pt``.
        metrics: stored as-is under the 'metrics' key.
        path: directory to write the checkpoint into (created if missing).
        push_to_hub: also upload the file to ``checkpoints/`` in the
            AbstractPhil/math_collective model repo.

    Returns:
        The local path of the written checkpoint file.
    """

    checkpoint = {
        'epoch': epoch,
        'metrics': metrics,
        'timestamp': datetime.now().isoformat(),

        # Projections
        'proj_mathbert': proj_mathbert.state_dict(),
        'proj_t5': proj_t5.state_dict(),

        # Deterministic streams (they have learned projections)
        'eigen_stream': eigen_stream.state_dict(),
        'cayley_stream': cayley_stream.state_dict(),
        'symbolic_stream': symbolic_stream.state_dict(),
        'fractal_stream': fractal_stream.state_dict(),

        # Routing heads
        'head_mathbert': head_mathbert.state_dict(),
        'head_t5': head_t5.state_dict(),
        'head_eigen': head_eigen.state_dict(),
        'head_cayley': head_cayley.state_dict(),
        'head_symbolic': head_symbolic.state_dict(),
        'head_fractal': head_fractal.state_dict(),

        # Fusion and classifiers
        'fusion': fusion.state_dict(),
        'classifier': classifier.state_dict(),
        'classifier_mathbert': classifier_mathbert.state_dict(),
        'classifier_t5': classifier_t5.state_dict(),
        'classifier_eigen': classifier_eigen.state_dict(),
        'classifier_cayley': classifier_cayley.state_dict(),
        'classifier_symbolic': classifier_symbolic.state_dict(),
        'classifier_fractal': classifier_fractal.state_dict(),

        # Optimizer
        'optimizer': optimizer.state_dict(),

        # Config needed to rebuild the modules and the label buckets
        'config': {
            'route_dim': ROUTE_DIM,
            'num_buckets': NUM_BUCKETS,
            'head_config': {
                'feature_dim': head_config.feature_dim,
                'fingerprint_dim': head_config.fingerprint_dim,
                'num_anchors': head_config.num_anchors,
                'num_routes': head_config.num_routes,
            },
            'percentiles': percentiles.tolist(),
        }
    }

    # Save locally; make sure a caller-supplied directory exists first
    os.makedirs(path, exist_ok=True)
    filename = f"checkpoint_epoch_{epoch}.pt"
    local_path = os.path.join(path, filename)
    torch.save(checkpoint, local_path)
    print(f"✓ Saved checkpoint: {local_path}")

    # Push to hub
    if push_to_hub:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"checkpoints/{filename}",
            repo_id="AbstractPhil/math_collective",
            repo_type="model",
        )
        print(f"✓ Pushed to hub: checkpoints/{filename}")

    return local_path


def load_checkpoint(path_or_epoch, from_hub=False):
    """Restore every trainable component and the optimizer from a checkpoint.

    Args:
        path_or_epoch: local checkpoint file path, or, when `from_hub` is
            True, the epoch number used in the hub filename.
        from_hub: first download ``checkpoints/checkpoint_epoch_{epoch}.pt``
            from the AbstractPhil/math_collective model repo.

    Returns:
        The loaded checkpoint dict.
    """
    local_path = path_or_epoch
    if from_hub:
        from huggingface_hub import hf_hub_download
        local_path = hf_hub_download(
            repo_id="AbstractPhil/math_collective",
            filename=f"checkpoints/checkpoint_epoch_{path_or_epoch}.pt",
            repo_type="model",
        )

    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # from a source you trust.
    checkpoint = torch.load(local_path, map_location=device)

    # Checkpoint key -> live module, listed in restore order
    components = {
        'proj_mathbert': proj_mathbert,
        'proj_t5': proj_t5,
        'eigen_stream': eigen_stream,
        'cayley_stream': cayley_stream,
        'symbolic_stream': symbolic_stream,
        'fractal_stream': fractal_stream,
        'head_mathbert': head_mathbert,
        'head_t5': head_t5,
        'head_eigen': head_eigen,
        'head_cayley': head_cayley,
        'head_symbolic': head_symbolic,
        'head_fractal': head_fractal,
        'fusion': fusion,
        'classifier': classifier,
        'classifier_mathbert': classifier_mathbert,
        'classifier_t5': classifier_t5,
        'classifier_eigen': classifier_eigen,
        'classifier_cayley': classifier_cayley,
        'classifier_symbolic': classifier_symbolic,
        'classifier_fractal': classifier_fractal,
    }
    for key, module in components.items():
        module.load_state_dict(checkpoint[key])

    optimizer.load_state_dict(checkpoint['optimizer'])

    print(f"✓ Loaded checkpoint from epoch {checkpoint['epoch']}")
    print(f"  Metrics: {checkpoint['metrics']}")

    return checkpoint

# Confirmation that save_checkpoint / load_checkpoint are now defined
print("✓ Checkpoint utilities defined")

✓ Checkpoint utilities defined


In [18]:
# Cell: Create model card and push
# README text for the hub repo. The trainable-parameter figure matches the
# count printed when the collective is built (27,328,884).
model_card = """---
license: apache-2.0
tags:
  - geofractal-router
  - collective-intelligence
  - math-reasoning
  - multi-stream
  - emergence
  - routing
datasets:
  - openai/gsm8k
base_model:
  - tbs17/MathBERT
  - google-t5/t5-base
pipeline_tag: text-classification
---

# Math Collective - Galaxy Brain Router

**6-stream collective intelligence system for mathematical reasoning.**

## Architecture
```
┌─────────────────────────────────────────────────────────────┐
│                    GALAXY BRAIN COLLECTIVE                   │
├─────────────────────────────────────────────────────────────┤
│  FUZZY STREAMS (learned representations)                    │
│  ├── MathBERT (frozen) → Head A → "symbolic understanding"  │
│  └── T5-base (frozen)  → Head B → "linguistic reasoning"    │
│                                                             │
│  DETERMINISTIC STREAMS (pure computation)                   │
│  ├── Eigenspectrum    → Head C → "covariance geometry"      │
│  ├── Cayley-Menger    → Head D → "distance geometry"        │
│  ├── Symbolic Calc    → Head E → "actual arithmetic"        │
│  └── Fractal Dim      → Head F → "complexity measure"       │
│                                                             │
│  All 6 streams → Fusion → Classifier                        │
└─────────────────────────────────────────────────────────────┘
```

## Key Innovation

**Fuzzy + Deterministic Routing**

The router learns WHEN to trust each stream:
- Simple arithmetic? Trust the Symbolic Calculator (deterministic)
- Complex word problem? Trust MathBERT/T5 (semantic)
- Ambiguous? Triangulate across all 6 perspectives

## Streams

| Stream | Type | Source | Purpose |
|--------|------|--------|---------|
| MathBERT | Fuzzy | tbs17/MathBERT (frozen) | Mathematical notation understanding |
| T5-base | Fuzzy | t5-base (frozen) | General language reasoning |
| Eigenspectrum | Deterministic | Covariance eigenvalues | Geometric structure of embeddings |
| Cayley-Menger | Deterministic | Distance matrix geometry | Simplex volume features |
| Symbolic | Deterministic | Regex + arithmetic | Actual number extraction & computation |
| Fractal | Deterministic | Correlation dimension | Problem complexity measure |

## Training

- **Dataset**: GSM8K (Grade School Math 8K)
- **Task**: Answer magnitude bucket prediction (20 buckets)
- **Frozen params**: ~330M (MathBERT + T5)
- **Trainable params**: ~27M (routing heads, fusion, projections)

## Emergence Metric (ρ)
```
ρ = collective_accuracy / max(individual_accuracies)

ρ > 1.0 = emergence (collective outperforms best individual)
```

## Usage
```python
from huggingface_hub import hf_hub_download
import torch

# Download checkpoint
checkpoint_path = hf_hub_download(
    repo_id="AbstractPhil/math_collective",
    filename="checkpoints/checkpoint_epoch_5.pt"
)

# Load and use (see geofractal-router for full implementation)
checkpoint = torch.load(checkpoint_path)
print(f"Epoch: {checkpoint['epoch']}")
print(f"Metrics: {checkpoint['metrics']}")
```

## Related

- **Framework**: [AbstractPhil/geofractal_router](https://huggingface.co/AbstractPhil/geofractal_router)
- **Paper**: Coming soon
- **Code**: [GitHub - geofractal](https://github.com/AbstractEyes/geofractal)

## Citation
```bibtex
@misc{abstractphil2025mathcollective,
  title={Math Collective: Galaxy Brain Routing for Mathematical Reasoning},
  author={AbstractPhil},
  year={2025},
  publisher={Hugging Face},
  url={https://huggingface.co/AbstractPhil/math_collective}
}
```

## License

Apache 2.0
"""

# Save and push model card
# The card contains non-ASCII characters (box drawing, arrows, ρ), so the
# encoding is pinned instead of relying on the platform default.
with open("/content/math_collective_checkpoints/README.md", "w", encoding="utf-8") as f:
    f.write(model_card)

api.upload_file(
    path_or_fileobj="/content/math_collective_checkpoints/README.md",
    path_in_repo="README.md",
    repo_id="AbstractPhil/math_collective",
    repo_type="model",
)
print("✓ Model card pushed to hub")

✓ Model card pushed to hub


In [19]:
# Cell: Training loop with checkpointing
def collate_fn(examples):
    """Collate a sequence of examples into a dict of parallel lists.

    Args:
        examples: sequence of mappings, each indexable by 'question' and 'answer'.

    Returns:
        A dict with keys 'question' and 'answer'; each value is the list of that
        field taken from every example, in input order.
    """
    fields = ('question', 'answer')
    return {field: [row[field] for row in examples] for field in fields}

# Run configuration and the list that accumulates one metrics dict per epoch.
EPOCHS = 5
history = []

# Both loaders batch 16 examples through collate_fn; only the training split is shuffled.
train_loader = DataLoader(
    dataset['train'],
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=16,
)
test_loader = DataLoader(
    dataset['test'],
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=16,
)

def _labels_for(answers_text):
    """Convert a batch of answer strings into a tensor of bucket labels on `device`.

    Each answer is passed through `extract_final_answer`; a non-None result is
    mapped with `answer_to_bucket`. Answers with no extractable number fall back
    to bucket 0.

    NOTE(review): the bucket-0 fallback means unparseable answers are trained and
    scored as if they belonged to bucket 0 — confirm this is intended.
    """
    buckets = []
    for ans in answers_text:
        num = extract_final_answer(ans)
        buckets.append(answer_to_bucket(num) if num is not None else 0)
    return torch.tensor(buckets).to(device)


# Modules switched between train and eval mode every epoch (heads first, then
# fusion and classifier — same order as the original per-phase lists).
trainable_modules = [
    head_mathbert, head_t5, head_eigen, head_cayley, head_symbolic, head_fractal,
    fusion, classifier,
]

# Each epoch: one training pass over train_loader, one evaluation pass over
# test_loader, then metrics are recorded in `history` and a checkpoint is saved.
for epoch in range(EPOCHS):
    # Train
    for module in trainable_modules:
        module.train()

    correct, total = 0, 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in pbar:
        questions = batch['question']
        answers_text = batch['answer']
        labels = _labels_for(answers_text)

        optimizer.zero_grad()
        # NOTE(review): the answer text is handed to the forward pass together with
        # the questions — confirm forward_galaxy_brain does not use it in a way that
        # leaks the label.
        logits, ind_logits = forward_galaxy_brain(questions, answers_text)

        # Collective loss
        loss = criterion(logits, labels)

        # Individual supervision (light): every stream's own logits add a
        # 0.05-weighted term. Accumulated out-of-place so the autograd graph never
        # sees an in-place modification of the loss tensor.
        for ind_log in ind_logits.values():
            loss = loss + 0.05 * criterion(ind_log, labels)

        loss.backward()
        optimizer.step()

        # Running training accuracy of the collective prediction, shown on the bar.
        correct += (logits.argmax(-1) == labels).sum().item()
        total += labels.size(0)
        pbar.set_postfix({'acc': f'{correct/total:.1%}'})

    # Eval
    for module in trainable_modules:
        module.eval()

    # Correct-prediction counters: the fused output plus one per individual stream.
    metrics = {name: 0 for name in ['collective', 'mathbert', 't5', 'eigen', 'cayley', 'symbolic', 'fractal']}
    val_total = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validating"):
            questions = batch['question']
            answers_text = batch['answer']
            labels = _labels_for(answers_text)

            logits, ind_logits = forward_galaxy_brain(questions, answers_text)

            metrics['collective'] += (logits.argmax(-1) == labels).sum().item()
            for name, ind_log in ind_logits.items():
                metrics[name] += (ind_log.argmax(-1) == labels).sum().item()
            val_total += labels.size(0)

    # Compute accuracies. An empty test split would otherwise divide by zero;
    # in that case every accuracy (and therefore rho) is reported as 0.
    val_denominator = max(val_total, 1)
    accs = {k: v / val_denominator for k, v in metrics.items()}
    # rho = collective accuracy relative to the best single stream.
    max_ind = max(v for k, v in accs.items() if k != 'collective')
    rho = accs['collective'] / max_ind if max_ind > 0 else 0

    epoch_metrics = {**accs, 'rho': rho, 'epoch': epoch + 1}
    history.append(epoch_metrics)

    print(f"\nEpoch {epoch+1} Results:")
    print(f"  Collective: {accs['collective']:.1%}")
    print(f"  MathBERT: {accs['mathbert']:.1%}, T5: {accs['t5']:.1%}")
    print(f"  Eigen: {accs['eigen']:.1%}, Cayley: {accs['cayley']:.1%}")
    print(f"  Symbolic: {accs['symbolic']:.1%}, Fractal: {accs['fractal']:.1%}")
    print(f"  ρ = {rho:.3f}")

    # Save checkpoint after each epoch
    save_checkpoint(epoch + 1, epoch_metrics, push_to_hub=True)

# Write the per-epoch metrics list to disk as indented JSON, then upload that file.
history_path = "/content/math_collective_checkpoints/training_history.json"
with open(history_path, "w") as history_file:
    json.dump(history, history_file, indent=2)

# Upload arguments gathered in one place; the file lands at the repo root.
history_upload = {
    "path_or_fileobj": history_path,
    "path_in_repo": "training_history.json",
    "repo_id": "AbstractPhil/math_collective",
    "repo_type": "model",
}
api.upload_file(**history_upload)
print("✓ Training history pushed to hub")

Epoch 1:   0%|          | 0/468 [00:00<?, ?it/s]

Validating:   0%|          | 0/83 [00:00<?, ?it/s]


Epoch 1 Results:
  Collective: 8.9%
  MathBERT: 10.4%, T5: 13.0%
  Eigen: 5.4%, Cayley: 5.4%
  Symbolic: 5.7%, Fractal: 6.7%
  ρ = 0.686
✓ Saved checkpoint: /content/math_collective_checkpoints/checkpoint_epoch_1.pt


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...nts/checkpoint_epoch_1.pt:   0%|          |  559kB /  328MB            

✓ Pushed to hub: checkpoints/checkpoint_epoch_1.pt


Epoch 2:   0%|          | 0/468 [00:00<?, ?it/s]

Validating:   0%|          | 0/83 [00:00<?, ?it/s]


Epoch 2 Results:
  Collective: 9.6%
  MathBERT: 8.9%, T5: 13.0%
  Eigen: 6.5%, Cayley: 6.5%
  Symbolic: 4.6%, Fractal: 6.1%
  ρ = 0.738
✓ Saved checkpoint: /content/math_collective_checkpoints/checkpoint_epoch_2.pt


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...nts/checkpoint_epoch_2.pt:   0%|          |  558kB /  328MB            

✓ Pushed to hub: checkpoints/checkpoint_epoch_2.pt


Epoch 3:   0%|          | 0/468 [00:00<?, ?it/s]

Validating:   0%|          | 0/83 [00:00<?, ?it/s]


Epoch 3 Results:
  Collective: 9.9%
  MathBERT: 9.6%, T5: 13.6%
  Eigen: 5.8%, Cayley: 5.8%
  Symbolic: 4.5%, Fractal: 5.8%
  ρ = 0.728
✓ Saved checkpoint: /content/math_collective_checkpoints/checkpoint_epoch_3.pt


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...nts/checkpoint_epoch_3.pt:   0%|          | 14.3kB /  328MB            

✓ Pushed to hub: checkpoints/checkpoint_epoch_3.pt


Epoch 4:   0%|          | 0/468 [00:00<?, ?it/s]

Validating:   0%|          | 0/83 [00:00<?, ?it/s]


Epoch 4 Results:
  Collective: 8.3%
  MathBERT: 10.1%, T5: 13.3%
  Eigen: 6.1%, Cayley: 4.5%
  Symbolic: 4.8%, Fractal: 4.5%
  ρ = 0.619
✓ Saved checkpoint: /content/math_collective_checkpoints/checkpoint_epoch_4.pt


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...nts/checkpoint_epoch_4.pt:   0%|          | 14.3kB /  328MB            

✓ Pushed to hub: checkpoints/checkpoint_epoch_4.pt


Epoch 5:   0%|          | 0/468 [00:00<?, ?it/s]

Validating:   0%|          | 0/83 [00:00<?, ?it/s]


Epoch 5 Results:
  Collective: 9.0%
  MathBERT: 11.1%, T5: 14.6%
  Eigen: 4.5%, Cayley: 4.6%
  Symbolic: 5.0%, Fractal: 5.9%
  ρ = 0.620
✓ Saved checkpoint: /content/math_collective_checkpoints/checkpoint_epoch_5.pt


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...nts/checkpoint_epoch_5.pt:   0%|          | 14.3kB /  328MB            

✓ Pushed to hub: checkpoints/checkpoint_epoch_5.pt
✓ Training history pushed to hub


In [20]:
# Cell: Final summary and best checkpoint
# Prints a banner, the stream overview, a markdown-style table with one row per
# entry of `history`, and the entry with the highest collective accuracy.
rule = "="*70
print("\n" + rule)
print("GALAXY BRAIN COLLECTIVE - GSM8K FINAL RESULTS")
print(rule)

for overview_line in (
    "\nStreams:",
    "  FUZZY:        MathBERT (symbolic), T5 (linguistic)",
    "  DETERMINISTIC: Eigenspectrum, Cayley-Menger, Symbolic Calc, Fractal",
    "\n| Epoch | Collective | MathBERT | T5 | Eigen | Cayley | Symbolic | Fractal | ρ |",
    "|-------|------------|----------|-----|-------|--------|----------|---------|-------|",
):
    print(overview_line)

# One table row per recorded epoch: accuracies as percentages, rho to three decimals.
for h in history:
    row_left = f"| {h['epoch']} | {h['collective']:.1%} | {h['mathbert']:.1%} | {h['t5']:.1%} | "
    row_right = f"{h['eigen']:.1%} | {h['cayley']:.1%} | {h['symbolic']:.1%} | {h['fractal']:.1%} | {h['rho']:.3f} |"
    print(row_left + row_right)


def _collective_accuracy(record):
    """Sort key: the collective accuracy stored in one history entry."""
    return record['collective']


# Find best checkpoint (best_epoch is the whole metrics dict of that epoch).
best_epoch = max(history, key=_collective_accuracy)
print(f"\n✓ Best epoch: {best_epoch['epoch']} with collective accuracy {best_epoch['collective']:.1%}")
print(f"✓ All checkpoints available at: https://huggingface.co/AbstractPhil/math_collective")


GALAXY BRAIN COLLECTIVE - GSM8K FINAL RESULTS

Streams:
  FUZZY:        MathBERT (symbolic), T5 (linguistic)
  DETERMINISTIC: Eigenspectrum, Cayley-Menger, Symbolic Calc, Fractal

| Epoch | Collective | MathBERT | T5 | Eigen | Cayley | Symbolic | Fractal | ρ |
|-------|------------|----------|-----|-------|--------|----------|---------|-------|
| 1 | 8.9% | 10.4% | 13.0% | 5.4% | 5.4% | 5.7% | 6.7% | 0.686 |
| 2 | 9.6% | 8.9% | 13.0% | 6.5% | 6.5% | 4.6% | 6.1% | 0.738 |
| 3 | 9.9% | 9.6% | 13.6% | 5.8% | 5.8% | 4.5% | 5.8% | 0.728 |
| 4 | 8.3% | 10.1% | 13.3% | 6.1% | 4.5% | 4.8% | 4.5% | 0.619 |
| 5 | 9.0% | 11.1% | 14.6% | 4.5% | 4.6% | 5.0% | 5.9% | 0.620 |

✓ Best epoch: 3 with collective accuracy 9.9%
✓ All checkpoints available at: https://huggingface.co/AbstractPhil/math_collective
