Ashish Reddy committed
Commit: a090db7
Parent(s): a494631

committing

Files changed:
- .gitignore +2 -0
- DISTILBERT_MERGED.pth +3 -0
- app.py +50 -0
- baseline.py +94 -0
- loraLayer.py +19 -0
- loraLinear.py +45 -0
- loraTune.py +86 -0
- mergeWeights.py +50 -0
- requirements.txt +4 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+*.DS_Store
+__pycache__/
DISTILBERT_MERGED.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f124b3db5a1adae5c3d1472b849806e3aa23b352d3d3c9a53bdf404f8d0b2ca0
+size 267861563
app.py
ADDED
@@ -0,0 +1,50 @@
+import torch, torch.nn.functional as F
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+MODEL_CKPT = "distilbert-base-uncased"
+DEVICE = "cpu"  # HF Spaces default
+
+print("--- Loading tokenizer & base model ---")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_CKPT)
+
+print("--- Loading merged fine-tuned weights ---")
+model.load_state_dict(torch.load("DISTILBERT_MERGED.pth", map_location=DEVICE))
+model.to(DEVICE).eval()
+
+# nice label names for IMDB
+model.config.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
+model.config.label2id = {v: k for k, v in model.config.id2label.items()}
+
+def predict(text):
+    tokens = tokenizer(
+        text,
+        return_tensors="pt",
+        padding="max_length",
+        truncation=True,
+        max_length=256
+    ).to(DEVICE)
+
+    with torch.no_grad():
+        probs = F.softmax(model(**tokens).logits, dim=-1)[0]
+    return {model.config.id2label[i]: float(p) for i, p in enumerate(probs)}
+
+demo = gr.Interface(
+    fn=predict,
+    inputs=gr.Textbox(lines=3, label="Movie Review"),
+    outputs=gr.Label(num_top_classes=2, label="Sentiment"),
+    title="Sentiment Analysis (LoRA-merged DistilBERT)",
+    description=(
+        "DistilBERT fine-tuned on IMDB with a custom LoRA adapter. "
+        "Adapters have been merged so the model runs with no extra parameters."
+    ),
+    examples=[
+        ["An absolute masterpiece with brilliant acting!"],
+        ["Total waste of two hours."],
+        ["Predictable plot but gorgeous visuals."]
+    ]
+)
+
+if __name__ == "__main__":
+    demo.launch()
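For a quick local smoke test without starting the web UI, the predict function above can be called directly. The sketch below is an illustrative addition, not part of the commit; it assumes app.py and DISTILBERT_MERGED.pth sit in the working directory, and relies on the __main__ guard so that importing app loads the model but does not launch the Gradio server.

from app import predict

scores = predict("An absolute masterpiece with brilliant acting!")
print(scores)  # e.g. {'NEGATIVE': 0.03, 'POSITIVE': 0.97} -- numbers are illustrative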
baseline.py
ADDED
@@ -0,0 +1,94 @@
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification
+)
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+
+"""
+---- Device ----
+"""
+
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+    print("Using CUDA (GPU)")
+elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+    device = torch.device('mps')
+    print("Using MPS (Apple Silicon GPU)")
+else:
+    device = torch.device('cpu')
+    print("Using CPU")
+
+"""
+--- Model ---
+"""
+
+model_ckpt = "distilbert-base-uncased"
+
+print(f"--- Loading pre-trained model and tokenizer: {model_ckpt.upper()} ---")
+
+tok = AutoTokenizer.from_pretrained(model_ckpt)
+model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
+model.to(device)
+print(f"Model moved to {device}")
+
+
+"""
+--- Data Prep ---
+"""
+
+print("\n--- Loading and preparing IMDB dataset ---")
+imdb_dataset = load_dataset("imdb")
+"""
+DatasetDict({
+    train: Dataset({
+        features: ['text', 'label'],
+        num_rows: 25000
+    })
+    test: Dataset({
+        features: ['text', 'label'],
+        num_rows: 25000
+    })
+    unsupervised: Dataset({
+        features: ['text', 'label'],
+        num_rows: 50000
+    })
+})
+"""
+
+def tokenize_fn(examples):
+    return tok(examples["text"], padding="max_length", truncation=True)
+
+tokenized_datasets = imdb_dataset.map(tokenize_fn, batched=True)
+
+tokenized_datasets = tokenized_datasets.remove_columns(["text"])
+tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+tokenized_datasets.set_format("torch")
+
+
+if __name__ == '__main__':
+
+    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))  # 1000 randomly selected test examples
+    eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)  # 125 batches of 8 examples, each holding 'labels', 'input_ids', 'attention_mask'
+
+    print("\n--- Evaluating baseline model performance ---")
+    model.eval()
+    num_correct = 0
+    num_samples = 0
+
+    with torch.no_grad():  # Disable gradient calculation for inference (no backprop)
+        for batch in eval_dataloader:
+            batch = {k: v.to(device) for k, v in batch.items()}
+
+            outputs = model(**batch)  # Forward pass
+            logits = outputs.logits
+
+            predictions = torch.argmax(logits, dim=-1)  # Class with the highest logit
+
+            # Compare predictions to true labels
+            num_correct += (predictions == batch["labels"]).sum().item()
+            num_samples += batch["labels"].size(0)
+
+    accuracy = num_correct / num_samples
+    print(f"Baseline Accuracy on 1000 samples: {accuracy:.4f}")  # ~0.49 here: the classification head is still randomly initialised, so the untrained model is essentially guessing and hovers around 50%
loraLayer.py
ADDED
@@ -0,0 +1,19 @@
+import torch.nn as nn
+
+class LoRALayer(nn.Module):
+    def __init__(self, in_features, out_features, rank, alpha):
+        super().__init__()
+        self.rank = rank
+        self.alpha = alpha
+        self.scaling = alpha / rank
+
+        self.loraA = nn.Linear(in_features, rank, bias=False)
+        self.loraB = nn.Linear(rank, out_features, bias=False)
+
+        nn.init.kaiming_uniform_(self.loraA.weight, a=5**0.5)
+        nn.init.zeros_(self.loraB.weight)
+
+    def forward(self, x):
+        delta = self.loraB(self.loraA(x))  # (xA)B: (B, S, D) -> (B, S, R) -> (B, S, D)
+        x = self.scaling * delta
+        return x
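Because loraB starts at zero, the adapter contributes nothing at initialisation, and its output is always the closed form (alpha/rank) * x A^T B^T. The sketch below is an illustrative check, not part of the commit; it confirms this equivalence and the output shape, which is the identity the later weight merge relies on.

import torch
from loraLayer import LoRALayer

layer = LoRALayer(in_features=768, out_features=768, rank=4, alpha=4)
x = torch.randn(2, 16, 768)                 # (batch, seq, hidden)

out = layer(x)                              # scaling * loraB(loraA(x))
A = layer.loraA.weight                      # (rank, in_features)
B = layer.loraB.weight                      # (out_features, rank)
closed_form = layer.scaling * (x @ A.T @ B.T)

print(out.shape)                            # torch.Size([2, 16, 768])
print(torch.allclose(out, closed_form))     # True (trivially all zeros right after init)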
loraLinear.py
ADDED
@@ -0,0 +1,45 @@
+import torch.nn as nn
+from transformers import (
+    AutoModelForSequenceClassification
+)
+
+RANK = 4
+ALPHA = 4
+model_ckpt = "distilbert-base-uncased"
+
+from loraLayer import LoRALayer
+
+class LoRALinear(nn.Module):
+    def __init__(self, original_layer, rank, alpha):
+        super().__init__()
+        self.in_features = original_layer.in_features
+        self.out_features = original_layer.out_features
+        self.original_layer = original_layer
+        self.lora = LoRALayer(self.in_features, self.out_features, rank, alpha)
+
+    def forward(self, x):
+        original_output = self.original_layer(x)  # Wo*x
+        lora_output = self.lora(x)                # (xA)B * scaling
+        return original_output + lora_output      # Wo*x + (xA)B * scaling
+
+model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
+
+for param in model.parameters():
+    param.requires_grad = False  # Freeze all original parameters
+
+print("--- Injecting LoRA adapters into q_lin and v_lin layers of DISTILBERT ---")
+for layer in model.distilbert.transformer.layer:
+    layer.attention.q_lin = LoRALinear(layer.attention.q_lin, RANK, ALPHA)
+    layer.attention.v_lin = LoRALinear(layer.attention.v_lin, RANK, ALPHA)
+print("INFO: LoRA Adapters INJECTED")
+
+print("\nTrainable parameters:")
+for name, param in model.named_parameters():
+    if param.requires_grad:
+        print(name)
+
+total_params = sum(p.numel() for p in model.parameters())
+trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+print(f"\nTotal parameters: {total_params}")
+print(f"Trainable LoRA parameters: {trainable_params}")
+print(f"Percentage of trainable parameters: {100 * trainable_params / total_params:.4f}%")
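As a rough cross-check of the printed counts, the sketch below (an illustrative addition, not part of the commit) estimates the number of trainable LoRA parameters, assuming the standard distilbert-base-uncased geometry: 6 transformer layers with 768-dimensional q_lin and v_lin projections.

hidden = 768            # q_lin and v_lin are 768 -> 768 in distilbert-base-uncased (assumption stated above)
rank = 4
n_layers = 6            # transformer blocks in DistilBERT
adapters_per_layer = 2  # q_lin and v_lin

# each adapter holds A (rank x in_features) and B (out_features x rank)
params_per_adapter = rank * hidden + hidden * rank            # 6,144
lora_params = params_per_adapter * adapters_per_layer * n_layers
print(lora_params)      # 73728 -- roughly 0.1% of the ~67M parameters in the full model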
loraTune.py
ADDED
@@ -0,0 +1,86 @@
+import torch, torch.optim as optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+model_ckpt = "distilbert-base-uncased"
+batch_size = 16
+n_epochs = 3
+learning_rate = 1e-4
+RANK = 4
+ALPHA = 4
+
+"""
+---- Device ----
+"""
+
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+    print("Using CUDA (GPU)")
+elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+    device = torch.device('mps')
+    print("Using MPS (Apple Silicon GPU)")
+else:
+    device = torch.device('cpu')
+    print("Using CPU")
+
+
+from baseline import tokenized_datasets
+
+"""
+tokenized_datasets:
+
+DatasetDict({
+    train: Dataset({
+        features: ['labels', 'input_ids', 'attention_mask'],
+        num_rows: 25000
+    })
+    test: Dataset({
+        features: ['labels', 'input_ids', 'attention_mask'],
+        num_rows: 25000
+    })
+    unsupervised: Dataset({
+        features: ['labels', 'input_ids', 'attention_mask'],
+        num_rows: 50000
+    })
+})
+"""
+
+train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size)
+eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size)
+
+from loraLinear import model
+
+model.to(device)
+print(f"INFO: Moved model to {device}")
+
+trainable_params = [p for p in model.parameters() if p.requires_grad]  # 24 tensors: A and B for q_lin and v_lin in each of the 6 layers
+optimizer = optim.AdamW(trainable_params, lr=learning_rate)
+
+for epoch in range(n_epochs):
+    model.train()
+    print(f"\n--- Starting Epoch {epoch+1}/{n_epochs} ---")
+    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
+        batch = {k: v.to(device) for k, v in batch.items()}
+        optimizer.zero_grad()
+        outputs = model(**batch)
+        loss = outputs.loss
+        loss.backward()
+        optimizer.step()
+
+    model.eval()
+    num_correct = 0
+    num_samples = 0
+    with torch.no_grad():
+        for batch in tqdm(eval_dataloader, desc=f"Evaluating Epoch {epoch+1}"):
+            batch = {k: v.to(device) for k, v in batch.items()}
+            outputs = model(**batch)
+            predictions = torch.argmax(outputs.logits, dim=-1)
+            num_correct += (predictions == batch["labels"]).sum().item()
+            num_samples += batch["labels"].size(0)
+
+    accuracy = num_correct / num_samples
+    print(f"--- Epoch {epoch+1} Validation Accuracy: {accuracy:.4f} ---")
+
+print("\nFine-tuning complete.")
+torch.save(model.state_dict(), "DISTILBERT_WITH_LORA.pth")
+print("Trained LoRA model saved.")
mergeWeights.py
ADDED
@@ -0,0 +1,50 @@
+import torch
+from transformers import AutoModelForSequenceClassification
+from loraLinear import LoRALinear
+
+MODEL_CKPT = "distilbert-base-uncased"
+RANK = 4
+ALPHA = 4
+DEVICE = "cpu"  # fine for Spaces; merge is fast
+
+# Re-create the LoRA-injected architecture
+lora_model = AutoModelForSequenceClassification.from_pretrained(MODEL_CKPT)
+for blk in lora_model.distilbert.transformer.layer:
+    blk.attention.q_lin = LoRALinear(blk.attention.q_lin, RANK, ALPHA)
+    blk.attention.v_lin = LoRALinear(blk.attention.v_lin, RANK, ALPHA)
+
+lora_model.load_state_dict(torch.load("DISTILBERT_WITH_LORA.pth", map_location=DEVICE))
+lora_model.eval()
+
+# Collapse each adapter: W ← W + (B @ A)·scale
+for blk in lora_model.distilbert.transformer.layer:
+    for name in ("q_lin", "v_lin"):
+        wrap = getattr(blk.attention, name)
+        with torch.no_grad():
+            base_W = wrap.original_layer.weight    # (out, in)
+            A = wrap.lora.loraA.weight             # (rank, in)
+            B = wrap.lora.loraB.weight             # (out, rank)
+            base_W += (B @ A) * wrap.lora.scaling  # in-place update
+
+# Copy the merged weights into a *plain* DistilBERT (no wrappers)
+plain_model = AutoModelForSequenceClassification.from_pretrained(MODEL_CKPT)
+with torch.no_grad():
+    for i in range(6):
+        plain_blk = plain_model.distilbert.transformer.layer[i]
+        lora_blk = lora_model.distilbert.transformer.layer[i]
+
+        for lin in ("q_lin", "v_lin"):
+            pl = getattr(plain_blk.attention, lin)
+            lr = getattr(lora_blk.attention, lin).original_layer
+            pl.weight.copy_(lr.weight)
+            pl.bias.copy_(lr.bias)
+
+    # classification head
+    plain_model.pre_classifier.weight.copy_(lora_model.pre_classifier.weight)
+    plain_model.pre_classifier.bias.copy_(lora_model.pre_classifier.bias)
+    plain_model.classifier.weight.copy_(lora_model.classifier.weight)
+    plain_model.classifier.bias.copy_(lora_model.classifier.bias)
+
+# Save
+torch.save(plain_model.state_dict(), "DISTILBERT_MERGED.pth")
+print("✅ Merged weights saved to DISTILBERT_MERGED.pth")
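The merge is exact because W x + scaling·(B A) x = (W + scaling·B A) x. The sketch below is an illustrative sanity check, not part of the commit: run it as a separate script after mergeWeights.py (the lora_model inside mergeWeights.py has already had its base weights updated in place, so a fresh, unmerged copy is rebuilt here), and the two models should agree on their logits to within floating-point noise.

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from loraLinear import LoRALinear

ckpt = "distilbert-base-uncased"

# Unmerged LoRA model, rebuilt from the training checkpoint
lora_model = AutoModelForSequenceClassification.from_pretrained(ckpt)
for blk in lora_model.distilbert.transformer.layer:
    blk.attention.q_lin = LoRALinear(blk.attention.q_lin, 4, 4)
    blk.attention.v_lin = LoRALinear(blk.attention.v_lin, 4, 4)
lora_model.load_state_dict(torch.load("DISTILBERT_WITH_LORA.pth", map_location="cpu"))
lora_model.eval()

# Plain model with the merged weights
plain_model = AutoModelForSequenceClassification.from_pretrained(ckpt)
plain_model.load_state_dict(torch.load("DISTILBERT_MERGED.pth", map_location="cpu"))
plain_model.eval()

tok = AutoTokenizer.from_pretrained(ckpt)
batch = tok("A simple smoke-test sentence.", return_tensors="pt")
with torch.no_grad():
    diff = (lora_model(**batch).logits - plain_model(**batch).logits).abs().max()
print(f"max |logit difference| = {diff.item():.2e}")  # expected to be ~1e-6 or smaller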
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+torch
+transformers
+datasets
+gradio