import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

"""
---- Hyperparameters ----
"""

model_ckpt = "distilbert-base-uncased"
batch_size = 16
n_epochs = 3
learning_rate = 1e-4
RANK = 4   # LoRA rank r: dimension of the low-rank update matrices
ALPHA = 4  # LoRA alpha: scaling factor applied to the low-rank update
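
# For reference (assuming the standard LoRA parameterization in loraLinear):
# each adapted layer computes W_frozen @ x + (ALPHA / RANK) * (B @ (A @ x)),
# where A is (RANK x d_in) and B is (d_out x RANK), and only A and B train.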

"""
---- Device ----
"""

if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA (GPU)")
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
    print("Using MPS (Apple Silicon GPU)")
else:
    device = torch.device('cpu')
    print("Using CPU")

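# Note: torch.backends.mps.is_built() checks that this PyTorch build includes
# MPS support, while is_available() also verifies the hardware/OS can use it.
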
"""
---- Data ----
"""

from baseline import tokenized_datasets

"""
tokenized_datasets:

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})
"""
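
# Note (assumption): baseline is expected to have set the datasets' format to
# PyTorch tensors, e.g. via something like
#     tokenized_datasets.set_format("torch", columns=["labels", "input_ids", "attention_mask"])
# so that the DataLoaders below yield dicts of tensors rather than lists.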

# Shuffle only the training split; evaluation order does not matter.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size)
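
# Optional sanity check (sketch): peek at one batch to confirm shapes, e.g.
#     batch = next(iter(train_dataloader))
#     print({k: tuple(v.shape) for k, v in batch.items()})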

"""
---- Model ----
"""

from loraLinear import model

model.to(device)
print(f"INFO: Moved model to {device}")

# Only the LoRA adapter matrices were left with requires_grad=True, so the
# optimizer updates the adapters while the frozen base weights stay fixed.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.AdamW(trainable_params, lr=learning_rate)
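
# Optional sanity check: report how small the trainable fraction is with LoRA.
n_trainable = sum(p.numel() for p in trainable_params)
n_total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {n_trainable:,} / {n_total:,} "
      f"({100.0 * n_trainable / n_total:.2f}%)")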

"""
---- Training ----
"""

for epoch in range(n_epochs):
    model.train()
    print(f"\n--- Starting Epoch {epoch+1}/{n_epochs} ---")
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        # Move the batch to the active device, then run the usual
        # forward / backward / update step on the trainable parameters.
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluate on the test split (used here as a validation set) after each epoch.
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc=f"Evaluating Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            num_correct += (predictions == batch["labels"]).sum().item()
            num_samples += batch["labels"].size(0)

    accuracy = num_correct / num_samples
    print(f"--- Epoch {epoch+1} Validation Accuracy: {accuracy:.4f} ---")

print("\nFine-tuning complete.")

# Note: this saves the full state_dict (frozen DistilBERT weights plus the
# LoRA matrices), not just the adapters.
torch.save(model.state_dict(), "DISTILBERT_WITH_LORA.pth")
print("Trained LoRA model saved.")
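
# Lighter-weight alternative (sketch; assumes the state_dict keys for the
# adapter weights contain "lora", which depends on how loraLinear names them):
#     lora_state = {k: v for k, v in model.state_dict().items() if "lora" in k.lower()}
#     torch.save(lora_state, "distilbert_lora_adapters_only.pth")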