From e8316887f7aac3477e43808423fa361a8eb8b75a Mon Sep 17 00:00:00 2001
From: Kye
Date: Mon, 2 Oct 2023 17:42:58 -0400
Subject: [PATCH] basic training scripts

---
 README.md |   3 ++
 train.py  | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 train.py

diff --git a/README.md b/README.md
index a96a1ec..d6a9e3c 100644
--- a/README.md
+++ b/README.md
@@ -106,3 +106,6 @@ Eprint = {arXiv:2302.05442},
 
 # Todo
 - [ ] Add flash attention, with layernorm before attn, and then layernom for qk values,
+- [ ] When using ViT-22B, as with any large-scale model, it is difficult to understand how the model arrived at a specific decision, which can lead to a lack of
+trust and accountability. Add a mechanism to backtrack decisions.
+- [ ] Create logic to train the decoder for 300k steps with a batch size of 64 using Adam (Kingma and Ba, 2015) and clip the gradients to a global norm value of 0.05 to stabilize training. Linearly increase the learning rate for 2500 steps to 0.0002 (starting from 0) and then decay the learning rate back to 0 with a cosine schedule (Loshchilov and Hutter, 2017).
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..2e35fde
--- /dev/null
+++ b/train.py
@@ -0,0 +1,105 @@
+import torch
+import torch.nn as nn
+import math
+from torch.utils.data import DataLoader, random_split
+from torchvision import datasets, transforms
+from torch.optim import Adam
+from torch.optim.lr_scheduler import LambdaLR
+from torch.nn.utils import clip_grad_norm_
+from mega_vit.main import MegaVit
+
+# 1. Setup and Imports
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# 2. Data Preparation
+transform = transforms.Compose([
+    transforms.Resize((224, 224)),
+    transforms.ToTensor(),
+])
+
+# Using CIFAR-10 for demonstration purposes
+cifar10 = datasets.CIFAR10(root="./data", download=True, transform=transform)
+train_size = int(0.9 * len(cifar10))
+val_size = len(cifar10) - train_size
+train_dataset, val_dataset = random_split(cifar10, [train_size, val_size])
+
+train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
+
+# 3. Model Initialization
+model = MegaVit(
+    image_size=224,
+    patch_size=14,
+    num_classes=10,  # CIFAR-10 has 10 classes
+    dim=6144,
+    depth=48,
+    heads=48,
+    mlp_dim=2048,
+    dropout=0.1,
+    emb_dropout=0.1
+).to(device)
+
+criterion = nn.CrossEntropyLoss()
+optimizer = Adam(model.parameters(), lr=0.0002)
+
+# Warm-up + cosine learning-rate schedule; stepped once per optimizer step, not per epoch
+def lr_schedule(step):
+    if step < 2500:
+        return step / 2500
+    return 0.5 * (1 + math.cos((step - 2500) / (300000 - 2500) * math.pi))
+
+scheduler = LambdaLR(optimizer, lr_schedule)
+
+# 4. Training Loop
+def train_epoch(model, loader, optimizer, criterion, device):
+    model.train()
+    total_loss = 0
+    correct = 0
+
+    for imgs, labels in loader:
+        imgs, labels = imgs.to(device), labels.to(device)
+        optimizer.zero_grad()
+
+        outputs = model(imgs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+
+        clip_grad_norm_(model.parameters(), 0.05)
+        optimizer.step()
+        scheduler.step()
+
+        total_loss += loss.item()
+        _, predicted = outputs.max(1)
+        correct += predicted.eq(labels).sum().item()
+
+    return total_loss / len(loader), correct / len(loader.dataset)
+
+def validate_epoch(model, loader, criterion, device):
+    model.eval()
+    total_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for imgs, labels in loader:
+            imgs, labels = imgs.to(device), labels.to(device)
+
+            outputs = model(imgs)
+            loss = criterion(outputs, labels)
+
+            total_loss += loss.item()
+            _, predicted = outputs.max(1)
+            correct += predicted.eq(labels).sum().item()
+
+    return total_loss / len(loader), correct / len(loader.dataset)
+
+# Number of epochs needed to reach roughly 300k optimizer steps at batch size 64
+num_epochs = (300000 * 64) // len(train_dataset)
+
+for epoch in range(num_epochs):
+    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
+    val_loss, val_acc = validate_epoch(model, val_loader, criterion, device)
+
+    print(f"Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
+
+# 5. Final Steps
+torch.save(model.state_dict(), "mega_vit_model.pth")
+print("Training finished.")
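
A quick way to sanity-check the warm-up + cosine schedule in train.py is to evaluate the factor standalone: multiplied by the base learning rate of 0.0002, it should give 0 at step 0, the peak 0.0002 at step 2500, and roughly 0 again at step 300000. This is a minimal sketch that simply restates the lr_schedule function from the patch with the same 2500-step warm-up and 300k-step horizon:

    import math

    def lr_schedule(step):
        if step < 2500:
            return step / 2500
        return 0.5 * (1 + math.cos((step - 2500) / (300000 - 2500) * math.pi))

    # Expected: 0.0, 0.0001, 0.0002, ~0.0001, ~0.0
    for step in (0, 1250, 2500, 150000, 300000):
        print(step, 0.0002 * lr_schedule(step))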