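"""Instruction-tune a pre-trained LightGPT checkpoint with LoRA adapters on the Alpaca instruction dataset."""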
import random
from argparse import ArgumentParser, BooleanOptionalAction
import torch
from torch.utils.data import DataLoader, random_split
from torch.optim import Adafactor
from torch.amp import autocast
from torch.cuda import is_available as cuda_is_available, is_bf16_supported
from torchmetrics.text import Perplexity
from model import GPT, GPTWithLoRA
from data import Alpaca
import tiktoken
from tqdm import tqdm


def main():
    parser = ArgumentParser(description="Instruction-tune the GPT.")

    parser.add_argument("--base_model_path", default="./out/checkpoint.pt", type=str)
    parser.add_argument("--batch_size", default=1, type=int)
    parser.add_argument("--gradient_accumulation_steps", default=128, type=int)
    parser.add_argument("--learning_rate", default=5e-4, type=float)
    parser.add_argument("--rms_decay", default=-0.8, type=float)
parser.add_argument("--optimizer_low_memory", default=True, type=bool)
parser.add_argument("--mask_input", default=False, type=bool)
parser.add_argument("--num_epochs", default=4, type=int)
parser.add_argument("--rank", default=8, type=int)
parser.add_argument("--alpha", default=1.0, type=float)
parser.add_argument("--dropout", default=0.05, type=float)
parser.add_argument("--activation_checkpointing", action="store_true")
parser.add_argument("--eval_interval", default=1, type=int)
parser.add_argument("--checkpoint_interval", default=1, type=int)
parser.add_argument(
"--checkpoint_path", default="./out/lora_instruction.pt", type=str
)
parser.add_argument("--resume", action="store_true")
parser.add_argument("--device", default="cuda", type=str)
parser.add_argument("--seed", default=None, type=int)
args = parser.parse_args()
if "cuda" in args.device and not cuda_is_available():
raise RuntimeError("Cuda is not available.")
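    # Allow TF32 matmuls and use bfloat16 autocast when the device supports it,
    # otherwise fall back to full float32 precision.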
    torch.set_float32_matmul_precision("high")

    dtype = (
        torch.bfloat16
        if "cuda" in args.device and is_bf16_supported()
        else torch.float32
    )

    forward_context = autocast(device_type=args.device, dtype=dtype)
    if args.seed is not None:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
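    # Load the pre-trained base checkpoint along with the tokenizer it was trained with.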
    checkpoint = torch.load(
        args.base_model_path, map_location=args.device, weights_only=True
    )

    model_args = checkpoint["model_args"]

    tokenizer = tiktoken.get_encoding(checkpoint["token_encoding"])
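    # Build the Alpaca instruction dataset, truncating samples to the model's block size;
    # `mask_input` presumably excludes the prompt portion from the training targets.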
    dataset = Alpaca(
        tokenizer,
        max_tokens_per_sample=model_args["block_size"],
        mask_input=args.mask_input,
    )
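    # Hold out 10% of the samples for evaluation.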
    training, testing = random_split(dataset, (0.9, 0.1))

    train_loader = DataLoader(
        training,
        collate_fn=dataset.collate,
        batch_size=args.batch_size,
        pin_memory="cpu" not in args.device,
        shuffle=True,
    )

    test_loader = DataLoader(
        testing,
        collate_fn=dataset.collate,
        batch_size=args.batch_size,
        pin_memory="cpu" not in args.device,
        shuffle=False,
    )
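    # The base model is compiled before loading the state dict so its parameter names
    # line up with those saved from the (compiled) pre-training run.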
    model = GPT(**model_args, activation_checkpointing=args.activation_checkpointing)

    model = torch.compile(model)

    model.load_state_dict(checkpoint["model"])

    print("Model checkpoint loaded")
    lora_args = {
        "rank": args.rank,
        "alpha": args.alpha,
        "dropout": args.dropout,
    }

    model = GPTWithLoRA(model, **lora_args).to(args.device)

    print("Compiling model")

    model.compile()
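    # Adafactor keeps a factored second-moment estimate; disabling the `foreach`
    # implementation trades some speed for a lower peak memory footprint.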
    optimizer = Adafactor(
        model.parameters(),
        lr=args.learning_rate,
        beta2_decay=args.rms_decay,
        foreach=not args.optimizer_low_memory,
    )

    starting_epoch = 1
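    # Optionally resume from a previous LoRA checkpoint saved by this script.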
    if args.resume:
        checkpoint = torch.load(
            args.checkpoint_path, map_location=args.device, weights_only=True
        )

        model.load_state_dict(checkpoint["lora"], strict=False)
        optimizer.load_state_dict(checkpoint["optimizer"])

        starting_epoch += checkpoint["epoch"]

        print("Previous checkpoint resumed successfully")
    model.train()

    print(f"Model has {model.num_trainable_params:,} trainable parameters")
    perplexity_metric = Perplexity(ignore_index=dataset.PADDING_INDEX).to(args.device)

    print("Instruction-tuning ...")
    for epoch in range(starting_epoch, args.num_epochs + 1):
        total_cross_entropy, total_batches = 0.0, 0

        for step, (x, y) in enumerate(
            tqdm(train_loader, desc=f"Epoch {epoch}", leave=False), start=1
        ):
            x = x.to(args.device, non_blocking=True)
            y = y.to(args.device, non_blocking=True)

            with forward_context:
                y_pred, loss = model(x, y)

                # Scale the loss so the accumulated gradient is an average over the window.
                scaled_loss = loss / args.gradient_accumulation_steps

            scaled_loss.backward()

            total_cross_entropy += loss.item()

            # Only update the weights once every `gradient_accumulation_steps` batches.
            if step % args.gradient_accumulation_steps == 0:
                optimizer.step()

                optimizer.zero_grad(set_to_none=True)

            total_batches += 1

        average_cross_entropy = total_cross_entropy / total_batches

        print(
            f"Epoch {epoch}: Cross Entropy: {average_cross_entropy:.5f}",
        )
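        # Periodically evaluate perplexity on the held-out split.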
        if epoch % args.eval_interval == 0:
            model.eval()

            for x, y in tqdm(test_loader, desc="Testing", leave=False):
                x = x.to(args.device, non_blocking=True)
                y = y.to(args.device, non_blocking=True)

                with torch.no_grad():
                    y_pred, _ = model(x)

                perplexity_metric.update(y_pred, y)

            perplexity = perplexity_metric.compute()

            print(f"Perplexity: {perplexity:.3f}")

            perplexity_metric.reset()

            model.train()
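        # Checkpoint the LoRA state and optimizer so training can be resumed with --resume.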
        if epoch % args.checkpoint_interval == 0:
            checkpoint = {
                "epoch": epoch,
                "lora_args": lora_args,
                "lora": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }

            torch.save(checkpoint, args.checkpoint_path)

            print("Checkpoint saved")

    print("Done!")
if __name__ == "__main__":
    main()