|
|
|
%pip install --upgrade pip |
|
%pip install torch torchdata transformers datasets loralib peft pandas numpy |
|
|
|
|
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
from torch.utils.data import Dataset, DataLoader |
|
from torch.optim import Adam |
|
import pandas as pd |
|
import torch |
|
|
|
|
|
# --- Run configuration -------------------------------------------------------

# Data: CSV handed to create_tarot_dataset, and the cap on how many of its
# leading rows are used for training.
training_file = "tarot_readings.csv"
num_rows = 500

# Model: checkpoint name passed to both from_pretrained() calls below.
model_name = "google/flan-t5-base"

# Training: number of passes over the data loader, and the device every
# training batch is moved to before the forward pass.
num_epochs = 3
device = "cpu"
|
|
|
|
|
def create_tarot_dataset(csv_file, tokenizer, num_rows=None):
    """Build a list of tokenized prompt/reading pairs from a tarot CSV file.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.
        tokenizer: Object exposing ``encode_plus`` in the Hugging Face style.
        num_rows: Optional cap on the number of leading rows to use. Any
            falsy value (``None`` or ``0``) means "use every row".

    Returns:
        A list with one dict per CSV row holding ``input_ids`` and
        ``attention_mask`` for the prompt, and ``target_ids`` and
        ``target_attention_mask`` for the reading. A CSV without data rows
        yields an empty list.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    data = pd.read_csv(csv_file)

    if num_rows:
        data = data[:num_rows]

    def encode(text):
        # Prompt and reading are padded/truncated to the same fixed length
        # (128 tokens) so examples can be stacked into batches.
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=128,
            truncation=True,
            return_tensors='pt',
        )

    def tokenize(row):
        # NOTE(review): every column name except 'Card 1' starts with a
        # space, presumably because the CSV separates fields with ", ".
        # If so, the cell values carry the same leading space and the
        # training prompt differs slightly from the one built in
        # tacot_reading -- confirm against the data file.
        prompt = "Give me a one paragraph tarot reading if I pull the cards {}, {} and {}.".format(
            row['Card 1'], row[' Card 2'], row[' Card 3']
        )
        inputs = encode(prompt)
        target = encode(row[' Reading'])

        # squeeze() drops the batch dimension added by return_tensors='pt'.
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'target_ids': target['input_ids'].squeeze(),
            'target_attention_mask': target['attention_mask'].squeeze(),
        }

    # Iterate over plain records rather than DataFrame.apply(axis=1): on a
    # frame with no rows, apply() hands back a DataFrame, which has no
    # .tolist(), so a header-only CSV used to crash with AttributeError.
    return [tokenize(row) for row in data.to_dict("records")]
|
|
|
|
|
def fine_tune_model(model, optimizer, batch, device):
    """Run a single optimisation step on one batch and return its loss.

    Args:
        model: Seq2seq model that is called with ``input_ids``,
            ``attention_mask``, ``labels`` and ``decoder_attention_mask``
            and returns an object exposing ``.loss``.
        optimizer: Optimizer that updates the model's parameters.
        batch: Mapping with tensors under ``input_ids``, ``attention_mask``,
            ``target_ids`` and ``target_attention_mask``.
        device: Device the batch tensors are moved to before the forward
            pass. The model itself is not moved here.

    Returns:
        The batch loss as a Python float.
    """
    model.train()

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    decoder_attention_mask = batch['target_attention_mask'].to(device)

    # Targets are padded to a fixed length. Replace the padded positions with
    # -100 (the ignore index of the Hugging Face seq2seq loss) so the model is
    # not trained to predict padding. masked_fill returns a new tensor, so the
    # caller's batch is left untouched.
    labels = batch['target_ids'].to(device).masked_fill(decoder_attention_mask == 0, -100)

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
        decoder_attention_mask=decoder_attention_mask,
    )
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()
|
|
|
|
|
def tacot_reading(model, tokenizer, card1, card2, card3):
    """Generate, print and return a tarot reading for three cards.

    Args:
        model: Model exposing ``generate``, ``device`` and the usual
            ``train``/``eval`` switches.
        tokenizer: Callable tokenizer that also provides ``decode``.
        card1: Name of the first card.
        card2: Name of the second card.
        card3: Name of the third card.

    Returns:
        The decoded completion, with special tokens stripped.
    """
    prompt = "Give me a one paragraph tarot reading if I pull the cards {}, {} and {}.".format(card1, card2, card3)

    # Put the inputs on the model's device so generation also works when the
    # model is not on the CPU.
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    # generate() does not change the train/eval mode. fine_tune_model leaves
    # the model in training mode, so without this switch dropout would stay
    # active during the post-training inferences. The previous mode is
    # restored afterwards so the caller sees no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        generated = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=1000)
    finally:
        model.train(was_training)

    completion = tokenizer.decode(generated[0], skip_special_tokens=True)

    print("Prompt: {}".format(prompt))
    print("Response: {}".format(completion))
    print()

    return completion
|
|
|
# --- Driver: load the model, sample it, fine-tune it, sample it again --------

# The same three spreads are used before and after training so the two sets
# of printed readings can be compared directly.
sample_spreads = [
    ("The moon", "Two of Swords", "Three of Wands"),
    ("The hermit", "Ace of Pentacles", "Judgement"),
    ("Seven of Cups", "The chariot", "King of Swords"),
]

print("* Loading model [{}]...".format(model_name))
# NOTE(review): the model is never moved to `device`; only the training
# batches are. This works because `device` is "cpu" -- revisit before
# pointing it at an accelerator.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

print("* Running 3 inferences (pre-training)...")
for spread in sample_spreads:
    tacot_reading(model, tokenizer, *spread)

print("* Creating dataset from [{}]...".format(training_file))
dataset = create_tarot_dataset(training_file, tokenizer, num_rows)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

print("* Training model for {} epochs..".format(num_epochs))
optimizer = Adam(model.parameters(), lr=1e-4)
for epoch in range(num_epochs):
    # Add up the per-batch losses, then report the mean over the batches.
    loss = sum(fine_tune_model(model, optimizer, batch, device) for batch in data_loader)
    print("Epoch {} average loss: {}".format((epoch+1), (loss / len(data_loader))))

print("* Running 3 inferences (post-training)...")
for spread in sample_spreads:
    tacot_reading(model, tokenizer, *spread)