|
|
|
Viviane da Silva Dilly |
|
|
|
In this notebook, I completed parts 2 and 3: fine-tuning an LLM (I chose GPT-2) on the Enron email dataset from Kaggle. |
|
|
|
* Fine-tune a Language Model on the Enron dataset |
|
* Create a Gradio interface that answers questions related to the case, and deploy it in a Hugging Face Space |
|
|
|
As I mention in the code below, after many days of trying to use the whole dataset and having my code crash after long hours of waiting, I decided to use a sample. |
|
|
|
|
|
|
|
!pip install transformers pandas torch |
|
|
|
import torch |
|
import pandas as pd |
|
|
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments |
|
from google.colab import drive |
|
# Mount Google Drive so the Kaggle CSV stored there is reachable from Colab.
drive.mount('/content/drive')

# Full Enron email dump; the code below reads its 'message' column.
enron_data = pd.read_csv('/content/drive/MyDrive/Mestrado/emails.csv')

# Work on a random subset: training on the full dataset kept crashing after
# hours of waiting, so only `sample_size` emails are used.
sample_size = 10000

# - min(...) keeps the call valid when the CSV holds fewer rows than
#   sample_size (sampling without replacement cannot return more rows than
#   exist).
# - random_state pins the draw so the saved sample and the fine-tuned model
#   are reproducible across runs.
sample_enron_data = enron_data.sample(
    n=min(sample_size, len(enron_data)),
    random_state=42,
)

# Persist the subset so the exact training data can be inspected later.
sample_enron_data.to_csv("sample_enron_dataset.csv", index=False)

# Explicit prints: bare expressions only display when they are the last
# statement of a notebook cell, and never when run as a script.
print(sample_enron_data.head())
print(len(sample_enron_data))
|
|
|
|
|
# Build the training tensor `input_ids` of shape (num_blocks, BLOCK_SIZE).
#
# Every email is tokenized on its own, the token ids are concatenated with an
# end-of-text token between emails, and the resulting stream is cut into
# fixed-length blocks. Tokenizing the joined corpus as ONE string with
# truncation to 512 tokens would keep only the first 512 tokens of the whole
# sample, i.e. a single training example.
BLOCK_SIZE = 512

# Skip missing messages and force str so tokenization cannot fail on NaN.
messages = sample_enron_data['message'].dropna().astype(str).tolist()

# Raw corpus as a single string (name kept for backward compatibility).
text = "\n".join(messages)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 ships without a padding token. Reusing EOS keeps the vocabulary the
# same size as the model's embedding matrix; registering a brand-new '[PAD]'
# token would produce an id the un-resized model cannot embed.
tokenizer.pad_token = tokenizer.eos_token

# No truncation here: long emails are split across blocks below instead of
# being cut off.
encoded_messages = tokenizer(messages)['input_ids'] if messages else []

token_stream = []
for message_ids in encoded_messages:
    token_stream.extend(message_ids)
    # Mark the boundary between two emails.
    token_stream.append(tokenizer.eos_token_id)

if not token_stream:
    raise ValueError("No email text available to tokenize.")

num_blocks = len(token_stream) // BLOCK_SIZE
if num_blocks == 0:
    # Less than one full block of text: keep it as a single shorter example.
    input_ids = torch.tensor([token_stream], dtype=torch.long)
else:
    # Drop the incomplete tail so every row has exactly BLOCK_SIZE tokens and
    # no padding is needed.
    usable = num_blocks * BLOCK_SIZE
    input_ids = torch.tensor(token_stream[:usable], dtype=torch.long).view(num_blocks, BLOCK_SIZE)
|
|
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup |
|
from torch.utils.data import Dataset, DataLoader |
|
from tqdm import tqdm |
|
|
|
|
|
class EmailDataset(Dataset):
    """Map-style dataset that serves entries of a pre-tokenized collection.

    The wrapped object only needs to support ``len()`` and integer indexing;
    item ``idx`` is whatever ``input_ids[idx]`` returns.
    """

    def __init__(self, input_ids):
        """Store the tokenized collection without copying it."""
        self.input_ids = input_ids

    def __len__(self):
        """Return the number of entries in the wrapped collection."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the entry at position ``idx``."""
        return self.input_ids[idx]
|
|
|
# ---- Fine-tune GPT-2 on the tokenized emails and save the result ----------

dataset = EmailDataset(input_ids)

model = GPT2LMHeadModel.from_pretrained('gpt2')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
num_epochs = 3

# torch.optim.AdamW replaces the deprecated AdamW class from transformers.
# weight_decay=0.0 keeps the regularization of the optimizer it replaces
# (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# The scheduler is stepped once per batch, so the decay horizon must be the
# total number of optimizer steps (batches per epoch * epochs), not the number
# of samples in the dataset.
total_training_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

for epoch in range(num_epochs):
    epoch_loss = 0.0
    steps = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        batch = batch.to(device)

        # Causal LM objective: the inputs double as labels.
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        steps += 1

    # max(..., 1) avoids a ZeroDivisionError if the loader yielded no batch.
    print(f"Epoch {epoch + 1} - Average Loss: {epoch_loss / max(steps, 1)}")

# Only the model weights/config are written; the Gradio section reloads the
# stock 'gpt2' tokenizer.
model.save_pretrained("./fine_tuned_model")
|
|
|
|
|
|
|
Now, having fine-tuned the model, I proceed to create the Gradio interface. |
|
|
|
|
|
!pip install gradio |
|
import gradio as gr |
|
|
|
|
|
# Inference setup: the vocabulary comes from the stock 'gpt2' tokenizer, and
# the weights are read back from the ./fine_tuned_model directory.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model_fine_tuned = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
|
|
|
|
|
def generate_response(question):
    """Generate a text continuation for ``question`` with the fine-tuned model.

    Args:
        question: Prompt text typed by the user.

    Returns:
        The decoded model output (prompt followed by the generated
        continuation), or a short hint when ``question`` is empty or only
        whitespace.
    """
    # An empty prompt encodes to zero tokens, which generate() cannot extend.
    if not question or not question.strip():
        return "Please enter a question."

    max_length = 200
    # Cap the prompt so there is always room left to generate new tokens
    # within max_length.
    max_prompt_tokens = max_length // 2

    encoded = tokenizer(
        question,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    )

    output = model_fine_tuned.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        # temperature only takes effect when sampling is enabled; without
        # do_sample=True decoding is greedy and the value is ignored.
        do_sample=True,
        temperature=0.7,
        # GPT-2 has no pad token; use EOS so generate() does not have to guess.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)
|
|
|
|
|
# Wire generate_response to a text-in / text-out Gradio UI and start serving.
demo = gr.Interface(
    fn=generate_response,
    inputs="textbox",
    outputs="textbox",
    title="Ask Enron Dataset",
    description="Enter a question about the case",
)
demo.launch()
|
|
|
|
|
|
|
(You can also find the link at the top of this notebook.) |
|
|
|
|