In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from tqdm import tqdm

# Load and preprocess data
data = pd.read_csv('transcript.txt', sep='\t', names=['text', 'punct'], error_bad_lines=False)

# Split data into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)


# Define custom dataset class
class PunctuationDataset(Dataset):
 def __init__(self, df, tokenizer):
 self.tokenizer = tokenizer
 self.sentences = df['text'].tolist()
 self.labels = df['punct'].fillna(0.0).tolist()
 
 def __len__(self):
 return len(self.sentences)
 
 def __getitem__(self, idx):
 sentence = self.sentences[idx]
 label = int(self.labels[idx]) # Convert label to integer
 
 # Tokenize sentence and convert to input IDs
 inputs = self.tokenizer(sentence, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
 
 # Add labels to inputs dictionary
 inputs['labels'] = torch.tensor(label, dtype=torch.long)
 
 return inputs


# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Define custom metric function for accuracy
def accuracy(preds, labels):
 preds = preds.argmax(dim=1)
 return (preds == labels).float().mean()

# Split data into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Initialize train and validation datasets and dataloaders
train_dataset = PunctuationDataset(train_data, tokenizer)
val_dataset = PunctuationDataset(val_data, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)


num_epochs = 5

for epoch in range(num_epochs):
 train_loss, train_acc = 0.0, 0.0
 
 model.train()
 for batch in tqdm(train_dataloader, desc='Training'):
 optimizer.zero_grad()
 input_ids = batch['input_ids'].squeeze(1).to(model.device)
 attention_mask = batch['attention_mask'].squeeze(1).to(model.device)
 labels = batch['labels'].to(model.device)
 outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
 loss = outputs.loss
 loss.backward()
 optimizer.step()
 train_loss += loss.item()
 train_acc += accuracy(outputs.logits, labels)
 
 train_loss /= len(train_dataloader)
 train_acc /= len(train_dataloader)
 
 val_loss, val_acc = 0.0, 0.0
 
 model.eval()
 with torch.no_grad():
 for batch in tqdm(val_dataloader, desc='Validation'):
 input_ids = batch['input_ids'].squeeze(1).to(model.device)
 attention_mask = batch['attention_mask'].squeeze(1).to(model.device)
 labels = batch['labels'].to(model.device)
 outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
 loss = outputs.loss
 val_loss += loss.item()
 val_acc += accuracy(outputs.logits, labels)
 
 val_loss /= len(val_dataloader)
 val_acc /= len(val_dataloader)

 print(f'Epoch {epoch + 1}:')
 print(f'Training Loss: {train_loss:.3f} | Training Accuracy: {train_acc:.3f}')
 print(f'Validation Loss: {val_loss:.3f} | Validation Accuracy: {val_acc:.3f}')

 # Adjust learning rate
 scheduler.step()

model.save_pretrained('roberta_model_punctuation_prediction')




 data = pd.read_csv('transcript.txt', sep='\t', names=['text', 'punct'], error_bad_lines=False)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the 

Epoch 1:
Training Loss: 1.111 | Training Accuracy: 0.200
Validation Loss: 0.943 | Validation Accuracy: 1.000


Training: 100%|██████████| 2/2 [01:03<00:00, 31.60s/it]
Validation: 100%|██████████| 1/1 [00:06<00:00, 6.87s/it]


Epoch 2:
Training Loss: 0.877 | Training Accuracy: 1.000
Validation Loss: 0.602 | Validation Accuracy: 1.000


Training: 100%|██████████| 2/2 [01:00<00:00, 30.17s/it]
Validation: 100%|██████████| 1/1 [00:07<00:00, 7.54s/it]


Epoch 3:
Training Loss: 0.428 | Training Accuracy: 1.000
Validation Loss: 0.086 | Validation Accuracy: 1.000


Training: 100%|██████████| 2/2 [00:59<00:00, 29.67s/it]
Validation: 100%|██████████| 1/1 [00:06<00:00, 6.23s/it]


Epoch 4:
Training Loss: 0.088 | Training Accuracy: 1.000
Validation Loss: 0.024 | Validation Accuracy: 1.000


Training: 100%|██████████| 2/2 [01:01<00:00, 30.70s/it]
Validation: 100%|██████████| 1/1 [00:06<00:00, 6.42s/it]


Epoch 5:
Training Loss: 0.030 | Training Accuracy: 1.000
Validation Loss: 0.009 | Validation Accuracy: 1.000
