{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d58e48d4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jshru\\AppData\\Local\\Temp\\ipykernel_9760\\2744576560.py:9: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", "\n", "\n", " data = pd.read_csv('transcript.txt', sep='\\t', names=['text', 'punct'], error_bad_lines=False)\n", "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']\n", "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "C:\\ProgramData\\anaconda3\\lib\\site-packages\\transformers\\optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "Training: 100%|██████████| 2/2 [01:02<00:00, 31.05s/it]\n", "Validation: 100%|██████████| 1/1 [00:06<00:00, 6.23s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1:\n", "Training Loss: 1.111 | Training Accuracy: 0.200\n", "Validation Loss: 0.943 | Validation Accuracy: 1.000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|██████████| 2/2 [01:03<00:00, 31.60s/it]\n", "Validation: 100%|██████████| 1/1 [00:06<00:00, 6.87s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 2:\n", "Training Loss: 0.877 | Training Accuracy: 1.000\n", "Validation Loss: 0.602 | Validation Accuracy: 1.000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|██████████| 2/2 [01:00<00:00, 30.17s/it]\n", "Validation: 100%|██████████| 1/1 [00:07<00:00, 7.54s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 3:\n", "Training Loss: 0.428 | Training Accuracy: 1.000\n", "Validation Loss: 0.086 | Validation Accuracy: 1.000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|██████████| 2/2 [00:59<00:00, 29.67s/it]\n", "Validation: 100%|██████████| 1/1 [00:06<00:00, 6.23s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 4:\n", "Training Loss: 0.088 | Training Accuracy: 1.000\n", "Validation Loss: 0.024 | Validation Accuracy: 1.000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|██████████| 2/2 [01:01<00:00, 30.70s/it]\n", "Validation: 100%|██████████| 1/1 [00:06<00:00, 6.42s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 5:\n", "Training Loss: 0.030 | Training Accuracy: 1.000\n", "Validation Loss: 0.009 | Validation Accuracy: 1.000\n" ] } ], "source": [ "import pandas as pd\n", "import torch\n", "from sklearn.model_selection import train_test_split\n", "from torch.utils.data import Dataset, DataLoader\n", "from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW\n", "from tqdm import tqdm\n", "\n", "# Load and preprocess data\n", "data = pd.read_csv('transcript.txt', sep='\\t', names=['text', 'punct'], error_bad_lines=False)\n", "\n", "# Split data into train and validation sets\n", "train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)\n", "\n", "\n", "# Define custom dataset class\n", "class PunctuationDataset(Dataset):\n", " def __init__(self, df, tokenizer):\n", " self.tokenizer = tokenizer\n", " self.sentences = df['text'].tolist()\n", " self.labels = df['punct'].fillna(0.0).tolist()\n", " \n", " def __len__(self):\n", " return len(self.sentences)\n", " \n", " def __getitem__(self, idx):\n", " sentence = self.sentences[idx]\n", " label = int(self.labels[idx]) # Convert label to integer\n", " \n", " # Tokenize sentence and convert to input IDs\n", " inputs = self.tokenizer(sentence, padding='max_length', truncation=True, max_length=512, return_tensors='pt')\n", " \n", " # Add labels to inputs dictionary\n", " inputs['labels'] = torch.tensor(label, dtype=torch.long)\n", " \n", " return inputs\n", "\n", "\n", "# Initialize tokenizer and model\n", "tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\n", "model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)\n", "\n", "# Define optimizer and learning rate scheduler\n", "optimizer = AdamW(model.parameters(), lr=5e-5)\n", "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)\n", "\n", "# Define custom metric function for accuracy\n", "def accuracy(preds, labels):\n", " preds = preds.argmax(dim=1)\n", " return (preds == labels).float().mean()\n", "\n", "# Split data into train and validation sets\n", "train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)\n", "\n", "# Initialize train and validation datasets and dataloaders\n", "train_dataset = PunctuationDataset(train_data, tokenizer)\n", "val_dataset = PunctuationDataset(val_data, tokenizer)\n", "\n", "train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=16)\n", "\n", "\n", "num_epochs = 5\n", "\n", "for epoch in range(num_epochs):\n", " train_loss, train_acc = 0.0, 0.0\n", " \n", " model.train()\n", " for batch in tqdm(train_dataloader, desc='Training'):\n", " optimizer.zero_grad()\n", " input_ids = batch['input_ids'].squeeze(1).to(model.device)\n", " attention_mask = batch['attention_mask'].squeeze(1).to(model.device)\n", " labels = batch['labels'].to(model.device)\n", " outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n", " loss = outputs.loss\n", " loss.backward()\n", " optimizer.step()\n", " train_loss += loss.item()\n", " train_acc += accuracy(outputs.logits, labels)\n", " \n", " train_loss /= len(train_dataloader)\n", " train_acc /= len(train_dataloader)\n", " \n", " val_loss, val_acc = 0.0, 0.0\n", " \n", " model.eval()\n", " with torch.no_grad():\n", " for batch in tqdm(val_dataloader, desc='Validation'):\n", " input_ids = batch['input_ids'].squeeze(1).to(model.device)\n", " attention_mask = batch['attention_mask'].squeeze(1).to(model.device)\n", " labels = batch['labels'].to(model.device)\n", " outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n", " loss = outputs.loss\n", " val_loss += loss.item()\n", " val_acc += accuracy(outputs.logits, labels)\n", " \n", " val_loss /= len(val_dataloader)\n", " val_acc /= len(val_dataloader)\n", "\n", " print(f'Epoch {epoch + 1}:')\n", " print(f'Training Loss: {train_loss:.3f} | Training Accuracy: {train_acc:.3f}')\n", " print(f'Validation Loss: {val_loss:.3f} | Validation Accuracy: {val_acc:.3f}')\n", "\n", " # Adjust learning rate\n", " scheduler.step()\n", "\n", "model.save_pretrained('roberta_model_punctuation_prediction')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3f410016", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }