{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "aba235f2", "metadata": { "ExecuteTime": { "end_time": "2023-09-27T18:12:18.439224Z", "start_time": "2023-09-27T18:12:12.646006Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import torch\n", "from torch import nn\n", "import torch.nn.functional as F\n", "from torch.nn import init, MarginRankingLoss\n", "from transformers import BertModel, RobertaModel\n", "from transformers import BertTokenizer, RobertaTokenizer\n", "from torch.optim import Adam\n", "from distutils.version import LooseVersion\n", "from torch.utils.data import Dataset, DataLoader\n", "from torch.utils.tensorboard import SummaryWriter\n", "from datetime import datetime\n", "from torch.autograd import Variable\n", "from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer\n", "import torch.optim as optim\n", "from torch.distributions import Categorical\n", "import random\n", "from transformers import AutoModelForMaskedLM, BertForMaskedLM, AdamW\n", "from transformers import BertTokenizer\n", "from tqdm import tqdm\n", "import matplotlib.pyplot as plt\n", "from transformers import XLMRobertaTokenizer\n", "import os\n", "import csv\n", "from sklearn.model_selection import train_test_split\n", "import nltk\n", "from collections import defaultdict\n", "from nltk.tokenize import word_tokenize\n", "from nltk import pos_tag\n", "from nltk.tokenize import word_tokenize\n", "import math\n", "from nltk.corpus import words\n", "from sklearn.model_selection import train_test_split\n", "import random\n", "import re\n", "import random" ] }, { "cell_type": "code", "execution_count": 2, "id": "ddeeea22", "metadata": { "ExecuteTime": { "end_time": "2023-09-27T18:12:18.442893Z", "start_time": "2023-09-27T18:12:18.440610Z" } }, "outputs": [], "source": [ "class MyDataset(Dataset):\n", " def __init__(self,file_name):\n", " df1 = pd.read_csv(file_name)\n", " df1 = df1[200:300]\n", " df1 = 
df1.fillna(\"\")\n", " res = df1['X'].to_numpy()\n", " self.X_list = res\n", " self.y_list = df1['y'].to_numpy()\n", " def __len__(self):\n", " return len(self.X_list)\n", " def __getitem__(self,idx):\n", " mapi = []\n", " mapi.append(self.X_list[idx])\n", " mapi.append(self.y_list[idx])\n", " return mapi" ] }, { "cell_type": "code", "execution_count": 3, "id": "dd2fe8b9", "metadata": { "ExecuteTime": { "end_time": "2023-09-27T18:12:18.466279Z", "start_time": "2023-09-27T18:12:18.443804Z" } }, "outputs": [], "source": [ "class Step1_model(nn.Module):\n", " def __init__(self, hidden_size=512):\n", "# global old_inp\n", "# global old_mhs\n", "# self.oi = old_inp\n", "# self.old_mhs = old_mhs\n", " super(Step1_model, self).__init__()\n", " self.hidden_size = hidden_size\n", "# self.model = AutoModel.from_pretrained(\"roberta-base\")\n", "# self.tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")\n", "# self.config = AutoConfig.from_pretrained(\"roberta-base\")\n", " self.model = AutoModelForMaskedLM.from_pretrained('microsoft/graphcodebert-base')\n", " self.tokenizer = AutoTokenizer.from_pretrained(\"microsoft/graphcodebert-base\")\n", " self.config = AutoConfig.from_pretrained(\"microsoft/graphcodebert-base\")\n", " self.linear_layer = nn.Linear(self.model.config.vocab_size, self.model.config.vocab_size)\n", "\n", "# self.model = AutoModelForMaskedLM.from_pretrained('bert-base-cased')\n", "# self.tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", "# self.config = AutoConfig.from_pretrained(\"bert-base-cased\")\n", " for param in self.model.base_model.parameters():\n", " param.requires_grad = True\n", " def foo (self,data):\n", " result = []\n", " if type(data) == tuple:\n", " return data[1]\n", " if type(data) == list:\n", " for inner in data:\n", " result.append(foo(inner))\n", " res = []\n", " for a in result[0]:\n", " res.append(a[:2])\n", " return res\n", " def loss_func1(self, word, y):\n", " if word =='NA':\n", " return torch.full((1,), 
fill_value=100)\n", " try:\n", " pred_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', word)\n", " target_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n", " pred_tag = self.foo(nltk.pos_tag(pred_list))\n", " target_tag = self.foo(nltk.pos_tag(target_list))\n", " str1 = ' '.join(pred_tag) # Convert lists to strings\n", " str2 = ' '.join(target_tag)\n", " distance = Levenshtein.distance(str1, str2)\n", " dist = torch.Tensor([distance])\n", " except:\n", " dist = torch.Tensor([2*len(target_list)])\n", " return dist\n", " def loss_func2(self, word, y):\n", " if word =='NA':\n", " return torch.full((1,), fill_value=100)\n", " nlp = en_core_web_sm.load()\n", " pred_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', word)\n", " target_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n", " try:\n", " str1 = ' '.join(pred_list) # Convert lists to strings\n", " str2 = ' '.join(target_list)\n", " tokens1 = nlp(str1)\n", " tokens2 = nlp(str2)\n", " # Calculate the average word embedding for each string\n", " embedding1 = sum(token.vector for token in tokens1) / len(tokens1)\n", " embedding2 = sum(token.vector for token in tokens2) / len(tokens2)\n", " # Calculate the cosine similarity between the embeddings\n", " w1= LA.norm(embedding1)\n", " w2= LA.norm(embedding2)\n", " distance = 1 - (embedding1.dot(embedding2) / (w1 * w2))\n", " dist = torch.Tensor([distance])\n", " except:\n", " dist = torch.Tensor([1])\n", " return dist\n", " def forward(self, mapi):\n", " global variable_names\n", " global base_model\n", " global tot_pll\n", " global base_tot_pll\n", " X_init1 = mapi[0]\n", " X_init = mapi[0]\n", " y = mapi[1]\n", " print(y)\n", " y_tok = self.tokenizer.encode(y)[1:-1]\n", " nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n", " lb = ' '.join(nl).lower()\n", " x = self.tokenizer.tokenize(lb)\n", " num_sub_tokens_label = len(x)\n", " X_init = X_init.replace(\"[MASK]\", 
\" \".join([self.tokenizer.mask_token] * num_sub_tokens_label))\n", " sent_pll = 0.0\n", " base_sent_pll = 0.0\n", " for m in range(num_sub_tokens_label):\n", " print(m)\n", " tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')\n", " input_id_chunki = tokens['input_ids'][0].split(510)\n", " input_id_chunks = []\n", " mask_chunks = []\n", " mask_chunki = tokens['attention_mask'][0].split(510)\n", " for tensor in input_id_chunki:\n", " input_id_chunks.append(tensor)\n", " for tensor in mask_chunki:\n", " mask_chunks.append(tensor)\n", " xi = torch.full((1,), fill_value=101)\n", " yi = torch.full((1,), fill_value=1)\n", " zi = torch.full((1,), fill_value=102)\n", " for r in range(len(input_id_chunks)):\n", " input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)\n", " input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)\n", " mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)\n", " mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)\n", " di = torch.full((1,), fill_value=0)\n", " for i in range(len(input_id_chunks)):\n", " pad_len = 512 - input_id_chunks[i].shape[0]\n", " if pad_len > 0:\n", " for p in range(pad_len):\n", " input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)\n", " mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)\n", " input_ids = torch.stack(input_id_chunks)\n", " attention_mask = torch.stack(mask_chunks)\n", " input_dict = {\n", " 'input_ids': input_ids.long(),\n", " 'attention_mask': attention_mask.int()\n", " }\n", " maski = []\n", " u = 0\n", " ad = 0\n", " for l in range(len(input_dict['input_ids'])):\n", " masked_pos = []\n", " for i in range(len(input_dict['input_ids'][l])):\n", " if input_dict['input_ids'][l][i] == 50264: #103\n", " u+=1\n", " if i != 0 and input_dict['input_ids'][l][i-1] == 50264:\n", " continue\n", " masked_pos.append(i)\n", " ad+=1\n", " maski.append(masked_pos)\n", " print('number of mask tok',u)\n", " print('number of seq', ad)\n", " with 
torch.no_grad():\n", " output = self.model(**input_dict)\n", " base_output = base_model(**input_dict)\n", " last_hidden_state = output[0].squeeze()\n", " base_last_hidden_state = base_output[0].squeeze()\n", " l_o_l_sa = []\n", " base_l_o_l_sa = []\n", " if len(maski) == 1:\n", " masked_pos = maski[0]\n", " for k in masked_pos:\n", " l_o_l_sa.append(last_hidden_state[k])\n", " base_l_o_l_sa.append(base_last_hidden_state[k])\n", " else:\n", " for p in range(len(maski)):\n", " masked_pos = maski[p]\n", " for k in masked_pos:\n", " l_o_l_sa.append(last_hidden_state[p][k])\n", " base_l_o_l_sa.append(base_last_hidden_state[p][k])\n", " sum_state = l_o_l_sa[0]\n", " base_sum_state = base_l_o_l_sa[0]\n", " for i in range(len(l_o_l_sa)):\n", " if i == 0:\n", " continue\n", " sum_state += l_o_l_sa[i]\n", " base_sum_state += base_l_o_l_sa[i]\n", " yip = len(l_o_l_sa)\n", " sum_state /= yip\n", " base_sum_state /= yip\n", " probs = F.softmax(sum_state, dim=0)\n", " base_probs = F.softmax(base_sum_state, dim=0)\n", " a_lab = y_tok[m]\n", " prob = probs[a_lab]\n", " base_prob = base_probs[a_lab]\n", " log_prob = -1*math.log(prob)\n", " base_log_prob = -1*math.log(base_prob)\n", " sent_pll+=log_prob\n", " base_sent_pll+=base_log_prob\n", " xl = X_init.split()\n", " xxl = []\n", " for p in range(len(xl)):\n", " if xl[p] == self.tokenizer.mask_token:\n", " if p != 0 and xl[p-1] == self.tokenizer.mask_token:\n", " xxl.append(xl[p])\n", " continue\n", " xxl.append(self.tokenizer.convert_ids_to_tokens(y_tok[m]))\n", " continue\n", " xxl.append(xl[p])\n", " X_init = \" \".join(xxl)\n", " sent_pll/=num_sub_tokens_label\n", " base_sent_pll/=num_sub_tokens_label\n", " print(\"Sent PLL:\")\n", " print(sent_pll)\n", " print(\"Base Sent PLL:\")\n", " print(base_sent_pll)\n", " print(\"Net % difference:\")\n", " diff = (sent_pll-base_sent_pll)*100/base_sent_pll\n", " print(diff)\n", " tot_pll += sent_pll\n", " base_tot_pll+=base_sent_pll\n", " print()\n", " print()\n", " y = 
random.choice(variable_names)\n", " print(y)\n", " X_init = X_init1\n", " y_tok = self.tokenizer.encode(y)[1:-1]\n", " nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n", " lb = ' '.join(nl).lower()\n", " x = self.tokenizer.tokenize(lb)\n", " num_sub_tokens_label = len(x)\n", " X_init = X_init.replace(\"[MASK]\", \" \".join([self.tokenizer.mask_token] * num_sub_tokens_label))\n", " sent_pll = 0.0\n", " base_sent_pll = 0.0\n", " for m in range(num_sub_tokens_label):\n", " print(m)\n", " tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')\n", " input_id_chunki = tokens['input_ids'][0].split(510)\n", " input_id_chunks = []\n", " mask_chunks = []\n", " mask_chunki = tokens['attention_mask'][0].split(510)\n", " for tensor in input_id_chunki:\n", " input_id_chunks.append(tensor)\n", " for tensor in mask_chunki:\n", " mask_chunks.append(tensor)\n", " xi = torch.full((1,), fill_value=101)\n", " yi = torch.full((1,), fill_value=1)\n", " zi = torch.full((1,), fill_value=102)\n", " for r in range(len(input_id_chunks)):\n", " input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)\n", " input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)\n", " mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)\n", " mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)\n", " di = torch.full((1,), fill_value=0)\n", " for i in range(len(input_id_chunks)):\n", " pad_len = 512 - input_id_chunks[i].shape[0]\n", " if pad_len > 0:\n", " for p in range(pad_len):\n", " input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)\n", " mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)\n", " input_ids = torch.stack(input_id_chunks)\n", " attention_mask = torch.stack(mask_chunks)\n", " input_dict = {\n", " 'input_ids': input_ids.long(),\n", " 'attention_mask': attention_mask.int()\n", " }\n", " maski = []\n", " u = 0\n", " ad = 0\n", " for l in range(len(input_dict['input_ids'])):\n", " masked_pos = []\n", " for i 
in range(len(input_dict['input_ids'][l])):\n", " if input_dict['input_ids'][l][i] == 50264: #103\n", " u+=1\n", " if i != 0 and input_dict['input_ids'][l][i-1] == 50264:\n", " continue\n", " masked_pos.append(i)\n", " ad+=1\n", " maski.append(masked_pos)\n", " print('number of mask tok',u)\n", " print('number of seq', ad)\n", " with torch.no_grad():\n", " output = self.model(**input_dict)\n", " base_output = base_model(**input_dict)\n", " last_hidden_state = output[0].squeeze()\n", " base_last_hidden_state = base_output[0].squeeze()\n", " l_o_l_sa = []\n", " base_l_o_l_sa = []\n", " if len(maski) == 1:\n", " masked_pos = maski[0]\n", " for k in masked_pos:\n", " l_o_l_sa.append(last_hidden_state[k])\n", " base_l_o_l_sa.append(base_last_hidden_state[k])\n", " else:\n", " for p in range(len(maski)):\n", " masked_pos = maski[p]\n", " for k in masked_pos:\n", " l_o_l_sa.append(last_hidden_state[p][k])\n", " base_l_o_l_sa.append(base_last_hidden_state[p][k])\n", " sum_state = l_o_l_sa[0]\n", " base_sum_state = base_l_o_l_sa[0]\n", " for i in range(len(l_o_l_sa)):\n", " if i == 0:\n", " continue\n", " sum_state += l_o_l_sa[i]\n", " base_sum_state += base_l_o_l_sa[i]\n", " yip = len(l_o_l_sa)\n", " sum_state /= yip\n", " base_sum_state /= yip\n", " probs = F.softmax(sum_state, dim=0)\n", " base_probs = F.softmax(base_sum_state, dim=0)\n", " a_lab = y_tok[m]\n", " prob = probs[a_lab]\n", " base_prob = base_probs[a_lab]\n", " log_prob = -1*math.log(prob)\n", " base_log_prob = -1*math.log(base_prob)\n", " sent_pll+=log_prob\n", " base_sent_pll+=base_log_prob\n", " xl = X_init.split()\n", " xxl = []\n", " for p in range(len(xl)):\n", " if xl[p] == self.tokenizer.mask_token:\n", " if p != 0 and xl[p-1] == self.tokenizer.mask_token:\n", " xxl.append(xl[p])\n", " continue\n", " xxl.append(self.tokenizer.convert_ids_to_tokens(y_tok[m]))\n", " continue\n", " xxl.append(xl[p])\n", " X_init = \" \".join(xxl)\n", " sent_pll/=num_sub_tokens_label\n", " 
base_sent_pll/=num_sub_tokens_label\n", " print(\"Sent PLL:\")\n", " print(sent_pll)\n", " print(\"Base Sent PLL:\")\n", " print(base_sent_pll)\n", " print(\"Net % difference:\")\n", " diff = (sent_pll-base_sent_pll)*100/base_sent_pll\n", " print(diff)\n", " print()\n", " print(\"******\")\n", " print()\n", " " ] }, { "cell_type": "code", "execution_count": 4, "id": "bc788ca0", "metadata": { "ExecuteTime": { "end_time": "2023-09-27T18:12:36.975722Z", "start_time": "2023-09-27T18:12:18.467898Z" } }, "outputs": [ { "data": { "text/plain": [ "RobertaForMaskedLM(\n", " (roberta): RobertaModel(\n", " (embeddings): RobertaEmbeddings(\n", " (word_embeddings): Embedding(50265, 768, padding_idx=1)\n", " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", " (token_type_embeddings): Embedding(1, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): RobertaEncoder(\n", " (layer): ModuleList(\n", " (0-11): 12 x RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", 
" )\n", " )\n", " )\n", " )\n", " (lm_head): RobertaLMHead(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (decoder): Linear(in_features=768, out_features=50265, bias=True)\n", " )\n", ")" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/graphcodebert-base\")\n", "model = Step1_model()\n", "model.load_state_dict(torch.load('var_runs/model_98_3'))\n", "base_model = AutoModelForMaskedLM.from_pretrained('microsoft/graphcodebert-base')\n", "model.eval()\n", "base_model.eval()" ] }, { "cell_type": "code", "execution_count": 5, "id": "f96328ce", "metadata": { "ExecuteTime": { "end_time": "2023-09-27T18:15:14.635841Z", "start_time": "2023-09-27T18:12:36.980040Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\r", " 0%| | 0/50 [00:00 512). Running this sequence through the model will result in indexing errors\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "stackBefore\n", "\n", "0\n", "number of mask tok 16\n", "number of seq 8\n", "1\n", "number of mask tok 8\n", "number of seq 8\n", "Sent PLL:\n", "3.184066466322467\n", "Base Sent PLL:\n", "3.184066466322467\n", "Net % difference:\n", "0.0\n", "\n", "\n", "distance\n", "0\n", "number of mask tok 8\n", "number of seq 8\n", "Sent PLL:\n", "22.091890736746276\n", "Base Sent PLL:\n", "22.091890736746276\n", "Net % difference:\n", "0.0\n", "\n", "******\n", "\n", "records\n", "\n", "0\n", "number of mask tok 4\n", "number of seq 2\n", "1\n", "number of mask tok 2\n", "number of seq 2\n", "Sent PLL:\n", "4.304520906089483\n", "Base Sent PLL:\n", "4.304520906089483\n", "Net % difference:\n", "0.0\n", "\n", "\n", "valueB\n", "0\n", "number of mask tok 4\n", "number of seq 2\n", "1\n", "number of mask tok 2\n", "number of seq 2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ 
"\r", " 2%|▊ | 1/50 [03:31<2:52:22, 211.08s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Sent PLL:\n", "9.457522688945344\n", "Base Sent PLL:\n", "9.457522688945344\n", "Net % difference:\n", "0.0\n", "\n", "******\n", "\n", "stackEntry\n", "\n", "0\n", "number of mask tok 30\n", "number of seq 15\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 2%|▊ | 1/50 [03:38<2:58:06, 218.09s/it]\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[6], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m l\u001b[38;5;241m.\u001b[39mappend(inputs[\u001b[38;5;241m0\u001b[39m][i])\n\u001b[1;32m 17\u001b[0m l\u001b[38;5;241m.\u001b[39mappend(inputs[\u001b[38;5;241m1\u001b[39m][i])\n\u001b[0;32m---> 18\u001b[0m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[43ml\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# X_init1 = inputs[0][i]\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# X_init = inputs[0][i]\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# y = inputs[1][i]\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;66;03m# except:\u001b[39;00m\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# continue\u001b[39;00m\n\u001b[1;32m 264\u001b[0m tot_pll\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(myDs)\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call 
forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "Cell \u001b[0;32mIn[3], line 152\u001b[0m, in \u001b[0;36mStep1_model.forward\u001b[0;34m(self, mapi)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[1;32m 151\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minput_dict)\n\u001b[0;32m--> 152\u001b[0m base_output \u001b[38;5;241m=\u001b[39m 
\u001b[43mbase_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minput_dict\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 153\u001b[0m last_hidden_state \u001b[38;5;241m=\u001b[39m output[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39msqueeze()\n\u001b[1;32m 154\u001b[0m base_last_hidden_state \u001b[38;5;241m=\u001b[39m base_output[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39msqueeze()\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:1082\u001b[0m, in \u001b[0;36mRobertaForMaskedLM.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1072\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1073\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\u001b[39;00m\n\u001b[1;32m 1074\u001b[0m \u001b[38;5;124;03m Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ...,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1078\u001b[0m \u001b[38;5;124;03m Used to hide legacy arguments that have been deprecated.\u001b[39;00m\n\u001b[1;32m 1079\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1080\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1082\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroberta\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1083\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1084\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1085\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1086\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1087\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1088\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1089\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1090\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1091\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1092\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1093\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1094\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1095\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1096\u001b[0m prediction_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(sequence_output)\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m 
\u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:844\u001b[0m, in \u001b[0;36mRobertaModel.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 835\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[1;32m 837\u001b[0m embedding_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membeddings(\n\u001b[1;32m 838\u001b[0m input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[1;32m 839\u001b[0m position_ids\u001b[38;5;241m=\u001b[39mposition_ids,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 842\u001b[0m past_key_values_length\u001b[38;5;241m=\u001b[39mpast_key_values_length,\n\u001b[1;32m 843\u001b[0m )\n\u001b[0;32m--> 844\u001b[0m encoder_outputs 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 845\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 846\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 847\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 848\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 849\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_extended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 850\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 851\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 852\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 853\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 854\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 855\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 856\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m 
encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 857\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler(sequence_output) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:529\u001b[0m, in \u001b[0;36mRobertaEncoder.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 520\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mcheckpoint\u001b[38;5;241m.\u001b[39mcheckpoint(\n\u001b[1;32m 521\u001b[0m create_custom_forward(layer_module),\n\u001b[1;32m 522\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 526\u001b[0m encoder_attention_mask,\n\u001b[1;32m 527\u001b[0m )\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 529\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 530\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 535\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 536\u001b[0m \u001b[43m 
\u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 539\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 540\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 
1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:413\u001b[0m, in \u001b[0;36mRobertaLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 401\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 402\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 403\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 410\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# decoder uni-directional self-attention cached key/values tuple is at positions 1,2\u001b[39;00m\n\u001b[1;32m 412\u001b[0m self_attn_past_key_value \u001b[38;5;241m=\u001b[39m past_key_value[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m past_key_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 413\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 414\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 415\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 416\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 417\u001b[0m \u001b[43m 
\u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 418\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mself_attn_past_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 419\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 420\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 422\u001b[0m \u001b[38;5;66;03m# if decoder, the last output is tuple of self-attn cache\u001b[39;00m\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:340\u001b[0m, in \u001b[0;36mRobertaAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 331\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 332\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 338\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 339\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[0;32m--> 340\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 341\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 342\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 343\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 344\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 345\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 346\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 347\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 349\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m 350\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m 
_global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:236\u001b[0m, in \u001b[0;36mRobertaSelfAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 233\u001b[0m past_key_value \u001b[38;5;241m=\u001b[39m (key_layer, value_layer)\n\u001b[1;32m 235\u001b[0m \u001b[38;5;66;03m# Take the dot product between \"query\" and \"key\" to get the raw attention scores.\u001b[39;00m\n\u001b[0;32m--> 236\u001b[0m attention_scores \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmatmul\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery_layer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_layer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mposition_embedding_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrelative_key\u001b[39m\u001b[38;5;124m\"\u001b[39m 
# Evaluation pass: call `model` on every (X, y) pair yielded by `loop`, then
# print the per-sentence averages of `tot_pll` and `base_tot_pll` and their
# relative difference in percent.
# NOTE(review): nothing in this cell adds to `tot_pll` / `base_tot_pll`;
# presumably the model's forward pass updates them as globals -- confirm,
# otherwise both averages stay at 0.0.
tot_pll = 0.0
base_tot_pll = 0.0

# Not referenced anywhere in this cell; presumably read as a global elsewhere
# (e.g. inside the model) -- TODO confirm before removing.
variable_names = [
    'x', 'y', 'myVariable', 'dataPoint', 'randomNumber', 'userAge', 'resultValue', 'inputValue', 'tempValue', 'indexCounter',
    'itemPrice', 'userName', 'testScore', 'acceleration', 'productCount', 'errorMargin', 'piValue', 'sensorReading',
    'currentTemperature', 'velocityVector', 'variable1', 'variable2', 'valueA', 'valueB', 'counter', 'flag', 'total',
    'average', 'valueX', 'valueY', 'valueZ', 'price', 'quantity', 'name', 'age', 'score', 'weight', 'height', 'distance',
    'time', 'radius', 'width', 'length', 'temperature', 'pressure', 'humidity', 'voltage', 'current', 'resistance'
]

# Bookkeeping so that skipped batches are reported instead of vanishing.
failed_batches = 0
first_error = None

for batch in loop:
    inputs = batch
    try:
        for i in range(len(inputs[0])):
            # The model is called with one [X, y] pair at a time.
            pair = [inputs[0][i], inputs[1][i]]
            model(pair)
    except Exception as err:
        # Best effort, as before: an error abandons the rest of this batch and
        # moves on to the next one. `Exception` (not a bare `except`) is caught
        # so that KeyboardInterrupt / SystemExit still stop the run.
        failed_batches += 1
        if first_error is None:
            first_error = err
        continue

if failed_batches:
    print(f"Skipped {failed_batches} batch(es) after an error; first error: {first_error!r}")

# Average over the dataset size; an empty dataset leaves the totals untouched
# instead of raising ZeroDivisionError.
num_sentences = len(myDs)
if num_sentences:
    tot_pll /= num_sentences
    base_tot_pll /= num_sentences

print('Total PLL per sentence: ')
print(tot_pll)
print('Total Base PLL per sentence: ')
print(base_tot_pll)
print("Net % difference average:")
# The relative difference is undefined when the baseline is zero; report NaN
# rather than crashing at the very end of a long run.
if base_tot_pll != 0:
    tot_diff = (tot_pll - base_tot_pll) * 100 / base_tot_pll
else:
    tot_diff = float("nan")
print(tot_diff)
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }