# %% Imports and shared tokenizer
import math
import re

import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer

# Load the tokenizer exactly once; the parsing cell below reuses this instance
# (the original notebook reloaded the same checkpoint in two consecutive cells).
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")


# %% Parse store.txt into parallel lists
def _tensor_value(text):
    """Return the first scalar inside a printed ``tensor(...)`` as a float.

    Accepts both ``tensor(1.23, ...)`` and ``tensor(1.23)``; the original
    split-on-comma parsing only handled the former.

    Raises:
        ValueError: if ``text`` contains no ``tensor(`` prefix or the captured
            token is not a valid float.
    """
    match = re.search(r"tensor\(\s*([^,)\s]+)", text)
    if match is None:
        raise ValueError(f"no tensor(...) value found in: {text!r}")
    return float(match.group(1))


def _value_after(lines, index):
    """Parse the tensor printed on the line that follows ``lines[index]``.

    Raises:
        ValueError: if ``lines[index]`` is the last line, i.e. the label has
            no value line after it (the original raised a bare IndexError).
    """
    if index + 1 >= len(lines):
        raise ValueError(
            f"line {index + 1} is a metric label but the file ends before its tensor value"
        )
    return _tensor_value(lines[index + 1].strip())


# Parallel lists: position k in every list belongs to the same record.
groundTruth = []
mockTruth = []
groundDifference = []
mockDifference = []
ntok = []
basePLL = []
sentPLL = []

# store.txt is generated using the output from 2.2-all.py
with open('store.txt', 'r') as file:
    lines = file.readlines()

# Most recently seen value for each field. These are deliberately NOT reset
# after a record is emitted (same as the original), so a record with a missing
# field inherits the previous record's value.
# NOTE(review): confirm this carry-over is intended rather than accidental.
current_ground_truth = ""
current_mock_truth = ""
current_ground_difference = None
current_mock_difference = None
current_sent_pll = None
current_base_pll = None

for i, raw_line in enumerate(lines):
    line = raw_line.strip()

    # "Ground truth:" may appear anywhere in the line (substring match),
    # unlike the other labels, which must start the line.
    if "Ground truth:" in line:
        current_ground_truth = line.split("Ground truth:")[1].strip()
    elif line.startswith("Mock truth:"):
        current_mock_truth = line.split("Mock truth:")[1].strip()
    # For the metric labels below, the value is a printed tensor on the NEXT line.
    elif line.startswith("Sent PLL:"):
        current_sent_pll = _value_after(lines, i)
    elif line.startswith("Base Sent PLL:"):
        current_base_pll = _value_after(lines, i)
    elif line.startswith("Net % difference:"):
        current_ground_difference = _value_after(lines, i)
    elif line.startswith("Mock Net % difference:"):
        current_mock_difference = _value_after(lines, i)
        # "Mock Net % difference:" is treated as the end of a record:
        # flush the current values into the parallel lists.
        groundTruth.append(current_ground_truth)
        mockTruth.append(current_mock_truth)
        groundDifference.append(current_ground_difference)
        mockDifference.append(current_mock_difference)
        sentPLL.append(current_sent_pll)
        basePLL.append(current_base_pll)
        # Number of tokens the tokenizer produces for the ground-truth string.
        ntok.append(len(tokenizer.tokenize(current_ground_truth)))

# All seven lists are now populated with the extracted data.


# %% Assemble the DataFrame and persist it
data = {
    'GroundTruth': groundTruth,
    'MockTruth': mockTruth,
    'GroundDifference': groundDifference,
    'MockDifference': mockDifference,
    'NumTokens': ntok,
    'SentPLL': sentPLL,
    'BasePLL': basePLL,
}

# One row per parsed record; written without the index column.
df = pd.DataFrame(data)
df.to_csv('testRes.csv', index=False)
"metadata": { "ExecuteTime": { "end_time": "2023-10-20T02:20:17.361018Z", "start_time": "2023-10-20T02:20:17.333142Z" }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | GroundTruth | \n", "MockTruth | \n", "GroundDifference | \n", "MockDifference | \n", "NumTokens | \n", "SentPLL | \n", "BasePLL | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "deobfOn | \n", "customerOrderShippingAddress | \n", "-72.2109 | \n", "29.5661 | \n", "4 | \n", "2.3904 | \n", "8.6019 | \n", "
1 | \n", "actionContextRegistry | \n", "employeePayrollSystem | \n", "-99.8300 | \n", "-0.8143 | \n", "4 | \n", "0.0258 | \n", "15.1701 | \n", "
2 | \n", "workerPoolsBuilder | \n", "productInventoryItemCount | \n", "-80.1683 | \n", "26.2351 | \n", "4 | \n", "2.2525 | \n", "11.3580 | \n", "
3 | \n", "fieldAccessors | \n", "salesRevenue | \n", "-86.7385 | \n", "-8.3067 | \n", "3 | \n", "1.1829 | \n", "8.9195 | \n", "
4 | \n", "resolutionStrategy | \n", "databaseConnectionPoolSize | \n", "-86.4440 | \n", "12.9442 | \n", "3 | \n", "1.9661 | \n", "14.5034 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
222 | \n", "messageToken | \n", "manufacturingDate | \n", "-66.7565 | \n", "24.5230 | \n", "2 | \n", "3.3962 | \n", "10.2160 | \n", "
223 | \n", "active | \n", "connectionPool | \n", "-96.2424 | \n", "-26.3222 | \n", "1 | \n", "0.5467 | \n", "14.5502 | \n", "
224 | \n", "dotsAndXs | \n", "transactionProcessingStatusFlag | \n", "-46.2719 | \n", "-19.9363 | \n", "5 | \n", "5.9572 | \n", "11.0876 | \n", "
225 | \n", "engine | \n", "inventoryItem | \n", "-99.9956 | \n", "12.3351 | \n", "1 | \n", "0.0007 | \n", "14.8146 | \n", "
226 | \n", "myresult | \n", "customerAddress | \n", "-44.7804 | \n", "-21.6712 | \n", "2 | \n", "8.7443 | \n", "15.8354 | \n", "
227 rows × 7 columns
\n", "