{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5k3Qn8DImEGv",
        "outputId": "d9946915-5fcd-43b3-edc2-e119b15c77c8"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n",
            "Collecting transformers\n",
            "  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting datasets\n",
            "  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)\n",
            "Collecting peft\n",
            "  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.24.7)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n",
            "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n",
            "Collecting tokenizers<0.21,>=0.20 (from transformers)\n",
            "  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.5)\n",
            "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.1.0)\n",
            "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
            "  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
            "Collecting xxhash (from datasets)\n",
            "  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
            "Collecting multiprocess (from datasets)\n",
            "  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)\n",
            "Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1)\n",
            "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft) (5.9.5)\n",
            "Requirement already satisfied: torch>=1.13.0 in /usr/local/lib/python3.10/dist-packages (from peft) (2.4.1+cu121)\n",
            "Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from peft) (0.34.2)\n",
            "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
            "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.14.0)\n",
            "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft) (1.13.3)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft) (3.4)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft) (3.1.4)\n",
            "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.\n",
            "  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
            "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
            "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft) (3.0.1)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.13.0->peft) (1.3.0)\n",
            "Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.9/9.9 MB\u001b[0m \u001b[31m39.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading datasets-3.0.1-py3-none-any.whl (471 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m471.6/471.6 kB\u001b[0m \u001b[31m32.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading peft-0.13.2-py3-none-any.whl (320 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.7/320.7 kB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m18.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: xxhash, dill, multiprocess, tokenizers, transformers, peft, datasets\n",
            "  Attempting uninstall: tokenizers\n",
            "    Found existing installation: tokenizers 0.19.1\n",
            "    Uninstalling tokenizers-0.19.1:\n",
            "      Successfully uninstalled tokenizers-0.19.1\n",
            "  Attempting uninstall: transformers\n",
            "    Found existing installation: transformers 4.44.2\n",
            "    Uninstalling transformers-4.44.2:\n",
            "      Successfully uninstalled transformers-4.44.2\n",
            "Successfully installed datasets-3.0.1 dill-0.3.8 multiprocess-0.70.16 peft-0.13.2 tokenizers-0.20.1 transformers-4.45.2 xxhash-3.5.0\n"
          ]
        }
      ],
      "source": [
        "!pip install -U transformers datasets peft"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "F0rYC0S3lhUJ"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import pandas as pd\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from transformers import Trainer, TrainingArguments\n",
        "from datasets import Dataset as HFDataset\n",
        "from peft import LoraConfig, get_peft_model\n",
        "\n",
        "# Define constants\n",
        "# (note: the Trainer cell below sets its own batch size and epoch count)\n",
        "MODEL_NAME = 'google/electra-small-discriminator'\n",
        "BATCH_SIZE = 4\n",
        "EPOCHS = 3\n",
        "LEARNING_RATE = 2e-4\n",
        "MAX_LENGTH = 512"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wVfJhfyqnur3",
        "outputId": "c4f695e0-0281-43f5-b508-6c58c3971222"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Columns in /content/ScamDataNew.csv: ['Scammer', 'Label']\n",
            "Columns in /content/cleaned-data.csv: ['input', 'output']\n",
            "                                                   text  label\n",
            "0     unknown: Hello this is HUGIE Finance calling. ...    1.0\n",
            "1     unknown: Pepperfry item (Yukashi 3 Door Wardro...    0.0\n",
            "2     unknown: Act now to benefit from our unique of...    1.0\n",
            "3     unknown: It's Shoppers Stop BirthYAY & we love...    0.0\n",
            "4     unknown: Hello I'm calling from MUTHOOT Financ...    1.0\n",
            "...                                                 ...    ...\n",
            "4433  unknown: did you check the email i sent yester...    0.0\n",
            "4434  unknown: Cant wait to see you this weekend, so...    0.0\n",
            "4435  unknown: I think we should leave earlier, traf...    0.0\n",
            "4436  unknown: forgot to bring the umbrella, it's ra...    0.0\n",
            "4437  unknown: is there anything else you need from the    0.0\n",
            "\n",
            "[4438 rows x 2 columns]\n"
          ]
        }
      ],
      "source": [
        "import pandas as pd\n",
        "\n",
        "# List of file paths to the CSV files\n",
        "csv_files = [\n",
        "    '/content/ScamDataNew.csv',\n",
        "    '/content/cleaned-data.csv',\n",
        "]\n",
        "\n",
        "# Function to load a CSV file and extract two columns under standardized names\n",
        "def load_and_select_columns(file_path, text_col, label_col):\n",
        "    df = pd.read_csv(file_path)\n",
        "    print(f\"Columns in {file_path}: {df.columns.tolist()}\")\n",
        "    selected_df = df[[text_col, label_col]].copy()  # Select the two columns\n",
        "    selected_df.columns = ['text', 'label']  # Standardize column names\n",
        "    return selected_df\n",
        "\n",
        "# Load each CSV and extract the relevant columns\n",
        "df1 = load_and_select_columns(csv_files[0], 'Scammer', 'Label')\n",
        "df1['text'] = 'unknown: ' + df1['text']  # prefix a speaker tag to match the other sources\n",
        "df2 = load_and_select_columns(csv_files[1], 'input', 'output')\n",
        "\n",
        "# The Excel source uses different column names, so select and rename them directly\n",
        "df4 = pd.read_excel(\"/content/Old+Improved data.xlsx\")\n",
        "df4 = df4[['content', 'is scam']].copy()\n",
        "df4.columns = ['text', 'label']\n",
        "df4['text'] = 'unknown: ' + df4['text']\n",
        "\n",
        "# Concatenate the selected columns from all files\n",
        "combined_df = pd.concat([df1, df2, df4], ignore_index=True)\n",
        "\n",
        "# Display the combined DataFrame\n",
        "print(combined_df)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Drop rows with missing values, cast labels to int, and lowercase the text\n",
        "combined_df.dropna(inplace=True)\n",
        "combined_df['label'] = combined_df['label'].astype(int)\n",
        "combined_df['text'] = combined_df['text'].str.lower()\n",
        "\n",
        "# Display the cleaned DataFrame\n",
        "print(combined_df)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZXcCFIgM08Bp",
        "outputId": "898a7392-0b33-40f6-b274-d01989facd41"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "                                                   text  label\n",
            "0     unknown: hello this is hugie finance calling. ...      1\n",
            "1     unknown: pepperfry item (yukashi 3 door wardro...      0\n",
            "2     unknown: act now to benefit from our unique of...      1\n",
            "3     unknown: it's shoppers stop birthyay & we love...      0\n",
            "4     unknown: hello i'm calling from muthoot financ...      1\n",
            "...                                                 ...    ...\n",
            "4433  unknown: did you check the email i sent yester...      0\n",
            "4434  unknown: cant wait to see you this weekend, so...      0\n",
            "4435  unknown: i think we should leave earlier, traf...      0\n",
            "4436  unknown: forgot to bring the umbrella, it's ra...      0\n",
            "4437  unknown: is there anything else you need from the      0\n",
            "\n",
            "[4437 rows x 2 columns]\n"
          ]
        }
      ]
    },
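    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Added sanity check (a sketch, not part of the original run): inspect the class\n",
        "# balance before training, since a heavily skewed scam/ham split would make plain\n",
        "# accuracy a misleading metric for this binary classifier.\n",
        "print(combined_df['label'].value_counts())\n",
        "print(combined_df['label'].value_counts(normalize=True))"
      ]
    },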
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "id": "0M-Psc9XlwCx"
      },
      "outputs": [],
      "source": [
        "combined_df.to_csv('cleaned-data-version2-with-user-unknown.csv', index=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "id": "9HEql8ZemQ8V"
      },
      "outputs": [],
      "source": [
        "def load_data_from_csv():\n",
        "    # Reads from the in-memory combined_df built above (not from disk)\n",
        "    texts = combined_df['text'].tolist()\n",
        "    label = combined_df['label'].tolist()\n",
        "    # Encode labels as contiguous integers (already 0/1 here, but robust to other encodings)\n",
        "    le = LabelEncoder()\n",
        "    label = le.fit_transform(label)\n",
        "    return texts, label\n",
        "\n",
        "import pandas as pd\n",
        "from datasets import Dataset as HFDataset\n",
        "import torch\n",
        "\n",
        "def preprocess_data(texts, label, tokenizer, max_length):\n",
        "    # Tokenize the input texts\n",
        "    encodings = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')\n",
        "\n",
        "    # Convert PyTorch tensors to lists\n",
        "    input_ids = encodings['input_ids'].tolist()\n",
        "    attention_mask = encodings['attention_mask'].tolist()\n",
        "    token_type_ids = encodings['token_type_ids'].tolist() if 'token_type_ids' in encodings else None\n",
        "\n",
        "    # Ensure labels are also in list format\n",
        "    if isinstance(label, torch.Tensor):\n",
        "        label = label.tolist()\n",
        "\n",
        "    # Create a dictionary for the dataset\n",
        "    dataset_dict = {\n",
        "        'input_ids': input_ids,\n",
        "        'attention_mask': attention_mask,\n",
        "        'labels': label\n",
        "    }\n",
        "    # Only include token_type_ids when the tokenizer actually produces them\n",
        "    # (avoids a column of nulls for models without segment embeddings)\n",
        "    if token_type_ids is not None:\n",
        "        dataset_dict['token_type_ids'] = token_type_ids\n",
        "\n",
        "    # Convert the dictionary to a Pandas DataFrame\n",
        "    df = pd.DataFrame(dataset_dict)\n",
        "\n",
        "    # Convert the DataFrame to a Hugging Face Dataset\n",
        "    dataset = HFDataset.from_pandas(df)\n",
        "\n",
        "    print(dataset)\n",
        "    return dataset"
      ]
    },
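    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Alternative sketch (not used in the original run): tokenize with Dataset.map instead\n",
        "# of materializing every encoding as Python lists through a DataFrame. This keeps\n",
        "# memory flat for larger corpora. 'tok' is assumed to be the tokenizer loaded in the\n",
        "# next cell, and max_length corresponds to MAX_LENGTH above.\n",
        "def build_dataset_with_map(texts, labels, tok, max_length):\n",
        "    ds = HFDataset.from_dict({'text': texts, 'labels': labels})\n",
        "    ds = ds.map(\n",
        "        lambda batch: tok(batch['text'], padding='max_length', truncation=True, max_length=max_length),\n",
        "        batched=True,\n",
        "        remove_columns=['text'],\n",
        "    )\n",
        "    return ds\n",
        "\n",
        "# Usage would mirror preprocess_data, e.g.:\n",
        "# train_dataset = build_dataset_with_map(train_texts, train_label, tokenizer, MAX_LENGTH)"
      ]
    },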
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6VVDZ_WAo9o5",
        "outputId": "897c1876-8636-4079-98ca-d002eeb997c7"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "13549314\n",
            "Dataset({\n",
            "    features: ['input_ids', 'attention_mask', 'token_type_ids', 'labels'],\n",
            "    num_rows: 3940\n",
            "})\n",
            "Dataset({\n",
            "    features: ['input_ids', 'attention_mask', 'token_type_ids', 'labels'],\n",
            "    num_rows: 986\n",
            "})\n"
          ]
        }
      ],
      "source": [
        "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
        "# Load the base model with a freshly initialized 2-way classification head\n",
        "model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)\n",
        "\n",
        "def count_trainable_parameters(model):\n",
        "    return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
        "print(count_trainable_parameters(model))\n",
        "\n",
        "# LoRA config: defined here but never applied in this run (the get_peft_model call\n",
        "# below stayed commented out), so the full ~13.5M-parameter model is fine-tuned.\n",
        "lora_config = LoraConfig(\n",
        "    r=8,  # Rank of the low-rank matrices\n",
        "    lora_alpha=16,  # Alpha for the LoRA scaling\n",
        "    lora_dropout=0.1  # Dropout for LoRA layers\n",
        ")\n",
        "# model = get_peft_model(model, lora_config)  # uncomment to train LoRA adapters instead\n",
        "\n",
        "# Load and preprocess data\n",
        "texts, label = load_data_from_csv()  # reads the in-memory combined_df\n",
        "train_texts, val_texts, train_label, val_label = train_test_split(texts, label, test_size=0.2, random_state=42)\n",
        "\n",
        "train_dataset = preprocess_data(train_texts, train_label, tokenizer, MAX_LENGTH)\n",
        "val_dataset = preprocess_data(val_texts, val_label, tokenizer, MAX_LENGTH)"
      ]
    },
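    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch of actually applying the LoRA adapter (the run above left it commented out,\n",
        "# so all 13.5M parameters stayed trainable). For sequence classification PEFT expects\n",
        "# task_type=TaskType.SEQ_CLS, which keeps the classifier head trainable alongside the\n",
        "# adapters. target_modules=['query', 'value'] is an assumption matching ELECTRA's\n",
        "# BERT-style attention projection names, not something fixed by the original notebook.\n",
        "from peft import TaskType\n",
        "\n",
        "lora_config_seq_cls = LoraConfig(\n",
        "    task_type=TaskType.SEQ_CLS,\n",
        "    r=8,\n",
        "    lora_alpha=16,\n",
        "    lora_dropout=0.1,\n",
        "    target_modules=['query', 'value'],\n",
        ")\n",
        "# peft_model = get_peft_model(model, lora_config_seq_cls)\n",
        "# peft_model.print_trainable_parameters()  # expect well under 1M trainable parameters"
      ]
    },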
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "id": "l7FiPtRFr9ma",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "de83f7ea-0c56-4695-fe73-5ffffc8ca0cc"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "13549314\n"
          ]
        }
      ],
      "source": [
        "print(count_trainable_parameters(model))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 422
        },
        "id": "8qZSOElhsDsG",
        "outputId": "46e7b2c1-6ba8-4fb4-c083-7282feab6194"
      },
      "outputs": [
        {
          "output_type": "error",
          "ename": "RuntimeError",
          "evalue": "CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-14-3e927ad3458e>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mempty_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m training_args = TrainingArguments(\n\u001b[1;32m      3\u001b[0m         \u001b[0moutput_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'./results'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0meval_strategy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'epoch'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0mlearning_rate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mLEARNING_RATE\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/cuda/memory.py\u001b[0m in \u001b[0;36mempty_cache\u001b[0;34m()\u001b[0m\n\u001b[1;32m    168\u001b[0m     \"\"\"\n\u001b[1;32m    169\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mis_initialized\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 170\u001b[0;31m         \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cuda_emptyCache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    171\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    172\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mRuntimeError\u001b[0m: CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n"
          ]
        }
      ],
      "source": [
        "torch.cuda.empty_cache()\n",
        "training_args = TrainingArguments(\n",
        "        output_dir='./results',\n",
        "        eval_strategy='epoch',\n",
        "        learning_rate=LEARNING_RATE,\n",
        "        per_device_train_batch_size=16,\n",
        "        per_device_eval_batch_size=16,\n",
        "        num_train_epochs=6,\n",
        "        weight_decay=0.001,\n",
        "        logging_dir='./logs',\n",
        "        logging_steps=1,\n",
        "        remove_unused_columns=False)\n",
        "\n",
        "trainer = Trainer(\n",
        "        model=model,\n",
        "        args=training_args,\n",
        "        train_dataset=train_dataset,\n",
        "        eval_dataset=val_dataset,\n",
        ")\n",
        "\n",
        "trainer.train()\n",
        "trainer.evaluate()\n",
        "#trainer.save_model('./final_model')"
      ]
    },
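    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Debugging sketch (added): 'device-side assert triggered' is reported asynchronously,\n",
        "# so empty_cache() above is merely where the error surfaced, not where it occurred.\n",
        "# A classic cause with classification heads is a label outside [0, num_labels).\n",
        "# The checks below help narrow that down; for an exact stack trace, set\n",
        "# CUDA_LAUNCH_BLOCKING=1 before CUDA is first initialized (or rerun on CPU).\n",
        "import os\n",
        "os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # only effective before the first CUDA call\n",
        "\n",
        "print('num_labels in model config:', model.config.num_labels)\n",
        "print('label values in data:', sorted(set(train_dataset['labels'])))"
      ]
    },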
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SHjymNkXuwEY"
      },
      "outputs": [],
      "source": [
        "# from huggingface_hub import notebook_login\n",
        "# notebook_login()\n",
        "# repo_name = \"AiisNothing/electra-discriminator-trained-merged-dataset-version1\"\n",
        "# model = model.merge_and_unload()\n",
        "\n",
        "# model.push_to_hub(repo_name)\n",
        "# tokenizer.push_to_hub(repo_name)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "model.save_pretrained('/content/final_model')\n",
        "tokenizer.save_pretrained('/content/final_model')"
      ],
      "metadata": {
        "id": "RfSas7HWwNfG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "POq_UyFZw-mS"
      },
      "outputs": [],
      "source": [
        "# Post-inference cleanup: clear cached GPU memory. The original cell also deleted\n",
        "# tokenized_inputs/outputs/logits, but those names are not defined at this point\n",
        "# (the evaluation loop below frees its per-iteration tensors itself), so the del\n",
        "# would have raised a NameError.\n",
        "torch.cuda.empty_cache()  # Clear unused memory\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "s4p4Lv0Ry_J5"
      },
      "outputs": [],
      "source": [
        "from datasets import load_dataset\n",
        "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
        "import torch\n",
        "from sklearn.metrics import accuracy_score\n",
        "\n",
        "# Load the dataset from Hugging Face Hub (test split)\n",
        "dataset = load_dataset(\"AiisNothing/test_data\", split=\"test\")\n",
        "\n",
        "# Load the tokenizer and the model from the Hugging Face model repository\n",
        "repo_name = \"AiisNothing/electra-discriminator-trained-merged-dataset-version1\"  # the model repo pushed above\n",
        "tokenizer = AutoTokenizer.from_pretrained(repo_name)\n",
        "model = AutoModelForSequenceClassification.from_pretrained(repo_name)\n",
        "\n",
        "# Move model to GPU if available and set to eval mode\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "model.to(device)\n",
        "model.eval()  # Set the model to evaluation mode\n",
        "\n",
        "# Prepare inputs from the dataset (assuming the 'dialogue' column contains the text and 'label' contains the labels)\n",
        "inputs = dataset['dialogue']\n",
        "true_labels = dataset['label']\n",
        "\n",
        "predicted_labels = []\n",
        "\n",
        "# Process each input one by one\n",
        "for i in range(len(inputs)):\n",
        "    # Get the current input\n",
        "    current_input = inputs[i]\n",
        "\n",
        "    # Tokenize the input (note: max_length=256 here vs MAX_LENGTH=512 used at training time)\n",
        "    tokenized_input = tokenizer(current_input, padding=True, truncation=True, return_tensors=\"pt\", max_length=256)\n",
        "\n",
        "    # Move the tokenized input to GPU\n",
        "    tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}\n",
        "\n",
        "    # Perform inference (disable gradients for faster evaluation)\n",
        "    with torch.no_grad():\n",
        "        outputs = model(**tokenized_input)\n",
        "\n",
        "    # Get the logits (raw predictions)\n",
        "    logits = outputs.logits\n",
        "\n",
        "    # Convert logits to predicted class (using argmax)\n",
        "    predicted_labels.append(torch.argmax(logits, dim=-1).cpu().item())  # Use .item() to get a Python number\n",
        "\n",
        "    # Clear GPU memory\n",
        "    del tokenized_input, outputs, logits\n",
        "    torch.cuda.empty_cache()  # Clear unused memory\n",
        "\n",
        "# Calculate accuracy\n",
        "accuracy = accuracy_score(true_labels, predicted_labels)\n",
        "\n",
        "# Report accuracy\n",
        "print(f\"Model Accuracy on Test Split: {accuracy * 100:.2f}%\")\n"
      ]
    },
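    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Optional sketch: the loop above runs one example at a time, which is simple but slow\n",
        "# on GPU. Batching the tokenization and forward passes is usually much faster;\n",
        "# batch_size=32 is an arbitrary assumption, tune it to the available GPU memory.\n",
        "batch_size = 32\n",
        "batched_preds = []\n",
        "for start in range(0, len(inputs), batch_size):\n",
        "    batch = inputs[start:start + batch_size]\n",
        "    enc = tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors=\"pt\")\n",
        "    enc = {k: v.to(device) for k, v in enc.items()}\n",
        "    with torch.no_grad():\n",
        "        logits = model(**enc).logits\n",
        "    batched_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())\n",
        "\n",
        "print(f\"Batched accuracy: {accuracy_score(true_labels, batched_preds) * 100:.2f}%\")"
      ]
    },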
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9qSC8A70vGJ8"
      },
      "outputs": [],
      "source": [
        "accuracy"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "I3XtYBPa0UVE"
      },
      "outputs": [],
      "source": [
        "!pip install optimum[exporters]"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from optimum.onnxruntime import ORTModelForSequenceClassification\n",
        "from transformers import AutoTokenizer\n",
        "from onnxruntime.quantization import quantize_dynamic, QuantType\n",
        "\n",
        "model_checkpoint = \"\"  # left blank in the original; e.g. the '/content/final_model' directory saved above\n",
        "save_directory = \"\"  # left blank in the original; directory for the exported ONNX files\n",
        "\n",
        "# Load a model from transformers and export it to ONNX\n",
        "ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
        "\n",
        "# Save the ONNX model and tokenizer\n",
        "ort_model.save_pretrained(save_directory)\n",
        "tokenizer.save_pretrained(save_directory)\n",
        "\n",
        "# Quantize the exported ONNX model to 8-bit\n",
        "onnx_model_path = f\"{save_directory}/model.onnx\"\n",
        "quantized_model_path = f\"{save_directory}/model-quantized.onnx\"\n",
        "\n",
        "# Apply dynamic quantization\n",
        "quantize_dynamic(\n",
        "    model_input=onnx_model_path,\n",
        "    model_output=quantized_model_path,\n",
        "    weight_type=QuantType.QUInt8  # Quantize weights to 8-bit\n",
        ")\n",
        "\n",
        "print(f\"Quantized model saved to: {quantized_model_path}\")"
      ],
      "metadata": {
        "id": "PFWPfabCwCZe"
      },
      "execution_count": null,
      "outputs": []
    }
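    ,
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Usage sketch (added): run the quantized ONNX model directly with onnxruntime.\n",
        "# Reading the expected input names from the session keeps this robust to whether\n",
        "# the exporter included token_type_ids; the example sentence is made up.\n",
        "import numpy as np\n",
        "import onnxruntime as ort\n",
        "\n",
        "session = ort.InferenceSession(quantized_model_path)\n",
        "enc = tokenizer(\"unknown: your account has been suspended, verify now\", return_tensors=\"np\")\n",
        "expected = {inp.name for inp in session.get_inputs()}\n",
        "feed = {k: v.astype(np.int64) for k, v in enc.items() if k in expected}\n",
        "logits = session.run(None, feed)[0]\n",
        "print(\"predicted label:\", int(np.argmax(logits, axis=-1)[0]))"
      ]
    }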
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}