perman2011 committed on
Commit
bbfebbe
·
1 Parent(s): fe16611

Update DistilBERT.py

Browse files
Files changed (1) hide show
  1. DistilBERT.py +75 -0
DistilBERT.py CHANGED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ import torch
3
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
4
+ from transformers import DistilBertTokenizer, DistilBertModel
5
+ import logging
6
+ logging.basicConfig(level=logging.ERROR)
7
+ import torch.nn as nn
8
+ from torch.nn import functional as F
9
+ import torch.optim as optim
10
+ import pandas as pd
11
+ import numpy as np
12
+
13
# Tunable parameters
MAX_LEN = 100  # passed to the dataset as the encoded sequence length
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-5

# Shared DistilBERT tokenizer.
# NOTE(review): `truncation` is normally an encode-time argument; passing it
# to `from_pretrained` presumably has no effect on encoding -- confirm.
tokenizer_DB = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    truncation=True,
    do_lower_case=True,
)
20
+
21
+ # Create the dataframes (train_df_DB and test_df_DB are used below but are not defined in this file)
22
+
23
+
24
# Dataset class
class BinaryLabel(Dataset):
    """Dataset that tokenizes one text row at a time.

    Each item is a dict of tensors with the keys ``'ids'``, ``'mask'``,
    ``'token_type_ids'`` (all ``torch.long``) and ``'targets'``
    (``torch.float``).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the data columns and the tokenization settings.

        Args:
            dataframe: Table exposing ``text`` and ``label`` columns as
                attributes (presumably a pandas DataFrame -- confirm
                against callers).
            tokenizer: Object providing ``encode_plus``. When ``None``, the
                module-level ``tokenizer_DB`` is used instead.
            max_len: Length passed to the tokenizer as ``max_length``;
                sequences are padded and truncated to it.
        """
        # Honor the caller's tokenizer; fall back to the module-level one so
        # existing callers that relied on the global keep working.
        self.tokenizer = tokenizer if tokenizer is not None else tokenizer_DB
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of text rows."""
        return len(self.text)

    @staticmethod
    def _at(column, position):
        """Return the element of *column* at integer *position*.

        Uses ``.iloc`` when the column provides it, so the lookup is
        positional even if the index labels are not 0..n-1 (for example
        after a train/test split). Plain sequences are indexed directly.
        """
        return getattr(column, "iloc", column)[position]

    def __getitem__(self, index):
        """Encode the row at position *index*.

        Returns:
            dict: ``'ids'``, ``'mask'``, ``'token_type_ids'`` and
            ``'targets'`` tensors for that row.
        """
        # Collapse runs of whitespace into single spaces before encoding.
        text = " ".join(str(self._at(self.text, index)).split())

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated `pad_to_max_length`:
            # every example comes back with exactly `max_len` tokens, which
            # the default batch collation needs in order to stack them.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self._at(self.targets, index), dtype=torch.float),
        }
60
+
61
# Keyword arguments forwarded to the training DataLoader.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# Keyword arguments forwarded to the testing DataLoader.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# tokenizer_DB is the only tokenizer this module defines; the bare name
# `tokenizer` does not exist at module level and would raise NameError.
# NOTE(review): train_df_DB and test_df_DB must be created before this point.
training_set = BinaryLabel(train_df_DB, tokenizer_DB, MAX_LEN)
testing_set = BinaryLabel(test_df_DB, tokenizer_DB, MAX_LEN)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)