Spaces:

perman2011
/

DistilBERT-Sentiment-Analysis

Sleeping

App Files Files Community

perman2011 committed on Dec 22, 2023

Commit

bbfebbe

1 Parent(s): fe16611

Update DistilBERT.py

Browse files

Files changed (1) hide show

DistilBERT.py +75 -0

DistilBERT.py CHANGED Viewed

	@@ -0,0 +1,75 @@

+import transformers
+import torch
+from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+from transformers import DistilBertTokenizer, DistilBertModel
+import logging
+logging.basicConfig(level=logging.ERROR)
+import torch.nn as nn
+from torch.nn import functional as F
+import torch.optim as optim
+import pandas as pd
+import numpy as np
+# Điều chỉnh các tham số
+MAX_LEN = 100
+TRAIN_BATCH_SIZE = 4
+VALID_BATCH_SIZE = 4
+EPOCHS = 1
+LEARNING_RATE = 1e-05
+tokenizer_DB = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
+# Create the dataframes
+# Define the dataset class
+class BinaryLabel(Dataset):
+    def __init__(self, dataframe, tokenizer, max_len):
+        self.tokenizer = tokenizer_DB
+        self.data = dataframe
+        self.text = dataframe.text
+        self.targets = self.data.label
+        self.max_len = max_len
+    def __len__(self):
+        return len(self.text)
+    def __getitem__(self, index):
+        text = str(self.text[index])
+        text = " ".join(text.split())
+        inputs = self.tokenizer.encode_plus(
+            text,
+            None,
+            add_special_tokens=True,
+            max_length=self.max_len,
+            pad_to_max_length=True,
+            return_token_type_ids=True
+        )
+        ids = inputs['input_ids']
+        mask = inputs['attention_mask']
+        token_type_ids = inputs["token_type_ids"]
+        return {
+            'ids': torch.tensor(ids, dtype=torch.long),
+            'mask': torch.tensor(mask, dtype=torch.long),
+            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+            'targets': torch.tensor(self.targets[index], dtype=torch.float)
+        }
+train_params = {'batch_size': TRAIN_BATCH_SIZE,
+                'shuffle': True,
+                'num_workers': 0
+                }
+test_params = {'batch_size': VALID_BATCH_SIZE,
+                'shuffle': True,
+                'num_workers': 0
+                }
+training_set = BinaryLabel(train_df_DB, tokenizer, MAX_LEN)
+testing_set = BinaryLabel(test_df_DB, tokenizer, MAX_LEN)
+training_loader = DataLoader(training_set, **train_params)
+testing_loader = DataLoader(testing_set, **test_params)