import transformers
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm  # progress bars in the train/validation loops below

import pandas as pd
import numpy as np

import logging
logging.basicConfig(level=logging.ERROR)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Hyperparameters
MAX_LEN = 100
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
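# Small illustration (added, not part of the original script): the uncased tokenizer
# lower-cases text before splitting it into WordPiece tokens; padding/truncation to
# MAX_LEN is requested per call inside the Dataset below.
print(tokenizer.tokenize("Hello World"))  # ['hello', 'world']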
# Load the train/test splits and gather the label column(s) into a per-row list,
# so each example carries a float target vector.
train_df_DB = pd.read_csv('./data/train.csv')
train_df_DB['label'] = train_df_DB.iloc[:, 1:].values.tolist()

test_df_DB = pd.read_csv('./data/test.csv')
test_df_DB = test_df_DB[['text', 'preprocess_sentence', 'label']]
test_df_DB['label'] = test_df_DB.iloc[:, 2:].values.tolist()
class BinaryLabel(Dataset):
    """Wraps a dataframe so each item yields tokenized inputs plus a float target."""

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Collapse runs of whitespace before tokenizing.
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',   # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0
               }
training_set = BinaryLabel(train_df_DB, tokenizer, MAX_LEN)
testing_set = BinaryLabel(test_df_DB, tokenizer, MAX_LEN)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
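# Quick sanity check (added for illustration, not in the original script): one item
# from the training set should yield MAX_LEN-long id/mask tensors and a float target.
_sample = training_set[0]
print(_sample['ids'].shape, _sample['mask'].shape, _sample['targets'])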
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # DistilBERT has no token_type_ids of its own; the argument is accepted only
        # to keep the calling convention uniform and is not passed to the encoder.
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]           # embedding at the [CLS] position
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)      # raw logit; BCEWithLogitsLoss applies the sigmoid
        return output
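# Illustrative shape check (added, not part of the original script): push one dummy
# example through an untrained instance and confirm it emits a single logit per input.
_enc = tokenizer.encode_plus("a quick shape check", add_special_tokens=True,
                             max_length=MAX_LEN, padding='max_length',
                             truncation=True, return_token_type_ids=True)
_logits = DistilBERTClass()(torch.tensor([_enc['input_ids']]),
                            torch.tensor([_enc['attention_mask']]),
                            torch.tensor([_enc['token_type_ids']]))
print(_logits.shape)  # expected: torch.Size([1, 1])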
def validation(testing_loader):
    """Run the model over a loader and return (sigmoid scores, targets)."""
    model_DB.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model_DB(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
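# Illustrative helper (added; the original snippet does not show how the sigmoid
# scores returned by validation() are consumed). Thresholds the scores at 0.5 and
# reports a simple accuracy; assumes scikit-learn is available, and any other metric
# could be swapped in.
def evaluate(loader, threshold=0.5):
    from sklearn import metrics
    outputs, targets = validation(loader)
    preds = (np.array(outputs) >= threshold).astype(int)
    return metrics.accuracy_score(np.array(targets), preds)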
def train(epoch):
    model_DB.train()
    for step, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model_DB(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if step % 50 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
        # Early exit once the batch loss drops below the 0.07 threshold.
        if loss.item() < 0.07:
            print(f'Breaking the loop as loss is below 0.07: {loss.item()}')
            break
        loss.backward()
        optimizer.step()


def loss_fn(outputs, targets):
    # BCEWithLogitsLoss combines the sigmoid and binary cross-entropy, so the model
    # returns raw logits.
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
model_DB = DistilBERTClass()
optimizer = torch.optim.Adam(params=model_DB.parameters(), lr=LEARNING_RATE)

# Resume from a previously saved checkpoint, then move the model to the target device
# (the batches in train()/validation() are sent to `device`, so the model must be there too).
loaded_model_path = './model_DB_1.pt'
model_DB.load_state_dict(torch.load(loaded_model_path, map_location=torch.device('cpu')))
model_DB.to(device)
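# Illustrative driver (added; the original snippet stops after loading the checkpoint):
# fine-tune for EPOCHS epochs, then score the held-out loader with the helper above.
for epoch in range(EPOCHS):
    train(epoch)
print('test accuracy:', evaluate(testing_loader))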