Here is my complete code:
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict
import numpy as np
import evaluate
import torch
import torch.nn as nn

accuracy_metric = evaluate.load("accuracy")
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
base_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
resume_data = load_dataset('cnamuangtoun/resume-job-description-fit')
resume_data = resume_data.shuffle(seed=23)

# Build a deterministic label -> id mapping from both splits
unique_labels = set(resume_data['train']['label']) | set(resume_data['test']['label'])
unique_labels = {label: index for index, label in enumerate(sorted(unique_labels))}

# Using the full splits here; to train on a small subset instead, use e.g.
# resume_data['train'].train_test_split(test_size=0.02, seed=42)['test']
train = resume_data['train']
test = resume_data['test']
mod_data = DatasetDict({
    'train': train,
    'test': test,
})
class modelWithLossFunc(nn.Module):
    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        # Classification head on top of the encoder's pooled output
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)
        self.config = base_model.config

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        outputs = self.base_model(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # pooler output of the underlying BERT encoder
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.classifier.out_features), labels.view(-1))
        # Trainer expects the loss as the first element of the returned tuple
        return (loss, logits) if loss is not None else logits
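# Note: outputs[1] is the encoder's pooler output. The
# sentence-transformers/all-MiniLM-L6-v2 model card recommends mean pooling
# over token embeddings for sentence representations instead. A minimal
# sketch of that alternative (mean_pool is a hypothetical helper, not part
# of the original code):
def mean_pool(last_hidden_state, attention_mask):
    # Average only the non-padding token embeddings
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
# Inside forward(), you could then replace outputs[1] with:
#   pooled_output = mean_pool(outputs[0], attention_mask)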
num_labels = len(unique_labels)  # 3 classes in this dataset
model = modelWithLossFunc(base_model, num_labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
def encode_label(row):
    row['label_encode'] = unique_labels[row['label']]
    return row

def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    accuracy = accuracy_metric.compute(predictions=preds, references=labels)
    return {
        "accuracy": accuracy['accuracy'],
    }

def tokenize_data(data):
    # Tokenize the resume and the job description together as a text pair
    return tokenizer(data['resume_text'], data['job_description_text'],
                     padding='max_length', truncation=True)
mod_data = mod_data.map(encode_label)
tokenized_data = mod_data.map(tokenize_data, batched=True)
mod_data = tokenized_data.rename_columns({'label_encode': 'labels'})
mod_data = mod_data.select_columns(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
training_args = TrainingArguments(
    output_dir='resume_evaluation_model',
    learning_rate=2e-5,
    eval_strategy='epoch',
)
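# These TrainingArguments rely on the defaults for epochs, batch size, saving,
# etc. A fuller configuration might look like the commented-out sketch below;
# the values and the hub_model_id are illustrative assumptions, not from the
# original code.
# training_args = TrainingArguments(
#     output_dir='resume_evaluation_model',
#     learning_rate=2e-5,
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     eval_strategy='epoch',
#     save_strategy='epoch',
#     load_best_model_at_end=True,
#     metric_for_best_model='accuracy',
#     push_to_hub=True,
#     hub_model_id='your-username/resume_evaluation_model',
# )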
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mod_data['train'],
    eval_dataset=mod_data['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
from huggingface_hub import notebook_login
notebook_login()

# Trainer.push_to_hub() takes a commit message as its first argument, not a path;
# the Hub repo name comes from output_dir (or hub_model_id in TrainingArguments).
trainer.push_to_hub()
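# Optional sanity check after pushing: score one resume/job-description pair
# locally with the fine-tuned model. The example strings are placeholders and
# id2label is just the inverse of unique_labels.
id2label = {index: label for label, index in unique_labels.items()}
model.eval()
with torch.no_grad():
    encoded = tokenizer("example resume text", "example job description text",
                        padding='max_length', truncation=True,
                        return_tensors='pt').to(device)
    logits = model(**encoded)  # no labels, so forward() returns only the logits
    pred = logits.argmax(dim=-1).item()
print(id2label[pred])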