Hi,
I am using bert-base-uncased to train a model on my training data to classify whether a text belongs to a specific industry.
It has a training set of 3,000 sentences and classifies into:
"1": "Financial Services",
"2": "Energy",
"3": "Automotive"
It works very well when I type something related to these industries, but when I type nonsense it always classifies to Automotive, with a very high score.
Samples:
Please input your sentence to get classified: good track record on investment fund industry
[{'label': 'Financial Services', 'score': 0.997352123260498}]
Please input your sentence to get classified: very related to gas and nuclear power
[{'label': 'Energy', 'score': 0.9856479167938232}]
Then this nonsense gets a high score:
Please input your sentence to get classified: sfhsdfhskjdf
[{'label': 'Automotive', 'score': 0.9958509206771851}]
Any idea why it gives such a high score and always goes to Automotive?
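For reference, the outputs above come from a standard text-classification pipeline over the saved model; my inference script is essentially this (simplified sketch, the path matches MODEL_SAVED in the training code below):

from transformers import pipeline

# Load the fine-tuned model and its tokenizer from the saved directory
classifier = pipeline("text-classification", model="./models/industry-classifier-small")

while True:
    sentence = input("Please input your sentence to get classified: ")
    print(classifier(sentence))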
Training code below:
import pandas as pd
from datasets import Dataset
import datasets
from typing import Dict, Tuple
from transformers import BertTokenizerFast, DataCollatorWithPadding, TrainingArguments, Trainer, BertForSequenceClassification
import torch
import numpy as np
import evaluate
torch_device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("Torch device: ", torch_device)
TRAINING_FILE = "./trainingdata/TrainingIndustrySmall.xlsx"
VALIDATION_FILE = "./trainingdata/ValidationIndustrySmall.xlsx"
MODEL_SAVED = "./models/industry-classifier-small"
LABEL_DICTIONARY = {
    "0": "No Classified",
    "1": "Financial Services",
    "2": "Energy",
    "3": "Automotive",
}
industry_labels = list(LABEL_DICTIONARY.values())
# The model config expects integer class ids, so convert the string keys
id2label = {int(k): l for k, l in LABEL_DICTIONARY.items()}
label2id = {l: int(k) for k, l in LABEL_DICTIONARY.items()}
# Define the dataset features
features = datasets.Features({
    "sentence": datasets.Value("string"),
    "label": datasets.ClassLabel(names=industry_labels, num_classes=len(industry_labels)),
})
# Load the training and validation datasets
dataset_training = pd.read_excel(TRAINING_FILE)
training_data = Dataset.from_pandas(dataset_training, features=features)
dataset_validation = pd.read_excel(VALIDATION_FILE)
validation_data = Dataset.from_pandas(dataset_validation, features=features)
# Tokenizer
pretrained_model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
# Note that we are specifying the number of labels we want.
# This preconfigures the model with a classification head that outputs one
# logit per class (softmax is only applied at inference time).
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(industry_labels),
)
def tokenize_function(batch: Dict[str, list]) -> Dict[str, list]:
    """Tokenizes a batch of examples using the pre-trained tokenizer.
    Padding is deliberately left out here: DataCollatorWithPadding pads each
    batch dynamically at training time, so padding during map() (or returning
    PyTorch tensors from a batched map) is unnecessary.
    Args:
        batch: A batch of examples containing the sentences to tokenize.
    Returns:
        A dictionary containing input_ids, token_type_ids and attention_mask as lists.
    """
    return tokenizer(batch["sentence"], truncation=True)
# Map the train and validation sets to tokenized versions of the data using tokenize_function()
train_tokenized_industries = training_data.map(tokenize_function, batched=True)
validation_tokenized_industries = validation_data.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    overwrite_output_dir=True,
    evaluation_strategy='steps',
    eval_steps=100,
    logging_steps=100,
    num_train_epochs=4,
)
def compute_metrics(eval_preds: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
"""Computes F1 score and accuracy for model evaluation.
This function takes a tuple containing the predicted logits and true labels,
and computes the F1 score and accuracy. It uses pre-loaded evaluation metrics
for F1 and accuracy, assumed to be loaded via a hypothetical `evaluate.load` method.
Args:
eval_preds: A tuple containing two NumPy arrays.
The first array contains the predicted logits.
The second array contains the true labels.
Returns:
A dictionary containing the F1 score and accuracy as scalar values.
"""
    # Load evaluation metrics (this runs on every evaluation step; the loads
    # could be hoisted to module level instead)
    f1_metric = evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")
    # Extract logits and labels from eval_preds
    logits, labels = eval_preds
    # Convert logits to predicted class ids
    predictions = np.argmax(logits, axis=-1)
    # Compute the metrics; compute() returns a dict of scalar values
    f1_score = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]
    accuracy_score = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"F1": f1_score, "Accuracy": accuracy_score}
# Initialize the trainer
trainer = Trainer(
    model.to(torch_device),
    training_args,
    train_dataset=train_tokenized_industries,
    eval_dataset=validation_tokenized_industries,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model(MODEL_SAVED)
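In case it helps with diagnosing, here is a quick sketch (reusing the model, tokenizer and id2label from above) that prints the full softmax distribution for one input instead of only the top label; since softmax forces the scores to sum to 1, the probability mass has to land somewhere even for gibberish:

import torch.nn.functional as F

def show_all_scores(sentence: str) -> None:
    # Tokenize a single sentence and run it through the fine-tuned model
    inputs = tokenizer(sentence, truncation=True, return_tensors="pt").to(torch_device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax over the class logits; the probabilities sum to 1 by construction
    probs = F.softmax(logits, dim=-1).squeeze(0)
    for idx, p in enumerate(probs.tolist()):
        print(f"{id2label[idx]}: {p:.4f}")

show_all_scores("sfhsdfhskjdf")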
Thanks a lot
Sergio