Model gives weird predictions
Hello! Thank you so much for sharing your code :)
I have a question, though. I followed code similar to yours for a machine translation task, using a pre-trained Arabic BERT model called AraBERT as the encoder and GPT2 as the decoder, and fine-tuned on a small dataset just to see what the results would look like. However, the model predicts a bunch of exclamation marks: "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!". Do you know what the cause of the problem might be and how to solve it?
Here is my full code:
import numpy as np
import logging
import torch
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, load_metric, DatasetDict, Dataset
logging.basicConfig(level=logging.INFO)
#arabert_model = AutoModel.from_pretrained("bert-base-arabertv02/")
#gpt2_model = AutoModel.from_pretrained("gpt2/")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-arabertv02/", "gpt2/")
# cache is currently not supported by the EncoderDecoder framework
model.decoder.config.use_cache = False
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-arabertv02/")
# CLS token will work as BOS token
bert_tokenizer.bos_token = bert_tokenizer.cls_token
# SEP token will work as EOS token
bert_tokenizer.eos_token = bert_tokenizer.sep_token
# make sure the GPT2 tokenizer adds BOS at the beginning and EOS at the end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2/")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token
# set decoding params
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.pad_token_id = bert_tokenizer.pad_token_id
model.config.max_length = 142
model.config.min_length = 1
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 5
# load train and validation data
dataset = load_dataset("ted_talks_iwslt", language_pair=("ar", "en"), year="2014")
metric = load_metric("sacrebleu")
dataset = dataset['train']
train_test = dataset.train_test_split(0.2)
train_test_dataset = DatasetDict({
'train': train_test['train'],
'test': train_test['test']})
encoder_length = 128
decoder_length = 128
batch_size = 1
# map data correctly
def map_to_encoder_decoder_inputs(batch):  # Tokenizer will automatically set [BOS]
    # NOTE: the original post uses `inputs`/`outputs` without defining them; the two tokenizer
    # calls below are a reconstruction, assuming the ted_talks_iwslt "translation" dict format.
    sources = [ex["ar"] for ex in batch["translation"]]
    targets = [ex["en"] for ex in batch["translation"]]
    inputs = bert_tokenizer(sources, padding="max_length", truncation=True, max_length=encoder_length)
    outputs = gpt2_tokenizer(targets, padding="max_length", truncation=True, max_length=decoder_length)
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    batch["decoder_attention_mask"] = outputs.attention_mask
    # mask out padded positions because pad_token_id alone is not good enough to know whether a label should be excluded or not
    batch["labels"] = [
        [-100 if mask == 0 else token for mask, token in zip(masks, labels)]
        for masks, labels in zip(batch["decoder_attention_mask"], batch["labels"])
    ]
    assert all([len(x) == encoder_length for x in inputs.input_ids])
    assert all([len(x) == decoder_length for x in outputs.input_ids])
    return batch
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    torch.cuda.empty_cache()
    with torch.no_grad():
        label_ids = eval_preds.label_ids
        print("LABEL IDS: ", label_ids)
        pred_ids = eval_preds.predictions
        if isinstance(pred_ids, tuple):
            pred_ids = pred_ids[0]
        print("PREDICTION IDS: ", pred_ids)
        decoded_preds = gpt2_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them; the labels are GPT2 token ids, so use the GPT2 pad id
        label_ids = np.where(label_ids != -100, label_ids, gpt2_tokenizer.pad_token_id)
        decoded_labels = gpt2_tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        with open("reference sentences iwslt test", "a") as ref_file:
            for label in decoded_labels:
                print("LABEL: ", label)
                ref_file.write(label[0] + "\n")
        with open("predicted sentences iwslt test", "a") as pred_file:
            for pred in decoded_preds:
                print("PRED: ", pred)
                pred_file.write(pred + "\n")
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {"bleu": result["score"]}
        prediction_lens = [np.count_nonzero(pred != gpt2_tokenizer.pad_token_id) for pred in pred_ids]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result
# make train dataset ready
train_dataset = train_test_dataset.map(
map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["translation"],
)
del(train_test_dataset)
train_dataset.set_format(
type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
# same for validation dataset -- reuse the held-out test split created above (already mapped)
val_dataset = train_dataset["test"]
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
training_args = Seq2SeqTrainingArguments(
f"arabert2bert-finetuned-ar-to-en-on-iwslt",
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=1,
weight_decay=0.01,
do_train=True,
do_eval=True,
predict_with_generate=True,
eval_accumulation_steps=20,
save_total_limit=3,
num_train_epochs=1,
logging_steps=150,
fp16=True,
no_cuda=True
)
trainer = Seq2SeqTrainer(model=model, args=training_args, compute_metrics=compute_metrics, train_dataset=train_dataset['train'], eval_dataset=train_dataset['test'])
trainer.train()
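A generation check along the following lines shows the behaviour (this is a minimal sketch, not my exact notebook cell: the Arabic sentence is just a placeholder, and model, the two tokenizers, and encoder_length are the objects defined above):
# Minimal sketch: encode one placeholder Arabic sentence with the BERT tokenizer and
# decode whatever the fine-tuned model generates with the GPT2 tokenizer.
sample = bert_tokenizer(
    "مرحبا بالعالم",  # placeholder source sentence ("Hello world")
    return_tensors="pt", padding="max_length", truncation=True, max_length=encoder_length,
)
generated = model.generate(sample.input_ids, attention_mask=sample.attention_mask)
print(gpt2_tokenizer.decode(generated[0], skip_special_tokens=True))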
Thanks in advance :)
Hey @salma-elshafey ,
Thanks for opening the discussion here! Could you maybe put this code in a google colab where I can just run it to see the problem? :-)
Thanks!
Here's the link to the notebook: https://colab.research.google.com/drive/1wHRroGCpY1_78oC4e9v4hYJnNJ1Ub-tu?usp=sharing
Thank you so much for your help! :)
BTW you'll soon be able to edit comments =)
Cool! :)
Hey @patrickvonplaten, sorry to bother you, but did you find where the problem lies? 😅
Hey @salma-elshafey ,
I sadly don't have the time to fully debug your script, but the problem here seems to be that the model predicts the EOS token id (50256) all the time while getting a very low loss.
Could it be that your labels consist only of 50256 tokens? Could you maybe make sure that the label ids you train the model on are correct?
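For example, something along these lines (just a rough sketch, reusing the variable names from your script) would show whether the labels are sane:
# Decode a few label rows from the processed train split; if the decoded text is
# empty or only special tokens, the labels are broken.
for example in train_dataset["train"].select(range(3)):
    label_ids = [t for t in example["labels"].tolist() if t != -100]
    print("LABEL IDS:", label_ids[:20])
    print("DECODED LABEL:", gpt2_tokenizer.decode(label_ids, skip_special_tokens=False))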
Hello @patrickvonplaten! I used an older version of the transformers library (specifically v4.2.1) and the problem was solved :)
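(For anyone who hits the same thing, the active release can be double-checked like this:)
# Confirm which transformers release is actually installed; v4.2.1 is the one that worked here.
import transformers
print(transformers.__version__)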
Awesome, very glad that you got it to work! Do you know by any chance what the problem was? Maybe there is a bug in the newer transformers version.