I am trying to fine-tune a transformer model on my own unlabeled corpus of text. My code for doing this is:
from datasets import load_dataset
from transformers import BertTokenizerFast, AutoModel, TrainingArguments, Trainer
import os

base_path = '../data/'
model_name = 'bert-base-uncased'
max_length = 512
checkpoints_dir = 'checkpoints'

if not os.path.exists(checkpoints_dir):
    os.mkdir(checkpoints_dir)

tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length',
                     truncation=True, max_length=max_length)

# One plain-text file per split, one example per line
dataset = load_dataset('text',
                       data_files={
                           'train': f'{base_path}train.txt',
                           'test': f'{base_path}test.txt',
                           'validation': f'{base_path}valid.txt'
                       })

print('Tokenizing data. This may take a while...')
tokenized_dataset = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['test']

model = AutoModel.from_pretrained(model_name)
training_args = TrainingArguments(checkpoints_dir)
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=eval_dataset)
trainer.train()
However, when the code reaches trainer.train() I get KeyError: 'loss'. How do I fix this?
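My guess is that AutoModel loads the bare BERT encoder with no head, so its forward pass never returns a loss, and the tokenized dataset has no labels column the Trainer could compute one from. Is a masked-language-modeling setup like the sketch below the right direction? (Using AutoModelForMaskedLM with DataCollatorForLanguageModeling is my assumption about what the task needs, not something I have confirmed works.)

from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling

# Assumption: a model with an MLM head, so forward() can return a loss
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Assumption: the collator randomly masks 15% of input tokens and
# sets the masked positions as labels, which gives the Trainer a loss
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  data_collator=data_collator)
trainer.train()

If this is the wrong approach for continued pretraining on unlabeled text, I would appreciate a pointer to the intended one.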