This is the code to train a GPT-2 model from scratch:
from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import Trainer, TrainingArguments,GPT2LMHeadModel,GPT2Config
import torch
import os
from torch.utils.data.dataset import Dataset
from transformers.utils import logging
from transformers.tokenization_utils import PreTrainedTokenizer
logger = logging.get_logger(__name__)
class LineByLineTextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        batch_encoding = tokenizer(lines, add_special_tokens=False, truncation=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.examples[i], dtype=torch.long)
tokenizer = BertTokenizerFast(
    vocab_file=r"D:\2020.09.15GPT2\vocab.txt",
    unk_token="<unk>",
    sep_token="<sep>",
    pad_token="<pad>",
    cls_token="</s>",
    mask_token="<mask>",
)
special_tokens_dict = {"bos_token": "<s>", "eos_token": "</s>"}
tokenizer.add_special_tokens(special_tokens_dict)
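# Optional sanity check (not part of the original script): GPT-2 is normally paired with
# a BPE tokenizer, but any tokenizer works as long as the model embeddings are resized to
# match it (done below). The sample text here is just an illustrative placeholder.
sample_ids = tokenizer("hello world", add_special_tokens=False)["input_ids"]
print("sample ids:", sample_ids)
print("round trip:", tokenizer.decode(sample_ids))
print("vocab size after adding special tokens:", len(tokenizer))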
config = GPT2Config.from_pretrained(r'D:\2020.09.15GPT2\config.json')
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
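# Optional check (not part of the original script): confirm the config was loaded and the
# token embedding matrix now matches the tokenizer's vocabulary size.
print("parameters:", sum(p.numel() for p in model.parameters()))
print("embedding rows:", model.transformer.wte.weight.shape[0], "tokenizer size:", len(tokenizer))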
def load_dataset(train_path, tokenizer):
    train_dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128)
    # mlm=False -> causal language modeling: the collator copies the inputs as labels
    # (with padding masked out) and the model shifts them internally.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, data_collator
train_path = r'Seven_Lines_Verse_plus_sign.txt'
train_dataset, data_collator = load_dataset(train_path, tokenizer)
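# Optional inspection (not part of the original script): check how many non-empty lines
# were tokenized and look at one encoded example before training starts.
print("training examples:", len(train_dataset))
print("first example ids:", train_dataset[0])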
training_args = TrainingArguments(
    output_dir=r"D:\2020.09.15GPT2",      # the output directory
    overwrite_output_dir=True,            # overwrite the content of the output directory
    save_total_limit=20,                  # keep at most 20 checkpoints
    num_train_epochs=5,                   # number of training epochs
    per_device_train_batch_size=36,       # batch size for training
    per_device_eval_batch_size=36,        # batch size for evaluation
    eval_steps=1000,                      # update steps between two evaluations (no effect here: no eval_dataset is passed)
    save_steps=1000,                      # save a checkpoint every 1000 steps
    warmup_steps=500,                     # warmup steps for the learning rate scheduler
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=None,
    prediction_loss_only=True,  # on recent transformers versions, set this on TrainingArguments instead
)
trainer.train()
trainer.save_model()  # writes the final model weights and config to output_dir
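Once training finishes, the saved model can be reloaded for generation. The snippet below is a minimal sketch, not part of the original script: the prompt and sampling settings are placeholders, and it assumes the checkpoint was written to the output_dir used above.

model = GPT2LMHeadModel.from_pretrained(r"D:\2020.09.15GPT2")
model.eval()
# seed the model with the <s> token registered above; any seed text works
prompt_ids = tokenizer("<s>", add_special_tokens=False, return_tensors="pt")["input_ids"]
output_ids = model.generate(
    prompt_ids,
    max_length=64,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))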