GPT-2 Medium

Model Description: GPT-2 Medium is the 355M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. It is pretrained on English text using a causal language modeling (CLM) objective. This repository contains a new model fine-tuned from it on the WikiText dataset available on Hugging Face (lilacai/lilac-wikitext-2-raw-v1).

This model was created as part of an experiment; for better results, increase the number of epochs and tune the other training parameters as part of hyperparameter tuning.

Library Import

from transformers import GPT2Tokenizer, GPT2LMHeadModel, \
DataCollatorForLanguageModeling, Trainer, TrainingArguments
import time
from rich import print
from datasets import load_dataset

Variable Declaration

# Device placement: the empty-string key maps the entire model onto GPU 0
device_map = {"": 0}
# Name under which the fine-tuned model gets saved
new_model = "gpt2-medium-wiki"

Load the tokenizer and model

# Base checkpoint to fine-tune; other sizes such as "gpt2", "gpt2-medium",
# "gpt2-large", etc. can be used here as well.
model_name = "gpt2-medium"

# Load the pretrained weights, placed according to device_map.
model = GPT2LMHeadModel.from_pretrained(
    model_name,
    device_map=device_map,
)

# Load the matching tokenizer and reuse the end-of-sequence token as the
# padding token (the tokenization step pads every example to a fixed length).
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Load a sample dataset from Hugging Face's datasets library

# Only small slices are used: rows 10-89 of the training split and
# rows 10-29 of the validation split.
train_dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="train[10:90]",
)
eval_dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="validation[10:30]",
)

Tokenize the dataset

def _tokenize_batch(batch):
    """Tokenize the 'text' column of a batch, truncating/padding to 128 tokens."""
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


train_tokenized = train_dataset.map(_tokenize_batch, batched=True)
eval_tokenized = eval_dataset.map(_tokenize_batch, batched=True)

# Expose only the tensors the model consumes, as torch tensors.
for tokenized_split in (train_tokenized, eval_tokenized):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask"])

Fine-tune the model

# Collator for causal language modeling: mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Evaluate and save a checkpoint once per epoch; log every 200 steps.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Set this higher for more fine-tuning
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=200,
    do_train=True,
    do_eval=True,
    save_strategy="epoch",
    output_dir="./gpt2_finetuned",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
)

# Capture the start timestamp right before training: the "Performance Time"
# step reports `end - start`, and `start` was previously never assigned,
# which made that step fail with a NameError.
start = time.time()
trainer.train()

Evaluate the fine-tuned model (optional)

# Run evaluation through the trainer (it was constructed with eval_tokenized
# as its eval_dataset) and print whatever it returns.
results = trainer.evaluate()
print(results)

Performance Time

# Report elapsed wall-clock time. `start` is expected to hold a time.time()
# timestamp captured before training began; it is not assigned in this step.
end = time.time()
# `print` here is rich's print (see the imports); "[bold green]" is styling
# markup for it — NOTE(review): confirm it renders as intended.
print(f"[bold green]Model Successfully trained in {end - start} seconds")

Save trained model

# Save the trained model under the name held in new_model ("gpt2-medium-wiki").
# Only the model is saved here; no tokenizer save appears in the visible steps.
trainer.model.save_pretrained(new_model)

Generate Text for Inference

def generate_text(input_text):
    """Generate a continuation for a prompt and print it.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: Prompt string to continue.

    Returns:
        None. Each generated sequence is printed as
        ``Generated Text <n>: <text>``.
    """
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which generate() needs to tell real tokens from padding.
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate text
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=50,
        # temperature only takes effect when sampling is enabled; without
        # do_sample=True decoding is greedy and temperature is ignored.
        do_sample=True,
        temperature=0.7,
        # The tokenizer's pad token is set to EOS in this file; pass the same
        # id explicitly to generate().
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode and print the generated text
    for i, sequence in enumerate(output):
        decoded_sequence = tokenizer.decode(sequence, skip_special_tokens=True)
        print(f"Generated Text {i + 1}: {decoded_sequence}")

# Example prompt. generate_text prints the generated text itself and does not
# return it, so there is nothing to capture from the call.
input_text = "which system carried over directly from Valkyira Chronicles"
generate_text(input_text)

My GPU Size

image/png

Downloads last month
9
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Dataset used to train aswin1906/gpt2-medium-wiki