from datasets import load_dataset

# Load the train and validation splits from the local CSV files.
Falcon = load_dataset('csv', data_files={"train": 'FalconData_train2.csv', "validation": 'FalconData_validation2.csv'})

print('Dataset Loaded!')

"""Then take a look at an example from each split:"""

Falcon['train'][0]

Falcon['validation'][0]
"""The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:""" |
|
|
|
from transformers import AutoTokenizer, GPT2TokenizerFast |
|
|
|
tokenizer = AutoTokenizer.from_pretrained("distilgpt2") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer.pad_token = tokenizer.eos_token |
|
|
|
Falcon = Falcon.flatten() |
|
Falcon["train"][0] |


def preprocess_function(examples):
    # The `Text` column holds plain strings, so they can be passed straight to the tokenizer.
    return tokenizer(examples["Text"])


tokenized_Falcon = Falcon.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=Falcon["train"].column_names,
)
# Chunk length for the language-modelling examples; DistilGPT2's context window is 1024 tokens.
block_size = tokenizer.model_max_length


def group_texts(examples):
    # Concatenate all tokenized sequences in the batch.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last partial chunk so every example is exactly `block_size` tokens long.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split the concatenated sequences into chunks of `block_size`.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal language modelling the labels are the inputs; the model shifts them internally.
    result["labels"] = result["input_ids"].copy()
    return result
"""Apply the `group_texts` function over the entire dataset:""" |
|
|
|
lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4) |
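
"""Optional sanity check: after grouping, each example should be exactly `block_size` tokens long, and `labels` should start out identical to `input_ids` (the causal shift happens inside the model)."""

example = lm_dataset["train"][0]
print(len(example["input_ids"]), example["input_ids"] == example["labels"])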

from transformers import DataCollatorForLanguageModeling

# Causal language modelling: `mlm=False` disables the masked-LM objective, so the
# collator just pads each batch and builds `labels` from `input_ids`.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
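
"""A minimal sketch of what the collator produces: with `mlm=False` it pads a list of features and copies `input_ids` into `labels` (padding positions are set to -100 so the loss ignores them). Inspecting one small batch is optional."""

sample_batch = data_collator([lm_dataset["train"][i] for i in range(2)])
print(sample_batch["input_ids"].shape, sample_batch["labels"].shape)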

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Load the base causal-LM checkpoint in bfloat16.
model = AutoModelForCausalLM.from_pretrained("rwh/tiny8", torch_dtype=torch.bfloat16)

print('Model Loaded!')
# The Trainer moves the model to the GPU on its own, but doing it explicitly does no harm.
model.to('cuda')

OutputDir = "C1ReadyModel"

training_args = TrainingArguments(
    output_dir=OutputDir,
    overwrite_output_dir=True,
    bf16=True,  # train in bfloat16 to match the loaded weights
    evaluation_strategy="steps",
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    lr_scheduler_type='linear',
    push_to_hub=False,
    save_total_limit=2,  # keep at most two checkpoints on disk
    save_strategy="steps",  # must match evaluation_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    save_safetensors=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    data_collator=data_collator,
)
print('Started Training!')
trainer.train()

trainer.save_model(OutputDir)
print('Saved Model Path:', OutputDir)

import math

# Perplexity is the exponential of the evaluation cross-entropy loss.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")