import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# Map display names to Hugging Face dataset ids, plus a config name where one is required.
datasets_info = {
    "SQuAD": ("squad", None),
    "SQuAD 2.0": ("squad_v2", None),
    "Natural Questions": ("natural_questions", None),
    "TriviaQA": ("trivia_qa", "rc"),
    "QuAC": ("quac", None),
    # "faq" is not a Hub dataset id; point this entry at a real FAQ dataset before enabling it.
    # "FAQ Dataset": ("faq", None),
    "BoolQ": ("boolq", None),
    "Open Book QA": ("openbookqa", "main"),
}
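
# Sketch, not part of the original script: these datasets do not share a schema. BoolQ,
# for example, keeps its passage under "passage" rather than SQuAD's "context". A small
# mapping like the one below (only partially filled in) could drive per-dataset
# preprocessing instead of assuming SQuAD-style columns everywhere.
# text_fields = {
#     "SQuAD": ("question", "context"),
#     "SQuAD 2.0": ("question", "context"),
#     "BoolQ": ("question", "passage"),
# }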

model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Llama-style tokenizers ship without a padding token; batched training needs one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
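
# Note (addition, not in the original): this is a 70B-parameter checkpoint, so full
# fine-tuning needs a multi-GPU setup or a parameter-efficient method such as LoRA.
# On constrained hardware, a lighter-weight load (assuming `accelerate` is installed)
# would look roughly like:
#   model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")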


def train_model(dataset_name):
    path, config = datasets_info[dataset_name]
    dataset = load_dataset(path, config)

    # Assumes SQuAD-style "question"/"context" columns; other datasets need their own
    # column mapping here (e.g. BoolQ stores the passage under "passage").
    def preprocess_function(examples):
        # max_length is an arbitrary cap to keep padded batches manageable.
        return tokenizer(examples["question"], examples["context"], truncation=True, max_length=1024)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir=f"./{dataset_name}_model",
        evaluation_strategy="epoch",  # recent transformers versions call this eval_strategy
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
    )

    # Pads each batch and copies input_ids into labels for causal-LM fine-tuning.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )

    trainer.train()

    model.save_pretrained(f"./{dataset_name}_model")
    tokenizer.save_pretrained(f"./{dataset_name}_model")

    return f"Model trained and saved for {dataset_name}!"
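

# Optional sketch, not part of the original app: a minimal helper for smoke-testing the
# fine-tuned model on a single example. The prompt format and generation settings here
# are assumptions rather than anything the training recipe above prescribes.
def answer_question(question, context, max_new_tokens=64):
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Decode only the newly generated tokens, skipping the prompt.
    return tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)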


with gr.Blocks() as demo:
    gr.Markdown("## Train QA Model on Multiple Datasets")
    dataset_name = gr.Dropdown(choices=list(datasets_info.keys()), label="Select Dataset")
    train_button = gr.Button("Train Model")
    output = gr.Textbox(label="Output")

    train_button.click(train_model, inputs=dataset_name, outputs=output)

demo.launch()