import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# --- Training configuration ---
MODEL_NAME = 'distilbert-base-uncased'  # checkpoint used for both the tokenizer and the model
OUTPUT_DIR = './model_output'  # passed to TrainingArguments and to trainer.save_model
EPOCHS = 3  # used as num_train_epochs
BATCH_SIZE = 16  # per-device batch size for both training and evaluation
LEARNING_RATE = 5e-5  # learning rate passed to TrainingArguments

# --- Data and tokenizer ---
# Load the 'imdb' dataset; its 'train' and 'test' splits and its 'text'
# column are the parts this script reads.
dataset = load_dataset('imdb')

# Tokenizer for the same checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with truncation enabled.

    Used as the callback for ``dataset.map(..., batched=True)``, so
    ``examples['text']`` is expected to be a list of strings (one per row
    of the batch).  No padding is requested here, so the tokenized
    sequences may differ in length.

    Returns whatever the module-level ``tokenizer`` returns for that input.
    """
    return tokenizer(examples['text'], truncation=True)

# Run the tokenizer over the dataset in batches; map() stores the tokenizer
# outputs alongside the existing columns.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# --- Model ---
# Pretrained checkpoint with a sequence-classification head sized for two labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# --- Training setup ---
# Run settings and hyperparameters handed to the Trainer.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate at the end of every epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the pinned transformers version.
    evaluation_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
)

# preprocess_function truncates but does not pad, so the examples in a batch
# have different lengths.  Pad each batch to its longest sequence at collation
# time; without an explicit padding collator (or a tokenizer handed to the
# Trainer) the default collator cannot turn the ragged lists into tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# --- Run ---
# Fine-tune, then write the final model to OUTPUT_DIR.
trainer.train()
trainer.save_model(OUTPUT_DIR)

print("Model trained and saved!")