"""Fine-tune IndoBERT for sequence classification on a local CSV dataset.

The script reads ``dataset.csv``, tokenizes the ``pertanyaan`` column, uses
the ``jawaban`` column as the label, trains for a few epochs while evaluating
on a held-out split, and stores the resulting model and tokenizer in
``./fine_tuned_model``.
"""

import inspect

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Pre-trained model checkpoint.
MODEL_NAME = "indobenchmark/indobert-base-p2"

# Explicit sequence length used for padding/truncation. Without it,
# padding="max_length" relies on the tokenizer config declaring a model
# maximum length. 512 is assumed to be this BERT checkpoint's limit (TODO
# confirm); lower it to speed up training if the inputs are short.
MAX_LENGTH = 512

# Share of the CSV rows held out for per-epoch evaluation, and the seed that
# keeps the split reproducible across runs.
VALIDATION_FRACTION = 0.1
SPLIT_SEED = 42

# Load dataset. A single CSV passed via data_files yields only a "train" split.
dataset = load_dataset("csv", data_files="dataset.csv")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess(data):
    """Tokenize the ``pertanyaan`` column of a batch of rows.

    Args:
        data: A batch from ``Dataset.map(..., batched=True)``; must contain a
            ``pertanyaan`` entry holding the input texts.

    Returns:
        The tokenizer output, padded and truncated to ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        data["pertanyaan"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Preprocessing
dataset = dataset.map(preprocess, batched=True)
# NOTE(review): assumes "jawaban" already holds integer class ids in
# {0, 1} to match num_labels=2 below -- confirm against the CSV.
dataset = dataset.rename_column("jawaban", "labels")

# The loaded data has no "validation" split, so carve one out of "train".
# The resulting DatasetDict exposes the held-out rows under the "test" key.
dataset = dataset["train"].train_test_split(
    test_size=VALIDATION_FRACTION, seed=SPLIT_SEED
)
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Load model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Training arguments. Newer transformers releases renamed
# `evaluation_strategy` to `eval_strategy`; pick whichever keyword the
# installed version accepts so the script runs on both.
_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_total_limit=2,
    **{_strategy_arg: "epoch"},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Train model
trainer.train()

# Save model together with its tokenizer so the output directory can be
# reloaded on its own with from_pretrained().
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Model telah dilatih ulang dan disimpan ke './fine_tuned_model'.")