Sofa321 committed on
Commit
2f4f974
·
verified ·
1 Parent(s): 113d0fb

Create fine_tune.py

Browse files
Files changed (1) hide show
  1. fine_tune.py +47 -0
fine_tune.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune a pre-trained checkpoint for sequence classification on a CSV file.

The script reads ``dataset.csv``, tokenizes the ``pertanyaan`` column, uses the
``jawaban`` column as labels, trains with the Hugging Face ``Trainer`` and
writes the resulting model and tokenizer to ``./fine_tuned_model``.
"""
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Pre-trained checkpoint used for both the tokenizer and the model
MODEL_NAME = "indobenchmark/indobert-base-p2"

# Load dataset
dataset = load_dataset("csv", data_files="dataset.csv")

# A single CSV passed via `data_files` is exposed as a lone "train" split, so
# the "validation" split the Trainer evaluates on has to be carved out here;
# otherwise dataset["validation"] raises KeyError before training starts.
if "validation" not in dataset:
    split = dataset["train"].train_test_split(test_size=0.1, seed=42)
    dataset["train"] = split["train"]
    dataset["validation"] = split["test"]

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess(data):
    """Tokenize the ``pertanyaan`` field of a batch of rows.

    Args:
        data: Batch passed in by ``dataset.map(..., batched=True)``; its
            ``pertanyaan`` entry is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch, padded with ``padding="max_length"``
        and truncated.
    """
    return tokenizer(data['pertanyaan'], padding="max_length", truncation=True)


# Preprocessing
dataset = dataset.map(preprocess, batched=True)
# NOTE(review): assumes `jawaban` already holds integer class ids in {0, 1},
# matching num_labels=2 below — TODO confirm against dataset.csv.
dataset = dataset.rename_column("jawaban", "labels")
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Load model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
)

# Train model
trainer.train()

# Save model together with its tokenizer so the output directory can be
# reloaded on its own with from_pretrained("./fine_tuned_model").
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Model telah dilatih ulang dan disimpan ke './fine_tuned_model'.")