bstraehle committed on
Commit
2371111
·
verified ·
1 Parent(s): ead792b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -2
app.py CHANGED
@@ -9,9 +9,92 @@ import evaluate
9
  import numpy as np
10
  import random
11
 
12
- def process(model, dataset):
 
 
 
 
13
  dataset_imdb = load_dataset(dataset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  return "Done"
15
 
16
- demo = gr.Interface(fn=process, inputs=["model", "dataset"], outputs="text")
 
 
 
17
  demo.launch()
 
9
  import numpy as np
10
  import random
11
 
12
def preprocess_function(examples, tokenizer=None):
    """Tokenize the ``"text"`` field of a batch of examples.

    The previous version read a module-level ``tokenizer`` that is never
    defined at module scope (the only ``tokenizer`` is a local variable of
    ``process``), so every call raised ``NameError``.  The tokenizer is now an
    explicit, optional argument.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings of
            the batch (the shape ``Dataset.map(..., batched=True)`` passes in).
        tokenizer: Callable used to tokenize the texts.  When omitted, the
            ``bert-base-cased`` tokenizer is loaded once and reused.

    Returns:
        Whatever the tokenizer returns for the batch, padded to the
        tokenizer's maximum length and truncated.
    """
    if tokenizer is None:
        tokenizer = _default_tokenizer()
    return tokenizer(examples["text"], padding="max_length", truncation=True)


def _default_tokenizer():
    """Load the fallback ``bert-base-cased`` tokenizer once and cache it."""
    cached = getattr(_default_tokenizer, "_instance", None)
    if cached is None:
        # Imported lazily so that callers supplying their own tokenizer never
        # pay for (or depend on) the transformers import.
        from transformers import AutoTokenizer

        cached = AutoTokenizer.from_pretrained("bert-base-cased")
        _default_tokenizer._instance = cached
    return cached
14
+
15
def _sample_split(split, fraction):
    """Return a random subset of ``split`` containing ``fraction`` of its rows."""
    keep = int(fraction * split.num_rows)
    return split.select(random.sample(range(split.num_rows), keep))


def _accuracy(eval_pred):
    """Compute accuracy from the logits and label ids of an evaluation pass."""
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float(np.mean(predicted == eval_pred.label_ids))}


def process(model_id: str, dataset: str) -> str:
    """Fine-tune a sequence-classification model with LoRA and evaluate it.

    Args:
        model_id: Hub id of the base model to fine-tune; also used to load the
            matching tokenizer.
        dataset: Name of the dataset to load.  It must provide ``"train"`` and
            ``"test"`` splits with ``"text"`` and ``"label"`` columns.

    Returns:
        The literal string ``"Done"`` once training and evaluation finish.
    """
    # Local import: the rest of this function already relies on peft
    # (LoraConfig), but get_peft_model was not previously used in this file.
    from peft import get_peft_model

    # Step 1: Load dataset
    raw_dataset = load_dataset(dataset)

    # Step 2: Reduce dataset (optional)
    reduction_rate = 0.1
    dataset_imdb = DatasetDict({
        "train": _sample_split(raw_dataset["train"], reduction_rate),
        "test": _sample_split(raw_dataset["test"], reduction_rate),
    })

    # Step 3: Text tokenization
    # The tokenizer must match the model being fine-tuned; a fixed
    # "bert-base-cased" vocabulary does not fit an arbitrary model_id.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # Some tokenizers define no padding token; the collator needs one.
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize(batch):
        # Padding is left to DataCollatorWithPadding (per batch).  512 is the
        # limit of the previously hard-coded bert-base-cased tokenizer.
        return tokenizer(batch["text"], truncation=True, max_length=512)

    # Step 4: Apply tokenization to dataset
    tokenized_imdb = dataset_imdb.map(tokenize, batched=True)

    # Step 5: Fine-tune the model
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    model.config.pad_token_id = tokenizer.pad_token_id

    # LoraConfig takes `task_type`; get_peft_model then selects the
    # sequence-classification wrapper for the adapter.
    lora_config = LoraConfig(task_type="SEQ_CLS")
    peft_model = get_peft_model(model, lora_config)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        evaluation_strategy="epoch",
        # load_best_model_at_end requires the save and evaluation strategies
        # to match, so checkpoints are written per epoch as well.
        save_strategy="epoch",
        learning_rate=1e-5,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        eval_accumulation_steps=10,
    )

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_imdb["train"],
        eval_dataset=tokenized_imdb["test"],
        compute_metrics=_accuracy,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )

    trainer.train()

    # Step 6: Evaluate the fine-tuned model
    # Predictions come from the trainer itself; both columns are class ids.
    test_output = trainer.predict(tokenized_imdb["test"])
    predictions = np.argmax(test_output.predictions, axis=-1)
    targets = test_output.label_ids
    for record, (target, predicted) in enumerate(zip(targets, predictions), start=1):
        print(f"Record {record} - Actual: {int(target)}, Predicted: {int(predicted)}")

    return "Done"
95
 
96
# Gradio front end: two single-line text fields are passed to process(), and
# the string it returns is shown in the "Completion" box.
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.Textbox(label="Model ID", value="google/gemma-7b", lines=1),
        gr.Textbox(label="Dataset", value="imdb", lines=1),
    ],
    outputs=[gr.Textbox(label="Completion")],
)
demo.launch()