bstraehle committed on
Commit
8cdd9a7
·
verified ·
1 Parent(s): 8b53261

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -26
app.py CHANGED
@@ -32,6 +32,7 @@ def fine_tune_model(base_model_name, dataset_name):
32
 
33
  print("### Dataset")
34
  print(dataset)
 
35
  print("###")
36
 
37
  # Load model
@@ -44,43 +45,46 @@ def fine_tune_model(base_model_name, dataset_name):
44
  print("###")
45
 
46
  # Pre-process dataset
 
47
  def preprocess(examples):
48
- model_inputs = tokenizer(examples["sql_prompt"], text_target=examples["sql"], max_length=512, padding="max_length", truncation=True)
49
  return model_inputs
 
50
  dataset = dataset.map(preprocess, batched=True)
51
 
52
  print("### Pre-processed dataset")
53
  print(dataset)
 
54
  print("###")
55
 
56
  # Split dataset into training and validation sets
57
- train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
58
- test_dataset = dataset["test"].shuffle(seed=42).select(range(100))
59
 
60
  print("### Training dataset")
61
- print(test_dataset)
62
  print("### Validation dataset")
63
  print(test_dataset)
64
  print("###")
65
 
66
  # Configure training arguments
67
  training_args = Seq2SeqTrainingArguments(
68
- output_dir="./results",
69
- logging_dir="./logs",
70
  num_train_epochs=1,
71
- per_device_train_batch_size=16,
72
- per_device_eval_batch_size=64,
73
- eval_strategy="steps",
74
- save_total_limit=2,
75
- save_steps=500,
76
- eval_steps=500,
77
- warmup_steps=500,
78
- weight_decay=0.01,
79
- metric_for_best_model="accuracy",
80
- greater_is_better=True,
81
- load_best_model_at_end=True,
82
- push_to_hub=True,
83
- save_on_each_node=True,
84
  )
85
 
86
  print("### Training arguments")
@@ -93,13 +97,9 @@ def fine_tune_model(base_model_name, dataset_name):
93
  args=training_args,
94
  train_dataset=train_dataset,
95
  eval_dataset=test_dataset,
96
- compute_metrics=lambda pred: {"accuracy": torch.sum(pred.label_ids == pred.predictions.argmax(-1))},
97
  )
98
 
99
- print("### Trainer")
100
- print(trainer)
101
- print("###")
102
-
103
  # Train and save model
104
  #trainer.train()
105
  #trainer.save_model()
@@ -128,8 +128,8 @@ def prompt_model(model_name, system_prompt, user_prompt, sql_schema):
128
 
129
  def load_model(model_name):
130
  model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
131
- tokenizer = AutoTokenizer.from_pretrained(model_name)
132
- tokenizer.pad_token = tokenizer.eos_token
133
 
134
  return model, tokenizer
135
 
 
32
 
33
  print("### Dataset")
34
  print(dataset)
35
+ print(dataset.head())
36
  print("###")
37
 
38
  # Load model
 
45
  print("###")
46
 
47
  # Pre-process dataset
48
+
49
  def preprocess(examples):
50
+ model_inputs = tokenizer(examples["sql_prompt"], text_target=examples["sql"]) #, max_length=512, padding="max_length", truncation=True)
51
  return model_inputs
52
+
53
  dataset = dataset.map(preprocess, batched=True)
54
 
55
  print("### Pre-processed dataset")
56
  print(dataset)
57
+ print(dataset.head())
58
  print("###")
59
 
60
  # Split dataset into training and validation sets
61
+ train_dataset = dataset["train"] #.shuffle(seed=42).select(range(1000))
62
+ test_dataset = dataset["test"] #.shuffle(seed=42).select(range(100))
63
 
64
  print("### Training dataset")
65
+ print(train_dataset)
66
  print("### Validation dataset")
67
  print(test_dataset)
68
  print("###")
69
 
70
  # Configure training arguments
71
  training_args = Seq2SeqTrainingArguments(
72
+ output_dir="./output",
73
+ logging_dir="./logging",
74
  num_train_epochs=1,
75
+ #per_device_train_batch_size=16,
76
+ #per_device_eval_batch_size=64,
77
+ #eval_strategy="steps",
78
+ #save_total_limit=2,
79
+ #save_steps=500,
80
+ #eval_steps=500,
81
+ #warmup_steps=500,
82
+ #weight_decay=0.01,
83
+ #metric_for_best_model="accuracy",
84
+ #greater_is_better=True,
85
+ #load_best_model_at_end=True,
86
+ #push_to_hub=True,
87
+ #save_on_each_node=True,
88
  )
89
 
90
  print("### Training arguments")
 
97
  args=training_args,
98
  train_dataset=train_dataset,
99
  eval_dataset=test_dataset,
100
+ #compute_metrics=lambda pred: {"accuracy": torch.sum(pred.label_ids == pred.predictions.argmax(-1))},
101
  )
102
 
 
 
 
 
103
  # Train and save model
104
  #trainer.train()
105
  #trainer.save_model()
 
128
 
129
  def load_model(model_name):
130
  model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
131
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
132
+ #tokenizer.pad_token = tokenizer.eos_token
133
 
134
  return model, tokenizer
135