bstraehle committed (verified)
Commit a70f9f8 · 1 Parent(s): 33efb94

Update app.py

Files changed (1):
  1. app.py +9 -334
app.py CHANGED
@@ -1,33 +1,13 @@
-# https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/sample_finetune.py
+# https://www.philschmid.de/fine-tune-llms-in-2024-with-trl#3-create-and-prepare-the-dataset
 import gradio as gr
-import os
-#import os, torch
-#from datasets import load_dataset
-#from huggingface_hub import HfApi, login
+import os, torch
+from datasets import load_dataset
+from huggingface_hub import HfApi, login
 #from peft import AutoPeftModelForCausalLM, LoraConfig
 #from random import randint
 #from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
 #from trl import SFTTrainer, setup_chat_format
 
-#import datasets, sys, logging, torch, transformers
-#from datasets import load_dataset
-#from peft import LoraConfig
-#from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
-#from trl import SFTTrainer
-
-import sys
-import logging
-
-import datasets
-from datasets import load_dataset
-from peft import LoraConfig
-import torch
-import transformers
-from trl import SFTTrainer
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
-
-# Fine-tune on NVidia 4xL4 (sleep after 10 hours)
-
 hf_profile = "bstraehle"
 
 action_1 = "Fine-tune pre-trained model"
@@ -37,7 +17,7 @@ system_prompt = "You are a text to SQL query translator. Given a question in Eng
 user_prompt = "What is the total trade value and average price for each trader and stock in the trade_history table?"
 schema = "CREATE TABLE trade_history (id INT, trader_id INT, stock VARCHAR(255), price DECIMAL(5,2), quantity INT, trade_time TIMESTAMP);"
 
-base_model_id = "microsoft/Phi-3-mini-4k-instruct"
+base_model_id = "codellama/CodeLlama-7b-hf"
 dataset = "b-mc2/sql-create-context"
 
 def prompt_model(model_id, system_prompt, user_prompt, schema):
@@ -54,219 +34,12 @@ def prompt_model(model_id, system_prompt, user_prompt, schema):
     output = pipe(messages)
     result = output[0]["generated_text"][-1]["content"]
     print(result)
-    return result
-
-    # peft_model_id = "./code-llama-7b-text-to-sql"
-    # # peft_model_id = args.output_dir
-
-    # # Load Model with PEFT adapter
-    # model = AutoPeftModelForCausalLM.from_pretrained(
-    #     peft_model_id,
-    #     device_map="auto",
-    #     torch_dtype=torch.float16
-    # )
-    # tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
-    # # load into pipeline
-    # pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-
-    ###
-
-    # eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
-    # rand_idx = randint(0, len(eval_dataset))
-
-    # # Test on sample
-    # prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
-    # outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
-
-    # print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
-    # print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
-    # print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
+    return result
 
 def fine_tune_model(base_model_id, dataset):
-    test(base_model_id, dataset)
-    ##tokenizer = download_model(base_model_id)
-    #prepare_dataset(dataset)
-    #train_model(base_model_id)
-    ##fine_tuned_model_id = upload_model(base_model_id, tokenizer)
-    return "fine_tuned_model_id"
-
-def create_conversation(sample):
-    return {
-        "messages": [
-            {"role": "system", "content": system_prompt.format(schema=sample["context"])},
-            {"role": "user", "content": sample["question"]},
-            {"role": "assistant", "content": sample["answer"]}
-        ]
-    }
-
-# Define the formatting function for the prompts
-def formatting_prompts_func(examples):
-    convos = examples["conversations"]
-    texts = []
-    mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
-    end_mapper = {"system": "", "human": "", "gpt": ""}
-    for convo in convos:
-        text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
-        texts.append(f"{text}{tokenizer.eos_token}")
-    return {"text": texts}
-
-def test(base_model_id, dataset):
-    logger = logging.getLogger(__name__)
-
-
-    ###################
-    # Hyper-parameters
-    ###################
-    training_config = {
-        "bf16": True,
-        "do_eval": False,
-        "learning_rate": 5.0e-06,
-        "log_level": "info",
-        "logging_steps": 20,
-        "logging_strategy": "steps",
-        "lr_scheduler_type": "cosine",
-        "num_train_epochs": 1,
-        "max_steps": -1,
-        "output_dir": "./checkpoint_dir",
-        "overwrite_output_dir": True,
-        "per_device_eval_batch_size": 4,
-        "per_device_train_batch_size": 4,
-        "remove_unused_columns": True,
-        "save_steps": 100,
-        "save_total_limit": 1,
-        "seed": 0,
-        "gradient_checkpointing": True,
-        "gradient_checkpointing_kwargs":{"use_reentrant": False},
-        "gradient_accumulation_steps": 1,
-        "warmup_ratio": 0.2,
-    }
-
-    peft_config = {
-        "r": 16,
-        "lora_alpha": 32,
-        "lora_dropout": 0.05,
-        "bias": "none",
-        "task_type": "CAUSAL_LM",
-        "target_modules": "all-linear",
-        "modules_to_save": None,
-    }
-    train_conf = TrainingArguments(**training_config)
-    peft_conf = LoraConfig(**peft_config)
-
-
-    ###############
-    # Setup logging
-    ###############
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-        handlers=[logging.StreamHandler(sys.stdout)],
-    )
-    log_level = train_conf.get_process_log_level()
-    logger.setLevel(log_level)
-    datasets.utils.logging.set_verbosity(log_level)
-    transformers.utils.logging.set_verbosity(log_level)
-    transformers.utils.logging.enable_default_handler()
-    transformers.utils.logging.enable_explicit_format()
-
-    # Log on each process a small summary
-    logger.warning(
-        f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
-        + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
-    )
-    logger.info(f"Training/evaluation parameters {train_conf}")
-    logger.info(f"PEFT parameters {peft_conf}")
-
-
-    ################
-    # Model Loading
-    ################
-    checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
-    # checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
-    model_kwargs = dict(
-        use_cache=False,
-        trust_remote_code=True,
-        #attn_implementation="flash_attention_2", # loading the model with flash-attenstion support
-        torch_dtype=torch.bfloat16,
-        device_map=None
-    )
-    model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
-    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
-    tokenizer.model_max_length = 2048
-    tokenizer.pad_token = tokenizer.unk_token # use unk rather than eos token to prevent endless generation
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
-    tokenizer.padding_side = 'right'
-
-
-    ##################
-    # Data Processing
-    ##################
-    def apply_chat_template(
-        example,
-        tokenizer,
-    ):
-        messages = example["messages"]
-        example["text"] = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=False)
-        return example
-
-    raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
-    train_dataset = raw_dataset["train_sft"]
-    test_dataset = raw_dataset["test_sft"]
-    column_names = list(train_dataset.features)
-
-    processed_train_dataset = train_dataset.map(
-        apply_chat_template,
-        fn_kwargs={"tokenizer": tokenizer},
-        num_proc=10,
-        remove_columns=column_names,
-        desc="Applying chat template to train_sft",
-    )
-
-    processed_test_dataset = test_dataset.map(
-        apply_chat_template,
-        fn_kwargs={"tokenizer": tokenizer},
-        num_proc=10,
-        remove_columns=column_names,
-        desc="Applying chat template to test_sft",
-    )
-
-
-    ###########
-    # Training
-    ###########
-    trainer = SFTTrainer(
-        model=model,
-        args=train_conf,
-        peft_config=peft_conf,
-        train_dataset=processed_train_dataset,
-        eval_dataset=processed_test_dataset,
-        max_seq_length=2048,
-        dataset_text_field="text",
-        tokenizer=tokenizer,
-        packing=True
-    )
-    train_result = trainer.train()
-    metrics = train_result.metrics
-    trainer.log_metrics("train", metrics)
-    trainer.save_metrics("train", metrics)
-    trainer.save_state()
-
-
-    #############
-    # Evaluation
-    #############
-    tokenizer.padding_side = 'left'
-    metrics = trainer.evaluate()
-    metrics["eval_samples"] = len(processed_test_dataset)
-    trainer.log_metrics("eval", metrics)
-    trainer.save_metrics("eval", metrics)
-
-
-    # ############
-    # # Save model
-    # ############
-    trainer.save_model(train_conf.output_dir)
+    tokenizer = download_model(base_model_id)
+    fine_tuned_model_id = upload_model(base_model_id, tokenizer)
+    return fine_tuned_model_id
 
 def download_model(base_model_id):
     tokenizer = AutoTokenizer.from_pretrained(base_model_id)
@@ -274,104 +47,6 @@ def download_model(base_model_id):
     model.save_pretrained(base_model_id)
     return tokenizer
 
-def prepare_dataset(dataset):
-    dataset = load_dataset(dataset, split="train")
-    dataset = dataset.shuffle().select(range(12500))
-
-    # Convert dataset to OAI messages
-    dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
-    # split dataset into 10,000 training samples and 2,500 test samples
-    dataset = dataset.train_test_split(test_size=2500/12500)
-
-    print(dataset["train"][345]["messages"])
-
-    # save datasets to disk
-    dataset["train"].to_json("train_dataset.json", orient="records")
-    dataset["test"].to_json("test_dataset.json", orient="records")
-    ###
-
-def train_model(model_id):
-    print("111")
-    dataset = load_dataset("json", data_files="train_dataset.json", split="train")
-
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
-    )
-
-    print("222")
-    # Load model and tokenizer
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        device_map="auto",
-        #attn_implementation="flash_attention_2",
-        torch_dtype=torch.bfloat16,
-        quantization_config=bnb_config
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.padding_side = 'right' # to prevent warnings
-
-    print("333")
-    # # set chat template to OAI chatML, remove if you start from a fine-tuned model
-    model, tokenizer = setup_chat_format(model, tokenizer)
-
-    peft_config = LoraConfig(
-        lora_alpha=128,
-        lora_dropout=0.05,
-        r=256,
-        bias="none",
-        target_modules="all-linear",
-        task_type="CAUSAL_LM",
-    )
-
-    print("444")
-    args = TrainingArguments(
-        output_dir="code-llama-7b-text-to-sql", # directory to save and repository id
-        num_train_epochs=3, # number of training epochs
-        per_device_train_batch_size=3, # batch size per device during training
-        gradient_accumulation_steps=2, # number of steps before performing a backward/update pass
-        gradient_checkpointing=True, # use gradient checkpointing to save memory
-        optim="adamw_torch_fused", # use fused adamw optimizer
-        logging_steps=10, # log every 10 steps
-        save_strategy="epoch", # save checkpoint every epoch
-        learning_rate=2e-4, # learning rate, based on QLoRA paper
-        bf16=True, # use bfloat16 precision
-        tf32=True, # use tf32 precision
-        max_grad_norm=0.3, # max gradient norm based on QLoRA paper
-        warmup_ratio=0.03, # warmup ratio based on QLoRA paper
-        lr_scheduler_type="constant", # use constant learning rate scheduler
-        push_to_hub=True, # push model to hub
-        report_to="tensorboard", # report metrics to tensorboard
-    )
-
-    max_seq_length = 3072 # max sequence length for model and packing of the dataset
-
-    print("555")
-    trainer = SFTTrainer(
-        model=model,
-        args=args,
-        train_dataset=dataset,
-        peft_config=peft_config,
-        max_seq_length=max_seq_length,
-        tokenizer=tokenizer,
-        packing=True,
-        dataset_kwargs={
-            "add_special_tokens": False, # We template with special tokens
-            "append_concat_token": False, # No need to add additional separator token
-        }
-    )
-
-    print("666")
-    # start training, the model will be automatically saved to the hub and the output directory
-    trainer.train()
-
-    print("777")
-    # save model
-    trainer.save_model()
-
-    del model
-    del trainer
-    torch.cuda.empty_cache()
-
 def upload_model(base_model_id, tokenizer):
     fine_tuned_model_id = replace_hf_profile(base_model_id)
     login(token=os.environ["HF_TOKEN"])
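The net effect of this commit is that fine_tune_model no longer trains anything: it downloads the base checkpoint and re-publishes it under the hf_profile namespace. Below is a minimal, self-contained sketch of that flow. The elided model-loading line in download_model, the body of upload_model beyond the two lines shown, and the replace_hf_profile helper are not part of the hunks above, so those pieces are assumptions rather than the file's exact code.

# Sketch only: replace_hf_profile and the push_to_hub upload path are assumed,
# since the diff does not show their full bodies.
import os

from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

hf_profile = "bstraehle"
base_model_id = "codellama/CodeLlama-7b-hf"

def replace_hf_profile(model_id):
    # Assumed helper: keep the model name, swap the namespace to hf_profile.
    return f"{hf_profile}/{model_id.split('/')[-1]}"

def download_model(model_id):
    # The hunk shows the tokenizer load and model.save_pretrained; the model
    # load in between is assumed to use AutoModelForCausalLM.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.save_pretrained(model_id)
    return tokenizer

def upload_model(model_id, tokenizer):
    # Assumed upload path: authenticate with HF_TOKEN, then push the locally
    # saved weights and tokenizer to the user's namespace.
    repo_id = replace_hf_profile(model_id)
    login(token=os.environ["HF_TOKEN"])
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)
    return repo_id

def fine_tune_model(model_id, dataset):
    # Net effect of the commit: download, then re-upload; no training step.
    tokenizer = download_model(model_id)
    return upload_model(model_id, tokenizer)

Under the assumed replace_hf_profile above, fine_tune_model(base_model_id, dataset) would return bstraehle/CodeLlama-7b-hf even though no fine-tuning has actually run.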
 
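For context, the prompt_model lines kept by this commit depend on a chat-style transformers text-generation pipeline, where output[0]["generated_text"][-1]["content"] is the assistant's reply. The sketch below shows one plausible wiring; the pipeline construction and the messages list are outside the shown hunks, and the full system prompt string is truncated in the hunk header, so both are assumptions.

# Sketch only: pipe and messages are assumed; the diff shows only the
# last four lines of prompt_model.
from transformers import pipeline

def prompt_model(model_id, system_prompt, user_prompt, schema):
    # Assumed construction of the chat pipeline for the selected model.
    pipe = pipeline("text-generation", model=model_id)
    # system_prompt is expected to contain a {schema} placeholder, matching the
    # system_prompt.format(schema=...) pattern used elsewhere in the file.
    messages = [
        {"role": "system", "content": system_prompt.format(schema=schema)},
        {"role": "user", "content": user_prompt},
    ]
    # The four lines below correspond to the context lines shown in the diff.
    output = pipe(messages)
    result = output[0]["generated_text"][-1]["content"]
    print(result)
    return result

Calling prompt_model(base_model_id, system_prompt, user_prompt, schema) from the app's Gradio handler would then return the generated SQL string.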