winglian committed on
Commit
949a27b
·
1 Parent(s): f2a2029

more fixes and prep for llama training

Browse files
configs/llama_65B_alpaca.yml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: decapoda-research/llama-65b-hf
2
+ model_type: LlamaForCausalLM
3
+ tokenizer_type: LlamaTokenizer
4
+ load_in_8bit: true
5
+ datasets:
6
+ - path: data/alpaca_data_gpt4.jsonl
7
+ type: alpaca
8
+ - path: data/vicuna_cleaned.jsonl
9
+ type: sharegpt
10
+ - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
11
+ type: gpteacher
12
+ - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
+ type: gpteacher
14
+ val_set_size: 0.04
15
+ adapter: lora
16
+ lora_model_dir:
17
+ sequence_len: 2048
18
+ lora_r: 8
19
+ lora_alpha: 16
20
+ lora_dropout: 0.05
21
+ lora_target_modules:
22
+ - q_proj
23
+ - w_proj
24
+ lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
25
+ wandb_project: llama-65b-lora
26
+ wandb_watch:
27
+ wandb_run_name:
28
+ wandb_log_model: checkpoint
29
+ output_dir: ./lora-llama-alpaca
30
+ batch_size: 128
31
+ micro_batch_size: 16
32
+ num_epochs: 5
33
+ learning_rate: 0.00003
34
+ train_on_inputs: false
35
+ group_by_length: false
36
+ bf16: True
37
+ tf32: True
38
+ resume_from_checkpoint:
39
+ local_rank:
40
+ deepspeed:
configs/pythia_1_2B_alpaca.yml CHANGED
@@ -13,22 +13,24 @@ datasets:
13
  type: gpteacher
14
  val_set_size: 0.05
15
  adapter: lora
 
16
  sequence_len: 2048
17
  lora_r: 8
18
  lora_alpha: 32
19
  lora_dropout: 0.05
20
  lora_target_modules:
21
  - query_key_value
 
22
  lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
23
  wandb_project: pythia-1.4b-lora
24
  wandb_watch:
25
  wandb_run_name:
26
  wandb_log_model: checkpoint
27
  output_dir: ./lora-alpaca
28
- batch_size: 32
29
  micro_batch_size: 4
30
  num_epochs: 5
31
- learning_rate: 0.0003
32
  train_on_inputs: false
33
  group_by_length: false
34
  bf16: True
 
13
  type: gpteacher
14
  val_set_size: 0.05
15
  adapter: lora
16
+ lora_model_dir:
17
  sequence_len: 2048
18
  lora_r: 8
19
  lora_alpha: 32
20
  lora_dropout: 0.05
21
  lora_target_modules:
22
  - query_key_value
23
+ # - xxx
24
  lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
25
  wandb_project: pythia-1.4b-lora
26
  wandb_watch:
27
  wandb_run_name:
28
  wandb_log_model: checkpoint
29
  output_dir: ./lora-alpaca
30
+ batch_size: 48
31
  micro_batch_size: 4
32
  num_epochs: 5
33
+ learning_rate: 0.00001
34
  train_on_inputs: false
35
  group_by_length: false
36
  bf16: True
scripts/finetune.py CHANGED
@@ -1,5 +1,6 @@
1
  import math
2
  import os
 
3
  import signal
4
  import sys
5
  from pathlib import Path
@@ -15,7 +16,7 @@ from peft import (
15
  LoraConfig,
16
  get_peft_model,
17
  prepare_model_for_int8_training,
18
- get_peft_model_state_dict,
19
  )
20
  from torch import nn
21
  from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -94,12 +95,16 @@ def load_model(base_model, model_type, tokenizer_type, cfg, adapter="lora"):
94
  bias="none",
95
  task_type="CAUSAL_LM",
96
  )
97
- model = get_peft_model(model, lora_config)
 
 
 
 
 
98
  if cfg.ddp:
99
  model.to(f"cuda:{cfg.local_rank}")
100
 
101
  # TODO resume_from_checkpoint handling
102
-
103
  model.print_trainable_parameters()
104
  return model, tokenizer, lora_config
105
 
@@ -152,6 +157,26 @@ def check_dataset_labels(dataset, tokenizer):
152
  print("\n\n\n")
153
 
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def choose_config(path: Path):
156
  yaml_files = [file for file in path.glob("*.yml")]
157
 
@@ -180,7 +205,7 @@ def train(
180
  config: Path = Path("configs/"),
181
  **kwargs,
182
  ):
183
- if config.is_dir():
184
  config = choose_config(config)
185
 
186
  # load the config from the yaml file
@@ -214,41 +239,55 @@ def train(
214
  model, tokenizer, lora_config = load_model(
215
  cfg.base_model, cfg.model_type, cfg.tokenizer_type, cfg, adapter=cfg.adapter
216
  )
 
 
 
 
 
217
  datasets = []
218
- for d in cfg.datasets:
219
- ds: IterableDataset = load_dataset(
220
- "json", data_files=d.path, streaming=True, split=None
221
- )
222
- if d.type == "alpaca":
223
- ds_strategy = AlpacaPromptTokenizingStrategy(
224
- AlpacaPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
225
- )
226
- ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
227
- datasets.append(ds_wrapper)
228
- elif d.type == "gpteacher":
229
- ds_strategy = GPTeacherPromptTokenizingStrategy(
230
- GPTeacherPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
231
- )
232
- ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
233
- datasets.append(ds_wrapper)
234
- elif d.type == "sharegpt":
235
- ds_strategy = ShareGPTPromptTokenizingStrategy(
236
- ShareGPTPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
237
  )
238
- ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
239
- datasets.append(ds_wrapper)
240
- constant_len_dataset = ConstantLengthDataset(
241
- tokenizer, datasets, seq_length=cfg.sequence_len
242
- )
243
- constant_len_dataset = Dataset.from_list(
244
- [_ for _ in constant_len_dataset]
245
- ).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
246
-
247
- print(constant_len_dataset)
248
- train_dataset = constant_len_dataset["train"]
249
- eval_dataset = constant_len_dataset["test"]
250
 
251
- # check_dataset_labels(eval_dataset, tokenizer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  total_num_steps = int(
254
  math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
 
1
  import math
2
  import os
3
+ import random
4
  import signal
5
  import sys
6
  from pathlib import Path
 
16
  LoraConfig,
17
  get_peft_model,
18
  prepare_model_for_int8_training,
19
+ get_peft_model_state_dict, PeftModel,
20
  )
21
  from torch import nn
22
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
95
  bias="none",
96
  task_type="CAUSAL_LM",
97
  )
98
+
99
+ if cfg.lora_model_dir:
100
+ model = PeftModel.from_pretrained(model, cfg.lora_model_dir, device_map = cfg.device_map, torch_dtype=torch.float16)
101
+ else:
102
+ model = get_peft_model(model, lora_config)
103
+
104
  if cfg.ddp:
105
  model.to(f"cuda:{cfg.local_rank}")
106
 
107
  # TODO resume_from_checkpoint handling
 
108
  model.print_trainable_parameters()
109
  return model, tokenizer, lora_config
110
 
 
157
  print("\n\n\n")
158
 
159
 
160
+ def do_inference(cfg, model, tokenizer):
161
+ instruction = "Tell me a joke about dromedaries."
162
+ input = ""
163
+ prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format(instruction=instruction, input=input)
164
+ batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
165
+ with torch.no_grad():
166
+ generated = model.generate(inputs=batch["input_ids"],
167
+ do_sample=True, use_cache=True,
168
+ repetition_penalty=1.1,
169
+ max_new_tokens=50,
170
+ temperature=0.9,
171
+ top_p=0.95,
172
+ top_k=40,
173
+ return_dict_in_generate=True,
174
+ output_attentions=False,
175
+ output_hidden_states=False,
176
+ output_scores=False)
177
+ print(tokenizer.decode(generated['sequences'].cpu().tolist()[0]))
178
+
179
+
180
  def choose_config(path: Path):
181
  yaml_files = [file for file in path.glob("*.yml")]
182
 
 
205
  config: Path = Path("configs/"),
206
  **kwargs,
207
  ):
208
+ if Path(config).is_dir():
209
  config = choose_config(config)
210
 
211
  # load the config from the yaml file
 
239
  model, tokenizer, lora_config = load_model(
240
  cfg.base_model, cfg.model_type, cfg.tokenizer_type, cfg, adapter=cfg.adapter
241
  )
242
+
243
+ if "inference" in kwargs:
244
+ do_inference(cfg, model, tokenizer)
245
+ return
246
+
247
  datasets = []
248
+ if len(cfg.datasets) == 1 and cfg.datasets[0].type == "arrow":
249
+ dataset = load_dataset(cfg.datasets[0].path, split="train")
250
+ else:
251
+ for d in cfg.datasets:
252
+ ds: IterableDataset = load_dataset(
253
+ "json", data_files=d.path, streaming=True, split=None
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  )
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
+ if d.type == "alpaca":
257
+ ds_strategy = AlpacaPromptTokenizingStrategy(
258
+ AlpacaPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
259
+ )
260
+ ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
261
+ datasets.append(ds_wrapper)
262
+ elif d.type == "gpteacher":
263
+ ds_strategy = GPTeacherPromptTokenizingStrategy(
264
+ GPTeacherPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
265
+ )
266
+ ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
267
+ datasets.append(ds_wrapper)
268
+ elif d.type == "sharegpt":
269
+ ds_strategy = ShareGPTPromptTokenizingStrategy(
270
+ ShareGPTPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
271
+ )
272
+ ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
273
+ datasets.append(ds_wrapper)
274
+ constant_len_dataset = ConstantLengthDataset(
275
+ tokenizer, datasets, seq_length=cfg.sequence_len
276
+ )
277
+ dataset = Dataset.from_list(
278
+ [_ for _ in constant_len_dataset]
279
+ ).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
280
+ dataset.save_to_disk("data/last_run")
281
+ print(dataset)
282
+
283
+ train_dataset = dataset["train"]
284
+ eval_dataset = dataset["test"]
285
+
286
+ if cfg.debug:
287
+ check_dataset_labels(
288
+ train_dataset.select([random.randrange(0, len(train_dataset) - 1)]),
289
+ tokenizer,
290
+ )
291
 
292
  total_num_steps = int(
293
  math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)