winglian committed on
Commit
94f5e41
·
1 Parent(s): 2624bc2

various bugfixes

Browse files
configs/stability_3b.yml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: stabilityai/stablelm-base-alpha-3b
2
+ load_in_8bit: true
3
+ datasets:
4
+ - path: vicgalle/alpaca-gpt4
5
+ type: alpaca
6
+ dataset_prepared_path: last_run_prepared
7
+ val_set_size: 0.04
8
+ adapter:
9
+ lora_model_dir:
10
+ sequence_len: 4096
11
+ lora_r: 8
12
+ lora_alpha: 16
13
+ lora_dropout: 0.05
14
+ lora_target_modules:
15
+ - q_proj
16
+ - v_proj
17
+ lora_fan_in_fan_out: false
18
+ wandb_project: stable-llama-3b
19
+ wandb_watch:
20
+ wandb_run_id:
21
+ wandb_log_model: checkpoint
22
+ output_dir: ./stable-llama-3b
23
+ batch_size: 128
24
+ micro_batch_size: 16
25
+ num_epochs: 1
26
+ learning_rate: 0.00003
27
+ train_on_inputs: false
28
+ group_by_length: false
29
+ bf16: true
30
+ tf32: true
31
+ early_stopping_patience: 3
32
+ resume_from_checkpoint:
33
+ local_rank:
scripts/finetune.py CHANGED
@@ -159,7 +159,7 @@ def train(
159
  cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
160
  cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
161
  choose_device(cfg)
162
- cfg.ddp = cfg.world_size != 1
163
  if cfg.ddp:
164
  cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
165
  cfg.gradient_accumulation_steps = (
 
159
  cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
160
  cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
161
  choose_device(cfg)
162
+ cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
163
  if cfg.ddp:
164
  cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
165
  cfg.gradient_accumulation_steps = (
src/axolotl/datasets.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from typing import List
2
 
3
  import torch
@@ -92,11 +93,14 @@ class ConstantLengthDataset(IterableDataset):
92
  : self.seq_length
93
  ]
94
  labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length]
95
- yield {
96
- "input_ids": input_ids,
97
- "labels": labels,
98
- "attention_mask": attention_mask,
99
- }
 
 
 
100
  buffer = {"input_ids": [], "attention_mask": [], "labels": []}
101
  buffer_len = 0
102
 
 
1
+ import logging
2
  from typing import List
3
 
4
  import torch
 
93
  : self.seq_length
94
  ]
95
  labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length]
96
+ if labels.size() == input_ids.size() and attention_mask.size() == input_ids.size():
97
+ yield {
98
+ "input_ids": input_ids,
99
+ "labels": labels,
100
+ "attention_mask": attention_mask,
101
+ }
102
+ else:
103
+ logging.warning("dropping batch due to tensor size mismatch")
104
  buffer = {"input_ids": [], "attention_mask": [], "labels": []}
105
  buffer_len = 0
106
 
src/axolotl/utils/data.py CHANGED
@@ -65,7 +65,7 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
65
  elif ds_from_hub:
66
  ds = load_dataset(d.path, streaming=True)
67
  else:
68
- raise Exception("unhandled dataset load")
69
 
70
  if d.type == "alpaca":
71
  ds_strategy = AlpacaPromptTokenizingStrategy(
 
65
  elif ds_from_hub:
66
  ds = load_dataset(d.path, streaming=True)
67
  else:
68
+ raise Exception(f"unhandled dataset load for {d.path}")
69
 
70
  if d.type == "alpaca":
71
  ds_strategy = AlpacaPromptTokenizingStrategy(
src/axolotl/utils/models.py CHANGED
@@ -102,13 +102,20 @@ def load_model(
102
  torch_dtype=torch_dtype,
103
  device_map=cfg.device_map,
104
  )
105
- else:
106
  model = getattr(transformers, model_type).from_pretrained(
107
  base_model,
108
  load_in_8bit=cfg.load_in_8bit,
109
  torch_dtype=torch_dtype,
110
  device_map=cfg.device_map,
111
  )
 
 
 
 
 
 
 
112
  except Exception as e:
113
  logging.error(
114
  "Exception raised attempting to load model, retrying with AutoModelForCausalLM"
@@ -148,7 +155,7 @@ def load_model(
148
 
149
  model, lora_config = load_adapter(model, cfg, adapter)
150
 
151
- if cfg.ddp:
152
  model.to(f"cuda:{cfg.local_rank}")
153
 
154
  if cfg.load_4bit:
 
102
  torch_dtype=torch_dtype,
103
  device_map=cfg.device_map,
104
  )
105
+ elif model_type:
106
  model = getattr(transformers, model_type).from_pretrained(
107
  base_model,
108
  load_in_8bit=cfg.load_in_8bit,
109
  torch_dtype=torch_dtype,
110
  device_map=cfg.device_map,
111
  )
112
+ else:
113
+ model = AutoModelForCausalLM.from_pretrained(
114
+ base_model,
115
+ load_in_8bit=cfg.load_in_8bit,
116
+ torch_dtype=torch_dtype,
117
+ device_map=cfg.device_map,
118
+ )
119
  except Exception as e:
120
  logging.error(
121
  "Exception raised attempting to load model, retrying with AutoModelForCausalLM"
 
155
 
156
  model, lora_config = load_adapter(model, cfg, adapter)
157
 
158
+ if cfg.ddp and not load_in_8bit:
159
  model.to(f"cuda:{cfg.local_rank}")
160
 
161
  if cfg.load_4bit:
src/axolotl/utils/trainer.py CHANGED
@@ -94,13 +94,22 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
94
  )
95
  trainer_kwargs["callbacks"] = [early_stop_cb]
96
 
 
 
 
 
 
 
 
97
  trainer = transformers.Trainer(
98
  model=model,
99
  train_dataset=train_dataset,
100
  eval_dataset=eval_dataset,
101
  args=training_args,
102
  data_collator=transformers.DataCollatorForSeq2Seq(
103
- tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
 
 
104
  ),
105
  **trainer_kwargs,
106
  )
 
94
  )
95
  trainer_kwargs["callbacks"] = [early_stop_cb]
96
 
97
+ data_collator_kwargs = {
98
+ "padding": True,
99
+ }
100
+ if cfg.collator_pad_to_longest:
101
+ data_collator_kwargs["padding"] = "longest"
102
+ else:
103
+ data_collator_kwargs["pad_to_multiple_of"] = 8
104
  trainer = transformers.Trainer(
105
  model=model,
106
  train_dataset=train_dataset,
107
  eval_dataset=eval_dataset,
108
  args=training_args,
109
  data_collator=transformers.DataCollatorForSeq2Seq(
110
+ tokenizer,
111
+ return_tensors="pt",
112
+ **data_collator_kwargs,
113
  ),
114
  **trainer_kwargs,
115
  )