winglian committed on
Commit
71d600f
·
unverified ·
2 Parent(s): e07bd8a 4fd0c2d

Merge branch 'main' into winglian-patch-1

Browse files
.gitignore CHANGED
@@ -1,4 +1,163 @@
1
  **/axolotl.egg-info
2
- **/__pycache__
3
- .idea
4
  configs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  **/axolotl.egg-info
 
 
2
  configs
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ #poetry.lock
106
+
107
+ # pdm
108
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
+ #pdm.lock
110
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
+ # in version control.
112
+ # https://pdm.fming.dev/#use-with-ide
113
+ .pdm.toml
114
+
115
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116
+ __pypackages__/
117
+
118
+ # Celery stuff
119
+ celerybeat-schedule
120
+ celerybeat.pid
121
+
122
+ # SageMath parsed files
123
+ *.sage.py
124
+
125
+ # Environments
126
+ .env
127
+ .venv
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ .idea/
README.md CHANGED
@@ -97,6 +97,18 @@ Have dataset(s) in one of the following format (JSONL recommended):
97
  ```json
98
  {"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
99
  ```
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  > Have some new format to propose? Check if it's already defined in [data.py](src/axolotl/utils/data.py) in `dev` branch!
102
 
@@ -124,17 +136,17 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
124
 
125
  - loading
126
  ```yaml
127
- load_4bit: true
128
  load_in_8bit: true
129
- bf16: true
130
  fp16: true
131
- tf32: true
132
  ```
133
  Note: Repo does not do 4-bit quantization.
134
 
135
  - lora
136
  ```yaml
137
- adapter: lora # blank for full finetune
138
  lora_r: 8
139
  lora_alpha: 16
140
  lora_dropout: 0.05
@@ -163,28 +175,32 @@ tokenizer_type: AutoTokenizer
163
  # Trust remote code for untrusted source
164
  trust_remote_code:
165
 
166
- # whether you are training a 4-bit quantized model
167
  load_4bit: true
168
  gptq_groupsize: 128 # group size
169
  gptq_model_v1: false # v1 or v2
170
 
171
  # this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
172
  load_in_8bit: true
 
 
173
 
174
  # Use CUDA bf16
175
- bf16: true
176
  # Use CUDA fp16
177
  fp16: true
178
  # Use CUDA tf32
179
- tf32: true
180
 
181
  # a list of one or more datasets to finetune the model with
182
  datasets:
183
  # this can be either a hf dataset, or relative path
184
  - path: vicgalle/alpaca-gpt4
185
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
186
- type: alpaca
187
  data_files: # path to source data files
 
 
188
 
189
  # axolotl attempts to save the dataset as an arrow after packing the data together so
190
  # subsequent training attempts load faster, relative path
@@ -201,7 +217,7 @@ sequence_len: 2048
201
  # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
202
  max_packed_sequence_len: 1024
203
 
204
- # if you want to use lora, leave blank to train all parameters in original model
205
  adapter: lora
206
  # if you already have a lora model trained that you want to load, put that here
207
  # lora hyperparameters
@@ -224,6 +240,7 @@ lora_out_dir:
224
  lora_fan_in_fan_out: false
225
 
226
  # wandb configuration if you're using it
 
227
  wandb_project:
228
  wandb_watch:
229
  wandb_run_id:
@@ -252,8 +269,18 @@ gradient_checkpointing: false
252
  # stop training after this many evaluation losses have increased in a row
253
  # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
254
  early_stopping_patience: 3
255
- # specify a scheduler to use with the optimizer. only one_cycle is supported currently
256
- lr_scheduler:
 
 
 
 
 
 
 
 
 
 
257
  # specify optimizer
258
  optimizer:
259
  # specify weight decay
@@ -262,7 +289,7 @@ weight_decay:
262
  # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
263
  xformers_attention:
264
  # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
265
- flash_attention:
266
 
267
  # resume from a specific checkpoint dir
268
  resume_from_checkpoint:
@@ -288,11 +315,17 @@ fsdp_config:
288
  # Deepspeed
289
  deepspeed:
290
 
291
- # TODO
292
  torchdistx_path:
293
 
 
 
 
294
  # Debug mode
295
  debug:
 
 
 
296
  ```
297
 
298
  </details>
@@ -317,12 +350,16 @@ accelerate launch scripts/finetune.py configs/your_config.yml
317
 
318
  ### Inference
319
 
320
- Add `--inference` flag to train command above
321
 
322
- If you are inferencing a pretrained LORA, pass
323
- ```bash
324
- --lora_model_dir ./completed-model
325
- ```
 
 
 
 
326
 
327
  ### Merge LORA to base
328
 
@@ -341,8 +378,11 @@ Please reduce any below
341
  - `eval_batch_size`
342
  - `sequence_len`
343
 
344
-
345
- ## Need help
 
 
 
346
 
347
  Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
348
 
 
97
  ```json
98
  {"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
99
  ```
100
+ - `explainchoice`: question, choices, (solution OR explanation)
101
+ ```json
102
+ {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
103
+ ```
104
+ - `concisechoice`: question, choices, (solution OR explanation)
105
+ ```json
106
+ {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
107
+ ```
108
+ - `summarizetldr`: article and summary
109
+ ```json
110
+ {"article": "...", "summary": "..."}
111
+ ```
112
 
113
  > Have some new format to propose? Check if it's already defined in [data.py](src/axolotl/utils/data.py) in `dev` branch!
114
 
 
136
 
137
  - loading
138
  ```yaml
139
+ load_in_4bit: true
140
  load_in_8bit: true
141
+ bf16: true # requires >=ampere
142
  fp16: true
143
+ tf32: true # requires >=ampere
144
  ```
145
  Note: Repo does not do 4-bit quantization.
146
 
147
  - lora
148
  ```yaml
149
+ adapter: lora # qlora or leave blank for full finetune
150
  lora_r: 8
151
  lora_alpha: 16
152
  lora_dropout: 0.05
 
175
  # Trust remote code for untrusted source
176
  trust_remote_code:
177
 
178
+ # whether you are training a 4-bit GPTQ quantized model
179
  load_4bit: true
180
  gptq_groupsize: 128 # group size
181
  gptq_model_v1: false # v1 or v2
182
 
183
  # this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
184
  load_in_8bit: true
185
+ # use bitsandbytes 4 bit
186
+ load_in_4bit:
187
 
188
  # Use CUDA bf16
189
+ bf16: true # bool or 'full' for `bf16_full_eval`. requires >=ampere
190
  # Use CUDA fp16
191
  fp16: true
192
  # Use CUDA tf32
193
+ tf32: true # requires >=ampere
194
 
195
  # a list of one or more datasets to finetune the model with
196
  datasets:
197
  # this can be either a hf dataset, or relative path
198
  - path: vicgalle/alpaca-gpt4
199
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
200
+ type: alpaca # format OR format:prompt_style (chat/instruct)
201
  data_files: # path to source data files
202
+ shards: # true if use subset data. make sure to set `shards` param also
203
+ shards: # number of shards to split dataset into
204
 
205
  # axolotl attempts to save the dataset as an arrow after packing the data together so
206
  # subsequent training attempts load faster, relative path
 
217
  # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
218
  max_packed_sequence_len: 1024
219
 
220
+ # set to 'lora' or 'qlora' to train an adapter, or leave blank to train all parameters in the original model
221
  adapter: lora
222
  # if you already have a lora model trained that you want to load, put that here
223
  # lora hyperparameters
 
240
  lora_fan_in_fan_out: false
241
 
242
  # wandb configuration if you're using it
243
+ wandb_mode:
244
  wandb_project:
245
  wandb_watch:
246
  wandb_run_id:
 
269
  # stop training after this many evaluation losses have increased in a row
270
  # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
271
  early_stopping_patience: 3
272
+
273
+ # specify a scheduler and kwargs to use with the optimizer
274
+ lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
275
+ lr_scheduler_kwargs:
276
+
277
+ # for one_cycle optim
278
+ lr_div_factor: # learning rate div factor
279
+
280
+ # for log_sweep optim
281
+ log_sweep_min_lr:
282
+ log_sweep_max_lr:
283
+
284
  # specify optimizer
285
  optimizer:
286
  # specify weight decay
 
289
  # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
290
  xformers_attention:
291
  # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
292
+ flash_attention: # requires a100 for llama
293
 
294
  # resume from a specific checkpoint dir
295
  resume_from_checkpoint:
 
315
  # Deepspeed
316
  deepspeed:
317
 
318
+ # Path to torchdistx, used by the 'adamw_anyprecision' optimizer
319
  torchdistx_path:
320
 
321
+ # Set padding for data collator to 'longest'
322
+ collator_pad_to_longest:
323
+
324
  # Debug mode
325
  debug:
326
+
327
+ # Seed
328
+ seed:
329
  ```
330
 
331
  </details>
 
350
 
351
  ### Inference
352
 
353
+ Pass the appropriate flag to the train command:
354
 
355
+ - Pretrained LORA:
356
+ ```bash
357
+ --inference --lora_model_dir ./completed-model
358
+ ```
359
+ - Full weights finetune:
360
+ ```bash
361
+ --inference --base_model ./completed-model
362
+ ```
363
 
364
  ### Merge LORA to base
365
 
 
378
  - `eval_batch_size`
379
  - `sequence_len`
380
 
381
+ > RuntimeError: expected scalar type Float but found Half
382
+
383
+ Try setting `fp16: true`
384
+
385
+ ## Need help? 🙋‍♂️
386
 
387
  Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
388
 
docker/Dockerfile-base CHANGED
@@ -43,11 +43,11 @@ RUN git clone https://github.com/HazyResearch/flash-attention.git && \
43
  python3 setup.py bdist_wheel && \
44
  cd csrc/fused_dense_lib && \
45
  python3 setup.py bdist_wheel && \
46
- cd csrc/xentropy && \
47
  python3 setup.py bdist_wheel && \
48
- cd csrc/rotary && \
49
  python3 setup.py bdist_wheel && \
50
- cd csrc/layer_norm && \
51
  python3 setup.py bdist_wheel
52
 
53
  FROM base-builder AS deepspeed-builder
 
43
  python3 setup.py bdist_wheel && \
44
  cd csrc/fused_dense_lib && \
45
  python3 setup.py bdist_wheel && \
46
+ cd ../xentropy && \
47
  python3 setup.py bdist_wheel && \
48
+ cd ../rotary && \
49
  python3 setup.py bdist_wheel && \
50
+ cd ../layer_norm && \
51
  python3 setup.py bdist_wheel
52
 
53
  FROM base-builder AS deepspeed-builder
examples/lora-openllama-3b/config.yml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: openlm-research/open_llama_3b_600bt_preview
2
+ base_model_config: openlm-research/open_llama_3b_600bt_preview
3
+ model_type: LlamaForCausalLM
4
+ tokenizer_type: LlamaTokenizer
5
+ load_in_8bit: true
6
+ load_in_4bit: false
7
+ strict: false
8
+ push_dataset_to_hub:
9
+ datasets:
10
+ - path: teknium/GPT4-LLM-Cleaned
11
+ type: alpaca
12
+ dataset_prepared_path: last_run_prepared
13
+ val_set_size: 0.02
14
+ adapter: lora
15
+ lora_model_dir:
16
+ sequence_len: 256
17
+ max_packed_sequence_len:
18
+ lora_r: 8
19
+ lora_alpha: 16
20
+ lora_dropout: 0.0
21
+ lora_target_modules:
22
+ - gate_proj
23
+ - down_proj
24
+ - up_proj
25
+ - q_proj
26
+ - v_proj
27
+ - k_proj
28
+ - o_proj
29
+ lora_fan_in_fan_out:
30
+ wandb_project:
31
+ wandb_watch:
32
+ wandb_run_id:
33
+ wandb_log_model:
34
+ output_dir: ./lora-out
35
+ batch_size: 16
36
+ micro_batch_size: 4
37
+ num_epochs: 3
38
+ optimizer: adamw_bnb_8bit
39
+ torchdistx_path:
40
+ lr_scheduler: cosine
41
+ learning_rate: 0.0002
42
+ train_on_inputs: false
43
+ group_by_length: false
44
+ bf16: false
45
+ fp16: true
46
+ tf32: false
47
+ gradient_checkpointing: true
48
+ early_stopping_patience:
49
+ resume_from_checkpoint:
50
+ local_rank:
51
+ logging_steps: 1
52
+ xformers_attention: true
53
+ flash_attention:
54
+ gptq_groupsize:
55
+ gptq_model_v1:
56
+ warmup_steps: 10
57
+ eval_steps: 50
58
+ save_steps:
59
+ debug:
60
+ deepspeed:
61
+ weight_decay: 0.0
62
+ fsdp:
63
+ fsdp_config:
64
+ special_tokens:
65
+ bos_token: "<s>"
66
+ eos_token: "</s>"
67
+ unk_token: "<unk>"
src/axolotl/prompters.py CHANGED
@@ -17,8 +17,8 @@ class AlpacaPrompter:
17
  system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
18
  prompt_style = None
19
 
20
- def __init__(self, prompt_style="instruct"):
21
- self.prompt_style = prompt_style
22
  self.match_prompt_style()
23
 
24
  def match_prompt_style(self):
 
17
  system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
18
  prompt_style = None
19
 
20
+ def __init__(self, prompt_style=PromptStyle.instruct.value):
21
+ self.prompt_style = prompt_style if prompt_style else PromptStyle.instruct.value
22
  self.match_prompt_style()
23
 
24
  def match_prompt_style(self):
src/axolotl/utils/models.py CHANGED
@@ -211,12 +211,12 @@ def load_model(
211
  try:
212
  if is_llama_derived_model and "LlamaTokenizer" in globals():
213
  tokenizer = LlamaTokenizer.from_pretrained(
214
- model,
215
  trust_remote_code=True if cfg.trust_remote_code is True else False,
216
  )
217
  else:
218
  tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
219
- model,
220
  trust_remote_code=True if cfg.trust_remote_code is True else False,
221
  )
222
  except:
 
211
  try:
212
  if is_llama_derived_model and "LlamaTokenizer" in globals():
213
  tokenizer = LlamaTokenizer.from_pretrained(
214
+ base_model_config,
215
  trust_remote_code=True if cfg.trust_remote_code is True else False,
216
  )
217
  else:
218
  tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
219
+ base_model_config,
220
  trust_remote_code=True if cfg.trust_remote_code is True else False,
221
  )
222
  except:
src/axolotl/utils/validation.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ def validate_config(cfg):
2
+ if cfg.adapter == "qlora":
3
+ assert cfg.load_in_8bit is False
4
+ assert cfg.load_4bit is False
5
+ assert cfg.load_in_4bit is True
6
+ pass
7
+ # TODO
8
+ # MPT 7b
9
+ # https://github.com/facebookresearch/bitsandbytes/issues/25
10
+ # no 8bit adamw w bf16