fix relative path for fixtures
- ds_config.json +33 -34
- src/axolotl/utils/models.py +5 -5
- src/axolotl/utils/trainer.py +3 -2
- tests/test_prompt_tokenizers.py +4 -2
ds_config.json CHANGED
```diff
@@ -1,58 +1,57 @@
 {
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
   "bf16": {
     "enabled": "auto"
   },
   "fp16": {
     "enabled": "auto",
+    "auto_cast": false,
     "loss_scale": 0,
+    "initial_scale_power": 32,
     "loss_scale_window": 1000,
-    "initial_scale_power": 16,
     "hysteresis": 2,
     "min_loss_scale": 1
   },
   "optimizer": {
-    "type": "
+    "type": "AdamW",
     "params": {
       "lr": "auto",
-      "betas":
-
+      "betas": [
+        0.9,
+        0.999
+      ],
+      "eps": 1e-8,
       "weight_decay": "auto"
     }
   },
   "scheduler": {
-    "type": "
+    "type": "OneCycle",
     "params": {
-      "
-      "
-      "
-      "total_num_steps": "auto"
+      "cycle_min_lr": 0.00001,
+      "cycle_max_lr": 0.00003,
+      "cycle_first_step_size": 120
     }
   },
-  "zero_optimization": {
-    "stage": 2,
-    "offload_optimizer": {
-      "device": "cpu",
-      "pin_memory": true
-    },
-    "offload_param": {
-      "device": "cpu",
-      "pin_memory": true
-    },
-    "overlap_comm": true,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5e8,
-    "contiguous_gradients": true,
-    "reduce_bucket_size": "auto",
-    "reduce_scatter": true,
-    "stage3_max_live_parameters": 0,
-    "stage3_max_reuse_distance": 0,
-    "stage3_gather_16bit_weights_on_model_save": true
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "steps_per_print": 5,
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false,
-  "round_robin_gradients": true
+  "wall_clock_breakdown": false
 }
```
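For context, a DeepSpeed JSON like this is normally handed to the Hugging Face `Trainer`, whose integration resolves every `"auto"` placeholder (learning rate, batch sizes, bucket sizes) from the corresponding training arguments at launch time. A minimal sketch, assuming a plain `transformers` setup rather than axolotl's own CLI; the paths and hyperparameters below are illustrative, not taken from the repo:

```python
from transformers import TrainingArguments

# Minimal sketch: point the HF Trainer integration at the DeepSpeed config file.
# Every "auto" field in ds_config.json is then filled from these arguments.
training_args = TrainingArguments(
    output_dir="./out",                 # illustrative path
    per_device_train_batch_size=1,      # -> "train_micro_batch_size_per_gpu"
    gradient_accumulation_steps=4,      # feeds into "train_batch_size" (with world size)
    learning_rate=3e-5,                 # -> optimizer "lr"
    bf16=True,                          # pairs with the "bf16" block
    deepspeed="ds_config.json",         # the file changed in this commit
)
```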
src/axolotl/utils/models.py CHANGED
```diff
@@ -125,7 +125,7 @@ def load_model(
             load_in_4bit=True,
             llm_int8_threshold=6.0,
             llm_int8_has_fp16_weight=False,
-            bnb_4bit_compute_dtype=
+            bnb_4bit_compute_dtype=torch_dtype,
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4",
         )
@@ -174,7 +174,7 @@ def load_model(
             load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
             load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
             torch_dtype=torch_dtype,
-            device_map=cfg.device_map,
+            device_map="auto" if cfg.world_size == 1 else cfg.device_map,
             **model_kwargs,
         )
         # elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
@@ -273,13 +273,13 @@ def load_model(
     if (
         torch.cuda.device_count() > 1
         and int(os.getenv("WORLD_SIZE", "1")) > 1
-        and cfg.gptq
+        and (cfg.gptq or cfg.load_in_4bit)
     ):
         # llama is PROBABLY model parallelizable, but the default isn't that it is
         # so let's only set it for the 4bit, see
         # https://github.com/johnsmith0031/alpaca_lora_4bit/blob/08b3fca4a4a9e0d3945be1bab4529f100a428636/finetune.py#L130-L133
-        model.is_parallelizable = True
-        model.model_parallel = True
+        setattr(model, 'is_parallelizable', True)
+        setattr(model, 'model_parallel', True)
 
     requires_grad = []
     for name, param in model.named_parameters(recurse=True):
```
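The first hunk edits what is presumably a `transformers.BitsAndBytesConfig` used for 4-bit (QLoRA-style) loading, wiring the compute dtype to the resolved `torch_dtype`. A self-contained sketch of that object, with `torch_dtype` standing in for whatever dtype axolotl derives from the user config:

```python
import torch
from transformers import BitsAndBytesConfig

torch_dtype = torch.bfloat16  # stand-in; axolotl resolves this from the user config

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch_dtype,  # the value this commit switches to
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
```

The `device_map` change keys off the process count: a single process lets `device_map="auto"` shard the quantized model across visible GPUs, while multi-process runs keep the user-supplied map so each rank loads onto its own device. A hypothetical helper expressing that rule (`pick_device_map` is illustrative and not part of axolotl; how axolotl populates `cfg.world_size` is assumed here to mirror the `WORLD_SIZE` environment variable):

```python
import os
from typing import Optional, Union

def pick_device_map(configured: Optional[Union[str, dict]]) -> Optional[Union[str, dict]]:
    """Use "auto" sharding for a lone process; otherwise respect the configured map."""
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    return "auto" if world_size == 1 else configured
```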
src/axolotl/utils/trainer.py CHANGED
```diff
@@ -113,7 +113,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
         output_dir=cfg.output_dir,
         save_total_limit=3,
         load_best_model_at_end=True
-        if cfg.val_set_size > 0
+        if cfg.load_best_model_at_end is not False  # if explicitly set to False, it should be resort to False
+        and cfg.val_set_size > 0
         and save_steps is not None
         and save_steps % eval_steps == 0
         and cfg.load_in_8bit is not True
@@ -218,7 +219,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
 
     trainer_cls = (
         OneCycleLRSchedulerTrainer
-        if cfg.lr_scheduler == "one_cycle" and cfg.fsdp
+        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
         else transformers.Trainer
     )
     trainer = trainer_cls(
```
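The new guard only enables `load_best_model_at_end` when the user has not explicitly disabled it, an eval split exists, checkpointing lines up with evaluation, and the model is not loaded in 8-bit. A hypothetical helper restating the same condition for readability (`should_load_best_model` is illustrative, not in the codebase):

```python
def should_load_best_model(cfg, save_steps, eval_steps) -> bool:
    """Mirror of the inline condition built in setup_trainer."""
    return (
        cfg.load_best_model_at_end is not False  # unset/None still allows it
        and cfg.val_set_size > 0                 # an eval split must exist
        and save_steps is not None
        and save_steps % eval_steps == 0         # best-checkpoint tracking needs aligned steps
        and cfg.load_in_8bit is not True
    )
```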
tests/test_prompt_tokenizers.py CHANGED
```diff
@@ -1,6 +1,7 @@
 import json
 import logging
 import unittest
+from pathlib import Path
 
 from transformers import AutoTokenizer
 
@@ -22,10 +23,11 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
         )
 
     def test_sharegpt_integration(self):
-        with open("fixtures/conversation.json", "r") as fin:
+        print(Path(__file__).parent)
+        with open(Path(__file__).parent / "fixtures/conversation.json", "r") as fin:
             data = fin.read()
         conversation = json.loads(data)
-        with open("fixtures/conversation.tokenized.json", "r") as fin:
+        with open(Path(__file__).parent / "fixtures/conversation.tokenized.json", "r") as fin:
             data = fin.read()
         tokenized_conversation = json.loads(data)
         prompter = ShareGPTPrompter("chat")
```