File size: 2,187 Bytes
ca11ae9
 
72bf8aa
79a8f52
72bf8aa
 
 
a7a9a14
72bf8aa
ca11ae9
72bf8aa
 
 
 
 
 
 
 
 
e50a64e
f544ab2
ca11ae9
72bf8aa
 
 
fb3d40f
ca11ae9
 
 
72bf8aa
 
ca11ae9
 
72bf8aa
ca11ae9
72bf8aa
 
 
ca11ae9
a52f481
7019509
72bf8aa
a1da39c
72bf8aa
 
ca11ae9
 
 
 
 
 
 
 
c9c0503
fb3d40f
8b79ff0
ca11ae9
72bf8aa
 
 
ca11ae9
 
 
72bf8aa
 
 
782b6a4
 
72bf8aa
 
 
 
 
 
 
 
 
fb3d40f
72bf8aa
 
 
 
5f79b82
 
72bf8aa
 
 
 
 
 
 
e799e08
72bf8aa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# 1b: tiiuae/falcon-rw-1b
# 40b: tiiuae/falcon-40b
base_model: tiiuae/falcon-7b
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
strict: false
push_dataset_to_hub:
datasets:
  - path: QingyiSi/Alpaca-CoT
    data_files:
      - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
    type: "alpaca:chat"
dataset_prepared_path:
val_set_size: 0.05
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./qlora-out

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 64b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 1
gradient_accumulation_steps: 2
num_epochs: 4
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 64b
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.000001
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"