winglian commited on
Commit
c530e4b
·
1 Parent(s): f620706

more config pruning and migrating

Browse files
configs/llama_7B_alpaca.yml DELETED
@@ -1,41 +0,0 @@
1
- base_model: huggyllama/llama-7b
2
- model_type: LlamaForCausalLM
3
- tokenizer_type: LlamaTokenizer
4
- load_in_8bit: true
5
- datasets:
6
- - path: data/alpaca_data_gpt4.jsonl
7
- type: alpaca
8
- - path: data/vicuna_cleaned.jsonl
9
- type: sharegpt
10
- - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
11
- type: gpteacher
12
- - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
- type: gpteacher
14
- dataset_prepared_path: last_run_prepared
15
- val_set_size: 0.04
16
- adapter: lora
17
- lora_model_dir:
18
- sequence_len: 2048
19
- lora_r: 8
20
- lora_alpha: 16
21
- lora_dropout: 0.05
22
- lora_target_modules:
23
- - q_proj
24
- - v_proj
25
- lora_fan_in_fan_out: false
26
- wandb_project: llama-7b-lora
27
- wandb_watch:
28
- wandb_run_id:
29
- wandb_log_model:
30
- output_dir: ./lora-llama-alpaca
31
- gradient_accumulation_steps: 1
32
- micro_batch_size: 16
33
- num_epochs: 5
34
- learning_rate: 0.00003
35
- train_on_inputs: false
36
- group_by_length: false
37
- bf16: true
38
- tf32: true
39
- early_stopping_patience:
40
- resume_from_checkpoint:
41
- local_rank:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/sample.yml DELETED
@@ -1,87 +0,0 @@
1
- # this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
2
- # this can also be a relative path to a model on disk
3
- base_model: decapoda-research/llama-7b-hf-int4
4
- # you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
5
- base_model_ignore_patterns:
6
- # if the base_model repo on hf hub doesn't include configuration .json files,
7
- # you can set that here, or leave this empty to default to base_model
8
- base_model_config: decapoda-research/llama-7b-hf
9
- # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
10
- model_type: AutoModelForCausalLM
11
- # Corresponding tokenizer for the model AutoTokenizer is a good choice
12
- tokenizer_type: AutoTokenizer
13
- # whether you are training a 4-bit quantized model
14
- load_4bit: true
15
- # this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
16
- load_in_8bit: true
17
- # a list of one or more datasets to finetune the model with
18
- datasets:
19
- # this can be either a hf dataset, or relative path
20
- - path: vicgalle/alpaca-gpt4
21
- # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
22
- type: alpaca
23
- # axolotl attempts to save the dataset as an arrow after packing the data together so
24
- # subsequent training attempts load faster, relative path
25
- dataset_prepared_path: data/last_run_prepared
26
- # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
27
- val_set_size: 0.04
28
- # if you want to use lora, leave blank to train all parameters in original model
29
- adapter: lora
30
- # if you already have a lora model trained that you want to load, put that here
31
- lora_model_dir:
32
- # the maximum length of an input to train with, this should typically be less than 2048
33
- # as most models have a token/context limit of 2048
34
- sequence_len: 2048
35
- # max sequence length to concatenate training samples together up to
36
- # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
37
- max_packed_sequence_len: 1024
38
- # lora hyperparameters
39
- lora_r: 8
40
- lora_alpha: 16
41
- lora_dropout: 0.05
42
- lora_target_modules:
43
- - q_proj
44
- - v_proj
45
- # - k_proj
46
- # - o_proj
47
- lora_fan_in_fan_out: false
48
- # wandb configuration if your're using it
49
- wandb_project:
50
- wandb_watch:
51
- wandb_run_id:
52
- wandb_log_model:
53
- # where to save the finsihed model to
54
- output_dir: ./completed-model
55
- # training hyperparameters
56
- gradient_accumulation_steps: 1
57
- batch_size:
58
- micro_batch_size: 2
59
- num_epochs: 3
60
- warmup_steps: 100
61
- learning_rate: 0.00003
62
- # whether to mask out or include the human's prompt from the training labels
63
- train_on_inputs: false
64
- # don't use this, leads to wonky training (according to someone on the internet)
65
- group_by_length: false
66
- # Use CUDA bf16
67
- bf16: true
68
- # Use CUDA tf32
69
- tf32: true
70
- # does not work with current implementation of 4-bit LoRA
71
- gradient_checkpointing: false
72
- # stop training after this many evaluation losses have increased in a row
73
- # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
74
- early_stopping_patience: 3
75
- # specify a scheduler to use with the optimizer. only one_cycle is supported currently
76
- lr_scheduler:
77
- # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
78
- xformers_attention:
79
- # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
80
- flash_attention:
81
- # resume from a specific checkpoint dir
82
- resume_from_checkpoint:
83
- # if resume_from_checkpoint isn't set and you simply want it to start where it left off
84
- # be careful with this being turned on between different models
85
- auto_resume_from_checkpoints: false
86
- # don't mess with this, it's here for accelerate and torchrun
87
- local_rank:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/gptj-qlora/config.yml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: EleutherAI/gpt-j-6b
2
+ base_model_config: EleutherAI/gpt-j-6b
3
+ load_in_8bit: false
4
+ load_in_4bit: true
5
+ strict: false
6
+ push_dataset_to_hub:
7
+ datasets:
8
+ - path: teknium/GPT4-LLM-Cleaned
9
+ type: alpaca
10
+ dataset_prepared_path: last_run_prepared
11
+ val_set_size: 0.01
12
+ adapter: qlora
13
+ lora_model_dir:
14
+ sequence_len: 2048
15
+ max_packed_sequence_len:
16
+ lora_r: 8
17
+ lora_alpha: 32
18
+ lora_dropout: 0.05
19
+ lora_target_modules:
20
+ lora_target_linear: true
21
+ lora_fan_in_fan_out:
22
+ wandb_project:
23
+ wandb_watch:
24
+ wandb_run_id:
25
+ wandb_log_model:
26
+ output_dir: ./qlora-out
27
+ gradient_accumulation_steps: 2
28
+ micro_batch_size: 2
29
+ num_epochs: 2
30
+ optimizer: paged_adamw_8bit
31
+ torchdistx_path:
32
+ lr_scheduler: cosine
33
+ learning_rate: 0.0001
34
+ train_on_inputs: false
35
+ group_by_length: true
36
+ bf16: true
37
+ fp16: false
38
+ tf32: true
39
+ gradient_checkpointing: true
40
+ early_stopping_patience:
41
+ resume_from_checkpoint:
42
+ local_rank:
43
+ logging_steps: 1
44
+ xformers_attention: true
45
+ flash_attention:
46
+ gptq_groupsize:
47
+ gptq_model_v1:
48
+ warmup_steps: 10
49
+ eval_steps: 20
50
+ save_steps:
51
+ debug:
52
+ deepspeed:
53
+ weight_decay: 0.1
54
+ fsdp:
55
+ fsdp_config:
56
+ special_tokens:
57
+ pad_token: "<|endoftext|>"
configs/llama_7B_jeopardy.yml → examples/jeopardy-bot/config.yml RENAMED
@@ -7,30 +7,28 @@ datasets:
7
  - path: openaccess-ai-collective/jeopardy
8
  type: jeopardy
9
  dataset_prepared_path: last_run_prepared
10
- val_set_size: 0.01
11
  adapter:
12
  lora_model_dir:
13
- sequence_len: 2048
14
- max_packed_sequence_len: 2048
15
- lora_r: 8
16
- lora_alpha: 16
17
- lora_dropout: 0.05
18
  lora_target_modules:
19
- - q_proj
20
- - v_proj
21
  lora_fan_in_fan_out: false
22
- wandb_project: jeopardy-bot-7b
23
  wandb_watch:
24
  wandb_run_id:
25
  wandb_log_model:
26
  output_dir: ./jeopardy-bot-7b
27
- gradient_accumulation_steps: 2
28
  micro_batch_size: 1
29
- num_epochs: 2
30
  optimizer: adamw_bnb_8bit
31
  torchdistx_path:
32
  lr_scheduler: cosine
33
- learning_rate: 0.0000002
34
  train_on_inputs: false
35
  group_by_length: false
36
  bf16: true
@@ -48,11 +46,10 @@ eval_steps: 110
48
  save_steps: 660
49
  debug:
50
  deepspeed:
51
- weight_decay: 0.0001
52
  fsdp:
53
  fsdp_config:
54
  tokens:
55
- pad_token: "[PAD]"
56
  bos_token: "<s>"
57
  eos_token: "</s>"
58
  unk_token: "<unk>"
 
7
  - path: openaccess-ai-collective/jeopardy
8
  type: jeopardy
9
  dataset_prepared_path: last_run_prepared
10
+ val_set_size: 0.02
11
  adapter:
12
  lora_model_dir:
13
+ sequence_len: 512
14
+ max_packed_sequence_len:
15
+ lora_r:
16
+ lora_alpha:
17
+ lora_dropout:
18
  lora_target_modules:
 
 
19
  lora_fan_in_fan_out: false
20
+ wandb_project:
21
  wandb_watch:
22
  wandb_run_id:
23
  wandb_log_model:
24
  output_dir: ./jeopardy-bot-7b
25
+ gradient_accumulation_steps: 1
26
  micro_batch_size: 1
27
+ num_epochs: 3
28
  optimizer: adamw_bnb_8bit
29
  torchdistx_path:
30
  lr_scheduler: cosine
31
+ learning_rate: 0.00003
32
  train_on_inputs: false
33
  group_by_length: false
34
  bf16: true
 
46
  save_steps: 660
47
  debug:
48
  deepspeed:
49
+ weight_decay: 0.1
50
  fsdp:
51
  fsdp_config:
52
  tokens:
 
53
  bos_token: "<s>"
54
  eos_token: "</s>"
55
  unk_token: "<unk>"