yentinglin committed
Commit 9cf42fb · Parent: abeb80f

Upload mixtral-zhtw.yml

Files changed (1)
  1. mixtral-zhtw.yml +97 -0
mixtral-zhtw.yml ADDED
@@ -0,0 +1,97 @@
+ base_model: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_type: AutoModelForCausalLM
+ tokenizer_type: LlamaTokenizer
+ trust_remote_code: true
+
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+
+ datasets:
+   - path: yentinglin/v1
+     type: sharegpt
+     conversation: mistral
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0.0
+ output_dir: ./qlora-out-3e
+
+ ## You can optionally freeze the entire model and unfreeze a subset of parameters
+ unfrozen_parameters:
+ # - lm_head.*
+ # - model.embed_tokens.*
+ # - model.layers.2[0-9]+.block_sparse_moe.gate.*
+ # - model.layers.2[0-9]+.block_sparse_moe.experts.*
+ # - model.layers.3[0-9]+.block_sparse_moe.gate.*
+ # - model.layers.3[0-9]+.block_sparse_moe.experts.*
+
+ adapter: qlora
+ lora_model_dir:
+
+ sequence_len: 4096
+ sample_packing: true
+ pad_to_sequence_len: true
+
+ model_config:
+   output_router_logits: true
+
+ lora_r: 32
+ lora_alpha: 16
+ lora_dropout: 0.05
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+ #lora_target_modules:
+ # - gate
+ # - q_proj
+ # - k_proj
+ # - v_proj
+ # - o_proj
+ # - w1
+ # - w2
+ # - w3
+
+ hub_model_id: yentinglin/Taiwan-LLM-MoE-chat-alpha-3e
+ hub_strategy: end
+ wandb_project: Taiwan-LLM-MoE
+ wandb_entity:
+ wandb_watch:
+ wandb_name:
+ wandb_log_model:
+
+ gradient_accumulation_steps: 4
+ micro_batch_size: 1
+ num_epochs: 3
+ optimizer: adamw_bnb_8bit
+ lr_scheduler: cosine
+ learning_rate: 0.0002
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: true
+ fp16: false
+ tf32: false
+
+ gradient_checkpointing: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ loss_watchdog_threshold: 5.0
+ loss_watchdog_patience: 3
+
+ warmup_steps: 10
+ evals_per_epoch: 4
+ eval_table_size:
+ eval_table_max_new_tokens: 128
+ saves_per_epoch: 1
+ debug:
+ deepspeed: deepspeed/zero2.json
+ weight_decay: 0.0
+ fsdp:
+ fsdp_config:
+ special_tokens:
+
+ ddp_timeout: 8640000000
+ dataset_processes: 16
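
The quantization and LoRA fields above correspond, roughly, to the bitsandbytes and PEFT objects that axolotl constructs when it loads this file. The Python sketch below is an illustrative approximation only, not axolotl's actual code path: the bnb_4bit_* details are assumptions (the YAML only sets load_in_4bit: true), and the target_modules list mirrors the commented-out lora_target_modules block, whereas lora_target_linear: true targets every linear layer automatically.

# Rough peft/transformers equivalent of the QLoRA settings in mixtral-zhtw.yml.
# Illustrative sketch; not the objects axolotl builds verbatim.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load_in_4bit: true
    bnb_4bit_compute_dtype=torch.bfloat16,  # bf16: true
    bnb_4bit_quant_type="nf4",              # assumption; not set in the YAML
    bnb_4bit_use_double_quant=True,         # assumption; not set in the YAML
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",  # base_model
    quantization_config=bnb_config,
    trust_remote_code=True,                  # trust_remote_code: true
    output_router_logits=True,               # model_config.output_router_logits
)

lora_config = LoraConfig(
    r=32,               # lora_r
    lora_alpha=16,      # lora_alpha
    lora_dropout=0.05,  # lora_dropout
    target_modules=["gate", "q_proj", "k_proj", "v_proj",
                    "o_proj", "w1", "w2", "w3"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

In practice the file is consumed by axolotl directly (typically launched with accelerate launch -m axolotl.cli.train mixtral-zhtw.yml); output_router_logits: true is the MoE-specific switch that exposes the router logits so the load-balancing auxiliary loss can be added during training.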