diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..330d4d913607ecfa58675bc8aab2bbf8be169b3d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,786 @@
+---
+tags:
+- generated_from_trainer
+model-index:
+- name: out
+ results: []
+---
+
+
+
+[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl)
+
+See axolotl config:
+
+axolotl version: `0.4.0`
+```yaml
+base_model: /workspace/axolotl/dbrx-checkpoint
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+# load_in_4bit: true
+strict: false
+
+# adapter: qlora
+# lora_modules_to_save: [embed_tokens, lm_head]
+
+# lora_r: 32
+# lora_alpha: 16
+# lora_dropout: 0.05
+# lora_target_linear: false
+# lora_fan_in_fan_out:
+
+datasets:
+ - path: /workspace/datasets/dolphin-2.9/dolphin201-sharegpt2.jsonl
+ type: sharegpt
+ conversation: chatml
+ # - path: /workspace/datasets/dolphin-2.9/Ultrachat200kunfiltered.jsonl
+ # type: sharegpt
+ # conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/dolphin-coder-translate-sharegpt2.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/dolphin-coder-codegen-sharegpt2.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/m-a-p_Code-Feedback-sharegpt-unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/m-a-p_CodeFeedback-Filtered-Instruction-sharegpt-unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/not_samantha_norefusals.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/Orca-Math-resort-unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/agent_instruct_react_unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/toolbench_instruct_j1s1_3k_unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/toolbench_negative_unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/toolbench_react_10p_unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/toolbench_tflan_cot_30p_unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ - path: /workspace/datasets/dolphin-2.9/openhermes200k_unfiltered.jsonl
+ type: sharegpt
+ conversation: chatml
+ # - path: /workspace/datasets/dolphin-2.9/SystemConversations.jsonl
+ # type: sharegpt
+ # conversation: chatml
+
+chat_template: chatml
+
+unfrozen_parameters:
+- ^lm_head.weight$
+# ffn.experts.mlp_experts.0.v1 layers
+- transformer.blocks.30.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.25.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.15.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.22.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.31.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.21.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.8.ffn.experts.mlp_experts.0.v1
+- transformer.blocks.23.ffn.experts.mlp_experts.0.v1
+# ffn.experts.mlp_experts.0.w1 layers
+- transformer.blocks.7.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.30.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.32.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.25.ffn.experts.mlp_experts.0.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.0.w1
+# ffn.experts.mlp_experts.0.w2 layers
+- transformer.blocks.25.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.22.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.26.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.4.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.7.ffn.experts.mlp_experts.0.w2
+- transformer.blocks.3.ffn.experts.mlp_experts.0.w2
+# ffn.experts.mlp_experts.1.v1 layers
+- transformer.blocks.27.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.25.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.33.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.23.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.6.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.21.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.15.ffn.experts.mlp_experts.1.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.1.v1
+# ffn.experts.mlp_experts.1.w1 layers
+- transformer.blocks.0.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.7.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.33.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.27.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.1.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.1.w1
+# ffn.experts.mlp_experts.1.w2 layers
+- transformer.blocks.25.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.23.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.4.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.21.ffn.experts.mlp_experts.1.w2
+- transformer.blocks.33.ffn.experts.mlp_experts.1.w2
+# ffn.experts.mlp_experts.10.v1 layers
+- transformer.blocks.28.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.34.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.33.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.26.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.36.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.20.ffn.experts.mlp_experts.10.v1
+- transformer.blocks.35.ffn.experts.mlp_experts.10.v1
+# ffn.experts.mlp_experts.10.w1 layers
+- transformer.blocks.24.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.33.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.7.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.34.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.28.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.30.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.10.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.10.w1
+# ffn.experts.mlp_experts.10.w2 layers
+- transformer.blocks.24.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.23.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.3.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.33.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.26.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.2.ffn.experts.mlp_experts.10.w2
+- transformer.blocks.20.ffn.experts.mlp_experts.10.w2
+# ffn.experts.mlp_experts.11.w1 layers
+- transformer.blocks.6.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.28.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.33.ffn.experts.mlp_experts.11.w1
+- transformer.blocks.13.ffn.experts.mlp_experts.11.w1
+# ffn.experts.mlp_experts.11.w2 layers
+- transformer.blocks.27.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.24.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.22.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.6.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.7.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.11.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.11.w2
+# ffn.experts.mlp_experts.12.v1 layers
+- transformer.blocks.30.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.21.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.27.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.28.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.8.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.10.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.23.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.6.ffn.experts.mlp_experts.12.v1
+- transformer.blocks.20.ffn.experts.mlp_experts.12.v1
+# ffn.experts.mlp_experts.12.w1 layers
+- transformer.blocks.8.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.2.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.17.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.12.w1
+- transformer.blocks.21.ffn.experts.mlp_experts.12.w1
+# ffn.experts.mlp_experts.12.w2 layers
+- transformer.blocks.6.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.8.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.21.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.2.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.12.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.12.w2
+# ffn.experts.mlp_experts.13.v1 layers
+- transformer.blocks.31.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.8.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.10.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.11.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.27.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.25.ffn.experts.mlp_experts.13.v1
+- transformer.blocks.36.ffn.experts.mlp_experts.13.v1
+# ffn.experts.mlp_experts.13.w1 layers
+- transformer.blocks.4.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.24.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.30.ffn.experts.mlp_experts.13.w1
+- transformer.blocks.11.ffn.experts.mlp_experts.13.w1
+# ffn.experts.mlp_experts.13.w2 layers
+- transformer.blocks.24.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.20.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.3.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.4.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.6.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.13.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.13.w2
+# ffn.experts.mlp_experts.14.v1 layers
+- transformer.blocks.28.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.26.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.35.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.8.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.15.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.11.ffn.experts.mlp_experts.14.v1
+- transformer.blocks.22.ffn.experts.mlp_experts.14.v1
+# ffn.experts.mlp_experts.14.w1 layers
+- transformer.blocks.8.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.7.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.13.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.28.ffn.experts.mlp_experts.14.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.14.w1
+# ffn.experts.mlp_experts.14.w2 layers
+- transformer.blocks.26.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.24.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.4.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.6.ffn.experts.mlp_experts.14.w2
+- transformer.blocks.22.ffn.experts.mlp_experts.14.w2
+# ffn.experts.mlp_experts.15.v1 layers
+- transformer.blocks.33.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.26.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.31.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.28.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.9.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.34.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.17.ffn.experts.mlp_experts.15.v1
+- transformer.blocks.15.ffn.experts.mlp_experts.15.v1
+# ffn.experts.mlp_experts.15.w1 layers
+- transformer.blocks.6.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.7.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.14.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.33.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.34.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.15.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.15.w1
+# ffn.experts.mlp_experts.15.w2 layers
+- transformer.blocks.28.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.26.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.6.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.7.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.33.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.15.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.15.w2
+# ffn.experts.mlp_experts.2.v1 layers
+- transformer.blocks.31.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.27.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.28.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.23.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.35.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.21.ffn.experts.mlp_experts.2.v1
+- transformer.blocks.15.ffn.experts.mlp_experts.2.v1
+# ffn.experts.mlp_experts.2.w1 layers
+- transformer.blocks.7.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.31.ffn.experts.mlp_experts.2.w1
+- transformer.blocks.30.ffn.experts.mlp_experts.2.w1
+# ffn.experts.mlp_experts.2.w2 layers
+- transformer.blocks.26.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.33.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.23.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.4.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.2.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.2.w2
+# ffn.experts.mlp_experts.3.v1 layers
+- transformer.blocks.28.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.33.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.36.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.14.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.10.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.31.ffn.experts.mlp_experts.3.v1
+- transformer.blocks.21.ffn.experts.mlp_experts.3.v1
+# ffn.experts.mlp_experts.3.w1 layers
+- transformer.blocks.7.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.30.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.33.ffn.experts.mlp_experts.3.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.3.w1
+# ffn.experts.mlp_experts.3.w2 layers
+- transformer.blocks.28.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.24.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.21.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.26.ffn.experts.mlp_experts.3.w2
+- transformer.blocks.2.ffn.experts.mlp_experts.3.w2
+# ffn.experts.mlp_experts.4.v1 layers
+- transformer.blocks.34.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.31.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.26.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.14.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.6.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.20.ffn.experts.mlp_experts.4.v1
+- transformer.blocks.9.ffn.experts.mlp_experts.4.v1
+# ffn.experts.mlp_experts.4.w1 layers
+- transformer.blocks.6.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.7.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.14.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.34.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.4.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.4.w1
+# ffn.experts.mlp_experts.4.w2 layers
+- transformer.blocks.25.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.24.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.26.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.6.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.4.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.36.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.4.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.4.w2
+# ffn.experts.mlp_experts.5.v1 layers
+- transformer.blocks.35.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.28.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.27.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.26.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.33.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.8.ffn.experts.mlp_experts.5.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.5.v1
+# ffn.experts.mlp_experts.5.w1 layers
+- transformer.blocks.0.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.7.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.12.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.5.w1
+- transformer.blocks.33.ffn.experts.mlp_experts.5.w1
+# ffn.experts.mlp_experts.5.w2 layers
+- transformer.blocks.26.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.6.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.33.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.3.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.5.w2
+- transformer.blocks.7.ffn.experts.mlp_experts.5.w2
+# ffn.experts.mlp_experts.6.v1 layers
+- transformer.blocks.34.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.31.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.26.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.35.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.20.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.15.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.10.ffn.experts.mlp_experts.6.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.6.v1
+# ffn.experts.mlp_experts.6.w1 layers
+- transformer.blocks.0.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.30.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.34.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.26.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.2.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.6.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.6.w1
+# ffn.experts.mlp_experts.6.w2 layers
+- transformer.blocks.24.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.26.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.32.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.20.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.4.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.2.ffn.experts.mlp_experts.6.w2
+- transformer.blocks.9.ffn.experts.mlp_experts.6.w2
+# ffn.experts.mlp_experts.7.v1 layers
+- transformer.blocks.27.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.28.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.33.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.11.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.12.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.10.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.23.ffn.experts.mlp_experts.7.v1
+- transformer.blocks.34.ffn.experts.mlp_experts.7.v1
+# ffn.experts.mlp_experts.7.w1 layers
+- transformer.blocks.12.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.5.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.29.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.10.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.34.ffn.experts.mlp_experts.7.w1
+- transformer.blocks.33.ffn.experts.mlp_experts.7.w1
+# ffn.experts.mlp_experts.7.w2 layers
+- transformer.blocks.23.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.24.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.5.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.3.ffn.experts.mlp_experts.7.w2
+- transformer.blocks.33.ffn.experts.mlp_experts.7.w2
+# ffn.experts.mlp_experts.8.v1 layers
+- transformer.blocks.30.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.27.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.20.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.34.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.33.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.9.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.6.ffn.experts.mlp_experts.8.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.8.v1
+# ffn.experts.mlp_experts.8.w1 layers
+- transformer.blocks.7.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.0.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.3.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.2.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.30.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.24.ffn.experts.mlp_experts.8.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.8.w1
+# ffn.experts.mlp_experts.8.w2 layers
+- transformer.blocks.32.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.24.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.30.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.2.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.3.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.23.ffn.experts.mlp_experts.8.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.8.w2
+# ffn.experts.mlp_experts.9.v1 layers
+- transformer.blocks.31.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.27.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.29.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.33.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.25.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.14.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.32.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.9.ffn.experts.mlp_experts.9.v1
+- transformer.blocks.34.ffn.experts.mlp_experts.9.v1
+# ffn.experts.mlp_experts.9.w1 layers
+- transformer.blocks.7.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.1.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.9.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.2.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.27.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.12.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.4.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.6.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.19.ffn.experts.mlp_experts.9.w1
+- transformer.blocks.8.ffn.experts.mlp_experts.9.w1
+# ffn.experts.mlp_experts.9.w2 layers
+- transformer.blocks.26.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.25.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.28.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.27.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.31.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.29.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.7.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.34.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.2.ffn.experts.mlp_experts.9.w2
+- transformer.blocks.33.ffn.experts.mlp_experts.9.w2
+# ffn.router.layer layers
+- transformer.blocks.2.ffn.router.layer
+- transformer.blocks.3.ffn.router.layer
+- transformer.blocks.4.ffn.router.layer
+- transformer.blocks.5.ffn.router.layer
+- transformer.blocks.6.ffn.router.layer
+- transformer.blocks.7.ffn.router.layer
+- transformer.blocks.8.ffn.router.layer
+- transformer.blocks.9.ffn.router.layer
+- transformer.blocks.10.ffn.router.layer
+- transformer.blocks.11.ffn.router.layer
+# norm_attn_norm.attn.Wqkv layers
+- transformer.blocks.16.norm_attn_norm.attn.Wqkv
+- transformer.blocks.15.norm_attn_norm.attn.Wqkv
+- transformer.blocks.11.norm_attn_norm.attn.Wqkv
+- transformer.blocks.14.norm_attn_norm.attn.Wqkv
+- transformer.blocks.12.norm_attn_norm.attn.Wqkv
+- transformer.blocks.20.norm_attn_norm.attn.Wqkv
+- transformer.blocks.10.norm_attn_norm.attn.Wqkv
+- transformer.blocks.9.norm_attn_norm.attn.Wqkv
+- transformer.blocks.19.norm_attn_norm.attn.Wqkv
+- transformer.blocks.18.norm_attn_norm.attn.Wqkv
+# norm_attn_norm.attn.out_proj layers
+- transformer.blocks.1.norm_attn_norm.attn.out_proj
+- transformer.blocks.18.norm_attn_norm.attn.out_proj
+- transformer.blocks.2.norm_attn_norm.attn.out_proj
+- transformer.blocks.16.norm_attn_norm.attn.out_proj
+- transformer.blocks.0.norm_attn_norm.attn.out_proj
+- transformer.blocks.39.norm_attn_norm.attn.out_proj
+- transformer.blocks.23.norm_attn_norm.attn.out_proj
+- transformer.blocks.8.norm_attn_norm.attn.out_proj
+- transformer.blocks.24.norm_attn_norm.attn.out_proj
+- transformer.blocks.19.norm_attn_norm.attn.out_proj
+# norm_attn_norm.norm_1 layers
+- transformer.blocks.0.norm_attn_norm.norm_1
+- transformer.blocks.1.norm_attn_norm.norm_1
+- transformer.blocks.2.norm_attn_norm.norm_1
+- transformer.blocks.3.norm_attn_norm.norm_1
+- transformer.blocks.4.norm_attn_norm.norm_1
+- transformer.blocks.5.norm_attn_norm.norm_1
+- transformer.blocks.6.norm_attn_norm.norm_1
+- transformer.blocks.7.norm_attn_norm.norm_1
+- transformer.blocks.8.norm_attn_norm.norm_1
+- transformer.blocks.9.norm_attn_norm.norm_1
+# norm_attn_norm.norm_2 layers
+- transformer.blocks.0.norm_attn_norm.norm_2
+- transformer.blocks.1.norm_attn_norm.norm_2
+- transformer.blocks.2.norm_attn_norm.norm_2
+- transformer.blocks.3.norm_attn_norm.norm_2
+- transformer.blocks.4.norm_attn_norm.norm_2
+- transformer.blocks.5.norm_attn_norm.norm_2
+- transformer.blocks.6.norm_attn_norm.norm_2
+- transformer.blocks.7.norm_attn_norm.norm_2
+- transformer.blocks.8.norm_attn_norm.norm_2
+- transformer.blocks.9.norm_attn_norm.norm_2
+# transformer.norm_f layers
+# transformer.wte layers
+# ffn.experts.mlp_experts.11.v1 layers
+- transformer.blocks.29.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.27.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.30.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.28.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.22.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.7.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.24.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.8.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.6.ffn.experts.mlp_experts.11.v1
+- transformer.blocks.12.ffn.experts.mlp_experts.11.v1
+
+
+
+dataset_prepared_path: dbrx2
+val_set_size: 0.01
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project: dolphin-2.9-Dbrx
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 1
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 1e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+ use_reentrant: false
+early_stopping_patience:
+# resume_from_checkpoint: /workspace/axolotl/dbrx-checkpoint
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+saves_per_epoch: 4
+save_total_limit: 2
+save_steps:
+debug:
+deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+weight_decay: 0.05
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: "<|endoftext|>"
+ eos_token: "<|im_end|>"
+ pad_token: "<|pad|>"
+ unk_token: "<|endoftext|>"
+tokens:
+ - "<|im_start|>"
+ - "<|im_end|>"
+
+
+```
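+
+The long `unfrozen_parameters` list above implements selective fine-tuning: every weight in the model stays frozen except the named tensors (a subset of expert MLP weights, the routers, attention projections, norms, and `lm_head`). Below is a minimal sketch of how such a list can be applied, assuming entries are treated as regular expressions matched against parameter names (as axolotl does for this option); `freeze_except` is an illustrative helper, not axolotl's actual API:
+
+```python
+import re
+
+def freeze_except(model, patterns):
+    """Freeze every parameter, then re-enable those whose name matches a pattern."""
+    compiled = [re.compile(p) for p in patterns]
+    for name, param in model.named_parameters():
+        # requires_grad stays True only for names matched by some pattern.
+        param.requires_grad = any(rx.search(name) for rx in compiled)
+
+# e.g. freeze_except(model, ["^lm_head.weight$",
+#                            "transformer.blocks.30.ffn.experts.mlp_experts.0.v1"])
+```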
+
+
+
+# out
+
+This model was fine-tuned from the DBRX checkpoint at `/workspace/axolotl/dbrx-checkpoint` on the Dolphin 2.9 dataset mixture described in the axolotl config above.
+It achieves the following results on the evaluation set:
+- Loss: 0.4336
+
+## Model description
+
+A fine-tune of the DBRX mixture-of-experts causal language model (16 experts with top-4 routing, 40 layers, `d_model` 6144; see `config.json`), trained for ChatML-style conversation.
+
+## Intended uses & limitations
+
+The model expects ChatML-formatted prompts: it was trained with `chat_template: chatml` and adds the `<|im_start|>` / `<|im_end|>` special tokens (see `added_tokens.json`), with `<|im_end|>` serving as the EOS token.
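+
+A minimal inference sketch, assuming the checkpoint is published under a hypothetical `<repo-id>` (standard `transformers` calls; nothing here is specific to this repo beyond `trust_remote_code`):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# "<repo-id>" is a placeholder for wherever this checkpoint is hosted.
+tokenizer = AutoTokenizer.from_pretrained("<repo-id>", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    "<repo-id>", trust_remote_code=True, torch_dtype="auto", device_map="auto"
+)
+
+# apply_chat_template renders the ChatML <|im_start|>...<|im_end|> format.
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Explain mixture-of-experts routing in one paragraph."},
+]
+input_ids = tokenizer.apply_chat_template(
+    messages, add_generation_prompt=True, return_tensors="pt"
+).to(model.device)
+
+output = model.generate(input_ids, max_new_tokens=256)
+print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
+```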
+
+## Training and evaluation data
+
+The training data is the Dolphin 2.9 mixture of ShareGPT-format JSONL datasets listed in the axolotl config above; 1% of it (`val_set_size: 0.01`) was held out as the evaluation set.
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 64
+- total_eval_batch_size: 8
+- optimizer: Paged 8-bit AdamW (`paged_adamw_8bit`) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 1
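+
+For reference, the effective batch size follows from the per-device settings above: total_train_batch_size = train_batch_size × gradient_accumulation_steps × num_devices = 1 × 8 × 8 = 64 packed sequences of up to 4096 tokens per optimizer step.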
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.4009 | 0.0 | 1 | 0.4328 |
+| 0.413 | 0.25 | 587 | 0.4408 |
+| 0.3626 | 0.5 | 1174 | 0.4368 |
+| 0.3896 | 0.75 | 1761 | 0.4336 |
+
+
+### Framework versions
+
+- Transformers 4.40.0.dev0
+- Pytorch 2.2.2+cu121
+- Datasets 2.15.0
+- Tokenizers 0.15.0
diff --git a/added_tokens.json b/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..8fd93dbaccc9e3824c96d24bd07102836233454e
--- /dev/null
+++ b/added_tokens.json
@@ -0,0 +1,4 @@
+{
+ "<|im_end|>": 100278,
+ "<|im_start|>": 100277
+}
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..818f2c4e5d0d13111d9e3a7749f3b8e510924130
--- /dev/null
+++ b/config.json
@@ -0,0 +1,39 @@
+{
+ "_name_or_path": "/workspace/axolotl/dbrx-checkpoint",
+ "architectures": [
+ "DbrxForCausalLM"
+ ],
+ "attn_config": {
+ "clip_qkv": 8,
+ "kv_n_heads": 8,
+ "model_type": "",
+ "rope_theta": 500000
+ },
+ "auto_map": {
+ "AutoConfig": "configuration_dbrx.DbrxConfig",
+ "AutoModelForCausalLM": "modeling_dbrx.DbrxForCausalLM"
+ },
+ "d_model": 6144,
+ "emb_pdrop": 0.0,
+ "ffn_config": {
+ "ffn_hidden_size": 10752,
+ "model_type": "",
+ "moe_jitter_eps": 0.01,
+ "moe_loss_weight": 0.05,
+ "moe_num_experts": 16,
+ "moe_top_k": 4
+ },
+ "initializer_range": 0.02,
+ "max_seq_len": 32768,
+ "model_type": "dbrx",
+ "n_heads": 48,
+ "n_layers": 40,
+ "output_router_logits": false,
+ "resid_pdrop": 0.0,
+ "router_aux_loss_coef": 0.05,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.40.0.dev0",
+ "use_cache": false,
+ "vocab_size": 100352
+}
diff --git a/configuration_dbrx.py b/configuration_dbrx.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8c387be81edd9a192e935aa44692726f061b508
--- /dev/null
+++ b/configuration_dbrx.py
@@ -0,0 +1,264 @@
+"""Dbrx configuration."""
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+ """Configuration class for Dbrx Attention.
+
+    This is the configuration class to store the configuration of a [`DbrxAttention`] class. It is used to
+    instantiate attention layers according to the specified arguments, defining the layers architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ attn_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability for the attention layers.
+        clip_qkv (`float`, *optional*, defaults to `None`):
+            If not `None`, clip the queries, keys, and values in the attention layer to this value.
+        kv_n_heads (`int`, *optional*, defaults to 1):
+            For grouped-query attention only, the number of key/value heads to use.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base frequency for rope.
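+
+    Example (a hypothetical standalone instantiation, mirroring the `attn_config`
+    block in this repo's config.json):
+    ```python
+    >>> attn_config = DbrxAttentionConfig(clip_qkv=8, kv_n_heads=8, rope_theta=500000)
+    ```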
+ """
+
+ def __init__(
+ self,
+        attn_pdrop: float = 0.0,
+ clip_qkv: Optional[float] = None,
+ kv_n_heads: int = 1,
+ rope_theta: float = 10000.0,
+ **kwargs: Any,
+ ):
+ super().__init__(**kwargs)
+ self.attn_pdrop = attn_pdrop
+ self.clip_qkv = clip_qkv
+ self.kv_n_heads = kv_n_heads
+ self.rope_theta = rope_theta
+
+ for k in ['model_type']:
+ if k in kwargs:
+ kwargs.pop(k)
+ if len(kwargs) != 0:
+ raise ValueError(f'Found unknown {kwargs=}')
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: str,
+ **kwargs: Any) -> 'PretrainedConfig':
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
+ **kwargs)
+
+ if config_dict.get('model_type') == 'dbrx':
+ config_dict = config_dict['attn_config']
+
+ if 'model_type' in config_dict and hasattr(
+ cls,
+ 'model_type') and config_dict['model_type'] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ +
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxFFNConfig(PretrainedConfig):
+ """Configuration class for Dbrx FFN.
+
+    This is the configuration class to store the configuration of a [`DbrxFFN`] class. It is used to instantiate
+    feedforward layers according to the specified arguments, defining the layers architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
+ The dict should have a key 'name' with the value being the name of
+ the activation function along with any additional keyword arguments.
+ ffn_hidden_size (int, optional): The hidden size of the feedforward network.
+ moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
+ moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
+ moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
+ moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
+ moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
+ uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
+ This should only be used for benchmarking purposes.
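+
+    Example (a hypothetical standalone instantiation, mirroring the `ffn_config`
+    block in this repo's config.json: 16 experts with top-4 routing):
+    ```python
+    >>> ffn_config = DbrxFFNConfig(ffn_hidden_size=10752, moe_num_experts=16, moe_top_k=4)
+    ```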
+ """
+
+ def __init__(
+ self,
+ ffn_act_fn: Optional[dict] = None,
+ ffn_hidden_size: int = 3584,
+ moe_num_experts: int = 4,
+ moe_top_k: int = 1,
+ moe_jitter_eps: Optional[float] = None,
+ moe_loss_weight: float = 0.01,
+ moe_normalize_expert_weights: Optional[float] = 1,
+ uniform_expert_assignment: bool = False,
+ **kwargs: Any,
+ ):
+ super().__init__()
+ if ffn_act_fn is None:
+ ffn_act_fn = {'name': 'silu'}
+ self.ffn_act_fn = ffn_act_fn
+ self.ffn_hidden_size = ffn_hidden_size
+ self.moe_num_experts = moe_num_experts
+ self.moe_top_k = moe_top_k
+ self.moe_jitter_eps = moe_jitter_eps
+ self.moe_loss_weight = moe_loss_weight
+ self.moe_normalize_expert_weights = moe_normalize_expert_weights
+ self.uniform_expert_assignment = uniform_expert_assignment
+
+ for k in ['model_type']:
+ if k in kwargs:
+ kwargs.pop(k)
+ if len(kwargs) != 0:
+ raise ValueError(f'Found unknown {kwargs=}')
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: str,
+ **kwargs: Any) -> 'PretrainedConfig':
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
+ **kwargs)
+
+ if config_dict.get('model_type') == 'dbrx':
+ config_dict = config_dict['ffn_config']
+
+ if 'model_type' in config_dict and hasattr(
+ cls,
+ 'model_type') and config_dict['model_type'] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ +
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxConfig(PretrainedConfig):
+ """Configuration class for Dbrx.
+
+    This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a
+    Dbrx model according to the specified arguments, defining the model architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ d_model (`int`, *optional*, defaults to 6144):
+ Dimensionality of the embeddings and hidden states.
+ n_heads (`int`, *optional*, defaults to 48):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ n_layers (`int`, *optional*, defaults to 40):
+ Number of hidden layers in the Transformer encoder.
+ max_seq_len (`int`, *optional*, defaults to 32768):
+ The maximum sequence length of the model.
+ vocab_size (`int`, *optional*, defaults to 100352):
+ Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`DbrxModel`].
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability applied to the attention output before combining with residual.
+ emb_pdrop (`float`, *optional*, defaults to 0.0):
+ The dropout probability for the embedding layer.
+ attn_config (`dict`, *optional*):
+ A dictionary used to configure the model's attention module.
+ ffn_config (`dict`, *optional*):
+ A dictionary used to configure the model's FFN module.
+ use_cache (`bool`, *optional*, defaults to `False`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ output_router_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary load-balancing loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.05):
+ The aux loss factor for the total loss.
+
+
+ Example:
+ ```python
+ >>> from transformers import DbrxConfig, DbrxModel
+
+ >>> # Initializing a Dbrx configuration
+ >>> configuration = DbrxConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = DbrxModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+ """
+
+ model_type = 'dbrx'
+ attribute_map = {
+ 'num_attention_heads': 'n_heads',
+ 'hidden_size': 'd_model',
+ 'num_hidden_layers': 'n_layers',
+ 'max_position_embeddings': 'max_seq_len'
+ }
+
+ def __init__(
+ self,
+ d_model: int = 2048,
+ n_heads: int = 16,
+ n_layers: int = 24,
+ max_seq_len: int = 2048,
+ vocab_size: int = 32000,
+ resid_pdrop: float = 0.0,
+ emb_pdrop: float = 0.0,
+ attn_config: Optional[DbrxAttentionConfig] = None,
+ ffn_config: Optional[DbrxFFNConfig] = None,
+ use_cache: bool = True,
+ initializer_range: float = 0.02,
+ output_router_logits: bool = False,
+ router_aux_loss_coef: float = 0.05,
+ **kwargs: Any,
+ ):
+ if attn_config is None:
+ self.attn_config = DbrxAttentionConfig()
+ elif isinstance(attn_config, dict):
+ self.attn_config = DbrxAttentionConfig(**attn_config)
+ else:
+ self.attn_config = attn_config
+
+ if ffn_config is None:
+ self.ffn_config = DbrxFFNConfig()
+ elif isinstance(ffn_config, dict):
+ self.ffn_config = DbrxFFNConfig(**ffn_config)
+ else:
+ self.ffn_config = ffn_config
+
+ self.d_model = d_model
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.max_seq_len = max_seq_len
+ self.vocab_size = vocab_size
+ self.resid_pdrop = resid_pdrop
+ self.emb_pdrop = emb_pdrop
+ self.use_cache = use_cache
+ self.initializer_range = initializer_range
+ self.output_router_logits = output_router_logits
+ self.router_aux_loss_coef = router_aux_loss_coef
+
+ tie_word_embeddings = kwargs.pop('tie_word_embeddings', False)
+ if tie_word_embeddings:
+ raise ValueError(
+ 'tie_word_embeddings is not supported for Dbrx models.')
+
+ super().__init__(
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e80125b496d618b762f321e0d026bd80f2e937a
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,5 @@
+{
+ "_from_model_config": true,
+ "do_sample": true,
+ "transformers_version": "4.40.0.dev0"
+}
diff --git a/model-00001-of-00054.safetensors b/model-00001-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ddcc762fe3bc7954d216e56a1a9e1db99f8862e5
--- /dev/null
+++ b/model-00001-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:844d6ee310e60f776437f34b532a89764c869474fa771babcc88f457a1a41b49
+size 4976767312
diff --git a/model-00002-of-00054.safetensors b/model-00002-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..50aae60cf016537ca9b266594fcebf161f02093f
--- /dev/null
+++ b/model-00002-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67edbaf8ce22cef89b551054543eaf80c2a65f71702a0a6e818300e19a7d9883
+size 4932728256
diff --git a/model-00003-of-00054.safetensors b/model-00003-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..22b4581ccf33f6d6376288331442705671a5460a
--- /dev/null
+++ b/model-00003-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f9432fdf70381f02519e595cc1a32171a24cdff89817bab9b9162d9261df8cb
+size 4932728256
diff --git a/model-00004-of-00054.safetensors b/model-00004-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f47e81494ad23344068845bfa3be26689ad09d6d
--- /dev/null
+++ b/model-00004-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42a92ceadf870e5641c61925094356618bbccb078ee8ddadf6e0fe7000f02a22
+size 4888466376
diff --git a/model-00005-of-00054.safetensors b/model-00005-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0cb009a051de0e5f1829b8e1e31e1d1e13df1d83
--- /dev/null
+++ b/model-00005-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbaeb5f57d35a71310802168cc1c7d0c40adff467da82b4a40bd7923a9ee35e4
+size 4932728248
diff --git a/model-00006-of-00054.safetensors b/model-00006-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c31cad1fa0905af91ed784e09c224cb6c753004e
--- /dev/null
+++ b/model-00006-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b968c9e9d8e880db7517ed6d6fce3688ba186bd6bf52813cb5be6d2ca9d94bd
+size 4932728256
diff --git a/model-00007-of-00054.safetensors b/model-00007-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7dace946fb66554538eb0540993badd35002e267
--- /dev/null
+++ b/model-00007-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc898a1265501c5d7816d0d5a5bec727c82ff398e34b01216d19a08d1276441
+size 4932728256
diff --git a/model-00008-of-00054.safetensors b/model-00008-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3e7cd9a6a020ba15264246af703ae6652eccbed1
--- /dev/null
+++ b/model-00008-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a03a9d91c30daf0be095e7fe207f2ec64459f3b84de5354436d66ee7bc87fdb5
+size 4888466376
diff --git a/model-00009-of-00054.safetensors b/model-00009-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ec4e171ee7f219791388dc569584cfb6cb014b84
--- /dev/null
+++ b/model-00009-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019e332f7c02b72f409d07597a51fe6b9750f7e3c6844e625ff7d5b64fc53dd4
+size 4932728248
diff --git a/model-00010-of-00054.safetensors b/model-00010-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b07d5c235d4890b64988afc3addbc980ebe5a818
--- /dev/null
+++ b/model-00010-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08796964f4ab0f84558720edf6eaa3a84d4284fd8fb46e5efa8c307acee50bfb
+size 4932728256
diff --git a/model-00011-of-00054.safetensors b/model-00011-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e6159d0207a7a5e12a4a1eec69b61898a0168707
--- /dev/null
+++ b/model-00011-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b923056be1b7922da2ccde84fc5fbce0e4493e74d075080c9cff6d6d72baccc
+size 4932728256
diff --git a/model-00012-of-00054.safetensors b/model-00012-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cde62cc9db78749c584823105248d66d14635a4a
--- /dev/null
+++ b/model-00012-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4df2fe028ea52491ed31525dd581ecac07e19da6585e066ed00513f956c1e4a2
+size 4888466376
diff --git a/model-00013-of-00054.safetensors b/model-00013-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..015a792803b40114974e9389d28e6ca36e8f2e6b
--- /dev/null
+++ b/model-00013-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51cc2f9e11b4d986cf83a0568afb4ae3a0796606bd9bdd8337a4f44f734ca86b
+size 4932728240
diff --git a/model-00014-of-00054.safetensors b/model-00014-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8ad4b949c4f48b84d5182a6e31d2c94c7ffae136
--- /dev/null
+++ b/model-00014-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:166b445c266747bfef1b529494a8b64b8014ae9f681d3b31bb279f5ce56148dc
+size 4932728280
diff --git a/model-00015-of-00054.safetensors b/model-00015-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6fdc9a7c9e682fb40e4222a967cbcaa768b84a3f
--- /dev/null
+++ b/model-00015-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51c4a8745ab117e3993ec7e41933885240759ff5766b61309e6210740e5f0687
+size 4932728296
diff --git a/model-00016-of-00054.safetensors b/model-00016-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4a883a164797650bf1dc85091bda24c84e3f69b6
--- /dev/null
+++ b/model-00016-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:401be63b10132316fd3d16a1a4eb5ab1b41d38dcd24dfba7585a13faf36b1d55
+size 4888466416
diff --git a/model-00017-of-00054.safetensors b/model-00017-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bbbc912d299f44183314c27c7244648ede92384a
--- /dev/null
+++ b/model-00017-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f508fbcfac88c06a1a9c01c6041530cd7bc32261753faa86240ce733f96c335d
+size 4932728288
diff --git a/model-00018-of-00054.safetensors b/model-00018-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..88814875df426f1c69ec030fa3bd4dd4cb4c6be3
--- /dev/null
+++ b/model-00018-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba87d72f59502b88a04fa2a145f73712d412fa784e63c816b72f872cccc167e3
+size 4932728296
diff --git a/model-00019-of-00054.safetensors b/model-00019-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bc0f854410a250214becd65d770e9445b79a5d8f
--- /dev/null
+++ b/model-00019-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22c0313fb62cb31922e14d2d71b3794d6ca82a6193ca1fced36fb57fc445b0c2
+size 4932728296
diff --git a/model-00020-of-00054.safetensors b/model-00020-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1ce508bdc49296058c5202a2d35df31732c26031
--- /dev/null
+++ b/model-00020-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54b983476b64d4eafadfe50647c307d0bd1114415be8eb1e2f65c612c778bf07
+size 4888466416
diff --git a/model-00021-of-00054.safetensors b/model-00021-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3bca03bda6969424cb4adf5cf3e2ea043e0f6a1f
--- /dev/null
+++ b/model-00021-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d59190fb228d8490d7c65a4003e7f65877bd8d0f2c527b7b3a1b493026efa88
+size 4932728288
diff --git a/model-00022-of-00054.safetensors b/model-00022-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ddf8f402bce562831ce0a7d40ec776ef1866b8c3
--- /dev/null
+++ b/model-00022-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af8b37e5d9a55ff376f923cafe175c109ab3329dc0dcd703d1d5110d25f5cd2f
+size 4932728296
diff --git a/model-00023-of-00054.safetensors b/model-00023-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..468b4594daf2ec82c63d56d4223bbffca8d6f3d2
--- /dev/null
+++ b/model-00023-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:042526a2936e7fb570abd769da46aedd6fcc65745aaee13f00bcdc70a5e06b81
+size 4932728296
diff --git a/model-00024-of-00054.safetensors b/model-00024-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3fa69f3fccee8906b19b5d32d2c8a521a9234522
--- /dev/null
+++ b/model-00024-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eebc383a0a528db88d0d4965a02b4cbc81d702991963e4523a6b2dfc3a9151f9
+size 4888466416
diff --git a/model-00025-of-00054.safetensors b/model-00025-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..93126067808f613bf91694841b88091dea3ad6b4
--- /dev/null
+++ b/model-00025-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ae5b9fcc37e67350cd83ff8b3db1313d119ed5afc7d594ab8a6918077918eba
+size 4932728288
diff --git a/model-00026-of-00054.safetensors b/model-00026-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d3a78c63273c1e3f92cf710c1f69f3163468e6f2
--- /dev/null
+++ b/model-00026-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:033d368ca1793ae5ba73bb03b8c6cc7256eb4a18f77329c2c8fdeb8b5fbd3411
+size 4932728296
diff --git a/model-00027-of-00054.safetensors b/model-00027-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..23ebab5cc8107364c30eefe35b755d91f63354fc
--- /dev/null
+++ b/model-00027-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84f35cfd716d3f28eb636926ab692a0fef1cca2a5fc6df2aaf508895b4d6b8c5
+size 4932728296
diff --git a/model-00028-of-00054.safetensors b/model-00028-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8b60bc8a3de05295ad5860d4c914bc5e4fc9ada8
--- /dev/null
+++ b/model-00028-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee70a22cc0b03e27154164b623d987837f156c813d3ad3db979859995a9101da
+size 4888466416
diff --git a/model-00029-of-00054.safetensors b/model-00029-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..83246d499a4b86c50423a5090163d32e4d896037
--- /dev/null
+++ b/model-00029-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d970806d6867f0808c497eb159a21dcded2194cc676a3da58159c5449a424c8
+size 4932728288
diff --git a/model-00030-of-00054.safetensors b/model-00030-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ea89c07d58b616f8f8bed69ddea44d88678c6859
--- /dev/null
+++ b/model-00030-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2d1b9f8845e95ca6afcd803e158d82807646f35114e06164451d303b9ab9ec8
+size 4932728296
diff --git a/model-00031-of-00054.safetensors b/model-00031-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..afa20e5c00619da964d3119aab54fd92a62c6562
--- /dev/null
+++ b/model-00031-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b116b356f8871f98dcac1130097315bb6e534e9f91e8703b2c5a12ad9ba000f
+size 4932728296
diff --git a/model-00032-of-00054.safetensors b/model-00032-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d1195d5381701518594d59ee9a8903145a0bdc8f
--- /dev/null
+++ b/model-00032-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ca0e4c2054445a7b33a548f45c6b09bf469e97ffb0c48e27e6b277bf04e6037
+size 4888466416
diff --git a/model-00033-of-00054.safetensors b/model-00033-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5c52732c9b76b4f35b19aa4988540eeca4a8687c
--- /dev/null
+++ b/model-00033-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e18cbafcfc97cecc047fa18f64ff805b61a3976b8b6b01b333c6cae73c3b9797
+size 4932728288
diff --git a/model-00034-of-00054.safetensors b/model-00034-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e61ea9f213df58eb80bed2cd7a824b6c5ec8bf1f
--- /dev/null
+++ b/model-00034-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc7bfcbd66ee533cd39cf2c236ac7a32f249f4b90c6a1d025bd30e3dcba8b37e
+size 4932728288
diff --git a/model-00035-of-00054.safetensors b/model-00035-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6da8c0274fb1b49a54a9ff00f299a1ce3d1e8e25
--- /dev/null
+++ b/model-00035-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3957da1791e004a08595a89a2ea4587c168a1c6b916da521fd4fde3751b68a89
+size 4932728296
diff --git a/model-00036-of-00054.safetensors b/model-00036-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f2ba0e35a3e3c8b8ca195c9452a78227e1322a8e
--- /dev/null
+++ b/model-00036-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5691f61db31dd0894d272f6a2107e366484825fe1279952f9abfc835421cf16e
+size 4989142256
diff --git a/model-00037-of-00054.safetensors b/model-00037-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a5aa6e8fa63a5a302116c356031f7d299af3df11
--- /dev/null
+++ b/model-00037-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21f2dbd599835d83511dd2122d5bea4ad6647f521f950dcb901699c1aa1bcfcb
+size 4964173160
diff --git a/model-00038-of-00054.safetensors b/model-00038-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cc5733076103fe074cfad8fa2215eeb578f8f5f1
--- /dev/null
+++ b/model-00038-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:500fd82e253552d7283f6bc2dd7287a1cfc524d3a483bd6e525de912238c815c
+size 4932728288
diff --git a/model-00039-of-00054.safetensors b/model-00039-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a2f72cd067eb3d692aea0e354674b6d61579d301
--- /dev/null
+++ b/model-00039-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79307da855fad9bd2377cc36c914938657dfc6554a35edaee4874b6153bef98f
+size 4932728296
diff --git a/model-00040-of-00054.safetensors b/model-00040-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..674179602aea35f3e85b149b3a85f56ce7bf683e
--- /dev/null
+++ b/model-00040-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b300507198f682ee40a81b1af4b16169023ae07fc3f45767eea3d0019c8f84f6
+size 4932728296
diff --git a/model-00041-of-00054.safetensors b/model-00041-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..af658cd70e0c1e07b6081db2ee48d6ff2238a07e
--- /dev/null
+++ b/model-00041-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c20d1529859d1a2cd0ba96512ce0dfe4d97137e591febf0998d80a2ee497731
+size 4888466408
diff --git a/model-00042-of-00054.safetensors b/model-00042-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9c5418ec99c7bd769bb37bc336d5918a957779dd
--- /dev/null
+++ b/model-00042-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfdf0dcff1dc6f5da4754dbf6d58f4ec69102b185f95a3116c106597c9fd34b6
+size 4932728288
diff --git a/model-00043-of-00054.safetensors b/model-00043-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bdd9dfcc66a0a4c2293f29a3ba6a038790f9934f
--- /dev/null
+++ b/model-00043-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d62413dbb7ec0a905a8f03b47f86693ddf0570c35dc8afb83cdc31892708d420
+size 4932728296
diff --git a/model-00044-of-00054.safetensors b/model-00044-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a7858820231c3d1057961202585f745cf949a3f4
--- /dev/null
+++ b/model-00044-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0de53f440d3e537a70891a225e39555f0a730ae2ba92916f98087e86531d330d
+size 4932728296
diff --git a/model-00045-of-00054.safetensors b/model-00045-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..69cf3adc330fd8cca46a32cb02bc29913ad54d34
--- /dev/null
+++ b/model-00045-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74006b98dac79bfa8765e97abab9ef348e65c76b581c7810578489ab7c2258cc
+size 4888466408
diff --git a/model-00046-of-00054.safetensors b/model-00046-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..64cf7fe5dba42e823740702bb2ed9b4f2086cd94
--- /dev/null
+++ b/model-00046-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73f51a9b5c9acf866d7ed02b12e7a250ad1beb790c91408f576a254da750b635
+size 4932728288
diff --git a/model-00047-of-00054.safetensors b/model-00047-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f377108d89abd808538cac2f1455f2c281e0aa1
--- /dev/null
+++ b/model-00047-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67a91080f6f0aa8d2a9a6981be7ae4161a625d10dd79347d45c1834cb5d38aff
+size 4932728296
diff --git a/model-00048-of-00054.safetensors b/model-00048-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c99f6c382198a7c6d910922b6f8f61de8cc5458e
--- /dev/null
+++ b/model-00048-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:224d1b62723bf612b8758ec2e30490508110571541ad9df597d389f462b24dcd
+size 4932728296
diff --git a/model-00049-of-00054.safetensors b/model-00049-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1086a827de2feab6df6a3ba2ec00165bb07e2406
--- /dev/null
+++ b/model-00049-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8890a1f9f691819d7ced6e5170d73fc83cad1a98262bcd40f3fb364a93cfc664
+size 4888466408
diff --git a/model-00050-of-00054.safetensors b/model-00050-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d1442f08797b9dd90d7b0a97b0c37a8ee9458b9f
--- /dev/null
+++ b/model-00050-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13724ca02f921494021f2b03c292dd852c5f1133e1e60c1cdcc07a247465bf47
+size 4932728288
diff --git a/model-00051-of-00054.safetensors b/model-00051-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..71560cd413f00a7f1903fa63dd526cf6d1b8149b
--- /dev/null
+++ b/model-00051-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30120bc6a14b2531eb6b1faa369b838a4dab9863aaa75ed1eebe919caa681e1
+size 4932728296
diff --git a/model-00052-of-00054.safetensors b/model-00052-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f121d969fa1efeb751f62f5106a396e02e58d551
--- /dev/null
+++ b/model-00052-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c7e8dfcfe4189122f5f558352f8aa35f0d8df3ffa5d8316634a06a6b73392c0
+size 4932728296
diff --git a/model-00053-of-00054.safetensors b/model-00053-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..70b8dbaea57dc9eed182fc2ee67b252794f9383f
--- /dev/null
+++ b/model-00053-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e6b8f8df0793fe3c5133a3643c269ff16d58bb03a01ef20c6531a2fb0f637c4
+size 4888466416
diff --git a/model-00054-of-00054.safetensors b/model-00054-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..33dfeb8838f519f9742529da870214a48937e391
--- /dev/null
+++ b/model-00054-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff597e0c7f828a974de3149d022689165724784f00e3443a544b684594c1e9d1
+size 2157982888
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..481f189b62ed8caaf9cd2f0c9d2f6e3e453c1894
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,2130 @@
+{
+ "metadata": {
+ "total_size": 263193047040
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.0.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.0.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.0.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.1.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.1.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.1.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.10.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.10.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.10.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.11.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.11.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.11.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.12.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.12.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.12.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.13.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.13.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.13.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.14.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.14.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.14.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.15.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.15.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.15.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.2.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.2.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.2.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.3.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.3.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.3.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.4.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.4.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.4.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.5.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.5.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.5.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.6.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.6.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.6.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.7.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.7.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.7.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.8.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.8.w1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.8.w2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.9.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.9.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.9.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.0.ffn.router.layer.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.norm_attn_norm.attn.Wqkv.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.norm_attn_norm.attn.out_proj.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.norm_attn_norm.norm_1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.norm_attn_norm.norm_2.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.0.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.0.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.0.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.1.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.1.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.1.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.10.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.10.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.10.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.11.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.11.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.11.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.12.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.12.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.12.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.13.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.13.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.13.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.14.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.14.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.14.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.15.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.15.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.15.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.2.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.2.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.2.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.3.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.3.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.3.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.4.v1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.4.w1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.4.w2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.5.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.5.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.5.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.6.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.6.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.6.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.7.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.7.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.7.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.8.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.8.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.8.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.9.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.9.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.experts.mlp_experts.9.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.1.ffn.router.layer.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.norm_attn_norm.attn.Wqkv.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.norm_attn_norm.attn.out_proj.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.norm_attn_norm.norm_1.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.1.norm_attn_norm.norm_2.weight": "model-00002-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.0.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.0.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.0.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.1.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.1.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.1.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.10.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.10.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.10.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.11.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.11.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.11.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.12.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.12.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.12.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.13.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.13.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.13.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.14.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.14.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.14.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.15.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.15.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.15.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.2.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.2.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.2.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.3.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.3.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.3.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.4.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.4.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.4.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.5.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.5.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.5.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.6.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.6.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.6.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.7.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.7.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.7.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.8.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.8.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.8.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.9.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.9.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.experts.mlp_experts.9.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.10.ffn.router.layer.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.norm_attn_norm.attn.Wqkv.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.norm_attn_norm.attn.out_proj.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.norm_attn_norm.norm_1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.10.norm_attn_norm.norm_2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.0.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.0.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.0.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.1.v1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.1.w1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.1.w2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.10.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.10.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.10.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.11.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.11.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.11.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.12.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.12.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.12.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.13.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.13.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.13.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.14.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.14.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.14.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.15.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.15.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.15.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.2.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.2.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.2.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.3.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.3.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.3.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.4.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.4.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.4.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.5.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.5.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.5.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.6.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.6.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.6.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.7.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.7.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.7.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.8.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.8.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.8.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.9.v1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.9.w1.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.experts.mlp_experts.9.w2.weight": "model-00016-of-00054.safetensors",
+ "transformer.blocks.11.ffn.router.layer.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.norm_attn_norm.attn.Wqkv.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.norm_attn_norm.attn.out_proj.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.norm_attn_norm.norm_1.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.11.norm_attn_norm.norm_2.weight": "model-00015-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.0.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.0.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.0.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.1.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.1.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.1.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.10.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.10.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.10.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.11.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.11.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.11.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.12.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.12.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.12.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.13.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.13.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.13.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.14.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.14.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.14.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.15.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.15.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.15.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.2.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.2.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.2.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.3.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.3.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.3.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.4.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.4.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.4.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.5.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.5.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.5.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.6.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.6.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.6.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.7.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.7.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.7.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.8.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.8.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.8.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.9.v1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.9.w1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.experts.mlp_experts.9.w2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.ffn.router.layer.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.norm_attn_norm.attn.Wqkv.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.norm_attn_norm.attn.out_proj.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.norm_attn_norm.norm_1.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.12.norm_attn_norm.norm_2.weight": "model-00017-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.0.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.0.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.0.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.1.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.1.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.1.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.10.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.10.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.10.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.11.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.11.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.11.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.12.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.12.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.12.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.13.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.13.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.13.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.14.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.14.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.14.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.15.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.15.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.15.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.2.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.2.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.2.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.3.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.3.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.3.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.4.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.4.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.4.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.5.v1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.5.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.5.w2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.6.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.6.w1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.6.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.7.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.7.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.7.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.8.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.8.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.8.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.9.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.9.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.experts.mlp_experts.9.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.13.ffn.router.layer.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.norm_attn_norm.attn.Wqkv.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.norm_attn_norm.attn.out_proj.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.norm_attn_norm.norm_1.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.13.norm_attn_norm.norm_2.weight": "model-00018-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.0.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.0.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.0.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.1.v1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.1.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.1.w2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.10.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.10.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.10.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.11.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.11.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.11.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.12.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.12.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.12.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.13.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.13.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.13.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.14.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.14.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.14.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.15.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.15.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.15.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.2.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.2.w1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.2.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.3.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.3.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.3.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.4.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.4.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.4.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.5.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.5.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.5.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.6.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.6.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.6.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.7.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.7.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.7.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.8.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.8.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.8.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.9.v1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.9.w1.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.experts.mlp_experts.9.w2.weight": "model-00020-of-00054.safetensors",
+ "transformer.blocks.14.ffn.router.layer.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.norm_attn_norm.attn.Wqkv.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.norm_attn_norm.attn.out_proj.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.norm_attn_norm.norm_1.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.14.norm_attn_norm.norm_2.weight": "model-00019-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.0.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.0.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.0.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.1.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.1.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.1.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.10.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.10.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.10.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.11.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.11.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.11.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.12.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.12.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.12.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.13.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.13.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.13.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.14.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.14.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.14.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.15.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.15.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.15.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.2.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.2.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.2.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.3.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.3.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.3.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.4.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.4.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.4.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.5.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.5.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.5.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.6.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.6.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.6.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.7.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.7.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.7.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.8.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.8.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.8.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.9.v1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.9.w1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.experts.mlp_experts.9.w2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.ffn.router.layer.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.norm_attn_norm.attn.Wqkv.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.norm_attn_norm.attn.out_proj.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.norm_attn_norm.norm_1.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.15.norm_attn_norm.norm_2.weight": "model-00021-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.0.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.0.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.0.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.1.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.1.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.1.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.10.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.10.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.10.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.11.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.11.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.11.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.12.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.12.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.12.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.13.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.13.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.13.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.14.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.14.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.14.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.15.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.15.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.15.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.2.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.2.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.2.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.3.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.3.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.3.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.4.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.4.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.4.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.5.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.5.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.5.w2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.6.v1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.6.w1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.6.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.7.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.7.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.7.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.8.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.8.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.8.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.9.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.9.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.experts.mlp_experts.9.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.16.ffn.router.layer.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.norm_attn_norm.attn.Wqkv.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.norm_attn_norm.attn.out_proj.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.norm_attn_norm.norm_1.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.16.norm_attn_norm.norm_2.weight": "model-00022-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.0.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.0.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.0.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.1.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.1.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.1.w2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.10.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.10.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.10.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.11.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.11.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.11.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.12.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.12.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.12.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.13.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.13.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.13.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.14.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.14.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.14.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.15.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.15.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.15.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.2.v1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.2.w1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.2.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.3.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.3.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.3.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.4.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.4.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.4.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.5.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.5.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.5.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.6.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.6.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.6.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.7.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.7.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.7.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.8.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.8.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.8.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.9.v1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.9.w1.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.experts.mlp_experts.9.w2.weight": "model-00024-of-00054.safetensors",
+ "transformer.blocks.17.ffn.router.layer.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.norm_attn_norm.attn.Wqkv.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.norm_attn_norm.attn.out_proj.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.norm_attn_norm.norm_1.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.17.norm_attn_norm.norm_2.weight": "model-00023-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.0.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.0.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.0.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.1.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.1.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.1.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.10.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.10.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.10.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.11.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.11.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.11.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.12.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.12.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.12.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.13.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.13.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.13.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.14.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.14.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.14.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.15.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.15.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.15.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.2.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.2.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.2.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.3.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.3.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.3.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.4.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.4.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.4.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.5.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.5.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.5.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.6.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.6.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.6.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.7.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.7.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.7.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.8.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.8.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.8.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.9.v1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.9.w1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.experts.mlp_experts.9.w2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.ffn.router.layer.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.norm_attn_norm.attn.Wqkv.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.norm_attn_norm.attn.out_proj.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.norm_attn_norm.norm_1.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.18.norm_attn_norm.norm_2.weight": "model-00025-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.0.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.0.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.0.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.1.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.1.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.1.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.10.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.10.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.10.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.11.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.11.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.11.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.12.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.12.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.12.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.13.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.13.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.13.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.14.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.14.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.14.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.15.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.15.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.15.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.2.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.2.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.2.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.3.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.3.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.3.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.4.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.4.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.4.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.5.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.5.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.5.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.6.v1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.6.w1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.6.w2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.7.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.7.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.7.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.8.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.8.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.8.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.9.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.9.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.experts.mlp_experts.9.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.19.ffn.router.layer.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.norm_attn_norm.attn.Wqkv.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.norm_attn_norm.attn.out_proj.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.norm_attn_norm.norm_1.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.19.norm_attn_norm.norm_2.weight": "model-00026-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.0.v1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.0.w1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.0.w2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.1.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.1.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.1.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.10.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.10.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.10.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.11.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.11.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.11.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.12.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.12.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.12.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.13.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.13.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.13.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.14.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.14.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.14.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.15.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.15.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.15.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.2.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.2.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.2.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.3.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.3.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.3.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.4.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.4.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.4.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.5.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.5.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.5.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.6.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.6.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.6.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.7.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.7.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.7.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.8.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.8.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.8.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.9.v1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.9.w1.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.experts.mlp_experts.9.w2.weight": "model-00004-of-00054.safetensors",
+ "transformer.blocks.2.ffn.router.layer.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.2.norm_attn_norm.attn.Wqkv.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.2.norm_attn_norm.attn.out_proj.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.2.norm_attn_norm.norm_1.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.2.norm_attn_norm.norm_2.weight": "model-00003-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.0.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.0.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.0.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.1.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.1.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.1.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.10.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.10.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.10.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.11.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.11.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.11.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.12.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.12.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.12.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.13.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.13.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.13.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.14.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.14.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.14.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.15.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.15.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.15.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.2.v1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.2.w1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.2.w2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.3.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.3.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.3.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.4.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.4.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.4.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.5.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.5.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.5.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.6.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.6.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.6.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.7.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.7.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.7.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.8.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.8.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.8.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.9.v1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.9.w1.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.experts.mlp_experts.9.w2.weight": "model-00028-of-00054.safetensors",
+ "transformer.blocks.20.ffn.router.layer.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.norm_attn_norm.attn.Wqkv.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.norm_attn_norm.attn.out_proj.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.norm_attn_norm.norm_1.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.20.norm_attn_norm.norm_2.weight": "model-00027-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.0.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.0.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.0.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.1.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.1.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.1.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.10.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.10.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.10.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.11.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.11.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.11.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.12.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.12.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.12.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.13.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.13.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.13.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.14.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.14.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.14.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.15.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.15.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.15.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.2.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.2.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.2.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.3.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.3.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.3.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.4.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.4.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.4.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.5.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.5.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.5.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.6.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.6.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.6.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.7.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.7.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.7.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.8.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.8.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.8.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.9.v1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.9.w1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.experts.mlp_experts.9.w2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.ffn.router.layer.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.norm_attn_norm.attn.Wqkv.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.norm_attn_norm.attn.out_proj.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.norm_attn_norm.norm_1.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.21.norm_attn_norm.norm_2.weight": "model-00029-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.0.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.0.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.0.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.1.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.1.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.1.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.10.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.10.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.10.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.11.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.11.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.11.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.12.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.12.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.12.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.13.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.13.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.13.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.14.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.14.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.14.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.15.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.15.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.15.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.2.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.2.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.2.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.3.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.3.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.3.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.4.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.4.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.4.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.5.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.5.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.5.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.6.v1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.6.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.6.w2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.7.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.7.w1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.7.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.8.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.8.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.8.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.9.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.9.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.experts.mlp_experts.9.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.22.ffn.router.layer.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.norm_attn_norm.attn.Wqkv.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.norm_attn_norm.attn.out_proj.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.norm_attn_norm.norm_1.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.22.norm_attn_norm.norm_2.weight": "model-00030-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.0.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.0.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.0.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.1.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.1.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.1.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.10.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.10.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.10.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.11.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.11.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.11.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.12.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.12.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.12.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.13.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.13.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.13.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.14.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.14.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.14.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.15.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.15.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.15.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.2.v1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.2.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.2.w2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.3.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.3.w1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.3.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.4.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.4.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.4.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.5.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.5.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.5.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.6.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.6.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.6.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.7.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.7.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.7.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.8.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.8.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.8.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.9.v1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.9.w1.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.experts.mlp_experts.9.w2.weight": "model-00032-of-00054.safetensors",
+ "transformer.blocks.23.ffn.router.layer.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.norm_attn_norm.attn.Wqkv.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.norm_attn_norm.attn.out_proj.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.norm_attn_norm.norm_1.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.23.norm_attn_norm.norm_2.weight": "model-00031-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.0.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.0.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.0.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.1.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.1.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.1.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.10.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.10.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.10.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.11.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.11.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.11.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.12.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.12.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.12.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.13.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.13.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.13.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.14.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.14.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.14.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.15.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.15.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.15.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.2.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.2.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.2.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.3.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.3.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.3.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.4.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.4.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.4.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.5.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.5.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.5.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.6.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.6.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.6.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.7.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.7.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.7.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.8.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.8.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.8.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.9.v1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.9.w1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.experts.mlp_experts.9.w2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.ffn.router.layer.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.norm_attn_norm.attn.Wqkv.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.norm_attn_norm.attn.out_proj.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.norm_attn_norm.norm_1.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.24.norm_attn_norm.norm_2.weight": "model-00033-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.0.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.0.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.0.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.1.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.1.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.1.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.10.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.10.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.10.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.11.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.11.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.11.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.12.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.12.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.12.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.13.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.13.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.13.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.14.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.14.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.14.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.15.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.15.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.15.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.2.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.2.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.2.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.3.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.3.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.3.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.4.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.4.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.4.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.5.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.5.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.5.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.6.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.6.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.6.w2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.7.v1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.7.w1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.7.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.8.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.8.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.8.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.9.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.9.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.experts.mlp_experts.9.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.25.ffn.router.layer.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.norm_attn_norm.attn.Wqkv.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.norm_attn_norm.attn.out_proj.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.norm_attn_norm.norm_1.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.25.norm_attn_norm.norm_2.weight": "model-00034-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.0.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.0.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.0.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.1.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.1.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.1.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.10.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.10.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.10.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.11.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.11.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.11.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.12.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.12.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.12.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.13.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.13.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.13.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.14.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.14.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.14.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.15.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.15.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.15.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.2.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.2.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.2.w2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.3.v1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.3.w1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.3.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.4.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.4.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.4.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.5.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.5.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.5.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.6.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.6.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.6.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.7.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.7.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.7.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.8.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.8.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.8.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.9.v1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.9.w1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.experts.mlp_experts.9.w2.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.26.ffn.router.layer.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.norm_attn_norm.attn.Wqkv.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.norm_attn_norm.attn.out_proj.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.norm_attn_norm.norm_1.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.26.norm_attn_norm.norm_2.weight": "model-00035-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.0.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.0.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.0.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.1.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.1.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.1.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.10.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.10.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.10.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.11.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.11.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.11.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.12.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.12.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.12.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.13.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.13.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.13.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.14.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.14.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.14.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.15.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.15.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.15.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.2.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.2.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.2.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.3.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.3.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.3.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.4.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.4.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.4.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.5.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.5.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.5.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.6.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.6.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.6.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.7.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.7.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.7.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.8.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.8.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.8.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.9.v1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.9.w1.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.experts.mlp_experts.9.w2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.ffn.router.layer.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.norm_attn_norm.attn.Wqkv.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.27.norm_attn_norm.attn.out_proj.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.27.norm_attn_norm.norm_1.weight": "model-00036-of-00054.safetensors",
+ "transformer.blocks.27.norm_attn_norm.norm_2.weight": "model-00037-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.0.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.0.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.0.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.1.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.1.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.1.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.10.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.10.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.10.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.11.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.11.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.11.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.12.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.12.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.12.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.13.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.13.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.13.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.14.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.14.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.14.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.15.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.15.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.15.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.2.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.2.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.2.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.3.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.3.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.3.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.4.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.4.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.4.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.5.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.5.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.5.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.6.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.6.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.6.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.7.v1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.7.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.7.w2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.8.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.8.w1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.8.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.9.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.9.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.experts.mlp_experts.9.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.28.ffn.router.layer.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.norm_attn_norm.attn.Wqkv.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.norm_attn_norm.attn.out_proj.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.norm_attn_norm.norm_1.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.28.norm_attn_norm.norm_2.weight": "model-00038-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.0.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.0.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.0.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.1.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.1.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.1.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.10.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.10.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.10.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.11.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.11.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.11.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.12.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.12.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.12.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.13.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.13.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.13.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.14.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.14.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.14.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.15.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.15.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.15.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.2.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.2.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.2.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.3.v1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.3.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.3.w2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.4.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.4.w1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.4.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.5.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.5.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.5.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.6.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.6.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.6.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.7.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.7.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.7.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.8.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.8.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.8.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.9.v1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.9.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.experts.mlp_experts.9.w2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.29.ffn.router.layer.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.norm_attn_norm.attn.Wqkv.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.norm_attn_norm.attn.out_proj.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.norm_attn_norm.norm_1.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.29.norm_attn_norm.norm_2.weight": "model-00039-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.0.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.0.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.0.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.1.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.1.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.1.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.10.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.10.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.10.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.11.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.11.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.11.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.12.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.12.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.12.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.13.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.13.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.13.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.14.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.14.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.14.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.15.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.15.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.15.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.2.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.2.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.2.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.3.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.3.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.3.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.4.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.4.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.4.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.5.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.5.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.5.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.6.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.6.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.6.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.7.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.7.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.7.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.8.v1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.8.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.8.w2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.9.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.9.w1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.ffn.experts.mlp_experts.9.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.3.ffn.router.layer.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.norm_attn_norm.attn.Wqkv.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.norm_attn_norm.attn.out_proj.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.norm_attn_norm.norm_1.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.3.norm_attn_norm.norm_2.weight": "model-00005-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.0.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.0.w1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.0.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.1.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.1.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.1.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.10.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.10.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.10.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.11.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.11.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.11.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.12.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.12.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.12.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.13.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.13.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.13.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.14.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.14.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.14.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.15.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.15.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.15.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.2.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.2.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.2.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.3.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.3.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.3.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.4.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.4.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.4.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.5.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.5.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.5.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.6.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.6.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.6.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.7.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.7.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.7.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.8.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.8.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.8.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.9.v1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.9.w1.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.experts.mlp_experts.9.w2.weight": "model-00041-of-00054.safetensors",
+ "transformer.blocks.30.ffn.router.layer.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.30.norm_attn_norm.attn.Wqkv.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.30.norm_attn_norm.attn.out_proj.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.30.norm_attn_norm.norm_1.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.30.norm_attn_norm.norm_2.weight": "model-00040-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.0.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.0.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.0.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.1.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.1.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.1.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.10.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.10.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.10.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.11.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.11.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.11.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.12.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.12.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.12.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.13.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.13.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.13.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.14.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.14.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.14.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.15.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.15.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.15.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.2.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.2.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.2.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.3.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.3.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.3.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.4.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.4.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.4.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.5.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.5.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.5.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.6.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.6.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.6.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.7.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.7.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.7.w2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.8.v1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.8.w1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.8.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.9.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.9.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.experts.mlp_experts.9.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.31.ffn.router.layer.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.norm_attn_norm.attn.Wqkv.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.norm_attn_norm.attn.out_proj.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.norm_attn_norm.norm_1.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.31.norm_attn_norm.norm_2.weight": "model-00042-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.0.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.0.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.0.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.1.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.1.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.1.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.10.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.10.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.10.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.11.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.11.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.11.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.12.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.12.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.12.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.13.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.13.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.13.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.14.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.14.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.14.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.15.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.15.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.15.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.2.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.2.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.2.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.3.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.3.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.3.w2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.4.v1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.4.w1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.4.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.5.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.5.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.5.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.6.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.6.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.6.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.7.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.7.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.7.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.8.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.8.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.8.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.9.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.9.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.experts.mlp_experts.9.w2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.32.ffn.router.layer.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.norm_attn_norm.attn.Wqkv.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.norm_attn_norm.attn.out_proj.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.norm_attn_norm.norm_1.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.32.norm_attn_norm.norm_2.weight": "model-00043-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.0.v1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.0.w1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.0.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.1.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.1.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.1.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.10.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.10.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.10.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.11.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.11.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.11.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.12.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.12.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.12.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.13.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.13.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.13.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.14.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.14.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.14.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.15.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.15.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.15.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.2.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.2.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.2.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.3.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.3.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.3.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.4.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.4.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.4.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.5.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.5.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.5.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.6.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.6.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.6.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.7.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.7.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.7.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.8.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.8.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.8.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.9.v1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.9.w1.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.experts.mlp_experts.9.w2.weight": "model-00045-of-00054.safetensors",
+ "transformer.blocks.33.ffn.router.layer.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.33.norm_attn_norm.attn.Wqkv.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.33.norm_attn_norm.attn.out_proj.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.33.norm_attn_norm.norm_1.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.33.norm_attn_norm.norm_2.weight": "model-00044-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.0.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.0.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.0.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.1.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.1.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.1.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.10.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.10.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.10.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.11.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.11.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.11.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.12.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.12.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.12.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.13.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.13.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.13.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.14.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.14.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.14.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.15.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.15.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.15.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.2.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.2.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.2.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.3.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.3.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.3.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.4.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.4.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.4.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.5.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.5.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.5.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.6.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.6.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.6.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.7.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.7.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.7.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.8.v1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.8.w1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.8.w2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.9.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.9.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.experts.mlp_experts.9.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.34.ffn.router.layer.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.norm_attn_norm.attn.Wqkv.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.norm_attn_norm.attn.out_proj.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.norm_attn_norm.norm_1.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.34.norm_attn_norm.norm_2.weight": "model-00046-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.0.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.0.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.0.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.1.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.1.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.1.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.10.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.10.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.10.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.11.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.11.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.11.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.12.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.12.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.12.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.13.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.13.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.13.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.14.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.14.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.14.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.15.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.15.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.15.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.2.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.2.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.2.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.3.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.3.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.3.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.4.v1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.4.w1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.4.w2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.5.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.5.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.5.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.6.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.6.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.6.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.7.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.7.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.7.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.8.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.8.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.8.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.9.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.9.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.experts.mlp_experts.9.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.35.ffn.router.layer.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.norm_attn_norm.attn.Wqkv.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.norm_attn_norm.attn.out_proj.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.norm_attn_norm.norm_1.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.35.norm_attn_norm.norm_2.weight": "model-00047-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.0.v1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.0.w1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.0.w2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.1.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.1.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.1.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.10.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.10.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.10.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.11.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.11.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.11.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.12.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.12.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.12.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.13.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.13.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.13.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.14.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.14.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.14.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.15.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.15.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.15.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.2.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.2.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.2.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.3.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.3.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.3.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.4.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.4.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.4.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.5.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.5.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.5.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.6.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.6.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.6.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.7.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.7.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.7.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.8.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.8.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.8.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.9.v1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.9.w1.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.experts.mlp_experts.9.w2.weight": "model-00049-of-00054.safetensors",
+ "transformer.blocks.36.ffn.router.layer.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.36.norm_attn_norm.attn.Wqkv.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.36.norm_attn_norm.attn.out_proj.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.36.norm_attn_norm.norm_1.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.36.norm_attn_norm.norm_2.weight": "model-00048-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.0.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.0.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.0.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.1.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.1.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.1.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.10.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.10.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.10.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.11.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.11.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.11.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.12.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.12.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.12.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.13.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.13.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.13.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.14.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.14.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.14.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.15.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.15.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.15.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.2.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.2.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.2.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.3.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.3.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.3.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.4.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.4.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.4.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.5.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.5.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.5.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.6.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.6.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.6.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.7.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.7.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.7.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.8.v1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.8.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.8.w2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.9.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.9.w1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.ffn.experts.mlp_experts.9.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.37.ffn.router.layer.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.norm_attn_norm.attn.Wqkv.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.norm_attn_norm.attn.out_proj.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.norm_attn_norm.norm_1.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.37.norm_attn_norm.norm_2.weight": "model-00050-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.0.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.0.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.0.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.1.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.1.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.1.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.10.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.10.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.10.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.11.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.11.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.11.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.12.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.12.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.12.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.13.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.13.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.13.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.14.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.14.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.14.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.15.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.15.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.15.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.2.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.2.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.2.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.3.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.3.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.3.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.4.v1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.4.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.4.w2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.5.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.5.w1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.5.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.6.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.6.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.6.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.7.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.7.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.7.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.8.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.8.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.8.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.9.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.9.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.experts.mlp_experts.9.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.38.ffn.router.layer.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.norm_attn_norm.attn.Wqkv.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.norm_attn_norm.attn.out_proj.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.norm_attn_norm.norm_1.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.38.norm_attn_norm.norm_2.weight": "model-00051-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.0.v1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.0.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.0.w2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.1.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.1.w1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.1.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.10.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.10.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.10.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.11.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.11.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.11.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.12.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.12.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.12.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.13.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.13.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.13.w2.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.14.v1.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.14.w1.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.14.w2.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.15.v1.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.15.w1.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.15.w2.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.2.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.2.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.2.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.3.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.3.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.3.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.4.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.4.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.4.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.5.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.5.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.5.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.6.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.6.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.6.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.7.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.7.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.7.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.8.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.8.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.8.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.9.v1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.9.w1.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.experts.mlp_experts.9.w2.weight": "model-00053-of-00054.safetensors",
+ "transformer.blocks.39.ffn.router.layer.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.norm_attn_norm.attn.Wqkv.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.norm_attn_norm.attn.out_proj.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.norm_attn_norm.norm_1.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.39.norm_attn_norm.norm_2.weight": "model-00052-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.0.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.0.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.0.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.1.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.1.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.1.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.10.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.10.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.10.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.11.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.11.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.11.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.12.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.12.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.12.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.13.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.13.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.13.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.14.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.14.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.14.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.15.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.15.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.15.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.2.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.2.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.2.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.3.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.3.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.3.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.4.v1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.4.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.4.w2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.5.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.5.w1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.5.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.6.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.6.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.6.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.7.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.7.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.7.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.8.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.8.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.8.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.9.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.9.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.experts.mlp_experts.9.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.4.ffn.router.layer.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.norm_attn_norm.attn.Wqkv.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.norm_attn_norm.attn.out_proj.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.norm_attn_norm.norm_1.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.4.norm_attn_norm.norm_2.weight": "model-00006-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.0.v1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.0.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.0.w2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.1.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.1.w1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.1.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.10.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.10.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.10.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.11.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.11.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.11.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.12.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.12.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.12.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.13.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.13.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.13.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.14.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.14.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.14.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.15.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.15.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.15.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.2.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.2.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.2.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.3.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.3.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.3.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.4.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.4.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.4.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.5.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.5.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.5.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.6.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.6.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.6.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.7.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.7.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.7.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.8.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.8.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.8.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.9.v1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.9.w1.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.experts.mlp_experts.9.w2.weight": "model-00008-of-00054.safetensors",
+ "transformer.blocks.5.ffn.router.layer.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.norm_attn_norm.attn.Wqkv.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.norm_attn_norm.attn.out_proj.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.norm_attn_norm.norm_1.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.5.norm_attn_norm.norm_2.weight": "model-00007-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.0.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.0.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.0.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.1.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.1.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.1.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.10.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.10.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.10.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.11.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.11.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.11.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.12.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.12.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.12.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.13.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.13.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.13.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.14.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.14.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.14.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.15.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.15.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.15.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.2.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.2.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.2.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.3.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.3.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.3.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.4.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.4.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.4.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.5.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.5.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.5.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.6.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.6.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.6.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.7.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.7.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.7.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.8.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.8.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.8.w2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.9.v1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.9.w1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.ffn.experts.mlp_experts.9.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.6.ffn.router.layer.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.norm_attn_norm.attn.Wqkv.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.norm_attn_norm.attn.out_proj.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.norm_attn_norm.norm_1.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.6.norm_attn_norm.norm_2.weight": "model-00009-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.0.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.0.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.0.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.1.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.1.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.1.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.10.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.10.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.10.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.11.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.11.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.11.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.12.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.12.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.12.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.13.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.13.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.13.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.14.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.14.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.14.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.15.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.15.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.15.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.2.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.2.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.2.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.3.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.3.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.3.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.4.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.4.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.4.w2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.5.v1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.5.w1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.5.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.6.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.6.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.6.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.7.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.7.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.7.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.8.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.8.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.8.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.9.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.9.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.experts.mlp_experts.9.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.7.ffn.router.layer.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.norm_attn_norm.attn.Wqkv.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.norm_attn_norm.attn.out_proj.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.norm_attn_norm.norm_1.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.7.norm_attn_norm.norm_2.weight": "model-00010-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.0.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.0.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.0.w2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.1.v1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.1.w1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.1.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.10.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.10.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.10.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.11.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.11.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.11.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.12.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.12.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.12.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.13.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.13.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.13.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.14.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.14.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.14.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.15.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.15.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.15.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.2.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.2.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.2.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.3.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.3.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.3.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.4.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.4.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.4.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.5.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.5.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.5.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.6.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.6.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.6.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.7.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.7.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.7.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.8.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.8.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.8.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.9.v1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.9.w1.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.experts.mlp_experts.9.w2.weight": "model-00012-of-00054.safetensors",
+ "transformer.blocks.8.ffn.router.layer.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.norm_attn_norm.attn.Wqkv.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.norm_attn_norm.attn.out_proj.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.norm_attn_norm.norm_1.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.8.norm_attn_norm.norm_2.weight": "model-00011-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.0.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.0.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.0.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.1.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.1.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.1.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.10.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.10.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.10.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.11.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.11.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.11.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.12.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.12.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.12.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.13.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.13.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.13.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.14.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.14.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.14.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.15.v1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.15.w1.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.15.w2.weight": "model-00014-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.2.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.2.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.2.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.3.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.3.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.3.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.4.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.4.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.4.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.5.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.5.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.5.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.6.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.6.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.6.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.7.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.7.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.7.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.8.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.8.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.8.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.9.v1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.9.w1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.experts.mlp_experts.9.w2.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.ffn.router.layer.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.norm_attn_norm.attn.Wqkv.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.norm_attn_norm.attn.out_proj.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.norm_attn_norm.norm_1.weight": "model-00013-of-00054.safetensors",
+ "transformer.blocks.9.norm_attn_norm.norm_2.weight": "model-00013-of-00054.safetensors",
+ "transformer.norm_f.weight": "model-00054-of-00054.safetensors",
+ "transformer.wte.weight": "model-00001-of-00054.safetensors"
+ }
+}
diff --git a/modeling_dbrx.py b/modeling_dbrx.py
new file mode 100644
index 0000000000000000000000000000000000000000..21f018aaa80c7c0a8522ce03b66c09aa5aec7ee1
--- /dev/null
+++ b/modeling_dbrx.py
@@ -0,0 +1,1454 @@
+"""PyTorch Dbrx model."""
+
+import math
+import warnings
+from copy import deepcopy
+from functools import partial
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_outputs import (MoeCausalLMOutputWithPast,
+ MoeModelOutputWithPast)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import is_flash_attn_2_available, logging
+
+from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig
+
+if is_flash_attn_2_available():
+ try:
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
+ from flash_attn.bert_padding import pad_input # noqa
+ from flash_attn.bert_padding import index_first_axis, unpad_input
+    except ImportError:
+        # flash-attn is optional; it is only needed for the flash attention path.
+        pass
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = 'DbrxConfig'
+
+#############################################################################
+# Copied from LlamaRotaryEmbedding
+#############################################################################
+
+
+class DbrxRotaryEmbedding(nn.Module):
+
+ def __init__(self,
+ dim: int,
+ max_position_embeddings: int = 2048,
+ base: float = 10000.0,
+ scaling_factor: float = 1.0):
+ super().__init__()
+ self.scaling_factor = scaling_factor
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base**(
+ torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
+        # For backward compatibility we keep track of the cached max sequence length
+ self.max_seq_len_cached = max_position_embeddings
+
+ @torch.no_grad()
+ def forward(
+ self, x: torch.Tensor, position_ids: torch.LongTensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(
+ position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+ # Force float32 since bfloat16 loses precision on long contexts
+ # See https://github.com/huggingface/transformers/pull/29285
+ device_type = x.device.type
+ device_type = device_type if isinstance(
+ device_type, str) and device_type != 'mps' else 'cpu'
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float()
+ @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., :x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(
+ q: torch.Tensor,
+ k: torch.Tensor,
+ cos: torch.Tensor,
+ sin: torch.Tensor,
+ unsqueeze_dim: int = 1) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos and
+ sin so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos and sin have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos and sin broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+
+ Returns:
+        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
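+
+# A minimal, self-contained usage sketch for the rotary embedding above. It is
+# illustrative only: this helper is never called by the model and all shapes
+# below are hypothetical.
+def _rotary_embedding_example() -> None:
+    rope = DbrxRotaryEmbedding(dim=8)
+    x = torch.randn(1, 2, 4, 8)  # [bs, num_heads, seq_len, head_dim]
+    position_ids = torch.arange(4)[None, :]
+    cos, sin = rope(x, position_ids)  # each of shape [bs, seq_len, head_dim]
+    q_rot, k_rot = apply_rotary_pos_emb(x, x, cos, sin)
+    assert q_rot.shape == x.shape and k_rot.shape == x.shape
+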
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
+
+ The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
+ (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :,
+ None, :, :].expand(batch, num_key_value_heads,
+ n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen,
+ head_dim)
+
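+# Worked shape example (illustrative values): with num_key_value_heads=2 and
+# n_rep=4, repeat_kv maps a (1, 2, 16, 64) tensor to (1, 8, 16, 64), exactly
+# like torch.repeat_interleave(x, repeats=4, dim=1).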
+
+#############################################################################
+
+#############################################################################
+# Modified from modeling_mixtral
+#############################################################################
+
+
+def load_balancing_loss_func(
+ gate_logits: torch.Tensor,
+ num_experts: int,
+ top_k: int,
+ attention_mask: Optional[torch.Tensor],
+) -> torch.Tensor:
+ r"""Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+ See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
+ function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+ experts is too unbalanced.
+
+ Args:
+        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor, ...]]):
+            Logits from the `gate`; should be a tuple of model.config.num_hidden_layers tensors of
+ shape [batch_size X sequence_length, num_experts].
+ num_experts (`int`):
+ Number of experts.
+ top_k (`int`):
+ The number of experts each token is routed to.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in the forward function, of
+            shape [batch_size X sequence_length] if not None.
+
+ Returns:
+ The auxiliary loss.
+ """
+    if gate_logits is None or not isinstance(gate_logits, tuple):
+        return torch.tensor(0.0)
+
+    # The early return above guarantees gate_logits is a tuple here.
+    compute_device = gate_logits[0].device
+    concatenated_gate_logits = torch.cat(
+        [layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+
+ routing_weights = torch.nn.functional.softmax(concatenated_gate_logits,
+ dim=-1)
+
+ _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+ if attention_mask is None:
+        # Compute the fraction of tokens routed to each expert
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+ # Compute the average probability of routing to these experts
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
+ else:
+ batch_size, sequence_length = attention_mask.shape
+ num_hidden_layers = concatenated_gate_logits.shape[0] // (
+ batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0, with the same shape as expert_mask
+ expert_attention_mask = (attention_mask[None, :, :, None, None].expand(
+ (num_hidden_layers, batch_size, sequence_length, top_k,
+ num_experts)).reshape(-1, top_k, num_experts).to(compute_device))
+
+        # Compute the percentage of tokens routed to each expert
+ tokens_per_expert = torch.sum(
+ expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+ expert_attention_mask, dim=0)
+
+        # Compute the mask that masks all padding tokens as 0, with the same shape as tokens_per_expert
+ router_per_expert_attention_mask = (
+ attention_mask[None, :, :, None].expand(
+ (num_hidden_layers, batch_size, sequence_length,
+ num_experts)).reshape(-1, num_experts).to(compute_device))
+
+ # Compute the average probability of routing to these experts
+ router_prob_per_expert = torch.sum(
+ routing_weights * router_per_expert_attention_mask,
+ dim=0) / torch.sum(router_per_expert_attention_mask, dim=0)
+
+ overall_loss = torch.sum(tokens_per_expert *
+ router_prob_per_expert.unsqueeze(0))
+ return overall_loss * num_experts
+
+
+#############################################################################
+
+
+def resolve_ffn_act_fn(
+ ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tensor]:
+ """Resolve the activation function for the feed-forward network.
+
+ Args:
+        ffn_act_fn (dict): The configuration dictionary for the activation function.
+            The dict must specify the 'name' of a torch.nn.functional activation
+            function. All other key-value pairs are bound to the function as a partial.
+
+ Returns:
+ Callable[[torch.Tensor], torch.Tensor]: The activation function.
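+
+    Example:
+        >>> act = resolve_ffn_act_fn({'name': 'silu'})
+        >>> act(torch.randn(2, 3)).shape
+        torch.Size([2, 3])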
+ """
+ config = deepcopy(ffn_act_fn)
+ name = config.pop('name')
+ if not hasattr(nn.functional, name):
+ raise ValueError(f'Unrecognised activation function name ({name}).')
+ act = getattr(nn.functional, name)
+ return partial(act, **config)
+
+
+#############################################################################
+# Copied from LLaMaAttention
+#############################################################################
+
+
+def _get_unpad_data(attention_mask: torch.Tensor):
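+    """Compute unpadding metadata from a [batch, seq_len] padding mask.
+
+    For example, mask [[1, 1, 0], [1, 0, 0]] yields indices [0, 1, 3],
+    cu_seqlens [0, 2, 3] and max_seqlen_in_batch 2.
+    """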
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32),
+ (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
+
+
+class DbrxAttention(nn.Module):
+ """Multi-head self attention."""
+
+ def __init__(self,
+ hidden_size: int,
+ num_heads: int,
+ max_position_embeddings: int,
+ attn_config: DbrxAttentionConfig,
+ block_idx: Optional[int] = None):
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.num_heads = num_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.block_idx = block_idx
+ self.config = attn_config
+ if block_idx is None:
+ logger.warning_once(
+ f'Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will '
+ +
+ 'lead to errors during the forward call if caching is used. Please make sure to provide a `block_idx` '
+ + 'when creating this class.')
+
+ self.attn_pdrop = attn_config.attn_pdrop
+ self.clip_qkv = attn_config.clip_qkv
+ self.num_key_value_heads = attn_config.kv_n_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.rope_theta = attn_config.rope_theta
+
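+        # Fused QKV projection: a single matmul whose output splits along the
+        # last dim into query (hidden_size), key and value (each
+        # num_key_value_heads * head_dim), matching the split in forward().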
+ self.Wqkv = nn.Linear(self.hidden_size,
+ self.hidden_size +
+ 2 * self.num_key_value_heads * self.head_dim,
+ bias=False)
+ self.out_proj = nn.Linear(self.hidden_size,
+ self.hidden_size,
+ bias=False)
+ self.rotary_emb = DbrxRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids: torch.LongTensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.Wqkv(hidden_states)
+ if self.clip_qkv is not None:
+ qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+
+ query_states, key_states, value_states = qkv_states.split(
+ [
+ self.hidden_size,
+ self.num_key_value_heads * self.head_dim,
+ self.num_key_value_heads * self.head_dim,
+ ],
+ dim=2,
+ )
+
+ query_states = query_states.view(bsz, q_len, self.num_heads,
+ self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+ self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
+ self.head_dim).transpose(1, 2)
+
+ past_key_value = getattr(self, 'past_key_value', past_key_value)
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states,
+ key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; position_ids needed for the static cache
+ cache_kwargs = {
+ 'sin': sin,
+ 'cos': cos,
+ 'cache_position': cache_position
+ }
+ key_states, value_states = past_key_value.update(
+ key_states, value_states, self.block_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(
+ 2, 3)) / math.sqrt(self.head_dim)
+
+ if attention_mask is not None: # no matter the length, we just slice it
+ causal_mask = attention_mask[:, :, :, :key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights,
+ dim=-1,
+ dtype=torch.float32).to(
+ query_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights,
+ p=self.attn_pdrop,
+ training=self.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
+ + f' {attn_output.size()}')
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class DbrxFlashAttention2(DbrxAttention):
+ """Dbrx flash attention module.
+
+    This module inherits from `DbrxAttention`, as the weights of the module stay
+    untouched. The only required change is in the forward pass, where it calls
+    the public API of Flash Attention.
+ """
+
+ def __init__(self, *args: Any, **kwargs: Any):
+ if not is_flash_attn_2_available():
+ raise ImportError(
+ 'Flash Attention 2 is not available. Please install it with `pip install flash-attn`.'
+ )
+
+ super().__init__(*args, **kwargs)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
+ Optional[Tuple[torch.Tensor]]]:
+ logger.info(
+ 'Implicitly setting `output_attentions` to False as it is not supported in Flash Attention.'
+ )
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ qkv_states = self.Wqkv(hidden_states)
+ if self.clip_qkv is not None:
+ qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+
+ query_states, key_states, value_states = qkv_states.split(
+ [
+ self.hidden_size,
+ self.num_key_value_heads * self.head_dim,
+ self.num_key_value_heads * self.head_dim,
+ ],
+ dim=2,
+ )
+
+        # Flash Attention ultimately expects the layout
+        # [batch_size, seq_length, num_heads, head_dim]; we first move to
+        # [batch_size, num_heads, seq_length, head_dim] for RoPE and the KV
+        # cache, then transpose back below.
+ query_states = query_states.view(bsz, q_len, self.num_heads,
+ self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+ self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
+ self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states,
+ key_states, cos, sin)
+
+ past_key_value = getattr(self, 'past_key_value', past_key_value)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {
+ 'sin': sin,
+ 'cos': cos,
+ 'cache_position': cache_position
+ }
+ key_states, value_states = past_key_value.update(
+ key_states, value_states, self.block_idx, cache_kwargs)
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout
+ # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+ # to be able to avoid many of these transpose/reshape/view.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ dropout_rate = self.attn_pdrop if self.training else 0.0
+
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
+ # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to
+        # cast the LayerNorms to fp32. (LlamaRMSNorm handles it correctly)
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, '_pre_quantization_dtype'):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = query_states.dtype
+
+            logger.warning_once(
+                f'The input hidden states seem to have been silently cast to float32; this might be '
+                +
+                f'related to the fact that you have upcasted embedding or layer norm layers in '
+                + f'float32. We will cast the input back to {target_dtype}.')
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = self._flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ q_len,
+ dropout=dropout_rate,
+ )
+
+ attn_output = attn_output.reshape(bsz, q_len,
+ self.hidden_size).contiguous()
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value # type: ignore
+
+ def _flash_attention_forward(
+ self,
+ query_states: torch.Tensor,
+ key_states: torch.Tensor,
+ value_states: torch.Tensor,
+ attention_mask: Union[torch.LongTensor, None],
+ query_length: int,
+ dropout: float = 0.0,
+ softmax_scale: Optional[float] = None,
+ ):
+ """Use FlashAttention, stripping padding tokens if necessary.
+
+ Args:
+ query_states (torch.Tensor): Input query states to be passed to Flash Attention API
+ key_states (torch.Tensor): Input key states to be passed to Flash Attention API
+ value_states (torch.Tensor): Input value states to be passed to Flash Attention API
+ attention_mask (torch.LongTensor | None): The padding mask - corresponds to a tensor of size
+ (batch_size, seq_len) where 0 stands for the position of padding tokens and 1
+ for the position of non-padding tokens.
+ query_length (int): The length of the query sequence
+ dropout (float): Attention dropout
+ softmax_scale (float, optional): The scaling of QK^T before applying softmax.
+ Defaults to 1 / sqrt(head_dim)
+ """
+ causal = True
+ # Contains at least one padding token in the sequence
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+ query_states, key_states, value_states, attention_mask,
+ query_length)
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ attn_output = pad_input(
+ attn_output_unpad,
+ indices_q,
+ batch_size,
+ query_length,
+ )
+ else:
+ attn_output = flash_attn_func(
+ query_states,
+ key_states,
+ value_states,
+ dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ return attn_output
+
+ def _upad_input(self, query_layer: torch.Tensor, key_layer: torch.Tensor,
+ value_layer: torch.Tensor, attention_mask: torch.Tensor,
+ query_length: int):
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(
+ attention_mask)
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+ key_layer = index_first_axis(
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads,
+ head_dim), indices_k)
+ value_layer = index_first_axis(
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads,
+ head_dim), indices_k)
+ if query_length == kv_seq_len:
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads,
+ head_dim), indices_k)
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, that is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
+ query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q,
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+
+DBRX_ATTENTION_CLASSES = {
+ 'eager': DbrxAttention,
+ 'flash_attention_2': DbrxFlashAttention2,
+}
+
+
+class DbrxNormAttentionNorm(nn.Module):
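+    """Pre-norm attention sub-block: norm_1 -> attention -> dropout -> residual add -> norm_2.
+
+    The forward pass returns both the updated residual stream (consumed by the
+    FFN's residual add in `DbrxBlock`) and the norm_2 output (the FFN input).
+    """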
+
+ def __init__(
+ self,
+ hidden_size: int,
+ num_heads: int,
+ max_position_embeddings: int,
+ resid_pdrop: float,
+ attn_implementation: str,
+ attn_config: DbrxAttentionConfig,
+ block_idx: Optional[int] = None,
+ ):
+ super().__init__()
+ self.block_idx = block_idx
+ self.resid_pdrop = resid_pdrop
+ self.norm_1 = nn.LayerNorm(hidden_size, bias=False)
+ self.attn = DBRX_ATTENTION_CLASSES[attn_implementation](
+ hidden_size=hidden_size,
+ num_heads=num_heads,
+ max_position_embeddings=max_position_embeddings,
+ attn_config=attn_config,
+ block_idx=block_idx,
+ )
+ self.norm_2 = nn.LayerNorm(hidden_size, bias=False)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids: torch.LongTensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor],
+ Optional[Cache]]:
+
+ residual_states = hidden_states
+ hidden_states = self.norm_1(hidden_states).to(hidden_states.dtype)
+
+ hidden_states, attn_weights, past_key_value = self.attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states,
+ p=self.resid_pdrop,
+ training=self.training)
+ hidden_states = hidden_states + residual_states
+
+ residual_states = hidden_states
+ hidden_states = self.norm_2(hidden_states).to(hidden_states.dtype)
+
+ return residual_states, hidden_states, attn_weights, past_key_value
+
+
+class DbrxRouter(nn.Module):
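+    """Linear top-k router over the experts.
+
+    Scores each token with a single nn.Linear, softmaxes over the experts, and
+    returns the full weights plus the top-k weights and expert indices.
+    """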
+
+ def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int,
+ moe_jitter_eps: Optional[float],
+ moe_normalize_expert_weights: Optional[float],
+ uniform_expert_assignment: bool):
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.moe_num_experts = moe_num_experts
+ self.moe_top_k = moe_top_k
+ self.moe_jitter_eps = moe_jitter_eps
+ self.moe_normalize_expert_weights = moe_normalize_expert_weights
+ self.uniform_expert_assignment = uniform_expert_assignment
+
+ self.layer = nn.Linear(self.hidden_size,
+ self.moe_num_experts,
+ bias=False)
+
+ def jitter(self, x: torch.Tensor) -> torch.Tensor:
+ if self.moe_jitter_eps is None:
+ raise RuntimeError('The router does not have moe_jitter_eps set.')
+ low = 1.0 - self.moe_jitter_eps
+ high = 1.0 + self.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def forward(
+ self, x: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
+ if self.training and self.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ weights = self.layer(x.view(-1,
+ x.shape[-1])).softmax(dim=-1,
+ dtype=torch.float32)
+ top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1)
+
+ if self.moe_normalize_expert_weights:
+ top_weights = top_weights / torch.norm(
+ top_weights,
+ p=self.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True)
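+            # With p=1 this renormalizes the (nonnegative) top-k softmax weights
+            # to sum to 1; other p values rescale by the corresponding p-norm.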
+
+ if self.uniform_expert_assignment:
+ with torch.no_grad():
+ uniform_tensor = torch.arange(
+ 0,
+ top_experts.numel(),
+ device=top_experts.device,
+ dtype=top_experts.dtype) % self.moe_num_experts
+ top_experts = uniform_tensor.reshape(top_experts.shape)
+ # Note, weights and top_weights are not changed
+
+ weights = weights.to(x.dtype)
+ top_weights = top_weights.to(x.dtype)
+ return weights, top_weights, top_experts # type: ignore
+
+
+class DbrxExpertGLU(nn.Module):
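+    """A single GLU expert: w2(activation_fn(w1(x)) * v1(x))."""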
+
+ def __init__(self, hidden_size: int, ffn_hidden_size: int, ffn_act_fn: dict):
+ super().__init__()
+ self.w1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False)
+ self.v1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False)
+ self.w2 = nn.Linear(ffn_hidden_size, hidden_size, bias=False)
+ self.activation_fn = resolve_ffn_act_fn(ffn_act_fn)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x1 = self.w1(x)
+ x2 = self.v1(x)
+ x1 = self.activation_fn(x1)
+ x1 = x1 * x2
+ x1 = self.w2(x1)
+ return x1
+
+
+class DbrxExperts(nn.Module):
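+    """Container for the per-expert GLU MLPs with token dispatch."""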
+
+ def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict):
+ super().__init__()
+ self.moe_num_experts = moe_num_experts
+ self.mlp_experts = nn.ModuleList([DbrxExpertGLU(hidden_size, ffn_hidden_size, ffn_act_fn) for _ in range(moe_num_experts)])
+
+ def forward(self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor) -> torch.Tensor:
+ bsz, q_len, hidden_size = x.shape
+ x = x.view(-1, hidden_size)
+ out = torch.zeros_like(x)
+
+ expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0)
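+        # expert_mask: [moe_num_experts, moe_top_k, num_tokens]; entry [e, s, t]
+        # is 1 iff token t picked expert e in its s-th top-k slot.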
+ for expert_idx in range(0, self.moe_num_experts):
+ topk_idx, token_idx = torch.where(expert_mask[expert_idx])
+ if token_idx.shape[0] == 0:
+ continue
+
+ token_list = token_idx.tolist()
+ topk_list = topk_idx.tolist()
+
+ expert_tokens = x[None, token_list].reshape(-1, hidden_size)
+ expert_out = self.mlp_experts[expert_idx](expert_tokens) * top_weights[token_list, topk_list, None]
+
+ out.index_add_(0, token_idx, expert_out)
+
+ out = out.reshape(bsz, q_len, hidden_size)
+ return out
+
+
+class DbrxFFN(nn.Module):
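+    """Mixture-of-experts FFN: route tokens, then combine expert outputs."""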
+
+ def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig):
+ super().__init__()
+
+ self.router = DbrxRouter(
+ hidden_size,
+ moe_num_experts=ffn_config.moe_num_experts,
+ moe_top_k=ffn_config.moe_top_k,
+ moe_jitter_eps=ffn_config.moe_jitter_eps,
+ moe_normalize_expert_weights=ffn_config.
+ moe_normalize_expert_weights,
+ uniform_expert_assignment=ffn_config.uniform_expert_assignment,
+ )
+
+ self.experts = DbrxExperts(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_config.ffn_hidden_size,
+ moe_num_experts=ffn_config.moe_num_experts,
+ ffn_act_fn=ffn_config.ffn_act_fn,
+ )
+
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ weights, top_weights, top_experts = self.router(x)
+ out = self.experts(x, weights, top_weights, top_experts)
+ return out, weights
+
+
+class DbrxBlock(nn.Module):
+
+ def __init__(self, config: DbrxConfig, block_idx: int):
+ super().__init__()
+ self.hidden_size = config.d_model
+ self.resid_pdrop = config.resid_pdrop
+ self.block_idx = block_idx
+ self.norm_attn_norm = DbrxNormAttentionNorm(
+ hidden_size=config.d_model,
+ num_heads=config.n_heads,
+ max_position_embeddings=config.max_seq_len,
+ resid_pdrop=config.resid_pdrop,
+ attn_implementation=config._attn_implementation,
+ attn_config=config.attn_config,
+ block_idx=block_idx,
+ )
+ self.ffn = DbrxFFN(hidden_size=config.d_model,
+ ffn_config=config.ffn_config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ output_router_logits: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Any,
+ ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, Optional[torch.Tensor]],
+ Tuple[torch.Tensor, Optional[Cache]], Tuple[
+ torch.Tensor, Optional[torch.Tensor], Optional[Cache]],
+ Tuple[torch.Tensor, Optional[torch.Tensor],
+ Optional[torch.Tensor]], Tuple[
+ torch.Tensor, Optional[Cache], Optional[torch.Tensor]],
+ Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache],
+ Optional[torch.Tensor]],]:
+ """Forward function for DbrxBlock.
+
+ Args:
+ hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)`
+ attention_mask (`torch.Tensor`, optional): attention mask of size (batch_size, sequence_length)
+ if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length)
+ if default attention is used.
+ past_key_value (`Tuple(torch.Tensor)`, optional): cached past key and value projection states
+ output_attentions (`bool`, optional): Whether or not to return the attentions tensors of all
+ attention layers. See `attentions` under returned tensors for more detail.
+ output_router_logits (`bool`, optional): Whether or not to return the router logits.
+ use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are
+ returned and can be used to speed up decoding (see `past_key_values`).
+ cache_position (`torch.LongTensor`, optional): position ids of the cache
+ """
+ if 'padding_mask' in kwargs:
+ warnings.warn(
+            'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead.'
+ )
+
+ # Norm + Attention + Norm
+ resid_states, hidden_states, self_attn_weights, present_key_value = self.norm_attn_norm(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ # Fully Connected
+ hidden_states, router_logits = self.ffn(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states,
+ p=self.resid_pdrop,
+ training=self.training)
+ hidden_states = resid_states + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ if output_router_logits:
+ outputs += (router_logits,)
+
+ return outputs
+
+
+class DbrxPreTrainedModel(PreTrainedModel):
+ config_class = DbrxConfig
+ base_model_prefix = 'transformer'
+ supports_gradient_checkpointing = True
+ _no_split_modules = ['DbrxBlock']
+ _skip_keys_device_placement = ['past_key_values']
+ _supports_flash_attn_2 = True
+ _supports_sdpa = False
+ _supports_cache_class = True
+
+ def _init_weights(self, module: nn.Module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+
+ def _setup_cache(self, cache_cls: Any, max_batch_size: int,
+ max_cache_len: int): # TODO: how to set var type of class?
+ if self.config._attn_implementation == 'flash_attention_2' and cache_cls == StaticCache:
+ raise ValueError(
+ '`static` cache implementation is not compatible with ' +
+ '`attn_implementation==flash_attention_2`. Make sure to use ' +
+                '`sdpa` in the meantime and open an issue at https://github.com/huggingface/transformers.'
+ )
+
+ for block in self.transformer.blocks:
+ device = block.norm_attn_norm.norm_1.weight.device
+ if hasattr(self.config, '_pre_quantization_dtype'):
+ dtype = self.config._pre_quantization_dtype
+ else:
+ dtype = block.norm_attn_norm.attn.out_proj.weight.dtype
+ block.norm_attn_norm.attn.past_key_value = cache_cls(self.config,
+ max_batch_size,
+ max_cache_len,
+ device=device,
+ dtype=dtype)
+
+ def _reset_cache(self):
+ for block in self.transformer.blocks:
+ block.norm_attn_norm.attn.past_key_value = None
+
+
+class DbrxModel(DbrxPreTrainedModel):
+ """Transformer decoder consisting of *config.num_hidden_layers*
+
+ [`DbrxBlock`] layers.
+
+ Args:
+ config: DbrxConfig
+ """
+
+ def __init__(self, config: DbrxConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.emb_pdrop = config.emb_pdrop
+
+ self.wte = nn.Embedding(config.vocab_size, config.d_model,
+ self.padding_idx)
+ self.blocks = nn.ModuleList([
+ DbrxBlock(config, block_idx) for block_idx in range(config.n_layers)
+ ])
+ self.norm_f = nn.LayerNorm(config.d_model, bias=False)
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Embedding:
+ return self.wte
+
+ def set_input_embeddings(self, value: nn.Embedding):
+ self.wte = value
+
+ def _autocast_input_embeddings(self,
+ inputs_embeds: torch.Tensor) -> torch.Tensor:
+ if inputs_embeds.device.type == 'cuda' and torch.is_autocast_enabled():
+ return inputs_embeds.to(dtype=torch.get_autocast_gpu_dtype())
+ elif inputs_embeds.device.type == 'cpu' and torch.is_autocast_cpu_enabled(
+ ):
+ return inputs_embeds.to(dtype=torch.get_autocast_cpu_dtype())
+ else:
+ return inputs_embeds
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, MoeModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (output_hidden_states
+ if output_hidden_states is not None else
+ self.config.output_hidden_states)
+ output_router_logits = (output_router_logits
+ if output_router_logits is not None else
+ self.config.output_router_logits)
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if (input_ids is None) ^ (inputs_embeds is not None):
+ raise ValueError(
+ 'You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one'
+ )
+
+ if self.gradient_checkpointing and self.training and use_cache:
+ logger.warning_once(
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.'
+ )
+ use_cache = False
+
+ if inputs_embeds is None:
+ inputs_embeds = self.wte(input_ids)
+
+ inputs_embeds = self._autocast_input_embeddings(
+ inputs_embeds) # type: ignore
+ inputs_embeds = nn.functional.dropout(inputs_embeds,
+ p=self.emb_pdrop,
+ training=self.training)
+
+ past_seen_tokens = 0
+ if use_cache: # kept for BC (cache positions)
+ if not isinstance(past_key_values, StaticCache):
+ past_key_values = DynamicCache.from_legacy_cache(
+ past_key_values)
+ past_seen_tokens = past_key_values.get_seq_length( # type: ignore
+ )
+
+ if cache_position is None:
+ if isinstance(past_key_values, StaticCache):
+ raise ValueError(
+ 'cache_position is a required argument when using StaticCache.'
+ )
+ cache_position = torch.arange( # type: ignore
+ past_seen_tokens,
+ past_seen_tokens + inputs_embeds.shape[1],
+ device=inputs_embeds.device)
+
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0) # type: ignore
+
+ causal_mask = self._update_causal_mask(attention_mask, inputs_embeds,
+ cache_position) # type: ignore
+
+ # embed positions
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_router_logits = () if output_router_logits else None
+ next_decoder_cache = None
+
+ for block in self.blocks:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,) # type: ignore
+
+ if self.gradient_checkpointing and self.training:
+ block_outputs = self._gradient_checkpointing_func(
+ block.__call__,
+ hidden_states,
+ causal_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ output_router_logits,
+ use_cache,
+ cache_position,
+ )
+ else:
+ block_outputs = block(
+ hidden_states,
+ attention_mask=causal_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ output_router_logits=output_router_logits,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ )
+
+ hidden_states = block_outputs[0]
+
+ if use_cache:
+ next_decoder_cache = block_outputs[
+ 2 if output_attentions else 1]
+
+ if output_attentions:
+ all_self_attns += (block_outputs[1],) # type: ignore
+
+ if output_router_logits:
+ all_router_logits += (block_outputs[-1],) # type: ignore
+
+ hidden_states = self.norm_f(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,) # type: ignore
+
+ next_cache = None
+ if use_cache:
+ next_cache = (
+ next_decoder_cache.to_legacy_cache() # type: ignore
+ if isinstance(next_decoder_cache, Cache) else
+ next_decoder_cache)
+ if not return_dict:
+ return tuple(v for v in [
+ hidden_states, next_cache, all_hidden_states, all_self_attns,
+ all_router_logits
+ ] if v is not None)
+ return MoeModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ router_logits=all_router_logits,
+ )
+
+ # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
+ # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
+ # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
+ # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
+ def _update_causal_mask(
+ self, attention_mask: Optional[torch.Tensor],
+ input_tensor: torch.Tensor,
+ cache_position: torch.Tensor) -> Optional[torch.Tensor]:
+ if self.config._attn_implementation == 'flash_attention_2':
+ if attention_mask is not None and 0.0 in attention_mask:
+ return attention_mask
+ return None
+
+ dtype, device = input_tensor.dtype, input_tensor.device
+ min_dtype = torch.finfo(dtype).min
+ sequence_length = input_tensor.shape[1]
+ if hasattr(self.blocks[0].norm_attn_norm.attn,
+ 'past_key_value'): # static cache
+ target_length = self.config.max_position_embeddings
+ else: # dynamic cache
+ target_length = (attention_mask.shape[-1] if isinstance(
+ attention_mask, torch.Tensor) else cache_position[-1] + 1)
+ target_length = int(target_length)
+
+ causal_mask = torch.full((sequence_length, target_length),
+ fill_value=min_dtype,
+ dtype=dtype,
+ device=device)
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(
+ target_length, device=device) > cache_position.reshape(-1, 1)
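+        # causal_mask[i, j] is now min_dtype wherever key position j lies in the
+        # future of query position cache_position[i], and 0 where it is visible.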
+ causal_mask = causal_mask[None,
+ None, :, :].expand(input_tensor.shape[0], 1,
+ -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone(
+ ) # copy to contiguous memory for in-place edit
+ if attention_mask.dim() == 2:
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[..., :mask_length].eq(
+ 0.0) * attention_mask[:, None, None, :].eq(0.0)
+ causal_mask[..., :mask_length] = causal_mask[
+ ..., :mask_length].masked_fill(padding_mask, min_dtype)
+ elif attention_mask.dim() == 4:
+ # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with
+ # cache. In that case, the 4D attention mask attends to the newest tokens only.
+ if attention_mask.shape[
+ -2] < cache_position[0] + sequence_length:
+ offset = cache_position[0]
+ else:
+ offset = 0
+ mask_shape = attention_mask.shape
+ mask_slice = (attention_mask.eq(0.0)).to(
+ dtype=dtype) * min_dtype
+ causal_mask[:mask_shape[0], :mask_shape[1],
+ offset:mask_shape[2] +
+ offset, :mask_shape[3]] = mask_slice
+
+ if (self.config._attn_implementation == 'sdpa' and
+ attention_mask is not None and
+ attention_mask.device.type == 'cuda'):
+ # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
+ is_tracing = (
+ torch.jit.is_tracing() or
+ isinstance(input_tensor, torch.fx.Proxy) or # type: ignore
+ (hasattr(torch, '_dynamo') and torch._dynamo.is_compiling()))
+ if not is_tracing and torch.any(attention_mask != 1):
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213
+ causal_mask = AttentionMaskConverter._unmask_unattended(
+ causal_mask, min_dtype)
+
+ return causal_mask
+
+
+class DbrxForCausalLM(DbrxPreTrainedModel):
+
+ def __init__(self, config: DbrxConfig):
+ super().__init__(config)
+ self.transformer = DbrxModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size,
+ config.vocab_size,
+ bias=False)
+ self.router_aux_loss_coef = config.router_aux_loss_coef
+ self.num_experts = config.ffn_config.moe_num_experts
+ self.num_experts_per_tok = config.ffn_config.moe_top_k
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Embedding:
+ return self.transformer.get_input_embeddings()
+
+ def set_input_embeddings(self, value: nn.Embedding):
+ self.transformer.set_input_embeddings(value)
+
+ def get_output_embeddings(self) -> nn.Linear:
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings: nn.Linear):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder: DbrxModel):
+ self.transformer = decoder
+
+ def get_decoder(self) -> DbrxModel:
+ return self.transformer
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
+ r"""Forward function for causal language modeling.
+
+ Example:
+ ```python
+ >>> from transformers import AutoTokenizer, DbrxForCausalLM
+
+ >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx")
+ >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx")
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (output_hidden_states
+ if output_hidden_states is not None else
+ self.config.output_hidden_states)
+ output_router_logits = (output_router_logits
+ if output_router_logits is not None else
+ self.config.output_router_logits)
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.transformer(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ output_router_logits=output_router_logits,
+ return_dict=return_dict,
+ cache_position=cache_position,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = nn.CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ aux_loss = None
+ if output_router_logits:
+ aux_loss = load_balancing_loss_func(
+ outputs.router_logits if return_dict else outputs[-1],
+ self.num_experts,
+ self.num_experts_per_tok,
+ attention_mask,
+ )
+ if labels is not None and loss is not None:
+ loss += self.router_aux_loss_coef * aux_loss.to(
+ loss.device) # make sure to reside in the same device
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return MoeCausalLMOutputWithPast(
+ loss=loss,
+ aux_loss=aux_loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ router_logits=outputs.router_logits,
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids: torch.Tensor,
+ past_key_values: Optional[Cache] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ **kwargs: Any) -> Dict[str, Any]:
+ past_length = 0
+ if past_key_values is not None:
+ if isinstance(past_key_values, Cache):
+ cache_length = past_key_values.get_seq_length()
+ past_length = past_key_values.seen_tokens
+ max_cache_length = past_key_values.get_max_length()
+ else:
+ cache_length = past_length = past_key_values[0][0].shape[2]
+ max_cache_length = None
+
+ # Keep only the unprocessed tokens:
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+ # input)
+ if attention_mask is not None and attention_mask.shape[
+ 1] > input_ids.shape[1]:
+ input_ids = input_ids[:,
+ -(attention_mask.shape[1] - past_length):]
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+ # input_ids based on the past_length.
+ elif past_length < input_ids.shape[1]:
+ input_ids = input_ids[:, past_length:]
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+ if (max_cache_length is not None and attention_mask is not None and
+ cache_length + input_ids.shape[1] > max_cache_length):
+ attention_mask = attention_mask[:, -max_cache_length:]
+
+ position_ids = kwargs.get('position_ids', None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1]:]
+
+ if self.generation_config.cache_implementation == 'static':
+ # generation with static cache
+ cache_position = kwargs.get('cache_position', None)
+ if cache_position is None:
+ past_length = 0
+ else:
+ past_length = cache_position[-1] + 1
+ input_ids = input_ids[:, past_length:]
+ position_ids = position_ids[:,
+ past_length:] if position_ids is not None else None
+
+ # TODO @gante we should only keep a `cache_position` in generate, and do +=1.
+ # same goes for position ids. Could also help with continued generation.
+ input_length = position_ids.shape[
+ -1] if position_ids is not None else input_ids.shape[-1]
+ cache_position = torch.arange(past_length,
+ past_length + input_length,
+ device=input_ids.device)
+ position_ids = position_ids.contiguous(
+ ) if position_ids is not None else None
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {'inputs_embeds': inputs_embeds}
+ else:
+ # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
+ # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114
+ # TODO: use `next_tokens` directly instead.
+ model_inputs = {'input_ids': input_ids.contiguous()}
+
+ model_inputs.update(
+ { # type: ignore
+ 'position_ids': position_ids,
+ 'cache_position': cache_position,
+ 'past_key_values': past_key_values,
+ 'use_cache': kwargs.get('use_cache'),
+ 'attention_mask': attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values: Cache, beam_idx: torch.LongTensor):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (tuple(
+ past_state.index_select(0, beam_idx.to(past_state.device))
+ for past_state in layer_past),)
+ return reordered_past
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5ed1cbeedb0ca6503c3f2e9141576c8e86279da
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tiktoken.py b/tiktoken.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbc0f10c1bbc05d25657755c73b4779c080d2b93
--- /dev/null
+++ b/tiktoken.py
@@ -0,0 +1,374 @@
+"""Dbrx tokenizer."""
+
+from functools import lru_cache
+from typing import Any, Dict, List, Optional, Tuple
+
+from transformers import PreTrainedTokenizer
+
+
+def dbrx_system_prompt():
+ # This is inspired by the Claude3 prompt.
+ # source: https://twitter.com/AmandaAskell/status/1765207842993434880
+ # Identity and knowledge
+ prompt = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\n'
+ prompt += 'YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\n'
+ # Capabilities (and reminder to use ``` for JSON blocks and tables, which it can forget). Also a reminder that it can't browse the internet or run code.
+ prompt += 'You assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n'
+ prompt += '(You do not have real-time data access or code execution capabilities. '
+ # Ethical guidelines
+ prompt += 'You avoid stereotyping and provide balanced perspectives on controversial topics. '
+ # Data: the model doesn't know what it was trained on; it thinks that everything that it is aware of was in its training data. This is a reminder that it wasn't.
+ # We also encourage it not to try to generate lyrics or poems
+ prompt += 'You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\n'
+ # The model really wants to talk about its system prompt, to the point where it is annoying, so encourage it not to
+ prompt += 'This is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\n'
+    prompt += 'You do not mention any of this information about yourself unless the information is directly pertinent to the user\'s query.'.upper()
+ return prompt
+
+
+# Taken from
+# https://github.com/huggingface/transformers/blob/8aca43bdb3cb9a5020f6d57589d85679dc873b1c/src/transformers/models/gpt2/tokenization_gpt2.py#L62-L84
+@lru_cache()
+def bytes_to_unicode():
+ """Returns list of utf-8 byte and a mapping to unicode strings.
+
+ We specifically avoids mapping to whitespace/control characters the bpe code
+ barfs on.
+
+ The reversible bpe codes work on unicode strings. This means you need a
+ large # of unicode characters in your vocab if you want to avoid UNKs. When
+ you're at something like a 10B token dataset you end up needing around 5K
+ for decent coverage. This is a significant percentage of your normal, say,
+ 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and
+ unicode strings.
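+
+    For example, the printable byte ord('!') maps to '!' itself, while byte 0
+    maps to chr(256) ('Ā'), the first code point past the printable range.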
+ """
+ bs = (list(range(ord('!'),
+ ord('~') + 1)) + list(range(ord('¡'),
+ ord('¬') + 1)) +
+ list(range(ord('®'),
+ ord('ÿ') + 1)))
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+class TiktokenTokenizerWrapper(PreTrainedTokenizer):
+ """A thin wrapper around tiktoken to make it compatible with Hugging Face.
+
+ tokenizers.
+
+ See HuggingFace for further documentation on general tokenizer methods.
+ """
+
+ model_input_names = ['input_ids', 'attention_mask']
+
+ def __init__(self,
+ model_name: Optional[str] = None,
+ encoding_name: Optional[str] = None,
+ add_bos_token: bool = False,
+ add_eos_token: bool = False,
+ use_default_system_prompt: bool = False,
+ unk_token: Optional[str] = '<|endoftext|>',
+ eos_token: Optional[str] = '<|endoftext|>',
+ bos_token: Optional[str] = '<|endoftext|>',
+ pad_token: Optional[str] = None,
+ errors: str = 'replace',
+ **kwargs: Any):
+ """Constructor creates a tiktoken tokenizer to use as the underlying.
+
+ tokenizer.
+
+ Args:
+ model_name (Optional[str], optional): The name of the model to load from tiktoken. Defaults to None.
+ Either model_name or encoding_name must be set, but not both.
+ encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None.
+ Either model_name or encoding_name must be set, but not both.
+ add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
+ add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
+ use_default_system_prompt (bool, optional): Use the default system prompt or not. Defaults to False.
+ unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
+ eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
+ bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
+ pad_token (Optional[str], optional): The pad token. Defaults to None.
+ errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ Defaults to `"replace"`.
+ """
+ try:
+ import tiktoken
+        except ImportError:
+ raise ImportError(
+ 'You need to install tiktoken to use TiktokenTokenizerWrapper.')
+
+ # Workaround to make tiktokenizer picklable.
+ # https://github.com/huggingface/datasets/issues/5536#issuecomment-1682309347
+ # There is an open PR from HF to add this to tiktoken: https://github.com/openai/tiktoken/pull/181
+ import copyreg
+ import functools
+
+ from tiktoken import Encoding # type: ignore (thirdParty)
+
+ def pickle_Encoding(enc: Encoding):
+ return (functools.partial(Encoding,
+ enc.name,
+ pat_str=enc._pat_str,
+ mergeable_ranks=enc._mergeable_ranks,
+ special_tokens=enc._special_tokens), ())
+
+ copyreg.pickle(Encoding, pickle_Encoding)
+
+ if model_name is not None and encoding_name is not None:
+ raise ValueError(
+ 'You need to specify either model_name or encoding_name, not both.'
+ )
+
+ self.model_name = model_name
+ self.encoding_name = encoding_name
+
+ if self.model_name is not None:
+ self.encoding = tiktoken.encoding_for_model( # type: ignore (thirdParty)
+ self.model_name)
+ elif self.encoding_name is not None:
+ self.encoding = tiktoken.get_encoding( # type: ignore (thirdParty)
+ self.encoding_name)
+ else:
+ raise ValueError(
+ 'You need to specify either model_name or encoding_name.')
+
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.use_default_system_prompt = use_default_system_prompt
+
+ self.byte_encoder = bytes_to_unicode()
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+ self.errors = errors
+
+ self.decoder: Dict[int, str] = {}
+ for i in range(self.encoding.n_vocab):
+ try:
+ self.encoding.decode_single_token_bytes(i)
+ except KeyError:
+ continue
+ # Taken from
+ # https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+ decoding = ''.join([
+ bytes_to_unicode()[ord(char)] for char in
+ self.encoding.decode_single_token_bytes(i).decode('latin-1')
+ ])
+ self.decoder[i] = decoding
+
+ self.encoder: Dict[str, int] = {}
+ for i in range(self.encoding.n_vocab):
+ if i in self.decoder:
+ self.encoder[self.decoder[i]] = i
+
+ super().__init__(model_name=model_name,
+ encoding_name=encoding_name,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ use_default_system_prompt=use_default_system_prompt,
+ unk_token=unk_token,
+ eos_token=eos_token,
+ bos_token=bos_token,
+ pad_token=pad_token,
+ errors=errors,
+ **kwargs)
+
+ @property
+ def vocab_size(self) -> int:
+ """Returns vocab size."""
+ return self.encoding.n_vocab
+
+ @property
+ def is_fast(self) -> bool:
+ return False
+
+ @property
+ def default_chat_template(self):
+ """Chat ML Template for User/Assistant.
+
+ Pinning default Chat ML template in case defaults change.
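+
+        For a [system, user] conversation with add_generation_prompt=True, this
+        template renders roughly as:
+        <|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n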
+ """
+ template = (
+ "{% if messages[0]['role'] == 'system' %}"
+ '{% set loop_messages = messages[1:] %}'
+ "{% set system_message = messages[0]['content'] %}"
+ "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
+ '{% set loop_messages = messages %}'
+ "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
+ '{% else %}'
+ '{% set loop_messages = messages %}'
+ '{% set system_message = false %}'
+ '{% endif %}'
+ '{% for message in loop_messages %}'
+ '{% if loop.index0 == 0 %}'
+ '{% if system_message != false %}'
+ "{{ '<|im_start|>system\n' + system_message.strip() + '<|im_end|>\n'}}"
+ '{% endif %}'
+ "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+ '{% else %}'
+ "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+ '{% endif %}'
+ '{% if (add_generation_prompt == true and loop.last) %}'
+ "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
+ '{% endif %}'
+ '{% endfor %}')
+ template = template.replace(
+ 'USE_DEFAULT_PROMPT',
+ 'true' if self.use_default_system_prompt else 'false')
+ template = template.replace('DEFAULT_SYSTEM_PROMPT',
+ dbrx_system_prompt())
+ return template
+
+ def get_vocab(self) -> Dict[str, int]:
+ """Returns vocab as a dict."""
+ # As far as I can tell, we don't require get_vocab to completely work,
+ # but when using additional_special_tokens, Hugging Face determines the next
+ # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
+ vocab_clone = self.encoder.copy()
+ extra_id_index = 0
+        candidate_extra_id = f'<extra_id_{extra_id_index}>'
+ indices_to_fill_in = {i for i in range(self.vocab_size)} - set(
+ vocab_clone.values())
+
+ # Add enough indices to make get_vocab() the right length
+ for index_to_add in indices_to_fill_in:
+ # Make sure we don't overwrite a token that already exists
+ while candidate_extra_id in vocab_clone:
+ extra_id_index += 1
+                candidate_extra_id = f'<extra_id_{extra_id_index}>'
+
+ # Get an index to add and add the item
+ vocab_clone[candidate_extra_id] = index_to_add
+
+ return vocab_clone
+
+ def _tokenize(self, text: str) -> List[str]:
+ """Returns a tokenized string."""
+ if not isinstance(text, str):
+ raise ValueError(
+ f'Expected a string input to _tokenize but got {type(text)}.')
+
+ tokens = [
+ self.decoder[t]
+ for t in self.encoding.encode(text, allowed_special='all')
+ ]
+
+ return tokens
+
+ def _convert_token_to_id(self, token: str) -> Optional[int]:
+ """Converts a token (str) in an id using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ def _convert_id_to_token(self, index: int) -> Optional[str]:
+ """Converts an index (integer) in a token (str) using the vocab."""
+ # For tokens in either the gap in ids in the tokenizer, or beyond the range of the tokenizer,
+ # we return empty string. This matches the behavior of Hugging Face fast tokenizers,
+ # but not slow tokenizers.
+ return self.decoder.get(index, '')
+
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
+ """Converts a sequence of tokens (string) in a single string."""
+ text = ''.join(tokens)
+ text = bytearray([self.byte_decoder[c] for c in text
+ ]).decode('utf-8', errors=self.errors)
+ return text
+
+ def build_inputs_with_special_tokens(
+ self,
+ token_ids_0: List[int],
+ token_ids_1: Optional[List[int]] = None) -> List[int]:
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = bos_token_id + token_ids_0 + eos_token_id
+
+ if token_ids_1 is not None:
+ output = output + bos_token_id + token_ids_1 + eos_token_id
+
+ return output
+
+ def get_special_tokens_mask(
+ self,
+ token_ids_0: List[int],
+ token_ids_1: Optional[List[int]] = None,
+ already_has_special_tokens: bool = False) -> List[int]:
+ """Retrieves sequence ids from a token list that has no special tokens.
+
+ Function copied from
+ https://github.com/huggingface/transformers/blob/e3a4bd2bee212a2d0fd9f03b27fe7bfc1debe42d/src/transformers/models/gpt2/tokenization_gpt2.py#L265-L295
+
+ added. This method is called when adding special tokens using the
+ tokenizer `prepare_for_model` or `encode_plus` methods.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0,
+ token_ids_1=token_ids_1,
+ already_has_special_tokens=True)
+
+ bos_token_id = [1] if self.add_bos_token else []
+ eos_token_id = [1] if self.add_eos_token else []
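+        # Example: add_bos_token=True with three sequence tokens yields
+        # [1, 0, 0, 0]; 1 marks special tokens, 0 marks sequence tokens.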
+
+ if token_ids_1 is None:
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+ return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
+ bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
+
+ def create_token_type_ids_from_sequences(
+ self,
+ token_ids_0: List[int],
+ token_ids_1: Optional[List[int]] = None) -> List[int]:
+ sep = [self.sep_token_id]
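+        # Example: two tokens in token_ids_0 and three in token_ids_1 yield
+        # [0, 0, 0] + [1, 1, 1, 1]; each segment also counts its sep slot.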
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + sep) * [0]
+ return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+ def save_vocabulary(self,
+ save_directory: str,
+ filename_prefix: Optional[str] = None) -> Tuple[str]:
+
+        # The type: ignore below keeps the original signature while knowingly
+        # breaking it (we return Nones rather than file paths); we are not 100%
+        # certain this has no side effects. Some code in Hugging Face calls
+        # this function to get the vocab files, but it does not seem to access
+        # them (or at least it checks for their existence before accessing
+        # them).
+ return (None, None) # type: ignore
+
+ def sanitize_special_tokens(self) -> int:
+        """Make sure that all the special tokens attributes of the tokenizer
+        (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the
+        vocabulary.
+
+        Add the missing ones to the vocabulary if needed.
+
+ Return:
+ `int`: The number of tokens added in the vocabulary during the operation.
+ """
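+        # If tiktoken splits a special token into more than one id, it is not
+        # yet an atomic entry in the vocabulary, so it is (re-)added below.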
+ actual_new_tokens = []
+ for token in self.all_special_tokens_extended:
+ encoded = self.encoding.encode(token, allowed_special='all')
+ if len(encoded) > 1:
+ actual_new_tokens.append(token)
+
+ return self.add_tokens(actual_new_tokens, special_tokens=True)
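+
+# Minimal usage sketch (hypothetical repo id; assumes this file ships with
+# the model and trust_remote_code is acceptable):
+#   from transformers import AutoTokenizer
+#   tok = AutoTokenizer.from_pretrained('some/dbrx-repo',
+#                                       trust_remote_code=True)
+#   ids = tok.apply_chat_template(
+#       [{'role': 'user', 'content': 'Hi'}], add_generation_prompt=True)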
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5affd9ee0aadb839a0f3283a74e687f4ad40b126
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,48 @@
+{
+ "add_bos_token": false,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "100257": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100277": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "100278": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tiktoken.TiktokenTokenizerWrapper",
+ null
+ ]
+ },
+ "bos_token": "<|endoftext|>",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": true,
+ "encoding_name": null,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "model_max_length": 1000000000000000019884624838656,
+ "model_name": "gpt-4",
+ "pad_token": "<|pad|>",
+ "tokenizer_class": "TiktokenTokenizerWrapper",
+ "unk_token": "<|endoftext|>",
+ "use_default_system_prompt": false
+}