diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..330d4d913607ecfa58675bc8aab2bbf8be169b3d --- /dev/null +++ b/README.md @@ -0,0 +1,786 @@ +--- +tags: +- generated_from_trainer +model-index: +- name: out + results: [] +--- + + + +[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +
See axolotl config + +axolotl version: `0.4.0` +```yaml +base_model: /workspace/axolotl/dbrx-checkpoint +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +trust_remote_code: true + +load_in_8bit: false +# load_in_4bit: true +strict: false + +# adapter: qlora +# lora_modules_to_save: [embed_tokens, lm_head] + +# lora_r: 32 +# lora_alpha: 16 +# lora_dropout: 0.05 +# lora_target_linear: false +# lora_fan_in_fan_out: + +datasets: + - path: /workspace/datasets/dolphin-2.9/dolphin201-sharegpt2.jsonl + type: sharegpt + conversation: chatml + # - path: /workspace/datasets/dolphin-2.9/Ultrachat200kunfiltered.jsonl + # type: sharegpt + # conversation: chatml + - path: /workspace/datasets/dolphin-2.9/dolphin-coder-translate-sharegpt2.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/dolphin-coder-codegen-sharegpt2.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/m-a-p_Code-Feedback-sharegpt-unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/m-a-p_CodeFeedback-Filtered-Instruction-sharegpt-unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/not_samantha_norefusals.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/Orca-Math-resort-unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/agent_instruct_react_unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/toolbench_instruct_j1s1_3k_unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/toolbench_negative_unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/toolbench_react_10p_unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/toolbench_tflan_cot_30p_unfiltered.jsonl + type: sharegpt + conversation: chatml + - path: /workspace/datasets/dolphin-2.9/openhermes200k_unfiltered.jsonl + type: sharegpt + conversation: chatml + # - path: /workspace/datasets/dolphin-2.9/SystemConversations.jsonl + # type: sharegpt + # conversation: chatml + +chat_template: chatml + +unfrozen_parameters: +- ^lm_head.weight$ +# ffn.experts.mlp_experts.0.v1 layers +- transformer.blocks.30.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.32.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.25.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.15.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.22.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.31.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.21.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.8.ffn.experts.mlp_experts.0.v1 +- transformer.blocks.23.ffn.experts.mlp_experts.0.v1 +# ffn.experts.mlp_experts.0.w1 layers +- transformer.blocks.7.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.30.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.32.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.25.ffn.experts.mlp_experts.0.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.0.w1 +# ffn.experts.mlp_experts.0.w2 layers +- 
transformer.blocks.25.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.22.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.26.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.4.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.7.ffn.experts.mlp_experts.0.w2 +- transformer.blocks.3.ffn.experts.mlp_experts.0.w2 +# ffn.experts.mlp_experts.1.v1 layers +- transformer.blocks.27.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.25.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.33.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.23.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.6.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.21.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.15.ffn.experts.mlp_experts.1.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.1.v1 +# ffn.experts.mlp_experts.1.w1 layers +- transformer.blocks.0.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.7.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.33.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.27.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.1.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.1.w1 +# ffn.experts.mlp_experts.1.w2 layers +- transformer.blocks.25.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.23.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.4.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.21.ffn.experts.mlp_experts.1.w2 +- transformer.blocks.33.ffn.experts.mlp_experts.1.w2 +# ffn.experts.mlp_experts.10.v1 layers +- transformer.blocks.28.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.34.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.33.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.26.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.32.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.36.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.24.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.20.ffn.experts.mlp_experts.10.v1 +- transformer.blocks.35.ffn.experts.mlp_experts.10.v1 +# ffn.experts.mlp_experts.10.w1 layers +- transformer.blocks.24.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.33.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.7.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.34.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.28.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.30.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.10.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.10.w1 +# ffn.experts.mlp_experts.10.w2 layers +- transformer.blocks.24.ffn.experts.mlp_experts.10.w2 +- 
transformer.blocks.28.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.23.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.3.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.33.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.26.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.2.ffn.experts.mlp_experts.10.w2 +- transformer.blocks.20.ffn.experts.mlp_experts.10.w2 +# ffn.experts.mlp_experts.11.w1 layers +- transformer.blocks.6.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.28.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.33.ffn.experts.mlp_experts.11.w1 +- transformer.blocks.13.ffn.experts.mlp_experts.11.w1 +# ffn.experts.mlp_experts.11.w2 layers +- transformer.blocks.27.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.24.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.22.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.6.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.7.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.28.ffn.experts.mlp_experts.11.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.11.w2 +# ffn.experts.mlp_experts.12.v1 layers +- transformer.blocks.30.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.21.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.27.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.28.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.8.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.10.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.23.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.6.ffn.experts.mlp_experts.12.v1 +- transformer.blocks.20.ffn.experts.mlp_experts.12.v1 +# ffn.experts.mlp_experts.12.w1 layers +- transformer.blocks.8.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.2.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.17.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.12.w1 +- transformer.blocks.21.ffn.experts.mlp_experts.12.w1 +# ffn.experts.mlp_experts.12.w2 layers +- transformer.blocks.6.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.8.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.21.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.2.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.12.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.12.w2 +# ffn.experts.mlp_experts.13.v1 layers +- transformer.blocks.31.ffn.experts.mlp_experts.13.v1 +- 
transformer.blocks.24.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.8.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.10.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.11.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.27.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.25.ffn.experts.mlp_experts.13.v1 +- transformer.blocks.36.ffn.experts.mlp_experts.13.v1 +# ffn.experts.mlp_experts.13.w1 layers +- transformer.blocks.4.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.24.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.30.ffn.experts.mlp_experts.13.w1 +- transformer.blocks.11.ffn.experts.mlp_experts.13.w1 +# ffn.experts.mlp_experts.13.w2 layers +- transformer.blocks.24.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.20.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.3.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.4.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.6.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.13.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.13.w2 +# ffn.experts.mlp_experts.14.v1 layers +- transformer.blocks.28.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.26.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.35.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.24.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.8.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.32.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.15.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.11.ffn.experts.mlp_experts.14.v1 +- transformer.blocks.22.ffn.experts.mlp_experts.14.v1 +# ffn.experts.mlp_experts.14.w1 layers +- transformer.blocks.8.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.7.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.13.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.28.ffn.experts.mlp_experts.14.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.14.w1 +# ffn.experts.mlp_experts.14.w2 layers +- transformer.blocks.26.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.24.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.28.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.4.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.6.ffn.experts.mlp_experts.14.w2 +- transformer.blocks.22.ffn.experts.mlp_experts.14.w2 +# ffn.experts.mlp_experts.15.v1 layers +- transformer.blocks.33.ffn.experts.mlp_experts.15.v1 +- 
transformer.blocks.26.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.31.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.28.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.9.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.34.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.17.ffn.experts.mlp_experts.15.v1 +- transformer.blocks.15.ffn.experts.mlp_experts.15.v1 +# ffn.experts.mlp_experts.15.w1 layers +- transformer.blocks.6.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.7.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.14.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.33.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.34.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.15.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.15.w1 +# ffn.experts.mlp_experts.15.w2 layers +- transformer.blocks.28.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.26.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.6.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.7.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.33.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.15.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.15.w2 +# ffn.experts.mlp_experts.2.v1 layers +- transformer.blocks.31.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.27.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.28.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.23.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.32.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.35.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.21.ffn.experts.mlp_experts.2.v1 +- transformer.blocks.15.ffn.experts.mlp_experts.2.v1 +# ffn.experts.mlp_experts.2.w1 layers +- transformer.blocks.7.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.31.ffn.experts.mlp_experts.2.w1 +- transformer.blocks.30.ffn.experts.mlp_experts.2.w1 +# ffn.experts.mlp_experts.2.w2 layers +- transformer.blocks.26.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.33.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.23.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.28.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.4.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.2.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.2.w2 +# ffn.experts.mlp_experts.3.v1 layers +- transformer.blocks.28.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.33.ffn.experts.mlp_experts.3.v1 +- 
transformer.blocks.36.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.14.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.10.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.31.ffn.experts.mlp_experts.3.v1 +- transformer.blocks.21.ffn.experts.mlp_experts.3.v1 +# ffn.experts.mlp_experts.3.w1 layers +- transformer.blocks.7.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.30.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.33.ffn.experts.mlp_experts.3.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.3.w1 +# ffn.experts.mlp_experts.3.w2 layers +- transformer.blocks.28.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.24.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.21.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.26.ffn.experts.mlp_experts.3.w2 +- transformer.blocks.2.ffn.experts.mlp_experts.3.w2 +# ffn.experts.mlp_experts.4.v1 layers +- transformer.blocks.34.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.31.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.26.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.24.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.14.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.32.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.6.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.20.ffn.experts.mlp_experts.4.v1 +- transformer.blocks.9.ffn.experts.mlp_experts.4.v1 +# ffn.experts.mlp_experts.4.w1 layers +- transformer.blocks.6.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.7.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.14.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.34.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.4.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.4.w1 +# ffn.experts.mlp_experts.4.w2 layers +- transformer.blocks.25.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.24.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.26.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.6.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.4.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.36.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.4.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.4.w2 +# ffn.experts.mlp_experts.5.v1 layers +- transformer.blocks.35.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.28.ffn.experts.mlp_experts.5.v1 +- 
transformer.blocks.32.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.27.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.26.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.33.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.8.ffn.experts.mlp_experts.5.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.5.v1 +# ffn.experts.mlp_experts.5.w1 layers +- transformer.blocks.0.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.7.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.12.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.5.w1 +- transformer.blocks.33.ffn.experts.mlp_experts.5.w1 +# ffn.experts.mlp_experts.5.w2 layers +- transformer.blocks.26.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.28.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.6.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.33.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.3.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.5.w2 +- transformer.blocks.7.ffn.experts.mlp_experts.5.w2 +# ffn.experts.mlp_experts.6.v1 layers +- transformer.blocks.34.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.31.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.26.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.35.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.20.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.15.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.10.ffn.experts.mlp_experts.6.v1 +- transformer.blocks.24.ffn.experts.mlp_experts.6.v1 +# ffn.experts.mlp_experts.6.w1 layers +- transformer.blocks.0.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.30.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.34.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.26.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.2.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.6.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.6.w1 +# ffn.experts.mlp_experts.6.w2 layers +- transformer.blocks.24.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.26.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.32.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.20.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.4.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.2.ffn.experts.mlp_experts.6.w2 +- transformer.blocks.9.ffn.experts.mlp_experts.6.w2 +# ffn.experts.mlp_experts.7.v1 layers +- transformer.blocks.27.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.28.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.33.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.7.v1 +- 
transformer.blocks.24.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.11.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.12.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.10.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.23.ffn.experts.mlp_experts.7.v1 +- transformer.blocks.34.ffn.experts.mlp_experts.7.v1 +# ffn.experts.mlp_experts.7.w1 layers +- transformer.blocks.12.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.5.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.29.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.10.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.34.ffn.experts.mlp_experts.7.w1 +- transformer.blocks.33.ffn.experts.mlp_experts.7.w1 +# ffn.experts.mlp_experts.7.w2 layers +- transformer.blocks.23.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.24.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.28.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.5.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.3.ffn.experts.mlp_experts.7.w2 +- transformer.blocks.33.ffn.experts.mlp_experts.7.w2 +# ffn.experts.mlp_experts.8.v1 layers +- transformer.blocks.30.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.27.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.20.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.32.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.34.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.33.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.9.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.6.ffn.experts.mlp_experts.8.v1 +- transformer.blocks.24.ffn.experts.mlp_experts.8.v1 +# ffn.experts.mlp_experts.8.w1 layers +- transformer.blocks.7.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.0.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.3.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.2.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.30.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.24.ffn.experts.mlp_experts.8.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.8.w1 +# ffn.experts.mlp_experts.8.w2 layers +- transformer.blocks.32.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.24.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.30.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.28.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.2.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.3.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.23.ffn.experts.mlp_experts.8.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.8.w2 +# ffn.experts.mlp_experts.9.v1 layers +- transformer.blocks.31.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.27.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.29.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.33.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.25.ffn.experts.mlp_experts.9.v1 +- 
transformer.blocks.14.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.32.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.9.ffn.experts.mlp_experts.9.v1 +- transformer.blocks.34.ffn.experts.mlp_experts.9.v1 +# ffn.experts.mlp_experts.9.w1 layers +- transformer.blocks.7.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.1.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.9.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.2.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.27.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.12.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.4.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.6.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.19.ffn.experts.mlp_experts.9.w1 +- transformer.blocks.8.ffn.experts.mlp_experts.9.w1 +# ffn.experts.mlp_experts.9.w2 layers +- transformer.blocks.26.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.25.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.28.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.27.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.31.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.29.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.7.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.34.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.2.ffn.experts.mlp_experts.9.w2 +- transformer.blocks.33.ffn.experts.mlp_experts.9.w2 +# ffn.router.layer layers +- transformer.blocks.2.ffn.router.layer +- transformer.blocks.3.ffn.router.layer +- transformer.blocks.4.ffn.router.layer +- transformer.blocks.5.ffn.router.layer +- transformer.blocks.6.ffn.router.layer +- transformer.blocks.7.ffn.router.layer +- transformer.blocks.8.ffn.router.layer +- transformer.blocks.9.ffn.router.layer +- transformer.blocks.10.ffn.router.layer +- transformer.blocks.11.ffn.router.layer +# norm_attn_norm.attn.Wqkv layers +- transformer.blocks.16.norm_attn_norm.attn.Wqkv +- transformer.blocks.15.norm_attn_norm.attn.Wqkv +- transformer.blocks.11.norm_attn_norm.attn.Wqkv +- transformer.blocks.14.norm_attn_norm.attn.Wqkv +- transformer.blocks.12.norm_attn_norm.attn.Wqkv +- transformer.blocks.20.norm_attn_norm.attn.Wqkv +- transformer.blocks.10.norm_attn_norm.attn.Wqkv +- transformer.blocks.9.norm_attn_norm.attn.Wqkv +- transformer.blocks.19.norm_attn_norm.attn.Wqkv +- transformer.blocks.18.norm_attn_norm.attn.Wqkv +# norm_attn_norm.attn.out_proj layers +- transformer.blocks.1.norm_attn_norm.attn.out_proj +- transformer.blocks.18.norm_attn_norm.attn.out_proj +- transformer.blocks.2.norm_attn_norm.attn.out_proj +- transformer.blocks.16.norm_attn_norm.attn.out_proj +- transformer.blocks.0.norm_attn_norm.attn.out_proj +- transformer.blocks.39.norm_attn_norm.attn.out_proj +- transformer.blocks.23.norm_attn_norm.attn.out_proj +- transformer.blocks.8.norm_attn_norm.attn.out_proj +- transformer.blocks.24.norm_attn_norm.attn.out_proj +- transformer.blocks.19.norm_attn_norm.attn.out_proj +# norm_attn_norm.norm_1 layers +- transformer.blocks.0.norm_attn_norm.norm_1 +- transformer.blocks.1.norm_attn_norm.norm_1 +- transformer.blocks.2.norm_attn_norm.norm_1 +- transformer.blocks.3.norm_attn_norm.norm_1 +- transformer.blocks.4.norm_attn_norm.norm_1 +- transformer.blocks.5.norm_attn_norm.norm_1 +- transformer.blocks.6.norm_attn_norm.norm_1 +- transformer.blocks.7.norm_attn_norm.norm_1 +- transformer.blocks.8.norm_attn_norm.norm_1 +- transformer.blocks.9.norm_attn_norm.norm_1 +# norm_attn_norm.norm_2 layers +- transformer.blocks.0.norm_attn_norm.norm_2 +- 
transformer.blocks.1.norm_attn_norm.norm_2 +- transformer.blocks.2.norm_attn_norm.norm_2 +- transformer.blocks.3.norm_attn_norm.norm_2 +- transformer.blocks.4.norm_attn_norm.norm_2 +- transformer.blocks.5.norm_attn_norm.norm_2 +- transformer.blocks.6.norm_attn_norm.norm_2 +- transformer.blocks.7.norm_attn_norm.norm_2 +- transformer.blocks.8.norm_attn_norm.norm_2 +- transformer.blocks.9.norm_attn_norm.norm_2 +# transformer.norm_f layers +# transformer.wte layers +# ffn.experts.mlp_experts.11.v1 layers +- transformer.blocks.29.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.27.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.30.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.28.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.22.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.7.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.24.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.8.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.6.ffn.experts.mlp_experts.11.v1 +- transformer.blocks.12.ffn.experts.mlp_experts.11.v1 + + + +dataset_prepared_path: dbrx2 +val_set_size: 0.01 +output_dir: ./out + +sequence_len: 4096 +sample_packing: true +pad_to_sequence_len: true + +wandb_project: dolphin-2.9-Dbrx +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 8 +micro_batch_size: 1 +num_epochs: 1 +optimizer: paged_adamw_8bit +lr_scheduler: cosine +learning_rate: 1e-5 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +early_stopping_patience: +# resume_from_checkpoint: /workspace/axolotl/dbrx-checkpoint +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +evals_per_epoch: 4 +eval_table_size: +saves_per_epoch: 4 +save_total_limit: 2 +save_steps: +debug: +deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16_cpuoffload_params.json +weight_decay: 0.05 +fsdp: +fsdp_config: +special_tokens: + bos_token: "<|endoftext|>" + eos_token: "<|im_end|>" + pad_token: "<|pad|>" + unk_token: "<|endoftext|>" +tokens: + - "<|im_start|>" + - "<|im_end|>" + + +``` + +

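+## Prompt format
+
+This model is trained with the ChatML template (`chat_template: chatml` in the config above, with `<|im_start|>` and `<|im_end|>` added as special tokens and `<|im_end|>` used as EOS). Below is a minimal usage sketch, not an official snippet: the repo id is a placeholder, and `trust_remote_code=True` is assumed because `config.json` maps to the repo's custom `configuration_dbrx.py` / `modeling_dbrx.py`:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+repo_id = "your-org/your-dbrx-finetune"  # placeholder, not the actual repo id
+
+tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    repo_id,
+    torch_dtype=torch.bfloat16,  # the checkpoint is stored in bfloat16 (see config.json)
+    device_map="auto",
+    trust_remote_code=True,  # loads the repo's configuration_dbrx.py / modeling_dbrx.py
+)
+
+messages = [
+    {"role": "system", "content": "You are Dolphin, a helpful AI assistant."},
+    {"role": "user", "content": "Give me one sentence about mixture-of-experts models."},
+]
+# apply_chat_template renders the ChatML layout:
+# <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n
+input_ids = tokenizer.apply_chat_template(
+    messages, add_generation_prompt=True, return_tensors="pt"
+).to(model.device)
+
+output_ids = model.generate(input_ids, max_new_tokens=128)
+print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
+```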
+
+# out
+
+This model was fine-tuned from a DBRX checkpoint on the Dolphin 2.9 datasets listed in the axolotl config above.
+It achieves the following results on the evaluation set:
+- Loss: 0.4336
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 64
+- total_eval_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 1
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.4009        | 0.0   | 1    | 0.4328          |
+| 0.413         | 0.25  | 587  | 0.4408          |
+| 0.3626        | 0.5   | 1174 | 0.4368          |
+| 0.3896        | 0.75  | 1761 | 0.4336          |
+
+
+### Framework versions
+
+- Transformers 4.40.0.dev0
+- Pytorch 2.2.2+cu121
+- Datasets 2.15.0
+- Tokenizers 0.15.0
diff --git a/added_tokens.json b/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..8fd93dbaccc9e3824c96d24bd07102836233454e
--- /dev/null
+++ b/added_tokens.json
@@ -0,0 +1,4 @@
+{
+  "<|im_end|>": 100278,
+  "<|im_start|>": 100277
+}
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..818f2c4e5d0d13111d9e3a7749f3b8e510924130
--- /dev/null
+++ b/config.json
@@ -0,0 +1,39 @@
+{
+  "_name_or_path": "/workspace/axolotl/dbrx-checkpoint",
+  "architectures": [
+    "DbrxForCausalLM"
+  ],
+  "attn_config": {
+    "clip_qkv": 8,
+    "kv_n_heads": 8,
+    "model_type": "",
+    "rope_theta": 500000
+  },
+  "auto_map": {
+    "AutoConfig": "configuration_dbrx.DbrxConfig",
+    "AutoModelForCausalLM": "modeling_dbrx.DbrxForCausalLM"
+  },
+  "d_model": 6144,
+  "emb_pdrop": 0.0,
+  "ffn_config": {
+    "ffn_hidden_size": 10752,
+    "model_type": "",
+    "moe_jitter_eps": 0.01,
+    "moe_loss_weight": 0.05,
+    "moe_num_experts": 16,
+    "moe_top_k": 4
+  },
+  "initializer_range": 0.02,
+  "max_seq_len": 32768,
+  "model_type": "dbrx",
+  "n_heads": 48,
+  "n_layers": 40,
+  "output_router_logits": false,
+  "resid_pdrop": 0.0,
+  "router_aux_loss_coef": 0.05,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.0.dev0",
+  "use_cache": false,
+  "vocab_size": 100352
+}
diff --git a/configuration_dbrx.py b/configuration_dbrx.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8c387be81edd9a192e935aa44692726f061b508
--- /dev/null
+++ b/configuration_dbrx.py
@@ -0,0 +1,264 @@
+"""Dbrx configuration."""
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+    """Configuration class for Dbrx Attention.
+
+    It stores the configuration of a [`DbrxAttention`] class and is used to instantiate attention layers
+    according to the specified arguments, defining the layer architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        clip_qkv (`float`, *optional*, defaults to `None`):
+            If not `None`, clip the queries, keys, and values in the attention layer to this value.
+        kv_n_heads (Optional[int]): For grouped_query_attention only; allows the user to specify the number of
+            kv heads.
+        rope_theta (float): The base frequency for RoPE.
+    """
+
+    def __init__(
+        self,
+        attn_pdrop: float = 0,
+        clip_qkv: Optional[float] = None,
+        kv_n_heads: int = 1,
+        rope_theta: float = 10000.0,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.attn_pdrop = attn_pdrop
+        self.clip_qkv = clip_qkv
+        self.kv_n_heads = kv_n_heads
+        self.rope_theta = rope_theta
+
+        for k in ['model_type']:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f'Found unknown {kwargs=}')
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str,
+                        **kwargs: Any) -> 'PretrainedConfig':
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
+                                                  **kwargs)
+
+        if config_dict.get('model_type') == 'dbrx':
+            config_dict = config_dict['attn_config']
+
+        if 'model_type' in config_dict and hasattr(
+                cls,
+                'model_type') and config_dict['model_type'] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                +
+                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxFFNConfig(PretrainedConfig):
+    """Configuration class for Dbrx FFN.
+
+    It stores the configuration of a [`DbrxFFN`] class and is used to instantiate feedforward layers according to
+    the specified arguments, defining the layer architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        ffn_act_fn (dict, optional): A dict specifying the activation function for the FFN.
+            The dict should have a key 'name' with the value being the name of the activation function along with
+            any additional keyword arguments.
+        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
+        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
+        moe_top_k (int, optional): The number of experts each token is routed to in the mixture of experts layer.
+        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
+        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
+        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
+        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
+            This should only be used for benchmarking purposes.
+    """
+
+    def __init__(
+        self,
+        ffn_act_fn: Optional[dict] = None,
+        ffn_hidden_size: int = 3584,
+        moe_num_experts: int = 4,
+        moe_top_k: int = 1,
+        moe_jitter_eps: Optional[float] = None,
+        moe_loss_weight: float = 0.01,
+        moe_normalize_expert_weights: Optional[float] = 1,
+        uniform_expert_assignment: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__()
+        if ffn_act_fn is None:
+            ffn_act_fn = {'name': 'silu'}
+        self.ffn_act_fn = ffn_act_fn
+        self.ffn_hidden_size = ffn_hidden_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.moe_jitter_eps = moe_jitter_eps
+        self.moe_loss_weight = moe_loss_weight
+        self.moe_normalize_expert_weights = moe_normalize_expert_weights
+        self.uniform_expert_assignment = uniform_expert_assignment
+
+        for k in ['model_type']:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f'Found unknown {kwargs=}')
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str,
+                        **kwargs: Any) -> 'PretrainedConfig':
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
+                                                  **kwargs)
+
+        if config_dict.get('model_type') == 'dbrx':
+            config_dict = config_dict['ffn_config']
+
+        if 'model_type' in config_dict and hasattr(
+                cls,
+                'model_type') and config_dict['model_type'] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                +
+                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxConfig(PretrainedConfig):
+    """Configuration class for Dbrx.
+
+    It stores the configuration of a [`DbrxModel`] and is used to instantiate a Dbrx model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        d_model (`int`, *optional*, defaults to 2048):
+            Dimensionality of the embeddings and hidden states.
+        n_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        max_seq_len (`int`, *optional*, defaults to 2048):
+            The maximum sequence length of the model.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be
+            represented by the `input_ids` passed when calling [`DbrxModel`].
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability applied to the attention output before combining with residual.
+        emb_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the embedding layer.
+        attn_config (`dict`, *optional*):
+            A dictionary used to configure the model's attention module.
+        ffn_config (`dict`, *optional*):
+            A dictionary used to configure the model's FFN module.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.05):
+            The aux loss factor for the total loss.
+
+
+    Example:
+    ```python
+    >>> from transformers import DbrxConfig, DbrxModel
+
+    >>> # Initializing a Dbrx configuration
+    >>> configuration = DbrxConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = DbrxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = 'dbrx'
+    attribute_map = {
+        'num_attention_heads': 'n_heads',
+        'hidden_size': 'd_model',
+        'num_hidden_layers': 'n_layers',
+        'max_position_embeddings': 'max_seq_len'
+    }
+
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        max_seq_len: int = 2048,
+        vocab_size: int = 32000,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_config: Optional[DbrxAttentionConfig] = None,
+        ffn_config: Optional[DbrxFFNConfig] = None,
+        use_cache: bool = True,
+        initializer_range: float = 0.02,
+        output_router_logits: bool = False,
+        router_aux_loss_coef: float = 0.05,
+        **kwargs: Any,
+    ):
+        if attn_config is None:
+            self.attn_config = DbrxAttentionConfig()
+        elif isinstance(attn_config, dict):
+            self.attn_config = DbrxAttentionConfig(**attn_config)
+        else:
+            self.attn_config = attn_config
+
+        if ffn_config is None:
+            self.ffn_config = DbrxFFNConfig()
+        elif isinstance(ffn_config, dict):
+            self.ffn_config = DbrxFFNConfig(**ffn_config)
+        else:
+            self.ffn_config = ffn_config
+
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        tie_word_embeddings = kwargs.pop('tie_word_embeddings', False)
+        if tie_word_embeddings:
+            raise ValueError(
+                'tie_word_embeddings is not supported for Dbrx models.')
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e80125b496d618b762f321e0d026bd80f2e937a
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,5 @@
+{
+  "_from_model_config": true,
+  "do_sample": true,
+  "transformers_version": "4.40.0.dev0"
+}
diff --git a/model-00001-of-00054.safetensors b/model-00001-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ddcc762fe3bc7954d216e56a1a9e1db99f8862e5
--- /dev/null
+++ b/model-00001-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:844d6ee310e60f776437f34b532a89764c869474fa771babcc88f457a1a41b49
+size 4976767312
diff --git a/model-00002-of-00054.safetensors b/model-00002-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..50aae60cf016537ca9b266594fcebf161f02093f
--- /dev/null
+++ b/model-00002-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67edbaf8ce22cef89b551054543eaf80c2a65f71702a0a6e818300e19a7d9883
+size 4932728256
diff --git a/model-00003-of-00054.safetensors
b/model-00003-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..22b4581ccf33f6d6376288331442705671a5460a --- /dev/null +++ b/model-00003-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f9432fdf70381f02519e595cc1a32171a24cdff89817bab9b9162d9261df8cb +size 4932728256 diff --git a/model-00004-of-00054.safetensors b/model-00004-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f47e81494ad23344068845bfa3be26689ad09d6d --- /dev/null +++ b/model-00004-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a92ceadf870e5641c61925094356618bbccb078ee8ddadf6e0fe7000f02a22 +size 4888466376 diff --git a/model-00005-of-00054.safetensors b/model-00005-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0cb009a051de0e5f1829b8e1e31e1d1e13df1d83 --- /dev/null +++ b/model-00005-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbaeb5f57d35a71310802168cc1c7d0c40adff467da82b4a40bd7923a9ee35e4 +size 4932728248 diff --git a/model-00006-of-00054.safetensors b/model-00006-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c31cad1fa0905af91ed784e09c224cb6c753004e --- /dev/null +++ b/model-00006-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b968c9e9d8e880db7517ed6d6fce3688ba186bd6bf52813cb5be6d2ca9d94bd +size 4932728256 diff --git a/model-00007-of-00054.safetensors b/model-00007-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7dace946fb66554538eb0540993badd35002e267 --- /dev/null +++ b/model-00007-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc898a1265501c5d7816d0d5a5bec727c82ff398e34b01216d19a08d1276441 +size 4932728256 diff --git a/model-00008-of-00054.safetensors b/model-00008-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e7cd9a6a020ba15264246af703ae6652eccbed1 --- /dev/null +++ b/model-00008-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a03a9d91c30daf0be095e7fe207f2ec64459f3b84de5354436d66ee7bc87fdb5 +size 4888466376 diff --git a/model-00009-of-00054.safetensors b/model-00009-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec4e171ee7f219791388dc569584cfb6cb014b84 --- /dev/null +++ b/model-00009-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019e332f7c02b72f409d07597a51fe6b9750f7e3c6844e625ff7d5b64fc53dd4 +size 4932728248 diff --git a/model-00010-of-00054.safetensors b/model-00010-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b07d5c235d4890b64988afc3addbc980ebe5a818 --- /dev/null +++ b/model-00010-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08796964f4ab0f84558720edf6eaa3a84d4284fd8fb46e5efa8c307acee50bfb +size 4932728256 diff --git a/model-00011-of-00054.safetensors b/model-00011-of-00054.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6159d0207a7a5e12a4a1eec69b61898a0168707 --- /dev/null +++ b/model-00011-of-00054.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b923056be1b7922da2ccde84fc5fbce0e4493e74d075080c9cff6d6d72baccc +size 4932728256 diff --git 
a/model-00012-of-00054.safetensors b/model-00012-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cde62cc9db78749c584823105248d66d14635a4a
--- /dev/null
+++ b/model-00012-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4df2fe028ea52491ed31525dd581ecac07e19da6585e066ed00513f956c1e4a2
+size 4888466376
diff --git a/model-00013-of-00054.safetensors b/model-00013-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..015a792803b40114974e9389d28e6ca36e8f2e6b
--- /dev/null
+++ b/model-00013-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51cc2f9e11b4d986cf83a0568afb4ae3a0796606bd9bdd8337a4f44f734ca86b
+size 4932728240
diff --git a/model-00014-of-00054.safetensors b/model-00014-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8ad4b949c4f48b84d5182a6e31d2c94c7ffae136
--- /dev/null
+++ b/model-00014-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:166b445c266747bfef1b529494a8b64b8014ae9f681d3b31bb279f5ce56148dc
+size 4932728280
diff --git a/model-00015-of-00054.safetensors b/model-00015-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6fdc9a7c9e682fb40e4222a967cbcaa768b84a3f
--- /dev/null
+++ b/model-00015-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51c4a8745ab117e3993ec7e41933885240759ff5766b61309e6210740e5f0687
+size 4932728296
diff --git a/model-00016-of-00054.safetensors b/model-00016-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4a883a164797650bf1dc85091bda24c84e3f69b6
--- /dev/null
+++ b/model-00016-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:401be63b10132316fd3d16a1a4eb5ab1b41d38dcd24dfba7585a13faf36b1d55
+size 4888466416
diff --git a/model-00017-of-00054.safetensors b/model-00017-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bbbc912d299f44183314c27c7244648ede92384a
--- /dev/null
+++ b/model-00017-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f508fbcfac88c06a1a9c01c6041530cd7bc32261753faa86240ce733f96c335d
+size 4932728288
diff --git a/model-00018-of-00054.safetensors b/model-00018-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..88814875df426f1c69ec030fa3bd4dd4cb4c6be3
--- /dev/null
+++ b/model-00018-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba87d72f59502b88a04fa2a145f73712d412fa784e63c816b72f872cccc167e3
+size 4932728296
diff --git a/model-00019-of-00054.safetensors b/model-00019-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bc0f854410a250214becd65d770e9445b79a5d8f
--- /dev/null
+++ b/model-00019-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22c0313fb62cb31922e14d2d71b3794d6ca82a6193ca1fced36fb57fc445b0c2
+size 4932728296
diff --git a/model-00020-of-00054.safetensors b/model-00020-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1ce508bdc49296058c5202a2d35df31732c26031
--- /dev/null
+++ b/model-00020-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54b983476b64d4eafadfe50647c307d0bd1114415be8eb1e2f65c612c778bf07
+size 4888466416
diff --git a/model-00021-of-00054.safetensors b/model-00021-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3bca03bda6969424cb4adf5cf3e2ea043e0f6a1f
--- /dev/null
+++ b/model-00021-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d59190fb228d8490d7c65a4003e7f65877bd8d0f2c527b7b3a1b493026efa88
+size 4932728288
diff --git a/model-00022-of-00054.safetensors b/model-00022-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ddf8f402bce562831ce0a7d40ec776ef1866b8c3
--- /dev/null
+++ b/model-00022-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af8b37e5d9a55ff376f923cafe175c109ab3329dc0dcd703d1d5110d25f5cd2f
+size 4932728296
diff --git a/model-00023-of-00054.safetensors b/model-00023-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..468b4594daf2ec82c63d56d4223bbffca8d6f3d2
--- /dev/null
+++ b/model-00023-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:042526a2936e7fb570abd769da46aedd6fcc65745aaee13f00bcdc70a5e06b81
+size 4932728296
diff --git a/model-00024-of-00054.safetensors b/model-00024-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3fa69f3fccee8906b19b5d32d2c8a521a9234522
--- /dev/null
+++ b/model-00024-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eebc383a0a528db88d0d4965a02b4cbc81d702991963e4523a6b2dfc3a9151f9
+size 4888466416
diff --git a/model-00025-of-00054.safetensors b/model-00025-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..93126067808f613bf91694841b88091dea3ad6b4
--- /dev/null
+++ b/model-00025-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ae5b9fcc37e67350cd83ff8b3db1313d119ed5afc7d594ab8a6918077918eba
+size 4932728288
diff --git a/model-00026-of-00054.safetensors b/model-00026-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d3a78c63273c1e3f92cf710c1f69f3163468e6f2
--- /dev/null
+++ b/model-00026-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:033d368ca1793ae5ba73bb03b8c6cc7256eb4a18f77329c2c8fdeb8b5fbd3411
+size 4932728296
diff --git a/model-00027-of-00054.safetensors b/model-00027-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..23ebab5cc8107364c30eefe35b755d91f63354fc
--- /dev/null
+++ b/model-00027-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84f35cfd716d3f28eb636926ab692a0fef1cca2a5fc6df2aaf508895b4d6b8c5
+size 4932728296
diff --git a/model-00028-of-00054.safetensors b/model-00028-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8b60bc8a3de05295ad5860d4c914bc5e4fc9ada8
--- /dev/null
+++ b/model-00028-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee70a22cc0b03e27154164b623d987837f156c813d3ad3db979859995a9101da
+size 4888466416
diff --git a/model-00029-of-00054.safetensors b/model-00029-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..83246d499a4b86c50423a5090163d32e4d896037
--- /dev/null
+++ b/model-00029-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d970806d6867f0808c497eb159a21dcded2194cc676a3da58159c5449a424c8
+size 4932728288
diff --git a/model-00030-of-00054.safetensors b/model-00030-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ea89c07d58b616f8f8bed69ddea44d88678c6859
--- /dev/null
+++ b/model-00030-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2d1b9f8845e95ca6afcd803e158d82807646f35114e06164451d303b9ab9ec8
+size 4932728296
diff --git a/model-00031-of-00054.safetensors b/model-00031-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..afa20e5c00619da964d3119aab54fd92a62c6562
--- /dev/null
+++ b/model-00031-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b116b356f8871f98dcac1130097315bb6e534e9f91e8703b2c5a12ad9ba000f
+size 4932728296
diff --git a/model-00032-of-00054.safetensors b/model-00032-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d1195d5381701518594d59ee9a8903145a0bdc8f
--- /dev/null
+++ b/model-00032-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ca0e4c2054445a7b33a548f45c6b09bf469e97ffb0c48e27e6b277bf04e6037
+size 4888466416
diff --git a/model-00033-of-00054.safetensors b/model-00033-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5c52732c9b76b4f35b19aa4988540eeca4a8687c
--- /dev/null
+++ b/model-00033-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e18cbafcfc97cecc047fa18f64ff805b61a3976b8b6b01b333c6cae73c3b9797
+size 4932728288
diff --git a/model-00034-of-00054.safetensors b/model-00034-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e61ea9f213df58eb80bed2cd7a824b6c5ec8bf1f
--- /dev/null
+++ b/model-00034-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc7bfcbd66ee533cd39cf2c236ac7a32f249f4b90c6a1d025bd30e3dcba8b37e
+size 4932728288
diff --git a/model-00035-of-00054.safetensors b/model-00035-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6da8c0274fb1b49a54a9ff00f299a1ce3d1e8e25
--- /dev/null
+++ b/model-00035-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3957da1791e004a08595a89a2ea4587c168a1c6b916da521fd4fde3751b68a89
+size 4932728296
diff --git a/model-00036-of-00054.safetensors b/model-00036-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f2ba0e35a3e3c8b8ca195c9452a78227e1322a8e
--- /dev/null
+++ b/model-00036-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5691f61db31dd0894d272f6a2107e366484825fe1279952f9abfc835421cf16e
+size 4989142256
diff --git a/model-00037-of-00054.safetensors b/model-00037-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a5aa6e8fa63a5a302116c356031f7d299af3df11
--- /dev/null
+++ b/model-00037-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21f2dbd599835d83511dd2122d5bea4ad6647f521f950dcb901699c1aa1bcfcb
+size 4964173160
diff --git a/model-00038-of-00054.safetensors b/model-00038-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cc5733076103fe074cfad8fa2215eeb578f8f5f1
--- /dev/null
+++ b/model-00038-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:500fd82e253552d7283f6bc2dd7287a1cfc524d3a483bd6e525de912238c815c
+size 4932728288
diff --git a/model-00039-of-00054.safetensors b/model-00039-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a2f72cd067eb3d692aea0e354674b6d61579d301
--- /dev/null
+++ b/model-00039-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79307da855fad9bd2377cc36c914938657dfc6554a35edaee4874b6153bef98f
+size 4932728296
diff --git a/model-00040-of-00054.safetensors b/model-00040-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..674179602aea35f3e85b149b3a85f56ce7bf683e
--- /dev/null
+++ b/model-00040-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b300507198f682ee40a81b1af4b16169023ae07fc3f45767eea3d0019c8f84f6
+size 4932728296
diff --git a/model-00041-of-00054.safetensors b/model-00041-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..af658cd70e0c1e07b6081db2ee48d6ff2238a07e
--- /dev/null
+++ b/model-00041-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c20d1529859d1a2cd0ba96512ce0dfe4d97137e591febf0998d80a2ee497731
+size 4888466408
diff --git a/model-00042-of-00054.safetensors b/model-00042-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9c5418ec99c7bd769bb37bc336d5918a957779dd
--- /dev/null
+++ b/model-00042-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfdf0dcff1dc6f5da4754dbf6d58f4ec69102b185f95a3116c106597c9fd34b6
+size 4932728288
diff --git a/model-00043-of-00054.safetensors b/model-00043-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bdd9dfcc66a0a4c2293f29a3ba6a038790f9934f
--- /dev/null
+++ b/model-00043-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d62413dbb7ec0a905a8f03b47f86693ddf0570c35dc8afb83cdc31892708d420
+size 4932728296
diff --git a/model-00044-of-00054.safetensors b/model-00044-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a7858820231c3d1057961202585f745cf949a3f4
--- /dev/null
+++ b/model-00044-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0de53f440d3e537a70891a225e39555f0a730ae2ba92916f98087e86531d330d
+size 4932728296
diff --git a/model-00045-of-00054.safetensors b/model-00045-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..69cf3adc330fd8cca46a32cb02bc29913ad54d34
--- /dev/null
+++ b/model-00045-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74006b98dac79bfa8765e97abab9ef348e65c76b581c7810578489ab7c2258cc
+size 4888466408
diff --git a/model-00046-of-00054.safetensors b/model-00046-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..64cf7fe5dba42e823740702bb2ed9b4f2086cd94
--- /dev/null
+++ b/model-00046-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73f51a9b5c9acf866d7ed02b12e7a250ad1beb790c91408f576a254da750b635
+size 4932728288
diff --git a/model-00047-of-00054.safetensors b/model-00047-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f377108d89abd808538cac2f1455f2c281e0aa1
--- /dev/null
+++ b/model-00047-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67a91080f6f0aa8d2a9a6981be7ae4161a625d10dd79347d45c1834cb5d38aff
+size 4932728296
diff --git a/model-00048-of-00054.safetensors b/model-00048-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c99f6c382198a7c6d910922b6f8f61de8cc5458e
--- /dev/null
+++ b/model-00048-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:224d1b62723bf612b8758ec2e30490508110571541ad9df597d389f462b24dcd
+size 4932728296
diff --git a/model-00049-of-00054.safetensors b/model-00049-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1086a827de2feab6df6a3ba2ec00165bb07e2406
--- /dev/null
+++ b/model-00049-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8890a1f9f691819d7ced6e5170d73fc83cad1a98262bcd40f3fb364a93cfc664
+size 4888466408
diff --git a/model-00050-of-00054.safetensors b/model-00050-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d1442f08797b9dd90d7b0a97b0c37a8ee9458b9f
--- /dev/null
+++ b/model-00050-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13724ca02f921494021f2b03c292dd852c5f1133e1e60c1cdcc07a247465bf47
+size 4932728288
diff --git a/model-00051-of-00054.safetensors b/model-00051-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..71560cd413f00a7f1903fa63dd526cf6d1b8149b
--- /dev/null
+++ b/model-00051-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30120bc6a14b2531eb6b1faa369b838a4dab9863aaa75ed1eebe919caa681e1
+size 4932728296
diff --git a/model-00052-of-00054.safetensors b/model-00052-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f121d969fa1efeb751f62f5106a396e02e58d551
--- /dev/null
+++ b/model-00052-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c7e8dfcfe4189122f5f558352f8aa35f0d8df3ffa5d8316634a06a6b73392c0
+size 4932728296
diff --git a/model-00053-of-00054.safetensors b/model-00053-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..70b8dbaea57dc9eed182fc2ee67b252794f9383f
--- /dev/null
+++ b/model-00053-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e6b8f8df0793fe3c5133a3643c269ff16d58bb03a01ef20c6531a2fb0f637c4
+size 4888466416
diff --git a/model-00054-of-00054.safetensors b/model-00054-of-00054.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..33dfeb8838f519f9742529da870214a48937e391
--- /dev/null
+++ b/model-00054-of-00054.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff597e0c7f828a974de3149d022689165724784f00e3443a544b684594c1e9d1
+size 2157982888
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..481f189b62ed8caaf9cd2f0c9d2f6e3e453c1894
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,2130 @@
+{
+ "metadata": {
+ "total_size": 263193047040
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00054-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.0.v1.weight": "model-00001-of-00054.safetensors",
+ "transformer.blocks.0.ffn.experts.mlp_experts.0.w1.weight": "model-00001-of-00054.safetensors",
+
"transformer.blocks.0.ffn.experts.mlp_experts.0.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.1.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.1.w1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.1.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.10.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.10.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.10.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.11.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.11.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.11.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.12.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.12.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.12.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.13.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.13.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.13.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.14.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.14.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.14.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.15.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.15.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.15.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.2.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.2.w1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.2.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.3.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.3.w1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.3.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.4.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.4.w1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.4.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.5.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.5.w1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.5.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.6.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.6.w1.weight": "model-00001-of-00054.safetensors", + 
"transformer.blocks.0.ffn.experts.mlp_experts.6.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.7.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.7.w1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.7.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.8.v1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.8.w1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.8.w2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.9.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.9.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.experts.mlp_experts.9.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.0.ffn.router.layer.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.norm_attn_norm.attn.Wqkv.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.norm_attn_norm.attn.out_proj.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.norm_attn_norm.norm_1.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.0.norm_attn_norm.norm_2.weight": "model-00001-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.0.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.0.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.0.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.1.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.1.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.1.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.10.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.10.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.10.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.11.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.11.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.11.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.12.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.12.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.12.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.13.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.13.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.13.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.14.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.14.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.14.w2.weight": "model-00003-of-00054.safetensors", + 
"transformer.blocks.1.ffn.experts.mlp_experts.15.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.15.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.15.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.2.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.2.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.2.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.3.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.3.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.3.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.4.v1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.4.w1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.4.w2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.5.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.5.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.5.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.6.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.6.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.6.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.7.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.7.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.7.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.8.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.8.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.8.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.9.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.9.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.experts.mlp_experts.9.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.1.ffn.router.layer.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.norm_attn_norm.attn.Wqkv.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.norm_attn_norm.attn.out_proj.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.norm_attn_norm.norm_1.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.1.norm_attn_norm.norm_2.weight": "model-00002-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.0.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.0.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.0.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.1.v1.weight": "model-00014-of-00054.safetensors", + 
"transformer.blocks.10.ffn.experts.mlp_experts.1.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.1.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.10.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.10.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.10.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.11.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.11.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.11.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.12.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.12.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.12.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.13.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.13.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.13.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.14.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.14.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.14.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.15.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.15.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.15.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.2.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.2.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.2.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.3.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.3.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.3.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.4.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.4.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.4.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.5.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.5.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.5.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.6.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.6.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.6.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.7.v1.weight": 
"model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.7.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.7.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.8.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.8.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.8.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.9.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.9.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.experts.mlp_experts.9.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.10.ffn.router.layer.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.norm_attn_norm.attn.Wqkv.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.norm_attn_norm.attn.out_proj.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.norm_attn_norm.norm_1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.10.norm_attn_norm.norm_2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.0.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.0.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.0.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.1.v1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.1.w1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.1.w2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.10.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.10.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.10.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.11.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.11.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.11.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.12.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.12.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.12.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.13.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.13.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.13.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.14.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.14.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.14.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.15.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.15.w1.weight": 
"model-00017-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.15.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.2.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.2.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.2.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.3.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.3.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.3.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.4.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.4.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.4.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.5.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.5.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.5.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.6.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.6.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.6.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.7.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.7.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.7.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.8.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.8.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.8.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.9.v1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.9.w1.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.experts.mlp_experts.9.w2.weight": "model-00016-of-00054.safetensors", + "transformer.blocks.11.ffn.router.layer.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.norm_attn_norm.attn.Wqkv.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.norm_attn_norm.attn.out_proj.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.norm_attn_norm.norm_1.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.11.norm_attn_norm.norm_2.weight": "model-00015-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.0.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.0.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.0.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.1.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.1.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.1.w2.weight": 
"model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.10.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.10.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.10.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.11.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.11.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.11.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.12.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.12.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.12.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.13.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.13.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.13.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.14.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.14.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.14.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.15.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.15.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.15.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.2.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.2.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.2.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.3.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.3.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.3.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.4.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.4.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.4.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.5.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.5.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.5.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.6.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.6.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.6.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.7.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.7.w1.weight": "model-00017-of-00054.safetensors", + 
"transformer.blocks.12.ffn.experts.mlp_experts.7.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.8.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.8.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.8.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.9.v1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.9.w1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.experts.mlp_experts.9.w2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.ffn.router.layer.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.norm_attn_norm.attn.Wqkv.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.norm_attn_norm.attn.out_proj.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.norm_attn_norm.norm_1.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.12.norm_attn_norm.norm_2.weight": "model-00017-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.0.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.0.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.0.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.1.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.1.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.1.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.10.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.10.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.10.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.11.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.11.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.11.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.12.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.12.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.12.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.13.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.13.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.13.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.14.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.14.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.14.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.15.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.15.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.15.w2.weight": "model-00019-of-00054.safetensors", + 
"transformer.blocks.13.ffn.experts.mlp_experts.2.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.2.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.2.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.3.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.3.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.3.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.4.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.4.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.4.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.5.v1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.5.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.5.w2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.6.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.6.w1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.6.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.7.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.7.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.7.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.8.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.8.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.8.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.9.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.9.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.experts.mlp_experts.9.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.13.ffn.router.layer.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.norm_attn_norm.attn.Wqkv.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.norm_attn_norm.attn.out_proj.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.norm_attn_norm.norm_1.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.13.norm_attn_norm.norm_2.weight": "model-00018-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.0.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.0.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.0.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.1.v1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.1.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.1.w2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.10.v1.weight": "model-00020-of-00054.safetensors", + 
"transformer.blocks.14.ffn.experts.mlp_experts.10.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.10.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.11.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.11.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.11.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.12.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.12.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.12.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.13.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.13.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.13.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.14.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.14.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.14.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.15.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.15.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.15.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.2.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.2.w1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.2.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.3.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.3.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.3.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.4.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.4.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.4.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.5.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.5.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.5.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.6.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.6.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.6.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.7.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.7.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.7.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.8.v1.weight": 
"model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.8.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.8.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.9.v1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.9.w1.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.experts.mlp_experts.9.w2.weight": "model-00020-of-00054.safetensors", + "transformer.blocks.14.ffn.router.layer.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.norm_attn_norm.attn.Wqkv.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.norm_attn_norm.attn.out_proj.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.norm_attn_norm.norm_1.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.14.norm_attn_norm.norm_2.weight": "model-00019-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.0.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.0.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.0.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.1.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.1.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.1.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.10.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.10.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.10.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.11.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.11.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.11.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.12.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.12.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.12.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.13.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.13.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.13.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.14.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.14.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.14.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.15.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.15.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.15.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.2.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.2.w1.weight": 
"model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.2.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.3.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.3.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.3.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.4.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.4.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.4.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.5.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.5.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.5.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.6.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.6.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.6.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.7.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.7.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.7.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.8.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.8.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.8.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.9.v1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.9.w1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.experts.mlp_experts.9.w2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.ffn.router.layer.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.norm_attn_norm.attn.Wqkv.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.norm_attn_norm.attn.out_proj.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.norm_attn_norm.norm_1.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.15.norm_attn_norm.norm_2.weight": "model-00021-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.0.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.0.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.0.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.1.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.1.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.1.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.10.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.10.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.10.w2.weight": 
"model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.11.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.11.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.11.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.12.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.12.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.12.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.13.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.13.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.13.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.14.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.14.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.14.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.15.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.15.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.15.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.2.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.2.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.2.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.3.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.3.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.3.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.4.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.4.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.4.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.5.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.5.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.5.w2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.6.v1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.6.w1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.6.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.7.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.7.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.7.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.8.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.8.w1.weight": "model-00023-of-00054.safetensors", + 
"transformer.blocks.16.ffn.experts.mlp_experts.8.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.9.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.9.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.experts.mlp_experts.9.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.16.ffn.router.layer.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.norm_attn_norm.attn.Wqkv.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.norm_attn_norm.attn.out_proj.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.norm_attn_norm.norm_1.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.16.norm_attn_norm.norm_2.weight": "model-00022-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.0.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.0.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.0.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.1.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.1.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.1.w2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.10.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.10.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.10.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.11.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.11.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.11.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.12.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.12.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.12.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.13.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.13.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.13.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.14.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.14.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.14.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.15.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.15.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.15.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.2.v1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.2.w1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.2.w2.weight": "model-00024-of-00054.safetensors", + 
"transformer.blocks.17.ffn.experts.mlp_experts.3.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.3.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.3.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.4.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.4.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.4.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.5.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.5.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.5.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.6.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.6.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.6.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.7.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.7.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.7.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.8.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.8.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.8.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.9.v1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.9.w1.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.experts.mlp_experts.9.w2.weight": "model-00024-of-00054.safetensors", + "transformer.blocks.17.ffn.router.layer.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.norm_attn_norm.attn.Wqkv.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.norm_attn_norm.attn.out_proj.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.norm_attn_norm.norm_1.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.17.norm_attn_norm.norm_2.weight": "model-00023-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.0.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.0.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.0.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.1.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.1.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.1.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.10.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.10.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.10.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.11.v1.weight": "model-00026-of-00054.safetensors", + 
"transformer.blocks.18.ffn.experts.mlp_experts.11.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.11.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.12.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.12.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.12.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.13.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.13.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.13.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.14.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.14.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.14.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.15.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.15.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.15.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.2.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.2.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.2.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.3.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.3.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.3.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.4.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.4.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.4.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.5.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.5.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.5.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.6.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.6.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.6.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.7.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.7.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.7.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.8.v1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.8.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.8.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.9.v1.weight": 
"model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.9.w1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.experts.mlp_experts.9.w2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.ffn.router.layer.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.norm_attn_norm.attn.Wqkv.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.norm_attn_norm.attn.out_proj.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.norm_attn_norm.norm_1.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.18.norm_attn_norm.norm_2.weight": "model-00025-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.0.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.0.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.0.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.1.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.1.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.1.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.10.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.10.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.10.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.11.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.11.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.11.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.12.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.12.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.12.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.13.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.13.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.13.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.14.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.14.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.14.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.15.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.15.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.15.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.2.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.2.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.2.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.3.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.3.w1.weight": 
"model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.3.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.4.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.4.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.4.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.5.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.5.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.5.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.6.v1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.6.w1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.6.w2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.7.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.7.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.7.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.8.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.8.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.8.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.9.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.9.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.experts.mlp_experts.9.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.19.ffn.router.layer.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.norm_attn_norm.attn.Wqkv.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.norm_attn_norm.attn.out_proj.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.norm_attn_norm.norm_1.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.19.norm_attn_norm.norm_2.weight": "model-00026-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.0.v1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.0.w1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.0.w2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.1.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.1.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.1.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.10.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.10.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.10.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.11.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.11.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.11.w2.weight": 
"model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.12.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.12.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.12.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.13.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.13.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.13.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.14.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.14.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.14.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.15.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.15.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.15.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.2.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.2.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.2.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.3.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.3.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.3.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.4.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.4.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.4.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.5.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.5.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.5.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.6.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.6.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.6.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.7.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.7.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.7.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.8.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.8.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.8.w2.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.9.v1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.9.w1.weight": "model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.experts.mlp_experts.9.w2.weight": 
"model-00004-of-00054.safetensors", + "transformer.blocks.2.ffn.router.layer.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.2.norm_attn_norm.attn.Wqkv.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.2.norm_attn_norm.attn.out_proj.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.2.norm_attn_norm.norm_1.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.2.norm_attn_norm.norm_2.weight": "model-00003-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.0.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.0.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.0.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.1.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.1.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.1.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.10.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.10.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.10.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.11.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.11.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.11.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.12.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.12.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.12.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.13.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.13.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.13.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.14.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.14.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.14.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.15.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.15.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.15.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.2.v1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.2.w1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.2.w2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.3.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.3.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.3.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.4.v1.weight": 
"model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.4.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.4.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.5.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.5.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.5.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.6.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.6.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.6.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.7.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.7.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.7.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.8.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.8.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.8.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.9.v1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.9.w1.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.experts.mlp_experts.9.w2.weight": "model-00028-of-00054.safetensors", + "transformer.blocks.20.ffn.router.layer.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.norm_attn_norm.attn.Wqkv.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.norm_attn_norm.attn.out_proj.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.norm_attn_norm.norm_1.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.20.norm_attn_norm.norm_2.weight": "model-00027-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.0.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.0.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.0.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.1.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.1.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.1.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.10.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.10.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.10.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.11.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.11.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.11.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.12.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.12.w1.weight": 
"model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.12.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.13.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.13.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.13.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.14.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.14.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.14.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.15.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.15.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.15.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.2.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.2.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.2.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.3.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.3.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.3.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.4.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.4.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.4.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.5.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.5.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.5.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.6.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.6.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.6.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.7.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.7.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.7.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.8.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.8.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.8.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.9.v1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.9.w1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.experts.mlp_experts.9.w2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.ffn.router.layer.weight": "model-00029-of-00054.safetensors", + 
"transformer.blocks.21.norm_attn_norm.attn.Wqkv.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.norm_attn_norm.attn.out_proj.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.norm_attn_norm.norm_1.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.21.norm_attn_norm.norm_2.weight": "model-00029-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.0.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.0.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.0.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.1.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.1.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.1.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.10.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.10.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.10.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.11.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.11.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.11.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.12.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.12.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.12.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.13.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.13.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.13.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.14.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.14.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.14.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.15.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.15.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.15.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.2.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.2.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.2.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.3.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.3.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.3.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.4.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.4.w1.weight": 
"model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.4.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.5.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.5.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.5.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.6.v1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.6.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.6.w2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.7.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.7.w1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.7.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.8.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.8.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.8.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.9.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.9.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.experts.mlp_experts.9.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.22.ffn.router.layer.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.norm_attn_norm.attn.Wqkv.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.norm_attn_norm.attn.out_proj.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.norm_attn_norm.norm_1.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.22.norm_attn_norm.norm_2.weight": "model-00030-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.0.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.0.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.0.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.1.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.1.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.1.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.10.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.10.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.10.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.11.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.11.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.11.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.12.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.12.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.12.w2.weight": 
"model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.13.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.13.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.13.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.14.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.14.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.14.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.15.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.15.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.15.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.2.v1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.2.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.2.w2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.3.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.3.w1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.3.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.4.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.4.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.4.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.5.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.5.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.5.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.6.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.6.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.6.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.7.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.7.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.7.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.8.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.8.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.8.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.9.v1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.9.w1.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.experts.mlp_experts.9.w2.weight": "model-00032-of-00054.safetensors", + "transformer.blocks.23.ffn.router.layer.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.norm_attn_norm.attn.Wqkv.weight": "model-00031-of-00054.safetensors", + 
"transformer.blocks.23.norm_attn_norm.attn.out_proj.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.norm_attn_norm.norm_1.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.23.norm_attn_norm.norm_2.weight": "model-00031-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.0.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.0.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.0.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.1.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.1.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.1.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.10.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.10.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.10.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.11.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.11.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.11.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.12.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.12.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.12.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.13.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.13.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.13.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.14.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.14.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.14.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.15.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.15.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.15.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.2.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.2.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.2.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.3.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.3.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.3.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.4.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.4.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.4.w2.weight": 
"model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.5.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.5.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.5.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.6.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.6.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.6.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.7.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.7.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.7.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.8.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.8.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.8.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.9.v1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.9.w1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.experts.mlp_experts.9.w2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.ffn.router.layer.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.norm_attn_norm.attn.Wqkv.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.norm_attn_norm.attn.out_proj.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.norm_attn_norm.norm_1.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.24.norm_attn_norm.norm_2.weight": "model-00033-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.0.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.0.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.0.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.1.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.1.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.1.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.10.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.10.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.10.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.11.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.11.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.11.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.12.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.12.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.12.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.13.v1.weight": 
"model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.13.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.13.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.14.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.14.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.14.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.15.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.15.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.15.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.2.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.2.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.2.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.3.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.3.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.3.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.4.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.4.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.4.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.5.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.5.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.5.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.6.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.6.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.6.w2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.7.v1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.7.w1.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.7.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.8.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.8.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.8.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.9.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.9.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.experts.mlp_experts.9.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.25.ffn.router.layer.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.norm_attn_norm.attn.Wqkv.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.norm_attn_norm.attn.out_proj.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.25.norm_attn_norm.norm_1.weight": 
"model-00034-of-00054.safetensors", + "transformer.blocks.25.norm_attn_norm.norm_2.weight": "model-00034-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.0.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.0.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.0.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.1.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.1.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.1.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.10.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.10.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.10.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.11.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.11.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.11.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.12.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.12.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.12.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.13.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.13.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.13.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.14.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.14.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.14.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.15.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.15.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.15.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.2.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.2.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.2.w2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.3.v1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.3.w1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.3.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.4.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.4.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.4.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.5.v1.weight": "model-00036-of-00054.safetensors", + 
"transformer.blocks.26.ffn.experts.mlp_experts.5.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.5.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.6.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.6.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.6.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.7.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.7.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.7.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.8.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.8.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.8.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.9.v1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.9.w1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.experts.mlp_experts.9.w2.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.26.ffn.router.layer.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.norm_attn_norm.attn.Wqkv.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.norm_attn_norm.attn.out_proj.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.norm_attn_norm.norm_1.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.26.norm_attn_norm.norm_2.weight": "model-00035-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.0.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.0.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.0.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.1.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.1.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.1.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.10.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.10.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.10.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.11.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.11.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.11.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.12.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.12.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.12.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.13.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.13.w1.weight": "model-00038-of-00054.safetensors", + 
"transformer.blocks.27.ffn.experts.mlp_experts.13.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.14.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.14.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.14.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.15.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.15.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.15.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.2.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.2.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.2.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.3.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.3.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.3.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.4.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.4.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.4.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.5.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.5.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.5.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.6.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.6.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.6.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.7.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.7.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.7.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.8.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.8.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.8.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.9.v1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.9.w1.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.experts.mlp_experts.9.w2.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.ffn.router.layer.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.norm_attn_norm.attn.Wqkv.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.27.norm_attn_norm.attn.out_proj.weight": "model-00037-of-00054.safetensors", + "transformer.blocks.27.norm_attn_norm.norm_1.weight": "model-00036-of-00054.safetensors", + "transformer.blocks.27.norm_attn_norm.norm_2.weight": "model-00037-of-00054.safetensors", + 
"transformer.blocks.28.ffn.experts.mlp_experts.0.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.0.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.0.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.1.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.1.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.1.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.10.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.10.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.10.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.11.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.11.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.11.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.12.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.12.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.12.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.13.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.13.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.13.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.14.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.14.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.14.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.15.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.15.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.15.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.2.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.2.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.2.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.3.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.3.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.3.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.4.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.4.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.4.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.5.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.5.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.5.w2.weight": 
"model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.6.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.6.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.6.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.7.v1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.7.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.7.w2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.8.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.8.w1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.8.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.9.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.9.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.experts.mlp_experts.9.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.28.ffn.router.layer.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.norm_attn_norm.attn.Wqkv.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.norm_attn_norm.attn.out_proj.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.norm_attn_norm.norm_1.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.28.norm_attn_norm.norm_2.weight": "model-00038-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.0.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.0.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.0.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.1.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.1.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.1.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.10.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.10.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.10.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.11.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.11.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.11.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.12.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.12.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.12.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.13.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.13.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.13.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.14.v1.weight": 
"model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.14.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.14.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.15.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.15.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.15.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.2.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.2.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.2.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.3.v1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.3.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.3.w2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.4.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.4.w1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.4.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.5.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.5.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.5.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.6.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.6.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.6.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.7.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.7.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.7.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.8.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.8.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.8.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.9.v1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.9.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.experts.mlp_experts.9.w2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.29.ffn.router.layer.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.norm_attn_norm.attn.Wqkv.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.norm_attn_norm.attn.out_proj.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.norm_attn_norm.norm_1.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.29.norm_attn_norm.norm_2.weight": "model-00039-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.0.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.0.w1.weight": 
"model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.0.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.1.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.1.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.1.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.10.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.10.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.10.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.11.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.11.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.11.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.12.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.12.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.12.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.13.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.13.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.13.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.14.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.14.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.14.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.15.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.15.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.15.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.2.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.2.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.2.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.3.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.3.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.3.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.4.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.4.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.4.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.5.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.5.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.5.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.6.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.6.w1.weight": 
"model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.6.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.7.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.7.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.7.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.8.v1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.8.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.8.w2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.9.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.9.w1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.ffn.experts.mlp_experts.9.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.3.ffn.router.layer.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.norm_attn_norm.attn.Wqkv.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.norm_attn_norm.attn.out_proj.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.norm_attn_norm.norm_1.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.3.norm_attn_norm.norm_2.weight": "model-00005-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.0.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.0.w1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.0.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.1.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.1.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.1.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.10.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.10.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.10.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.11.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.11.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.11.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.12.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.12.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.12.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.13.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.13.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.13.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.14.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.14.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.14.w2.weight": 
"model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.15.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.15.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.15.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.2.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.2.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.2.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.3.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.3.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.3.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.4.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.4.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.4.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.5.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.5.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.5.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.6.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.6.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.6.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.7.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.7.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.7.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.8.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.8.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.8.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.9.v1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.9.w1.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.experts.mlp_experts.9.w2.weight": "model-00041-of-00054.safetensors", + "transformer.blocks.30.ffn.router.layer.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.30.norm_attn_norm.attn.Wqkv.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.30.norm_attn_norm.attn.out_proj.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.30.norm_attn_norm.norm_1.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.30.norm_attn_norm.norm_2.weight": "model-00040-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.0.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.0.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.0.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.1.v1.weight": 
"model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.1.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.1.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.10.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.10.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.10.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.11.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.11.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.11.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.12.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.12.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.12.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.13.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.13.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.13.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.14.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.14.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.14.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.15.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.15.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.15.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.2.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.2.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.2.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.3.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.3.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.3.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.4.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.4.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.4.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.5.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.5.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.5.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.6.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.6.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.6.w2.weight": "model-00042-of-00054.safetensors", + 
"transformer.blocks.31.ffn.experts.mlp_experts.7.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.7.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.7.w2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.8.v1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.8.w1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.8.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.9.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.9.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.experts.mlp_experts.9.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.31.ffn.router.layer.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.norm_attn_norm.attn.Wqkv.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.norm_attn_norm.attn.out_proj.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.norm_attn_norm.norm_1.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.31.norm_attn_norm.norm_2.weight": "model-00042-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.0.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.0.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.0.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.1.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.1.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.1.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.10.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.10.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.10.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.11.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.11.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.11.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.12.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.12.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.12.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.13.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.13.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.13.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.14.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.14.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.14.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.15.v1.weight": "model-00044-of-00054.safetensors", + 
"transformer.blocks.32.ffn.experts.mlp_experts.15.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.15.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.2.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.2.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.2.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.3.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.3.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.3.w2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.4.v1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.4.w1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.4.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.5.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.5.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.5.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.6.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.6.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.6.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.7.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.7.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.7.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.8.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.8.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.8.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.9.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.9.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.experts.mlp_experts.9.w2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.32.ffn.router.layer.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.norm_attn_norm.attn.Wqkv.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.norm_attn_norm.attn.out_proj.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.norm_attn_norm.norm_1.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.32.norm_attn_norm.norm_2.weight": "model-00043-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.0.v1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.0.w1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.0.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.1.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.1.w1.weight": "model-00045-of-00054.safetensors", + 
"transformer.blocks.33.ffn.experts.mlp_experts.1.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.10.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.10.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.10.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.11.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.11.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.11.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.12.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.12.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.12.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.13.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.13.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.13.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.14.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.14.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.14.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.15.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.15.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.15.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.2.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.2.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.2.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.3.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.3.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.3.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.4.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.4.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.4.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.5.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.5.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.5.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.6.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.6.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.6.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.7.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.7.w1.weight": 
"model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.7.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.8.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.8.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.8.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.9.v1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.9.w1.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.experts.mlp_experts.9.w2.weight": "model-00045-of-00054.safetensors", + "transformer.blocks.33.ffn.router.layer.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.33.norm_attn_norm.attn.Wqkv.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.33.norm_attn_norm.attn.out_proj.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.33.norm_attn_norm.norm_1.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.33.norm_attn_norm.norm_2.weight": "model-00044-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.0.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.0.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.0.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.1.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.1.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.1.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.10.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.10.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.10.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.11.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.11.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.11.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.12.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.12.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.12.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.13.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.13.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.13.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.14.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.14.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.14.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.15.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.15.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.15.w2.weight": 
"model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.2.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.2.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.2.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.3.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.3.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.3.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.4.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.4.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.4.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.5.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.5.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.5.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.6.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.6.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.6.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.7.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.7.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.7.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.8.v1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.8.w1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.8.w2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.9.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.9.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.experts.mlp_experts.9.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.34.ffn.router.layer.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.norm_attn_norm.attn.Wqkv.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.norm_attn_norm.attn.out_proj.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.norm_attn_norm.norm_1.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.34.norm_attn_norm.norm_2.weight": "model-00046-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.0.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.0.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.0.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.1.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.1.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.1.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.10.v1.weight": 
"model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.10.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.10.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.11.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.11.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.11.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.12.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.12.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.12.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.13.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.13.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.13.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.14.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.14.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.14.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.15.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.15.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.15.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.2.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.2.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.2.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.3.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.3.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.3.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.4.v1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.4.w1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.4.w2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.5.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.5.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.5.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.6.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.6.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.6.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.7.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.7.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.7.w2.weight": "model-00048-of-00054.safetensors", + 
"transformer.blocks.35.ffn.experts.mlp_experts.8.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.8.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.8.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.9.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.9.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.experts.mlp_experts.9.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.35.ffn.router.layer.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.norm_attn_norm.attn.Wqkv.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.norm_attn_norm.attn.out_proj.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.norm_attn_norm.norm_1.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.35.norm_attn_norm.norm_2.weight": "model-00047-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.0.v1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.0.w1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.0.w2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.1.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.1.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.1.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.10.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.10.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.10.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.11.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.11.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.11.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.12.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.12.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.12.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.13.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.13.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.13.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.14.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.14.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.14.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.15.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.15.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.15.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.2.v1.weight": "model-00049-of-00054.safetensors", + 
"transformer.blocks.36.ffn.experts.mlp_experts.2.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.2.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.3.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.3.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.3.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.4.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.4.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.4.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.5.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.5.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.5.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.6.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.6.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.6.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.7.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.7.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.7.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.8.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.8.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.8.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.9.v1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.9.w1.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.experts.mlp_experts.9.w2.weight": "model-00049-of-00054.safetensors", + "transformer.blocks.36.ffn.router.layer.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.36.norm_attn_norm.attn.Wqkv.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.36.norm_attn_norm.attn.out_proj.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.36.norm_attn_norm.norm_1.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.36.norm_attn_norm.norm_2.weight": "model-00048-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.0.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.0.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.0.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.1.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.1.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.1.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.10.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.10.w1.weight": "model-00051-of-00054.safetensors", + 
"transformer.blocks.37.ffn.experts.mlp_experts.10.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.11.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.11.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.11.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.12.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.12.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.12.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.13.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.13.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.13.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.14.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.14.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.14.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.15.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.15.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.15.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.2.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.2.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.2.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.3.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.3.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.3.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.4.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.4.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.4.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.5.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.5.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.5.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.6.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.6.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.6.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.7.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.7.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.7.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.8.v1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.8.w1.weight": 
"model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.8.w2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.9.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.9.w1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.ffn.experts.mlp_experts.9.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.37.ffn.router.layer.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.norm_attn_norm.attn.Wqkv.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.norm_attn_norm.attn.out_proj.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.norm_attn_norm.norm_1.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.37.norm_attn_norm.norm_2.weight": "model-00050-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.0.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.0.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.0.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.1.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.1.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.1.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.10.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.10.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.10.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.11.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.11.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.11.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.12.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.12.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.12.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.13.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.13.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.13.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.14.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.14.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.14.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.15.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.15.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.15.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.2.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.2.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.2.w2.weight": 
"model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.3.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.3.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.3.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.4.v1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.4.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.4.w2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.5.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.5.w1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.5.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.6.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.6.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.6.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.7.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.7.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.7.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.8.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.8.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.8.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.9.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.9.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.experts.mlp_experts.9.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.38.ffn.router.layer.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.norm_attn_norm.attn.Wqkv.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.norm_attn_norm.attn.out_proj.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.norm_attn_norm.norm_1.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.38.norm_attn_norm.norm_2.weight": "model-00051-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.0.v1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.0.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.0.w2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.1.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.1.w1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.1.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.10.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.10.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.10.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.11.v1.weight": 
"model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.11.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.11.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.12.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.12.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.12.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.13.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.13.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.13.w2.weight": "model-00054-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.14.v1.weight": "model-00054-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.14.w1.weight": "model-00054-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.14.w2.weight": "model-00054-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.15.v1.weight": "model-00054-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.15.w1.weight": "model-00054-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.15.w2.weight": "model-00054-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.2.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.2.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.2.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.3.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.3.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.3.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.4.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.4.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.4.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.5.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.5.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.5.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.6.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.6.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.6.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.7.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.7.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.7.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.8.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.8.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.8.w2.weight": "model-00053-of-00054.safetensors", + 
"transformer.blocks.39.ffn.experts.mlp_experts.9.v1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.9.w1.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.experts.mlp_experts.9.w2.weight": "model-00053-of-00054.safetensors", + "transformer.blocks.39.ffn.router.layer.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.norm_attn_norm.attn.Wqkv.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.norm_attn_norm.attn.out_proj.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.norm_attn_norm.norm_1.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.39.norm_attn_norm.norm_2.weight": "model-00052-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.0.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.0.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.0.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.1.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.1.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.1.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.10.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.10.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.10.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.11.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.11.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.11.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.12.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.12.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.12.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.13.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.13.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.13.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.14.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.14.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.14.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.15.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.15.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.15.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.2.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.2.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.2.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.3.v1.weight": "model-00006-of-00054.safetensors", + 
"transformer.blocks.4.ffn.experts.mlp_experts.3.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.3.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.4.v1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.4.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.4.w2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.5.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.5.w1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.5.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.6.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.6.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.6.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.7.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.7.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.7.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.8.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.8.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.8.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.9.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.9.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.experts.mlp_experts.9.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.4.ffn.router.layer.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.norm_attn_norm.attn.Wqkv.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.norm_attn_norm.attn.out_proj.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.norm_attn_norm.norm_1.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.4.norm_attn_norm.norm_2.weight": "model-00006-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.0.v1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.0.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.0.w2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.1.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.1.w1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.1.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.10.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.10.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.10.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.11.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.11.w1.weight": "model-00008-of-00054.safetensors", + 
"transformer.blocks.5.ffn.experts.mlp_experts.11.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.12.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.12.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.12.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.13.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.13.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.13.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.14.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.14.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.14.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.15.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.15.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.15.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.2.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.2.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.2.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.3.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.3.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.3.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.4.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.4.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.4.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.5.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.5.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.5.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.6.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.6.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.6.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.7.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.7.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.7.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.8.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.8.w1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.8.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.9.v1.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.experts.mlp_experts.9.w1.weight": "model-00008-of-00054.safetensors", + 
"transformer.blocks.5.ffn.experts.mlp_experts.9.w2.weight": "model-00008-of-00054.safetensors", + "transformer.blocks.5.ffn.router.layer.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.norm_attn_norm.attn.Wqkv.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.norm_attn_norm.attn.out_proj.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.norm_attn_norm.norm_1.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.5.norm_attn_norm.norm_2.weight": "model-00007-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.0.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.0.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.0.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.1.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.1.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.1.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.10.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.10.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.10.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.11.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.11.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.11.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.12.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.12.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.12.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.13.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.13.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.13.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.14.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.14.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.14.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.15.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.15.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.15.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.2.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.2.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.2.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.3.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.3.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.3.w2.weight": "model-00009-of-00054.safetensors", + 
"transformer.blocks.6.ffn.experts.mlp_experts.4.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.4.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.4.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.5.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.5.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.5.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.6.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.6.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.6.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.7.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.7.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.7.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.8.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.8.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.8.w2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.9.v1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.9.w1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.ffn.experts.mlp_experts.9.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.6.ffn.router.layer.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.norm_attn_norm.attn.Wqkv.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.norm_attn_norm.attn.out_proj.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.norm_attn_norm.norm_1.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.6.norm_attn_norm.norm_2.weight": "model-00009-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.0.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.0.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.0.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.1.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.1.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.1.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.10.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.10.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.10.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.11.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.11.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.11.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.12.v1.weight": "model-00011-of-00054.safetensors", + 
"transformer.blocks.7.ffn.experts.mlp_experts.12.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.12.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.13.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.13.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.13.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.14.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.14.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.14.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.15.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.15.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.15.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.2.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.2.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.2.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.3.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.3.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.3.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.4.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.4.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.4.w2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.5.v1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.5.w1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.5.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.6.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.6.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.6.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.7.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.7.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.7.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.8.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.8.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.8.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.9.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.9.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.experts.mlp_experts.9.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.7.ffn.router.layer.weight": "model-00010-of-00054.safetensors", + 
"transformer.blocks.7.norm_attn_norm.attn.Wqkv.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.norm_attn_norm.attn.out_proj.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.norm_attn_norm.norm_1.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.7.norm_attn_norm.norm_2.weight": "model-00010-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.0.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.0.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.0.w2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.1.v1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.1.w1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.1.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.10.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.10.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.10.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.11.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.11.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.11.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.12.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.12.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.12.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.13.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.13.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.13.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.14.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.14.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.14.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.15.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.15.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.15.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.2.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.2.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.2.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.3.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.3.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.3.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.4.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.4.w1.weight": "model-00012-of-00054.safetensors", + 
"transformer.blocks.8.ffn.experts.mlp_experts.4.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.5.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.5.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.5.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.6.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.6.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.6.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.7.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.7.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.7.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.8.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.8.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.8.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.9.v1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.9.w1.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.experts.mlp_experts.9.w2.weight": "model-00012-of-00054.safetensors", + "transformer.blocks.8.ffn.router.layer.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.norm_attn_norm.attn.Wqkv.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.norm_attn_norm.attn.out_proj.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.norm_attn_norm.norm_1.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.8.norm_attn_norm.norm_2.weight": "model-00011-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.0.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.0.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.0.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.1.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.1.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.1.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.10.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.10.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.10.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.11.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.11.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.11.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.12.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.12.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.12.w2.weight": "model-00014-of-00054.safetensors", + 
"transformer.blocks.9.ffn.experts.mlp_experts.13.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.13.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.13.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.14.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.14.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.14.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.15.v1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.15.w1.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.15.w2.weight": "model-00014-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.2.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.2.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.2.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.3.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.3.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.3.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.4.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.4.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.4.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.5.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.5.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.5.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.6.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.6.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.6.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.7.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.7.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.7.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.8.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.8.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.8.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.9.v1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.9.w1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.experts.mlp_experts.9.w2.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.ffn.router.layer.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.norm_attn_norm.attn.Wqkv.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.norm_attn_norm.attn.out_proj.weight": "model-00013-of-00054.safetensors", + 
"transformer.blocks.9.norm_attn_norm.norm_1.weight": "model-00013-of-00054.safetensors", + "transformer.blocks.9.norm_attn_norm.norm_2.weight": "model-00013-of-00054.safetensors", + "transformer.norm_f.weight": "model-00054-of-00054.safetensors", + "transformer.wte.weight": "model-00001-of-00054.safetensors" + } +} diff --git a/modeling_dbrx.py b/modeling_dbrx.py new file mode 100644 index 0000000000000000000000000000000000000000..21f018aaa80c7c0a8522ce03b66c09aa5aec7ee1 --- /dev/null +++ b/modeling_dbrx.py @@ -0,0 +1,1454 @@ +"""PyTorch Dbrx model.""" + +import math +import warnings +from copy import deepcopy +from functools import partial +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from transformers.modeling_outputs import (MoeCausalLMOutputWithPast, + MoeModelOutputWithPast) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import is_flash_attn_2_available, logging + +from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig + +if is_flash_attn_2_available(): + try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input + except: + pass + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = 'DbrxConfig' + +############################################################################# +# Copied from LLaMaRotaryEmbedding +############################################################################# + + +class DbrxRotaryEmbedding(nn.Module): + + def __init__(self, + dim: int, + max_position_embeddings: int = 2048, + base: float = 10000.0, + scaling_factor: float = 1.0): + super().__init__() + self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base**( + torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer('inv_freq', inv_freq, persistent=False) + # For BC we register cos and sin cached + self.max_seq_len_cached = max_position_embeddings + + @torch.no_grad() + def forward( + self, x: torch.Tensor, position_ids: torch.LongTensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand( + position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance( + device_type, str) and device_type != 'mps' else 'cpu' + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() + @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, 
+def apply_rotary_pos_emb(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+        unsqueeze_dim: int = 1) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The `unsqueeze_dim` argument specifies the dimension along which to unsqueeze cos and
+            sin so that they can be properly broadcast to the dimensions of q and k. For example, note
+            that cos and sin have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], setting unsqueeze_dim=1 makes
+            cos and sin broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+
+    Returns:
+        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
+
+    The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
+    (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :,
+                                  None, :, :].expand(batch, num_key_value_heads,
+                                                     n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen,
+                                 head_dim)
+
+
+#############################################################################
+
+#############################################################################
+# Modified from modeling_mixtral
+#############################################################################
+
+
+def load_balancing_loss_func(
+    gate_logits: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    attention_mask: Optional[torch.Tensor],
+) -> torch.Tensor:
+    r"""Computes the auxiliary load balancing loss as in Switch Transformer - implemented in PyTorch.
+
+    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]]):
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts].
+        num_experts (`int`):
+            Number of experts.
+        top_k (`int`):
+            The number of experts each token is routed to.
+        attention_mask (`torch.Tensor`, None):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss.
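+
+    Example (illustrative values only; two layers of router logits for
+    6 tokens and 4 experts, each token routed to top_k=2 experts):
+
+        >>> gate_logits = tuple(torch.randn(6, 4) for _ in range(2))
+        >>> aux_loss = load_balancing_loss_func(
+        ...     gate_logits, num_experts=4, top_k=2, attention_mask=None)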
+ """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return torch.tensor(0.0) + + if isinstance(gate_logits, tuple): + compute_device = gate_logits[0].device + concatenated_gate_logits = torch.cat( + [layer_gate.to(compute_device) for layer_gate in gate_logits], + dim=0) + + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, + dim=-1) + + _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + + expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts) + + if attention_mask is None: + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.mean(routing_weights, dim=0) + else: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // ( + batch_size * sequence_length) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = (attention_mask[None, :, :, None, None].expand( + (num_hidden_layers, batch_size, sequence_length, top_k, + num_experts)).reshape(-1, top_k, num_experts).to(compute_device)) + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.sum( + expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( + expert_attention_mask, dim=0) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None].expand( + (num_hidden_layers, batch_size, sequence_length, + num_experts)).reshape(-1, num_experts).to(compute_device)) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.sum( + routing_weights * router_per_expert_attention_mask, + dim=0) / torch.sum(router_per_expert_attention_mask, dim=0) + + overall_loss = torch.sum(tokens_per_expert * + router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +############################################################################# + + +def resolve_ffn_act_fn( + ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tensor]: + """Resolve the activation function for the feed-forward network. + + Args: + ffn_act_fn (dict): The configuration dictionary for the activation function. + The dict config must specify the 'name' of a torch.nn.functional activation + function. All of other key values pairs are bound to the function as a partial. + + Returns: + Callable[[torch.Tensor], torch.Tensor]: The activation function. 
+ """ + config = deepcopy(ffn_act_fn) + name = config.pop('name') + if not hasattr(nn.functional, name): + raise ValueError(f'Unrecognised activation function name ({name}).') + act = getattr(nn.functional, name) + return partial(act, **config) + + +############################################################################# +# Copied from LLaMaAttention +############################################################################# + + +def _get_unpad_data(attention_mask: torch.Tensor): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), + (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +class DbrxAttention(nn.Module): + """Multi-head self attention.""" + + def __init__(self, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + attn_config: DbrxAttentionConfig, + block_idx: Optional[int] = None): + super().__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = max_position_embeddings + self.block_idx = block_idx + self.config = attn_config + if block_idx is None: + logger.warning_once( + f'Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will ' + + + 'lead to errors during the forward call if caching is used. Please make sure to provide a `block_idx` ' + + 'when creating this class.') + + self.attn_pdrop = attn_config.attn_pdrop + self.clip_qkv = attn_config.clip_qkv + self.num_key_value_heads = attn_config.kv_n_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.rope_theta = attn_config.rope_theta + + self.Wqkv = nn.Linear(self.hidden_size, + self.hidden_size + + 2 * self.num_key_value_heads * self.head_dim, + bias=False) + self.out_proj = nn.Linear(self.hidden_size, + self.hidden_size, + bias=False) + self.rotary_emb = DbrxRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Any, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]: + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.Wqkv(hidden_states) + if self.clip_qkv is not None: + qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query_states, key_states, value_states = qkv_states.split( + [ + self.hidden_size, + self.num_key_value_heads * self.head_dim, + self.num_key_value_heads * self.head_dim, + ], + dim=2, + ) + + query_states = query_states.view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, + self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, + self.head_dim).transpose(1, 2) + + past_key_value = getattr(self, 'past_key_value', past_key_value) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, + key_states, cos, sin) + + if past_key_value is not None: + # sin and 
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {
+                'sin': sin,
+                'cos': cos,
+                'cache_position': cache_position
+            }
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.block_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(
+            2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, :key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights,
+                                             dim=-1,
+                                             dtype=torch.float32).to(
+                                                 query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights,
+                                             p=self.attn_pdrop,
+                                             training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
+                + f' {attn_output.size()}')
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.out_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class DbrxFlashAttention2(DbrxAttention):
+    """Dbrx flash attention module.
+
+    This module inherits from `DbrxAttention`, as the weights of the module stay
+    untouched. The only required change is in the forward pass, where it calls
+    the public API of flash attention.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any):
+        if not is_flash_attn_2_available():
+            raise ImportError(
+                'Flash Attention 2 is not available. Please install it with `pip install flash-attn`.'
+            )
+
+        super().__init__(*args, **kwargs)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Any,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
+               Optional[Tuple[torch.Tensor]]]:
+        logger.info(
+            'Implicitly setting `output_attentions` to False as it is not supported in Flash Attention.'
+        )
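+        # Note: flash attention kernels never materialize the full
+        # softmax(QK^T / sqrt(d)) matrix, so per-head attention weights
+        # cannot be returned from this code path.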
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        qkv_states = self.Wqkv(hidden_states)
+        if self.clip_qkv is not None:
+            qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv)
+
+        query_states, key_states, value_states = qkv_states.split(
+            [
+                self.hidden_size,
+                self.num_key_value_heads * self.head_dim,
+                self.num_key_value_heads * self.head_dim,
+            ],
+            dim=2,
+        )
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim;
+        # the transposes below are undone again before the flash attention call
+        query_states = query_states.view(bsz, q_len, self.num_heads,
+                                         self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+                                     self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
+                                         self.head_dim).transpose(1, 2)
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states,
+                                                        key_states, cos, sin)
+
+        past_key_value = getattr(self, 'past_key_value', past_key_value)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {
+                'sin': sin,
+                'cos': cos,
+                'cache_position': cache_position
+            }
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.block_idx, cache_kwargs)
+
+        # TODO: These transposes are quite inefficient, but Flash Attention requires the layout
+        # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attn_pdrop if self.training else 0.0
+
+        # In PEFT, the layer norms are usually cast to float32 for training stability,
+        # so the input hidden states may get silently cast to float32. Hence, we
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast
+        # the LayerNorms to fp32. (LlamaRMSNorm handles it correctly)
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, '_pre_quantization_dtype'):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = query_states.dtype
+
+            logger.warning_once(
+                'The input hidden states seem to have been silently cast to float32; this might be '
+                +
+                'because you have upcast embedding or layer norm layers to '
+                + f'float32. We will cast the input back to {target_dtype}.')
We will cast back the input in {target_dtype}.') + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + ) + + attn_output = attn_output.reshape(bsz, q_len, + self.hidden_size).contiguous() + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value # type: ignore + + def _flash_attention_forward( + self, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Union[torch.LongTensor, None], + query_length: int, + dropout: float = 0.0, + softmax_scale: Optional[float] = None, + ): + """Use FlashAttention, stripping padding tokens if necessary. + + Args: + query_states (torch.Tensor): Input query states to be passed to Flash Attention API + key_states (torch.Tensor): Input key states to be passed to Flash Attention API + value_states (torch.Tensor): Input value states to be passed to Flash Attention API + attention_mask (torch.LongTensor | None): The padding mask - corresponds to a tensor of size + (batch_size, seq_len) where 0 stands for the position of padding tokens and 1 + for the position of non-padding tokens. + query_length (int): The length of the query sequence + dropout (float): Attention dropout + softmax_scale (float, optional): The scaling of QK^T before applying softmax. + Defaults to 1 / sqrt(head_dim) + """ + causal = True + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, + query_length) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input( + attn_output_unpad, + indices_q, + batch_size, + query_length, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + return attn_output + + def _upad_input(self, query_layer: torch.Tensor, key_layer: torch.Tensor, + value_layer: torch.Tensor, attention_mask: torch.Tensor, + query_length: int): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( + attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, + head_dim), indices_k) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, + head_dim), indices_k) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, + head_dim), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, 
device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +DBRX_ATTENTION_CLASSES = { + 'eager': DbrxAttention, + 'flash_attention_2': DbrxFlashAttention2, +} + + +class DbrxNormAttentionNorm(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + resid_pdrop: float, + attn_implementation: str, + attn_config: DbrxAttentionConfig, + block_idx: Optional[int] = None, + ): + super().__init__() + self.block_idx = block_idx + self.resid_pdrop = resid_pdrop + self.norm_1 = nn.LayerNorm(hidden_size, bias=False) + self.attn = DBRX_ATTENTION_CLASSES[attn_implementation]( + hidden_size=hidden_size, + num_heads=num_heads, + max_position_embeddings=max_position_embeddings, + attn_config=attn_config, + block_idx=block_idx, + ) + self.norm_2 = nn.LayerNorm(hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Any, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[Cache]]: + + residual_states = hidden_states + hidden_states = self.norm_1(hidden_states).to(hidden_states.dtype) + + hidden_states, attn_weights, past_key_value = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = nn.functional.dropout(hidden_states, + p=self.resid_pdrop, + training=self.training) + hidden_states = hidden_states + residual_states + + residual_states = hidden_states + hidden_states = self.norm_2(hidden_states).to(hidden_states.dtype) + + return residual_states, hidden_states, attn_weights, past_key_value + + +class DbrxRouter(nn.Module): + + def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int, + moe_jitter_eps: Optional[float], + moe_normalize_expert_weights: Optional[float], + uniform_expert_assignment: bool): + super().__init__() + self.hidden_size = hidden_size + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.moe_jitter_eps = moe_jitter_eps + self.moe_normalize_expert_weights = moe_normalize_expert_weights + self.uniform_expert_assignment = uniform_expert_assignment + + self.layer = nn.Linear(self.hidden_size, + self.moe_num_experts, + bias=False) + + def jitter(self, x: torch.Tensor) -> torch.Tensor: + if self.moe_jitter_eps is None: + raise RuntimeError('The router does not have moe_jitter_eps set.') + low = 1.0 - self.moe_jitter_eps + high = 1.0 + self.moe_jitter_eps + noise = torch.rand(x.size(), dtype=x.dtype, device=x.device) + return low + noise * (high - low) + + def forward( + self, x: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: + if self.training and self.moe_jitter_eps is not None: + x = x * self.jitter(x) + + weights = 
self.layer(x.view(-1, + x.shape[-1])).softmax(dim=-1, + dtype=torch.float32) + top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) + + if self.moe_normalize_expert_weights: + top_weights = top_weights / torch.norm( + top_weights, + p=self.moe_normalize_expert_weights, + dim=-1, + keepdim=True) + + if self.uniform_expert_assignment: + with torch.no_grad(): + uniform_tensor = torch.arange( + 0, + top_experts.numel(), + device=top_experts.device, + dtype=top_experts.dtype) % self.moe_num_experts + top_experts = uniform_tensor.reshape(top_experts.shape) + # Note, weights and top_weights are not changed + + weights = weights.to(x.dtype) + top_weights = top_weights.to(x.dtype) + return weights, top_weights, top_experts # type: ignore + + +class DbrxExpertGLU(nn.Module): + + def __init__(self, hidden_size: int, ffn_hidden_size: int, ffn_act_fn: dict): + super().__init__() + self.w1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False) + self.v1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False) + self.w2 = nn.Linear(ffn_hidden_size, hidden_size, bias=False) + self.activation_fn = resolve_ffn_act_fn(ffn_act_fn) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x1 = self.w1(x) + x2 = self.v1(x) + x1 = self.activation_fn(x1) + x1 = x1 * x2 + x1 = self.w2(x1) + return x1 + + +class DbrxExperts(nn.Module): + + def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict): + super().__init__() + self.moe_num_experts = moe_num_experts + self.mlp_experts = nn.ModuleList([DbrxExpertGLU(hidden_size, ffn_hidden_size, ffn_act_fn) for _ in range(moe_num_experts)]) + + def forward(self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor) -> torch.Tensor: + bsz, q_len, hidden_size = x.shape + x = x.view(-1, hidden_size) + out = torch.zeros_like(x) + + expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) + for expert_idx in range(0, self.moe_num_experts): + topk_idx, token_idx = torch.where(expert_mask[expert_idx]) + if token_idx.shape[0] == 0: + continue + + token_list = token_idx.tolist() + topk_list = topk_idx.tolist() + + expert_tokens = x[None, token_list].reshape(-1, hidden_size) + expert_out = self.mlp_experts[expert_idx](expert_tokens) * top_weights[token_list, topk_list, None] + + out.index_add_(0, token_idx, expert_out) + + out = out.reshape(bsz, q_len, hidden_size) + return out + + +class DbrxFFN(nn.Module): + + def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): + super().__init__() + + self.router = DbrxRouter( + hidden_size, + moe_num_experts=ffn_config.moe_num_experts, + moe_top_k=ffn_config.moe_top_k, + moe_jitter_eps=ffn_config.moe_jitter_eps, + moe_normalize_expert_weights=ffn_config. 
+ moe_normalize_expert_weights, + uniform_expert_assignment=ffn_config.uniform_expert_assignment, + ) + + self.experts = DbrxExperts( + hidden_size=hidden_size, + ffn_hidden_size=ffn_config.ffn_hidden_size, + moe_num_experts=ffn_config.moe_num_experts, + ffn_act_fn=ffn_config.ffn_act_fn, + ) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + weights, top_weights, top_experts = self.router(x) + out = self.experts(x, weights, top_weights, top_experts) + return out, weights + + +class DbrxBlock(nn.Module): + + def __init__(self, config: DbrxConfig, block_idx: int): + super().__init__() + self.hidden_size = config.d_model + self.resid_pdrop = config.resid_pdrop + self.block_idx = block_idx + self.norm_attn_norm = DbrxNormAttentionNorm( + hidden_size=config.d_model, + num_heads=config.n_heads, + max_position_embeddings=config.max_seq_len, + resid_pdrop=config.resid_pdrop, + attn_implementation=config._attn_implementation, + attn_config=config.attn_config, + block_idx=block_idx, + ) + self.ffn = DbrxFFN(hidden_size=config.d_model, + ffn_config=config.ffn_config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: torch.LongTensor = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Any, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[Cache]], Tuple[ + torch.Tensor, Optional[torch.Tensor], Optional[Cache]], + Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor]], Tuple[ + torch.Tensor, Optional[Cache], Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache], + Optional[torch.Tensor]],]: + """Forward function for DbrxBlock. + + Args: + hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)` + attention_mask (`torch.Tensor`, optional): attention mask of size (batch_size, sequence_length) + if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length) + if default attention is used. + past_key_value (`Tuple(torch.Tensor)`, optional): cached past key and value projection states + output_attentions (`bool`, optional): Whether or not to return the attentions tensors of all + attention layers. See `attentions` under returned tensors for more detail. + output_router_logits (`bool`, optional): Whether or not to return the router logits. + use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are + returned and can be used to speed up decoding (see `past_key_values`). + cache_position (`torch.LongTensor`, optional): position ids of the cache + """ + if 'padding_mask' in kwargs: + warnings.warn( + 'Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure to use `attention_mask` instead.' + ) + + # Norm + Attention + Norm + resid_states, hidden_states, self_attn_weights, present_key_value = self.norm_attn_norm( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + # Fully Connected + hidden_states, router_logits = self.ffn(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, + p=self.resid_pdrop, + training=self.training) + hidden_states = resid_states + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + return outputs + + +class DbrxPreTrainedModel(PreTrainedModel): + config_class = DbrxConfig + base_model_prefix = 'transformer' + supports_gradient_checkpointing = True + _no_split_modules = ['DbrxBlock'] + _skip_keys_device_placement = ['past_key_values'] + _supports_flash_attn_2 = True + _supports_sdpa = False + _supports_cache_class = True + + def _init_weights(self, module: nn.Module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + + def _setup_cache(self, cache_cls: Any, max_batch_size: int, + max_cache_len: int): # TODO: how to set var type of class? + if self.config._attn_implementation == 'flash_attention_2' and cache_cls == StaticCache: + raise ValueError( + '`static` cache implementation is not compatible with ' + + '`attn_implementation==flash_attention_2`. Make sure to use ' + + '`sdpa` in the meantime and open an issue at https://github.com/huggingface/transformers.' + ) + + for block in self.transformer.blocks: + device = block.norm_attn_norm.norm_1.weight.device + if hasattr(self.config, '_pre_quantization_dtype'): + dtype = self.config._pre_quantization_dtype + else: + dtype = block.norm_attn_norm.attn.out_proj.weight.dtype + block.norm_attn_norm.attn.past_key_value = cache_cls(self.config, + max_batch_size, + max_cache_len, + device=device, + dtype=dtype) + + def _reset_cache(self): + for block in self.transformer.blocks: + block.norm_attn_norm.attn.past_key_value = None + + +class DbrxModel(DbrxPreTrainedModel): + """Transformer decoder consisting of *config.num_hidden_layers* + + [`DbrxBlock`] layers. 
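+ + Each block pairs a [`DbrxNormAttentionNorm`] sub-module (LayerNorm, self-attention, LayerNorm) with a [`DbrxFFN`] mixture-of-experts feed-forward layer.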
+ + Args: + config: DbrxConfig + """ + + def __init__(self, config: DbrxConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.emb_pdrop = config.emb_pdrop + + self.wte = nn.Embedding(config.vocab_size, config.d_model, + self.padding_idx) + self.blocks = nn.ModuleList([ + DbrxBlock(config, block_idx) for block_idx in range(config.n_layers) + ]) + self.norm_f = nn.LayerNorm(config.d_model, bias=False) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Embedding: + return self.wte + + def set_input_embeddings(self, value: nn.Embedding): + self.wte = value + + def _autocast_input_embeddings(self, + inputs_embeds: torch.Tensor) -> torch.Tensor: + if inputs_embeds.device.type == 'cuda' and torch.is_autocast_enabled(): + return inputs_embeds.to(dtype=torch.get_autocast_gpu_dtype()) + elif inputs_embeds.device.type == 'cpu' and torch.is_autocast_cpu_enabled( + ): + return inputs_embeds.to(dtype=torch.get_autocast_cpu_dtype()) + else: + return inputs_embeds + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, MoeModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + output_router_logits = (output_router_logits + if output_router_logits is not None else + self.config.output_router_logits) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one' + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.' + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + inputs_embeds = self._autocast_input_embeddings( + inputs_embeds) # type: ignore + inputs_embeds = nn.functional.dropout(inputs_embeds, + p=self.emb_pdrop, + training=self.training) + + past_seen_tokens = 0 + if use_cache: # kept for BC (cache positions) + if not isinstance(past_key_values, StaticCache): + past_key_values = DynamicCache.from_legacy_cache( + past_key_values) + past_seen_tokens = past_key_values.get_seq_length( # type: ignore + ) + + if cache_position is None: + if isinstance(past_key_values, StaticCache): + raise ValueError( + 'cache_position is a required argument when using StaticCache.' 
+ ) + cache_position = torch.arange( # type: ignore + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) # type: ignore + + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, + cache_position) # type: ignore + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = None + + for block in self.blocks: + if output_hidden_states: + all_hidden_states += (hidden_states,) # type: ignore + + if self.gradient_checkpointing and self.training: + block_outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + output_router_logits, + use_cache, + cache_position, + ) + else: + block_outputs = block( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = block_outputs[0] + + if use_cache: + next_decoder_cache = block_outputs[ + 2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (block_outputs[1],) # type: ignore + + if output_router_logits: + all_router_logits += (block_outputs[-1],) # type: ignore + + hidden_states = self.norm_f(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) # type: ignore + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() # type: ignore + if isinstance(next_decoder_cache, Cache) else + next_decoder_cache) + if not return_dict: + return tuple(v for v in [ + hidden_states, next_cache, all_hidden_states, all_self_attns, + all_router_logits + ] if v is not None) + return MoeModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. 
See more context in https://github.com/huggingface/transformers/pull/29114 + def _update_causal_mask( + self, attention_mask: Optional[torch.Tensor], + input_tensor: torch.Tensor, + cache_position: torch.Tensor) -> Optional[torch.Tensor]: + if self.config._attn_implementation == 'flash_attention_2': + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if hasattr(self.blocks[0].norm_attn_norm.attn, + 'past_key_value'): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = (attention_mask.shape[-1] if isinstance( + attention_mask, torch.Tensor) else cache_position[-1] + 1) + target_length = int(target_length) + + causal_mask = torch.full((sequence_length, target_length), + fill_value=min_dtype, + dtype=dtype, + device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange( + target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, + None, :, :].expand(input_tensor.shape[0], 1, + -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone( + ) # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq( + 0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[ + ..., :mask_length].masked_fill(padding_mask, min_dtype) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[ + -2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to( + dtype=dtype) * min_dtype + causal_mask[:mask_shape[0], :mask_shape[1], + offset:mask_shape[2] + + offset, :mask_shape[3]] = mask_slice + + if (self.config._attn_implementation == 'sdpa' and + attention_mask is not None and + attention_mask.device.type == 'cuda'): + # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). + is_tracing = ( + torch.jit.is_tracing() or + isinstance(input_tensor, torch.fx.Proxy) or # type: ignore + (hasattr(torch, '_dynamo') and torch._dynamo.is_compiling())) + if not is_tracing and torch.any(attention_mask != 1): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended( + causal_mask, min_dtype) + + return causal_mask + + +class DbrxForCausalLM(DbrxPreTrainedModel): + + def __init__(self, config: DbrxConfig): + super().__init__(config) + self.transformer = DbrxModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, + config.vocab_size, + bias=False) + self.router_aux_loss_coef = config.router_aux_loss_coef + self.num_experts = config.ffn_config.moe_num_experts + self.num_experts_per_tok = config.ffn_config.moe_top_k + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Embedding: + return self.transformer.get_input_embeddings() + + def set_input_embeddings(self, value: nn.Embedding): + self.transformer.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Linear: + return self.lm_head + + def set_output_embeddings(self, new_embeddings: nn.Linear): + self.lm_head = new_embeddings + + def set_decoder(self, decoder: DbrxModel): + self.transformer = decoder + + def get_decoder(self) -> DbrxModel: + return self.transformer + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, MoeCausalLMOutputWithPast]: + r"""Forward function for causal language modeling. + + Example: + ```python + >>> from transformers import AutoTokenizer, DbrxForCausalLM + + >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx") + >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
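+ >>> # A hypothetical variation of the example above (not from the original card): sampled decoding. + >>> generate_ids = model.generate(inputs.input_ids, max_length=30, do_sample=True, temperature=0.7)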
+ ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + output_router_logits = (output_router_logits + if output_router_logits is not None else + self.config.output_router_logits) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None and loss is not None: + loss += self.router_aux_loss_coef * aux_loss.to( + loss.device) # make sure to reside in the same device + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + def prepare_inputs_for_generation( + self, + input_ids: torch.Tensor, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: Any) -> Dict[str, Any]: + past_length = 0 + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[ + 1] > input_ids.shape[1]: + input_ids = input_ids[:, + -(attention_mask.shape[1] - past_length):] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
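+ # (Illustrative walkthrough, not from the original source: with past_length == 8 and + # input_ids of shape (1, 10), case 2 applies and only the two newest tokens, + # input_ids[:, 8:], are fed through the model on this step.)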
+ + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if (max_cache_length is not None and attention_mask is not None and + cache_length + input_ids.shape[1] > max_cache_length): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get('position_ids', None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1]:] + + if self.generation_config.cache_implementation == 'static': + # generation with static cache + cache_position = kwargs.get('cache_position', None) + if cache_position is None: + past_length = 0 + else: + past_length = cache_position[-1] + 1 + input_ids = input_ids[:, past_length:] + position_ids = position_ids[:, + past_length:] if position_ids is not None else None + + # TODO @gante we should only keep a `cache_position` in generate, and do +=1. + # same goes for position ids. Could also help with continued generation. + input_length = position_ids.shape[ + -1] if position_ids is not None else input_ids.shape[-1] + cache_position = torch.arange(past_length, + past_length + input_length, + device=input_ids.device) + position_ids = position_ids.contiguous( + ) if position_ids is not None else None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. 
+ model_inputs = {'input_ids': input_ids.contiguous()} + + model_inputs.update( + { # type: ignore + 'position_ids': position_ids, + 'cache_position': cache_position, + 'past_key_values': past_key_values, + 'use_cache': kwargs.get('use_cache'), + 'attention_mask': attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values: Cache, beam_idx: torch.LongTensor): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past),) + return reordered_past diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5ed1cbeedb0ca6503c3f2e9141576c8e86279da --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tiktoken.py b/tiktoken.py new file mode 100644 index 0000000000000000000000000000000000000000..bbc0f10c1bbc05d25657755c73b4779c080d2b93 --- /dev/null +++ b/tiktoken.py @@ -0,0 +1,374 @@ +"""Dbrx tokenizer.""" + +from functools import lru_cache +from typing import Any, Dict, List, Optional, Tuple + +from transformers import PreTrainedTokenizer + + +def dbrx_system_prompt(): + # This is inspired by the Claude3 prompt. + # source: https://twitter.com/AmandaAskell/status/1765207842993434880 + # Identity and knowledge + prompt = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\n' + prompt += 'YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\n' + # Capabilities (and reminder to use ``` for JSON blocks and tables, which it can forget). Also a reminder that it can't browse the internet or run code. + prompt += 'You assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n' + prompt += '(You do not have real-time data access or code execution capabilities. ' + # Ethical guidelines + prompt += 'You avoid stereotyping and provide balanced perspectives on controversial topics. ' + # Data: the model doesn't know what it was trained on; it thinks that everything that it is aware of was in its training data. This is a reminder that it wasn't. + # We also encourage it not to try to generate lyrics or poems + prompt += 'You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\n' + # The model really wants to talk about its system prompt, to the point where it is annoying, so encourage it not to + prompt += 'This is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. 
You should be responding appropriately and usually that means not mentioning this.\n' + prompt += 'You do not mention any of this information about yourself unless the information is directly pertinent to the user\\\'s query.'.upper() + return prompt + + +# Taken from +# https://github.com/huggingface/transformers/blob/8aca43bdb3cb9a5020f6d57589d85679dc873b1c/src/transformers/models/gpt2/tokenization_gpt2.py#L62-L84 +@lru_cache() +def bytes_to_unicode(): + """Returns a mapping between utf-8 bytes and unicode strings. + + We specifically avoid mapping to whitespace/control characters that the bpe + code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a + large # of unicode characters in your vocab if you want to avoid UNKs. When + you're at something like a 10B token dataset you end up needing around 5K + for decent coverage. This is a significant percentage of your normal, say, + 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and + unicode strings. + """ + bs = (list(range(ord('!'), + ord('~') + 1)) + list(range(ord('¡'), + ord('¬') + 1)) + + list(range(ord('®'), + ord('ÿ') + 1))) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +class TiktokenTokenizerWrapper(PreTrainedTokenizer): + """A thin wrapper around tiktoken to make it compatible with Hugging Face tokenizers. + + See HuggingFace for further documentation on general tokenizer methods. + """ + + model_input_names = ['input_ids', 'attention_mask'] + + def __init__(self, + model_name: Optional[str] = None, + encoding_name: Optional[str] = None, + add_bos_token: bool = False, + add_eos_token: bool = False, + use_default_system_prompt: bool = False, + unk_token: Optional[str] = '<|endoftext|>', + eos_token: Optional[str] = '<|endoftext|>', + bos_token: Optional[str] = '<|endoftext|>', + pad_token: Optional[str] = None, + errors: str = 'replace', + **kwargs: Any): + """Constructor creates a tiktoken tokenizer to use as the underlying tokenizer. + + Args: + model_name (Optional[str], optional): The name of the model to load from tiktoken. Defaults to None. + Either model_name or encoding_name must be set, but not both. + encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None. + Either model_name or encoding_name must be set, but not both. + add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False. + add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False. + use_default_system_prompt (bool, optional): Use the default system prompt or not. Defaults to False. + unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'. + eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'. + bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'. + pad_token (Optional[str], optional): The pad token. Defaults to None. + errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + Defaults to `"replace"`. + """ + try: + import tiktoken + except ImportError: + raise ImportError( + 'You need to install tiktoken to use TiktokenTokenizerWrapper.') + + # Workaround to make tiktokenizer picklable. 
+ # https://github.com/huggingface/datasets/issues/5536#issuecomment-1682309347 + # There is an open PR from HF to add this to tiktoken: https://github.com/openai/tiktoken/pull/181 + import copyreg + import functools + + from tiktoken import Encoding # type: ignore (thirdParty) + + def pickle_Encoding(enc: Encoding): + return (functools.partial(Encoding, + enc.name, + pat_str=enc._pat_str, + mergeable_ranks=enc._mergeable_ranks, + special_tokens=enc._special_tokens), ()) + + copyreg.pickle(Encoding, pickle_Encoding) + + if model_name is not None and encoding_name is not None: + raise ValueError( + 'You need to specify either model_name or encoding_name, not both.' + ) + + self.model_name = model_name + self.encoding_name = encoding_name + + if self.model_name is not None: + self.encoding = tiktoken.encoding_for_model( # type: ignore (thirdParty) + self.model_name) + elif self.encoding_name is not None: + self.encoding = tiktoken.get_encoding( # type: ignore (thirdParty) + self.encoding_name) + else: + raise ValueError( + 'You need to specify either model_name or encoding_name.') + + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.use_default_system_prompt = use_default_system_prompt + + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + self.errors = errors + + self.decoder: Dict[int, str] = {} + for i in range(self.encoding.n_vocab): + try: + self.encoding.decode_single_token_bytes(i) + except KeyError: + continue + # Taken from + # https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee + decoding = ''.join([ + bytes_to_unicode()[ord(char)] for char in + self.encoding.decode_single_token_bytes(i).decode('latin-1') + ]) + self.decoder[i] = decoding + + self.encoder: Dict[str, int] = {} + for i in range(self.encoding.n_vocab): + if i in self.decoder: + self.encoder[self.decoder[i]] = i + + super().__init__(model_name=model_name, + encoding_name=encoding_name, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + use_default_system_prompt=use_default_system_prompt, + unk_token=unk_token, + eos_token=eos_token, + bos_token=bos_token, + pad_token=pad_token, + errors=errors, + **kwargs) + + @property + def vocab_size(self) -> int: + """Returns vocab size.""" + return self.encoding.n_vocab + + @property + def is_fast(self) -> bool: + return False + + @property + def default_chat_template(self): + """Chat ML Template for User/Assistant. + + Pinning default Chat ML template in case defaults change. 
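+ + For illustration, a single user message 'Hi' with add_generation_prompt=True renders as + '<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n'.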
+ """ + template = ( + "{% if messages[0]['role'] == 'system' %}" + '{% set loop_messages = messages[1:] %}' + "{% set system_message = messages[0]['content'] %}" + "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}" + '{% set loop_messages = messages %}' + "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}" + '{% else %}' + '{% set loop_messages = messages %}' + '{% set system_message = false %}' + '{% endif %}' + '{% for message in loop_messages %}' + '{% if loop.index0 == 0 %}' + '{% if system_message != false %}' + "{{ '<|im_start|>system\n' + system_message.strip() + '<|im_end|>\n'}}" + '{% endif %}' + "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}" + '{% else %}' + "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}" + '{% endif %}' + '{% if (add_generation_prompt == true and loop.last) %}' + "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}" + '{% endif %}' + '{% endfor %}') + template = template.replace( + 'USE_DEFAULT_PROMPT', + 'true' if self.use_default_system_prompt else 'false') + template = template.replace('DEFAULT_SYSTEM_PROMPT', + dbrx_system_prompt()) + return template + + def get_vocab(self) -> Dict[str, int]: + """Returns vocab as a dict.""" + # As far as I can tell, we don't require get_vocab to completely work, + # but when using additional_special_tokens, Hugging Face determines the next + # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct. + vocab_clone = self.encoder.copy() + extra_id_index = 0 + candidate_extra_id = f'' + indices_to_fill_in = {i for i in range(self.vocab_size)} - set( + vocab_clone.values()) + + # Add enough indices to make get_vocab() the right length + for index_to_add in indices_to_fill_in: + # Make sure we don't overwrite a token that already exists + while candidate_extra_id in vocab_clone: + extra_id_index += 1 + candidate_extra_id = f'' + + # Get an index to add and add the item + vocab_clone[candidate_extra_id] = index_to_add + + return vocab_clone + + def _tokenize(self, text: str) -> List[str]: + """Returns a tokenized string.""" + if not isinstance(text, str): + raise ValueError( + f'Expected a string input to _tokenize but got {type(text)}.') + + tokens = [ + self.decoder[t] + for t in self.encoding.encode(text, allowed_special='all') + ] + + return tokens + + def _convert_token_to_id(self, token: str) -> Optional[int]: + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> Optional[str]: + """Converts an index (integer) in a token (str) using the vocab.""" + # For tokens in either the gap in ids in the tokenizer, or beyond the range of the tokenizer, + # we return empty string. This matches the behavior of Hugging Face fast tokenizers, + # but not slow tokenizers. 
+ return self.decoder.get(index, '') + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """Converts a sequence of tokens (string) into a single string.""" + text = ''.join(tokens) + text = bytearray([self.byte_decoder[c] for c in text + ]).decode('utf-8', errors=self.errors) + return text + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """Retrieves sequence ids from a token list that has no special tokens added. + + Function copied from + https://github.com/huggingface/transformers/blob/e3a4bd2bee212a2d0fd9f03b27fe7bfc1debe42d/src/transformers/models/gpt2/tokenization_gpt2.py#L265-L295 + + This method is called when adding special tokens using the + tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + bos_token_id = [1] if self.add_bos_token else [] + eos_token_id = [1] if self.add_eos_token else [] + + if token_ids_1 is None: + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + sep = [self.sep_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + sep) * [0] + return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + + # ignore the below type to keep the original signature + # we are knowingly breaking the signature here, although not 100% certain + # it doesn't have side effects + # There is some code in huggingface that calls this function to get the vocab files, + # but it doesn't seem to access them (or at least checks for their existence + # before accessing them) + return (None, None) # type: ignore + + def sanitize_special_tokens(self) -> int: + """Make sure that all the special tokens attributes of the tokenizer + (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the + vocabulary. + + Add the missing ones to the vocabulary if needed. + + Return: + `int`: The number of tokens added to the vocabulary during the operation. 
+ """ + actual_new_tokens = [] + for token in self.all_special_tokens_extended: + encoded = self.encoding.encode(token, allowed_special='all') + if len(encoded) > 1: + actual_new_tokens.append(token) + + return self.add_tokens(actual_new_tokens, special_tokens=True) diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5affd9ee0aadb839a0f3283a74e687f4ad40b126 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,48 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "added_tokens_decoder": { + "100257": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "100277": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100278": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "auto_map": { + "AutoTokenizer": [ + "tiktoken.TiktokenTokenizerWrapper", + null + ] + }, + "bos_token": "<|endoftext|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "encoding_name": null, + "eos_token": "<|im_end|>", + "errors": "replace", + "model_max_length": 1000000000000000019884624838656, + "model_name": "gpt-4", + "pad_token": "<|pad|>", + "tokenizer_class": "TiktokenTokenizerWrapper", + "unk_token": "<|endoftext|>", + "use_default_system_prompt": false +}