jarrelscy committed on
Commit
2ac94b8
·
verified ·
1 Parent(s): dd2ed88

Upload MixtralForCausalLM

Browse files
config.json CHANGED
@@ -24,7 +24,56 @@
24
  "dataset": "wikitext2",
25
  "desc_act": false,
26
  "group_size": 128,
27
- "modules_in_block_to_quantize": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "quant_method": "gptq",
29
  "sym": true,
30
  "true_sequential": true
@@ -36,6 +85,6 @@
36
  "tie_word_embeddings": false,
37
  "torch_dtype": "float16",
38
  "transformers_version": "4.39.3",
39
- "use_cache": false,
40
  "vocab_size": 32768
41
  }
 
24
  "dataset": "wikitext2",
25
  "desc_act": false,
26
  "group_size": 128,
27
+ "modules_in_block_to_quantize": [
28
+ [
29
+ "self_attn.k_proj",
30
+ "self_attn.v_proj",
31
+ "self_attn.q_proj"
32
+ ],
33
+ [
34
+ "self_attn.o_proj"
35
+ ],
36
+ [
37
+ "block_sparse_moe.experts.0.w1",
38
+ "block_sparse_moe.experts.0.w2",
39
+ "block_sparse_moe.experts.0.w3"
40
+ ],
41
+ [
42
+ "block_sparse_moe.experts.1.w1",
43
+ "block_sparse_moe.experts.1.w2",
44
+ "block_sparse_moe.experts.1.w3"
45
+ ],
46
+ [
47
+ "block_sparse_moe.experts.2.w1",
48
+ "block_sparse_moe.experts.2.w2",
49
+ "block_sparse_moe.experts.2.w3"
50
+ ],
51
+ [
52
+ "block_sparse_moe.experts.3.w1",
53
+ "block_sparse_moe.experts.3.w2",
54
+ "block_sparse_moe.experts.3.w3"
55
+ ],
56
+ [
57
+ "block_sparse_moe.experts.4.w1",
58
+ "block_sparse_moe.experts.4.w2",
59
+ "block_sparse_moe.experts.4.w3"
60
+ ],
61
+ [
62
+ "block_sparse_moe.experts.5.w1",
63
+ "block_sparse_moe.experts.5.w2",
64
+ "block_sparse_moe.experts.5.w3"
65
+ ],
66
+ [
67
+ "block_sparse_moe.experts.6.w1",
68
+ "block_sparse_moe.experts.6.w2",
69
+ "block_sparse_moe.experts.6.w3"
70
+ ],
71
+ [
72
+ "block_sparse_moe.experts.7.w1",
73
+ "block_sparse_moe.experts.7.w2",
74
+ "block_sparse_moe.experts.7.w3"
75
+ ]
76
+ ],
77
  "quant_method": "gptq",
78
  "sym": true,
79
  "true_sequential": true
 
85
  "tie_word_embeddings": false,
86
  "torch_dtype": "float16",
87
  "transformers_version": "4.39.3",
88
+ "use_cache": true,
89
  "vocab_size": 32768
90
  }
model-00001-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04c046ae8f0ccf402ec6b08f6402308187be4409350232c6ac3cba2112796647
3
- size 4984890344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee7fe4d38e1c900d0e5e5a251bab06fac3c173df8818109e0af8e7f89a44b124
3
+ size 4985132728
model-00002-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbe99adbf4bc6d7f28bb70c5b4b9c846979725ea5a8d5e431ba25701c0622351
3
- size 4999568144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fbe866a1d6c91cace9b51846bf298cbdd97c371e86475ab4ad52db3b8ad28ec
3
+ size 4999760248
model-00003-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:481d9f89c907617061f0132c67f1e3b29faff5d1ae0c832bfd3cc925ebf78502
3
- size 4950494384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39992a82031ba7b33200137ed6a9b5786672e2c3d6bc05627a188b00ec2cdc95
3
+ size 4950686368
model-00004-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14179eca9b205cecec039d2a090cfa30d6af62585307faa25445a1965ef661a0
3
- size 4955254808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e799d93d64e39ea2808305fc9e5f043554e843bb7aec083722ab2f42a7c5876f
3
+ size 4955398880
model-00005-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1cc5f8af01512f2c955b80df5c589eb2d94a94d65251e54e36448add56760693
3
- size 4999568816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0d2de9aaf7de39c018fc23ef4f9cb370de165c195d711778cfa117912f9e536
3
+ size 4999760800
model-00006-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a87a041552f53382a8c731f8a18f2a42fb1eeb468d0387f44112c2655a7ce17
3
- size 4950494632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9378fb98aa10078fd99764cefb73895acc1399f3ff7c1d342a58ca40bbaf01b
3
+ size 4950686728
model-00007-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7baaabdb3a5aa973af09503ac31f3d2ad54246b4d890f344b8ae5c63b4ce869
3
- size 4999568824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed74705bb7f52550dc74e59464fabedc311fdebc728e925dddaba91d87e9de16
3
+ size 4999760800
model-00008-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f32a4236c4511c8711144bab24bbfc9e551456dde2e5ed220d35a6d79db90de
3
- size 4950494632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b1a7379f8ffa1a847e7f320644d9d818d7c04ebcedeb22ab32d61d140bd4aa9
3
+ size 4950686728
model-00009-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b797ebf00413c7ac2127115b282004940b33f6b348f9b5d0c2a4c169981ad03
3
- size 4999568824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:556b48d690c2598bcec7836a203e39af86126077f4618af669b52b8971e618c7
3
+ size 4999760800
model-00010-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef47303d080a9bae1fafe5d3b51aac7ce8f65807dc5ba3f55b8a967cca6c647f
3
- size 4956885384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdb2b8a2b7721d090a77dc7095d4768526323f14c71b9a1698bf7c01b6b810a5
3
+ size 4957029456
model-00011-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0421ea2a53380143d52173ef4479008204e55bdfe9261d32d151022b6900e0a
3
- size 4999568824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8189ea0ca0d64236746e00d2a32f50cd8cfb5063a052edbc5cb8c385113cf0ad
3
+ size 4999760800
model-00012-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9deb8be90bafffca69cddccca415d610e1ec69d269da2442f95cff3afbcd49a5
3
- size 4950494632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5614341acb2b11941f9a25cf6acb0a065c1d9b87f6c869c12eec4f0eecdc4ea0
3
+ size 4950686728
model-00013-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ecd14eb036ef3ce831339ff64e1d109bb26d58b48941e2d4f72be0d7626855b
3
- size 4999568824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7b10e681f268eb7753efd3adea3e64778a3b6485e970554386a6e330cc3dd9
3
+ size 4999760800
model-00014-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdd7d67a99dc337e73b1ea92028f4fd1df2866ebcf5cd29bf453fa170787913a
3
- size 4950494632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9de71eac6df45a8dc50a7f1383faa5b8a6767fe126501ffa629e1dc8cea147b
3
+ size 4950686728
model-00015-of-00015.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25c1afe5fa496c0caca169faa252113ef4d8f0106679d19d03c6f19521f742c0
3
- size 4107685704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:054531e9c6efdeb6531a74d19e435df5f60a367677816caf4cd1b4ce45dff328
3
+ size 4107731232
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 73753582464
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00015-of-00015.safetensors",
@@ -125,11 +125,7 @@
125
  "model.layers.0.block_sparse_moe.experts.7.w3.qweight": "model-00001-of-00015.safetensors",
126
  "model.layers.0.block_sparse_moe.experts.7.w3.qzeros": "model-00001-of-00015.safetensors",
127
  "model.layers.0.block_sparse_moe.experts.7.w3.scales": "model-00001-of-00015.safetensors",
128
- "model.layers.0.block_sparse_moe.gate.bias": "model-00001-of-00015.safetensors",
129
- "model.layers.0.block_sparse_moe.gate.g_idx": "model-00001-of-00015.safetensors",
130
- "model.layers.0.block_sparse_moe.gate.qweight": "model-00001-of-00015.safetensors",
131
- "model.layers.0.block_sparse_moe.gate.qzeros": "model-00001-of-00015.safetensors",
132
- "model.layers.0.block_sparse_moe.gate.scales": "model-00001-of-00015.safetensors",
133
  "model.layers.0.input_layernorm.weight": "model-00001-of-00015.safetensors",
134
  "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00015.safetensors",
135
  "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
@@ -272,10 +268,7 @@
272
  "model.layers.1.block_sparse_moe.experts.7.w3.qweight": "model-00001-of-00015.safetensors",
273
  "model.layers.1.block_sparse_moe.experts.7.w3.qzeros": "model-00001-of-00015.safetensors",
274
  "model.layers.1.block_sparse_moe.experts.7.w3.scales": "model-00001-of-00015.safetensors",
275
- "model.layers.1.block_sparse_moe.gate.bias": "model-00001-of-00015.safetensors",
276
- "model.layers.1.block_sparse_moe.gate.g_idx": "model-00001-of-00015.safetensors",
277
- "model.layers.1.block_sparse_moe.gate.qweight": "model-00001-of-00015.safetensors",
278
- "model.layers.1.block_sparse_moe.gate.scales": "model-00001-of-00015.safetensors",
279
  "model.layers.1.input_layernorm.weight": "model-00001-of-00015.safetensors",
280
  "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00015.safetensors",
281
  "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
@@ -418,10 +411,7 @@
418
  "model.layers.10.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
419
  "model.layers.10.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
420
  "model.layers.10.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
421
- "model.layers.10.block_sparse_moe.gate.bias": "model-00003-of-00015.safetensors",
422
- "model.layers.10.block_sparse_moe.gate.g_idx": "model-00003-of-00015.safetensors",
423
- "model.layers.10.block_sparse_moe.gate.qweight": "model-00003-of-00015.safetensors",
424
- "model.layers.10.block_sparse_moe.gate.scales": "model-00003-of-00015.safetensors",
425
  "model.layers.10.input_layernorm.weight": "model-00003-of-00015.safetensors",
426
  "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
427
  "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",
@@ -564,10 +554,7 @@
564
  "model.layers.11.block_sparse_moe.experts.7.w3.qweight": "model-00004-of-00015.safetensors",
565
  "model.layers.11.block_sparse_moe.experts.7.w3.qzeros": "model-00004-of-00015.safetensors",
566
  "model.layers.11.block_sparse_moe.experts.7.w3.scales": "model-00004-of-00015.safetensors",
567
- "model.layers.11.block_sparse_moe.gate.bias": "model-00004-of-00015.safetensors",
568
- "model.layers.11.block_sparse_moe.gate.g_idx": "model-00004-of-00015.safetensors",
569
- "model.layers.11.block_sparse_moe.gate.qweight": "model-00004-of-00015.safetensors",
570
- "model.layers.11.block_sparse_moe.gate.scales": "model-00004-of-00015.safetensors",
571
  "model.layers.11.input_layernorm.weight": "model-00004-of-00015.safetensors",
572
  "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00015.safetensors",
573
  "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",
@@ -710,10 +697,7 @@
710
  "model.layers.12.block_sparse_moe.experts.7.w3.qweight": "model-00004-of-00015.safetensors",
711
  "model.layers.12.block_sparse_moe.experts.7.w3.qzeros": "model-00004-of-00015.safetensors",
712
  "model.layers.12.block_sparse_moe.experts.7.w3.scales": "model-00004-of-00015.safetensors",
713
- "model.layers.12.block_sparse_moe.gate.bias": "model-00004-of-00015.safetensors",
714
- "model.layers.12.block_sparse_moe.gate.g_idx": "model-00004-of-00015.safetensors",
715
- "model.layers.12.block_sparse_moe.gate.qweight": "model-00004-of-00015.safetensors",
716
- "model.layers.12.block_sparse_moe.gate.scales": "model-00004-of-00015.safetensors",
717
  "model.layers.12.input_layernorm.weight": "model-00004-of-00015.safetensors",
718
  "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00015.safetensors",
719
  "model.layers.12.self_attn.k_proj.bias": "model-00004-of-00015.safetensors",
@@ -856,10 +840,7 @@
856
  "model.layers.13.block_sparse_moe.experts.7.w3.qweight": "model-00004-of-00015.safetensors",
857
  "model.layers.13.block_sparse_moe.experts.7.w3.qzeros": "model-00004-of-00015.safetensors",
858
  "model.layers.13.block_sparse_moe.experts.7.w3.scales": "model-00004-of-00015.safetensors",
859
- "model.layers.13.block_sparse_moe.gate.bias": "model-00004-of-00015.safetensors",
860
- "model.layers.13.block_sparse_moe.gate.g_idx": "model-00004-of-00015.safetensors",
861
- "model.layers.13.block_sparse_moe.gate.qweight": "model-00004-of-00015.safetensors",
862
- "model.layers.13.block_sparse_moe.gate.scales": "model-00004-of-00015.safetensors",
863
  "model.layers.13.input_layernorm.weight": "model-00004-of-00015.safetensors",
864
  "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00015.safetensors",
865
  "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00015.safetensors",
@@ -1002,10 +983,7 @@
1002
  "model.layers.14.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
1003
  "model.layers.14.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
1004
  "model.layers.14.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
1005
- "model.layers.14.block_sparse_moe.gate.bias": "model-00005-of-00015.safetensors",
1006
- "model.layers.14.block_sparse_moe.gate.g_idx": "model-00005-of-00015.safetensors",
1007
- "model.layers.14.block_sparse_moe.gate.qweight": "model-00005-of-00015.safetensors",
1008
- "model.layers.14.block_sparse_moe.gate.scales": "model-00005-of-00015.safetensors",
1009
  "model.layers.14.input_layernorm.weight": "model-00005-of-00015.safetensors",
1010
  "model.layers.14.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
1011
  "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00015.safetensors",
@@ -1148,11 +1126,7 @@
1148
  "model.layers.15.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
1149
  "model.layers.15.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
1150
  "model.layers.15.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
1151
- "model.layers.15.block_sparse_moe.gate.bias": "model-00005-of-00015.safetensors",
1152
- "model.layers.15.block_sparse_moe.gate.g_idx": "model-00005-of-00015.safetensors",
1153
- "model.layers.15.block_sparse_moe.gate.qweight": "model-00005-of-00015.safetensors",
1154
- "model.layers.15.block_sparse_moe.gate.qzeros": "model-00005-of-00015.safetensors",
1155
- "model.layers.15.block_sparse_moe.gate.scales": "model-00005-of-00015.safetensors",
1156
  "model.layers.15.input_layernorm.weight": "model-00005-of-00015.safetensors",
1157
  "model.layers.15.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
1158
  "model.layers.15.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
@@ -1295,10 +1269,7 @@
1295
  "model.layers.16.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
1296
  "model.layers.16.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
1297
  "model.layers.16.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
1298
- "model.layers.16.block_sparse_moe.gate.bias": "model-00005-of-00015.safetensors",
1299
- "model.layers.16.block_sparse_moe.gate.g_idx": "model-00005-of-00015.safetensors",
1300
- "model.layers.16.block_sparse_moe.gate.qweight": "model-00005-of-00015.safetensors",
1301
- "model.layers.16.block_sparse_moe.gate.scales": "model-00005-of-00015.safetensors",
1302
  "model.layers.16.input_layernorm.weight": "model-00005-of-00015.safetensors",
1303
  "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
1304
  "model.layers.16.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
@@ -1441,10 +1412,7 @@
1441
  "model.layers.17.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
1442
  "model.layers.17.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
1443
  "model.layers.17.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
1444
- "model.layers.17.block_sparse_moe.gate.bias": "model-00005-of-00015.safetensors",
1445
- "model.layers.17.block_sparse_moe.gate.g_idx": "model-00005-of-00015.safetensors",
1446
- "model.layers.17.block_sparse_moe.gate.qweight": "model-00005-of-00015.safetensors",
1447
- "model.layers.17.block_sparse_moe.gate.scales": "model-00005-of-00015.safetensors",
1448
  "model.layers.17.input_layernorm.weight": "model-00005-of-00015.safetensors",
1449
  "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
1450
  "model.layers.17.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
@@ -1587,10 +1555,7 @@
1587
  "model.layers.18.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
1588
  "model.layers.18.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
1589
  "model.layers.18.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
1590
- "model.layers.18.block_sparse_moe.gate.bias": "model-00006-of-00015.safetensors",
1591
- "model.layers.18.block_sparse_moe.gate.g_idx": "model-00006-of-00015.safetensors",
1592
- "model.layers.18.block_sparse_moe.gate.qweight": "model-00006-of-00015.safetensors",
1593
- "model.layers.18.block_sparse_moe.gate.scales": "model-00006-of-00015.safetensors",
1594
  "model.layers.18.input_layernorm.weight": "model-00006-of-00015.safetensors",
1595
  "model.layers.18.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
1596
  "model.layers.18.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
@@ -1733,10 +1698,7 @@
1733
  "model.layers.19.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
1734
  "model.layers.19.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
1735
  "model.layers.19.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
1736
- "model.layers.19.block_sparse_moe.gate.bias": "model-00006-of-00015.safetensors",
1737
- "model.layers.19.block_sparse_moe.gate.g_idx": "model-00006-of-00015.safetensors",
1738
- "model.layers.19.block_sparse_moe.gate.qweight": "model-00006-of-00015.safetensors",
1739
- "model.layers.19.block_sparse_moe.gate.scales": "model-00006-of-00015.safetensors",
1740
  "model.layers.19.input_layernorm.weight": "model-00006-of-00015.safetensors",
1741
  "model.layers.19.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
1742
  "model.layers.19.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
@@ -1879,10 +1841,7 @@
1879
  "model.layers.2.block_sparse_moe.experts.7.w3.qweight": "model-00001-of-00015.safetensors",
1880
  "model.layers.2.block_sparse_moe.experts.7.w3.qzeros": "model-00001-of-00015.safetensors",
1881
  "model.layers.2.block_sparse_moe.experts.7.w3.scales": "model-00001-of-00015.safetensors",
1882
- "model.layers.2.block_sparse_moe.gate.bias": "model-00001-of-00015.safetensors",
1883
- "model.layers.2.block_sparse_moe.gate.g_idx": "model-00001-of-00015.safetensors",
1884
- "model.layers.2.block_sparse_moe.gate.qweight": "model-00001-of-00015.safetensors",
1885
- "model.layers.2.block_sparse_moe.gate.scales": "model-00001-of-00015.safetensors",
1886
  "model.layers.2.input_layernorm.weight": "model-00001-of-00015.safetensors",
1887
  "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00015.safetensors",
1888
  "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
@@ -2025,10 +1984,7 @@
2025
  "model.layers.20.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
2026
  "model.layers.20.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
2027
  "model.layers.20.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
2028
- "model.layers.20.block_sparse_moe.gate.bias": "model-00006-of-00015.safetensors",
2029
- "model.layers.20.block_sparse_moe.gate.g_idx": "model-00006-of-00015.safetensors",
2030
- "model.layers.20.block_sparse_moe.gate.qweight": "model-00006-of-00015.safetensors",
2031
- "model.layers.20.block_sparse_moe.gate.scales": "model-00006-of-00015.safetensors",
2032
  "model.layers.20.input_layernorm.weight": "model-00006-of-00015.safetensors",
2033
  "model.layers.20.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
2034
  "model.layers.20.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
@@ -2171,10 +2127,7 @@
2171
  "model.layers.21.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
2172
  "model.layers.21.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
2173
  "model.layers.21.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
2174
- "model.layers.21.block_sparse_moe.gate.bias": "model-00006-of-00015.safetensors",
2175
- "model.layers.21.block_sparse_moe.gate.g_idx": "model-00006-of-00015.safetensors",
2176
- "model.layers.21.block_sparse_moe.gate.qweight": "model-00006-of-00015.safetensors",
2177
- "model.layers.21.block_sparse_moe.gate.scales": "model-00006-of-00015.safetensors",
2178
  "model.layers.21.input_layernorm.weight": "model-00006-of-00015.safetensors",
2179
  "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
2180
  "model.layers.21.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
@@ -2317,10 +2270,7 @@
2317
  "model.layers.22.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2318
  "model.layers.22.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2319
  "model.layers.22.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2320
- "model.layers.22.block_sparse_moe.gate.bias": "model-00007-of-00015.safetensors",
2321
- "model.layers.22.block_sparse_moe.gate.g_idx": "model-00007-of-00015.safetensors",
2322
- "model.layers.22.block_sparse_moe.gate.qweight": "model-00007-of-00015.safetensors",
2323
- "model.layers.22.block_sparse_moe.gate.scales": "model-00007-of-00015.safetensors",
2324
  "model.layers.22.input_layernorm.weight": "model-00007-of-00015.safetensors",
2325
  "model.layers.22.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2326
  "model.layers.22.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
@@ -2463,11 +2413,7 @@
2463
  "model.layers.23.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2464
  "model.layers.23.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2465
  "model.layers.23.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2466
- "model.layers.23.block_sparse_moe.gate.bias": "model-00007-of-00015.safetensors",
2467
- "model.layers.23.block_sparse_moe.gate.g_idx": "model-00007-of-00015.safetensors",
2468
- "model.layers.23.block_sparse_moe.gate.qweight": "model-00007-of-00015.safetensors",
2469
- "model.layers.23.block_sparse_moe.gate.qzeros": "model-00007-of-00015.safetensors",
2470
- "model.layers.23.block_sparse_moe.gate.scales": "model-00007-of-00015.safetensors",
2471
  "model.layers.23.input_layernorm.weight": "model-00007-of-00015.safetensors",
2472
  "model.layers.23.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2473
  "model.layers.23.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
@@ -2610,10 +2556,7 @@
2610
  "model.layers.24.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2611
  "model.layers.24.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2612
  "model.layers.24.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2613
- "model.layers.24.block_sparse_moe.gate.bias": "model-00007-of-00015.safetensors",
2614
- "model.layers.24.block_sparse_moe.gate.g_idx": "model-00007-of-00015.safetensors",
2615
- "model.layers.24.block_sparse_moe.gate.qweight": "model-00007-of-00015.safetensors",
2616
- "model.layers.24.block_sparse_moe.gate.scales": "model-00007-of-00015.safetensors",
2617
  "model.layers.24.input_layernorm.weight": "model-00007-of-00015.safetensors",
2618
  "model.layers.24.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2619
  "model.layers.24.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
@@ -2756,10 +2699,7 @@
2756
  "model.layers.25.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2757
  "model.layers.25.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2758
  "model.layers.25.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2759
- "model.layers.25.block_sparse_moe.gate.bias": "model-00007-of-00015.safetensors",
2760
- "model.layers.25.block_sparse_moe.gate.g_idx": "model-00007-of-00015.safetensors",
2761
- "model.layers.25.block_sparse_moe.gate.qweight": "model-00007-of-00015.safetensors",
2762
- "model.layers.25.block_sparse_moe.gate.scales": "model-00007-of-00015.safetensors",
2763
  "model.layers.25.input_layernorm.weight": "model-00007-of-00015.safetensors",
2764
  "model.layers.25.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2765
  "model.layers.25.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
@@ -2902,10 +2842,7 @@
2902
  "model.layers.26.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
2903
  "model.layers.26.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
2904
  "model.layers.26.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
2905
- "model.layers.26.block_sparse_moe.gate.bias": "model-00008-of-00015.safetensors",
2906
- "model.layers.26.block_sparse_moe.gate.g_idx": "model-00008-of-00015.safetensors",
2907
- "model.layers.26.block_sparse_moe.gate.qweight": "model-00008-of-00015.safetensors",
2908
- "model.layers.26.block_sparse_moe.gate.scales": "model-00008-of-00015.safetensors",
2909
  "model.layers.26.input_layernorm.weight": "model-00008-of-00015.safetensors",
2910
  "model.layers.26.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
2911
  "model.layers.26.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
@@ -3048,10 +2985,7 @@
3048
  "model.layers.27.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
3049
  "model.layers.27.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
3050
  "model.layers.27.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
3051
- "model.layers.27.block_sparse_moe.gate.bias": "model-00008-of-00015.safetensors",
3052
- "model.layers.27.block_sparse_moe.gate.g_idx": "model-00008-of-00015.safetensors",
3053
- "model.layers.27.block_sparse_moe.gate.qweight": "model-00008-of-00015.safetensors",
3054
- "model.layers.27.block_sparse_moe.gate.scales": "model-00008-of-00015.safetensors",
3055
  "model.layers.27.input_layernorm.weight": "model-00008-of-00015.safetensors",
3056
  "model.layers.27.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
3057
  "model.layers.27.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
@@ -3194,10 +3128,7 @@
3194
  "model.layers.28.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
3195
  "model.layers.28.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
3196
  "model.layers.28.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
3197
- "model.layers.28.block_sparse_moe.gate.bias": "model-00008-of-00015.safetensors",
3198
- "model.layers.28.block_sparse_moe.gate.g_idx": "model-00008-of-00015.safetensors",
3199
- "model.layers.28.block_sparse_moe.gate.qweight": "model-00008-of-00015.safetensors",
3200
- "model.layers.28.block_sparse_moe.gate.scales": "model-00008-of-00015.safetensors",
3201
  "model.layers.28.input_layernorm.weight": "model-00008-of-00015.safetensors",
3202
  "model.layers.28.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
3203
  "model.layers.28.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
@@ -3340,10 +3271,7 @@
3340
  "model.layers.29.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
3341
  "model.layers.29.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
3342
  "model.layers.29.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
3343
- "model.layers.29.block_sparse_moe.gate.bias": "model-00008-of-00015.safetensors",
3344
- "model.layers.29.block_sparse_moe.gate.g_idx": "model-00008-of-00015.safetensors",
3345
- "model.layers.29.block_sparse_moe.gate.qweight": "model-00008-of-00015.safetensors",
3346
- "model.layers.29.block_sparse_moe.gate.scales": "model-00008-of-00015.safetensors",
3347
  "model.layers.29.input_layernorm.weight": "model-00008-of-00015.safetensors",
3348
  "model.layers.29.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
3349
  "model.layers.29.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
@@ -3486,10 +3414,7 @@
3486
  "model.layers.3.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
3487
  "model.layers.3.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
3488
  "model.layers.3.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
3489
- "model.layers.3.block_sparse_moe.gate.bias": "model-00002-of-00015.safetensors",
3490
- "model.layers.3.block_sparse_moe.gate.g_idx": "model-00002-of-00015.safetensors",
3491
- "model.layers.3.block_sparse_moe.gate.qweight": "model-00002-of-00015.safetensors",
3492
- "model.layers.3.block_sparse_moe.gate.scales": "model-00002-of-00015.safetensors",
3493
  "model.layers.3.input_layernorm.weight": "model-00002-of-00015.safetensors",
3494
  "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
3495
  "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
@@ -3632,10 +3557,7 @@
3632
  "model.layers.30.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
3633
  "model.layers.30.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
3634
  "model.layers.30.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
3635
- "model.layers.30.block_sparse_moe.gate.bias": "model-00009-of-00015.safetensors",
3636
- "model.layers.30.block_sparse_moe.gate.g_idx": "model-00009-of-00015.safetensors",
3637
- "model.layers.30.block_sparse_moe.gate.qweight": "model-00009-of-00015.safetensors",
3638
- "model.layers.30.block_sparse_moe.gate.scales": "model-00009-of-00015.safetensors",
3639
  "model.layers.30.input_layernorm.weight": "model-00009-of-00015.safetensors",
3640
  "model.layers.30.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
3641
  "model.layers.30.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
@@ -3778,11 +3700,7 @@
3778
  "model.layers.31.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
3779
  "model.layers.31.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
3780
  "model.layers.31.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
3781
- "model.layers.31.block_sparse_moe.gate.bias": "model-00009-of-00015.safetensors",
3782
- "model.layers.31.block_sparse_moe.gate.g_idx": "model-00009-of-00015.safetensors",
3783
- "model.layers.31.block_sparse_moe.gate.qweight": "model-00009-of-00015.safetensors",
3784
- "model.layers.31.block_sparse_moe.gate.qzeros": "model-00009-of-00015.safetensors",
3785
- "model.layers.31.block_sparse_moe.gate.scales": "model-00009-of-00015.safetensors",
3786
  "model.layers.31.input_layernorm.weight": "model-00009-of-00015.safetensors",
3787
  "model.layers.31.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
3788
  "model.layers.31.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
@@ -3925,10 +3843,7 @@
3925
  "model.layers.32.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
3926
  "model.layers.32.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
3927
  "model.layers.32.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
3928
- "model.layers.32.block_sparse_moe.gate.bias": "model-00009-of-00015.safetensors",
3929
- "model.layers.32.block_sparse_moe.gate.g_idx": "model-00009-of-00015.safetensors",
3930
- "model.layers.32.block_sparse_moe.gate.qweight": "model-00009-of-00015.safetensors",
3931
- "model.layers.32.block_sparse_moe.gate.scales": "model-00009-of-00015.safetensors",
3932
  "model.layers.32.input_layernorm.weight": "model-00009-of-00015.safetensors",
3933
  "model.layers.32.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
3934
  "model.layers.32.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
@@ -4071,10 +3986,7 @@
4071
  "model.layers.33.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
4072
  "model.layers.33.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
4073
  "model.layers.33.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
4074
- "model.layers.33.block_sparse_moe.gate.bias": "model-00009-of-00015.safetensors",
4075
- "model.layers.33.block_sparse_moe.gate.g_idx": "model-00009-of-00015.safetensors",
4076
- "model.layers.33.block_sparse_moe.gate.qweight": "model-00009-of-00015.safetensors",
4077
- "model.layers.33.block_sparse_moe.gate.scales": "model-00009-of-00015.safetensors",
4078
  "model.layers.33.input_layernorm.weight": "model-00009-of-00015.safetensors",
4079
  "model.layers.33.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
4080
  "model.layers.33.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
@@ -4217,10 +4129,7 @@
4217
  "model.layers.34.block_sparse_moe.experts.7.w3.qweight": "model-00010-of-00015.safetensors",
4218
  "model.layers.34.block_sparse_moe.experts.7.w3.qzeros": "model-00010-of-00015.safetensors",
4219
  "model.layers.34.block_sparse_moe.experts.7.w3.scales": "model-00010-of-00015.safetensors",
4220
- "model.layers.34.block_sparse_moe.gate.bias": "model-00010-of-00015.safetensors",
4221
- "model.layers.34.block_sparse_moe.gate.g_idx": "model-00010-of-00015.safetensors",
4222
- "model.layers.34.block_sparse_moe.gate.qweight": "model-00010-of-00015.safetensors",
4223
- "model.layers.34.block_sparse_moe.gate.scales": "model-00010-of-00015.safetensors",
4224
  "model.layers.34.input_layernorm.weight": "model-00010-of-00015.safetensors",
4225
  "model.layers.34.post_attention_layernorm.weight": "model-00010-of-00015.safetensors",
4226
  "model.layers.34.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
@@ -4363,10 +4272,7 @@
4363
  "model.layers.35.block_sparse_moe.experts.7.w3.qweight": "model-00010-of-00015.safetensors",
4364
  "model.layers.35.block_sparse_moe.experts.7.w3.qzeros": "model-00010-of-00015.safetensors",
4365
  "model.layers.35.block_sparse_moe.experts.7.w3.scales": "model-00010-of-00015.safetensors",
4366
- "model.layers.35.block_sparse_moe.gate.bias": "model-00010-of-00015.safetensors",
4367
- "model.layers.35.block_sparse_moe.gate.g_idx": "model-00010-of-00015.safetensors",
4368
- "model.layers.35.block_sparse_moe.gate.qweight": "model-00010-of-00015.safetensors",
4369
- "model.layers.35.block_sparse_moe.gate.scales": "model-00010-of-00015.safetensors",
4370
  "model.layers.35.input_layernorm.weight": "model-00010-of-00015.safetensors",
4371
  "model.layers.35.post_attention_layernorm.weight": "model-00010-of-00015.safetensors",
4372
  "model.layers.35.self_attn.k_proj.bias": "model-00010-of-00015.safetensors",
@@ -4509,10 +4415,7 @@
4509
  "model.layers.36.block_sparse_moe.experts.7.w3.qweight": "model-00010-of-00015.safetensors",
4510
  "model.layers.36.block_sparse_moe.experts.7.w3.qzeros": "model-00010-of-00015.safetensors",
4511
  "model.layers.36.block_sparse_moe.experts.7.w3.scales": "model-00010-of-00015.safetensors",
4512
- "model.layers.36.block_sparse_moe.gate.bias": "model-00010-of-00015.safetensors",
4513
- "model.layers.36.block_sparse_moe.gate.g_idx": "model-00010-of-00015.safetensors",
4514
- "model.layers.36.block_sparse_moe.gate.qweight": "model-00010-of-00015.safetensors",
4515
- "model.layers.36.block_sparse_moe.gate.scales": "model-00010-of-00015.safetensors",
4516
  "model.layers.36.input_layernorm.weight": "model-00010-of-00015.safetensors",
4517
  "model.layers.36.post_attention_layernorm.weight": "model-00010-of-00015.safetensors",
4518
  "model.layers.36.self_attn.k_proj.bias": "model-00010-of-00015.safetensors",
@@ -4655,10 +4558,7 @@
4655
  "model.layers.37.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
4656
  "model.layers.37.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
4657
  "model.layers.37.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
4658
- "model.layers.37.block_sparse_moe.gate.bias": "model-00011-of-00015.safetensors",
4659
- "model.layers.37.block_sparse_moe.gate.g_idx": "model-00011-of-00015.safetensors",
4660
- "model.layers.37.block_sparse_moe.gate.qweight": "model-00011-of-00015.safetensors",
4661
- "model.layers.37.block_sparse_moe.gate.scales": "model-00011-of-00015.safetensors",
4662
  "model.layers.37.input_layernorm.weight": "model-00011-of-00015.safetensors",
4663
  "model.layers.37.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
4664
  "model.layers.37.self_attn.k_proj.bias": "model-00010-of-00015.safetensors",
@@ -4801,10 +4701,7 @@
4801
  "model.layers.38.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
4802
  "model.layers.38.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
4803
  "model.layers.38.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
4804
- "model.layers.38.block_sparse_moe.gate.bias": "model-00011-of-00015.safetensors",
4805
- "model.layers.38.block_sparse_moe.gate.g_idx": "model-00011-of-00015.safetensors",
4806
- "model.layers.38.block_sparse_moe.gate.qweight": "model-00011-of-00015.safetensors",
4807
- "model.layers.38.block_sparse_moe.gate.scales": "model-00011-of-00015.safetensors",
4808
  "model.layers.38.input_layernorm.weight": "model-00011-of-00015.safetensors",
4809
  "model.layers.38.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
4810
  "model.layers.38.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
@@ -4947,11 +4844,7 @@
4947
  "model.layers.39.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
4948
  "model.layers.39.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
4949
  "model.layers.39.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
4950
- "model.layers.39.block_sparse_moe.gate.bias": "model-00011-of-00015.safetensors",
4951
- "model.layers.39.block_sparse_moe.gate.g_idx": "model-00011-of-00015.safetensors",
4952
- "model.layers.39.block_sparse_moe.gate.qweight": "model-00011-of-00015.safetensors",
4953
- "model.layers.39.block_sparse_moe.gate.qzeros": "model-00011-of-00015.safetensors",
4954
- "model.layers.39.block_sparse_moe.gate.scales": "model-00011-of-00015.safetensors",
4955
  "model.layers.39.input_layernorm.weight": "model-00011-of-00015.safetensors",
4956
  "model.layers.39.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
4957
  "model.layers.39.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
@@ -5094,10 +4987,7 @@
5094
  "model.layers.4.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
5095
  "model.layers.4.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
5096
  "model.layers.4.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
5097
- "model.layers.4.block_sparse_moe.gate.bias": "model-00002-of-00015.safetensors",
5098
- "model.layers.4.block_sparse_moe.gate.g_idx": "model-00002-of-00015.safetensors",
5099
- "model.layers.4.block_sparse_moe.gate.qweight": "model-00002-of-00015.safetensors",
5100
- "model.layers.4.block_sparse_moe.gate.scales": "model-00002-of-00015.safetensors",
5101
  "model.layers.4.input_layernorm.weight": "model-00002-of-00015.safetensors",
5102
  "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
5103
  "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
@@ -5240,10 +5130,7 @@
5240
  "model.layers.40.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
5241
  "model.layers.40.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
5242
  "model.layers.40.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
5243
- "model.layers.40.block_sparse_moe.gate.bias": "model-00011-of-00015.safetensors",
5244
- "model.layers.40.block_sparse_moe.gate.g_idx": "model-00011-of-00015.safetensors",
5245
- "model.layers.40.block_sparse_moe.gate.qweight": "model-00011-of-00015.safetensors",
5246
- "model.layers.40.block_sparse_moe.gate.scales": "model-00011-of-00015.safetensors",
5247
  "model.layers.40.input_layernorm.weight": "model-00011-of-00015.safetensors",
5248
  "model.layers.40.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
5249
  "model.layers.40.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
@@ -5386,10 +5273,7 @@
5386
  "model.layers.41.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5387
  "model.layers.41.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5388
  "model.layers.41.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5389
- "model.layers.41.block_sparse_moe.gate.bias": "model-00012-of-00015.safetensors",
5390
- "model.layers.41.block_sparse_moe.gate.g_idx": "model-00012-of-00015.safetensors",
5391
- "model.layers.41.block_sparse_moe.gate.qweight": "model-00012-of-00015.safetensors",
5392
- "model.layers.41.block_sparse_moe.gate.scales": "model-00012-of-00015.safetensors",
5393
  "model.layers.41.input_layernorm.weight": "model-00012-of-00015.safetensors",
5394
  "model.layers.41.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5395
  "model.layers.41.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
@@ -5532,10 +5416,7 @@
5532
  "model.layers.42.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5533
  "model.layers.42.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5534
  "model.layers.42.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5535
- "model.layers.42.block_sparse_moe.gate.bias": "model-00012-of-00015.safetensors",
5536
- "model.layers.42.block_sparse_moe.gate.g_idx": "model-00012-of-00015.safetensors",
5537
- "model.layers.42.block_sparse_moe.gate.qweight": "model-00012-of-00015.safetensors",
5538
- "model.layers.42.block_sparse_moe.gate.scales": "model-00012-of-00015.safetensors",
5539
  "model.layers.42.input_layernorm.weight": "model-00012-of-00015.safetensors",
5540
  "model.layers.42.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5541
  "model.layers.42.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
@@ -5678,10 +5559,7 @@
5678
  "model.layers.43.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5679
  "model.layers.43.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5680
  "model.layers.43.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5681
- "model.layers.43.block_sparse_moe.gate.bias": "model-00012-of-00015.safetensors",
5682
- "model.layers.43.block_sparse_moe.gate.g_idx": "model-00012-of-00015.safetensors",
5683
- "model.layers.43.block_sparse_moe.gate.qweight": "model-00012-of-00015.safetensors",
5684
- "model.layers.43.block_sparse_moe.gate.scales": "model-00012-of-00015.safetensors",
5685
  "model.layers.43.input_layernorm.weight": "model-00012-of-00015.safetensors",
5686
  "model.layers.43.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5687
  "model.layers.43.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
@@ -5824,10 +5702,7 @@
5824
  "model.layers.44.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5825
  "model.layers.44.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5826
  "model.layers.44.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5827
- "model.layers.44.block_sparse_moe.gate.bias": "model-00012-of-00015.safetensors",
5828
- "model.layers.44.block_sparse_moe.gate.g_idx": "model-00012-of-00015.safetensors",
5829
- "model.layers.44.block_sparse_moe.gate.qweight": "model-00012-of-00015.safetensors",
5830
- "model.layers.44.block_sparse_moe.gate.scales": "model-00012-of-00015.safetensors",
5831
  "model.layers.44.input_layernorm.weight": "model-00012-of-00015.safetensors",
5832
  "model.layers.44.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5833
  "model.layers.44.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
@@ -5970,10 +5845,7 @@
5970
  "model.layers.45.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
5971
  "model.layers.45.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
5972
  "model.layers.45.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
5973
- "model.layers.45.block_sparse_moe.gate.bias": "model-00013-of-00015.safetensors",
5974
- "model.layers.45.block_sparse_moe.gate.g_idx": "model-00013-of-00015.safetensors",
5975
- "model.layers.45.block_sparse_moe.gate.qweight": "model-00013-of-00015.safetensors",
5976
- "model.layers.45.block_sparse_moe.gate.scales": "model-00013-of-00015.safetensors",
5977
  "model.layers.45.input_layernorm.weight": "model-00013-of-00015.safetensors",
5978
  "model.layers.45.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
5979
  "model.layers.45.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
@@ -6116,10 +5988,7 @@
6116
  "model.layers.46.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
6117
  "model.layers.46.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
6118
  "model.layers.46.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
6119
- "model.layers.46.block_sparse_moe.gate.bias": "model-00013-of-00015.safetensors",
6120
- "model.layers.46.block_sparse_moe.gate.g_idx": "model-00013-of-00015.safetensors",
6121
- "model.layers.46.block_sparse_moe.gate.qweight": "model-00013-of-00015.safetensors",
6122
- "model.layers.46.block_sparse_moe.gate.scales": "model-00013-of-00015.safetensors",
6123
  "model.layers.46.input_layernorm.weight": "model-00013-of-00015.safetensors",
6124
  "model.layers.46.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
6125
  "model.layers.46.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
@@ -6262,11 +6131,7 @@
6262
  "model.layers.47.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
6263
  "model.layers.47.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
6264
  "model.layers.47.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
6265
- "model.layers.47.block_sparse_moe.gate.bias": "model-00013-of-00015.safetensors",
6266
- "model.layers.47.block_sparse_moe.gate.g_idx": "model-00013-of-00015.safetensors",
6267
- "model.layers.47.block_sparse_moe.gate.qweight": "model-00013-of-00015.safetensors",
6268
- "model.layers.47.block_sparse_moe.gate.qzeros": "model-00013-of-00015.safetensors",
6269
- "model.layers.47.block_sparse_moe.gate.scales": "model-00013-of-00015.safetensors",
6270
  "model.layers.47.input_layernorm.weight": "model-00013-of-00015.safetensors",
6271
  "model.layers.47.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
6272
  "model.layers.47.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
@@ -6409,10 +6274,7 @@
6409
  "model.layers.48.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
6410
  "model.layers.48.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
6411
  "model.layers.48.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
6412
- "model.layers.48.block_sparse_moe.gate.bias": "model-00013-of-00015.safetensors",
6413
- "model.layers.48.block_sparse_moe.gate.g_idx": "model-00013-of-00015.safetensors",
6414
- "model.layers.48.block_sparse_moe.gate.qweight": "model-00013-of-00015.safetensors",
6415
- "model.layers.48.block_sparse_moe.gate.scales": "model-00013-of-00015.safetensors",
6416
  "model.layers.48.input_layernorm.weight": "model-00013-of-00015.safetensors",
6417
  "model.layers.48.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
6418
  "model.layers.48.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
@@ -6555,10 +6417,7 @@
6555
  "model.layers.49.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
6556
  "model.layers.49.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
6557
  "model.layers.49.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
6558
- "model.layers.49.block_sparse_moe.gate.bias": "model-00014-of-00015.safetensors",
6559
- "model.layers.49.block_sparse_moe.gate.g_idx": "model-00014-of-00015.safetensors",
6560
- "model.layers.49.block_sparse_moe.gate.qweight": "model-00014-of-00015.safetensors",
6561
- "model.layers.49.block_sparse_moe.gate.scales": "model-00014-of-00015.safetensors",
6562
  "model.layers.49.input_layernorm.weight": "model-00014-of-00015.safetensors",
6563
  "model.layers.49.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
6564
  "model.layers.49.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
@@ -6701,10 +6560,7 @@
6701
  "model.layers.5.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
6702
  "model.layers.5.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
6703
  "model.layers.5.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
6704
- "model.layers.5.block_sparse_moe.gate.bias": "model-00002-of-00015.safetensors",
6705
- "model.layers.5.block_sparse_moe.gate.g_idx": "model-00002-of-00015.safetensors",
6706
- "model.layers.5.block_sparse_moe.gate.qweight": "model-00002-of-00015.safetensors",
6707
- "model.layers.5.block_sparse_moe.gate.scales": "model-00002-of-00015.safetensors",
6708
  "model.layers.5.input_layernorm.weight": "model-00002-of-00015.safetensors",
6709
  "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
6710
  "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
@@ -6847,10 +6703,7 @@
6847
  "model.layers.50.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
6848
  "model.layers.50.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
6849
  "model.layers.50.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
6850
- "model.layers.50.block_sparse_moe.gate.bias": "model-00014-of-00015.safetensors",
6851
- "model.layers.50.block_sparse_moe.gate.g_idx": "model-00014-of-00015.safetensors",
6852
- "model.layers.50.block_sparse_moe.gate.qweight": "model-00014-of-00015.safetensors",
6853
- "model.layers.50.block_sparse_moe.gate.scales": "model-00014-of-00015.safetensors",
6854
  "model.layers.50.input_layernorm.weight": "model-00014-of-00015.safetensors",
6855
  "model.layers.50.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
6856
  "model.layers.50.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
@@ -6993,10 +6846,7 @@
6993
  "model.layers.51.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
6994
  "model.layers.51.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
6995
  "model.layers.51.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
6996
- "model.layers.51.block_sparse_moe.gate.bias": "model-00014-of-00015.safetensors",
6997
- "model.layers.51.block_sparse_moe.gate.g_idx": "model-00014-of-00015.safetensors",
6998
- "model.layers.51.block_sparse_moe.gate.qweight": "model-00014-of-00015.safetensors",
6999
- "model.layers.51.block_sparse_moe.gate.scales": "model-00014-of-00015.safetensors",
7000
  "model.layers.51.input_layernorm.weight": "model-00014-of-00015.safetensors",
7001
  "model.layers.51.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
7002
  "model.layers.51.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
@@ -7139,10 +6989,7 @@
7139
  "model.layers.52.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
7140
  "model.layers.52.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
7141
  "model.layers.52.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
7142
- "model.layers.52.block_sparse_moe.gate.bias": "model-00014-of-00015.safetensors",
7143
- "model.layers.52.block_sparse_moe.gate.g_idx": "model-00014-of-00015.safetensors",
7144
- "model.layers.52.block_sparse_moe.gate.qweight": "model-00014-of-00015.safetensors",
7145
- "model.layers.52.block_sparse_moe.gate.scales": "model-00014-of-00015.safetensors",
7146
  "model.layers.52.input_layernorm.weight": "model-00014-of-00015.safetensors",
7147
  "model.layers.52.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
7148
  "model.layers.52.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
@@ -7285,10 +7132,7 @@
7285
  "model.layers.53.block_sparse_moe.experts.7.w3.qweight": "model-00015-of-00015.safetensors",
7286
  "model.layers.53.block_sparse_moe.experts.7.w3.qzeros": "model-00015-of-00015.safetensors",
7287
  "model.layers.53.block_sparse_moe.experts.7.w3.scales": "model-00015-of-00015.safetensors",
7288
- "model.layers.53.block_sparse_moe.gate.bias": "model-00015-of-00015.safetensors",
7289
- "model.layers.53.block_sparse_moe.gate.g_idx": "model-00015-of-00015.safetensors",
7290
- "model.layers.53.block_sparse_moe.gate.qweight": "model-00015-of-00015.safetensors",
7291
- "model.layers.53.block_sparse_moe.gate.scales": "model-00015-of-00015.safetensors",
7292
  "model.layers.53.input_layernorm.weight": "model-00015-of-00015.safetensors",
7293
  "model.layers.53.post_attention_layernorm.weight": "model-00015-of-00015.safetensors",
7294
  "model.layers.53.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
@@ -7431,10 +7275,7 @@
7431
  "model.layers.54.block_sparse_moe.experts.7.w3.qweight": "model-00015-of-00015.safetensors",
7432
  "model.layers.54.block_sparse_moe.experts.7.w3.qzeros": "model-00015-of-00015.safetensors",
7433
  "model.layers.54.block_sparse_moe.experts.7.w3.scales": "model-00015-of-00015.safetensors",
7434
- "model.layers.54.block_sparse_moe.gate.bias": "model-00015-of-00015.safetensors",
7435
- "model.layers.54.block_sparse_moe.gate.g_idx": "model-00015-of-00015.safetensors",
7436
- "model.layers.54.block_sparse_moe.gate.qweight": "model-00015-of-00015.safetensors",
7437
- "model.layers.54.block_sparse_moe.gate.scales": "model-00015-of-00015.safetensors",
7438
  "model.layers.54.input_layernorm.weight": "model-00015-of-00015.safetensors",
7439
  "model.layers.54.post_attention_layernorm.weight": "model-00015-of-00015.safetensors",
7440
  "model.layers.54.self_attn.k_proj.bias": "model-00015-of-00015.safetensors",
@@ -7577,11 +7418,7 @@
7577
  "model.layers.55.block_sparse_moe.experts.7.w3.qweight": "model-00015-of-00015.safetensors",
7578
  "model.layers.55.block_sparse_moe.experts.7.w3.qzeros": "model-00015-of-00015.safetensors",
7579
  "model.layers.55.block_sparse_moe.experts.7.w3.scales": "model-00015-of-00015.safetensors",
7580
- "model.layers.55.block_sparse_moe.gate.bias": "model-00015-of-00015.safetensors",
7581
- "model.layers.55.block_sparse_moe.gate.g_idx": "model-00015-of-00015.safetensors",
7582
- "model.layers.55.block_sparse_moe.gate.qweight": "model-00015-of-00015.safetensors",
7583
- "model.layers.55.block_sparse_moe.gate.qzeros": "model-00015-of-00015.safetensors",
7584
- "model.layers.55.block_sparse_moe.gate.scales": "model-00015-of-00015.safetensors",
7585
  "model.layers.55.input_layernorm.weight": "model-00015-of-00015.safetensors",
7586
  "model.layers.55.post_attention_layernorm.weight": "model-00015-of-00015.safetensors",
7587
  "model.layers.55.self_attn.k_proj.bias": "model-00015-of-00015.safetensors",
@@ -7724,10 +7561,7 @@
7724
  "model.layers.6.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
7725
  "model.layers.6.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
7726
  "model.layers.6.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
7727
- "model.layers.6.block_sparse_moe.gate.bias": "model-00002-of-00015.safetensors",
7728
- "model.layers.6.block_sparse_moe.gate.g_idx": "model-00002-of-00015.safetensors",
7729
- "model.layers.6.block_sparse_moe.gate.qweight": "model-00002-of-00015.safetensors",
7730
- "model.layers.6.block_sparse_moe.gate.scales": "model-00002-of-00015.safetensors",
7731
  "model.layers.6.input_layernorm.weight": "model-00002-of-00015.safetensors",
7732
  "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
7733
  "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
@@ -7870,11 +7704,7 @@
7870
  "model.layers.7.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
7871
  "model.layers.7.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
7872
  "model.layers.7.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
7873
- "model.layers.7.block_sparse_moe.gate.bias": "model-00003-of-00015.safetensors",
7874
- "model.layers.7.block_sparse_moe.gate.g_idx": "model-00003-of-00015.safetensors",
7875
- "model.layers.7.block_sparse_moe.gate.qweight": "model-00003-of-00015.safetensors",
7876
- "model.layers.7.block_sparse_moe.gate.qzeros": "model-00003-of-00015.safetensors",
7877
- "model.layers.7.block_sparse_moe.gate.scales": "model-00003-of-00015.safetensors",
7878
  "model.layers.7.input_layernorm.weight": "model-00003-of-00015.safetensors",
7879
  "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
7880
  "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
@@ -8017,10 +7847,7 @@
8017
  "model.layers.8.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
8018
  "model.layers.8.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
8019
  "model.layers.8.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
8020
- "model.layers.8.block_sparse_moe.gate.bias": "model-00003-of-00015.safetensors",
8021
- "model.layers.8.block_sparse_moe.gate.g_idx": "model-00003-of-00015.safetensors",
8022
- "model.layers.8.block_sparse_moe.gate.qweight": "model-00003-of-00015.safetensors",
8023
- "model.layers.8.block_sparse_moe.gate.scales": "model-00003-of-00015.safetensors",
8024
  "model.layers.8.input_layernorm.weight": "model-00003-of-00015.safetensors",
8025
  "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
8026
  "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",
@@ -8163,10 +7990,7 @@
8163
  "model.layers.9.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
8164
  "model.layers.9.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
8165
  "model.layers.9.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
8166
- "model.layers.9.block_sparse_moe.gate.bias": "model-00003-of-00015.safetensors",
8167
- "model.layers.9.block_sparse_moe.gate.g_idx": "model-00003-of-00015.safetensors",
8168
- "model.layers.9.block_sparse_moe.gate.qweight": "model-00003-of-00015.safetensors",
8169
- "model.layers.9.block_sparse_moe.gate.scales": "model-00003-of-00015.safetensors",
8170
  "model.layers.9.input_layernorm.weight": "model-00003-of-00015.safetensors",
8171
  "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
8172
  "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 73756291072
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00015-of-00015.safetensors",
 
125
  "model.layers.0.block_sparse_moe.experts.7.w3.qweight": "model-00001-of-00015.safetensors",
126
  "model.layers.0.block_sparse_moe.experts.7.w3.qzeros": "model-00001-of-00015.safetensors",
127
  "model.layers.0.block_sparse_moe.experts.7.w3.scales": "model-00001-of-00015.safetensors",
128
+ "model.layers.0.block_sparse_moe.gate.weight": "model-00001-of-00015.safetensors",
 
 
 
 
129
  "model.layers.0.input_layernorm.weight": "model-00001-of-00015.safetensors",
130
  "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00015.safetensors",
131
  "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
 
268
  "model.layers.1.block_sparse_moe.experts.7.w3.qweight": "model-00001-of-00015.safetensors",
269
  "model.layers.1.block_sparse_moe.experts.7.w3.qzeros": "model-00001-of-00015.safetensors",
270
  "model.layers.1.block_sparse_moe.experts.7.w3.scales": "model-00001-of-00015.safetensors",
271
+ "model.layers.1.block_sparse_moe.gate.weight": "model-00001-of-00015.safetensors",
 
 
 
272
  "model.layers.1.input_layernorm.weight": "model-00001-of-00015.safetensors",
273
  "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00015.safetensors",
274
  "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
 
411
  "model.layers.10.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
412
  "model.layers.10.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
413
  "model.layers.10.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
414
+ "model.layers.10.block_sparse_moe.gate.weight": "model-00003-of-00015.safetensors",
 
 
 
415
  "model.layers.10.input_layernorm.weight": "model-00003-of-00015.safetensors",
416
  "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
417
  "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",
 
554
  "model.layers.11.block_sparse_moe.experts.7.w3.qweight": "model-00004-of-00015.safetensors",
555
  "model.layers.11.block_sparse_moe.experts.7.w3.qzeros": "model-00004-of-00015.safetensors",
556
  "model.layers.11.block_sparse_moe.experts.7.w3.scales": "model-00004-of-00015.safetensors",
557
+ "model.layers.11.block_sparse_moe.gate.weight": "model-00003-of-00015.safetensors",
 
 
 
558
  "model.layers.11.input_layernorm.weight": "model-00004-of-00015.safetensors",
559
  "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00015.safetensors",
560
  "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",
 
697
  "model.layers.12.block_sparse_moe.experts.7.w3.qweight": "model-00004-of-00015.safetensors",
698
  "model.layers.12.block_sparse_moe.experts.7.w3.qzeros": "model-00004-of-00015.safetensors",
699
  "model.layers.12.block_sparse_moe.experts.7.w3.scales": "model-00004-of-00015.safetensors",
700
+ "model.layers.12.block_sparse_moe.gate.weight": "model-00004-of-00015.safetensors",
 
 
 
701
  "model.layers.12.input_layernorm.weight": "model-00004-of-00015.safetensors",
702
  "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00015.safetensors",
703
  "model.layers.12.self_attn.k_proj.bias": "model-00004-of-00015.safetensors",
 
840
  "model.layers.13.block_sparse_moe.experts.7.w3.qweight": "model-00004-of-00015.safetensors",
841
  "model.layers.13.block_sparse_moe.experts.7.w3.qzeros": "model-00004-of-00015.safetensors",
842
  "model.layers.13.block_sparse_moe.experts.7.w3.scales": "model-00004-of-00015.safetensors",
843
+ "model.layers.13.block_sparse_moe.gate.weight": "model-00004-of-00015.safetensors",
 
 
 
844
  "model.layers.13.input_layernorm.weight": "model-00004-of-00015.safetensors",
845
  "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00015.safetensors",
846
  "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00015.safetensors",
 
983
  "model.layers.14.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
984
  "model.layers.14.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
985
  "model.layers.14.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
986
+ "model.layers.14.block_sparse_moe.gate.weight": "model-00004-of-00015.safetensors",
 
 
 
987
  "model.layers.14.input_layernorm.weight": "model-00005-of-00015.safetensors",
988
  "model.layers.14.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
989
  "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00015.safetensors",
 
1126
  "model.layers.15.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
1127
  "model.layers.15.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
1128
  "model.layers.15.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
1129
+ "model.layers.15.block_sparse_moe.gate.weight": "model-00005-of-00015.safetensors",
 
 
 
 
1130
  "model.layers.15.input_layernorm.weight": "model-00005-of-00015.safetensors",
1131
  "model.layers.15.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
1132
  "model.layers.15.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
 
1269
  "model.layers.16.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
1270
  "model.layers.16.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
1271
  "model.layers.16.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
1272
+ "model.layers.16.block_sparse_moe.gate.weight": "model-00005-of-00015.safetensors",
 
 
 
1273
  "model.layers.16.input_layernorm.weight": "model-00005-of-00015.safetensors",
1274
  "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
1275
  "model.layers.16.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
 
1412
  "model.layers.17.block_sparse_moe.experts.7.w3.qweight": "model-00005-of-00015.safetensors",
1413
  "model.layers.17.block_sparse_moe.experts.7.w3.qzeros": "model-00005-of-00015.safetensors",
1414
  "model.layers.17.block_sparse_moe.experts.7.w3.scales": "model-00005-of-00015.safetensors",
1415
+ "model.layers.17.block_sparse_moe.gate.weight": "model-00005-of-00015.safetensors",
 
 
 
1416
  "model.layers.17.input_layernorm.weight": "model-00005-of-00015.safetensors",
1417
  "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00015.safetensors",
1418
  "model.layers.17.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
 
1555
  "model.layers.18.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
1556
  "model.layers.18.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
1557
  "model.layers.18.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
1558
+ "model.layers.18.block_sparse_moe.gate.weight": "model-00005-of-00015.safetensors",
 
 
 
1559
  "model.layers.18.input_layernorm.weight": "model-00006-of-00015.safetensors",
1560
  "model.layers.18.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
1561
  "model.layers.18.self_attn.k_proj.bias": "model-00005-of-00015.safetensors",
 
1698
  "model.layers.19.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
1699
  "model.layers.19.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
1700
  "model.layers.19.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
1701
+ "model.layers.19.block_sparse_moe.gate.weight": "model-00006-of-00015.safetensors",
 
 
 
1702
  "model.layers.19.input_layernorm.weight": "model-00006-of-00015.safetensors",
1703
  "model.layers.19.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
1704
  "model.layers.19.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
 
1841
  "model.layers.2.block_sparse_moe.experts.7.w3.qweight": "model-00001-of-00015.safetensors",
1842
  "model.layers.2.block_sparse_moe.experts.7.w3.qzeros": "model-00001-of-00015.safetensors",
1843
  "model.layers.2.block_sparse_moe.experts.7.w3.scales": "model-00001-of-00015.safetensors",
1844
+ "model.layers.2.block_sparse_moe.gate.weight": "model-00001-of-00015.safetensors",
 
 
 
1845
  "model.layers.2.input_layernorm.weight": "model-00001-of-00015.safetensors",
1846
  "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00015.safetensors",
1847
  "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
 
1984
  "model.layers.20.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
1985
  "model.layers.20.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
1986
  "model.layers.20.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
1987
+ "model.layers.20.block_sparse_moe.gate.weight": "model-00006-of-00015.safetensors",
 
 
 
1988
  "model.layers.20.input_layernorm.weight": "model-00006-of-00015.safetensors",
1989
  "model.layers.20.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
1990
  "model.layers.20.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
 
2127
  "model.layers.21.block_sparse_moe.experts.7.w3.qweight": "model-00006-of-00015.safetensors",
2128
  "model.layers.21.block_sparse_moe.experts.7.w3.qzeros": "model-00006-of-00015.safetensors",
2129
  "model.layers.21.block_sparse_moe.experts.7.w3.scales": "model-00006-of-00015.safetensors",
2130
+ "model.layers.21.block_sparse_moe.gate.weight": "model-00006-of-00015.safetensors",
 
 
 
2131
  "model.layers.21.input_layernorm.weight": "model-00006-of-00015.safetensors",
2132
  "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00015.safetensors",
2133
  "model.layers.21.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
 
2270
  "model.layers.22.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2271
  "model.layers.22.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2272
  "model.layers.22.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2273
+ "model.layers.22.block_sparse_moe.gate.weight": "model-00006-of-00015.safetensors",
 
 
 
2274
  "model.layers.22.input_layernorm.weight": "model-00007-of-00015.safetensors",
2275
  "model.layers.22.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2276
  "model.layers.22.self_attn.k_proj.bias": "model-00006-of-00015.safetensors",
 
2413
  "model.layers.23.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2414
  "model.layers.23.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2415
  "model.layers.23.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2416
+ "model.layers.23.block_sparse_moe.gate.weight": "model-00007-of-00015.safetensors",
 
 
 
 
2417
  "model.layers.23.input_layernorm.weight": "model-00007-of-00015.safetensors",
2418
  "model.layers.23.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2419
  "model.layers.23.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
 
2556
  "model.layers.24.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2557
  "model.layers.24.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2558
  "model.layers.24.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2559
+ "model.layers.24.block_sparse_moe.gate.weight": "model-00007-of-00015.safetensors",
 
 
 
2560
  "model.layers.24.input_layernorm.weight": "model-00007-of-00015.safetensors",
2561
  "model.layers.24.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2562
  "model.layers.24.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
 
2699
  "model.layers.25.block_sparse_moe.experts.7.w3.qweight": "model-00007-of-00015.safetensors",
2700
  "model.layers.25.block_sparse_moe.experts.7.w3.qzeros": "model-00007-of-00015.safetensors",
2701
  "model.layers.25.block_sparse_moe.experts.7.w3.scales": "model-00007-of-00015.safetensors",
2702
+ "model.layers.25.block_sparse_moe.gate.weight": "model-00007-of-00015.safetensors",
 
 
 
2703
  "model.layers.25.input_layernorm.weight": "model-00007-of-00015.safetensors",
2704
  "model.layers.25.post_attention_layernorm.weight": "model-00007-of-00015.safetensors",
2705
  "model.layers.25.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
 
2842
  "model.layers.26.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
2843
  "model.layers.26.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
2844
  "model.layers.26.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
2845
+ "model.layers.26.block_sparse_moe.gate.weight": "model-00007-of-00015.safetensors",
 
 
 
2846
  "model.layers.26.input_layernorm.weight": "model-00008-of-00015.safetensors",
2847
  "model.layers.26.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
2848
  "model.layers.26.self_attn.k_proj.bias": "model-00007-of-00015.safetensors",
 
2985
  "model.layers.27.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
2986
  "model.layers.27.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
2987
  "model.layers.27.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
2988
+ "model.layers.27.block_sparse_moe.gate.weight": "model-00008-of-00015.safetensors",
 
 
 
2989
  "model.layers.27.input_layernorm.weight": "model-00008-of-00015.safetensors",
2990
  "model.layers.27.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
2991
  "model.layers.27.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
 
3128
  "model.layers.28.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
3129
  "model.layers.28.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
3130
  "model.layers.28.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
3131
+ "model.layers.28.block_sparse_moe.gate.weight": "model-00008-of-00015.safetensors",
 
 
 
3132
  "model.layers.28.input_layernorm.weight": "model-00008-of-00015.safetensors",
3133
  "model.layers.28.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
3134
  "model.layers.28.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
 
3271
  "model.layers.29.block_sparse_moe.experts.7.w3.qweight": "model-00008-of-00015.safetensors",
3272
  "model.layers.29.block_sparse_moe.experts.7.w3.qzeros": "model-00008-of-00015.safetensors",
3273
  "model.layers.29.block_sparse_moe.experts.7.w3.scales": "model-00008-of-00015.safetensors",
3274
+ "model.layers.29.block_sparse_moe.gate.weight": "model-00008-of-00015.safetensors",
 
 
 
3275
  "model.layers.29.input_layernorm.weight": "model-00008-of-00015.safetensors",
3276
  "model.layers.29.post_attention_layernorm.weight": "model-00008-of-00015.safetensors",
3277
  "model.layers.29.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
 
3414
  "model.layers.3.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
3415
  "model.layers.3.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
3416
  "model.layers.3.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
3417
+ "model.layers.3.block_sparse_moe.gate.weight": "model-00001-of-00015.safetensors",
 
 
 
3418
  "model.layers.3.input_layernorm.weight": "model-00002-of-00015.safetensors",
3419
  "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
3420
  "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00015.safetensors",
 
3557
  "model.layers.30.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
3558
  "model.layers.30.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
3559
  "model.layers.30.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
3560
+ "model.layers.30.block_sparse_moe.gate.weight": "model-00008-of-00015.safetensors",
 
 
 
3561
  "model.layers.30.input_layernorm.weight": "model-00009-of-00015.safetensors",
3562
  "model.layers.30.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
3563
  "model.layers.30.self_attn.k_proj.bias": "model-00008-of-00015.safetensors",
 
3700
  "model.layers.31.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
3701
  "model.layers.31.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
3702
  "model.layers.31.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
3703
+ "model.layers.31.block_sparse_moe.gate.weight": "model-00009-of-00015.safetensors",
 
 
 
 
3704
  "model.layers.31.input_layernorm.weight": "model-00009-of-00015.safetensors",
3705
  "model.layers.31.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
3706
  "model.layers.31.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
 
3843
  "model.layers.32.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
3844
  "model.layers.32.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
3845
  "model.layers.32.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
3846
+ "model.layers.32.block_sparse_moe.gate.weight": "model-00009-of-00015.safetensors",
 
 
 
3847
  "model.layers.32.input_layernorm.weight": "model-00009-of-00015.safetensors",
3848
  "model.layers.32.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
3849
  "model.layers.32.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
 
3986
  "model.layers.33.block_sparse_moe.experts.7.w3.qweight": "model-00009-of-00015.safetensors",
3987
  "model.layers.33.block_sparse_moe.experts.7.w3.qzeros": "model-00009-of-00015.safetensors",
3988
  "model.layers.33.block_sparse_moe.experts.7.w3.scales": "model-00009-of-00015.safetensors",
3989
+ "model.layers.33.block_sparse_moe.gate.weight": "model-00009-of-00015.safetensors",
 
 
 
3990
  "model.layers.33.input_layernorm.weight": "model-00009-of-00015.safetensors",
3991
  "model.layers.33.post_attention_layernorm.weight": "model-00009-of-00015.safetensors",
3992
  "model.layers.33.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
 
4129
  "model.layers.34.block_sparse_moe.experts.7.w3.qweight": "model-00010-of-00015.safetensors",
4130
  "model.layers.34.block_sparse_moe.experts.7.w3.qzeros": "model-00010-of-00015.safetensors",
4131
  "model.layers.34.block_sparse_moe.experts.7.w3.scales": "model-00010-of-00015.safetensors",
4132
+ "model.layers.34.block_sparse_moe.gate.weight": "model-00009-of-00015.safetensors",
 
 
 
4133
  "model.layers.34.input_layernorm.weight": "model-00010-of-00015.safetensors",
4134
  "model.layers.34.post_attention_layernorm.weight": "model-00010-of-00015.safetensors",
4135
  "model.layers.34.self_attn.k_proj.bias": "model-00009-of-00015.safetensors",
 
4272
  "model.layers.35.block_sparse_moe.experts.7.w3.qweight": "model-00010-of-00015.safetensors",
4273
  "model.layers.35.block_sparse_moe.experts.7.w3.qzeros": "model-00010-of-00015.safetensors",
4274
  "model.layers.35.block_sparse_moe.experts.7.w3.scales": "model-00010-of-00015.safetensors",
4275
+ "model.layers.35.block_sparse_moe.gate.weight": "model-00010-of-00015.safetensors",
 
 
 
4276
  "model.layers.35.input_layernorm.weight": "model-00010-of-00015.safetensors",
4277
  "model.layers.35.post_attention_layernorm.weight": "model-00010-of-00015.safetensors",
4278
  "model.layers.35.self_attn.k_proj.bias": "model-00010-of-00015.safetensors",
 
4415
  "model.layers.36.block_sparse_moe.experts.7.w3.qweight": "model-00010-of-00015.safetensors",
4416
  "model.layers.36.block_sparse_moe.experts.7.w3.qzeros": "model-00010-of-00015.safetensors",
4417
  "model.layers.36.block_sparse_moe.experts.7.w3.scales": "model-00010-of-00015.safetensors",
4418
+ "model.layers.36.block_sparse_moe.gate.weight": "model-00010-of-00015.safetensors",
 
 
 
4419
  "model.layers.36.input_layernorm.weight": "model-00010-of-00015.safetensors",
4420
  "model.layers.36.post_attention_layernorm.weight": "model-00010-of-00015.safetensors",
4421
  "model.layers.36.self_attn.k_proj.bias": "model-00010-of-00015.safetensors",
 
4558
  "model.layers.37.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
4559
  "model.layers.37.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
4560
  "model.layers.37.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
4561
+ "model.layers.37.block_sparse_moe.gate.weight": "model-00010-of-00015.safetensors",
 
 
 
4562
  "model.layers.37.input_layernorm.weight": "model-00011-of-00015.safetensors",
4563
  "model.layers.37.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
4564
  "model.layers.37.self_attn.k_proj.bias": "model-00010-of-00015.safetensors",
 
4701
  "model.layers.38.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
4702
  "model.layers.38.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
4703
  "model.layers.38.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
4704
+ "model.layers.38.block_sparse_moe.gate.weight": "model-00011-of-00015.safetensors",
 
 
 
4705
  "model.layers.38.input_layernorm.weight": "model-00011-of-00015.safetensors",
4706
  "model.layers.38.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
4707
  "model.layers.38.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
 
4844
  "model.layers.39.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
4845
  "model.layers.39.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
4846
  "model.layers.39.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
4847
+ "model.layers.39.block_sparse_moe.gate.weight": "model-00011-of-00015.safetensors",
 
 
 
 
4848
  "model.layers.39.input_layernorm.weight": "model-00011-of-00015.safetensors",
4849
  "model.layers.39.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
4850
  "model.layers.39.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
 
4987
  "model.layers.4.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
4988
  "model.layers.4.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
4989
  "model.layers.4.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
4990
+ "model.layers.4.block_sparse_moe.gate.weight": "model-00002-of-00015.safetensors",
 
 
 
4991
  "model.layers.4.input_layernorm.weight": "model-00002-of-00015.safetensors",
4992
  "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
4993
  "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
 
5130
  "model.layers.40.block_sparse_moe.experts.7.w3.qweight": "model-00011-of-00015.safetensors",
5131
  "model.layers.40.block_sparse_moe.experts.7.w3.qzeros": "model-00011-of-00015.safetensors",
5132
  "model.layers.40.block_sparse_moe.experts.7.w3.scales": "model-00011-of-00015.safetensors",
5133
+ "model.layers.40.block_sparse_moe.gate.weight": "model-00011-of-00015.safetensors",
 
 
 
5134
  "model.layers.40.input_layernorm.weight": "model-00011-of-00015.safetensors",
5135
  "model.layers.40.post_attention_layernorm.weight": "model-00011-of-00015.safetensors",
5136
  "model.layers.40.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
 
5273
  "model.layers.41.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5274
  "model.layers.41.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5275
  "model.layers.41.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5276
+ "model.layers.41.block_sparse_moe.gate.weight": "model-00011-of-00015.safetensors",
 
 
 
5277
  "model.layers.41.input_layernorm.weight": "model-00012-of-00015.safetensors",
5278
  "model.layers.41.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5279
  "model.layers.41.self_attn.k_proj.bias": "model-00011-of-00015.safetensors",
 
5416
  "model.layers.42.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5417
  "model.layers.42.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5418
  "model.layers.42.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5419
+ "model.layers.42.block_sparse_moe.gate.weight": "model-00012-of-00015.safetensors",
 
 
 
5420
  "model.layers.42.input_layernorm.weight": "model-00012-of-00015.safetensors",
5421
  "model.layers.42.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5422
  "model.layers.42.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
 
5559
  "model.layers.43.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5560
  "model.layers.43.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5561
  "model.layers.43.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5562
+ "model.layers.43.block_sparse_moe.gate.weight": "model-00012-of-00015.safetensors",
 
 
 
5563
  "model.layers.43.input_layernorm.weight": "model-00012-of-00015.safetensors",
5564
  "model.layers.43.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5565
  "model.layers.43.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
 
5702
  "model.layers.44.block_sparse_moe.experts.7.w3.qweight": "model-00012-of-00015.safetensors",
5703
  "model.layers.44.block_sparse_moe.experts.7.w3.qzeros": "model-00012-of-00015.safetensors",
5704
  "model.layers.44.block_sparse_moe.experts.7.w3.scales": "model-00012-of-00015.safetensors",
5705
+ "model.layers.44.block_sparse_moe.gate.weight": "model-00012-of-00015.safetensors",
 
 
 
5706
  "model.layers.44.input_layernorm.weight": "model-00012-of-00015.safetensors",
5707
  "model.layers.44.post_attention_layernorm.weight": "model-00012-of-00015.safetensors",
5708
  "model.layers.44.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
 
5845
  "model.layers.45.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
5846
  "model.layers.45.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
5847
  "model.layers.45.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
5848
+ "model.layers.45.block_sparse_moe.gate.weight": "model-00012-of-00015.safetensors",
 
 
 
5849
  "model.layers.45.input_layernorm.weight": "model-00013-of-00015.safetensors",
5850
  "model.layers.45.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
5851
  "model.layers.45.self_attn.k_proj.bias": "model-00012-of-00015.safetensors",
 
5988
  "model.layers.46.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
5989
  "model.layers.46.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
5990
  "model.layers.46.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
5991
+ "model.layers.46.block_sparse_moe.gate.weight": "model-00013-of-00015.safetensors",
 
 
 
5992
  "model.layers.46.input_layernorm.weight": "model-00013-of-00015.safetensors",
5993
  "model.layers.46.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
5994
  "model.layers.46.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
 
6131
  "model.layers.47.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
6132
  "model.layers.47.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
6133
  "model.layers.47.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
6134
+ "model.layers.47.block_sparse_moe.gate.weight": "model-00013-of-00015.safetensors",
 
 
 
 
6135
  "model.layers.47.input_layernorm.weight": "model-00013-of-00015.safetensors",
6136
  "model.layers.47.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
6137
  "model.layers.47.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
 
6274
  "model.layers.48.block_sparse_moe.experts.7.w3.qweight": "model-00013-of-00015.safetensors",
6275
  "model.layers.48.block_sparse_moe.experts.7.w3.qzeros": "model-00013-of-00015.safetensors",
6276
  "model.layers.48.block_sparse_moe.experts.7.w3.scales": "model-00013-of-00015.safetensors",
6277
+ "model.layers.48.block_sparse_moe.gate.weight": "model-00013-of-00015.safetensors",
 
 
 
6278
  "model.layers.48.input_layernorm.weight": "model-00013-of-00015.safetensors",
6279
  "model.layers.48.post_attention_layernorm.weight": "model-00013-of-00015.safetensors",
6280
  "model.layers.48.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
 
6417
  "model.layers.49.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
6418
  "model.layers.49.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
6419
  "model.layers.49.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
6420
+ "model.layers.49.block_sparse_moe.gate.weight": "model-00013-of-00015.safetensors",
 
 
 
6421
  "model.layers.49.input_layernorm.weight": "model-00014-of-00015.safetensors",
6422
  "model.layers.49.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
6423
  "model.layers.49.self_attn.k_proj.bias": "model-00013-of-00015.safetensors",
 
6560
  "model.layers.5.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
6561
  "model.layers.5.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
6562
  "model.layers.5.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
6563
+ "model.layers.5.block_sparse_moe.gate.weight": "model-00002-of-00015.safetensors",
 
 
 
6564
  "model.layers.5.input_layernorm.weight": "model-00002-of-00015.safetensors",
6565
  "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
6566
  "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
 
6703
  "model.layers.50.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
6704
  "model.layers.50.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
6705
  "model.layers.50.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
6706
+ "model.layers.50.block_sparse_moe.gate.weight": "model-00014-of-00015.safetensors",
 
 
 
6707
  "model.layers.50.input_layernorm.weight": "model-00014-of-00015.safetensors",
6708
  "model.layers.50.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
6709
  "model.layers.50.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
 
6846
  "model.layers.51.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
6847
  "model.layers.51.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
6848
  "model.layers.51.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
6849
+ "model.layers.51.block_sparse_moe.gate.weight": "model-00014-of-00015.safetensors",
 
 
 
6850
  "model.layers.51.input_layernorm.weight": "model-00014-of-00015.safetensors",
6851
  "model.layers.51.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
6852
  "model.layers.51.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
 
6989
  "model.layers.52.block_sparse_moe.experts.7.w3.qweight": "model-00014-of-00015.safetensors",
6990
  "model.layers.52.block_sparse_moe.experts.7.w3.qzeros": "model-00014-of-00015.safetensors",
6991
  "model.layers.52.block_sparse_moe.experts.7.w3.scales": "model-00014-of-00015.safetensors",
6992
+ "model.layers.52.block_sparse_moe.gate.weight": "model-00014-of-00015.safetensors",
 
 
 
6993
  "model.layers.52.input_layernorm.weight": "model-00014-of-00015.safetensors",
6994
  "model.layers.52.post_attention_layernorm.weight": "model-00014-of-00015.safetensors",
6995
  "model.layers.52.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
 
7132
  "model.layers.53.block_sparse_moe.experts.7.w3.qweight": "model-00015-of-00015.safetensors",
7133
  "model.layers.53.block_sparse_moe.experts.7.w3.qzeros": "model-00015-of-00015.safetensors",
7134
  "model.layers.53.block_sparse_moe.experts.7.w3.scales": "model-00015-of-00015.safetensors",
7135
+ "model.layers.53.block_sparse_moe.gate.weight": "model-00014-of-00015.safetensors",
 
 
 
7136
  "model.layers.53.input_layernorm.weight": "model-00015-of-00015.safetensors",
7137
  "model.layers.53.post_attention_layernorm.weight": "model-00015-of-00015.safetensors",
7138
  "model.layers.53.self_attn.k_proj.bias": "model-00014-of-00015.safetensors",
 
7275
  "model.layers.54.block_sparse_moe.experts.7.w3.qweight": "model-00015-of-00015.safetensors",
7276
  "model.layers.54.block_sparse_moe.experts.7.w3.qzeros": "model-00015-of-00015.safetensors",
7277
  "model.layers.54.block_sparse_moe.experts.7.w3.scales": "model-00015-of-00015.safetensors",
7278
+ "model.layers.54.block_sparse_moe.gate.weight": "model-00015-of-00015.safetensors",
 
 
 
7279
  "model.layers.54.input_layernorm.weight": "model-00015-of-00015.safetensors",
7280
  "model.layers.54.post_attention_layernorm.weight": "model-00015-of-00015.safetensors",
7281
  "model.layers.54.self_attn.k_proj.bias": "model-00015-of-00015.safetensors",
 
7418
  "model.layers.55.block_sparse_moe.experts.7.w3.qweight": "model-00015-of-00015.safetensors",
7419
  "model.layers.55.block_sparse_moe.experts.7.w3.qzeros": "model-00015-of-00015.safetensors",
7420
  "model.layers.55.block_sparse_moe.experts.7.w3.scales": "model-00015-of-00015.safetensors",
7421
+ "model.layers.55.block_sparse_moe.gate.weight": "model-00015-of-00015.safetensors",
 
 
 
 
7422
  "model.layers.55.input_layernorm.weight": "model-00015-of-00015.safetensors",
7423
  "model.layers.55.post_attention_layernorm.weight": "model-00015-of-00015.safetensors",
7424
  "model.layers.55.self_attn.k_proj.bias": "model-00015-of-00015.safetensors",
 
7561
  "model.layers.6.block_sparse_moe.experts.7.w3.qweight": "model-00002-of-00015.safetensors",
7562
  "model.layers.6.block_sparse_moe.experts.7.w3.qzeros": "model-00002-of-00015.safetensors",
7563
  "model.layers.6.block_sparse_moe.experts.7.w3.scales": "model-00002-of-00015.safetensors",
7564
+ "model.layers.6.block_sparse_moe.gate.weight": "model-00002-of-00015.safetensors",
 
 
 
7565
  "model.layers.6.input_layernorm.weight": "model-00002-of-00015.safetensors",
7566
  "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00015.safetensors",
7567
  "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
 
7704
  "model.layers.7.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
7705
  "model.layers.7.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
7706
  "model.layers.7.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
7707
+ "model.layers.7.block_sparse_moe.gate.weight": "model-00002-of-00015.safetensors",
 
 
 
 
7708
  "model.layers.7.input_layernorm.weight": "model-00003-of-00015.safetensors",
7709
  "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
7710
  "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00015.safetensors",
 
7847
  "model.layers.8.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
7848
  "model.layers.8.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
7849
  "model.layers.8.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
7850
+ "model.layers.8.block_sparse_moe.gate.weight": "model-00003-of-00015.safetensors",
 
 
 
7851
  "model.layers.8.input_layernorm.weight": "model-00003-of-00015.safetensors",
7852
  "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
7853
  "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",
 
7990
  "model.layers.9.block_sparse_moe.experts.7.w3.qweight": "model-00003-of-00015.safetensors",
7991
  "model.layers.9.block_sparse_moe.experts.7.w3.qzeros": "model-00003-of-00015.safetensors",
7992
  "model.layers.9.block_sparse_moe.experts.7.w3.scales": "model-00003-of-00015.safetensors",
7993
+ "model.layers.9.block_sparse_moe.gate.weight": "model-00003-of-00015.safetensors",
 
 
 
7994
  "model.layers.9.input_layernorm.weight": "model-00003-of-00015.safetensors",
7995
  "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00015.safetensors",
7996
  "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00015.safetensors",