maxall4 committed on
Commit
ad9eb94
·
verified ·
1 Parent(s): 2a542d3

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +57 -89
config.json CHANGED
@@ -1,90 +1,58 @@
1
  {
2
- "_commit_hash": null,
3
- "_name_or_path": "togethercomputer/evo-1-131k-base",
4
- "architectures": [
5
- "StripedHyenaModelForCausalLM"
6
- ],
7
- "attn_layer_idxs": [
8
- 8,
9
- 16,
10
- 24
11
- ],
12
- "auto_map": {
13
- "AutoConfig": "configuration_hyena.StripedHyenaConfig",
14
- "AutoModelForCausalLM": "modeling_hyena.StripedHyenaModelForCausalLM",
15
- "AutoTokenizer": [
16
- "tokenizer.ByteTokenizer",
17
- null
18
- ]
19
- },
20
- "column_split": false,
21
- "column_split_hyena": true,
22
- "eps": 1e-06,
23
- "final_norm": true,
24
- "hidden_size": 4096,
25
- "hyena_filter_groups": 1,
26
- "hyena_layer_idxs": [
27
- 0,
28
- 1,
29
- 2,
30
- 3,
31
- 4,
32
- 5,
33
- 6,
34
- 7,
35
- 9,
36
- 10,
37
- 11,
38
- 12,
39
- 13,
40
- 14,
41
- 15,
42
- 17,
43
- 18,
44
- 19,
45
- 20,
46
- 21,
47
- 22,
48
- 23,
49
- 25,
50
- 26,
51
- 27,
52
- 28,
53
- 29,
54
- 30,
55
- 31
56
- ],
57
- "inference_mode": false,
58
- "inner_mlp_size": 10928,
59
- "log_intermediate_values": false,
60
- "make_vocab_size_divisible_by": 8,
61
- "max_seqlen": 131072,
62
- "mha_out_proj_bias": true,
63
- "mlp_activation": "gelu",
64
- "model_parallel_size": 1,
65
- "model_type": "stripedhyena",
66
- "num_attention_heads": 32,
67
- "num_filters": 4096,
68
- "num_layers": 32,
69
- "pipe_parallel_size": 1,
70
- "prefill_style": "fft",
71
- "proj_groups": 1,
72
- "qkv_proj_bias": true,
73
- "rotary_emb_base": 10000,
74
- "rotary_emb_scaling_factor": 16,
75
- "short_filter_bias": true,
76
- "short_filter_length": 3,
77
- "smeared_gqa": false,
78
- "split_k0": true,
79
- "state_size": 8,
80
- "tie_embeddings": true,
81
- "torch_dtype": "bfloat16",
82
- "transformers_version": null,
83
- "use_cache": true,
84
- "use_flash_attn": true,
85
- "use_flash_depthwise": true,
86
- "use_flash_rmsnorm": false,
87
- "use_flashfft": false,
88
- "use_interpolated_rotary_pos_emb": true,
89
- "vocab_size": 512
90
- }
 
1
  {
2
+ "_commit_hash": "3b191f9a32eeba9187bbba4475a4b2a1b2de6b3d",
3
+ "_name_or_path": "togethercomputer/evo-1-131k-base",
4
+ "architectures": ["StripedHyenaModelForCausalLM"],
5
+ "attn_layer_idxs": [8, 16, 24],
6
+ "auto_map": {
7
+ "AutoConfig": "togethercomputer/evo-1-131k-base--configuration_hyena.StripedHyenaConfig",
8
+ "AutoModelForCausalLM": "togethercomputer/evo-1-131k-base--modeling_hyena.StripedHyenaModelForCausalLM",
9
+ "AutoTokenizer": [
10
+ "togethercomputer/evo-1-131k-base--tokenizer.ByteTokenizer",
11
+ null
12
+ ]
13
+ },
14
+ "column_split": false,
15
+ "column_split_hyena": true,
16
+ "eps": 1e-6,
17
+ "final_norm": true,
18
+ "hidden_size": 4096,
19
+ "hyena_filter_groups": 1,
20
+ "hyena_layer_idxs": [
21
+ 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21,
22
+ 22, 23, 25, 26, 27, 28, 29, 30, 31
23
+ ],
24
+ "inference_mode": false,
25
+ "inner_mlp_size": 10928,
26
+ "log_intermediate_values": false,
27
+ "make_vocab_size_divisible_by": 8,
28
+ "max_seqlen": 131072,
29
+ "mha_out_proj_bias": true,
30
+ "mlp_activation": "gelu",
31
+ "model_parallel_size": 1,
32
+ "model_type": "stripedhyena",
33
+ "num_attention_heads": 32,
34
+ "num_filters": 4096,
35
+ "num_layers": 32,
36
+ "pipe_parallel_size": 1,
37
+ "prefill_style": "fft",
38
+ "proj_groups": 1,
39
+ "qkv_proj_bias": true,
40
+ "rotary_emb_base": 10000,
41
+ "rotary_emb_scaling_factor": 16,
42
+ "short_filter_bias": true,
43
+ "short_filter_length": 3,
44
+ "smeared_gqa": false,
45
+ "split_k0": true,
46
+ "state_size": 8,
47
+ "tie_embeddings": true,
48
+ "torch_dtype": "bfloat16",
49
+ "transformers_version": null,
50
+ "use_cache": false,
51
+ "use_flash_attention_2": true,
52
+ "use_flash_attn": true,
53
+ "use_flash_depthwise": true,
54
+ "use_flash_rmsnorm": false,
55
+ "use_flashfft": false,
56
+ "use_interpolated_rotary_pos_emb": true,
57
+ "vocab_size": 512
58
+ }