{
  "architectures": [
    "LlamaForCausalLMWithGNN"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "gnn_config": {
    "GIN_after_attention": true,
    "GIN_after_attention_pre_GIN_norm": true,
    "GIN_after_attention_skip": true,
    "GIN_edge_weight_scaling": true,
    "GIN_hidden_dim_multiplier": 1,
    "GIN_use_MLP": true,
    "GIN_use_norm": false,
    "LlamaAttentionHierarchicalPerceiverAR_use_rope": true,
    "LlamaAttentionHierarchicalVariant_2_PerceiverAR_use_skip": true,
    "MLP_type": "standard_MLP",
    "N_GNN_from_attention_layers": 3,
    "activation": "prelu",
    "add_rope": false,
    "adj_construction_method": "sum",
    "adj_transform_hidden_dim": 128,
    "attention_GIN_MLP_GIN_MLP_mode": "shared",
    "attention_GIN_MLP_GIN_MLP_pre_aggregate": true,
    "attention_GIN_MLP_GIN_binary_scale": 1.0,
    "attention_GIN_MLP_GIN_fuse_mode": "epsilon",
    "attention_GIN_MLP_GIN_learnable_threshold": false,
    "attention_GIN_MLP_GIN_mode": "default",
    "attention_GIN_MLP_GIN_sharp_softplus_beta": 10.0,
    "attention_GIN_MLP_GIN_softmax_temperature": 1.0,
    "attention_GIN_MLP_GIN_threshold_mode": "none",
    "attention_GIN_MLP_GIN_threshold_value": 0.2,
    "attention_GIN_MLP_GIN_top_k_fraction_of_sequence_length": 0.1,
    "attention_GIN_MLP_GIN_use_ReLU_instead_of_softmax": true,
    "attention_GIN_MLP_GIN_use_softmax": false,
    "attention_GIN_MLP_attention_mix_mode": "A",
    "attention_GIN_MLP_multiplier": 2,
    "attention_GIN_MLP_o_proj_at_end": false,
    "attention_GIN_MLP_scoring_hidden_dim": 512,
    "attention_GIN_MLP_second_order_factor": 0.1,
    "attention_GIN_MLP_separate_attention": false,
    "attention_GIN_MLP_use_scoring_fnct": true,
    "attention_GIN_MLP_use_second_order": false,
    "attention_epsilon_strategy": "default",
    "attention_epsilon_uniform_value": 0.5,
    "combined_norm": false,
    "continuous_transform_alpha": 10.0,
    "distance_scaling_method": "power",
    "distance_weight_strength": 1.0,
    "dropout": 0.1,
    "enforce_causality": true,
    "epsilon_threshold": 0.6,
    "gnn_logic": "before_MLP",
    "gnn_mode": "single",
    "gnn_residual": false,
    "gnn_type": "causal_gin",
    "group_tokens_for_coarse_graining": false,
    "hidden_dim": 155,
    "hierarchical_enc_dec_type": "PerceiverAR",
    "initial_sharpening_value": 1.0,
    "lambda_GNN": 0.5,
    "lambda_GNN_initial": 0.0,
    "learnable_aggregate_activation": "softmax",
    "max_position_embeddings": 2048,
    "mix_weights_initial": 0.5,
    "model_type": "",
    "norm_to_hidden_states": false,
    "num_latent_layers": 4,
    "num_latents": 32,
    "num_latents_list": [
      64,
      32,
      8
    ],
    "num_layers": 1,
    "per_head_ff": false,
    "plot_for_debugging": false,
    "remove_self_connections": false,
    "residual_epsilon_strategy": "default",
    "residual_epsilon_uniform_value": 0.1,
    "rms_norm_eps": 1e-05,
    "sharpening_value_init": "value",
    "soft_masking_initial_threshold": 0.01,
    "soft_masking_k": 10.0,
    "threshold": 0.1,
    "threshold_any_tau": 0.1,
    "tokenizer": null,
    "top_k": 8,
    "use_GNN_from_attention": "none",
    "use_GNN_from_attention_add_RoPE_at_every_layer": false,
    "use_differential_attention": false,
    "use_differential_attention_group_norm": false,
    "use_distance_scaling": false,
    "use_fixed_number_of_tokens_per_latent": false,
    "use_graph_property_modulation": false,
    "use_graph_property_modulation_with_norm": false,
    "use_graph_property_modulation_with_norm_use_causal_clustering": true,
    "use_hierarchical_attention": false,
    "use_layer_norm": true,
    "use_layer_norm_in_GIN_MLP": false,
    "use_no_norm_in_GIN_MLP": false,
    "use_original_hidden_states": false,
    "use_original_hidden_states_add_attention": false,
    "use_projection": true,
    "use_sharpening": false,
    "use_soft_masking": false,
    "zero_below_epsilon_threshold": true
  },
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 128256
}