|
{
  "architectures": [
    "LlamaForCausalLMWithGNN"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "gnn_config": {
    "GIN_after_attention": true,
    "GIN_after_attention_pre_GIN_norm": true,
    "GIN_after_attention_skip": true,
    "GIN_edge_weight_scaling": true,
    "GIN_hidden_dim_multiplier": 1,
    "GIN_use_MLP": true,
    "GIN_use_norm": false,
    "LlamaAttentionHierarchicalPerceiverAR_use_rope": true,
    "LlamaAttentionHierarchicalVariant_2_PerceiverAR_use_skip": true,
    "MLP_type": "standard_MLP",
    "N_GNN_from_attention_layers": 3,
    "activation": "prelu",
    "add_rope": false,
    "adj_construction_method": "sum",
    "adj_transform_hidden_dim": 128,
    "attention_GIN_MLP_GIN_MLP_mode": "shared",
    "attention_GIN_MLP_GIN_MLP_pre_aggregate": true,
    "attention_GIN_MLP_GIN_binary_scale": 1.0,
    "attention_GIN_MLP_GIN_fuse_mode": "epsilon",
    "attention_GIN_MLP_GIN_learnable_threshold": false,
    "attention_GIN_MLP_GIN_mode": "default",
    "attention_GIN_MLP_GIN_sharp_softplus_beta": 10.0,
    "attention_GIN_MLP_GIN_softmax_temperature": 1.0,
    "attention_GIN_MLP_GIN_threshold_mode": "none",
    "attention_GIN_MLP_GIN_threshold_value": 0.2,
    "attention_GIN_MLP_GIN_top_k_fraction_of_sequence_length": 0.1,
    "attention_GIN_MLP_GIN_use_ReLU_instead_of_softmax": true,
    "attention_GIN_MLP_GIN_use_softmax": false,
    "attention_GIN_MLP_attention_mix_mode": "A",
    "attention_GIN_MLP_multiplier": 2,
    "attention_GIN_MLP_o_proj_at_end": false,
    "attention_GIN_MLP_scoring_hidden_dim": 512,
    "attention_GIN_MLP_second_order_factor": 0.1,
    "attention_GIN_MLP_separate_attention": false,
    "attention_GIN_MLP_use_scoring_fnct": true,
    "attention_GIN_MLP_use_second_order": false,
    "attention_epsilon_strategy": "default",
    "attention_epsilon_uniform_value": 0.5,
    "combined_norm": false,
    "continuous_transform_alpha": 10.0,
    "distance_scaling_method": "power",
    "distance_weight_strength": 1.0,
    "dropout": 0.1,
    "enforce_causality": true,
    "epsilon_threshold": 0.6,
    "gnn_logic": "before_MLP",
    "gnn_mode": "single",
    "gnn_residual": false,
    "gnn_type": "causal_gin",
    "group_tokens_for_coarse_graining": false,
    "hidden_dim": 155,
    "hierarchical_enc_dec_type": "PerceiverAR",
    "initial_sharpening_value": 1.0,
    "lambda_GNN": 0.5,
    "lambda_GNN_initial": 0.0,
    "learnable_aggregate_activation": "softmax",
    "max_position_embeddings": 2048,
    "mix_weights_initial": 0.5,
    "model_type": "",
    "norm_to_hidden_states": false,
    "num_latent_layers": 4,
    "num_latents": 32,
    "num_latents_list": [
      64,
      32,
      8
    ],
    "num_layers": 1,
    "per_head_ff": false,
    "plot_for_debugging": false,
    "remove_self_connections": false,
    "residual_epsilon_strategy": "default",
    "residual_epsilon_uniform_value": 0.1,
    "rms_norm_eps": 1e-05,
    "sharpening_value_init": "value",
    "soft_masking_initial_threshold": 0.01,
    "soft_masking_k": 10.0,
    "threshold": 0.1,
    "threshold_any_tau": 0.1,
    "tokenizer": null,
    "top_k": 8,
    "use_GNN_from_attention": "none",
    "use_GNN_from_attention_add_RoPE_at_every_layer": false,
    "use_differential_attention": false,
    "use_differential_attention_group_norm": false,
    "use_distance_scaling": false,
    "use_fixed_number_of_tokens_per_latent": false,
    "use_graph_property_modulation": false,
    "use_graph_property_modulation_with_norm": false,
    "use_graph_property_modulation_with_norm_use_causal_clustering": true,
    "use_hierarchical_attention": false,
    "use_layer_norm": true,
    "use_layer_norm_in_GIN_MLP": false,
    "use_no_norm_in_GIN_MLP": false,
    "use_original_hidden_states": false,
    "use_original_hidden_states_add_attention": false,
    "use_projection": true,
    "use_sharpening": false,
    "use_soft_masking": false,
    "zero_below_epsilon_threshold": true
  },
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 128256
}
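For reference, a minimal Python sketch of how this file might be inspected programmatically. It assumes the JSON above is saved as config.json; note that the LlamaForCausalLMWithGNN architecture it names requires the accompanying custom modeling code and is not part of stock transformers, so only plain JSON loading is shown here.

import json

# Load the configuration (the path "config.json" is an assumption for illustration).
with open("config.json") as f:
    cfg = json.load(f)

# Top-level keys follow the standard Llama config layout.
assert cfg["model_type"] == "llama"
print(cfg["hidden_size"], cfg["num_hidden_layers"], cfg["num_attention_heads"])

# The GNN-specific settings are nested under "gnn_config".
gnn = cfg["gnn_config"]
print(gnn["gnn_type"], gnn["gnn_logic"], gnn["num_latents_list"])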
|
|