{
  "architectures": [
    "LlamaForCausalLMWithGNN"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "gnn_config": {
    "GIN_after_attention": true,
    "GIN_after_attention_pre_GIN_norm": true,
    "GIN_after_attention_skip": true,
    "GIN_edge_weight_scaling": true,
    "GIN_hidden_dim_multiplier": 1,
    "GIN_use_MLP": true,
    "GIN_use_norm": false,
    "LlamaAttentionHierarchicalPerceiverAR_use_rope": true,
    "LlamaAttentionHierarchicalVariant_2_PerceiverAR_use_skip": true,
    "MLP_type": "standard_MLP",
    "N_GNN_from_attention_layers": 3,
    "activation": "prelu",
    "add_rope": false,
    "adj_construction_method": "sum",
    "adj_transform_hidden_dim": 128,
    "attention_GIN_MLP_GIN_MLP_mode": "shared",
    "attention_GIN_MLP_GIN_MLP_pre_aggregate": true,
    "attention_GIN_MLP_GIN_binary_scale": 1.0,
    "attention_GIN_MLP_GIN_fuse_mode": "epsilon",
    "attention_GIN_MLP_GIN_learnable_threshold": false,
    "attention_GIN_MLP_GIN_mode": "default",
    "attention_GIN_MLP_GIN_sharp_softplus_beta": 10.0,
    "attention_GIN_MLP_GIN_softmax_temperature": 1.0,
    "attention_GIN_MLP_GIN_threshold_mode": "none",
    "attention_GIN_MLP_GIN_threshold_value": 0.2,
    "attention_GIN_MLP_GIN_top_k_fraction_of_sequence_length": 0.1,
    "attention_GIN_MLP_GIN_use_ReLU_instead_of_softmax": true,
    "attention_GIN_MLP_GIN_use_softmax": false,
    "attention_GIN_MLP_attention_mix_mode": "A",
    "attention_GIN_MLP_multiplier": 2,
    "attention_GIN_MLP_o_proj_at_end": false,
    "attention_GIN_MLP_scoring_hidden_dim": 512,
    "attention_GIN_MLP_second_order_factor": 0.1,
    "attention_GIN_MLP_separate_attention": false,
    "attention_GIN_MLP_use_scoring_fnct": true,
    "attention_GIN_MLP_use_second_order": false,
    "attention_epsilon_strategy": "default",
    "attention_epsilon_uniform_value": 0.5,
    "combined_norm": false,
    "continuous_transform_alpha": 10.0,
    "distance_scaling_method": "power",
    "distance_weight_strength": 1.0,
    "dropout": 0.1,
    "enforce_causality": true,
    "epsilon_threshold": 0.6,
    "gnn_logic": "before_MLP",
    "gnn_mode": "single",
    "gnn_residual": false,
    "gnn_type": "causal_gin",
    "group_tokens_for_coarse_graining": false,
    "hidden_dim": 155,
    "hierarchical_enc_dec_type": "PerceiverAR",
    "initial_sharpening_value": 1.0,
    "lambda_GNN": 0.5,
    "lambda_GNN_initial": 0.0,
    "learnable_aggregate_activation": "softmax",
    "max_position_embeddings": 2048,
    "mix_weights_initial": 0.5,
    "model_type": "",
    "norm_to_hidden_states": false,
    "num_latent_layers": 4,
    "num_latents": 32,
    "num_latents_list": [
      64,
      32,
      8
    ],
    "num_layers": 1,
    "per_head_ff": false,
    "plot_for_debugging": false,
    "remove_self_connections": false,
    "residual_epsilon_strategy": "default",
    "residual_epsilon_uniform_value": 0.1,
    "rms_norm_eps": 1e-05,
    "sharpening_value_init": "value",
    "soft_masking_initial_threshold": 0.01,
    "soft_masking_k": 10.0,
    "threshold": 0.1,
    "threshold_any_tau": 0.1,
    "tokenizer": null,
    "top_k": 8,
    "use_GNN_from_attention": "none",
    "use_GNN_from_attention_add_RoPE_at_every_layer": false,
    "use_differential_attention": false,
    "use_differential_attention_group_norm": false,
    "use_distance_scaling": false,
    "use_fixed_number_of_tokens_per_latent": false,
    "use_graph_property_modulation": false,
    "use_graph_property_modulation_with_norm": false,
    "use_graph_property_modulation_with_norm_use_causal_clustering": true,
    "use_hierarchical_attention": false,
    "use_layer_norm": true,
    "use_layer_norm_in_GIN_MLP": false,
    "use_no_norm_in_GIN_MLP": false,
    "use_original_hidden_states": false,
    "use_original_hidden_states_add_attention": false,
    "use_projection": true,
    "use_sharpening": false,
    "use_soft_masking": false,
    "zero_below_epsilon_threshold": true
  },
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 128256
}
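
For reference, a minimal sketch of how this config might be inspected, assuming the file is saved locally as config.json. The commented-out loading step and the REPO_ID placeholder are illustrative assumptions only: instantiating the custom LlamaForCausalLMWithGNN class requires the repository's own modeling code (typically via trust_remote_code=True), which this file alone does not provide.

import json

# Inspect the Llama backbone hyperparameters and the nested GNN settings.
with open("config.json") as f:  # path assumed; adjust as needed
    cfg = json.load(f)

print(cfg["architectures"])                              # ["LlamaForCausalLMWithGNN"]
print(cfg["hidden_size"], cfg["num_hidden_layers"])      # 3072 28
print(cfg["gnn_config"]["gnn_type"],
      cfg["gnn_config"]["gnn_logic"])                    # causal_gin before_MLP

# Hypothetical loading step (REPO_ID is a placeholder, not given by this file):
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained(REPO_ID, trust_remote_code=True)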