{
  "attn_layers": [
    -1
  ],
  "attn_loss_weights": [
    10.0
  ],
  "attn_sigma": 25,
  "chunk_size": 50,
  "dim_feedforward": 3200,
  "dim_gaze_decoder": 512,
  "dim_gaze_decoder_feedforward": 3200,
  "dim_model": 512,
  "dropout": 0.1,
  "eyes": {
    "observation.left_eye": "observation.images.left_eye_cam"
  },
  "feedforward_activation": "relu",
  "freeze_backbone": true,
  "gaze_loss_weight": 1.0,
  "gaze_sigma": 50.0,
  "image_size": [
    336,
    448
  ],
  "input_normalization_modes": {
    "observation.images.left_eye_cam": "mean_std",
    "observation.images.right_eye_cam": "mean_std",
    "observation.state": "mean_std"
  },
  "input_shapes": {
    "observation.images.left_eye_cam": [
      3,
      480,
      640
    ],
    "observation.images.right_eye_cam": [
      3,
      480,
      640
    ],
    "observation.state": [
      21
    ]
  },
  "kl_weight": 10.0,
  "latent_dim": 32,
  "n_action_steps": 50,
  "n_decoder_layers": 1,
  "n_encoder_layers": 4,
  "n_gaze_decoder_layers": 1,
  "n_heads": 8,
  "n_obs_steps": 1,
  "n_vae_encoder_layers": 4,
  "output_normalization_modes": {
    "action": "mean_std"
  },
  "output_shapes": {
    "action": [
      21
    ]
  },
  "pre_norm": false,
  "pretrained_backbone_weights": "dinov2_vits14_reg",
  "replace_final_stride_with_dilation": false,
  "temporal_ensemble_coeff": null,
  "use_attn": true,
  "use_gaze": false,
  "use_vae": true,
  "vision_backbone": "dinov2"
}