---
# Combined model / run / data / DeepSpeed configuration.
#
# NOTE(review): this file was recovered from a copy whose indentation had been
# lost and whose lines carried stray "|" characters. Nesting below was rebuilt
# from the `target:` / `params:` convention and the original key order; every
# key and value is unchanged. Please confirm the nesting against the loader.
#
# Several values are absolute, site-specific paths (/mnt/lustre/...,
# /mnt/petrelfs/...); they must be adjusted when running elsewhere.

model:
  scale_factor: 1.15258426
  disable_first_stage_autocast: true
  log_keys:
    - txt

  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: false
      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      # The same discretization (shift_scale: 3.0) is repeated under
      # loss_fn_config and sampler_config below.
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0

  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: true
      num_frames: 49
      time_compressed_rate: 4
      latent_width: 90
      latent_height: 60
      num_layers: 30
      patch_size: 2
      in_channels: 16
      out_channels: 16
      hidden_size: 1920
      adm_in_channels: 256
      num_attention_heads: 30

      transformer_args:
        checkpoint_activations: true
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      modules:
        pos_embed_config:
          target: dit_video_concat.Basic3DPositionEmbeddingMixin
          params:
            # Same value as conditioner max_length (226) further down.
            text_length: 226
            height_interpolation: 1.875
            width_interpolation: 1.875

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: true

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            # Site-specific absolute path.
            model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl
            max_length: 226

  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      # Site-specific absolute path.
      ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt
      ignore_keys:
        - loss

      loss_config:
        target: torch.nn.Identity

      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer

      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 2
            - 4
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: true

      # Identical to encoder_config params except for gather_norm.
      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 2
            - 4
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: false

  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: true
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 3.0

  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      num_steps: 50
      verbose: true

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0

      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          # Same value as the sampler's num_steps above.
          num_steps: 50

args:
  checkpoint_activations: true
  model_parallel_size: 1
  experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue
  mode: finetune
  # Site-specific absolute path.
  load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08
  no_load_rng: true
  train_iters: 100000
  eval_iters: 1
  eval_interval: 100
  eval_batch_size: 1
  # Relative path, unlike `load` above.
  save: ckpts_2b_lora
  save_interval: 1000
  log_interval: 20
  train_data:
    - /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json
  valid_data:
    - /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json
  # Quoted so the comma-separated value is always read as a string.
  split: '1,0,0'
  num_workers: 8
  force_train: true
  only_log_video_latents: true

data:
  target: data_video.PetrelDataset
  params:
    video_size:
      - 480
      - 720
    fps: 8
    # Same value as model.network_config.params.num_frames (49).
    max_num_frames: 49
    skip_frms_num: 3.0

deepspeed:
  train_micro_batch_size_per_gpu: 2
  gradient_accumulation_steps: 1
  steps_per_print: 50
  gradient_clipping: 0.1
  zero_optimization:
    stage: 2
    cpu_offload: false
    contiguous_gradients: false
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 1000000000
    allgather_bucket_size: 1000000000
    load_from_fp32_weights: false
  # NOTE(review): placed directly under `deepspeed`; the flattened source
  # could also be read as nesting this inside `zero_optimization` - confirm.
  zero_allow_untested_optimizer: true
  bf16:
    enabled: false
  fp16:
    enabled: true
  # NOTE(review): the four loss-scaling keys below are placed directly under
  # `deepspeed`; the flattened source cannot distinguish this from nesting
  # them inside `fp16`. Confirm against the DeepSpeed config schema.
  loss_scale: 0
  loss_scale_window: 400
  hysteresis: 2
  min_loss_scale: 1
  optimizer:
    type: sat.ops.FusedEmaAdam
    params:
      lr: 2.0e-05
      betas:
        - 0.9
        - 0.95
      eps: 1.0e-08
      weight_decay: 0.0001
  activation_checkpointing:
    partition_activations: false
    contiguous_memory_optimization: false
  wall_clock_breakdown: false
|