Text-to-Video
File size: 6,052 Bytes
c30c5bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# Model definition. Every `target` below is a dotted Python path and the
# sibling `params` mapping holds its settings (presumably passed as constructor
# arguments by the config loader — confirm against the consuming code).
model:
  # NOTE(review): looks like the latent scaling constant paired with the VAE in
  # first_stage_config — confirm before changing either one.
  scale_factor: 1.15258426
  disable_first_stage_autocast: true
  # Only the `txt` key is listed for logging.
  log_keys:
  - txt
  # Denoiser: DiscreteDenoiser over 1000 indices, with pluggable weighting,
  # scaling and discretization sub-configs.
  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      # Same value as loss_fn_config...sigma_sampler_config.params.num_idx;
      # presumably the two must stay equal — TODO confirm.
      num_idx: 1000
      quantize_c_noise: false
      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      # The same discretization (ZeroSNRDDPMDiscretization, shift_scale 3.0) is
      # repeated under loss_fn_config and sampler_config; update all three
      # together.
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0
  # Diffusion backbone: a transformer assembled from the mixins listed under
  # `modules`.
  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: true
      # Equals data.params.max_num_frames (49).
      num_frames: 49
      time_compressed_rate: 4
      # 90 x 60 equals data.params.video_size (480, 720) divided by 8 —
      # presumably the VAE's spatial downsampling factor; TODO confirm.
      latent_width: 90
      latent_height: 60
      num_layers: 30
      patch_size: 2
      # in_channels / out_channels equal z_channels (16) of the VAE below.
      in_channels: 16
      out_channels: 16
      # hidden_size / num_attention_heads = 1920 / 30 = 64.
      hidden_size: 1920
      adm_in_channels: 256
      num_attention_heads: 30
      transformer_args:
        # checkpoint_activations and model_parallel_size are also set, with the
        # same values, under the top-level `args` section.
        checkpoint_activations: true
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false
      modules:
        pos_embed_config:
          target: dit_video_concat.Basic3DPositionEmbeddingMixin
          params:
            # Equals the text encoder's max_length (226) in conditioner_config.
            text_length: 226
            height_interpolation: 1.875
            width_interpolation: 1.875
        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            # NOTE(review): presumably the output width of the T5 encoder
            # configured in conditioner_config — confirm.
            text_hidden_size: 4096
        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: true
        final_layer_config:
          target: dit_video_concat.FinalLayerMixin
  # Conditioning: a single, non-trainable text embedder reading the `txt` key.
  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
      - is_trainable: false
        input_key: txt
        # NOTE(review): presumably the probability of dropping the text
        # condition during training (for classifier-free guidance) — confirm.
        ucg_rate: 0.1
        target: sgm.modules.encoders.modules.FrozenT5Embedder
        params:
          # Absolute, machine-specific path; must be adjusted on other hosts.
          model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl
          max_length: 226
  # First stage (video autoencoder). Encoder and decoder share the same
  # architecture parameters except for `gather_norm`.
  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      # Absolute, machine-specific path; must be adjusted on other hosts.
      ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt
      ignore_keys:
      - loss
      # torch.nn.Identity is used as the loss target here.
      loss_config:
        target: torch.nn.Identity
      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer
      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 2
          - 4
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: true
      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 2
          - 4
          attn_resolutions: []
          num_res_blocks: 3
          dropout: 0.0
          # Differs from the encoder (true) — NOTE(review): looks intentional,
          # confirm before "fixing".
          gather_norm: false
  # Training loss: VideoDiffusionLoss with uniformly sampled discrete sigmas.
  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: true
          # Same value as denoiser_config.params.num_idx.
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 3.0
  # Sampler used when generating videos (e.g. for evaluation logging).
  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      num_steps: 50
      verbose: true
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0
      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          # Equals the sampler's num_steps above; presumably must match —
          # TODO confirm.
          num_steps: 50
# Run-level arguments: experiment naming, checkpoint load/save, schedule and
# dataset manifests.
args:
  # Both flags below repeat the values set under
  # model.network_config.params.transformer_args.
  checkpoint_activations: true
  model_parallel_size: 1
  experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue
  mode: finetune
  # Absolute, machine-specific checkpoint directory; its name starts with
  # experiment_name, i.e. this run resumes from an earlier run of the same
  # experiment.
  load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08
  no_load_rng: true
  train_iters: 100000
  eval_iters: 1
  eval_interval: 100
  eval_batch_size: 1
  # Relative path — resolved against the working directory of the launcher.
  save: ckpts_2b_lora
  save_interval: 1000
  log_interval: 20
  # Dataset manifests are absolute paths on two different mounts
  # (/mnt/petrelfs for training, /mnt/lustre for validation).
  train_data:
  - /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json
  valid_data:
  - /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json
  # Plain scalar containing commas, so it loads as the string "1,0,0" rather
  # than a list. NOTE(review): presumably train/valid/test proportions parsed
  # by the consumer — confirm.
  split: 1,0,0
  num_workers: 8
  force_train: true
  only_log_video_latents: true
# Dataset class and the clip geometry it should produce.
data:
  target: data_video.PetrelDataset
  params:
    # NOTE(review): presumably (height, width) in pixels — confirm against the
    # dataset class. 480 x 720 is 8x the model's latent_height x latent_width
    # (60 x 90).
    video_size:
    - 480
    - 720
    fps: 8
    # Equals model.network_config.params.num_frames (49).
    max_num_frames: 49
    # Written as a float (3.0) although the name suggests a frame count —
    # NOTE(review): confirm the consumer accepts a float here.
    skip_frms_num: 3.0
# DeepSpeed engine configuration (batching, ZeRO, mixed precision, optimizer).
deepspeed:
  train_micro_batch_size_per_gpu: 2
  gradient_accumulation_steps: 1
  steps_per_print: 50
  gradient_clipping: 0.1
  zero_optimization:
    stage: 2
    # NOTE(review): newer DeepSpeed releases document `offload_optimizer`
    # instead of `cpu_offload` — confirm this key is still honoured by the
    # installed version.
    cpu_offload: false
    contiguous_gradients: false
    overlap_comm: true
    reduce_scatter: true
    # Both bucket sizes are 1e9.
    reduce_bucket_size: 1000000000
    allgather_bucket_size: 1000000000
    load_from_fp32_weights: false
  # NOTE(review): presumably required because the optimizer below is not one
  # of DeepSpeed's built-in types — confirm.
  zero_allow_untested_optimizer: true
  # Mixed precision: fp16 is on and bf16 is off; they are mutually exclusive
  # choices, so flip both together.
  bf16:
    enabled: false
  fp16:
    enabled: true
  # NOTE(review): the four loss-scaling keys below are siblings of `fp16`, not
  # children of it. DeepSpeed documents loss_scale / loss_scale_window /
  # hysteresis / min_loss_scale as options inside the `fp16` mapping, so at
  # this level they may be ignored — confirm how the consumer reads them
  # before relying on loss_scale_window: 400.
  loss_scale: 0
  loss_scale_window: 400
  hysteresis: 2
  min_loss_scale: 1
  optimizer:
    type: sat.ops.FusedEmaAdam
    params:
      # 2.0e-05 matches the "0.00002lr" fragment in args.experiment_name.
      lr: 2.0e-05
      betas:
      - 0.9
      - 0.95
      eps: 1.0e-08
      weight_decay: 0.0001
  activation_checkpointing:
    partition_activations: false
    contiguous_memory_optimization: false
  wall_clock_breakdown: false