Text-to-Video
weepiess2383 committed on
Commit
c30c5bd
·
verified ·
1 Parent(s): 730e11e

Upload folder using huggingface_hub

Browse files
35000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cd81f7ccd798d940e149d812e94f422df9e3de96ae833a10988c2edd14052bb
3
+ size 23436481678
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ 35000
model_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "model_class": "SATVideoDiffusionEngine",
3
+ "model_parallel_size": 1
4
+ }
training_config.yaml ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ scale_factor: 1.15258426
3
+ disable_first_stage_autocast: true
4
+ log_keys:
5
+ - txt
6
+ denoiser_config:
7
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
8
+ params:
9
+ num_idx: 1000
10
+ quantize_c_noise: false
11
+ weighting_config:
12
+ target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
13
+ scaling_config:
14
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
15
+ discretization_config:
16
+ target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
17
+ params:
18
+ shift_scale: 3.0
19
+ network_config:
20
+ target: dit_video_concat.DiffusionTransformer
21
+ params:
22
+ time_embed_dim: 512
23
+ elementwise_affine: true
24
+ num_frames: 49
25
+ time_compressed_rate: 4
26
+ latent_width: 90
27
+ latent_height: 60
28
+ num_layers: 30
29
+ patch_size: 2
30
+ in_channels: 16
31
+ out_channels: 16
32
+ hidden_size: 1920
33
+ adm_in_channels: 256
34
+ num_attention_heads: 30
35
+ transformer_args:
36
+ checkpoint_activations: true
37
+ vocab_size: 1
38
+ max_sequence_length: 64
39
+ layernorm_order: pre
40
+ skip_init: false
41
+ model_parallel_size: 1
42
+ is_decoder: false
43
+ modules:
44
+ pos_embed_config:
45
+ target: dit_video_concat.Basic3DPositionEmbeddingMixin
46
+ params:
47
+ text_length: 226
48
+ height_interpolation: 1.875
49
+ width_interpolation: 1.875
50
+ patch_embed_config:
51
+ target: dit_video_concat.ImagePatchEmbeddingMixin
52
+ params:
53
+ text_hidden_size: 4096
54
+ adaln_layer_config:
55
+ target: dit_video_concat.AdaLNMixin
56
+ params:
57
+ qk_ln: true
58
+ final_layer_config:
59
+ target: dit_video_concat.FinalLayerMixin
60
+ conditioner_config:
61
+ target: sgm.modules.GeneralConditioner
62
+ params:
63
+ emb_models:
64
+ - is_trainable: false
65
+ input_key: txt
66
+ ucg_rate: 0.1
67
+ target: sgm.modules.encoders.modules.FrozenT5Embedder
68
+ params:
69
+ model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl
70
+ max_length: 226
71
+ first_stage_config:
72
+ target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
73
+ params:
74
+ cp_size: 1
75
+ ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt
76
+ ignore_keys:
77
+ - loss
78
+ loss_config:
79
+ target: torch.nn.Identity
80
+ regularizer_config:
81
+ target: vae_modules.regularizers.DiagonalGaussianRegularizer
82
+ encoder_config:
83
+ target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
84
+ params:
85
+ double_z: true
86
+ z_channels: 16
87
+ resolution: 256
88
+ in_channels: 3
89
+ out_ch: 3
90
+ ch: 128
91
+ ch_mult:
92
+ - 1
93
+ - 2
94
+ - 2
95
+ - 4
96
+ attn_resolutions: []
97
+ num_res_blocks: 3
98
+ dropout: 0.0
99
+ gather_norm: true
100
+ decoder_config:
101
+ target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
102
+ params:
103
+ double_z: true
104
+ z_channels: 16
105
+ resolution: 256
106
+ in_channels: 3
107
+ out_ch: 3
108
+ ch: 128
109
+ ch_mult:
110
+ - 1
111
+ - 2
112
+ - 2
113
+ - 4
114
+ attn_resolutions: []
115
+ num_res_blocks: 3
116
+ dropout: 0.0
117
+ gather_norm: false
118
+ loss_fn_config:
119
+ target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
120
+ params:
121
+ offset_noise_level: 0
122
+ sigma_sampler_config:
123
+ target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
124
+ params:
125
+ uniform_sampling: true
126
+ num_idx: 1000
127
+ discretization_config:
128
+ target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
129
+ params:
130
+ shift_scale: 3.0
131
+ sampler_config:
132
+ target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
133
+ params:
134
+ num_steps: 50
135
+ verbose: true
136
+ discretization_config:
137
+ target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
138
+ params:
139
+ shift_scale: 3.0
140
+ guider_config:
141
+ target: sgm.modules.diffusionmodules.guiders.DynamicCFG
142
+ params:
143
+ scale: 6
144
+ exp: 5
145
+ num_steps: 50
146
+ args:
147
+ checkpoint_activations: true
148
+ model_parallel_size: 1
149
+ experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue
150
+ mode: finetune
151
+ load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08
152
+ no_load_rng: true
153
+ train_iters: 100000
154
+ eval_iters: 1
155
+ eval_interval: 100
156
+ eval_batch_size: 1
157
+ save: ckpts_2b_lora
158
+ save_interval: 1000
159
+ log_interval: 20
160
+ train_data:
161
+ - /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json
162
+ valid_data:
163
+ - /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json
164
+ split: 1,0,0
165
+ num_workers: 8
166
+ force_train: true
167
+ only_log_video_latents: true
168
+ data:
169
+ target: data_video.PetrelDataset
170
+ params:
171
+ video_size:
172
+ - 480
173
+ - 720
174
+ fps: 8
175
+ max_num_frames: 49
176
+ skip_frms_num: 3.0
177
+ deepspeed:
178
+ train_micro_batch_size_per_gpu: 2
179
+ gradient_accumulation_steps: 1
180
+ steps_per_print: 50
181
+ gradient_clipping: 0.1
182
+ zero_optimization:
183
+ stage: 2
184
+ cpu_offload: false
185
+ contiguous_gradients: false
186
+ overlap_comm: true
187
+ reduce_scatter: true
188
+ reduce_bucket_size: 1000000000
189
+ allgather_bucket_size: 1000000000
190
+ load_from_fp32_weights: false
191
+ zero_allow_untested_optimizer: true
192
+ bf16:
193
+ enabled: false
194
+ fp16:
195
+ enabled: true
196
+ loss_scale: 0
197
+ loss_scale_window: 400
198
+ hysteresis: 2
199
+ min_loss_scale: 1
200
+ optimizer:
201
+ type: sat.ops.FusedEmaAdam
202
+ params:
203
+ lr: 2.0e-05
204
+ betas:
205
+ - 0.9
206
+ - 0.95
207
+ eps: 1.0e-08
208
+ weight_decay: 0.0001
209
+ activation_checkpointing:
210
+ partition_activations: false
211
+ contiguous_memory_optimization: false
212
+ wall_clock_breakdown: false