File size: 6,052 Bytes
c30c5bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
model:
scale_factor: 1.15258426
disable_first_stage_autocast: true
log_keys:
- txt
denoiser_config:
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
params:
num_idx: 1000
quantize_c_noise: false
weighting_config:
target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
scaling_config:
target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
params:
shift_scale: 3.0
network_config:
target: dit_video_concat.DiffusionTransformer
params:
time_embed_dim: 512
elementwise_affine: true
num_frames: 49
time_compressed_rate: 4
latent_width: 90
latent_height: 60
num_layers: 30
patch_size: 2
in_channels: 16
out_channels: 16
hidden_size: 1920
adm_in_channels: 256
num_attention_heads: 30
transformer_args:
checkpoint_activations: true
vocab_size: 1
max_sequence_length: 64
layernorm_order: pre
skip_init: false
model_parallel_size: 1
is_decoder: false
modules:
pos_embed_config:
target: dit_video_concat.Basic3DPositionEmbeddingMixin
params:
text_length: 226
height_interpolation: 1.875
width_interpolation: 1.875
patch_embed_config:
target: dit_video_concat.ImagePatchEmbeddingMixin
params:
text_hidden_size: 4096
adaln_layer_config:
target: dit_video_concat.AdaLNMixin
params:
qk_ln: true
final_layer_config:
target: dit_video_concat.FinalLayerMixin
conditioner_config:
target: sgm.modules.GeneralConditioner
params:
emb_models:
- is_trainable: false
input_key: txt
ucg_rate: 0.1
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl
max_length: 226
first_stage_config:
target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
params:
cp_size: 1
ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt
ignore_keys:
- loss
loss_config:
target: torch.nn.Identity
regularizer_config:
target: vae_modules.regularizers.DiagonalGaussianRegularizer
encoder_config:
target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
params:
double_z: true
z_channels: 16
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 2
- 4
attn_resolutions: []
num_res_blocks: 3
dropout: 0.0
gather_norm: true
decoder_config:
target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
params:
double_z: true
z_channels: 16
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 2
- 4
attn_resolutions: []
num_res_blocks: 3
dropout: 0.0
gather_norm: false
loss_fn_config:
target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
params:
offset_noise_level: 0
sigma_sampler_config:
target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
params:
uniform_sampling: true
num_idx: 1000
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
params:
shift_scale: 3.0
sampler_config:
target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
params:
num_steps: 50
verbose: true
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
params:
shift_scale: 3.0
guider_config:
target: sgm.modules.diffusionmodules.guiders.DynamicCFG
params:
scale: 6
exp: 5
num_steps: 50
args:
checkpoint_activations: true
model_parallel_size: 1
experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue
mode: finetune
load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08
no_load_rng: true
train_iters: 100000
eval_iters: 1
eval_interval: 100
eval_batch_size: 1
save: ckpts_2b_lora
save_interval: 1000
log_interval: 20
train_data:
- /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json
valid_data:
- /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json
split: 1,0,0
num_workers: 8
force_train: true
only_log_video_latents: true
data:
target: data_video.PetrelDataset
params:
video_size:
- 480
- 720
fps: 8
max_num_frames: 49
skip_frms_num: 3.0
deepspeed:
train_micro_batch_size_per_gpu: 2
gradient_accumulation_steps: 1
steps_per_print: 50
gradient_clipping: 0.1
zero_optimization:
stage: 2
cpu_offload: false
contiguous_gradients: false
overlap_comm: true
reduce_scatter: true
reduce_bucket_size: 1000000000
allgather_bucket_size: 1000000000
load_from_fp32_weights: false
zero_allow_untested_optimizer: true
bf16:
enabled: false
fp16:
enabled: true
loss_scale: 0
loss_scale_window: 400
hysteresis: 2
min_loss_scale: 1
optimizer:
type: sat.ops.FusedEmaAdam
params:
lr: 2.0e-05
betas:
- 0.9
- 0.95
eps: 1.0e-08
weight_decay: 0.0001
activation_checkpointing:
partition_activations: false
contiguous_memory_optimization: false
wall_clock_breakdown: false
|