Text-to-Video
RepVideo / training_config.yaml
weepiess2383's picture
Upload folder using huggingface_hub
c30c5bd verified
model:
scale_factor: 1.15258426
disable_first_stage_autocast: true
log_keys:
- txt
denoiser_config:
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
params:
num_idx: 1000
quantize_c_noise: false
weighting_config:
target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
scaling_config:
target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
params:
shift_scale: 3.0
network_config:
target: dit_video_concat.DiffusionTransformer
params:
time_embed_dim: 512
elementwise_affine: true
num_frames: 49
time_compressed_rate: 4
latent_width: 90
latent_height: 60
num_layers: 30
patch_size: 2
in_channels: 16
out_channels: 16
hidden_size: 1920
adm_in_channels: 256
num_attention_heads: 30
transformer_args:
checkpoint_activations: true
vocab_size: 1
max_sequence_length: 64
layernorm_order: pre
skip_init: false
model_parallel_size: 1
is_decoder: false
modules:
pos_embed_config:
target: dit_video_concat.Basic3DPositionEmbeddingMixin
params:
text_length: 226
height_interpolation: 1.875
width_interpolation: 1.875
patch_embed_config:
target: dit_video_concat.ImagePatchEmbeddingMixin
params:
text_hidden_size: 4096
adaln_layer_config:
target: dit_video_concat.AdaLNMixin
params:
qk_ln: true
final_layer_config:
target: dit_video_concat.FinalLayerMixin
conditioner_config:
target: sgm.modules.GeneralConditioner
params:
emb_models:
- is_trainable: false
input_key: txt
ucg_rate: 0.1
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl
max_length: 226
first_stage_config:
target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
params:
cp_size: 1
ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt
ignore_keys:
- loss
loss_config:
target: torch.nn.Identity
regularizer_config:
target: vae_modules.regularizers.DiagonalGaussianRegularizer
encoder_config:
target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
params:
double_z: true
z_channels: 16
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 2
- 4
attn_resolutions: []
num_res_blocks: 3
dropout: 0.0
gather_norm: true
decoder_config:
target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
params:
double_z: true
z_channels: 16
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 2
- 4
attn_resolutions: []
num_res_blocks: 3
dropout: 0.0
gather_norm: false
loss_fn_config:
target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
params:
offset_noise_level: 0
sigma_sampler_config:
target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
params:
uniform_sampling: true
num_idx: 1000
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
params:
shift_scale: 3.0
sampler_config:
target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
params:
num_steps: 50
verbose: true
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
params:
shift_scale: 3.0
guider_config:
target: sgm.modules.diffusionmodules.guiders.DynamicCFG
params:
scale: 6
exp: 5
num_steps: 50
args:
checkpoint_activations: true
model_parallel_size: 1
experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue
mode: finetune
load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08
no_load_rng: true
train_iters: 100000
eval_iters: 1
eval_interval: 100
eval_batch_size: 1
save: ckpts_2b_lora
save_interval: 1000
log_interval: 20
train_data:
- /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json
valid_data:
- /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json
split: 1,0,0
num_workers: 8
force_train: true
only_log_video_latents: true
data:
target: data_video.PetrelDataset
params:
video_size:
- 480
- 720
fps: 8
max_num_frames: 49
skip_frms_num: 3.0
deepspeed:
train_micro_batch_size_per_gpu: 2
gradient_accumulation_steps: 1
steps_per_print: 50
gradient_clipping: 0.1
zero_optimization:
stage: 2
cpu_offload: false
contiguous_gradients: false
overlap_comm: true
reduce_scatter: true
reduce_bucket_size: 1000000000
allgather_bucket_size: 1000000000
load_from_fp32_weights: false
zero_allow_untested_optimizer: true
bf16:
enabled: false
fp16:
enabled: true
loss_scale: 0
loss_scale_window: 400
hysteresis: 2
min_loss_scale: 1
optimizer:
type: sat.ops.FusedEmaAdam
params:
lr: 2.0e-05
betas:
- 0.9
- 0.95
eps: 1.0e-08
weight_decay: 0.0001
activation_checkpointing:
partition_activations: false
contiguous_memory_optimization: false
wall_clock_breakdown: false