# RepVideo / training_config.yaml

# Upload folder using huggingface_hub

# c30c5bd verified 4 days ago

# 6.05 kB

	model:
	scale_factor: 1.15258426
	disable_first_stage_autocast: true
	log_keys:
	- txt
	denoiser_config:
	target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
	params:
	num_idx: 1000
	quantize_c_noise: false
	weighting_config:
	target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
	scaling_config:
	target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
	discretization_config:
	target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
	params:
	shift_scale: 3.0
	network_config:
	target: dit_video_concat.DiffusionTransformer
	params:
	time_embed_dim: 512
	elementwise_affine: true
	num_frames: 49
	time_compressed_rate: 4
	latent_width: 90
	latent_height: 60
	num_layers: 30
	patch_size: 2
	in_channels: 16
	out_channels: 16
	hidden_size: 1920
	adm_in_channels: 256
	num_attention_heads: 30
	transformer_args:
	checkpoint_activations: true
	vocab_size: 1
	max_sequence_length: 64
	layernorm_order: pre
	skip_init: false
	model_parallel_size: 1
	is_decoder: false
	modules:
	pos_embed_config:
	target: dit_video_concat.Basic3DPositionEmbeddingMixin
	params:
	text_length: 226
	height_interpolation: 1.875
	width_interpolation: 1.875
	patch_embed_config:
	target: dit_video_concat.ImagePatchEmbeddingMixin
	params:
	text_hidden_size: 4096
	adaln_layer_config:
	target: dit_video_concat.AdaLNMixin
	params:
	qk_ln: true
	final_layer_config:
	target: dit_video_concat.FinalLayerMixin
	conditioner_config:
	target: sgm.modules.GeneralConditioner
	params:
	emb_models:
	- is_trainable: false
	input_key: txt
	ucg_rate: 0.1
	target: sgm.modules.encoders.modules.FrozenT5Embedder
	params:
	model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl
	max_length: 226
	first_stage_config:
	target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
	params:
	cp_size: 1
	ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt
	ignore_keys:
	- loss
	loss_config:
	target: torch.nn.Identity
	regularizer_config:
	target: vae_modules.regularizers.DiagonalGaussianRegularizer
	encoder_config:
	target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
	params:
	double_z: true
	z_channels: 16
	resolution: 256
	in_channels: 3
	out_ch: 3
	ch: 128
	ch_mult:
	- 1
	- 2
	- 2
	- 4
	attn_resolutions: []
	num_res_blocks: 3
	dropout: 0.0
	gather_norm: true
	decoder_config:
	target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
	params:
	double_z: true
	z_channels: 16
	resolution: 256
	in_channels: 3
	out_ch: 3
	ch: 128
	ch_mult:
	- 1
	- 2
	- 2
	- 4
	attn_resolutions: []
	num_res_blocks: 3
	dropout: 0.0
	gather_norm: false
	loss_fn_config:
	target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
	params:
	offset_noise_level: 0
	sigma_sampler_config:
	target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
	params:
	uniform_sampling: true
	num_idx: 1000
	discretization_config:
	target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
	params:
	shift_scale: 3.0
	sampler_config:
	target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
	params:
	num_steps: 50
	verbose: true
	discretization_config:
	target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
	params:
	shift_scale: 3.0
	guider_config:
	target: sgm.modules.diffusionmodules.guiders.DynamicCFG
	params:
	scale: 6
	exp: 5
	num_steps: 50
	args:
	checkpoint_activations: true
	model_parallel_size: 1
	experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue
	mode: finetune
	load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08
	no_load_rng: true
	train_iters: 100000
	eval_iters: 1
	eval_interval: 100
	eval_batch_size: 1
	save: ckpts_2b_lora
	save_interval: 1000
	log_interval: 20
	train_data:
	- /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json
	valid_data:
	- /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json
	split: 1,0,0
	num_workers: 8
	force_train: true
	only_log_video_latents: true
	data:
	target: data_video.PetrelDataset
	params:
	video_size:
	- 480
	- 720
	fps: 8
	max_num_frames: 49
	skip_frms_num: 3.0
	deepspeed:
	train_micro_batch_size_per_gpu: 2
	gradient_accumulation_steps: 1
	steps_per_print: 50
	gradient_clipping: 0.1
	zero_optimization:
	stage: 2
	cpu_offload: false
	contiguous_gradients: false
	overlap_comm: true
	reduce_scatter: true
	reduce_bucket_size: 1000000000
	allgather_bucket_size: 1000000000
	load_from_fp32_weights: false
	zero_allow_untested_optimizer: true
	bf16:
	enabled: false
	fp16:
	enabled: true
	loss_scale: 0
	loss_scale_window: 400
	hysteresis: 2
	min_loss_scale: 1
	optimizer:
	type: sat.ops.FusedEmaAdam
	params:
	lr: 2.0e-05
	betas:
	- 0.9
	- 0.95
	eps: 1.0e-08
	weight_decay: 0.0001
	activation_checkpointing:
	partition_activations: false
	contiguous_memory_optimization: false
	wall_clock_breakdown: false