## Config file
# Training configuration: dataset locations, dataloader parameters,
# network definition (reference-cue encoder + audio backbone) and optimizer.
# Flag-style options in this file use integers: 1 for True, 0 for False.

# Log
seed: 777
use_cuda: 1  # 1 for True, 0 for False

# dataset
# NOTE(review): the directories below are absolute, machine-specific paths;
# adjust them for the local environment.
speaker_no: 2
mix_lst_path: ./data/allData/voxceleb2/mixture_data_list_2mix_pretrain.csv
audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/voxceleb2/audio_clean
reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/  # not used
audio_sr: 16000
ref_sr: 25

# dataloader
num_workers: 4
batch_size: 2  # 4-GPU training with a total effective batch size of 8
accu_grad: 0
# per GPU; only used if accu_grad is set to 1; must be a multiple of batch_size
effec_batch_size: 4
max_length: 5  # truncate the utterances in the dataloader, in seconds

# network settings
# 'None' or a log name such as 'log_2024-07-22(18:12:13)'.
# Quoted because the value contains ':' and parentheses.
init_from: 'checkpoints/log_2024-09-30(09:49:14)'
causal: 0  # 1 for True, 0 for False
network_reference:
  cue: lip  # lip or speech or gesture or EEG
  backbone: resnet18  # resnet18 or shufflenetV2 or blazenet64
  emb_size: 256  # resnet18:256
network_audio:
  backbone: mossformer2
  encoder_kernel_size: 16
  encoder_out_nchannels: 512
  encoder_in_nchannels: 1

  masknet_numspks: 1
  masknet_chunksize: 250
  masknet_numlayers: 1
  masknet_norm: "ln"
  masknet_useextralinearlayer: False
  masknet_extraskipconnection: True

  intra_numlayers: 24
  intra_nhead: 8
  intra_dffn: 1024
  intra_dropout: 0
  intra_use_positional: True
  intra_norm_before: True

# optimizer
loss_type: hybrid  # "snr", "sisdr", "hybrid"
init_learning_rate: 0.00015
max_epoch: 150
clip_grad_norm: 5