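# NOTE: the text "Spaces: Build error" preceded this file when it was captured;
# it appears to be page residue, not a configuration key, so it is kept only as a comment.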
# Base config reference and the dotted path of the task class.
# NOTE(review): how base_config is resolved/merged is decided by the loader,
# which is not visible here — confirm the path is relative to this file.
base_config: ./base.yaml
task_cls: tasks.tts.fs2.FastSpeech2Task
# model
# Shared hidden width, dropout rate, and encoder/decoder type selectors.
# The inline comments list the accepted values for each selector.
hidden_size: 256
dropout: 0.1
encoder_type: fft  # rel_fft|fft|tacotron|tacotron2|conformer
decoder_type: fft  # fft|rnn|conv|conformer|wn
# rnn enc/dec
# Settings for the RNN-style encoder/decoder variants.
encoder_K: 8
decoder_rnn_dim: 0  # for rnn decoder; 0 -> hidden_size * 2
# fft enc/dec
# Settings for the fft encoder/decoder variants (the types selected by
# encoder_type/decoder_type in this file).
use_pos_embed: true
dec_num_heads: 2
dec_layers: 4
ffn_hidden_size: 1024
enc_ffn_kernel_size: 9
dec_ffn_kernel_size: 9
# conv enc/dec
# Settings for the conv encoder/decoder variants. The dilation lists are
# real YAML sequences of integers, one entry per position.
enc_dec_norm: ln
conv_use_pos: false
layers_in_block: 2
enc_dilations: [1, 1, 1, 1]
enc_kernel_size: 5
dec_dilations: [1, 1, 1, 1]  # for conv decoder
dec_kernel_size: 5
dur_loss: mse  # alternatives: huber|mol
# duration
# Predictor sizes; the dur_* keys are specific to the duration predictor.
# NOTE(review): predictor_hidden: -1 looks like a "use default" sentinel —
# confirm against the code that reads it.
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
dur_predictor_kernel: 3
dur_predictor_layers: 2
predictor_dropout: 0.5
# pitch and energy
# Pitch normalisation/representation, the cwt_* options, pitch loss choice,
# and the switch for the energy embedding. Inline comments list accepted values.
pitch_norm: standard  # standard|log
use_pitch_embed: true
pitch_type: frame  # frame|ph|cwt
use_uv: true
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_add_f0_loss: false
cwt_std_scale: 0.8
pitch_ar: false
pitch_embed_type: 0
pitch_loss: 'l1'  # l1|l2|ssim
pitch_ssim_win: 11
use_energy_embed: false
# reference encoder and speaker embedding
use_ref_enc: false
use_var_enc: false
lambda_commit: 0.25
var_enc_vq_codes: 64
ref_norm_layer: bn
dec_inp_add_noise: false
sil_add_noise: false

# The three *_hidden_stride_kernel lists below share one entry format:
#   conv_hidden_size,conv_stride,conv_kernel_size
# conv_hidden_size=0: use hidden_size.
# Each entry is a comma-joined string (quoted so it is never read as a number).
ref_hidden_stride_kernel:
  - '0,3,5'
  - '0,3,5'
  - '0,2,5'
  - '0,2,5'
  - '0,2,5'
pitch_enc_hidden_stride_kernel:
  - '0,2,5'
  - '0,2,5'
  - '0,2,5'
dur_enc_hidden_stride_kernel:
  - '0,2,3'
  - '0,2,3'
  - '0,1,3'
# mel
# Mel loss spec: a single loss name, or weighted terms joined with "|"
# in the form name:weight. Quoted because the value contains ":" and "|".
mel_loss: 'l1:0.5|ssim:0.5'  # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
# loss lambda
# Per-term loss weights (floats), plus predictor_grad.
# NOTE(review): predictor_grad is presumably a gradient scale for the
# predictors rather than a loss weight — confirm against the consumer.
lambda_f0: 1.0
lambda_uv: 1.0
lambda_energy: 0.1
lambda_ph_dur: 0.1
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
# train and eval
# Checkpoint to start from, schedule/batch limits, ground-truth switches,
# dataset workers, and learning rate.
pretrain_fs_ckpt: ''  # empty string: no checkpoint path given
warmup_updates: 2000
max_tokens: 32000
max_sentences: 100000
max_valid_sentences: 1
max_updates: 120000
use_gt_dur: false
use_gt_f0: false
ds_workers: 2
# NOTE(review): lr of 1.0 together with warmup_updates suggests a scheduler
# that rescales it — confirm before changing.
lr: 1.0