# Seed needs to be set at top of yaml, before objects with parameters
# are instantiated
seed: 1994
__set_seed: !apply:torch.manual_seed [!ref <seed>]

skip_training: True

output_folder: !ref output_folder_seq2seq_cv_podcast_arhiv_augmentation_128_emb_5000_vocab
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
lm_folder: LM/output_folder_lm

# Data files
data_folder: "../../data/combined_data/speechbrain_splits"

wav2vec2_hub: facebook/wav2vec2-large-xlsr-53
wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint

# pretrained_tokenizer_path: "Tokenizer/output_folder_cv/1K_subword_unigram" # Use this for the CV model
pretrained_tokenizer_path: "Tokenizer/output_folder_cv_podcast_arhiv/5K_subword_unigram" # Use this for the CV+Podcast+Arhiv model

####################### Training Parameters ####################################
number_of_epochs: 50
number_of_ctc_epochs: 15
# batch_size: 16
# batch_size: 6  # for cv+podcast
batch_size: 6  # for cv+podcast+arhiv
label_smoothing: 0.1
lr: 0.0001
ctc_weight: 0.5

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

# Dataloader options
num_workers: 4
train_dataloader_opts:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: 1

####################### Model Parameters #######################################
dropout: 0.15
wav2vec_output_dim: 1024
emb_size: 128
dec_neurons: 1024
dec_layers: 1
output_neurons: 5000
blank_index: 0
bos_index: 0
eos_index: 0
unk_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 10
test_beam_size: 10
using_eos_threshold: True
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 300
temperature: 1.0
ctc_window_size: 200
temperature_lm: 1.25

# Scoring parameters
ctc_weight_decode: 0.0
coverage_penalty: 1.5
lm_weight: 0.0

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# Wav2vec2 encoder
encoder_w2v2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: False
    freeze_feature_extractor: True
    save_path: !ref <wav2vec2_folder>
    output_all_hiddens: False

embedding: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

# Attention-based RNN decoder.
decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <wav2vec_output_dim>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_neurons>
    attn_dim: 512
    num_layers: !ref <dec_layers>
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <wav2vec_output_dim>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>

nll_cost: !name:speechbrain.nnet.losses.nll_loss
    label_smoothing: 0.1

# This is the RNNLM that is used according to the Huggingface repository
# NB: It has to match the pre-trained RNNLM!!
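# NOTE: In this configuration the RNNLM block below stays commented out and
# lm_weight is set to 0.0 above, so decoding runs without language-model
# rescoring. To enable shallow fusion, uncomment this block together with the
# scorer definitions and the pretrainer at the bottom of the file.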
#lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
#    output_neurons: !ref <output_neurons>
#    embedding_dim: !ref <emb_size>
#    activation: !name:torch.nn.LeakyReLU
#    dropout: 0.0
#    rnn_layers: 2
#    rnn_neurons: 2048
#    dnn_blocks: 1
#    dnn_neurons: 512
#    return_hidden: True

# For inference
tokenizer: !new:sentencepiece.SentencePieceProcessor
    model_file: !ref <pretrained_tokenizer_path>/5000_unigram.model

modules:
    encoder_w2v2: !ref <encoder_w2v2>
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    ctc_lin: !ref <ctc_lin>
    seq_lin: !ref <seq_lin>
    #lm_model: !ref <lm_model>

model: !new:torch.nn.ModuleList
    - [!ref <encoder_w2v2>, !ref <embedding>, !ref <decoder>, !ref <ctc_lin>, !ref <seq_lin>]

############################## Decoding & optimiser ############################

#coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
#    vocab_size: !ref <output_neurons>
#
#rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
#    language_model: !ref <lm_model>
#    temperature: !ref <temperature_lm>
#
#scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
#    full_scorers: [!ref <rnnlm_scorer>,
#                   !ref <coverage_scorer>]
#    weights:
#        rnnlm: !ref <lm_weight>
#        coverage: !ref <coverage_penalty>

# Search
greedy_search: !new:speechbrain.decoders.S2SRNNGreedySearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>

test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>
    #scorer: !ref <scorer>

############################## Augmentations ###################################

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: 16000
    speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5

# Augmenter: combines the previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: False
    min_augmentations: 1
    max_augmentations: 3
    augment_prob: 0.5
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

############################## Logging and Pretrainer ##########################

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        scheduler: !ref <lr_annealing>
        counter: !ref <epoch_counter>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True

# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml. E.g. here, we will download the file lm.ckpt
# and it will be loaded into "lm", which points to the <lm_model> defined
# before.
#pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
#    collect_in: !ref <save_folder>
#    loadables:
#        lm: !ref <lm_model>
#    paths:
#        lm: !ref <lm_folder>/save/CKPT+2024-07-19+14-16-05+00/model.ckpt
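
# ------------------------------------------------------------------------------
# Usage sketch (not part of the original recipe; the script and file names are
# assumptions). A HyperPyYAML file like this one is typically parsed with
# hyperpyyaml, which instantiates every !new:/!name: object declared above:
#
#   from hyperpyyaml import load_hyperpyyaml
#   with open("hparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin, overrides={"skip_training": True})
#   model = hparams["model"]           # the torch.nn.ModuleList built above
#   searcher = hparams["test_search"]  # the beam-search decoder
# ------------------------------------------------------------------------------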