File size: 4,723 Bytes
e1faefa 1d790af 23ab61f e1faefa 4bad4f3 369b729 2835954 4bad4f3 23ab61f 4ed44b9 23ab61f 9c63cd5 23ab61f 03d0e2a 23ab61f 2b61326 23ab61f f00a2c7 2b61326 23ab61f 1d790af 23ab61f 19dfde0 1d790af f56e17a 1d790af f56e17a 1d790af 23ab61f ce7435b 23ab61f 1d790af 23ab61f e305a79 1d790af e305a79 1d790af 23ab61f 81fa3cc 23ab61f e1faefa 1d790af e1faefa 9b56dc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# Hparams NEEDED
HPARAMS_NEEDED: ["wav2vec_output_dim", "emb_size", "dec_neurons", "dec_layers", "output_neurons", "log_softmax", "tokenizer"]
# Modules Needed
MODULES_NEEDED: ["encoder_w2v2", "embedding", "ctc_lin", "seq_lin", "lm_model"]
# Pretrain folder (HuggingFace)
output_folder: !ref output_folder_seq2seq_cv_podcast_arhiv_augmentation
pretrained_path: Macedonian-ASR/wav2vec2-aed-macedonian-asr
# wav2vec2_hub: facebook/wav2vec2-large-xlsr-53
wav2vec2_hub: jonatasgrosman/wav2vec2-large-xlsr-53-russian
save_folder: !ref <output_folder>/save
wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint
####################### Training Parameters ####################################
####################### Model Parameters #######################################
dropout: 0.15
wav2vec_output_dim: 1024
emb_size: 128
dec_neurons: 1024
dec_layers: 1
output_neurons: 1000
blank_index: 0
bos_index: 1
eos_index: 2
unk_index: 0
# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 10
test_beam_size: 20
using_eos_threshold: True
eos_threshold: 1.5
using_max_attn_shift: False
max_attn_shift: 700
length_normalization: True
temperature: 1.0
temperature_lm: 1.4
# Scoring parameters
coverage_penalty: 1.5
lm_weight: 0.2
# This is the RNNLM that is used according to the Huggingface repository
# NB: It has to match the pre-trained RNNLM!!
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
output_neurons: !ref <output_neurons>
embedding_dim: !ref <emb_size>
activation: !name:torch.nn.LeakyReLU
dropout: 0.0
rnn_layers: 3
rnn_neurons: 2048
dnn_blocks: 2
dnn_neurons: 1024
return_hidden: True # For inference
# Wav2vec2 encoder
encoder_w2v2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
source: !ref <wav2vec2_hub>
output_norm: True
freeze: False
freeze_feature_extractor: True
save_path: !ref <wav2vec2_folder>
output_all_hiddens: False
embedding: !new:speechbrain.nnet.embedding.Embedding
num_embeddings: !ref <output_neurons>
embedding_dim: !ref <emb_size>
# Attention-based RNN decoder.
decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
enc_dim: !ref <wav2vec_output_dim>
input_size: !ref <emb_size>
rnn_type: gru
attn_type: location
hidden_size: !ref <dec_neurons>
attn_dim: 512
num_layers: !ref <dec_layers>
scaling: 1.0
channels: 10
kernel_size: 100
re_init: True
dropout: !ref <dropout>
ctc_lin: !new:speechbrain.nnet.linear.Linear
input_size: !ref <wav2vec_output_dim>
n_neurons: !ref <output_neurons>
seq_lin: !new:speechbrain.nnet.linear.Linear
input_size: !ref <dec_neurons>
n_neurons: !ref <output_neurons>
log_softmax: !new:speechbrain.nnet.activations.Softmax
apply_log: True
tokenizer: !new:sentencepiece.SentencePieceProcessor
model_file: 1000_unigram.model
modules:
encoder_w2v2: !ref <encoder_w2v2>
embedding: !ref <embedding>
decoder: !ref <decoder>
ctc_lin: !ref <ctc_lin>
seq_lin: !ref <seq_lin>
lm_model: !ref <lm_model>
model: !new:torch.nn.ModuleList
- [!ref <encoder_w2v2>, !ref <embedding>, !ref <decoder>, !ref <ctc_lin>, !ref <seq_lin>]
############################## Decoding & optimiser ############################
coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
vocab_size: !ref <output_neurons>
rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
language_model: !ref <lm_model>
temperature: !ref <temperature_lm>
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
full_scorers: [!ref <coverage_scorer>]
weights:
coverage: !ref <coverage_penalty>
scorer_lm: !new:speechbrain.decoders.scorer.ScorerBuilder
full_scorers: [!ref <rnnlm_scorer>,
!ref <coverage_scorer>]
weights:
rnnlm: !ref <lm_weight>
coverage: !ref <coverage_penalty>
test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
embedding: !ref <embedding>
decoder: !ref <decoder>
linear: !ref <seq_lin>
bos_index: !ref <bos_index>
eos_index: !ref <eos_index>
min_decode_ratio: !ref <min_decode_ratio>
max_decode_ratio: !ref <max_decode_ratio>
beam_size: !ref <test_beam_size>
eos_threshold: !ref <eos_threshold>
using_max_attn_shift: !ref <using_max_attn_shift>
max_attn_shift: !ref <max_attn_shift>
temperature: !ref <temperature>
scorer: !ref <scorer_lm>
############################## Logging and Pretrainer ##########################
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
loadables:
model: !ref <model>
lm: !ref <lm_model>
paths:
model: !ref <pretrained_path>/model.ckpt
lm: !ref <pretrained_path>/lm.ckpt
|