Porjaz committed (verified)
Commit e1faefa · Parent(s): 1729620

Update hyperparams.yaml

Files changed (1):
    hyperparams.yaml +11 -158
hyperparams.yaml CHANGED
@@ -1,60 +1,15 @@
-# Seed needs to be set at top of yaml, before objects with parameters
-# are instantiated
-seed: 1994
-__set_seed: !apply:torch.manual_seed [!ref <seed>]
+# Hparams NEEDED
+HPARAMS_NEEDED: ["wav2vec_output_dim", "emb_size", "dec_neurons", "dec_layers", "output_neurons", "log_softmax", "tokenizer"]
+# Modules Needed
+MODULES_NEEDED: ["encoder_w2v2", "embedding", "ctc_lin", "seq_lin"]
 
-skip_training: True
-
-output_folder: !ref output_folder_seq2seq_cv_podcast_arhiv_augmentation_128_emb_5000_vocab
-output_wer_folder: !ref <output_folder>/
-save_folder: !ref <output_folder>/save
-train_log: !ref <output_folder>/train_log.txt
-
-lm_folder: LM/output_folder_lm
-
-# Data files
-data_folder: "../../data/combined_data/speechbrain_splits"
-
-wav2vec2_hub: facebook/wav2vec2-large-xlsr-53
-wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint
+# Pretrain folder (HuggingFace)
+pretrained_path: Porjaz/wav2vec2-aed-macedonian-asr
 
 
 ####################### Training Parameters ####################################
 
-number_of_epochs: 50
-number_of_ctc_epochs: 15
-# batch_size: 16
-# batch_size: 6 # for cv+podcast
-batch_size: 6 # for cv+podcast+arhiv
-label_smoothing: 0.1
-lr: 0.0001
-ctc_weight: 0.5
-
-opt_class: !name:torch.optim.Adam
-    lr: !ref <lr>
-
-lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
-    initial_value: !ref <lr>
-    improvement_threshold: 0.0025
-    annealing_factor: 0.8
-    patient: 0
-
-# Dataloader options
-num_workers: 4
-train_dataloader_opts:
-    num_workers: !ref <num_workers>
-    batch_size: !ref <batch_size>
-
-valid_dataloader_opts:
-    num_workers: !ref <num_workers>
-    batch_size: !ref <batch_size>
-
-test_dataloader_opts:
-    batch_size: 1
-
 ####################### Model Parameters #######################################
-
-dropout: 0.15
 wav2vec_output_dim: 1024
 emb_size: 128
 dec_neurons: 1024
@@ -83,8 +38,6 @@ ctc_weight_decode: 0.0
 coverage_penalty: 1.5
 lm_weight: 0.0
 
-epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
-    limit: !ref <number_of_epochs>
 
 # Wav2vec2 encoder
 encoder_w2v2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
@@ -125,25 +78,6 @@ seq_lin: !new:speechbrain.nnet.linear.Linear
 log_softmax: !new:speechbrain.nnet.activations.Softmax
     apply_log: True
 
-ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
-    blank_index: !ref <blank_index>
-
-nll_cost: !name:speechbrain.nnet.losses.nll_loss
-    label_smoothing: 0.1
-
-# This is the RNNLM that is used according to the Huggingface repository
-# NB: It has to match the pre-trained RNNLM!!
-#lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
-#    output_neurons: !ref <output_neurons>
-#    embedding_dim: !ref <emb_size>
-#    activation: !name:torch.nn.LeakyReLU
-#    dropout: 0.0
-#    rnn_layers: 2
-#    rnn_neurons: 2048
-#    dnn_blocks: 1
-#    dnn_neurons: 512
-#    return_hidden: True # For inference
-
 tokenizer: !new:sentencepiece.SentencePieceProcessor
     model_file: 1000_unigram.model
 
@@ -153,37 +87,11 @@ modules:
     decoder: !ref <decoder>
     ctc_lin: !ref <ctc_lin>
     seq_lin: !ref <seq_lin>
-    #lm_model: !ref <lm_model>
 
 model: !new:torch.nn.ModuleList
     - [!ref <encoder_w2v2>, !ref <embedding>, !ref <decoder>, !ref <ctc_lin>, !ref <seq_lin>]
 
 ############################## Decoding & optimiser ############################
-#coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
-#    vocab_size: !ref <output_neurons>
-#
-#rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
-#    language_model: !ref <lm_model>
-#    temperature: !ref <temperature_lm>
-#
-#scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
-#    full_scorers: [!ref <rnnlm_scorer>,
-#                   !ref <coverage_scorer>]
-#    weights:
-#        rnnlm: !ref <lm_weight>
-#        coverage: !ref <coverage_penalty>
-
-
-# Search
-greedy_search: !new:speechbrain.decoders.S2SRNNGreedySearcher
-    embedding: !ref <embedding>
-    decoder: !ref <decoder>
-    linear: !ref <seq_lin>
-    bos_index: !ref <bos_index>
-    eos_index: !ref <eos_index>
-    min_decode_ratio: !ref <min_decode_ratio>
-    max_decode_ratio: !ref <max_decode_ratio>
-
 test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
     embedding: !ref <embedding>
     decoder: !ref <decoder>
@@ -200,65 +108,10 @@ test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
     #scorer: !ref <scorer>
 
 
-############################## Augmentations ###################################
-
-# Speed perturbation
-speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
-    orig_freq: 16000
-    speeds: [95, 100, 105]
-
-# Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: 0
-    drop_freq_high: 1
-    drop_freq_count_low: 1
-    drop_freq_count_high: 3
-    drop_freq_width: 0.05
-
-# Time drop: randomly drops a number of temporal chunks.
-drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: 1000
-    drop_length_high: 2000
-    drop_count_low: 1
-    drop_count_high: 5
-
-# Augmenter: Combines previously defined augmentations to perform data augmentation
-wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    concat_original: False
-    min_augmentations: 1
-    max_augmentations: 3
-    augment_prob: 0.5
-    augmentations: [
-        !ref <speed_perturb>,
-        !ref <drop_freq>,
-        !ref <drop_chunk>]
-
-
 ############################## Logging and Pretrainer ##########################
 
-checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
-    checkpoints_dir: !ref <save_folder>
-    recoverables:
-        model: !ref <model>
-        scheduler: !ref <lr_annealing>
-        counter: !ref <epoch_counter>
-
-train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
-    save_file: !ref <train_log>
-
-error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
-
-cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
-    split_tokens: True
-
-
-# The pretrainer allows a mapping between pretrained files and instances that
-# are declared in the yaml. E.g here, we will download the file lm.ckpt
-# and it will be loaded into "lm" which is pointing to the <lm_model> defined
-# before.
-#pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
-#    collect_in: !ref <lm_folder>
-#    loadables:
-#        lm: !ref <lm_model>
-#    paths:
-#        lm: !ref <lm_folder>/save/CKPT+2024-07-19+14-16-05+00/model.ckpt
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        model: !ref <model>
+    paths:
+        model: !ref <pretrained_path>/model.ckpt
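
The commit strips every training-only section (seed, optimizer, scheduler, dataloaders, losses, augmentation, checkpointer, loggers) and repoints the yaml at inference: HPARAMS_NEEDED and MODULES_NEEDED declare what a SpeechBrain pretrained-interface class expects to find in the file, and the new pretrainer block downloads model.ckpt from the Porjaz/wav2vec2-aed-macedonian-asr Hub repo instead of restoring a local training checkpoint. As a rough illustration (not part of the commit), a yaml like this can be driven directly with hyperpyyaml plus the Pretrainer; the sketch below assumes hyperparams.yaml and the tokenizer file 1000_unigram.model sit in the working directory.

# Minimal sketch, not from the repo: instantiate the yaml's objects and
# pull in the pretrained weights it points to.
from hyperpyyaml import load_hyperpyyaml

with open("hyperparams.yaml") as f:
    hparams = load_hyperpyyaml(f)

# The pretrainer maps each entry under `loadables` to a file under `paths`;
# here it fetches model.ckpt from the Hub repo named in <pretrained_path>.
pretrainer = hparams["pretrainer"]
pretrainer.collect_files()
pretrainer.load_collected()

model = hparams["model"]
model.eval()  # inference only: the training objects are gone from the yaml

In practice, a Hub model that declares HPARAMS_NEEDED / MODULES_NEEDED is usually loaded through one of SpeechBrain's inference interfaces (for a custom attention-based encoder-decoder like this one, typically foreign_class pointed at the repository's custom interface file), which performs the same Pretrainer step internally.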