m3hrdadfi committed on
Commit
9c719bb
·
1 Parent(s): 2b8e6a0

Initial model

Browse files
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: et
3
+ datasets:
4
+ - common_voice
5
+ tags:
6
+ - audio
7
+ - automatic-speech-recognition
8
+ - speech
9
+ - xlsr-fine-tuning-week
10
+ license: apache-2.0
11
+ widget:
12
+ - label: Common Voice sample 1123
13
+ src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-estonian/resolve/main/sample1123.flac
14
+ - label: Common Voice sample 910
15
+ src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-estonian/resolve/main/sample910.flac
16
+ model-index:
17
+ - name: XLSR Wav2Vec2 Estonian by Mehrdad Farahani
18
+ results:
19
+ - task:
20
+ name: Speech Recognition
21
+ type: automatic-speech-recognition
22
+ dataset:
23
+ name: Common Voice et
24
+ type: common_voice
25
+ args: et
26
+ metrics:
27
+ - name: Test WER
28
+ type: wer
29
+ value: 33.73
30
+
31
+ ---
32
+
33
+ # Wav2Vec2-Large-XLSR-53-Estonian
34
+
35
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Estonian using [Common Voice](https://huggingface.co/datasets/common_voice). When using this model, make sure that your speech input is sampled at 16 kHz.
36
+
37
+ ## Usage
38
+ The model can be used directly (without a language model) as follows:
39
+
40
+ **Requirements**
41
+ ```bash
42
+ # required packages
43
+ !pip install git+https://github.com/huggingface/datasets.git
44
+ !pip install git+https://github.com/huggingface/transformers.git
45
+ !pip install torchaudio
46
+ !pip install librosa
47
+ !pip install jiwer
48
+ ```
49
+
50
+
51
+ **Prediction**
52
+ ```python
53
+ import librosa
54
+ import torch
55
+ import torchaudio
56
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
57
+ from datasets import load_dataset
58
+
59
+ import numpy as np
60
+ import re
61
+ import string
62
+
63
+ import IPython.display as ipd
64
+
65
+ chars_to_ignore = [
66
+ ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
67
+ "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
68
+ "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
69
+ ]
70
+ chars_to_mapping = {
71
+ "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
72
+ }
73
+
74
+ def multiple_replace(text, chars_to_mapping):
75
+ pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
76
+ return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
77
+
78
+ def remove_special_characters(text, chars_to_ignore_regex):
79
+ text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
80
+ return text
81
+
82
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
83
+ chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
84
+ text = batch["sentence"].lower().strip()
85
+
86
+ text = text.replace("\u0307", " ").strip()
87
+ text = multiple_replace(text, chars_to_mapping)
88
+ text = remove_special_characters(text, chars_to_ignore_regex)
89
+
90
+ batch["sentence"] = text
91
+ return batch
92
+
93
+
94
+ def speech_file_to_array_fn(batch):
95
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
96
+ speech_array = speech_array.squeeze().numpy()
97
+ speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
98
+
99
+ batch["speech"] = speech_array
100
+ return batch
101
+
102
+
103
+ def predict(batch):
104
+ features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
105
+
106
+ input_values = features.input_values.to(device)
107
+ attention_mask = features.attention_mask.to(device)
108
+
109
+ with torch.no_grad():
110
+ logits = model(input_values, attention_mask=attention_mask).logits
111
+
112
+ pred_ids = torch.argmax(logits, dim=-1)
113
+
114
+ batch["predicted"] = processor.batch_decode(pred_ids)[0]
115
+ return batch
116
+
117
+
118
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
119
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian")
120
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian").to(device)
121
+
122
+ dataset = load_dataset("common_voice", "et", split="test[:1%]")
123
+ dataset = dataset.map(
124
+ normalizer,
125
+ fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
126
+ remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
127
+ )
128
+
129
+ dataset = dataset.map(speech_file_to_array_fn)
130
+ result = dataset.map(predict)
131
+
132
+ max_items = np.random.randint(0, len(result), 10).tolist()
133
+ for i in max_items:
134
+ reference, predicted = result["sentence"][i], result["predicted"][i]
135
+ print("reference:", reference)
136
+ print("predicted:", predicted)
137
+ print('---')
138
+ ```
139
+
140
+ **Output:**
141
+ ```text
142
+ reference: õhulossid lagunevad ning ees ootab maapind
143
+ predicted: õhulassid lagunevad ning ees ootab maapind
144
+ ---
145
+ reference: milliseks kiievisse pääsemise nimel võistlev muusik soome muusikamaastiku hetkeseisu hindab ning kas ta ka ennast sellel tulevikus tegutsemas näeb kuuled videost
146
+ predicted: milliseks gievisse pääsemise nimel võitlev muusiks soome muusikama aastiku hetke seisu hindab ning kas ta ennast selle tulevikus tegutsemast näeb kuulad videost
147
+ ---
148
+ reference: näiteks kui pool seina on tehtud tekib tunne et tahaks tegelikult natuke teistsugust ja hakkame otsast peale
149
+ predicted: näiteks kui pool seine on tehtud tekib tunnetahaks tegelikult matuka teistsugust jahappanna otsast peane
150
+ ---
151
+ reference: neuroesteetilised katsed näitavad et just nägude vaatlemine aktiveerib inimese aju esteetilist keskust
152
+ predicted: neuroaisteetiliselt katsed näitaval et just nägude vaatlemine aptiveerid inimese aju est eedilist keskust
153
+ ---
154
+ reference: paljud inimesed kindlasti kadestavad teid kuid ei julge samamoodi vabalt võtta
155
+ predicted: paljud inimesed kindlasti kadestavadteid kuid ei julge sama moodi vabalt võtta
156
+ ---
157
+ reference: parem on otsida pileteid inkognito veebi kaudu
158
+ predicted: parem on otsida pileteid ning kognitu veebikaudu
159
+ ---
160
+ reference: ja vot siin ma jäin vaikseks
161
+ predicted: ja vat siisma ja invaikseks
162
+ ---
163
+ reference: mida sa iseendale juubeli puhul soovid
164
+ predicted: mida saise endale jubeli puhul soovid
165
+ ---
166
+ reference: kuumuse ja kõrge temperatuuri tõttu kuivas tühjadel karjamaadel rohi mis muutus kergesti süttivaks
167
+ predicted: kuumuse ja kõrge temperatuuri tõttu kuivast ühjadal karjamaadel rohi mis muutus kergesti süttivaks
168
+ ---
169
+ reference: ilmselt on inimesi kelle jaoks on see hea lahendus
170
+ predicted: ilmselt on inimesi kelle jaoks on see hea lahendus
171
+ ---
172
+ ```
173
+
174
+
175
+ ## Evaluation
176
+
177
+ The model can be evaluated as follows on the Estonian test data of Common Voice.
178
+
179
+ ```python
180
+ import librosa
181
+ import torch
182
+ import torchaudio
183
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
184
+ from datasets import load_dataset, load_metric
185
+
186
+ import numpy as np
187
+ import re
188
+ import string
189
+
190
+
191
+ chars_to_ignore = [
192
+ ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
193
+ "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
194
+ "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
195
+ ]
196
+ chars_to_mapping = {
197
+ "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
198
+ }
199
+
200
+ def multiple_replace(text, chars_to_mapping):
201
+ pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
202
+ return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
203
+
204
+ def remove_special_characters(text, chars_to_ignore_regex):
205
+ text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
206
+ return text
207
+
208
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
209
+ chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
210
+ text = batch["sentence"].lower().strip()
211
+
212
+ text = multiple_replace(text, chars_to_mapping)
213
+ text = remove_special_characters(text, chars_to_ignore_regex)
214
+
215
+ batch["sentence"] = text
216
+ return batch
217
+
218
+
219
+ def speech_file_to_array_fn(batch):
220
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
221
+ speech_array = speech_array.squeeze().numpy()
222
+ speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
223
+
224
+ batch["speech"] = speech_array
225
+ return batch
226
+
227
+
228
+ def predict(batch):
229
+ features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
230
+
231
+ input_values = features.input_values.to(device)
232
+ attention_mask = features.attention_mask.to(device)
233
+
234
+ with torch.no_grad():
235
+ logits = model(input_values, attention_mask=attention_mask).logits
236
+
237
+ pred_ids = torch.argmax(logits, dim=-1)
238
+
239
+ batch["predicted"] = processor.batch_decode(pred_ids)[0]
240
+ return batch
241
+
242
+
243
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
244
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian")
245
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian").to(device)
246
+
247
+ dataset = load_dataset("common_voice", "et", split="test")
248
+ dataset = dataset.map(
249
+ normalizer,
250
+ fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
251
+ remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
252
+ )
253
+
254
+ dataset = dataset.map(speech_file_to_array_fn)
255
+ result = dataset.map(predict)
256
+
257
+ wer = load_metric("wer")
258
+
259
+ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
260
+ ```
261
+
262
+
263
+ **Test Result**:
264
+ - WER: 33.73%
265
+
266
+
267
+ ## Training & Report
268
+ The Common Voice `train` and `validation` splits were used for training.
269
+
270
+ You can see the training report [here](https://wandb.ai/m3hrdadfi/finetuned_wav2vec_xlsr_estonian/reports/Fine-Tuning-for-Wav2Vec2-Large-XLSR-53-Estonian--Vmlldzo1NjA1MTI?accessToken=k2b2g3a2i12m1sdwf13q8b226pplmmyw12joxo6vk38eb4djellfzmn9fp2725fw)
271
+
272
+ The script used for training can be found [here](https://colab.research.google.com/github/m3hrdadfi/notebooks/blob/main/Fine_Tune_XLSR_Wav2Vec2_on_Estonian_ASR_with_%F0%9F%A4%97_Transformers_ipynb.ipynb)
README.md ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: et
3
+ datasets:
4
+ - common_voice
5
+ tags:
6
+ - audio
7
+ - automatic-speech-recognition
8
+ - speech
9
+ - xlsr-fine-tuning-week
10
+ license: apache-2.0
11
+ widget:
12
+ - label: Common Voice sample 1123
13
+ src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-estonian/resolve/main/sample1123.flac
14
+ - label: Common Voice sample 910
15
+ src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-estonian/resolve/main/sample910.flac
16
+ model-index:
17
+ - name: XLSR Wav2Vec2 Estonian by Mehrdad Farahani
18
+ results:
19
+ - task:
20
+ name: Speech Recognition
21
+ type: automatic-speech-recognition
22
+ dataset:
23
+ name: Common Voice et
24
+ type: common_voice
25
+ args: et
26
+ metrics:
27
+ - name: Test WER
28
+ type: wer
29
+ value: 33.73
30
+
31
+ ---
32
+
33
+ # Wav2Vec2-Large-XLSR-53-Estonian
34
+
35
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Estonian using [Common Voice](https://huggingface.co/datasets/common_voice). When using this model, make sure that your speech input is sampled at 16 kHz.
36
+
37
+ ## Usage
38
+ The model can be used directly (without a language model) as follows:
39
+
40
+ **Requirements**
41
+ ```bash
42
+ # required packages
43
+ !pip install git+https://github.com/huggingface/datasets.git
44
+ !pip install git+https://github.com/huggingface/transformers.git
45
+ !pip install torchaudio
46
+ !pip install librosa
47
+ !pip install jiwer
48
+ ```
49
+
50
+
51
+ **Prediction**
52
+ ```python
53
+ import librosa
54
+ import torch
55
+ import torchaudio
56
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
57
+ from datasets import load_dataset
58
+
59
+ import numpy as np
60
+ import re
61
+ import string
62
+
63
+ import IPython.display as ipd
64
+
65
+ chars_to_ignore = [
66
+ ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
67
+ "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
68
+ "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
69
+ ]
70
+ chars_to_mapping = {
71
+ "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
72
+ }
73
+
74
+ def multiple_replace(text, chars_to_mapping):
75
+ pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
76
+ return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
77
+
78
+ def remove_special_characters(text, chars_to_ignore_regex):
79
+ text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
80
+ return text
81
+
82
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
83
+ chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
84
+ text = batch["sentence"].lower().strip()
85
+
86
+ text = text.replace("\u0307", " ").strip()
87
+ text = multiple_replace(text, chars_to_mapping)
88
+ text = remove_special_characters(text, chars_to_ignore_regex)
89
+
90
+ batch["sentence"] = text
91
+ return batch
92
+
93
+
94
+ def speech_file_to_array_fn(batch):
95
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
96
+ speech_array = speech_array.squeeze().numpy()
97
+ speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
98
+
99
+ batch["speech"] = speech_array
100
+ return batch
101
+
102
+
103
+ def predict(batch):
104
+ features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
105
+
106
+ input_values = features.input_values.to(device)
107
+ attention_mask = features.attention_mask.to(device)
108
+
109
+ with torch.no_grad():
110
+ logits = model(input_values, attention_mask=attention_mask).logits
111
+
112
+ pred_ids = torch.argmax(logits, dim=-1)
113
+
114
+ batch["predicted"] = processor.batch_decode(pred_ids)[0]
115
+ return batch
116
+
117
+
118
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
119
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian")
120
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian").to(device)
121
+
122
+ dataset = load_dataset("common_voice", "et", split="test[:1%]")
123
+ dataset = dataset.map(
124
+ normalizer,
125
+ fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
126
+ remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
127
+ )
128
+
129
+ dataset = dataset.map(speech_file_to_array_fn)
130
+ result = dataset.map(predict)
131
+
132
+ max_items = np.random.randint(0, len(result), 10).tolist()
133
+ for i in max_items:
134
+ reference, predicted = result["sentence"][i], result["predicted"][i]
135
+ print("reference:", reference)
136
+ print("predicted:", predicted)
137
+ print('---')
138
+ ```
139
+
140
+ **Output:**
141
+ ```text
142
+ reference: õhulossid lagunevad ning ees ootab maapind
143
+ predicted: õhulassid lagunevad ning ees ootab maapind
144
+ ---
145
+ reference: milliseks kiievisse pääsemise nimel võistlev muusik soome muusikamaastiku hetkeseisu hindab ning kas ta ka ennast sellel tulevikus tegutsemas näeb kuuled videost
146
+ predicted: milliseks gievisse pääsemise nimel võitlev muusiks soome muusikama aastiku hetke seisu hindab ning kas ta ennast selle tulevikus tegutsemast näeb kuulad videost
147
+ ---
148
+ reference: näiteks kui pool seina on tehtud tekib tunne et tahaks tegelikult natuke teistsugust ja hakkame otsast peale
149
+ predicted: näiteks kui pool seine on tehtud tekib tunnetahaks tegelikult matuka teistsugust jahappanna otsast peane
150
+ ---
151
+ reference: neuroesteetilised katsed näitavad et just nägude vaatlemine aktiveerib inimese aju esteetilist keskust
152
+ predicted: neuroaisteetiliselt katsed näitaval et just nägude vaatlemine aptiveerid inimese aju est eedilist keskust
153
+ ---
154
+ reference: paljud inimesed kindlasti kadestavad teid kuid ei julge samamoodi vabalt võtta
155
+ predicted: paljud inimesed kindlasti kadestavadteid kuid ei julge sama moodi vabalt võtta
156
+ ---
157
+ reference: parem on otsida pileteid inkognito veebi kaudu
158
+ predicted: parem on otsida pileteid ning kognitu veebikaudu
159
+ ---
160
+ reference: ja vot siin ma jäin vaikseks
161
+ predicted: ja vat siisma ja invaikseks
162
+ ---
163
+ reference: mida sa iseendale juubeli puhul soovid
164
+ predicted: mida saise endale jubeli puhul soovid
165
+ ---
166
+ reference: kuumuse ja kõrge temperatuuri tõttu kuivas tühjadel karjamaadel rohi mis muutus kergesti süttivaks
167
+ predicted: kuumuse ja kõrge temperatuuri tõttu kuivast ühjadal karjamaadel rohi mis muutus kergesti süttivaks
168
+ ---
169
+ reference: ilmselt on inimesi kelle jaoks on see hea lahendus
170
+ predicted: ilmselt on inimesi kelle jaoks on see hea lahendus
171
+ ---
172
+ ```
173
+
174
+
175
+ ## Evaluation
176
+
177
+ The model can be evaluated as follows on the Estonian test data of Common Voice.
178
+
179
+ ```python
180
+ import librosa
181
+ import torch
182
+ import torchaudio
183
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
184
+ from datasets import load_dataset, load_metric
185
+
186
+ import numpy as np
187
+ import re
188
+ import string
189
+
190
+
191
+ chars_to_ignore = [
192
+ ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
193
+ "#", "!", "?", "«", "»", "(", ")", "؛", ",", "?", ".", "!", "-", ";", ":", '"',
194
+ "“", "%", "‘", "�", "–", "…", "_", "”", '“', '„'
195
+ ]
196
+ chars_to_mapping = {
197
+ "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
198
+ }
199
+
200
+ def multiple_replace(text, chars_to_mapping):
201
+ pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
202
+ return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
203
+
204
+ def remove_special_characters(text, chars_to_ignore_regex):
205
+ text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
206
+ return text
207
+
208
+ def normalizer(batch, chars_to_ignore, chars_to_mapping):
209
+ chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
210
+ text = batch["sentence"].lower().strip()
211
+
212
+ text = multiple_replace(text, chars_to_mapping)
213
+ text = remove_special_characters(text, chars_to_ignore_regex)
214
+
215
+ batch["sentence"] = text
216
+ return batch
217
+
218
+
219
+ def speech_file_to_array_fn(batch):
220
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
221
+ speech_array = speech_array.squeeze().numpy()
222
+ speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
223
+
224
+ batch["speech"] = speech_array
225
+ return batch
226
+
227
+
228
+ def predict(batch):
229
+ features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
230
+
231
+ input_values = features.input_values.to(device)
232
+ attention_mask = features.attention_mask.to(device)
233
+
234
+ with torch.no_grad():
235
+ logits = model(input_values, attention_mask=attention_mask).logits
236
+
237
+ pred_ids = torch.argmax(logits, dim=-1)
238
+
239
+ batch["predicted"] = processor.batch_decode(pred_ids)[0]
240
+ return batch
241
+
242
+
243
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
244
+ processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian")
245
+ model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-estonian").to(device)
246
+
247
+ dataset = load_dataset("common_voice", "et", split="test")
248
+ dataset = dataset.map(
249
+ normalizer,
250
+ fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping},
251
+ remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
252
+ )
253
+
254
+ dataset = dataset.map(speech_file_to_array_fn)
255
+ result = dataset.map(predict)
256
+
257
+ wer = load_metric("wer")
258
+
259
+ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
260
+ ```
261
+
262
+
263
+ **Test Result**:
264
+ - WER: 33.73%
265
+
266
+
267
+ ## Training & Report
268
+ The Common Voice `train` and `validation` splits were used for training.
269
+
270
+ You can see the training report [here](https://wandb.ai/m3hrdadfi/finetuned_wav2vec_xlsr_estonian/reports/Fine-Tuning-for-Wav2Vec2-Large-XLSR-53-Estonian--Vmlldzo1NjA1MTI?accessToken=k2b2g3a2i12m1sdwf13q8b226pplmmyw12joxo6vk38eb4djellfzmn9fp2725fw)
271
+
272
+ The script used for training can be found [here](https://colab.research.google.com/github/m3hrdadfi/notebooks/blob/main/Fine_Tune_XLSR_Wav2Vec2_on_Estonian_ASR_with_%F0%9F%A4%97_Transformers_ipynb.ipynb)
all_results.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "eval_loss": 0.35807397961616516,
4
+ "eval_mem_cpu_alloc_delta": 204106932,
5
+ "eval_mem_cpu_peaked_delta": 25299703,
6
+ "eval_mem_gpu_alloc_delta": 0,
7
+ "eval_mem_gpu_peaked_delta": 6155822592,
8
+ "eval_runtime": 345.8016,
9
+ "eval_samples": 2509,
10
+ "eval_samples_per_second": 7.256,
11
+ "eval_wer": 0.35535428875865743,
12
+ "init_mem_cpu_alloc_delta": 9477734,
13
+ "init_mem_cpu_peaked_delta": 18306,
14
+ "init_mem_gpu_alloc_delta": 1261906944,
15
+ "init_mem_gpu_peaked_delta": 0,
16
+ "total_flos": 3.401354569939134e+19,
17
+ "train_mem_cpu_alloc_delta": 16811478,
18
+ "train_mem_cpu_peaked_delta": 230270329,
19
+ "train_mem_gpu_alloc_delta": 3786449408,
20
+ "train_mem_gpu_peaked_delta": 6820308992,
21
+ "train_runtime": 32379.322,
22
+ "train_samples": 5473,
23
+ "train_samples_per_second": 0.091
24
+ }
config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
3
+ "activation_dropout": 0.0,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "Wav2Vec2ForCTC"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "conv_bias": true,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "mean",
39
+ "ctc_zero_infinity": true,
40
+ "do_stable_layer_norm": true,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_dropout": 0.0,
44
+ "feat_extract_norm": "layer",
45
+ "feat_proj_dropout": 0.0,
46
+ "final_dropout": 0.0,
47
+ "gradient_checkpointing": true,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 1024,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 4096,
53
+ "layer_norm_eps": 1e-05,
54
+ "layerdrop": 0.1,
55
+ "mask_channel_length": 10,
56
+ "mask_channel_min_space": 1,
57
+ "mask_channel_other": 0.0,
58
+ "mask_channel_prob": 0.0,
59
+ "mask_channel_selection": "static",
60
+ "mask_feature_length": 10,
61
+ "mask_feature_prob": 0.0,
62
+ "mask_time_length": 10,
63
+ "mask_time_min_space": 1,
64
+ "mask_time_other": 0.0,
65
+ "mask_time_prob": 0.05,
66
+ "mask_time_selection": "static",
67
+ "model_type": "wav2vec2",
68
+ "num_attention_heads": 16,
69
+ "num_conv_pos_embedding_groups": 16,
70
+ "num_conv_pos_embeddings": 128,
71
+ "num_feat_extract_layers": 7,
72
+ "num_hidden_layers": 24,
73
+ "pad_token_id": 0,
74
+ "transformers_version": "4.5.0.dev0",
75
+ "vocab_size": 37
76
+ }
eval_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "eval_loss": 0.35807397961616516,
4
+ "eval_mem_cpu_alloc_delta": 204106932,
5
+ "eval_mem_cpu_peaked_delta": 25299703,
6
+ "eval_mem_gpu_alloc_delta": 0,
7
+ "eval_mem_gpu_peaked_delta": 6155822592,
8
+ "eval_runtime": 345.8016,
9
+ "eval_samples": 2509,
10
+ "eval_samples_per_second": 7.256,
11
+ "eval_wer": 0.35535428875865743
12
+ }
predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_size": 1,
4
+ "padding_side": "right",
5
+ "padding_value": 0.0,
6
+ "return_attention_mask": true,
7
+ "sampling_rate": 16000
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:797b6155a39ac92af7588b2fead1fccce9892a261e97d5ac5c2db9d547031b1d
3
+ size 1262085527
result.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4a7896012604abad91cc051fc09c44ff46c874582eebd44fabc6447b12a83fe
3
+ size 3183
sample1123.flac ADDED
Binary file (104 kB). View file
 
sample910.flac ADDED
Binary file (45.8 kB). View file
 
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"}
train_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "init_mem_cpu_alloc_delta": 9477734,
4
+ "init_mem_cpu_peaked_delta": 18306,
5
+ "init_mem_gpu_alloc_delta": 1261906944,
6
+ "init_mem_gpu_peaked_delta": 0,
7
+ "total_flos": 3.401354569939134e+19,
8
+ "train_mem_cpu_alloc_delta": 16811478,
9
+ "train_mem_cpu_peaked_delta": 230270329,
10
+ "train_mem_gpu_alloc_delta": 3786449408,
11
+ "train_mem_gpu_peaked_delta": 6820308992,
12
+ "train_runtime": 32379.322,
13
+ "train_samples": 5473,
14
+ "train_samples_per_second": 0.091
15
+ }
trainer_state.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 30.0,
5
+ "global_step": 2940,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 3.06,
12
+ "learning_rate": 0.00017999999999999998,
13
+ "loss": 5.039,
14
+ "step": 300
15
+ },
16
+ {
17
+ "epoch": 3.06,
18
+ "eval_loss": 2.9455764293670654,
19
+ "eval_runtime": 358.486,
20
+ "eval_samples_per_second": 6.999,
21
+ "eval_wer": 1.0,
22
+ "step": 300
23
+ },
24
+ {
25
+ "epoch": 6.12,
26
+ "learning_rate": 0.00028770491803278684,
27
+ "loss": 2.9411,
28
+ "step": 600
29
+ },
30
+ {
31
+ "epoch": 6.12,
32
+ "eval_loss": 2.8372132778167725,
33
+ "eval_runtime": 362.2267,
34
+ "eval_samples_per_second": 6.927,
35
+ "eval_wer": 1.0,
36
+ "step": 600
37
+ },
38
+ {
39
+ "epoch": 9.18,
40
+ "learning_rate": 0.00025081967213114756,
41
+ "loss": 0.9286,
42
+ "step": 900
43
+ },
44
+ {
45
+ "epoch": 9.18,
46
+ "eval_loss": 0.3580135107040405,
47
+ "eval_runtime": 372.9874,
48
+ "eval_samples_per_second": 6.727,
49
+ "eval_wer": 0.514917421417155,
50
+ "step": 900
51
+ },
52
+ {
53
+ "epoch": 12.24,
54
+ "learning_rate": 0.0002139344262295082,
55
+ "loss": 0.3039,
56
+ "step": 1200
57
+ },
58
+ {
59
+ "epoch": 12.24,
60
+ "eval_loss": 0.33028751611709595,
61
+ "eval_runtime": 377.8898,
62
+ "eval_samples_per_second": 6.64,
63
+ "eval_wer": 0.44296084176878,
64
+ "step": 1200
65
+ },
66
+ {
67
+ "epoch": 15.31,
68
+ "learning_rate": 0.00017704918032786883,
69
+ "loss": 0.2022,
70
+ "step": 1500
71
+ },
72
+ {
73
+ "epoch": 15.31,
74
+ "eval_loss": 0.30936819314956665,
75
+ "eval_runtime": 374.6431,
76
+ "eval_samples_per_second": 6.697,
77
+ "eval_wer": 0.4007059136920618,
78
+ "step": 1500
79
+ },
80
+ {
81
+ "epoch": 18.37,
82
+ "learning_rate": 0.00014016393442622951,
83
+ "loss": 0.1529,
84
+ "step": 1800
85
+ },
86
+ {
87
+ "epoch": 18.37,
88
+ "eval_loss": 0.3163447976112366,
89
+ "eval_runtime": 381.4751,
90
+ "eval_samples_per_second": 6.577,
91
+ "eval_wer": 0.38395711241342567,
92
+ "step": 1800
93
+ },
94
+ {
95
+ "epoch": 21.43,
96
+ "learning_rate": 0.00010327868852459015,
97
+ "loss": 0.1165,
98
+ "step": 2100
99
+ },
100
+ {
101
+ "epoch": 21.43,
102
+ "eval_loss": 0.3391939103603363,
103
+ "eval_runtime": 380.5289,
104
+ "eval_samples_per_second": 6.593,
105
+ "eval_wer": 0.3692061800745871,
106
+ "step": 2100
107
+ },
108
+ {
109
+ "epoch": 24.49,
110
+ "learning_rate": 6.639344262295081e-05,
111
+ "loss": 0.1058,
112
+ "step": 2400
113
+ },
114
+ {
115
+ "epoch": 24.49,
116
+ "eval_loss": 0.35883328318595886,
117
+ "eval_runtime": 376.435,
118
+ "eval_samples_per_second": 6.665,
119
+ "eval_wer": 0.3648774640383591,
120
+ "step": 2400
121
+ },
122
+ {
123
+ "epoch": 27.55,
124
+ "learning_rate": 2.950819672131147e-05,
125
+ "loss": 0.086,
126
+ "step": 2700
127
+ },
128
+ {
129
+ "epoch": 27.55,
130
+ "eval_loss": 0.35711735486984253,
131
+ "eval_runtime": 376.6441,
132
+ "eval_samples_per_second": 6.661,
133
+ "eval_wer": 0.3572522642514651,
134
+ "step": 2700
135
+ },
136
+ {
137
+ "epoch": 30.0,
138
+ "step": 2940,
139
+ "total_flos": 3.401354569939134e+19,
140
+ "train_runtime": 32379.322,
141
+ "train_samples_per_second": 0.091
142
+ }
143
+ ],
144
+ "max_steps": 2940,
145
+ "num_train_epochs": 30,
146
+ "total_flos": 3.401354569939134e+19,
147
+ "trial_name": null,
148
+ "trial_params": null
149
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f2d423da1472e41b342fc1cc8bc8b3c4aa0672544957d802085dbaadee8c2de
3
+ size 2351
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "a": 5, "b": 6, "c": 7, "d": 8, "e": 9, "f": 10, "g": 11, "h": 12, "i": 13, "j": 14, "k": 15, "l": 16, "m": 17, "n": 18, "o": 19, "p": 20, "q": 21, "r": 22, "s": 23, "t": 24, "u": 25, "v": 26, "w": 27, "x": 28, "y": 29, "z": 30, "ä": 31, "õ": 32, "ö": 33, "ü": 34, "š": 35, "ž": 36}