gorkemgoknar
committed on
Commit
·
ee893eb
1
Parent(s):
0dd7f50
Updated readme to give info on merging datasets
Browse files
README.md
CHANGED
@@ -3,6 +3,7 @@ language:
|
|
3 |
- tr
|
4 |
datasets:
|
5 |
- common_voice
|
|
|
6 |
metrics:
|
7 |
- wer
|
8 |
tags:
|
@@ -12,7 +13,7 @@ tags:
|
|
12 |
- xlsr-fine-tuning-week
|
13 |
license: apache-2.0
|
14 |
model-index:
|
15 |
-
- name: XLSR Wav2Vec2 Large Turkish by Gorkem Goknar
|
16 |
results:
|
17 |
- task:
|
18 |
name: Speech Recognition
|
@@ -24,18 +25,17 @@ model-index:
|
|
24 |
metrics:
|
25 |
- name: Test WER
|
26 |
type: wer
|
27 |
-
value:
|
28 |
---
|
29 |
# Wav2Vec2-Large-XLSR-53-Turkish
|
30 |
|
31 |
-
Note:
|
32 |
-
|
33 |
|
34 |
-
Please try speech yourself
|
35 |
-
I hope some news channels or movie producers lets use their data for test/training (I asked some no reply)
|
36 |
|
|
|
37 |
|
38 |
-
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice).
|
39 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
40 |
## Usage
|
41 |
The model can be used directly (without a language model) as follows:
|
@@ -132,11 +132,10 @@ def audio_resampler(batch, new_sample_rate = 16000):
|
|
132 |
#not working without complex library compilation in windows for mp3
|
133 |
#speech_array, sampling_rate = torchaudio.load(batch["path"])
|
134 |
#speech_array, sampling_rate = librosa.load(batch["path"])
|
135 |
-
|
136 |
#sampling_rate = pydub.utils.info['sample_rate'] ##gets current samplerate
|
137 |
|
138 |
sound = pydub.AudioSegment.from_file(file=batch["path"])
|
139 |
-
|
140 |
sound = sound.set_frame_rate(new_sample_rate)
|
141 |
left = sound.split_to_mono()[0]
|
142 |
bit_depth = left.sample_width * 8
|
@@ -146,23 +145,17 @@ def audio_resampler(batch, new_sample_rate = 16000):
|
|
146 |
|
147 |
speech_array = torch.FloatTensor(numeric_array)
|
148 |
|
149 |
-
|
150 |
-
|
151 |
-
#batch["target_text"] = batch["sentence"]
|
152 |
-
|
153 |
-
return batch
|
154 |
|
155 |
def remove_special_characters(batch):
|
156 |
|
157 |
##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
|
158 |
batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
|
159 |
-
|
160 |
##remove all caps in text [AÇIKLAMA] etc, do it before..
|
161 |
batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
|
162 |
-
|
163 |
##replace three dots (that are inside string with single)
|
164 |
batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
|
165 |
-
|
166 |
#standard ignore list
|
167 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
|
168 |
|
@@ -171,12 +164,18 @@ def remove_special_characters(batch):
|
|
171 |
|
172 |
# Preprocessing the datasets.
|
173 |
# We need to read the audio files as arrays
|
|
|
174 |
def speech_file_to_array_fn(batch):
|
175 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
|
176 |
##speech_array, sampling_rate = torchaudio.load(batch["path"])
|
177 |
##load and conversion done in resampler , takes and returns batch
|
178 |
-
|
|
|
|
|
|
|
|
|
179 |
return batch
|
|
|
180 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
181 |
# Preprocessing the datasets.
|
182 |
# We need to read the audio files as arrays
|
@@ -196,7 +195,64 @@ result = test_dataset.map(evaluate, batched=True, batch_size=2)
|
|
196 |
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
197 |
|
198 |
```
|
|
|
199 |
**Test Result**: 50.41 %
|
|
|
|
|
200 |
## Training
|
201 |
-
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
- tr
|
4 |
datasets:
|
5 |
- common_voice
|
6 |
+
- movies
|
7 |
metrics:
|
8 |
- wer
|
9 |
tags:
|
|
|
13 |
- xlsr-fine-tuning-week
|
14 |
license: apache-2.0
|
15 |
model-index:
|
16 |
+
- name: XLSR Wav2Vec2 Large Turkish by Gorkem Goknar
|
17 |
results:
|
18 |
- task:
|
19 |
name: Speech Recognition
|
|
|
25 |
metrics:
|
26 |
- name: Test WER
|
27 |
type: wer
|
28 |
+
value: 50.41
|
29 |
---
|
30 |
# Wav2Vec2-Large-XLSR-53-Turkish
|
31 |
|
32 |
+
Note: This model is trained with 5 Turkish movies in addition to the Common Voice dataset.
|
33 |
+
Although the WER is high (50%) on the Common Voice test dataset, its recognition (with some letter errors) seems better in practice.
|
34 |
|
35 |
+
Please try it with your own speech using the widget on the right side to see its performance.
|
|
|
36 |
|
37 |
+
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset and 5 Turkish movies that include background noise/talkers.
|
38 |
|
|
|
39 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
40 |
## Usage
|
41 |
The model can be used directly (without a language model) as follows:
|
|
|
132 |
#not working without complex library compilation in windows for mp3
|
133 |
#speech_array, sampling_rate = torchaudio.load(batch["path"])
|
134 |
#speech_array, sampling_rate = librosa.load(batch["path"])
|
|
|
135 |
#sampling_rate = pydub.utils.info['sample_rate'] ##gets current samplerate
|
136 |
|
137 |
sound = pydub.AudioSegment.from_file(file=batch["path"])
|
138 |
+
|
139 |
sound = sound.set_frame_rate(new_sample_rate)
|
140 |
left = sound.split_to_mono()[0]
|
141 |
bit_depth = left.sample_width * 8
|
|
|
145 |
|
146 |
speech_array = torch.FloatTensor(numeric_array)
|
147 |
|
148 |
+
|
149 |
+
return speech_array, new_sample_rate
|
|
|
|
|
|
|
150 |
|
151 |
def remove_special_characters(batch):
|
152 |
|
153 |
##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
|
154 |
batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
|
|
|
155 |
##remove all caps in text [AÇIKLAMA] etc, do it before..
|
156 |
batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
|
|
|
157 |
##replace three dots (that are inside string with single)
|
158 |
batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
|
|
|
159 |
#standard ignore list
|
160 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
|
161 |
|
|
|
164 |
|
165 |
# Preprocessing the datasets.
|
166 |
# We need to read the audio files as arrays
|
167 |
+
new_sample_rate = 16000
|
168 |
def speech_file_to_array_fn(batch):
|
169 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
|
170 |
##speech_array, sampling_rate = torchaudio.load(batch["path"])
|
171 |
##load and conversion done in resampler , takes and returns batch
|
172 |
+
speech_array, sampling_rate = audio_resampler(batch, new_sample_rate = new_sample_rate)
|
173 |
+
batch["speech"] = speech_array
|
174 |
+
batch["sampling_rate"] = sampling_rate
|
175 |
+
batch["target_text"] = batch["sentence"]
|
176 |
+
|
177 |
return batch
|
178 |
+
|
179 |
test_dataset = test_dataset.map(speech_file_to_array_fn)
|
180 |
# Preprocessing the datasets.
|
181 |
# We need to read the audio files as arrays
|
|
|
195 |
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
196 |
|
197 |
```
|
198 |
+
|
199 |
**Test Result**: 50.41 %
|
200 |
+
|
201 |
+
|
202 |
## Training
|
203 |
+
|
204 |
+
|
205 |
+
The Common Voice `train` and `validation` datasets were used for training. An additional 5 Turkish movies with subtitles were also used for training.
|
206 |
+
A similar training setup was used as the base for fine-tuning; the additional audio resampler is shown in the code above.
|
207 |
+
|
208 |
+
The model building and dataset merging code is included below for reference.
|
209 |
+
|
210 |
+
|
211 |
+
```python
|
212 |
+
import pandas as pd
|
213 |
+
from datasets import load_dataset, load_metric
|
214 |
+
|
215 |
+
import os
|
216 |
+
from pathlib import Path
|
217 |
+
from datasets import Dataset
|
218 |
+
import csv
|
219 |
+
|
220 |
+
#Walk all subdirectories of base_set_path and find csv files
|
221 |
+
base_set_path = r"C:\dataset_extracts"
|
222 |
+
|
223 |
+
csv_files = []
|
224 |
+
for path, subdirs, files in os.walk(base_set_path):
|
225 |
+
for name in files:
|
226 |
+
if name.endswith(".csv"):
|
227 |
+
deckfile= os.path.join(path, name)
|
228 |
+
csv_files.append(deckfile)
|
229 |
+
|
230 |
+
def get_dataset_from_csv_file(csvfilename,names=['sentence', 'path']):
|
231 |
+
path = Path(csvfilename)
|
232 |
+
csv_delimiter="\t" ##tab separated, change if something else
|
233 |
+
|
234 |
+
##Pandas has a bug reading non-ascii file names; make sure to use open() with an explicit encoding
|
235 |
+
df=pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter,header=None , names=names, encoding='utf8')
|
236 |
+
return Dataset.from_pandas(df)
|
237 |
+
|
238 |
+
custom_datasets= []
|
239 |
+
for csv_file in csv_files:
|
240 |
+
this_dataset=get_dataset_from_csv_file(csv_file)
|
241 |
+
custom_datasets.append(this_dataset)
|
242 |
+
|
243 |
+
|
244 |
+
|
245 |
+
|
246 |
+
from datasets import concatenate_datasets, load_dataset
|
247 |
+
from datasets import load_from_disk
|
248 |
+
|
249 |
+
# Merge datasets together (from csv files)
|
250 |
+
dataset_file_path = ".\dataset_file"
|
251 |
+
custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
|
252 |
+
|
253 |
+
#save this one to disk
|
254 |
+
custom_datasets_concat.save_to_disk( dataset_file_path )
|
255 |
+
|
256 |
+
#load back from disk
|
257 |
+
custom_datasets_from_disk = load_from_disk(dataset_file_path)
|
258 |
+
```
|