gorkemgoknar committed · Commit 329f311 · Parent(s): eefd788

Update README.md

README.md CHANGED
@@ -30,7 +30,9 @@ model-index:
# Wav2Vec2-Large-XLSR-53-Turkish

Note: This model was trained with 5 Turkish movies in addition to the Common Voice dataset.
-Although WER is high (50%) per common voice test dataset,
+Although WER is high (50%) on the Common Voice test dataset, performance on other sources seems pretty good.
+
+The dataset building (from CSV) and merging code can be found at the bottom of this README.

Please try speech yourself on the right side to see its performance.

@@ -120,7 +122,7 @@ model.to("cuda")

#Note: Not ignoring "'" on this one
#Note: Not ignoring "'" on this one
-chars_to_ignore_regex = '[
+chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'


#resampler = torchaudio.transforms.Resample(48_000, 16_000)
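As a quick sanity check of the new ignore list, here is a minimal, self-contained sketch of how it is applied downstream; the sample sentence is invented, and the raw-string prefix is added here to avoid invalid-escape warnings:

```python
import re

# The ignore list from the hunk above (raw string avoids escape warnings).
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'

sample = "Merhaba, nasılsın? Gördün mü!"  # invented test sentence
cleaned = re.sub(chars_to_ignore_regex, '', sample).lower() + " "
print(cleaned)  # -> "merhaba nasılsın gördün mü "
```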
@@ -151,11 +153,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
def remove_special_characters(batch):

    ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
-    batch["sentence"] = re.sub('
+    batch["sentence"] = re.sub(r'\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
    ##remove all caps in text [AÇIKLAMA] etc, do it before..
-    batch["sentence"] = re.sub('
+    batch["sentence"] = re.sub(r'\[(\b[A-Z]+\])', '', batch["sentence"])
    ##replace three dots (that are inside string) with single
-    batch["sentence"] = re.sub("([a-zA-Z]+)
+    batch["sentence"] = re.sub(r"([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
    #standard ignore list
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

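To make the three subtitle cleanups concrete, a small worked example; the input line is invented, and note that `[A-Z]` does not match Turkish capitals such as Ç, so a tag like [AÇIKLAMA] would survive this pass:

```python
import re

# Hypothetical subtitle line to sanity-check the cleanup steps above.
sentence = "00:01:01,33 [MUSIC] Gidiyorum... tamam"

# Strip leftover timestamps such as 00:01:01 or 00:01:01,33.
sentence = re.sub(r'\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', sentence)
# Drop bracketed all-caps tags; [A-Z] misses non-ASCII capitals like Ç.
sentence = re.sub(r'\[(\b[A-Z]+\])', '', sentence)
# Collapse "..." after a word into a single dot.
sentence = re.sub(r"([a-zA-Z]+)\.\.\.", r"\1.", sentence)

print(sentence)  # -> "   Gidiyorum. tamam" (leftover spaces remain)
```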
@@ -202,10 +204,10 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
## Training


-The Common Voice `train` and `validation` datasets were used for training.
+The Common Voice `train` and `validation` datasets were used for training, along with 5 additional Turkish movies with subtitles.
A similar model was used as the base for fine-tuning; the additional audio resampler is in the code above.

-Putting
+The dataset building and merging code is included below for reference.


```python
@@ -218,7 +220,7 @@ from datasets import Dataset
import csv

#Walk all subdirectories of base_set_path and find csv files
-base_set_path = r'C
+base_set_path = r'C:\dataset_extracts'
csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
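The hunk ends mid-loop; a plausible completion of the walk, collecting only `.csv` paths — the filtering condition is an assumption, since it is not shown in the diff:

```python
import os

base_set_path = r'C:\dataset_extracts'

# Assumed completion: keep only files ending in .csv.
csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        if name.endswith('.csv'):
            csv_files.append(os.path.join(path, name))
```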
@@ -228,7 +230,7 @@ for path, subdirs, files in os.walk(base_set_path):

def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
    path = Path(csvfilename)
-    csv_delimiter="
+    csv_delimiter = "\t"  ##tab separated, change if something else

    ##Pandas has a bug reading non-ascii file names, so open with an explicit encoding
    df = pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names)
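The function above stops at the DataFrame; a minimal sketch of the likely remaining step, wrapping it as a `datasets.Dataset` and building the `custom_datasets` list used in the next hunk — both the return value and the list comprehension are assumptions:

```python
from pathlib import Path

import pandas as pd
from datasets import Dataset

def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
    path = Path(csvfilename)
    csv_delimiter = "\t"
    df = pd.read_csv(open(path, 'r', encoding='utf-8'),
                     delimiter=csv_delimiter, header=None, names=names)
    return Dataset.from_pandas(df)  # assumed: expose the DataFrame as a Dataset

# Assumed: one Dataset per csv file found by the directory walk above.
custom_datasets = [get_dataset_from_csv_file(f) for f in csv_files]
```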
@@ -246,7 +248,7 @@ from datasets import concatenate_datasets, load_dataset
from datasets import load_from_disk

# Merge datasets together (from csv files)
-dataset_file_path = "
+dataset_file_path = r".\dataset_file"
custom_datasets_concat = concatenate_datasets([dset for dset in custom_datasets])

#save this one to disk
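And a short sketch of what "#save this one to disk" presumably expands to, using the `load_from_disk` import already present in the hunk; treating it as a continuation of the snippet above rather than the author's exact code:

```python
# Persist the merged dataset, then reload it in later sessions.
custom_datasets_concat.save_to_disk(dataset_file_path)

from datasets import load_from_disk
custom_datasets_concat = load_from_disk(dataset_file_path)
print(custom_datasets_concat)
```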