gorkemgoknar committed · d739c10 · Parent(s): e7650c7
Update README.md

README.md CHANGED
@@ -32,6 +32,8 @@ model-index:
 Note: This model is trained with 5 Turkish movies in addition to the Common Voice dataset.
 Although WER is high (50%) on the Common Voice test set, performance on "other sources" seems pretty good.
 
+Disclaimer: Please use another wav2vec2-tr model from the hub for "clean environment" dialogues, as those models tend to do better on clean audio with less background noise.
+
 Dataset building and merging code (from csv files) can be found at the bottom of this Readme.
 
 Please try speech yourself on the right side to see its performance.
@@ -122,7 +124,7 @@ model.to("cuda")
 
 #Note: Not ignoring "'" on this one
 #Note: Not ignoring "'" on this one
-chars_to_ignore_regex = '[
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
 
 
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
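For readers who want to re-enable the commented-out resampler above, here is a minimal standalone sketch of resampling a clip to 16 kHz with torchaudio; the file name is only a placeholder and is not part of the README:

```python
import torchaudio

# Load a clip and resample it to 16 kHz, matching the commented-out
# Resample transform above. "example.wav" is a placeholder path.
speech, sample_rate = torchaudio.load("example.wav")
resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16_000)
speech_16k = resampler(speech)
```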
@@ -153,11 +155,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
 def remove_special_characters(batch):
 
     ##this one comes from subtitles if additional timestamps are not processed -> 00:01:01 00:01:01,33
-    batch["sentence"] = re.sub('
+    batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
     ##remove all-caps text in brackets, [AÇIKLAMA] etc; do it before the ignore list
-    batch["sentence"] = re.sub('
+    batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
     ##replace three dots inside the string with a single dot
-    batch["sentence"] = re.sub("([a-zA-Z]+)
+    batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
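To make the effect of these substitutions concrete, here is a small standalone sketch (not part of the diff) that applies the same regexes to a single string; `chars_to_ignore_regex` is the one defined earlier, and the sample sentence is made up:

```python
import re

# Same ignore list as defined earlier in the README
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

def clean_sentence(sentence):
    # drop leftover subtitle timestamps such as "00:01:02,10"
    sentence = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', sentence)
    # drop all-caps bracketed captions such as "[MUSIC]"
    sentence = re.sub('\\[(\\b[A-Z]+\\])', '', sentence)
    # collapse an ellipsis after a word into a single dot
    sentence = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\1.", sentence)
    # remove ignored punctuation and lowercase, as in the batch version
    return re.sub(chars_to_ignore_regex, '', sentence).lower() + " "

print(clean_sentence("00:01:02,10 [MUSIC] Hello there... How are you?"))
# timestamp, caption and punctuation are stripped, text is lowercased
```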
@@ -220,7 +222,7 @@ from datasets import Dataset
 import csv
 
 #Walk all subdirectories of base_set_path and find csv files
-base_set_path = r'C
+base_set_path = r'C:\\dataset_extracts'
 csv_files = []
 for path, subdirs, files in os.walk(base_set_path):
     for name in files:
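The walk loop is cut off in this hunk. It presumably filters for `.csv` files and stores their full paths, roughly as in the following sketch (an assumption, not the README's exact code):

```python
import os

csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        # keep only csv files, recorded with their full path
        if name.endswith(".csv"):
            csv_files.append(os.path.join(path, name))
```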
@@ -230,7 +232,7 @@ for path, subdirs, files in os.walk(base_set_path):
 
 def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
     path = Path(csvfilename)
-    csv_delimiter="
+    csv_delimiter = "\\t"  ##tab separated, change if the files use something else
 
     ##Pandas has a bug reading non-ascii file names, so open the file with an explicit encoding
     df = pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names, encoding='utf8')
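The merge step below uses a `custom_datasets` list that the diff never shows being built. Assuming `get_dataset_from_csv_file` returns a `datasets.Dataset` (as its name suggests), the glue is presumably something like:

```python
# Hypothetical glue code: one Dataset per csv file found above
custom_datasets = [get_dataset_from_csv_file(csv_file) for csv_file in csv_files]
```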
@@ -248,7 +250,7 @@ from datasets import concatenate_datasets, load_dataset
 from datasets import load_from_disk
 
 # Merge datasets together (from csv files)
-dataset_file_path = "
+dataset_file_path = ".\\dataset_file"
 custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
 
 #save this one to disk
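A sketch of the save-and-reload step that the last comment refers to, using the `load_from_disk` import shown above (the README's exact code may differ):

```python
# Save the merged dataset so it can be reused without re-reading the csv files
custom_datasets_concat.save_to_disk(dataset_file_path)

# Later, reload it directly from disk
from datasets import load_from_disk
merged_dataset = load_from_disk(dataset_file_path)
```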