gorkemgoknar committed · Commit eefd788 · 1 Parent(s): c53aa1d
Update README.md

README.md CHANGED
@@ -30,7 +30,7 @@ model-index:
  30    # Wav2Vec2-Large-XLSR-53-Turkish
  31    
  32    Note: This model is trained with 5 Turkish movies in addition to the Common Voice dataset.
- 33    Although WER is high (50%) per common voice test dataset,
+ 33    Although WER is high (50%) per the Common Voice test dataset, when tested with speech that has background noise and in the browser, the derived letters are pretty close.
  34    
  35    Please try your own speech on the right side to see its performance.
  36    
@@ -120,7 +120,7 @@ model.to("cuda")
  120    
  121    #Note: Not ignoring "'" on this one
  122    #Note: Not ignoring "'" on this one
- 123    chars_to_ignore_regex = '[
+ 123    chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
  124    
  125    
  126    #resampler = torchaudio.transforms.Resample(48_000, 16_000)
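For reference, a minimal sketch of how an ignore-list regex like this is applied to a transcript; the sample sentence is invented. The apostrophe is deliberately absent from the character class, since Turkish separates suffixes from proper names with an apostrophe (Ali'nin) and dropping it would merge such tokens.

```python
import re

# Same idea as the README's ignore list: strip punctuation, keep the apostrophe.
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

sample = "Ali'nin sesi, net mi?"   # made-up example sentence
cleaned = re.sub(chars_to_ignore_regex, '', sample).lower() + " "
print(cleaned)   # -> "ali'nin sesi net mi "
```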
@@ -151,11 +151,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
  151    def remove_special_characters(batch):
  152    
  153        ##this one comes from subtitles if additional timestamps are not processed -> 00:01:01 00:01:01,33
- 154        batch["sentence"] = re.sub('
+ 154        batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
  155        ##remove all-caps text in brackets, [AÇIKLAMA] etc.; do it before the rest
- 156        batch["sentence"] = re.sub('
+ 156        batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
  157        ##replace three dots (that are inside the string) with a single dot
- 158        batch["sentence"] = re.sub("([a-zA-Z]+)
+ 158        batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
  159        #standard ignore list
  160        batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
  161    
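Combining the three substitutions above with the ignore list gives a cleaning step like the sketch below; the example input is invented, and note that the bracketed-tag pattern only matches ASCII capitals, so a tag containing Ç or İ would slip through.

```python
import re

chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

def remove_special_characters(batch):
    # drop leftover subtitle timestamps such as "00:01:01" or "00:01:01,33"
    batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
    # drop bracketed all-caps tags such as "[ALKIS]"
    batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
    # collapse a trailing "..." after a word into a single "."
    batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\1.", batch["sentence"])
    # apply the standard ignore list and lowercase
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

# made-up subtitle-style line
batch = {"sentence": "00:01:01,33 [ALKIS] Gidiyorum artik..."}
print(remove_special_characters(batch)["sentence"].strip())   # -> "gidiyorum artik"
```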
@@ -218,7 +218,7 @@ from datasets import Dataset
  218    import csv
  219    
  220    #Walk all subdirectories of base_set_path and find csv files
- 221    base_set_path = r'C
+ 221    base_set_path = r'C:\\dataset_extracts'
  222    csv_files = []
  223    for path, subdirs, files in os.walk(base_set_path):
  224        for name in files:
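The hunk is cut off before the body of the inner loop; a plausible completion that collects every csv path is sketched below, where the .csv suffix check and os.path.join are assumptions rather than the README's exact code.

```python
import os

base_set_path = r'C:\dataset_extracts'   # path as given in the hunk

csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        if name.lower().endswith('.csv'):              # assumed filter
            csv_files.append(os.path.join(path, name))

print(f"found {len(csv_files)} csv files")
```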
@@ -228,7 +228,7 @@ for path, subdirs, files in os.walk(base_set_path):
  228    
  229    def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
  230        path = Path(csvfilename)
- 231        csv_delimiter="
+ 231        csv_delimiter="\\t" ##tab separated, change if something else
  232    
  233        ##Pandas has a bug reading non-ascii file names; make sure to use open() with an explicit encoding
  234        df=pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names, encoding='utf8')
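Putting this helper together end to end gives roughly the sketch below; the final Dataset.from_pandas conversion is an assumption, since the hunk ends before the function's return statement.

```python
import pandas as pd
from pathlib import Path
from datasets import Dataset

def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
    path = Path(csvfilename)
    csv_delimiter = "\t"   # tab-separated, change if the files use something else

    # pandas can stumble on non-ascii file names, so open the file
    # explicitly with utf-8 and hand the file object to read_csv
    df = pd.read_csv(open(path, 'r', encoding='utf-8'),
                     delimiter=csv_delimiter, header=None, names=names)

    # assumed: convert the frame into a datasets.Dataset for later concatenation
    return Dataset.from_pandas(df)
```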
@@ -246,7 +246,7 @@ from datasets import concatenate_datasets, load_dataset
  246    from datasets import load_from_disk
  247    
  248    # Merge datasets together (from csv files)
- 249    dataset_file_path = "
+ 249    dataset_file_path = ".\\dataset_file"
  250    custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
  251    
  252    #save this one to disk
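A short sketch of how these last steps could fit together, assuming the per-file datasets come from the csv helper above and that the merged result is persisted with save_to_disk (the save and load calls are not shown in the hunk).

```python
from datasets import concatenate_datasets, load_from_disk

dataset_file_path = ".\\dataset_file"   # path as given in the hunk

# build one Dataset per csv file found earlier, then merge them
custom_datasets = [get_dataset_from_csv_file(f) for f in csv_files]
custom_datasets_concat = concatenate_datasets([dset for dset in custom_datasets])

# save this one to disk, then reload it for training
custom_datasets_concat.save_to_disk(dataset_file_path)
custom_dataset = load_from_disk(dataset_file_path)
```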