gorkemgoknar committed · Commit eefd788 · 1 Parent(s): c53aa1d
Update README.md

README.md CHANGED
@@ -30,7 +30,7 @@ model-index:
  30    # Wav2Vec2-Large-XLSR-53-Turkish
  31    
  32    Note: This model is trained with 5 Turkish movies in addition to the Common Voice dataset.
- 33    Although WER is high (50%) per common voice test dataset,
+ 33    Although WER is high (50%) per the Common Voice test dataset, when tested with speech that has background noise and in the browser, the derived letters are pretty close.
  34    
  35    Please try your own speech on the right side to see its performance.
  36    
@@ -120,7 +120,7 @@ model.to("cuda")
  120    
  121    #Note: Not ignoring "'" on this one
  122    #Note: Not ignoring "'" on this one
- 123    chars_to_ignore_regex = '[
+ 123    chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
  124    
  125    
  126    #resampler = torchaudio.transforms.Resample(48_000, 16_000)
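For reference, a minimal sketch of how an ignore-list regex like this is applied to a transcript; the sample sentence is invented. The apostrophe is deliberately absent from the character class, since Turkish separates suffixes from proper names with an apostrophe (Ali'nin) and dropping it would merge such tokens.

```python
import re

# Same idea as the README's ignore list: strip punctuation, keep the apostrophe.
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

sample = "Ali'nin sesi, net mi?"   # made-up example sentence
cleaned = re.sub(chars_to_ignore_regex, '', sample).lower() + " "
print(cleaned)   # -> "ali'nin sesi net mi "
```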
@@ -151,11 +151,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
  151    def remove_special_characters(batch):
  152    
  153        ##this one comes from subtitles if additional timestamps are not processed -> 00:01:01 00:01:01,33
- 154        batch["sentence"] = re.sub('
+ 154        batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
  155        ##remove all-caps text in brackets, [AÇIKLAMA] etc.; do it before the rest
- 156        batch["sentence"] = re.sub('
+ 156        batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
  157        ##replace three dots (that are inside the string) with a single dot
- 158        batch["sentence"] = re.sub("([a-zA-Z]+)
+ 158        batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
  159        #standard ignore list
  160        batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
  161    
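Combining the three substitutions above with the ignore list gives a cleaning step like the sketch below; the example input is invented, and note that the bracketed-tag pattern only matches ASCII capitals, so a tag containing Ç or İ would slip through.

```python
import re

chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

def remove_special_characters(batch):
    # drop leftover subtitle timestamps such as "00:01:01" or "00:01:01,33"
    batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
    # drop bracketed all-caps tags such as "[ALKIS]"
    batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
    # collapse a trailing "..." after a word into a single "."
    batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\1.", batch["sentence"])
    # apply the standard ignore list and lowercase
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

# made-up subtitle-style line
batch = {"sentence": "00:01:01,33 [ALKIS] Gidiyorum artik..."}
print(remove_special_characters(batch)["sentence"].strip())   # -> "gidiyorum artik"
```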
@@ -218,7 +218,7 @@ from datasets import Dataset
  218    import csv
  219    
  220    #Walk all subdirectories of base_set_path and find csv files
- 221    base_set_path = r'C
+ 221    base_set_path = r'C:\\dataset_extracts'
  222    csv_files = []
  223    for path, subdirs, files in os.walk(base_set_path):
  224        for name in files:
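The hunk is cut off before the body of the inner loop; a plausible completion that collects every csv path is sketched below, where the .csv suffix check and os.path.join are assumptions rather than the README's exact code.

```python
import os

base_set_path = r'C:\dataset_extracts'   # path as given in the hunk

csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        if name.lower().endswith('.csv'):              # assumed filter
            csv_files.append(os.path.join(path, name))

print(f"found {len(csv_files)} csv files")
```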
@@ -228,7 +228,7 @@ for path, subdirs, files in os.walk(base_set_path):
  228    
  229    def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
  230        path = Path(csvfilename)
- 231        csv_delimiter="
+ 231        csv_delimiter="\\t" ##tab separated, change if something else
  232    
  233        ##Pandas has a bug reading non-ascii file names; make sure to use open() with an explicit encoding
  234        df=pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names, encoding='utf8')
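Putting this helper together end to end gives roughly the sketch below; the final Dataset.from_pandas conversion is an assumption, since the hunk ends before the function's return statement.

```python
import pandas as pd
from pathlib import Path
from datasets import Dataset

def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
    path = Path(csvfilename)
    csv_delimiter = "\t"   # tab-separated, change if the files use something else

    # pandas can stumble on non-ascii file names, so open the file
    # explicitly with utf-8 and hand the file object to read_csv
    df = pd.read_csv(open(path, 'r', encoding='utf-8'),
                     delimiter=csv_delimiter, header=None, names=names)

    # assumed: convert the frame into a datasets.Dataset for later concatenation
    return Dataset.from_pandas(df)
```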
@@ -246,7 +246,7 @@ from datasets import concatenate_datasets, load_dataset
  246    from datasets import load_from_disk
  247    
  248    # Merge datasets together (from csv files)
- 249    dataset_file_path = "
+ 249    dataset_file_path = ".\\dataset_file"
  250    custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
  251    
  252    #save this one to disk
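A short sketch of how these last steps could fit together, assuming the per-file datasets come from the csv helper above and that the merged result is persisted with save_to_disk (the save and load calls are not shown in the hunk).

```python
from datasets import concatenate_datasets, load_from_disk

dataset_file_path = ".\\dataset_file"   # path as given in the hunk

# build one Dataset per csv file found earlier, then merge them
custom_datasets = [get_dataset_from_csv_file(f) for f in csv_files]
custom_datasets_concat = concatenate_datasets([dset for dset in custom_datasets])

# save this one to disk, then reload it for training
custom_datasets_concat.save_to_disk(dataset_file_path)
custom_dataset = load_from_disk(dataset_file_path)
```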