gorkemgoknar committed · d739c10 · Parent(s): e7650c7
Update README.md

README.md CHANGED
@@ -32,6 +32,8 @@ model-index:
 Note: This model is trained with 5 Turkish movies in addition to the Common Voice dataset.
 Although WER is high (50%) on the Common Voice test set, performance on "other sources" seems pretty good.
 
+Disclaimer: Please use another wav2vec2-tr model from the hub for "clean environment" dialogues, as those models tend to do better on clean audio with less background noise.
+
 Dataset building and merging code (from csv files) can be found at the bottom of this Readme.
 
 Please try speech yourself on the right side to see its performance.
@@ -122,7 +124,7 @@ model.to("cuda")
 
 #Note: Not ignoring "'" on this one
 #Note: Not ignoring "'" on this one
-chars_to_ignore_regex = '[
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
 
 
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
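For readers who want to re-enable the commented-out resampler above, here is a minimal standalone sketch of resampling a clip to 16 kHz with torchaudio; the file name is only a placeholder and is not part of the README:

```python
import torchaudio

# Load a clip and resample it to 16 kHz, matching the commented-out
# Resample transform above. "example.wav" is a placeholder path.
speech, sample_rate = torchaudio.load("example.wav")
resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16_000)
speech_16k = resampler(speech)
```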
@@ -153,11 +155,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
 def remove_special_characters(batch):
 
     ##this one comes from subtitles if additional timestamps are not processed -> 00:01:01 00:01:01,33
-    batch["sentence"] = re.sub('
+    batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
     ##remove all-caps text in brackets, [AÇIKLAMA] etc; do it before the ignore list
-    batch["sentence"] = re.sub('
+    batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
     ##replace three dots inside the string with a single dot
-    batch["sentence"] = re.sub("([a-zA-Z]+)
+    batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
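To make the effect of these substitutions concrete, here is a small standalone sketch (not part of the diff) that applies the same regexes to a single string; `chars_to_ignore_regex` is the one defined earlier, and the sample sentence is made up:

```python
import re

# Same ignore list as defined earlier in the README
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

def clean_sentence(sentence):
    # drop leftover subtitle timestamps such as "00:01:02,10"
    sentence = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', sentence)
    # drop all-caps bracketed captions such as "[MUSIC]"
    sentence = re.sub('\\[(\\b[A-Z]+\\])', '', sentence)
    # collapse an ellipsis after a word into a single dot
    sentence = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\1.", sentence)
    # remove ignored punctuation and lowercase, as in the batch version
    return re.sub(chars_to_ignore_regex, '', sentence).lower() + " "

print(clean_sentence("00:01:02,10 [MUSIC] Hello there... How are you?"))
# timestamp, caption and punctuation are stripped, text is lowercased
```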
@@ -220,7 +222,7 @@ from datasets import Dataset
 import csv
 
 #Walk all subdirectories of base_set_path and find csv files
-base_set_path = r'C
+base_set_path = r'C:\\dataset_extracts'
 csv_files = []
 for path, subdirs, files in os.walk(base_set_path):
     for name in files:
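The walk loop is cut off in this hunk. It presumably filters for `.csv` files and stores their full paths, roughly as in the following sketch (an assumption, not the README's exact code):

```python
import os

csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        # keep only csv files, recorded with their full path
        if name.endswith(".csv"):
            csv_files.append(os.path.join(path, name))
```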
@@ -230,7 +232,7 @@ for path, subdirs, files in os.walk(base_set_path):
 
 def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
     path = Path(csvfilename)
-    csv_delimiter="
+    csv_delimiter = "\\t"  ##tab separated, change if the files use something else
 
     ##Pandas has a bug reading non-ascii file names, so open the file with an explicit encoding
     df = pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names, encoding='utf8')
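The merge step below uses a `custom_datasets` list that the diff never shows being built. Assuming `get_dataset_from_csv_file` returns a `datasets.Dataset` (as its name suggests), the glue is presumably something like:

```python
# Hypothetical glue code: one Dataset per csv file found above
custom_datasets = [get_dataset_from_csv_file(csv_file) for csv_file in csv_files]
```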
@@ -248,7 +250,7 @@ from datasets import concatenate_datasets, load_dataset
 from datasets import load_from_disk
 
 # Merge datasets together (from csv files)
-dataset_file_path = "
+dataset_file_path = ".\\dataset_file"
 custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
 
 #save this one to disk
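A sketch of the save-and-reload step that the last comment refers to, using the `load_from_disk` import shown above (the README's exact code may differ):

```python
# Save the merged dataset so it can be reused without re-reading the csv files
custom_datasets_concat.save_to_disk(dataset_file_path)

# Later, reload it directly from disk
from datasets import load_from_disk
merged_dataset = load_from_disk(dataset_file_path)
```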