gorkemgoknar
/

wav2vec2-large-xlsr-53-turkish

@@ -28,9 +28,10 @@ model-index:
 ---
 # Wav2Vec2-Large-XLSR-53-Turkish
-Note: Common voice Turkish data is no background noise voice only dataset
-In this model although Word Error rate for test is 50% it is agains Common Voice text
-Please try speech yourself and see it is converting pretty good
 I hope some news channels or movie producers will let us use their data for testing/training (I asked some, but got no reply).
@@ -119,7 +120,7 @@ model.to("cuda")
 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
-chars_to_ignore_regex = '[\\\\\\\\,\\\\\\\\?\\\\\\\\.\\\\\\\\!\\\\\\\\-\\\\\\\\;\\\\\\\\:\\\\\\\\"\\\\\\\\“\\\\\\\\%\\\\\\\\‘\\\\\\\\”\\\\\\\\�\\\\\\\\#\\\\\\\\>\\\\\\\\<\\\\\\\\_\\\\\\\\’\\\\\\\\[\\\\\\\\]\\\\\\\\{\\\\\\\\}]'
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
@@ -154,13 +155,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
-    batch["sentence"] = re.sub('\\\\\\\\b\\\\\\\\d{2}:\\\\\\\\d{2}:\\\\\\\\d{2}(,+\\\\\\\\d{2})?\\\\\\\\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
-    batch["sentence"] = re.sub('\\\\\\\\[(\\\\\\\\b[A-Z]+\\\\\\\\])', '', batch["sentence"])
     ##replace three dots (that are inside string with single)
-    batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\.\\\\\\\\.\\\\\\\\.", r"\\\\\\\\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

 ---
 # Wav2Vec2-Large-XLSR-53-Turkish
+Note: The Common Voice Turkish data is a voice-only dataset with no background noise.
+Although this model's Word Error Rate on the test set is 50%, that is measured against the Common Voice text.
+Please try it on your own speech and see that it converts pretty well.
 I hope some news channels or movie producers will let us use their data for testing/training (I asked some, but got no reply).
 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
+chars_to_ignore_regex = """[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]"""
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
+    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\b\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\d{2}(,+\\\\\\\\\\\\\\\\d{2})?\\\\\\\\\\\\\\\\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
+    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\[(\\\\\\\\\\\\\\\\b[A-Z]+\\\\\\\\\\\\\\\\])', '', batch["sentence"])
     ##replace three dots (that are inside string with single)
+    batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\.", r"\\\\\\\\\\\\\\\\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "