gorkemgoknar
/

wav2vec2-large-xlsr-53-turkish

@@ -49,39 +49,40 @@ model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turk
-def audio_resampler(batch,new_sample_rate = 16000):
-    ##torchaudio and librosa troublesome to use for mp3 in windows
     #speech_array, sampling_rate = torchaudio.load(batch["path"])
     #speech_array, sampling_rate = librosa.load(batch["path"])
-    #AudioSegment does the job over ffmpeg(need install)
-    sound = AudioSegment.from_file(file=batch["path"])
     sound = sound.set_frame_rate(new_sample_rate)
     left = sound.split_to_mono()[0]
     bit_depth = left.sample_width * 8
-    array_type = get_array_type(bit_depth)
     numeric_array = np.array(array.array(array_type, left._data) )
-    #windows hack as torchaudio cannot read mp3
     speech_array = torch.FloatTensor(numeric_array)
     batch["speech"] = numeric_array
-    batch["sampling_rate"] = new_sample_rate
-    batch["target_text"] = batch["sentence"]
     return batch
-resampler = audio_resampler(16000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-    speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
@@ -99,12 +100,8 @@ import torchaudio
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
-import torch
 import pydub
-from pydub.utils import mediainfo
 import array
-from pydub import AudioSegment
-from pydub.utils import get_array_type
 import numpy as np
 test_dataset = load_dataset("common_voice", "tr", split="test")
@@ -115,20 +112,12 @@ model.to("cuda")
 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
 new_sample_rate = 16000
-import torchaudio
-import torch
-import pydub
-import array
-import numpy as np
 def audio_resampler(batch, new_sample_rate = 16000):
     #not working without complex library compilation in windows for mp3
@@ -158,13 +147,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
-    batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
-    batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
     ##replace three dots (that are inside string) with single
-    batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

+new_sample_rate = 16000
+def audio_resampler(batch, new_sample_rate = 16000):
+    #not working without complex library compilation in windows for mp3
     #speech_array, sampling_rate = torchaudio.load(batch["path"])
     #speech_array, sampling_rate = librosa.load(batch["path"])
+    #sampling_rate =  pydub.utils.info['sample_rate']  ##gets current samplerate
+    sound = pydub.AudioSegment.from_file(file=batch["path"])
+    sampling_rate = new_sample_rate
     sound = sound.set_frame_rate(new_sample_rate)
     left = sound.split_to_mono()[0]
     bit_depth = left.sample_width * 8
+    array_type = pydub.utils.get_array_type(bit_depth)
     numeric_array = np.array(array.array(array_type, left._data) )
     speech_array = torch.FloatTensor(numeric_array)
     batch["speech"] = numeric_array
+    batch["sampling_rate"] = sampling_rate
+    #batch["target_text"] = batch["sentence"]
     return batch
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+    batch = audio_resampler(batch, new_sample_rate = new_sample_rate)
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
 import pydub
 import array
 import numpy as np
 test_dataset = load_dataset("common_voice", "tr", split="test")
 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
 new_sample_rate = 16000
 def audio_resampler(batch, new_sample_rate = 16000):
     #not working without complex library compilation in windows for mp3
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
+    batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
+    batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
     ##replace three dots (that are inside string) with single
+    batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
     #standart ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "