gorkemgoknar
/

wav2vec2-large-xlsr-53-turkish

Automatic Speech Recognition

xlsr-fine-tuning-week

Inference Endpoints

Model card Files Files and versions Community

gorkemgoknar committed on Mar 28, 2021

Commit

eea98ca

·

1 Parent(s): 6a41c83

Update README.md

Files changed (1) hide show

README.md +38 -2

README.md CHANGED Viewed

@@ -34,12 +34,48 @@ The model can be used directly (without a language model) as follows:
 ```python
 import torch
 import torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")
 processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
 model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
@@ -69,7 +105,7 @@ model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turk
 model.to("cuda")
 #Note: Not ignoring "'"  on this one
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.

 ```python
 import torch
 import torchaudio
+import pydub
+from pydub.utils import mediainfo
+import array
+from pydub import AudioSegment
+from pydub.utils import get_array_type
+import numpy as np
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")
 processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
 model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
+def audio_resampler(batch,new_sample_rate = 16000):  # reads batch["path"] and batch["sentence"]; returns the same batch with "speech", "sampling_rate" and "target_text" set
+    # torchaudio and librosa are troublesome to use for mp3 on Windows, so both loaders are left disabled below
+    #speech_array, sampling_rate = torchaudio.load(batch["path"])
+    #speech_array, sampling_rate = librosa.load(batch["path"])
+    # AudioSegment (pydub) does the job via ffmpeg, which needs to be installed
+    sound = AudioSegment.from_file(file=batch["path"])
+    sound = sound.set_frame_rate(new_sample_rate)  # convert to the requested sample rate
+    left = sound.split_to_mono()[0]  # keep only the first channel
+    bit_depth = left.sample_width * 8  # presumably sample_width is in bytes, hence * 8 for bits — confirm against pydub docs
+    array_type = get_array_type(bit_depth)
+    numeric_array = np.array(array.array(array_type, left._data) )  # NOTE(review): relies on the private `_data` attribute of the segment
+    # Windows workaround, as torchaudio cannot read mp3
+    speech_array = torch.FloatTensor(numeric_array)  # NOTE(review): never used afterwards; the NumPy array is what gets stored
+    batch["speech"] = numeric_array  # no scaling/normalization is applied in this function
+    batch["sampling_rate"] = new_sample_rate
+    batch["target_text"] = batch["sentence"]
+    return batch
+resampler = audio_resampler(16000)  # NOTE(review): 16000 is bound to `batch` here, so batch["path"] would fail on an int — the function looks meant to be applied per dataset row; confirm
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
 model.to("cuda")
 #Note: Not ignoring "'"  on this one
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.