gorkemgoknar
committed on
Commit
·
eea98ca
1
Parent(s):
6a41c83
Update README.md
Browse files
README.md
CHANGED
@@ -34,12 +34,48 @@ The model can be used directly (without a language model) as follows:
|
|
34 |
```python
|
35 |
import torch
|
36 |
import torchaudio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
from datasets import load_dataset
|
38 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
39 |
test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")
|
40 |
processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
|
41 |
model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
# Preprocessing the datasets.
|
44 |
# We need to read the audio files as arrays
|
45 |
def speech_file_to_array_fn(batch):
|
@@ -69,7 +105,7 @@ model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turk
|
|
69 |
model.to("cuda")
|
70 |
|
71 |
#Note: Not ignoring "'" on this one
|
72 |
-
chars_to_ignore_regex = '[
|
73 |
|
74 |
resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
75 |
# Preprocessing the datasets.
|
|
|
34 |
```python
|
35 |
import torch
|
36 |
import torchaudio
|
37 |
+
import pydub
|
38 |
+
from pydub.utils import mediainfo
|
39 |
+
import array
|
40 |
+
from pydub import AudioSegment
|
41 |
+
from pydub.utils import get_array_type
|
42 |
+
import numpy as np
|
43 |
+
|
44 |
from datasets import load_dataset
|
45 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
46 |
test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")
|
47 |
processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
|
48 |
model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
def audio_resampler(batch, new_sample_rate=16000):
    """Decode one dataset row's audio file and resample it.

    The file is decoded with pydub's ``AudioSegment`` (which goes through
    ffmpeg, so ffmpeg must be installed) instead of ``torchaudio.load`` or
    ``librosa.load``, because those are troublesome for mp3 files on Windows.

    Args:
        batch: Mapping with a ``"path"`` entry naming the audio file and a
            ``"sentence"`` entry holding its transcript.
        new_sample_rate: Frame rate the decoded audio is converted to.

    Returns:
        The same ``batch`` object with three entries set: ``"speech"`` (a
        NumPy array of the first channel's sample values, not rescaled),
        ``"sampling_rate"`` (``new_sample_rate``) and ``"target_text"``
        (the value of ``batch["sentence"]``).
    """
    sound = AudioSegment.from_file(file=batch["path"])
    sound = sound.set_frame_rate(new_sample_rate)

    # Only the first mono channel is kept (the left one for stereo input).
    first_channel = sound.split_to_mono()[0]

    # get_array_of_samples() is the public way to obtain the samples typed
    # according to the segment's sample width; it avoids reading the private
    # ``_data`` buffer and deriving the array type code by hand.
    batch["speech"] = np.array(first_channel.get_array_of_samples())
    batch["sampling_rate"] = new_sample_rate
    batch["target_text"] = batch["sentence"]
    return batch
|
76 |
+
|
77 |
+
def resampler(batch):
    """Resample one dataset row to 16 kHz by delegating to audio_resampler.

    Calling ``audio_resampler(16000)`` directly would pass the rate as the
    ``batch`` argument and fail on ``batch["path"]``; this wrapper keeps the
    name ``resampler`` bound to a callable that takes the row instead.
    """
    return audio_resampler(batch, new_sample_rate=16000)
|
78 |
+
|
79 |
# Preprocessing the datasets.
|
80 |
# We need to read the audio files as arrays
|
81 |
def speech_file_to_array_fn(batch):
|
|
|
105 |
model.to("cuda")
|
106 |
|
107 |
#Note: Not ignoring "'" on this one
|
108 |
+
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
|
109 |
|
110 |
resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
111 |
# Preprocessing the datasets.
|