gorkemgoknar committed on
Commit
ad33257
·
1 Parent(s): 0569c34

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +18 -29
README.md CHANGED
@@ -49,39 +49,40 @@ model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turk
49
 
50
 
51
 
52
- def audio_resampler(batch,new_sample_rate = 16000):
 
 
53
 
54
- ##torchaudio and librosa troublesome to use for mp3 in windows
55
  #speech_array, sampling_rate = torchaudio.load(batch["path"])
56
  #speech_array, sampling_rate = librosa.load(batch["path"])
57
 
58
- #AudioSegment does the job over ffmpeg(need install)
59
- sound = AudioSegment.from_file(file=batch["path"])
 
 
60
  sound = sound.set_frame_rate(new_sample_rate)
61
-
62
  left = sound.split_to_mono()[0]
63
  bit_depth = left.sample_width * 8
64
- array_type = get_array_type(bit_depth)
65
 
66
  numeric_array = np.array(array.array(array_type, left._data) )
67
 
68
- #windows hack as torchaudio cannot read mp3
69
  speech_array = torch.FloatTensor(numeric_array)
70
 
71
  batch["speech"] = numeric_array
72
- batch["sampling_rate"] = new_sample_rate
73
- batch["target_text"] = batch["sentence"]
74
 
75
  return batch
76
-
77
- resampler = audio_resampler(16000)
78
 
79
  # Preprocessing the datasets.
80
  # We need to read the audio files as arrays
81
  def speech_file_to_array_fn(batch):
82
- speech_array, sampling_rate = torchaudio.load(batch["path"])
83
- batch["speech"] = resampler(speech_array).squeeze().numpy()
84
  return batch
 
85
  test_dataset = test_dataset.map(speech_file_to_array_fn)
86
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
87
  with torch.no_grad():
@@ -99,12 +100,8 @@ import torchaudio
99
  from datasets import load_dataset, load_metric
100
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
101
  import re
102
- import torch
103
  import pydub
104
- from pydub.utils import mediainfo
105
  import array
106
- from pydub import AudioSegment
107
- from pydub.utils import get_array_type
108
  import numpy as np
109
 
110
  test_dataset = load_dataset("common_voice", "tr", split="test")
@@ -115,20 +112,12 @@ model.to("cuda")
115
 
116
  #Note: Not ignoring "'" on this one
117
  #Note: Not ignoring "'" on this one
118
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
119
 
120
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
121
  #using custom load and transformer for audio -> see audio_resampler
122
  new_sample_rate = 16000
123
 
124
-
125
-
126
- import torchaudio
127
- import torch
128
- import pydub
129
- import array
130
- import numpy as np
131
-
132
  def audio_resampler(batch, new_sample_rate = 16000):
133
 
134
  #not working without complex library compilation in windows for mp3
@@ -158,13 +147,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
158
  def remove_special_characters(batch):
159
 
160
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
161
- batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
162
 
163
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
164
- batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
165
 
166
  ##replace three dots (that are inside string with single)
167
- batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
168
 
169
  #standard ignore list
170
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
49
 
50
 
51
 
52
+ new_sample_rate = 16000
53
+
54
+ def audio_resampler(batch, new_sample_rate = 16000):
55
 
56
+ #not working without complex library compilation in windows for mp3
57
  #speech_array, sampling_rate = torchaudio.load(batch["path"])
58
  #speech_array, sampling_rate = librosa.load(batch["path"])
59
 
60
+ #sampling_rate = pydub.utils.info['sample_rate'] ##gets current samplerate
61
+
62
+ sound = pydub.AudioSegment.from_file(file=batch["path"])
63
+ sampling_rate = new_sample_rate
64
  sound = sound.set_frame_rate(new_sample_rate)
 
65
  left = sound.split_to_mono()[0]
66
  bit_depth = left.sample_width * 8
67
+ array_type = pydub.utils.get_array_type(bit_depth)
68
 
69
  numeric_array = np.array(array.array(array_type, left._data) )
70
 
 
71
  speech_array = torch.FloatTensor(numeric_array)
72
 
73
  batch["speech"] = numeric_array
74
+ batch["sampling_rate"] = sampling_rate
75
+ #batch["target_text"] = batch["sentence"]
76
 
77
  return batch
78
+
 
79
 
80
  # Preprocessing the datasets.
81
  # We need to read the audio files as arrays
82
  def speech_file_to_array_fn(batch):
83
+ batch = audio_resampler(batch, new_sample_rate = new_sample_rate)
 
84
  return batch
85
+
86
  test_dataset = test_dataset.map(speech_file_to_array_fn)
87
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
88
  with torch.no_grad():
 
100
  from datasets import load_dataset, load_metric
101
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
102
  import re
 
103
  import pydub
 
104
  import array
 
 
105
  import numpy as np
106
 
107
  test_dataset = load_dataset("common_voice", "tr", split="test")
 
112
 
113
  #Note: Not ignoring "'" on this one
114
  #Note: Not ignoring "'" on this one
115
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
116
 
117
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
118
  #using custom load and transformer for audio -> see audio_resampler
119
  new_sample_rate = 16000
120
 
 
 
 
 
 
 
 
 
121
  def audio_resampler(batch, new_sample_rate = 16000):
122
 
123
  #not working without complex library compilation in windows for mp3
 
147
  def remove_special_characters(batch):
148
 
149
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
150
+ batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
151
 
152
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
153
+ batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
154
 
155
  ##replace three dots (that are inside string with single)
156
+ batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
157
 
158
  #standard ignore list
159
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "