gorkemgoknar committed on
Commit
4d1fa17
·
1 Parent(s): 4cadfa5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +11 -4
README.md CHANGED
@@ -27,6 +27,13 @@ model-index:
27
  value: TBD
28
  ---
29
  # Wav2Vec2-Large-XLSR-53-Turkish
 
 
 
 
 
 
 
30
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice).
31
  When using this model, make sure that your speech input is sampled at 16kHz.
32
  ## Usage
@@ -112,7 +119,7 @@ model.to("cuda")
112
 
113
  #Note: Not ignoring "'" on this one
114
  #Note: Not ignoring "'" on this one
115
- chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\�\\\\#\\\\>\\\\<\\\\_\\\\’\\\\[\\\\]\\\\{\\\\}]'
116
 
117
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
118
  #using custom load and transformer for audio -> see audio_resampler
@@ -147,13 +154,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
147
  def remove_special_characters(batch):
148
 
149
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
150
- batch["sentence"] = re.sub('\\\\b\\\\d{2}:\\\\d{2}:\\\\d{2}(,+\\\\d{2})?\\\\b', ' ', batch["sentence"])
151
 
152
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
153
- batch["sentence"] = re.sub('\\\\[(\\\\b[A-Z]+\\\\])', '', batch["sentence"])
154
 
155
  ##replace three dots (that are inside string with single)
156
- batch["sentence"] = re.sub("([a-zA-Z]+)\\\\.\\\\.\\\\.", r"\\\\1.", batch["sentence"])
157
 
158
  #standart ignore list
159
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
27
  value: TBD
28
  ---
29
  # Wav2Vec2-Large-XLSR-53-Turkish
30
+
31
+ Note: The Common Voice Turkish data is a voice-only dataset with no background noise.
32
+ Although this model's word error rate (WER) on the test set is 50%, that figure is measured against the Common Voice text.
33
+ Please try it with your own speech and see that it transcribes quite well.
34
+ I hope some news channels or movie producers will let us use their data for testing/training (I asked some but got no reply).
35
+
36
+
37
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice).
38
  When using this model, make sure that your speech input is sampled at 16kHz.
39
  ## Usage
 
119
 
120
  #Note: Not ignoring "'" on this one
121
  #Note: Not ignoring "'" on this one
122
+ chars_to_ignore_regex = '[\\\\\\\\,\\\\\\\\?\\\\\\\\.\\\\\\\\!\\\\\\\\-\\\\\\\\;\\\\\\\\:\\\\\\\\"\\\\\\\\“\\\\\\\\%\\\\\\\\‘\\\\\\\\”\\\\\\\\�\\\\\\\\#\\\\\\\\>\\\\\\\\<\\\\\\\\_\\\\\\\\’\\\\\\\\[\\\\\\\\]\\\\\\\\{\\\\\\\\}]'
123
 
124
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
125
  #using custom load and transformer for audio -> see audio_resampler
 
154
  def remove_special_characters(batch):
155
 
156
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
157
+ batch["sentence"] = re.sub('\\\\\\\\b\\\\\\\\d{2}:\\\\\\\\d{2}:\\\\\\\\d{2}(,+\\\\\\\\d{2})?\\\\\\\\b', ' ', batch["sentence"])
158
 
159
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
160
+ batch["sentence"] = re.sub('\\\\\\\\[(\\\\\\\\b[A-Z]+\\\\\\\\])', '', batch["sentence"])
161
 
162
  ##replace three dots (that are inside string with single)
163
+ batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\.\\\\\\\\.\\\\\\\\.", r"\\\\\\\\1.", batch["sentence"])
164
 
165
  #standart ignore list
166
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "