gorkemgoknar
committed on
Commit
·
4d1fa17
1
Parent(s):
4cadfa5
Update README.md
Browse files
README.md
CHANGED
@@ -27,6 +27,13 @@ model-index:
|
|
27 |
value: TBD
|
28 |
---
|
29 |
# Wav2Vec2-Large-XLSR-53-Turkish
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice).
|
31 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
32 |
## Usage
|
@@ -112,7 +119,7 @@ model.to("cuda")
|
|
112 |
|
113 |
#Note: Not ignoring "'" on this one
|
114 |
#Note: Not ignoring "'" on this one
|
115 |
-
chars_to_ignore_regex = '[
|
116 |
|
117 |
#resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
118 |
#using custom load and transformer for audio -> see audio_resampler
|
@@ -147,13 +154,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
|
|
147 |
def remove_special_characters(batch):
|
148 |
|
149 |
##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
|
150 |
-
batch["sentence"] = re.sub('
|
151 |
|
152 |
##remove all caps in text [AÇIKLAMA] etc, do it before..
|
153 |
-
batch["sentence"] = re.sub('
|
154 |
|
155 |
##replace three dots (that are inside string with single)
|
156 |
-
batch["sentence"] = re.sub("([a-zA-Z]+)
|
157 |
|
158 |
#standard ignore list
|
159 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
|
|
|
27 |
value: TBD
|
28 |
---
|
29 |
# Wav2Vec2-Large-XLSR-53-Turkish
|
30 |
+
|
31 |
+
Note: The Common Voice Turkish data is a voice-only dataset with no background noise.
|
32 |
+
Although the word error rate (WER) of this model on the test set is 50%, that figure is measured against the Common Voice text.
|
33 |
+
Please try it on your own speech and see that it transcribes quite well.
|
34 |
+
I hope some news channels or movie producers will let their data be used for testing/training (I asked some, but got no reply).
|
35 |
+
|
36 |
+
|
37 |
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice).
|
38 |
When using this model, make sure that your speech input is sampled at 16kHz.
|
39 |
## Usage
|
|
|
119 |
|
120 |
#Note: Not ignoring "'" on this one
|
121 |
#Note: Not ignoring "'" on this one
|
122 |
+
chars_to_ignore_regex = '[\\\\\\\\,\\\\\\\\?\\\\\\\\.\\\\\\\\!\\\\\\\\-\\\\\\\\;\\\\\\\\:\\\\\\\\"\\\\\\\\“\\\\\\\\%\\\\\\\\‘\\\\\\\\”\\\\\\\\�\\\\\\\\#\\\\\\\\>\\\\\\\\<\\\\\\\\_\\\\\\\\’\\\\\\\\[\\\\\\\\]\\\\\\\\{\\\\\\\\}]'
|
123 |
|
124 |
#resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
125 |
#using custom load and transformer for audio -> see audio_resampler
|
|
|
154 |
def remove_special_characters(batch):
|
155 |
|
156 |
##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
|
157 |
+
batch["sentence"] = re.sub('\\\\\\\\b\\\\\\\\d{2}:\\\\\\\\d{2}:\\\\\\\\d{2}(,+\\\\\\\\d{2})?\\\\\\\\b', ' ', batch["sentence"])
|
158 |
|
159 |
##remove all caps in text [AÇIKLAMA] etc, do it before..
|
160 |
+
batch["sentence"] = re.sub('\\\\\\\\[(\\\\\\\\b[A-Z]+\\\\\\\\])', '', batch["sentence"])
|
161 |
|
162 |
##replace three dots (that are inside string with single)
|
163 |
+
batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\.\\\\\\\\.\\\\\\\\.", r"\\\\\\\\1.", batch["sentence"])
|
164 |
|
165 |
#standard ignore list
|
166 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
|