gorkemgoknar
committed on
Commit
·
4cadfa5
1
Parent(s):
ad33257
Update README.md
Browse files
README.md
CHANGED
@@ -112,7 +112,7 @@ model.to("cuda")
|
|
112 |
|
113 |
#Note: Not ignoring "'" on this one
|
114 |
#Note: Not ignoring "'" on this one
|
115 |
-
chars_to_ignore_regex = '[
|
116 |
|
117 |
#resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
118 |
#using custom load and transformer for audio -> see audio_resampler
|
@@ -147,13 +147,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
|
|
147 |
def remove_special_characters(batch):
|
148 |
|
149 |
##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
|
150 |
-
batch["sentence"] = re.sub('
|
151 |
|
152 |
##remove all caps in text [AÇIKLAMA] etc, do it before..
|
153 |
-
batch["sentence"] = re.sub('
|
154 |
|
155 |
##replace three dots (that are inside string with single)
|
156 |
-
batch["sentence"] = re.sub("([a-zA-Z]+)
|
157 |
|
158 |
#standard ignore list
|
159 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
|
@@ -188,6 +188,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=2)
|
|
188 |
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
189 |
|
190 |
```
|
191 |
-
**Test Result**:
|
192 |
## Training
|
193 |
-
The Common Voice `train` and `validation` datasets were used for training. Additional 5 Turkish movies with subtitles also used
|
|
|
|
112 |
|
113 |
#Note: Not ignoring "'" on this one
|
114 |
#Note: Not ignoring "'" on this one
|
115 |
+
chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\�\\\\#\\\\>\\\\<\\\\_\\\\’\\\\[\\\\]\\\\{\\\\}]'
|
116 |
|
117 |
#resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
118 |
#using custom load and transformer for audio -> see audio_resampler
|
|
|
147 |
def remove_special_characters(batch):
|
148 |
|
149 |
##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
|
150 |
+
batch["sentence"] = re.sub('\\\\b\\\\d{2}:\\\\d{2}:\\\\d{2}(,+\\\\d{2})?\\\\b', ' ', batch["sentence"])
|
151 |
|
152 |
##remove all caps in text [AÇIKLAMA] etc, do it before..
|
153 |
+
batch["sentence"] = re.sub('\\\\[(\\\\b[A-Z]+\\\\])', '', batch["sentence"])
|
154 |
|
155 |
##replace three dots (that are inside string with single)
|
156 |
+
batch["sentence"] = re.sub("([a-zA-Z]+)\\\\.\\\\.\\\\.", r"\\\\1.", batch["sentence"])
|
157 |
|
158 |
#standard ignore list
|
159 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
|
|
|
188 |
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
|
189 |
|
190 |
```
|
191 |
+
**Test Result**: 50.41 %
|
192 |
## Training
|
193 |
+
The Common Voice `train` and `validation` datasets were used for training. An additional 5 Turkish movies with subtitles were also used.
|
194 |
+
Training still continues...
|