gorkemgoknar committed on
Commit
4cadfa5
·
1 Parent(s): ad33257

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +7 -6
README.md CHANGED
@@ -112,7 +112,7 @@ model.to("cuda")
112
 
113
  #Note: Not ignoring "'" on this one
114
  #Note: Not ignoring "'" on this one
115
- chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
116
 
117
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
118
  #using custom load and transformer for audio -> see audio_resampler
@@ -147,13 +147,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
147
  def remove_special_characters(batch):
148
 
149
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
150
- batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
151
 
152
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
153
- batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
154
 
155
  ##replace three dots (that are inside string with single)
156
- batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
157
 
158
  #standart ignore list
159
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
@@ -188,6 +188,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=2)
188
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
189
 
190
  ```
191
- **Test Result**: TBD %
192
  ## Training
193
- The Common Voice `train` and `validation` datasets were used for training. Additional 5 Turkish movies with subtitles also used
 
 
112
 
113
  #Note: Not ignoring "'" on this one
114
  #Note: Not ignoring "'" on this one
115
+ chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\�\\\\#\\\\>\\\\<\\\\_\\\\’\\\\[\\\\]\\\\{\\\\}]'
116
 
117
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
118
  #using custom load and transformer for audio -> see audio_resampler
 
147
  def remove_special_characters(batch):
148
 
149
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
150
+ batch["sentence"] = re.sub('\\\\b\\\\d{2}:\\\\d{2}:\\\\d{2}(,+\\\\d{2})?\\\\b', ' ', batch["sentence"])
151
 
152
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
153
+ batch["sentence"] = re.sub('\\\\[(\\\\b[A-Z]+\\\\])', '', batch["sentence"])
154
 
155
  ##replace three dots (that are inside string with single)
156
+ batch["sentence"] = re.sub("([a-zA-Z]+)\\\\.\\\\.\\\\.", r"\\\\1.", batch["sentence"])
157
 
158
  #standart ignore list
159
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
188
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
189
 
190
  ```
191
+ **Test Result**: 50.41 %
192
  ## Training
193
+ The Common Voice `train` and `validation` datasets were used for training. An additional 5 Turkish movies with subtitles were also used.
194
+ Training is still in progress...