gorkemgoknar committed on
Commit
d739c10
·
1 Parent(s): e7650c7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -7
README.md CHANGED
@@ -32,6 +32,8 @@ model-index:
32
  Note: This model is trained on 5 Turkish movies in addition to the Common Voice dataset.
33
  Although WER is high (50%) on the Common Voice test dataset, performance on "other sources" seems pretty good.
34
 
 
 
35
  The code for building datasets from CSV files and merging them can be found at the bottom of this README.
36
 
37
  Please try your own speech on the right side to see how the model performs.
@@ -122,7 +124,7 @@ model.to("cuda")
122
 
123
  #Note: Not ignoring "'" on this one
124
  #Note: Not ignoring "'" on this one
125
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
126
 
127
 
128
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
@@ -153,11 +155,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
153
  def remove_special_characters(batch):
154
 
155
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
156
- batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
157
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
158
- batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
159
  ##replace three dots (that are inside string with single)
160
- batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
161
  #standard ignore list
162
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
163
 
@@ -220,7 +222,7 @@ from datasets import Dataset
220
  import csv
221
 
222
  #Walk all subdirectories of base_set_path and find csv files
223
- base_set_path = r'C:\dataset_extracts'
224
  csv_files = []
225
  for path, subdirs, files in os.walk(base_set_path):
226
  for name in files:
@@ -230,7 +232,7 @@ for path, subdirs, files in os.walk(base_set_path):
230
 
231
  def get_dataset_from_csv_file(csvfilename,names=['sentence', 'path']):
232
  path = Path(csvfilename)
233
- csv_delimiter="\t" ##tab seperated, change if something else
234
 
235
  ##Pandas has a bug reading non-ASCII file names, so make sure to use open() with an explicit encoding
236
  df=pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter,header=None , names=names, encoding='utf8')
@@ -248,7 +250,7 @@ from datasets import concatenate_datasets, load_dataset
248
  from datasets import load_from_disk
249
 
250
  # Merge datasets together (from csv files)
251
- dataset_file_path = ".\dataset_file"
252
  custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
253
 
254
  #save this one to disk
 
32
  Note: This model is trained on 5 Turkish movies in addition to the Common Voice dataset.
33
  Although WER is high (50%) on the Common Voice test dataset, performance on "other sources" seems pretty good.
34
 
35
+ Disclaimer: Please use another wav2vec2-tr model from the hub for "clean environment" dialogues, as those models tend to do better on clean audio with less background noise.
36
+
37
  The code for building datasets from CSV files and merging them can be found at the bottom of this README.
38
 
39
  Please try your own speech on the right side to see how the model performs.
 
124
 
125
  #Note: Not ignoring "'" on this one
126
  #Note: Not ignoring "'" on this one
127
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
128
 
129
 
130
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
155
  def remove_special_characters(batch):
156
 
157
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
158
+ batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
159
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
160
+ batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
161
  ##replace three dots (that are inside string with single)
162
+ batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
163
  #standard ignore list
164
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
165
 
 
222
  import csv
223
 
224
  #Walk all subdirectories of base_set_path and find csv files
225
+ base_set_path = r'C:\\dataset_extracts'
226
  csv_files = []
227
  for path, subdirs, files in os.walk(base_set_path):
228
  for name in files:
 
232
 
233
  def get_dataset_from_csv_file(csvfilename,names=['sentence', 'path']):
234
  path = Path(csvfilename)
235
+ csv_delimiter="\\t" ##tab seperated, change if something else
236
 
237
  ##Pandas has a bug reading non-ASCII file names, so make sure to use open() with an explicit encoding
238
  df=pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter,header=None , names=names, encoding='utf8')
 
250
  from datasets import load_from_disk
251
 
252
  # Merge datasets together (from csv files)
253
+ dataset_file_path = ".\\dataset_file"
254
  custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
255
 
256
  #save this one to disk