Dionyssos committed
Commit 5b7599e · 1 Parent(s): b073bdf

fx audiobook api

Files changed (5):
  1. README.md +3 -4
  2. text_utils.py → Utils/text_utils.py +1 -2
  3. api.py +22 -18
  4. audiobook.py +119 -151
  5. msinference.py +40 -36
README.md CHANGED
@@ -18,7 +18,7 @@ tags:
 
 # Affective TTS / Soundscape
 
-This repo is an expansion of the [SHIFT TTS tool](https://github.com/audeering/shift) with [foreign langs](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv) and audio sound generation via [AudioGen](https://huggingface.co/dkounadis/artificial-styletts2/discussions/3).
+Expansion of the [SHIFT TTS tool](https://github.com/audeering/shift) with [foreign langs](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv) and audio soundscape via [AudioGen](https://huggingface.co/dkounadis/artificial-styletts2/discussions/3).
 - Analysis of emotions of TTS [#1](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
 - `landscape2soundscape.py` generates soundscape, i.e. `trees, water, ..` & overlays TTS & creates video from image.
 
@@ -117,10 +117,9 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=
 
 # Audiobook
 
-Convert `.docx` to audio `.wav` & `.mp4`. Via multiple voices. Listen to it in YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
+Create an audiobook from `.docx`. Listen to it on YouTube: [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
 
 ```python
-# uses Flask api.py
-# download shift/assets/INCLUSION_IN_MUSEUMS_audiobook.docx
+# generated audiobook will be saved in ./tts_audiobooks
 python audiobook.py
 ```
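
With this commit `audiobook.py` no longer needs a local `shift` checkout; it fetches the `.docx` itself and writes the per-chapter and full-book `.wav` files under `./tts_audiobooks/voices/<voice>/`. A minimal sketch (assuming the `ROOT_DIR` layout set in this commit's `audiobook.py`) to inspect what was generated:

```python
# a sketch, assuming ROOT_DIR = './tts_audiobooks/voices/' as in audiobook.py
from pathlib import Path

for wav in sorted(Path('./tts_audiobooks/voices/').rglob('*.wav')):
    print(wav)  # e.g. tts_audiobooks/voices/en_US_vctk_p228/en_US_vctk_p228_chapter_0.wav
```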
text_utils.py → Utils/text_utils.py RENAMED
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import numpy as np
 import re
 import codecs
 import textwrap
@@ -118,4 +117,4 @@ def store_ssml(text=None,
     print(len(text),'\n\n\n\n\n\n\n', _s)
 
     with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
-        f.write(_s)
+        f.write(_s)
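
The rename moves `text_utils.py` under the `Utils/` package, so callers now import it as `Utils.text_utils` (as `api.py` does below). A one-line usage sketch, with a made-up input string:

```python
from Utils.text_utils import split_into_sentences

# returns a list of short sentences suitable for per-sentence TTS
print(split_into_sentences('Hello there. This is a second sentence.'))
```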
api.py CHANGED
@@ -3,7 +3,7 @@
 import numpy as np
 import soundfile
 import audresample
-import text_utils
+from Utils.text_utils import split_into_sentences
 import msinference
 import re
 import srt
@@ -35,7 +35,8 @@ nltk.download('punkt')
 # git remote set-url origin [email protected]:audeering/shift
 # ==
 
-
+def _shorten(filename):
+    return filename.replace("/","")[-6:]
 
 def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
@@ -144,19 +145,22 @@ def tts_multi_sentence(precomputed_style_vector=None,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1))
-    # Fallback - MMS TTS - Non-English Foreign voice=language
-    else:
-        x = []
-        for _sentence in text:
-            x.append(msinference.foreign(text=_sentence,
-                                         lang=voice, # voice = 'romanian', 'serbian' 'hungarian'
-                                         speed=speed))
+        x = np.concatenate(x)
 
-    x = np.concatenate(x)
+    # Fallback - MMS TTS - Non-English
+    else:
 
-    x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
+        # don't split foreign sentences: avoids re-load of VITS & random speaker-change issue
+        x = msinference.foreign(text=text,
+                                lang=voice, # voice = 'romanian', 'serbian' 'hungarian'
+                                speed=speed) # normalisation externally
+
+    # volume
+
+    x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
 
     return overlay(x, scene=scene)
@@ -185,15 +189,15 @@ def serve_wav():
 
     # Physically Save Client Files
     for filename, obj in request.files.items():
-        obj.save(f'{CACHE_DIR}{filename.replace("/","")}')
+        obj.save(f'{CACHE_DIR}{_shorten(filename)}')
 
     print('Saved all files on Server Side\n\n')
 
     args = SimpleNamespace(
-        text   = None if r.get('text')   is None else CACHE_DIR + r.get('text'  )[0][-6:],
-        video  = None if r.get('video')  is None else CACHE_DIR + r.get('video' )[0][-6:],
-        image  = None if r.get('image')  is None else CACHE_DIR + r.get('image' )[0][-6:],
-        native = None if r.get('native') is None else CACHE_DIR + r.get('native')[0][-6:],
+        text   = None if r.get('text')   is None else CACHE_DIR + _shorten(r.get('text'  )[0]), # crop last letters of original filename & use as tmp name
+        video  = None if r.get('video')  is None else CACHE_DIR + _shorten(r.get('video' )[0]),
+        image  = None if r.get('image')  is None else CACHE_DIR + _shorten(r.get('image' )[0]),
+        native = None if r.get('native') is None else CACHE_DIR + _shorten(r.get('native')[0]),
         affective = r.get('affective')[0],
         voice = r.get('voice')[0],
         speed = float(r.get('speed')[0]), # For Non-English MMS TTS
@@ -233,7 +237,7 @@ def serve_wav():
     with open(args.text, 'r') as f:
         t = ''.join(f)
     t = re.sub(' +', ' ', t) # delete spaces
-    text = text_utils.split_into_sentences(t) # split into short sentences (~200 phonemes max)
+    text = split_into_sentences(t) # split into short sentences (~100 phonemes max, to avoid OOM)
 
     # ====STYLE VECTOR====
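
The new `_shorten()` helper centralises the cropping that was previously inlined in `serve_wav()`: slashes are stripped and only the last 6 characters of the client's filename are kept, so the upload loop and the `SimpleNamespace` args derive the same cache name. A quick behavioural sketch (the filenames are made up):

```python
def _shorten(filename):
    return filename.replace("/","")[-6:]

print(_shorten('deps/video.mp4'))  # 'eo.mp4'
print(_shorten('out/fire.wav'))    # 're.wav'
# caveat: two uploads whose stripped names share the same last 6
# characters map to the same file in CACHE_DIR and overwrite each other
```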
 
audiobook.py CHANGED
@@ -1,22 +1,15 @@
-# FOR EACH VOICE -> create .wav file per chapter & full audiobook.wav from assets/INCLUSION_IN_MUSEUMS_audiobook.docx
-#
-# Chapters
-#
-# ROOT_DIR/voice/voxstr_CHAPTER_0.wav
-# ..
-# ROOT_DIR/voice/voxstr_CHAPTER_10.wav
-# ROOT_DIR/voice/voxstr_full_book.wav
-#
-# Full AudioBook
-#
-# ROOT_DIR/full_audiobook_all_voices.wav
+# creates a .wav file per chapter & a full audiobook.wav for assets/INCLUSION_IN_MUSEUMS_audiobook.docx
+# __________________________________________________________________________________________________
+# ROOT_DIR/voice/voice_CHAPTER_0.wav, .., ROOT_DIR/voice/voice_CHAPTER_10.wav
+# ROOT_DIR/voice/voice_full_book.wav
 
 import cv2
 import subprocess
 import numpy as np
 import soundfile
-import docx # pip install python-docx
-
+import docx # package = python-docx
+import audresample
+import urllib.request
 from pathlib import Path
 from moviepy.editor import *
 
@@ -25,98 +18,95 @@ ROOT_DIR = './tts_audiobooks/voices/'
 Path(ROOT_DIR).mkdir(parents=True,
                      exist_ok=True)
 voices = [
-    # 'en_US/hifi-tts_low#9017',
-    'en_US/m-ailabs_low#mary_ann',
-    'en_US/cmu-arctic_low#jmk',
-    # 'en_US/cmu-arctic_low#eey',
-    'en_UK/apope_low'
+    'en_US/vctk_low#p228',
+    # 'en_US/vctk_low#p326',
     ] # select any voice from - https://audeering.github.io/shift/
 
-d = docx.Document('../shift/assets/INCLUSION_IN_MUSEUMS_audiobook.docx') # slightly changed from the original .docx to be audible, e.g. by adding extra 'by them from this of etc.'
+urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "tmp.docx")
+
+d = docx.Document('tmp.docx') # slightly changed from the original .docx to be audible, e.g. by adding extra 'by them from this of etc.'
 
 last_paragraph_was_silence = False # add silence only once, at the 1st empty paragraph we detect
 
 chapter_counter = 0 # assure chapters start with CHAPTER: ONCE UPON A TIME
 
-youtube_video_parts = [] # audiobook .mp4 from each voice
-
 for vox in voices:
 
-    # string (map for assets/)
+    # string cleanup
 
     vox_str = vox.replace(
         '/', '_').replace(
         '#', '_').replace(
         'cmu-arctic', 'cmu_arctic').replace(
         '_low', '').replace('-','')
 
     # create dir for chapter_x.wav & audiobook.wav - for this voice vox
 
     Path(ROOT_DIR + vox_str + '/').mkdir(parents=True,
                                          exist_ok=True)
 
     print(vox)
 
     # for new voice, start list of audio tiles making up the 1st chapter of the book
 
     total = []
     chapter = []
 
-    for para in d.paragraphs[:41]:
+    for para in d.paragraphs: # [:41]
         t = para.text
 
         # start new chapter
 
         if t.startswith('CHAPTER:'):
 
             # silence for end of chapter
 
-            chapter.append(np.zeros(int(.1 * FS),
+            chapter.append(np.zeros(int(.24 * FS),
                                     dtype=np.float32))
 
             # chapter.wav
 
             audio = np.concatenate(chapter)
 
             soundfile.write(
                 ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
                 audio,
-                FS) # 27400?
+                16000) # 27400?
 
             # fill AUDIO of this chapter into total (for complete audiobook)
 
             total.append(audio)
 
             # new chapter
 
             chapter = []
 
             chapter_counter += 1
 
+            print(f'Start Chapter {chapter_counter}, timestamp:{int(np.concatenate(total).shape[0]/16000)//60}:{int(np.concatenate(total).shape[0]/16000)%60}')
+
         # If paragraph is non-empty -> TTS
 
         if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:
 
             # place paragraph text into .txt for tts.py
 
             with open('_tmp.txt', 'w') as f:
                 f.write(t.lower()) # WARNING! cast to lower otherwise accesibiliTy is pronounced accessibili..tay
 
-            print(t,'\n_____________________________\n')
-
             # TTS
 
             subprocess.run(
                 [
                     "python",
@@ -128,40 +118,41 @@ for vox in voices:
                     # '--scene', 'calm sounds of castle',
                     '--voice', vox,
                     '--out_file', '_tmp' # save on _tmp, load audio and concat to total
                 ])
 
             audio, _fs = soundfile.read('out/_tmp.wav')
-            print('CHAPTER\n\n\n\n____', audio.shape,'____\n')
+            audio = audresample.resample(audio.astype(np.float32), 24000, 16000)[0, :]
+            # print('CHAPTER\n\n\n\n____', audio.shape,'____\n')
             chapter.append(audio)
 
             # flag
 
             last_paragraph_was_silence = False
 
        # append silence if empty paragraph (e.g. end of Section)
 
         else:
 
             if not last_paragraph_was_silence: # skip multiple empty paragraphs - silence is added only once
 
                 chapter.append(np.zeros(int(.1 * FS),
                                         dtype=np.float32))
 
                 last_paragraph_was_silence = True
 
     # save full .wav audiobook - for this voice
 
     soundfile.write(
         ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
         np.concatenate(total),
-        FS) # 27400?
+        16000) # 27400?
 
     # pic TTS voice
 
-    voice_pic = np.zeros((768, 1024, 3), dtype=np.uint8)
+    voice_pic = np.zeros((574, 1024, 3), dtype=np.uint8)
 
     shift_logo = cv2.imread('assets/shift_banner.png')
 
@@ -177,27 +168,35 @@ for vox in voices:
     lineType = 2
     # voice
     cv2.putText(voice_pic, vox, # 'en_US/m-ailabs_low#mary_ann',
                 bottomLeftCornerOfText,
                 font,
                 fontScale,
                 fontColor,
                 thickness,
                 lineType)
-    # =
+    # = AUDIOBOOK
+    cv2.putText(voice_pic, 'AUDIOBOOK',
+                (170, 170),
+                font,
+                4,
+                fontColor,
+                thickness,
+                lineType)
+    # = VOICE
     cv2.putText(voice_pic, 'TTS voice =',
                 (0, 500),
                 font,
                 fontScale,
                 fontColor,
                 thickness,
                 lineType)
     STATIC_FRAME = '_tmp.png'
     cv2.imwrite(STATIC_FRAME, voice_pic)
 
     # MoviePy silence video
 
     SILENT_VIDEO = '_tmp.mp4'
 
     # SILENT CLIP
@@ -207,54 +206,23 @@ for vox in voices:
 
     # fuse vox_full_audiobook.wav & SILENT_VIDEO -> TO FINALLY CONCATENATE into YouTube Video
 
     # write final output video
     subprocess.call(
         ["ffmpeg",
          "-y",
          "-i",
          SILENT_VIDEO,
          "-i",
          ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
          "-c:v",
          "copy",
          "-map",
          "0:v:0",
          "-map",
          "1:a:0",
          ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4', # OUT_FILE
          ])
-
-    youtube_video_parts.append(ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4')
-
-# Final vid for YouTube
-
-with open('_youtube_video_parts.txt', 'w') as f:
-    _str = 'file ' + ' \n file '.join(youtube_video_parts)
-    f.write(_str)
-
-# # list of audiobooks of single vox
-# # --
-# # $ cat mylist.txt
-# # file '/path/to/file1'
-# # file '/path/to/file2'
-# # file '/path/to/file3'
-
-youtube_video_file = 'audiobook_shift_youtube.mp4'
-
-# ffmpeg -f concat -i video_parts.txt -c copy output.mp4
-subprocess.call(
-    ["ffmpeg",
-     "-y", # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
-     "-safe",
-     "0", # https://stackoverflow.com/questions/38996925/ffmpeg-concat-unsafe-file-name
-     "-f",
-     "concat", # https://stackoverflow.com/questions/7333232/how-to-concatenate-two-mp4-files-using-ffmpeg
-     "-i",
-     '_youtube_video_parts.txt',
-     "-c",
-     "copy",
-     youtube_video_file]
-    )
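
The central fix in this file: `tts.py` emits `out/_tmp.wav` at 24 kHz, while chapters and the full book are now written at 16 kHz, so every paragraph is downsampled before concatenation. A self-contained sketch of that step (synthetic one-second signal, not the repo's audio):

```python
import numpy as np
import audresample

audio = np.random.randn(24000).astype(np.float32)            # 1 s at 24 kHz (stand-in)
audio_16k = audresample.resample(audio, 24000, 16000)[0, :]  # (1, N) -> (N,) at 16 kHz
print(audio_16k.shape)                                       # ~ (16000,)
```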
msinference.py CHANGED
@@ -366,23 +366,13 @@ class TextForeign(object):
         return text_norm
 
     def filter_oov(self, text, lang=None):
-        text = self.preprocess_char(text, lang=lang)
         val_chars = self._symbol_to_id
         txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
         return txt_filt
 
-    def preprocess_char(self, text, lang=None):
-        """
-        Special treatement of characters in certain languages
-        """
-        if lang == "ron":
-            text = text.replace("ț", "ţ")
-            print(f"{lang} (ț -> ţ): {text}")
-        return text
-
-
-def foreign(text=None, lang='romanian', speed=None):
-    # TTS for non english languages supported by
+def foreign(text=None, # list of sentences
+            lang='romanian',
+            speed=None):
     # https://huggingface.co/spaces/mms-meta/MMS
 
     if 'hun' in lang.lower():
@@ -391,9 +381,9 @@ def foreign(text=None, lang='romanian', speed=None):
 
     elif 'ser' in lang.lower():
 
-        if has_cyrillic(text):
+        if has_cyrillic(text[0]): # check the 0-th sentence for cyrillic
 
-            lang_code = 'rmc-script_cyrillic' # romani carpathian (has also Vlax)
+            lang_code = 'rmc-script_cyrillic' # romani carpathian (also has latin/cyrillic Vlax)
 
         else:
 
@@ -439,28 +429,39 @@ def foreign(text=None, lang='romanian', speed=None):
     # TTS via MMS
 
    is_uroman = hps.data.training_files.split(".")[-1] == "uroman"
 
-    if is_uroman:
-        uroman_dir = "Utils/uroman"
-        assert os.path.exists(uroman_dir)
-        uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
-        text = text_mapper.uromanize(text, uroman_pl)
-
-    text = text.lower()
-    text = text_mapper.filter_oov(text, lang=lang)
-    stn_tst = text_mapper.get_text(text, hps)
-    with torch.no_grad():
-        print(f'{speed=}\n\n\n\n_______________________________')
-        x_tst = stn_tst.unsqueeze(0).to(device)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
-        x = (
-            net_g.infer(
-                x_tst,
-                x_tst_lengths,
-                noise_scale=0.667,
-                noise_scale_w=0.8,
-                length_scale=1.0 / speed)[0][0, 0].cpu().float().numpy()
-        )
+    # CALL TTS
+
+    x = []
+
+    for _t in text:
+
+        if is_uroman:
+            uroman_dir = "Utils/uroman"
+            assert os.path.exists(uroman_dir)
+            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
+            _t = text_mapper.uromanize(_t, uroman_pl)
+
+        _t = _t.lower().replace("ţ", "ț").replace('ț', 'ts') # .replace('Ţ', 'ts').replace('î', 'u').replace('Î', 'u')
+        _t = text_mapper.filter_oov(_t, lang=lang)
+        # print(f'{speed=}\n\n\n\n_______________________________ {_t}')
+        stn_tst = text_mapper.get_text(_t, hps)
+        with torch.no_grad():
+            x_tst = stn_tst.unsqueeze(0).to(device)
+            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
+            x.append(
+                net_g.infer(
+                    x_tst,
+                    x_tst_lengths,
+                    noise_scale=0.667,
+                    noise_scale_w=1, # 0.8
+                    length_scale=1.0 / speed)[0][0, 0].cpu().float().numpy()
+            )
+
+    x = np.concatenate(x)
 
     x /= np.abs(x).max() + 1e-7
 
     # hyp = (hyp * 32768).astype(np.int16)
@@ -470,6 +471,9 @@ def foreign(text=None, lang='romanian', speed=None):
     x = audresample.resample(signal=x.astype(np.float32),
                              original_rate=16000,
                              target_rate=24000)[0, :] # audresample reshapes (64,) -> (1,64)
 
     return x
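
With this change `foreign()` consumes the whole sentence list in one call, uromanising, filtering and synthesising each sentence with the single loaded VITS model, then concatenating, peak-normalising and resampling to 24 kHz. A hedged usage sketch (the sentences are made up; `lang` values per `api.py` are 'romanian', 'serbian', 'hungarian'):

```python
import soundfile
import msinference

sentences = ['Bună ziua.', 'Ce mai faceți?']  # caller pre-splits the text
x = msinference.foreign(text=sentences,
                        lang='romanian',
                        speed=1.0)
soundfile.write('foreign.wav', x, 24000)      # foreign() returns 24 kHz audio
```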