|
import numpy as np |
|
import soundfile |
|
import msinference |
|
|
|
|
|
def _style_wav_stem(voice):
    '''Turn a voice identifier into the base name of its reference .wav file.'''
    stem = voice.replace('/', '_').replace('#', '_')
    stem = stem.replace('cmu-arctic', 'cmu_arctic')
    return stem.replace('_low', '')


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='af_ZA_google-nwu_1919',
              speed=1.4,
              affect=True
              ):
    '''Synthesize `text` with the requested voice and return a peak-normalized array.

    The result is meant to be stored at 24 kHz (the demo call in this file
    writes it with a sample rate of 24000).

    Args:
        text: Sentence(s) to synthesize.
        voice: Selects both the synthesis path and the speaker / language:

            'en_US/vctk_low#p276'
                English voices -> https://audeering.github.io/shift/
                Any voice containing 'en_US/' or 'en_UK/' loads its style
                reference from 'assets/wavs/style_vector/'.

            'af_ZA_google-nwu_1919'
                English with non-native accents ->
                https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
                Any other voice containing '_' loads its style reference
                from 'assets/wavs/mimic3_foreign_4x/'.

            'deu'
                Foreign languages ->
                https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
                Everything else is passed to `msinference.foreign` as `lang`.

        speed: Forwarded only to `msinference.foreign`; the two style-vector
            paths do not receive it.
        affect: Only consulted for 'en_US/' / 'en_UK/' voices. When False the
            style reference is read from the 'v2/' sub-folder instead.

    Returns:
        The array produced by `msinference`, divided in place by its peak
        absolute value (plus 1e-7 to avoid dividing by zero on silence).
    '''
    # NOTE: 'en_US/...' voices also contain '_', so the English test must be
    # evaluated first to keep them out of the mimic3 folder.
    is_english = ('en_US/' in voice) or ('en_UK/' in voice)

    if is_english or '_' in voice:
        if is_english:
            wav_dir = 'assets/wavs/style_vector/' + ('' if affect else 'v2/')
        else:
            wav_dir = 'assets/wavs/mimic3_foreign_4x/'
        style_vector = msinference.compute_style(
            wav_dir + _style_wav_stem(voice) + '.wav')
        x = msinference.inference(text,
                                  style_vector)
    else:
        x = msinference.foreign(text=text,
                                lang=voice,
                                speed=speed)

    # Peak-normalize in place.
    x /= np.abs(x).max() + 1e-7
    print(x.shape, 'TTS OK')
    return x
|
|
|
# Demo: synthesize the default sentence with the default voice and save it
# to demo.wav with a sample rate of 24000.
soundfile.write(file='demo.wav', data=tts_entry(), samplerate=24000)
|
|