File size: 2,564 Bytes
fe62fb4
bdb4f02
 
 
 
 
9146509
bdb4f02
 
 
 
dafcadc
bdb4f02
dafcadc
bdb4f02
dafcadc
9146509
dafcadc
 
 
9146509
bdb4f02
 
dafcadc
 
bdb4f02
 
dafcadc
bdb4f02
 
 
 
 
dafcadc
bdb4f02
9146509
dafcadc
 
 
 
bdb4f02
 
 
 
dafcadc
bdb4f02
9146509
dafcadc
bdb4f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
import soundfile
import msinference


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='af_ZA_google-nwu_1919',  # other examples: 'en_US/vctk_low#p276', 'serbian', 'isl', 'abi'
              speed=1.4,  # forwarded to the non-English fallback only
              affect=True  # False -> reference wav taken from the 'v2/' sub-folder
              ):
    '''Synthesise `text` with the requested `voice` and return the waveform.

    The back-end is chosen from the shape of `voice`:

       voice : 'en_US/vctk_low#p276'    # contains 'en_US/' or 'en_UK/'
                                        # English voices -> https://audeering.github.io/shift/
                                        # reference wav read from assets/wavs/style_vector/

          or

       voice : 'af_ZA_google-nwu_1919'  # any other name containing '_'
                                        # English non-native accents -> https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
                                        # reference wav read from assets/wavs/mimic3_foreign_4x/

          or

       voice : 'deu'                    # anything else is passed as `lang` to msinference.foreign
                                        # foreign langs -> https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv

    text   : sentence(s) to speak.
    speed  : only used by the msinference.foreign fallback.
    affect : only used for 'en_US/' / 'en_UK/' voices; when False the reference
             wav comes from the 'v2/' sub-folder (author's note: higher clarity
             for partially sighted listeners).

    Returns the audio peak-normalised to about [-1, 1]; the demo at the bottom
    of this file saves it at 24 kHz. The normalisation is done in place on the
    array returned by msinference.
    '''

    # 1) Pick the folder holding the reference wav, or None for the MMS fallback.
    is_native_english = any(tag in voice for tag in ('en_US/', 'en_UK/'))

    if is_native_english:
        # StyleTTS2 - mimic-3 voice name (English text, English accent)
        wav_dir = 'assets/wavs/style_vector/'
        if not affect:
            wav_dir += 'v2/'
    elif '_' in voice:
        # StyleTTS2 - mimic-3 voice name (English text, foreign accent)
        wav_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        wav_dir = None

    # 2) Synthesise.
    if wav_dir is None:
        # Fallback - MMS TTS - non-English.
        # Foreign sentences are not split: avoids re-loading VITS and random
        # speaker changes (author's note). Normalisation happens below.
        x = msinference.foreign(text=text,
                                lang=voice,  # e.g. 'romanian', 'serbian', 'hungarian'
                                speed=speed)
    else:
        # Turn the mimic-3 voice name into the wav file stem; the substitutions
        # are applied in this exact order.
        wav_stem = voice
        for old, new in (('/', '_'),
                         ('#', '_'),
                         ('cmu-arctic', 'cmu_arctic'),
                         ('_low', '')):
            wav_stem = wav_stem.replace(old, new)

        style_vector = msinference.compute_style(wav_dir + wav_stem + '.wav')
        x = msinference.inference(text, style_vector)

    # 3) Volume: scale in place so the loudest sample is ~1; the epsilon keeps
    #    an all-zero signal from dividing by zero.
    peak = np.abs(x).max()
    x /= peak + 1e-7
    print(x.shape, 'TTS OK')
    return x

# Demo: synthesise the default sentence with the default voice and save it as
# a 24 kHz wav in the current working directory. Runs whenever this module is
# executed or imported.
soundfile.write('demo.wav',
                tts_entry(),
                24000)