File size: 1,943 Bytes
a9482ab
 
b87f08b
7f6563e
 
510f17f
aeceb48
7f6563e
8772ca9
7f6563e
d59ee2f
 
9630f4e
dd29aa4
e28cac3
a9482ab
a7d0893
9630f4e
 
48287cf
a9482ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63ced49
 
a9482ab
7663e41
a9482ab
 
132c7ea
2b65d86
d2e0f91
b7d4e28
d2e0f91
 
 
b7d4e28
d2e0f91
dd29aa4
 
 
d2e0f91
91eda71
 
7ad0327
91eda71
132c7ea
95ec227
91eda71
44e30b0
7670619
132c7ea
 
42ed2a2
a9482ab
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import IPython

import sys
import subprocess

# Install the Tortoise TTS fork at startup through the running interpreter's
# own pip ("sys.executable -m pip"), forcing an upgrade/reinstall even if a copy
# is already present. check_call raises CalledProcessError when pip exits with
# a non-zero status, which stops the script before the imports below run.
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "--force-reinstall", "git+https://github.com/osanseviero/tortoise-tts.git"])

# entmax could not be installed at the same time as torch, so it gets its own
# separate pip run after the install above has finished.
subprocess.check_call([sys.executable, "-m", "pip", "install", "entmax"])

from tortoise_tts.api import TextToSpeech
from tortoise_tts.utils.audio import load_audio, get_voices
import torch 
import torchaudio
import gradio as gr

# Run on the GPU whenever torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Building the TextToSpeech object downloads all the models used by Tortoise
# from the HF hub.
tts = TextToSpeech(device=device, autoregressive_batch_size=16)

# Voice names offered in the UI dropdown.
# NOTE(review): "William" is the only capitalized entry -- confirm it matches
# a key of voice_paths, since inference() indexes voice_paths by this name.
voices = [
    "angie",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "snakes",
    "tom",
    "William",
]

# Indexed by voice name inside inference(); printed once at startup.
voice_paths = get_voices()
print(voice_paths)

# Preset name handed to tts.tts_with_preset on every request.
preset = "fast"

def _lookup_voice_clips(voice):
    """Return the conditioning clip paths registered for *voice*.

    An exact key match in the module-level ``voice_paths`` mapping wins. If
    there is none, the name is compared case-insensitively so that a dropdown
    label such as "William" still resolves to a key spelled "william".

    Raises:
        KeyError: If no key of ``voice_paths`` matches, even ignoring case.
    """
    if voice in voice_paths:
        return voice_paths[voice]

    wanted = str(voice).lower()
    for name, clips in voice_paths.items():
        if name.lower() == wanted:
            return clips

    raise KeyError(
        "Unknown voice {!r}; available voices: {}".format(voice, sorted(voice_paths))
    )


def inference(text, voice):
    """Synthesize *text* in the selected *voice* and return the WAV file path.

    Args:
        text: Text to speak. Only the first 256 characters are used.
        voice: Name of the voice; looked up in the module-level
            ``voice_paths`` mapping (exact match first, then ignoring case).

    Returns:
        Path of a freshly created WAV file containing the generated audio.
        Every call writes its own file, so one request never overwrites the
        output of another.

    Raises:
        KeyError: If *voice* does not match any key of ``voice_paths``.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import tempfile

    # Cap the prompt length so a single request stays bounded.
    text = text[:256]

    # Load every reference clip for the voice. The second argument to
    # load_audio is presumably the sampling rate -- TODO confirm against
    # tortoise_tts.utils.audio.
    conds = [load_audio(clip_path, 22050) for clip_path in _lookup_voice_clips(voice)]

    # One short log line per request instead of dumping tensors to stdout.
    print("Generating {} characters with voice {!r} (preset {!r})".format(len(text), voice, preset))
    gen = tts.tts_with_preset(text, conds, preset)

    # Reserve a unique file name; the handle is closed before torchaudio
    # writes to it so the save also works on platforms that lock open files.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        out_path = handle.name

    # torchaudio.save takes (path, tensor, sample_rate): written at 24 kHz.
    torchaudio.save(out_path, gen.squeeze(0).cpu(), 24000)
    return out_path
 
# Default prompt shown in the text box when the page loads.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"

# Input widgets, in the order inference() receives them: (text, voice).
prompt_box = gr.inputs.Textbox(lines=3, label="Text", default=text, type="str")
voice_picker = gr.inputs.Dropdown(voices)

# Wire the widgets to inference(); the returned file path is rendered by the
# "audio" output component.
iface = gr.Interface(
    inference,
    inputs=[prompt_box, voice_picker],
    outputs="audio",
    title="TorToiSe",
    description="A multi-voice TTS system trained with an emphasis on quality",
    article="This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",
    enable_queue=True,
)

# Start serving the demo.
iface.launch()