# Gradio demo for the TorToiSe text-to-speech system.
#
# NOTE: the statement order at the top of this file is significant. The
# tortoise-tts package (and entmax) are installed at runtime, so the
# `tortoise_tts` imports must come *after* the pip calls below.
import IPython
import sys
import subprocess

subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "--upgrade",
        "--force-reinstall",
        "git+https://github.com/osanseviero/tortoise-tts.git",
    ]
)

# entmax could not be installed at same time as torch
subprocess.check_call([sys.executable, "-m", "pip", "install", "entmax"])

from tortoise_tts.api import TextToSpeech
from tortoise_tts.utils.audio import load_audio, get_voices
import torch
import torchaudio
import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"

# This will download all the models used by Tortoise from the HF hub
tts = TextToSpeech(autoregressive_batch_size=16, device=device)

# Voice names offered in the UI dropdown. Each one is used as a key into
# `voice_paths` inside `inference`.
voices = [
    "angie",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "snakes",
    "tom",
    "William",
]

# Mapping used by `inference` to look up the conditioning clips of a voice
# (presumably voice name -> list of audio file paths; confirm against
# `tortoise_tts.utils.audio.get_voices`).
voice_paths = get_voices()
print(voice_paths)

# Generation preset passed to `tts.tts_with_preset` for every request.
preset = "fast"


def inference(text, voice):
    """Synthesize `text` in the given `voice` and return the output file path.

    Args:
        text: Text to speak. Only the first 256 characters are used.
        voice: Key into the module-level `voice_paths` mapping.

    Returns:
        The string "generated.wav", the path the audio was written to.

    Raises:
        KeyError: If `voice` is not a key of `voice_paths` (assuming it is a
            plain dict -- confirm against `get_voices`).

    Note:
        The output path is fixed, so each call overwrites the previous
        result.
    """
    # Cap the prompt length to keep generation time bounded.
    text = text[:256]
    cond_paths = voice_paths[voice]
    print(voice_paths, voice, cond_paths)

    # Load every conditioning clip for the voice; 22050 is passed straight to
    # `load_audio` (presumably the target sampling rate -- confirm).
    conds = [load_audio(cond_path, 22050) for cond_path in cond_paths]

    print(text, conds, preset)
    gen = tts.tts_with_preset(text, conds, preset)
    print("gen")

    # Drop the leading dimension and move to CPU before saving; 24000 is the
    # sample rate written to the WAV file.
    torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
    return "generated.wav"


# Default prompt shown in the text box.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"

iface = gr.Interface(
    inference,
    inputs=[
        gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
        gr.inputs.Dropdown(voices),
    ],
    outputs="audio",
    title="TorToiSe",
    description="A multi-voice TTS system trained with an emphasis on quality",
    article="This demo shows the ultra fast option in the TorToiSe system. For more info check the Repository.",
    enable_queue=True,
)
iface.launch()