Spaces:
Runtime error
Runtime error
File size: 4,507 Bytes
a9482ab b87f08b 7f6563e cab646b 7f6563e 510f17f aeceb48 7f6563e 8772ca9 7f6563e d59ee2f 9630f4e dd29aa4 1a8cc73 e28cac3 a9482ab a7d0893 cab646b 9630f4e 55ef1e7 a9482ab 63ced49 a9482ab 7663e41 a9482ab 132c7ea 2b65d86 d2e0f91 b7d4e28 d2e0f91 b7d4e28 d2e0f91 dd29aa4 1a8cc73 9c77c78 d9f9ad4 1a8cc73 b9de5e1 1a8cc73 d2e0f91 91eda71 d9f9ad4 1256bad d9f9ad4 bfae475 1a8cc73 66ca704 1a8cc73 a0ece8c 1a8cc73 8622a01 ad2aa2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import IPython
import sys
import subprocess
import os
# Runtime dependency bootstrap: both installs must complete before the
# tortoise_tts imports that follow.
_pip_install = [sys.executable, "-m", "pip", "install"]
subprocess.check_call(
    _pip_install
    + ["--upgrade", "--force-reinstall", "git+https://github.com/osanseviero/tortoise-tts.git"]
)
# entmax gets its own pip invocation because it could not be installed at the
# same time as torch.
subprocess.check_call(_pip_install + ["entmax"])
from tortoise_tts.api import TextToSpeech
from tortoise_tts.utils.audio import load_audio, get_voices
import torch
import torchaudio
import numpy as np
import gradio as gr
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Make CUDA kernel launches synchronous so an error is reported at the call
# that caused it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# This will download all the models used by Tortoise from the HF hub.
# Pass the detected device rather than a hard-coded "cuda" so the app can
# still start on CPU-only hardware.
tts = TextToSpeech(device=device)
# Voice names offered in the dropdown; each one is used as a key into
# `voice_paths` by `inference`.
# NOTE(review): "William" is the only capitalised entry - confirm it matches
# the key returned by get_voices().
voices = [
    "angie",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "snakes",
    "William",
]
# Lookup from voice name to that voice's conditioning clip paths.
voice_paths = get_voices()
print(voice_paths)
# Generation preset handed to `tts.tts_with_preset` by both inference functions.
preset = "fast"
def inference(text, voice):
    """Synthesize ``text`` with one of the pre-recorded voices.

    Args:
        text: Prompt to speak; only the first 256 characters are used.
        voice: Key into the module-level ``voice_paths`` lookup.

    Returns:
        The path of the written audio file, ``"generated.wav"``.
    """
    prompt = text[:256]
    clip_paths = voice_paths[voice]
    print(voice_paths, voice, clip_paths)
    # Load every conditioning clip of the selected voice at 22050 Hz.
    clips = [load_audio(clip_path, 22050) for clip_path in clip_paths]
    print(prompt, clips, preset)
    generated = tts.tts_with_preset(prompt, clips, preset)
    print("gen")
    waveform = generated.squeeze(0).cpu()
    torchaudio.save('generated.wav', waveform, 24000)
    return "generated.wav"
def load_audio_special(sr, data, sampling_rate=22050):
    """Convert a raw ``(sample rate, samples)`` recording into a conditioning clip.

    Args:
        sr: Sample rate of ``data`` in Hz.
        data: NumPy array of samples with dtype int16, int32 or any float
            type. May be 1-D or 2-D; for 2-D input only the first channel is
            kept.
        sampling_rate: Rate the returned clip is resampled to. Defaults to
            22050, the rate ``inference`` requests for the pre-recorded voices.

    Returns:
        A float32 tensor of shape ``(1, num_samples)`` clipped to ``[-1, 1]``.

    Raises:
        TypeError: If ``data`` has an unsupported dtype.
        ValueError: If ``data`` is empty or its channel layout is not
            recognised.
    """
    data = np.asarray(data)
    # Integer PCM is scaled into [-1, 1); float input is taken as already scaled.
    if data.dtype == np.int32:
        norm_fix = 2 ** 31
    elif data.dtype == np.int16:
        norm_fix = 2 ** 15
    elif np.issubdtype(data.dtype, np.floating):
        norm_fix = 1.
    else:
        raise TypeError(f"Unsupported audio dtype: {data.dtype}")
    audio = torch.from_numpy(data.astype(np.float32)) / norm_fix
    if audio.numel() == 0:
        raise ValueError("Audio recording is empty.")

    # Remove any channel data.
    if len(audio.shape) > 1:
        if audio.shape[0] < 5:
            # Fewer than 5 rows: treat the layout as (channels, samples).
            audio = audio[0]
        elif audio.shape[1] < 5:
            # Otherwise expect (samples, channels).
            audio = audio[:, 0]
        else:
            raise ValueError(f"Cannot identify the channel axis of audio with shape {tuple(audio.shape)}")

    # Bring the recording to the requested rate; the caller's rate is
    # whatever the recording device produced.
    if sr != sampling_rate:
        audio = torchaudio.functional.resample(audio, int(sr), int(sampling_rate))

    # Check some assumptions about audio range and squawk when they do not hold.
    # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
    if torch.any(audio > 2) or not torch.any(audio < 0):
        print(f"Warning: unexpected audio range. Max={audio.max()} min={audio.min()}")
    audio.clip_(-1, 1)

    return audio.unsqueeze(0)
def inference_own_voice(text, voice_1, voice_2, voice_3):
    """Synthesize ``text`` using the caller's own recordings as the voice.

    Args:
        text: Prompt to speak; only the first 256 characters are used.
        voice_1: ``(sample_rate, samples)`` pair, or ``None`` when nothing
            was recorded in that slot.
        voice_2: Same as ``voice_1``.
        voice_3: Same as ``voice_1``.

    Returns:
        The path of the written audio file, ``"generated.wav"``.

    Raises:
        ValueError: If none of the three recordings was supplied.
    """
    text = text[:256]
    print(voice_1)
    # Skip empty slots instead of failing on the first missing recording.
    recordings = [voice for voice in (voice_1, voice_2, voice_3) if voice is not None]
    if not recordings:
        raise ValueError("Please record at least one audio clip before generating speech.")
    conds = [load_audio_special(voice[0], voice[1]) for voice in recordings]
    print(text, conds, preset)
    gen = tts.tts_with_preset(text, conds, preset)
    print("gen")
    torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
    return "generated.wav"
# Default prompt: pre-filled in both text boxes and reused by the examples.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
# [prompt, voice] pairs offered as one-click examples on the first tab.
examples = [
    [text, "angie"],
    [text, "emma"],
    ["how are you doing this day", "freeman"],
]

block = gr.Blocks(enable_queue=True)
with block:
    gr.Markdown("# TorToiSe")
    gr.Markdown("A multi-voice TTS system trained with an emphasis on quality")
    with gr.Tabs():
        with gr.TabItem("Pre-recorded voices"):
            iface = gr.Interface(
                inference,
                inputs=[
                    gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
                    gr.inputs.Dropdown(voices),
                ],
                outputs="audio",
                examples=examples,
            )
        with gr.TabItem("Record your voice (experimental, might not work well)"):
            iface = gr.Interface(
                inference_own_voice,
                inputs=[
                    gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
                    # Three microphone inputs that differ only in their label number.
                    *[
                        gr.inputs.Audio(
                            source="microphone",
                            label=f"Record yourself reading something out loud (audio {index})",
                            type="numpy",
                        )
                        for index in (1, 2, 3)
                    ],
                ],
                outputs="audio",
            )
    # Footer shown below the tabs.
    gr.Markdown("This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.")

# The stray trailing "|" that followed this call made the statement invalid.
block.launch(debug=True)