Spaces:
Running
Running
xJuuzouYTx
committed on
Commit
·
f98d769
1
Parent(s):
5837809
[ADD] coquitts
Browse files
- app.py +41 -9
- packages.txt +2 -0
- requirements.txt +4 -1
- tts/constants.py +1 -1
- tts/conversion.py +19 -62
app.py
CHANGED
@@ -6,8 +6,9 @@ import hashlib
|
|
6 |
from utils.model import model_downloader, get_model
|
7 |
import requests
|
8 |
import json
|
|
|
9 |
from tts.constants import VOICE_METHODS, BARK_VOICES, EDGE_VOICES
|
10 |
-
from tts.conversion import tts_infer, ELEVENLABS_VOICES_RAW, ELEVENLABS_VOICES_NAMES
|
11 |
|
12 |
api_url = "https://rvc-models-api.onrender.com/uploadfile/"
|
13 |
|
@@ -18,6 +19,17 @@ if not os.path.exists(zips_folder):
|
|
18 |
if not os.path.exists(unzips_folder):
|
19 |
os.mkdir(unzips_folder)
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def calculate_md5(file_path):
|
22 |
hash_md5 = hashlib.md5()
|
23 |
with open(file_path, "rb") as f:
|
@@ -82,16 +94,26 @@ def post_model(name, model_url, version, creator):
|
|
82 |
|
83 |
md5_hash = calculate_md5(os.path.join(unzips_folder,model_files['pth']))
|
84 |
zipfile = compress(modelname, list(model_files.values()))
|
|
|
|
|
85 |
file_to_upload = open(zipfile, "rb")
|
|
|
|
|
|
|
|
|
86 |
data = {
|
87 |
"name": name,
|
88 |
"version": version,
|
89 |
"creator": creator,
|
90 |
-
"hash": md5_hash
|
|
|
|
|
|
|
91 |
}
|
92 |
print("Subiendo archivo...")
|
93 |
# Realizar la solicitud POST
|
94 |
response = requests.post(api_url, files={"file": file_to_upload}, data=data)
|
|
|
95 |
|
96 |
# Comprobar la respuesta
|
97 |
if response.status_code == 200:
|
@@ -100,6 +122,7 @@ def post_model(name, model_url, version, creator):
|
|
100 |
else:
|
101 |
print("Error al cargar el archivo:", response.status_code)
|
102 |
return result
|
|
|
103 |
|
104 |
def search_model(name):
|
105 |
web_service_url = "https://script.google.com/macros/s/AKfycbyRaNxtcuN8CxUrcA_nHW6Sq9G2QJor8Z2-BJUGnQ2F_CB8klF4kQL--U2r2MhLFZ5J/exec"
|
@@ -130,11 +153,13 @@ def search_model(name):
|
|
130 |
|
131 |
def update_tts_methods_voice(select_value):
|
132 |
if select_value == "Edge-tts":
|
133 |
-
return gr.update(choices=EDGE_VOICES), gr.Markdown.update(visible=False), gr.Textbox.update(visible=False)
|
134 |
elif select_value == "Bark-tts":
|
135 |
-
return gr.update(choices=BARK_VOICES), gr.Markdown.update(visible=False), gr.Textbox.update(visible=False)
|
136 |
elif select_value == 'ElevenLabs':
|
137 |
-
return gr.update(choices=ELEVENLABS_VOICES_NAMES), gr.Markdown.update(visible=True), gr.Textbox.update(visible=True)
|
|
|
|
|
138 |
|
139 |
with gr.Blocks() as app:
|
140 |
gr.HTML("<h1> Simple RVC Inference - by Juuxn 💻 </h1>")
|
@@ -168,7 +193,14 @@ with gr.Blocks() as app:
|
|
168 |
with gr.Row():
|
169 |
tts_method = gr.Dropdown(choices=VOICE_METHODS, value="Edge-tts", label="Método TTS:", visible=True)
|
170 |
tts_model = gr.Dropdown(choices=ELEVENLABS_VOICES_NAMES, label="Modelo TTS:", visible=True, interactive=True)
|
171 |
-
tts_api_key = gr.Textbox(label="ElevenLabs Api key", show_label=True, placeholder="4a4afce72349680c8e8b6fdcfaf2b65a",interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
tts_btn = gr.Button(value="Convertir")
|
174 |
|
@@ -176,13 +208,13 @@ with gr.Blocks() as app:
|
|
176 |
tts_vc_output1 = gr.Textbox(label="Salida")
|
177 |
tts_vc_output2 = gr.Audio(label="Audio de salida")
|
178 |
|
179 |
-
tts_btn.click(fn=tts_infer, inputs=[tts_text, tts_model_url, tts_method, tts_model, tts_api_key], outputs=[tts_vc_output1, tts_vc_output2])
|
180 |
|
181 |
tts_msg = gr.Markdown("""**Recomiendo que te crees una cuenta de eleven labs y pongas tu clave de api, es gratis y tienes 10k caracteres de limite al mes.** <br/>
|
182 |
![Imgur](https://imgur.com/HH6YTu0.png)
|
183 |
-
""", visible=
|
184 |
|
185 |
-
tts_method.change(fn=update_tts_methods_voice, inputs=[tts_method], outputs=[tts_model, tts_msg, tts_api_key])
|
186 |
|
187 |
with gr.Tab("Modelos"):
|
188 |
gr.HTML("<h4>Buscar modelos</h4>")
|
|
|
6 |
from utils.model import model_downloader, get_model
|
7 |
import requests
|
8 |
import json
|
9 |
+
import torch
|
10 |
from tts.constants import VOICE_METHODS, BARK_VOICES, EDGE_VOICES
|
11 |
+
from tts.conversion import tts_infer, ELEVENLABS_VOICES_RAW, ELEVENLABS_VOICES_NAMES, COQUI_LANGUAGES
|
12 |
|
13 |
api_url = "https://rvc-models-api.onrender.com/uploadfile/"
|
14 |
|
|
|
19 |
if not os.path.exists(unzips_folder):
|
20 |
os.mkdir(unzips_folder)
|
21 |
|
22 |
+
def get_info(path):
    """Load the checkpoint file ``path`` located under ``unzips_folder``.

    Returns the object produced by ``torch.load`` (tensors mapped to CPU).
    If loading raises, the error is printed and an empty dict is returned
    so that callers can still use ``.get(...)`` on the result.
    """
    # NOTE(review): torch.load relies on pickle by default; confirm that the
    # files placed in unzips_folder come from a trusted source.
    checkpoint_path = os.path.join(unzips_folder, path)
    try:
        return torch.load(checkpoint_path, map_location="cpu")
    except Exception as e:
        print("*****************eeeeeeeeeeeeeeeeeeeerrrrrrrrrrrrrrrrrr*****")
        print(e)
    return {}
|
33 |
def calculate_md5(file_path):
|
34 |
hash_md5 = hashlib.md5()
|
35 |
with open(file_path, "rb") as f:
|
|
|
94 |
|
95 |
md5_hash = calculate_md5(os.path.join(unzips_folder,model_files['pth']))
|
96 |
zipfile = compress(modelname, list(model_files.values()))
|
97 |
+
|
98 |
+
a = get_info(model_files.get('pth'))
|
99 |
file_to_upload = open(zipfile, "rb")
|
100 |
+
info = a.get("info", "None"),
|
101 |
+
sr = a.get("sr", "None"),
|
102 |
+
f0 = a.get("f0", "None"),
|
103 |
+
|
104 |
data = {
|
105 |
"name": name,
|
106 |
"version": version,
|
107 |
"creator": creator,
|
108 |
+
"hash": md5_hash,
|
109 |
+
"info": info,
|
110 |
+
"sr": sr,
|
111 |
+
"f0": f0
|
112 |
}
|
113 |
print("Subiendo archivo...")
|
114 |
# Realizar la solicitud POST
|
115 |
response = requests.post(api_url, files={"file": file_to_upload}, data=data)
|
116 |
+
result = response.json()
|
117 |
|
118 |
# Comprobar la respuesta
|
119 |
if response.status_code == 200:
|
|
|
122 |
else:
|
123 |
print("Error al cargar el archivo:", response.status_code)
|
124 |
return result
|
125 |
+
|
126 |
|
127 |
def search_model(name):
|
128 |
web_service_url = "https://script.google.com/macros/s/AKfycbyRaNxtcuN8CxUrcA_nHW6Sq9G2QJor8Z2-BJUGnQ2F_CB8klF4kQL--U2r2MhLFZ5J/exec"
|
|
|
153 |
|
154 |
def update_tts_methods_voice(select_value):
    """Return UI updates matching the selected TTS method.

    The returned 4-tuple is consumed, in order, by the voice/model dropdown,
    the ElevenLabs hint Markdown, the ElevenLabs API-key textbox and the
    CoquiTTS language radio (see the ``tts_method.change`` wiring).

    Args:
        select_value: Name of the TTS method picked in the method dropdown.

    Returns:
        A tuple of four Gradio update objects, or ``None`` when
        ``select_value`` is not one of the handled method names.
    """
    if select_value == "Edge-tts":
        # visible=True re-shows the dropdown after CoquiTTS has hidden it.
        return (
            gr.update(choices=EDGE_VOICES, visible=True),
            gr.Markdown.update(visible=False),
            gr.Textbox.update(visible=False),
            gr.Radio.update(visible=False),
        )
    elif select_value == "Bark-tts":
        return (
            gr.update(choices=BARK_VOICES, visible=True),
            gr.Markdown.update(visible=False),
            gr.Textbox.update(visible=False),
            gr.Radio.update(visible=False),
        )
    elif select_value == 'ElevenLabs':
        # Only ElevenLabs needs the account hint and the API-key field.
        return (
            gr.update(choices=ELEVENLABS_VOICES_NAMES, visible=True),
            gr.Markdown.update(visible=True),
            gr.Textbox.update(visible=True),
            gr.Radio.update(visible=False),
        )
    elif select_value == 'CoquiTTS':
        # CoquiTTS is driven by the language radio, so that is the control
        # that must become visible; the ElevenLabs-only widgets stay hidden.
        # NOTE(review): tts_infer appears to check `tts_model` before
        # dispatching; confirm a hidden, empty dropdown does not block
        # CoquiTTS requests.
        return (
            gr.update(visible=False),
            gr.Markdown.update(visible=False),
            gr.Textbox.update(visible=False),
            gr.Radio.update(visible=True),
        )
|
163 |
|
164 |
with gr.Blocks() as app:
|
165 |
gr.HTML("<h1> Simple RVC Inference - by Juuxn 💻 </h1>")
|
|
|
193 |
with gr.Row():
|
194 |
tts_method = gr.Dropdown(choices=VOICE_METHODS, value="Edge-tts", label="Método TTS:", visible=True)
|
195 |
tts_model = gr.Dropdown(choices=ELEVENLABS_VOICES_NAMES, label="Modelo TTS:", visible=True, interactive=True)
|
196 |
+
tts_api_key = gr.Textbox(label="ElevenLabs Api key", show_label=True, placeholder="4a4afce72349680c8e8b6fdcfaf2b65a",interactive=True, visible=False)
|
197 |
+
|
198 |
+
tts_coqui_languages = gr.Radio(
|
199 |
+
label="Language",
|
200 |
+
choices=COQUI_LANGUAGES,
|
201 |
+
value="en",
|
202 |
+
visible=False
|
203 |
+
)
|
204 |
|
205 |
tts_btn = gr.Button(value="Convertir")
|
206 |
|
|
|
208 |
tts_vc_output1 = gr.Textbox(label="Salida")
|
209 |
tts_vc_output2 = gr.Audio(label="Audio de salida")
|
210 |
|
211 |
+
tts_btn.click(fn=tts_infer, inputs=[tts_text, tts_model_url, tts_method, tts_model, tts_api_key, tts_coqui_languages], outputs=[tts_vc_output1, tts_vc_output2])
|
212 |
|
213 |
tts_msg = gr.Markdown("""**Recomiendo que te crees una cuenta de eleven labs y pongas tu clave de api, es gratis y tienes 10k caracteres de limite al mes.** <br/>
|
214 |
![Imgur](https://imgur.com/HH6YTu0.png)
|
215 |
+
""", visible=False)
|
216 |
|
217 |
+
tts_method.change(fn=update_tts_methods_voice, inputs=[tts_method], outputs=[tts_model, tts_msg, tts_api_key, tts_coqui_languages])
|
218 |
|
219 |
with gr.Tab("Modelos"):
|
220 |
gr.HTML("<h4>Buscar modelos</h4>")
|
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
libsndfile1
|
2 |
+
espeak-ng
|
requirements.txt
CHANGED
@@ -169,4 +169,7 @@ firebase_admin
|
|
169 |
nltk
|
170 |
gdown
|
171 |
validators
|
172 |
-
git+https://github.com/suno-ai/bark.git
|
|
|
|
|
|
|
|
169 |
nltk
|
170 |
gdown
|
171 |
validators
|
172 |
+
#git+https://github.com/suno-ai/bark.git
|
173 |
+
#tortoise-tts
|
174 |
+
#git+https://github.com/neonbjb/tortoise-tts.git
|
175 |
+
neon-tts-plugin-coqui==0.7.3a1
|
tts/constants.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
VOICE_METHODS = ["Edge-tts", "ElevenLabs",]
|
2 |
|
3 |
BARK_VOICES = [
|
4 |
"v2/en_speaker_0-Male",
|
|
|
1 |
+
VOICE_METHODS = ["Edge-tts", "CoquiTTS", "ElevenLabs",]
|
2 |
|
3 |
BARK_VOICES = [
|
4 |
"v2/en_speaker_0-Male",
|
tts/conversion.py
CHANGED
@@ -9,7 +9,10 @@ from inference import Inference
|
|
9 |
import asyncio
|
10 |
from elevenlabs import voices, generate, save
|
11 |
from elevenlabs.api.error import UnauthenticatedRateLimitError
|
|
|
|
|
12 |
|
|
|
13 |
ELEVENLABS_VOICES_RAW = voices()
|
14 |
|
15 |
def get_elevenlabs_voice_names():
|
@@ -20,50 +23,11 @@ def get_elevenlabs_voice_names():
|
|
20 |
|
21 |
ELEVENLABS_VOICES_NAMES = get_elevenlabs_voice_names()
|
22 |
|
23 |
-
#
|
24 |
-
|
25 |
-
|
26 |
-
# from nltk.tokenize import sent_tokenize
|
27 |
-
# from bark import SAMPLE_RATE
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
def cast_to_device(tensor, device):
|
32 |
-
try:
|
33 |
-
return tensor.to(device)
|
34 |
-
except Exception as e:
|
35 |
-
print(e)
|
36 |
-
return tensor
|
37 |
-
|
38 |
-
# Buscar la forma de evitar descargar el archivo de 4gb cada vez que crea una instancia
|
39 |
-
# def _bark_conversion_(text, voice_preset):
|
40 |
-
# os.makedirs(os.path.join(now_dir, "tts"), exist_ok=True)
|
41 |
-
|
42 |
-
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
43 |
-
# dtype = torch.float32 if "cpu" in device else torch.float16
|
44 |
-
# bark_processor = AutoProcessor.from_pretrained(
|
45 |
-
# "suno/bark",
|
46 |
-
# cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
|
47 |
-
# torch_dtype=dtype,
|
48 |
-
# )
|
49 |
-
# bark_model = BarkModel.from_pretrained(
|
50 |
-
# "suno/bark",
|
51 |
-
# cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
|
52 |
-
# torch_dtype=dtype,
|
53 |
-
# ).to(device)
|
54 |
-
# # bark_model.enable_cpu_offload()
|
55 |
-
# inputs = bark_processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
|
56 |
-
# tensor_dict = {
|
57 |
-
# k: cast_to_device(v, device) if hasattr(v, "to") else v
|
58 |
-
# for k, v in inputs.items()
|
59 |
-
# }
|
60 |
-
# speech_values = bark_model.generate(**tensor_dict, do_sample=True)
|
61 |
-
# sampling_rate = bark_model.generation_config.sample_rate
|
62 |
-
# speech = speech_values.cpu().numpy().squeeze()
|
63 |
-
# return speech, sampling_rate
|
64 |
-
|
65 |
-
|
66 |
-
def tts_infer(tts_text, model_url, tts_method, tts_model, tts_api_key):
|
67 |
if not tts_text:
|
68 |
return 'Primero escribe el texto que quieres convertir.', None
|
69 |
if not tts_model:
|
@@ -79,8 +43,8 @@ def tts_infer(tts_text, model_url, tts_method, tts_model, tts_api_key):
|
|
79 |
tts_text = tts_text[:60]
|
80 |
print("DEMO; limit to 60 characters")
|
81 |
|
82 |
-
language = tts_model[:2]
|
83 |
if tts_method == "Edge-tts":
|
|
|
84 |
try:
|
85 |
asyncio.run(
|
86 |
edge_tts.Communicate(
|
@@ -102,6 +66,17 @@ def tts_infer(tts_text, model_url, tts_method, tts_model, tts_api_key):
|
|
102 |
tts.save(converted_tts_filename)
|
103 |
print("Error: Audio will be replaced.")
|
104 |
success = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
if tts_method == 'ElevenLabs':
|
106 |
try:
|
107 |
audio = generate(
|
@@ -117,25 +92,7 @@ def tts_infer(tts_text, model_url, tts_method, tts_model, tts_api_key):
|
|
117 |
|
118 |
if not model_url:
|
119 |
return 'Pon la url del modelo si quieres aplicarle otro tono.', converted_tts_filename
|
120 |
-
|
121 |
-
# elif tts_method == "Bark-tts":
|
122 |
-
# try:
|
123 |
-
# script = tts_text.replace("\n", " ").strip()
|
124 |
-
# sentences = sent_tokenize(script)
|
125 |
-
# silence = np.zeros(int(0.25 * SAMPLE_RATE))
|
126 |
-
# pieces = []
|
127 |
-
# for sentence in sentences:
|
128 |
-
# audio_array, _ = _bark_conversion_(sentence, tts_model.split("-")[0])
|
129 |
-
# pieces += [audio_array, silence.copy()]
|
130 |
|
131 |
-
# sf.write(
|
132 |
-
# file=converted_tts_filename, samplerate=SAMPLE_RATE, data=np.concatenate(pieces)
|
133 |
-
# )
|
134 |
-
|
135 |
-
# except Exception as e:
|
136 |
-
# print(f"{e}")
|
137 |
-
# return None, None
|
138 |
-
|
139 |
if success:
|
140 |
inference = Inference(
|
141 |
model_name=model_url,
|
|
|
9 |
import asyncio
|
10 |
from elevenlabs import voices, generate, save
|
11 |
from elevenlabs.api.error import UnauthenticatedRateLimitError
|
12 |
+
from neon_tts_plugin_coqui import CoquiTTS
|
13 |
+
import tempfile
|
14 |
|
15 |
+
# Elevenlabs
|
16 |
ELEVENLABS_VOICES_RAW = voices()
|
17 |
|
18 |
def get_elevenlabs_voice_names():
|
|
|
23 |
|
24 |
ELEVENLABS_VOICES_NAMES = get_elevenlabs_voice_names()
|
25 |
|
26 |
+
# CoquiTTS
|
27 |
+
COQUI_LANGUAGES = list(CoquiTTS.langs.keys())
|
28 |
+
coquiTTS = CoquiTTS()
|
|
|
|
|
29 |
|
30 |
+
def tts_infer(tts_text, model_url, tts_method, tts_model, tts_api_key, language):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
if not tts_text:
|
32 |
return 'Primero escribe el texto que quieres convertir.', None
|
33 |
if not tts_model:
|
|
|
43 |
tts_text = tts_text[:60]
|
44 |
print("DEMO; limit to 60 characters")
|
45 |
|
|
|
46 |
if tts_method == "Edge-tts":
|
47 |
+
language = tts_model[:2]
|
48 |
try:
|
49 |
asyncio.run(
|
50 |
edge_tts.Communicate(
|
|
|
66 |
tts.save(converted_tts_filename)
|
67 |
print("Error: Audio will be replaced.")
|
68 |
success = False
|
69 |
+
|
70 |
+
# if tts_method == "Tortoise":
|
71 |
+
# api.TextToSpeech()
|
72 |
+
|
73 |
+
if tts_method == "CoquiTTS":
|
74 |
+
print(tts_text, language)
|
75 |
+
# return output
|
76 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
|
77 |
+
coquiTTS.get_tts(tts_text, fp, speaker = {"language" : language})
|
78 |
+
return fp.name
|
79 |
+
|
80 |
if tts_method == 'ElevenLabs':
|
81 |
try:
|
82 |
audio = generate(
|
|
|
92 |
|
93 |
if not model_url:
|
94 |
return 'Pon la url del modelo si quieres aplicarle otro tono.', converted_tts_filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
if success:
|
97 |
inference = Inference(
|
98 |
model_name=model_url,
|