# Hugging Face Space page header (status: "Running on Zero"); not part of the program.
import gradio as gr | |
import base64 | |
import tempfile | |
import json | |
import os | |
from os.path import abspath | |
import zipfile | |
import random | |
import xtts | |
import re | |
# DO_CHECK == "1" (the default) enables the warm-up request issued at startup.
DO_CHECK = os.getenv('DO_CHECK', '1')
# Root folder for everything this app writes.
OUTPUT = "./demo_outputs"
# Maps a cloned speaker name to the embeddings loaded from / saved to its JSON file.
cloned_speakers = {}

print("Preparing file structure...")
_cloned_speakers_dir = os.path.join(OUTPUT, "cloned_speakers")
# makedirs(exist_ok=True) also repairs a partially created tree (OUTPUT present
# but a subfolder missing), which the previous exists()/mkdir() logic skipped.
os.makedirs(_cloned_speakers_dir, exist_ok=True)
os.makedirs(os.path.join(OUTPUT, "generated_audios"), exist_ok=True)

print("Loading existing cloned speakers...")
for _speaker_file in sorted(os.listdir(_cloned_speakers_dir)):
    if not _speaker_file.endswith(".json"):
        continue
    try:
        with open(os.path.join(_cloned_speakers_dir, _speaker_file), "r") as _speaker_fp:
            # The speaker name is the file name without its ".json" suffix.
            cloned_speakers[_speaker_file[:-5]] = json.load(_speaker_fp)
    except (OSError, ValueError) as _load_err:
        # One unreadable/corrupt file must not prevent the app from starting.
        print("Skipping unreadable cloned speaker file", _speaker_file, "-", _load_err)
print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))
# Folder where generated .wav files are written; derived from OUTPUT so the two
# cannot drift apart (normpath keeps the value free of a leading "./").
AUDIOS_DIR = os.path.normpath(os.path.join(OUTPUT, "generated_audios"))
# Folder where "Download All Files" archives are written.
ZIP_DIR = "zip_outputs"
print("Checking zip at", ZIP_DIR)
# exist_ok avoids the check-then-create race and also covers a missing AUDIOS_DIR.
os.makedirs(AUDIOS_DIR, exist_ok=True)
os.makedirs(ZIP_DIR, exist_ok=True)
# Fetch the language list and the studio speakers from the XTTS server once at
# import time; the UI below is built from these values.
try:
    print("Getting metadata from server ...")
    LANUGAGES = xtts.get_languages()
    print("Available languages:", ", ".join(LANUGAGES))
    STUDIO_SPEAKERS = xtts.get_speakers()
    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
except Exception as err:
    # "except Exception" (not a bare except) lets Ctrl+C / SystemExit through,
    # and "from err" keeps the real failure visible in the traceback.
    raise Exception("Please make sure the server is running first.") from err
def ExtractVars(input_string):
    """Split a text segment into ``!key = value`` directives and plain text.

    Lines whose first non-blank character is ``!`` are directives. Each one is
    split on its FIRST ``=`` only, so values may themselves contain ``=``.
    Directives without ``=`` or with an empty key are ignored. Blank lines are
    dropped; every other line is kept unchanged.

    Args:
        input_string: The raw text of one segment.

    Returns:
        A ``(variables, text)`` tuple. ``variables`` always contains the keys
        ``prefix``, ``name``, ``speaker`` and ``num`` (defaults ``None``,
        ``''``, ``None``, ``None``) plus any other directive found; ``text`` is
        the remaining lines joined with ``'\\n'``.
    """
    result_dict = {
        'prefix': None,
        'name': '',
        'speaker': None,
        'num': None,
    }
    filtered_lines = []

    for line in input_string.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        if not stripped.startswith('!'):
            filtered_lines.append(line)
            continue
        # partition() never raises and keeps any further '=' inside the value.
        key, separator, value = stripped[1:].partition('=')
        key = key.strip()
        if separator and key:
            result_dict[key] = value.strip()

    return result_dict, '\n'.join(filtered_lines)
def ParsePronucs(PronuncStr):
    """Parse the pronunciation table entered as ``WORD = SPOKEN | opt1,opt2``.

    One entry per line. The line is split on its first ``=``; the right-hand
    side is split on its first ``|`` into the spoken text and a comma-separated
    option list. Blank lines, lines without ``=`` and lines with an empty word
    are skipped instead of raising.

    Args:
        PronuncStr: The raw multi-line table (``None`` is treated as empty).

    Returns:
        A list of ``{'word': str, 'text': str, 'opts': list[str]}`` dicts in
        input order. Options are stripped and empty options are dropped.
    """
    PronuncWords = []
    for line in (PronuncStr or "").split('\n'):
        entry = line.strip()
        if not entry:
            continue
        word, separator, spoken = entry.partition('=')
        word = word.strip()
        if not separator or not word:
            # Not a "WORD = SPOKEN" line; ignore it rather than crash.
            continue
        text, _, raw_opts = spoken.partition('|')
        # Strip each option so "x, cs" yields 'cs' and not ' cs'.
        opts = [opt.strip() for opt in raw_opts.split(',') if opt.strip()]
        PronuncWords.append({'word': word, 'text': text.strip(), 'opts': opts})
    return PronuncWords
def FindSpeakerByName(name, speakerType):
    """Look up a speaker by full name or by first name.

    Searches ``STUDIO_SPEAKERS`` when ``speakerType == "Studio"``, otherwise
    ``cloned_speakers``. An exact key match always wins; only when there is
    none is the first key whose first space-separated token equals ``name``
    used.

    Args:
        name: Full speaker name or its first word.
        speakerType: ``"Studio"`` or any other value for cloned speakers.

    Returns:
        ``(key, value)`` for the matching speaker, or ``(None, None)`` when
        nothing matches, so callers can always unpack the result.
    """
    srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers
    if not name:
        return None, None
    # Exact match first, so "Ana" is not shadowed by an earlier "Ana Florence".
    if name in srcItems:
        return name, srcItems[name]
    for key, value in srcItems.items():
        if key.split(" ")[0] == name:
            return key, value
    return None, None
def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
    """Create a cloned speaker from a reference audio and persist it.

    The embeddings returned by ``xtts.predict_speaker`` are saved as
    ``<OUTPUT>/cloned_speakers/<name>.json``, registered in the module-level
    ``cloned_speakers`` dict, and the name is added to the session list.

    Args:
        upload_file: Reference audio as received from the upload component.
        clone_speaker_name: Name typed by the user; it becomes a file name.
        cloned_speaker_names: Session list of cloned speaker names (mutated).

    Returns:
        ``(upload_file, name, cloned_speaker_names, dropdown)`` where
        ``dropdown`` is a ``gr.Dropdown`` listing the updated names.

    Raises:
        gr.Error: If no audio was uploaded or the name is empty or contains
            path components.
    """
    if not upload_file:
        raise gr.Error("Please upload a reference audio first.")

    speaker_name = (clone_speaker_name or "").strip()
    # The name is user input that ends up in a file path: refuse anything that
    # could escape the cloned_speakers folder.
    has_path_parts = any(sep in speaker_name for sep in ("/", "\\", "\x00"))
    if not speaker_name or speaker_name in (".", "..") or has_path_parts:
        raise gr.Error("Invalid speaker name.")

    embeddings = xtts.predict_speaker(upload_file)

    speakers_dir = os.path.join(OUTPUT, "cloned_speakers")
    os.makedirs(speakers_dir, exist_ok=True)
    with open(os.path.join(speakers_dir, speaker_name + ".json"), "w") as fp:
        json.dump(embeddings, fp)

    cloned_speakers[speaker_name] = embeddings
    # Re-cloning an existing name overwrites it; do not list it twice.
    if speaker_name not in cloned_speaker_names:
        cloned_speaker_names.append(speaker_name)
    return upload_file, speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
def _safe_audio_filename(name):
    """Return *name* with path separators and other unsafe characters replaced by '_'."""
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name)


def _apply_pronunciations(text, pronunciations):
    """Replace each whole-word occurrence of every pronunciation entry in *text*."""
    for entry in pronunciations:
        word = entry['word']
        if not word:
            continue
        # 'cs' makes the match case-sensitive; the default ignores case.
        flags = 0 if 'cs' in entry['opts'] else re.IGNORECASE
        # re.escape treats the word literally (e.g. "C++"). The lookarounds act
        # like \b for ordinary words but still work when the word starts or
        # ends with punctuation.
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        spoken = entry['text']
        # A callable replacement keeps backslashes in the spoken text literal.
        text = re.sub(pattern, lambda _match, _spoken=spoken: _spoken, text, flags=flags)
    return text


def tts(
    text,
    pronunc,
    speaker_type,
    speaker_name_studio,
    speaker_name_custom,
    lang,
    temperature,
    speed,
    top_p,
    top_k,
    AllFileList,
    progress=gr.Progress(),
):
    """Generate one audio file per ``---``-separated segment of *text*.

    Each segment may carry ``!prefix``, ``!name``, ``!num`` and ``!speaker``
    directives (see ``ExtractVars``). A prefix stays in effect for the
    following segments. Files are written to ``AUDIOS_DIR`` as
    ``<prefix>_n_<num>[_<name>].wav``. Segments with no text left after the
    directives are removed are skipped.

    Args:
        text: Full input text; segments are separated by ``---``.
        pronunc: Pronunciation table text (see ``ParsePronucs``).
        speaker_type: ``'Studio'`` or anything else for cloned speakers.
        speaker_name_studio: Default speaker when the type is ``'Studio'``.
        speaker_name_custom: Default speaker for cloned speakers.
        lang: Language passed to the server.
        temperature, speed, top_p, top_k: Sampling options passed through
            unchanged to ``xtts.TTSInputs``.
        AllFileList: Session list that is cleared and refilled in place with
            the paths generated by this call.
        progress: Gradio progress tracker used to iterate the segments.

    Returns:
        A ``gr.Dropdown`` listing the generated file paths, with the first one
        selected (or no selection when nothing was generated).

    Raises:
        ValueError: If a segment names a speaker that cannot be found.
    """
    segments = (text or "").split("---")
    print("Total parts:", len(segments))

    default_speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom
    # Random 8-character prefix used until a segment sets "!prefix".
    current_prefix = os.urandom(4).hex()
    pronunciations = ParsePronucs(pronunc)
    os.makedirs(AUDIOS_DIR, exist_ok=True)

    generated_files = []
    for audio_num, segment in enumerate(progress.tqdm(segments, desc="Gerando fala..."), start=1):
        text_vars, clean_text = ExtractVars(segment)
        if text_vars['prefix']:
            current_prefix = text_vars['prefix']
        if not clean_text.strip():
            # e.g. a trailing "---" or a directive-only segment: nothing to speak.
            print("Skipping empty part", audio_num)
            continue

        name_suffix = '_' + text_vars['name'] if text_vars['name'] else ''
        num = text_vars['num'] or audio_num
        # prefix/name/num come from user text, so neutralise path separators.
        file_name = _safe_audio_filename(f"{current_prefix}_n_{num}{name_suffix}.wav")
        print("Generating audio for line", num, 'sequence', audio_num)

        speaker = text_vars['speaker'] or default_speaker
        found = FindSpeakerByName(speaker, speaker_type)
        speaker_name, embeddings = found if found else (None, None)
        if not speaker_name:
            raise ValueError("InvalidSpeaker: " + str(speaker))

        fixed_text = _apply_pronunciations(clean_text, pronunciations)
        ipts = xtts.TTSInputs(
            speaker_embedding=embeddings["speaker_embedding"],
            gpt_cond_latent=embeddings["gpt_cond_latent"],
            text=fixed_text,
            language=lang,
            temperature=temperature,
            speed=speed,
            top_k=top_k,
            top_p=top_p,
        )
        generated_audio = xtts.predict_speech(ipts)

        print("Audio generated.. Saving to", file_name)
        generated_audio_path = os.path.join(AUDIOS_DIR, file_name)
        # The response is decoded as base64 before being written to disk.
        with open(generated_audio_path, "wb") as fp:
            fp.write(base64.b64decode(generated_audio))
        generated_files.append(generated_audio_path)

    AllFileList.clear()
    AllFileList.extend(generated_files)
    return gr.Dropdown(
        label="Generated Audios",
        choices=list(generated_files),
        value=generated_files[0] if generated_files else None,
    )
def get_file_content(f):
    """Return the first element of *f*, or ``None`` when *f* is empty or ``None``."""
    # Truthiness covers both the empty sequence and None (len(None) would raise).
    return f[0] if f else None
def UpdateFileList(DirListState):
    """Refill *DirListState* in place with the entry names found in ``AUDIOS_DIR``.

    Names are sorted because ``os.listdir`` returns them in arbitrary order.
    A missing directory yields an empty list instead of an exception.

    Args:
        DirListState: Mutable list to clear and repopulate.
    """
    DirListState.clear()
    try:
        entries = os.listdir(AUDIOS_DIR)
    except FileNotFoundError:
        entries = []
    DirListState.extend(sorted(entries))
def audio_list_update(d):
    """Return the absolute path of the selected audio, or ``None`` if nothing is selected.

    Args:
        d: Path chosen in the audio dropdown; may be ``None`` or empty.
    """
    # abspath(None) raises and abspath("") is the working directory, neither of
    # which is a playable file, so report "no audio" instead.
    if not d:
        return None
    return abspath(d)
def ZipAndDownload(files):
    """Bundle *files* into a new zip under ``ZIP_DIR`` and return a download link.

    Each file is stored in the archive under its base name, deflate-compressed.
    Entries that are empty or no longer exist on disk are skipped.

    Args:
        files: Iterable of file paths (``None`` is treated as empty).

    Returns:
        An HTML anchor pointing at ``/file=<absolute zip path>``, or an empty
        string when there is nothing to zip.
    """
    existing = [path for path in (files or []) if path and os.path.isfile(path)]
    if not existing:
        print("No files to zip")
        return ""

    zip_dir = abspath(ZIP_DIR)
    os.makedirs(zip_dir, exist_ok=True)
    # Random archive name built with public APIs only.
    zipFile = os.path.join(zip_dir, os.urandom(4).hex() + ".zip")

    with zipfile.ZipFile(zipFile, 'w') as zipMe:
        for file in existing:
            print("Zipping", file)
            zipMe.write(abspath(file), os.path.basename(file), compress_type=zipfile.ZIP_DEFLATED)
    print("Pronto", zipFile)
    return '<a href="/file='+zipFile+'">If donwload dont starts, click here</a>'
# Page-load script handed to gr.Blocks(js=...). It watches the HTML element
# with id "DonwloadLink" and, as soon as an <a> appears inside it (or its href
# changes), navigates to that href so the zip download starts by itself.
# The id must stay in sync with the elem_id of the gr.HTML component.
js = """
function DetectDownloadLink(){
    console.log('Configuring AutoDonwloadObservr...');
    let hiddenLink = document.getElementById("DonwloadLink");
    if (!hiddenLink) {
        // observe(null) would throw; without the element there is nothing to watch.
        console.warn('DonwloadLink element not found.');
        return;
    }
    let lastHref = null;
    let startDownload = function(anchor){
        // Ignore missing anchors and repeated notifications for the same link.
        if (!anchor || !anchor.href || anchor.href === lastHref)
            return;
        lastHref = anchor.href;
        location.href = anchor.href;
    }
    let onChange = function(mutations){
        for (const mutation of mutations) {
            if (mutation.type === 'attributes') {
                // href of an existing anchor was updated in place.
                if (mutation.target.nodeName === 'A')
                    startDownload(mutation.target);
                continue;
            }
            if (mutation.type !== 'childList')
                continue;
            for (const addedNode of mutation.addedNodes) {
                if (addedNode.nodeName === 'A') {
                    startDownload(addedNode);
                } else if (addedNode.querySelector) {
                    // The anchor may arrive wrapped inside another element.
                    startDownload(addedNode.querySelector('a'));
                }
            }
        }
    }
    let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
    let obs = new MutationObserver(onChange);
    obs.observe(hiddenLink, config);
}
"""
# Gradio UI: a TTS tab, a speaker-cloning tab and a help tab.
# NOTE(review): the nesting of rows/columns below was reconstructed from a copy
# of this file that had lost its indentation - confirm the layout.
with gr.Blocks(js=js) as demo:
    defaultSpeaker = "Dionisio Schuyler"
    # Per-session state: names of cloned speakers and the files of the last TTS run.
    cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
    AllFileList = gr.State([])
    gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")

    with gr.Tab("TTS"):
        with gr.Column() as row4:
            with gr.Row() as col4:
                speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
                speaker_name_studio = gr.Dropdown(
                    label="Studio speaker",
                    # A real list rather than a dict_keys view.
                    choices=list(STUDIO_SPEAKERS.keys()),
                    value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS else None,
                )
                speaker_name_custom = gr.Dropdown(
                    label="Cloned speaker",
                    choices=cloned_speaker_names.value,
                    value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
                )
            with gr.Accordion("Advanced options", open=False):
                with gr.Row() as rowAdvanced:
                    temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
                    top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
                    top_k = gr.Number(label="TOP K",value=50)
                    speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
            with gr.Column() as col2:
                with gr.Row():
                    text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
                    pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
                with gr.Row():
                    # Only preselect "pt" when the server actually offers it.
                    lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt" if "pt" in LANUGAGES else None)
                    tts_button = gr.Button(value="TTS")
            with gr.Column() as col3:
                AudioList = gr.Dropdown(
                    label="Generated Audios",
                    choices=[],
                    interactive=True,
                )
                generated_audio = gr.Audio(label="Audio Play", autoplay=True)
                # Selecting a generated file loads it into the player.
                AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])
                # Created unrendered so it can be wired as an output first and
                # placed after the download button; the page script watches it.
                dummyHtml = gr.HTML(elem_id = "DonwloadLink", render = False)
                downloadAll = gr.DownloadButton("Download All Files")
                downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml])
                dummyHtml.render()

    with gr.Tab("Clone a new speaker"):
        with gr.Column() as col1:
            upload_file = gr.Audio(label="Upload reference audio", type="filepath")
            clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
            clone_button = gr.Button(value="Clone speaker")

    with gr.Tab("Help"):
        # The help text is kept at column 0 so its Markdown never picks up
        # leading indentation.
        gr.Markdown("""
Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
In this version, we have some customizations that are quite useful.
# Multiple audios
You can generate multiple audios at once by separating the text with three dashes. For example:
```
Text 1
---
Text 2, line 1
Text 2, line 2
```
In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
You can also specify variables that modify certain aspects.
For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
List of variables:
- `speaker` = name of the speaker
- `num` = file number (by default, it's the sequential number)
- `prefix` = file name prefix
# Pronunciation adjustment
If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
Simply separate them by each line. Example:
```
API = A,P,I
SomeFunctionCode = Function Code
```
This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
""")

    clone_button.click(
        fn=clone_speaker,
        inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
        outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
    )
    tts_button.click(
        fn=tts,
        inputs=[
            text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang,
            temperature, speed, top_p, top_k, AllFileList,
        ],
        outputs=[AudioList],
    )
# Optional startup check (enabled while DO_CHECK == "1"): send one short
# request with a randomly chosen studio speaker so that server problems show
# up before the UI is launched.
if __name__ == "__main__" and DO_CHECK == "1":
    print("Warming up server... Checking server healthy...")
    if not STUDIO_SPEAKERS:
        # random.choice raises IndexError on an empty sequence.
        print("No studio speakers available; skipping warm-up request.")
    else:
        speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()))
        print("Testing with", speakerName)
        ipts = xtts.TTSInputs(
            speaker_embedding=embs["speaker_embedding"],
            gpt_cond_latent=embs["gpt_cond_latent"],
            text="This is a warmup request.",
            language="en",
            temperature=0.5,
            speed=1.0,
            top_k=50,
            top_p=0.8,
        )
        # Only success or failure matters here; the audio itself is discarded.
        xtts.predict_speech(ipts)
        print(" TEST OK")
if __name__ == "__main__":
    print("STARTING...")
    demo.launch(
        share=False,
        debug=False,
        server_port=7860,
        server_name="0.0.0.0",
        # The download link uses the absolute zip path, so whitelist the
        # absolute form of the zip folder as well.
        allowed_paths=[abspath(ZIP_DIR)],
    )