import torch
import os
import streamlit as st
from TTS.api import TTS
from tempfile import NamedTemporaryFile
# By using XTTS you agree to the CPML license: https://coqui.ai/cpml
# The flag is set at module level, i.e. before generate_audio() builds a model.
# NOTE(review): presumably the TTS package reads COQUI_TOS_AGREED to skip its
# interactive licence prompt - confirm against the installed TTS version.
os.environ["COQUI_TOS_AGREED"] = "1"
def _discard_file(path):
    """Delete *path* if it is set; a file that cannot be removed is ignored."""
    if path is None:
        return
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: never mask the real outcome of the synthesis.
        pass


def generate_audio(audio_file, text_input, language='en'):
    """Synthesize *text_input* in the voice of a reference recording.

    Args:
        audio_file: Reference voice sample. Either a path to a WAV file or a
            binary file-like object (anything exposing ``read()``), such as
            an uploaded file.
        text_input: The text to speak. Must contain non-whitespace characters.
        language: Language code forwarded to ``tts_to_file``. Defaults to
            ``'en'``, the value that was previously hard-coded.

    Returns:
        Path of a newly created temporary ``.wav`` file holding the result.
        The file is created with ``delete=False``, so the caller is
        responsible for removing it once it is no longer needed.

    Raises:
        ValueError: If *audio_file* is ``None`` or *text_input* is blank.
    """
    # Fail fast on unusable input instead of loading the model first.
    if audio_file is None:
        raise ValueError('A reference audio file is required.')
    if not text_input or not text_input.strip():
        raise ValueError('text_input must not be empty.')

    # Initialize model
    # NOTE(review): the model is rebuilt on every call, which is slow. Caching
    # it (e.g. per process) would share one instance between concurrent
    # callers - confirm the model is safe to share before doing so.
    model = "tts_models/multilingual/multi-dataset/xtts_v2"
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tts = TTS(model).to(device)

    staged_path = None   # temp copy of a file-like reference, if one was made
    output_path = None
    succeeded = False
    try:
        speaker_path = audio_file
        if hasattr(audio_file, 'read'):
            # File-like reference: write it to disk so the model is always
            # handed a real path. Rewind first in case it was already read.
            if hasattr(audio_file, 'seek'):
                audio_file.seek(0)
            with NamedTemporaryFile(delete=False, suffix='.wav') as staged:
                staged.write(audio_file.read())
                staged_path = staged.name
            speaker_path = staged_path

        # Only the unique name is needed; the handle is closed before the
        # model writes to that path.
        with NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            output_path = tmp_file.name

        tts.tts_to_file(
            text=text_input,
            speaker_wav=speaker_path,
            language=language,
            file_path=output_path,
        )
        succeeded = True
    finally:
        _discard_file(staged_path)
        if not succeeded:
            # Do not leave an empty or partial output file behind on failure.
            _discard_file(output_path)
    return output_path
def main():
    """Render the Voice Clone page and run a synthesis when requested.

    The page shows a title and subtitle, lets the user upload a reference
    WAV file or pick one of the bundled samples, previews the chosen
    reference, and, once 'Synthesize' is pressed with non-empty text, plays
    the audio produced by ``generate_audio``.
    """
    import logging  # local import: only used on the failure path below

    # Title
    st.markdown("\nVoice Clone\n", unsafe_allow_html=True)
    # Subtitle
    st.markdown("Make your favorite characters say anything!\n", unsafe_allow_html=True)

    sample_files = {
        'Stewie Griffin': 'sample_inputs/stewie.wav',
        'Donald Trump': 'sample_inputs/trump.wav',
        'Joe Rogan': 'sample_inputs/rogan.wav',
    }

    # Upload audio file
    uploaded_file = st.file_uploader('Add an audio file of the voice you want to clone...', type=['wav'])
    selected_sample = st.selectbox('Or choose a sample:', list(sample_files))

    # An upload takes precedence; otherwise fall back to the selected sample.
    speaker_file = sample_files[selected_sample] if uploaded_file is None else uploaded_file

    # Preview the reference that will actually be cloned. This used to play
    # `uploaded_file`, which is None whenever a sample is chosen.
    st.header('Reference Audio')
    st.audio(speaker_file, format='audio/wav')

    # Input text
    text_input = st.text_input('What do you want your character to say? (Alphabet letters only, DO NOT INCLUDE PUNCTUATION)')

    if not st.button('Synthesize'):
        return
    if not text_input:
        st.error('Please provide a text input!')
        return

    try:
        with st.spinner('Synthesizing...'):
            output_path = generate_audio(speaker_file, text_input)
        # generate_audio returns a temp file that is not deleted automatically:
        # load it into memory and remove it so files do not pile up.
        try:
            with open(output_path, 'rb') as synthesized:
                audio_bytes = synthesized.read()
        finally:
            os.remove(output_path)
    except Exception:
        # `except Exception` rather than a bare `except:` so KeyboardInterrupt,
        # SystemExit and other BaseException signals still propagate.
        logging.getLogger(__name__).exception('Voice synthesis failed')
        st.error('There was an issue synthesizing the text. Please check the input and try again. Remember, do not include punctuation.')
        return

    st.header('Synthesized Audio')
    st.audio(audio_bytes, format='audio/wav')
# Script entry point: build the page only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()