"""Streamlit front end that clones a voice with the Coqui XTTS v2 model.

The user uploads a reference WAV (or picks a bundled sample), types a line of
text, and the app plays back that text synthesized from the reference audio.
"""

import logging
import os
from contextlib import suppress
from tempfile import NamedTemporaryFile

import streamlit as st
import torch
from TTS.api import TTS

# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"

logger = logging.getLogger(__name__)

MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Rendered through st.markdown(..., unsafe_allow_html=True).
TITLE_MARKDOWN = """

Voice Clone

"""

SUBTITLE_MARKDOWN = """

Make your favorite characters say anything!

"""

# Display name -> path of the bundled reference recording.
SAMPLE_FILES = {
    'Stewie Griffin': 'sample_inputs/stewie.wav',
    'Donald Trump': 'sample_inputs/trump.wav',
    'Joe Rogan': 'sample_inputs/rogan.wav',
}


def _remove_quietly(path):
    """Delete *path*, ignoring the case where it is already gone or locked."""
    with suppress(OSError):
        os.remove(path)


def _speaker_to_path(speaker):
    """Return ``(path, is_temporary)`` for a reference-audio source.

    A path is returned unchanged. Anything else is treated as an in-memory
    upload exposing ``getvalue()`` and is written to a temporary ``.wav``
    file, which the caller is responsible for deleting.
    """
    if isinstance(speaker, (str, os.PathLike)):
        return speaker, False

    with NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file.write(speaker.getvalue())
        return tmp_file.name, True


def generate_audio(audio_file, text_input):
    """Synthesize *text_input* in the voice of *audio_file*.

    Args:
        audio_file: Reference recording of the voice to clone, forwarded to
            ``TTS.tts_to_file`` as ``speaker_wav``.
        text_input: The English text to speak.

    Returns:
        Path of a newly created temporary ``.wav`` file holding the result.
        The file is not deleted automatically; the caller owns it.

    Raises:
        Exception: Whatever the TTS library raises. The temporary output
            file is removed before the error propagates.
    """
    # NOTE(review): the model is constructed on every call; if the deployed
    # Streamlit version provides st.cache_resource, caching it there would
    # avoid the repeated load.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tts = TTS(MODEL_NAME).to(device)

    # Reserve a unique file name, then close the handle before the TTS
    # library writes to that path.
    with NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        output_path = tmp_file.name

    try:
        tts.tts_to_file(
            text=text_input,
            speaker_wav=audio_file,
            language='en',
            file_path=output_path,
        )
    except Exception:
        _remove_quietly(output_path)
        raise
    return output_path


def main():
    """Render the page and run one synthesis when the button is pressed."""
    st.markdown(TITLE_MARKDOWN, unsafe_allow_html=True)
    st.markdown(SUBTITLE_MARKDOWN, unsafe_allow_html=True)

    # An upload takes precedence over the selected sample.
    uploaded_file = st.file_uploader(
        'Add an audio file of the voice you want to clone...', type=['wav']
    )
    selected_sample = st.selectbox('Or choose a sample:', list(SAMPLE_FILES))
    speaker_file = SAMPLE_FILES[selected_sample] if uploaded_file is None else uploaded_file

    if speaker_file:
        st.header('Reference Audio')
        # Preview whichever source is in effect, not just the upload, so the
        # player also works when a bundled sample is chosen.
        st.audio(speaker_file, format='audio/wav')

    text_input = st.text_input(
        'What do you want your character to say? '
        '(Alphabet letters only, DO NOT INCLUDE PUNCTUATION)'
    )

    if not st.button('Synthesize'):
        return

    # Whitespace-only input counts as missing.
    if not text_input or not text_input.strip():
        st.error('Please provide a text input!')
        return

    speaker_path = None
    speaker_is_temp = False
    output_path = None
    try:
        with st.spinner('Synthesizing...'):
            speaker_path, speaker_is_temp = _speaker_to_path(speaker_file)
            output_path = generate_audio(speaker_path, text_input)
            # Read the result into memory so the temp file can be removed.
            with open(output_path, 'rb') as result_file:
                audio_bytes = result_file.read()
        st.header('Synthesized Audio')
        st.audio(audio_bytes, format='audio/wav')
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
        logger.exception('Speech synthesis failed')
        st.error('There was an issue synthesizing the text. Please check the input and try again. Remember, do not include punctuation.')
    finally:
        if output_path is not None:
            _remove_quietly(output_path)
        if speaker_is_temp and speaker_path is not None:
            _remove_quietly(speaker_path)


if __name__ == '__main__':
    main()