import gradio as gr

from models.tts import TTSModel
from utils.audio_utils import save_audio, get_cached_audio, get_audio_filename
from utils.input_validation import validate_input
from config.language_mapping import (
    LANGUAGE_VOICE_MAPPING,
    construct_description,
    EMOTION_DESC,
    SPEED_DESC,
    PITCH_DESC,
    BACKGROUND_NOISE_DESC,
    REVERBERATION_DESC,
    QUALITY_DESC,
    get_speakers_for_language,
)


def generate_speech(
    text,
    language,
    speaker,
    emotion="Neutral",
    speed="Normal",
    pitch="Medium",
    background_noise="Minimal",
    reverberation="Close",
    quality="High",
):
    try:
        # Validate inputs
        validate_input(text, language)

        # Return the cached file if this exact request was generated before
        cached_audio = get_cached_audio(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality,
        )
        if cached_audio:
            return cached_audio

        # Build the natural-language style description for the model
        description = construct_description(
            speaker, language, emotion, speed, pitch,
            background_noise, reverberation, quality,
        )

        # Generate the audio (the model is instantiated per call; caching the
        # instance at module level would avoid reloading if startup is slow)
        tts_model = TTSModel()
        audio_array = tts_model.generate_audio(text, description)

        # Save the generated audio and return its path
        filename = get_audio_filename(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality,
        )
        filepath = save_audio(audio_array, filename)
        return filepath
    except Exception as e:
        raise gr.Error(str(e))


# Create the Gradio interface
with gr.Blocks(title="Indic Text-to-Speech") as demo:
    gr.Markdown("# Indian Local Text-to-Speech Synthesizer")
    gr.Markdown(
        "Generate natural speech in multiple Indian languages using AI4Bharat's model"
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
            )
            with gr.Row():
                language_input = gr.Dropdown(
                    choices=sorted(LANGUAGE_VOICE_MAPPING.keys()),
                    label="Language",
                    value="English",
                )
                speaker_input = gr.Dropdown(
                    choices=LANGUAGE_VOICE_MAPPING["English"],  # Default choices
                    label="Speaker",
                    value=LANGUAGE_VOICE_MAPPING["English"][0],  # Default value
                )
            with gr.Row():
                emotion_input = gr.Dropdown(
                    choices=list(EMOTION_DESC.keys()),
                    label="Expressivity",
                    value="Neutral",
                )
                speed_input = gr.Dropdown(
                    choices=list(SPEED_DESC.keys()),
                    label="Speaking Speed",
                    value="Normal",
                )
            with gr.Row():
                pitch_input = gr.Dropdown(
                    choices=list(PITCH_DESC.keys()),
                    label="Pitch",
                    value="Medium",
                )
                background_input = gr.Dropdown(
                    choices=list(BACKGROUND_NOISE_DESC.keys()),
                    label="Background Noise",
                    value="Minimal",
                )
            with gr.Row():
                reverb_input = gr.Dropdown(
                    choices=list(REVERBERATION_DESC.keys()),
                    label="Reverberation",
                    value="Close",
                )
                quality_input = gr.Dropdown(
                    choices=list(QUALITY_DESC.keys()),
                    label="Audio Quality",
                    value="High",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                # generate_speech returns a file path, not a numpy array,
                # so the component must use type="filepath"
                type="filepath",
            )

    # Update speaker choices when the language changes
    def update_speakers(language):
        speakers = get_speakers_for_language(language)
        return gr.Dropdown(choices=speakers, value=speakers[0])

    language_input.change(
        fn=update_speakers,
        inputs=[language_input],
        outputs=[speaker_input],
    )

    # Wire the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input, language_input, speaker_input,
            emotion_input, speed_input, pitch_input,
            background_input, reverb_input, quality_input,
        ],
        outputs=audio_output,
    )

    examples = [
        ["Hello, how are you?", "English", "Thoma",
         "Happy", "Normal", "Medium", "Minimal", "Close", "High"],
        ["नमस्ते, आप कैसे हैं?", "Hindi",
"Rohit", "Neutral", "Normal", "Medium", "None", "Very Close", "Studio"], ["ನಮಸ್ಕಾರ, ಹೇಗಿದ್ದೀರಾ?", "Kannada", "Suresh", "Highly Expressive", "Fast", "High", "Minimal", "Moderate", "High"], ["How are you doing today?", "English", "Mary", "Monotone", "Slow", "Low", "Moderate", "Distant", "Good"], ] # Generate and cache example outputs at startup for example in examples: output = generate_speech(*example) example_outputs.append(output) # Add examples with cached outputs gr.Examples( examples=examples, inputs=[ text_input, language_input, speaker_input, emotion_input, speed_input, pitch_input, background_input, reverb_input, quality_input ], outputs=audio_output, fn=generate_speech, cache_examples=True, preprocess=False, # Don't preprocess inputs postprocess=False # Don't postprocess outputs ) if __name__ == "__main__": demo.launch()