|
import os
import tempfile

import gradio as gr
from gtts import gTTS
from transformers import pipeline
|
|
|
|
|
# Speech-to-text model: transcribes a spoken (English) question to text.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="jonatasgrosman/wav2vec2-large-xlsr-53-english",
)

# Extractive question-answering model: selects the answer span from the
# provided context passage.
qa_model = pipeline(
    "question-answering",
    model="AVISHKAARAM/avishkaarak-ekta-hindi",
)
|
|
|
|
|
def answer_question(context, question=None, audio=None):
    """Answer a question about *context*, asked either as text or as speech.

    Parameters
    ----------
    context : str
        The passage from which the answer is extracted.
    question : str or None
        The question as text; ignored when *audio* is provided.
    audio : str or None
        Filesystem path to an audio recording of the spoken question
        (as delivered by ``gr.Audio(type="filepath")``).

    Returns
    -------
    tuple
        ``(answer_text, mp3_path)`` on success, or ``(error_message, None)``
        when transcription, QA, or speech synthesis fails.
    """
    try:
        # An audio question takes precedence: transcribe it to text first.
        if audio:
            question_text = transcriber(audio)["text"]
        else:
            question_text = question

        # Fail early with a clear message instead of letting the QA
        # pipeline raise an opaque error on a missing/empty question.
        if not question_text:
            return "Please provide a question (text or audio).", None

        answer = qa_model(question=question_text, context=context)["answer"]

        # Speak the answer. Close the temp-file handle before gTTS writes
        # to the path: the original kept the handle open (leak), which also
        # breaks on Windows where an open file cannot be reopened.
        tts = gTTS(text=answer, lang="en")
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            audio_path = tmp.name
        tts.save(audio_path)

        return answer, audio_path

    except Exception as e:
        # Best-effort UI behavior: surface the error message as the
        # "answer" rather than crashing the Gradio app.
        return str(e), None
|
|
|
|
|
|
|
# --- UI components -------------------------------------------------------
# Inputs: a context passage plus the question in either modality.
context_input = gr.Textbox(label="Context", lines=3)
question_input = gr.Textbox(label="Question")
audio_input = gr.Audio(type="filepath", label="Question (Audio Input)")

# Outputs: the extracted answer as text and as synthesized speech.
output_text = gr.Textbox(label="Answer")
output_audio = gr.Audio(label="Answer (Audio Output)")

interface = gr.Interface(
    fn=answer_question,
    inputs=[context_input, question_input, audio_input],
    outputs=[output_text, output_audio],
    title="Multimodal Question Answering",
    description=(
        "Provide a context and either a text question or an audio question "
        "to get an answer."
    ),
    examples=[
        ["The capital of France is Paris.", "What is the capital of France?", None],
        ["OpenAI is famous for developing GPT-3.", "What is OpenAI known for?", None],
    ],
)
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when run as a script, not on import.
    interface.launch()
|
|