# Source: Hugging Face Space "Kr08/ASR" (Space status when captured: Sleeping).
# File size: 5,786 bytes.

import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import logging
import traceback
import sys
from audio_processing import AudioProcessor
import spaces 


# Root logging setup: INFO and above go to stdout, each record stamped with
# time, logger name and level.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

def load_qa_model():
    """Build the text-generation pipeline used for question answering.

    Returns:
        A ``transformers`` pipeline for
        ``meta-llama/Meta-Llama-3-8B-Instruct`` (bfloat16 weights, automatic
        device placement), or ``None`` when construction raised; the failure
        is logged in that case.
    """
    try:
        settings = {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "model_kwargs": {"torch_dtype": torch.bfloat16},
            "device_map": "auto",
        }
        generator = pipeline("text-generation", **settings)
    except Exception as e:
        # Best-effort loader: report the problem and signal it with None
        # rather than propagating the exception.
        logger.error(f"Failed to load Q&A model: {str(e)}")
        return None
    return generator

def load_summarization_model():
    """Build the summarization pipeline.

    Returns:
        A ``transformers`` summarization pipeline for
        ``sshleifer/distilbart-cnn-12-6``, placed on device 0 when CUDA is
        available and on device -1 otherwise, or ``None`` when construction
        raised; the failure is logged in that case.
    """
    try:
        if torch.cuda.is_available():
            device_index = 0
        else:
            device_index = -1
        model_pipeline = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=device_index,
        )
    except Exception as e:
        # Best-effort loader: report the problem and signal it with None
        # rather than propagating the exception.
        logger.error(f"Failed to load summarization model: {str(e)}")
        return None
    return model_pipeline


def _format_segments(language_segments, final_segments, translate):
    """Render processor output as a display report plus plain running text.

    Args:
        language_segments: Iterable of mappings with ``language``, ``start``
            and ``end`` keys.
        final_segments: Iterable of mappings with ``start``, ``end``,
            ``language`` and ``text`` keys, and optionally ``translated``.
        translate: When true, a segment's ``translated`` value (if present)
            is shown and used for the running text instead of ``text``.

    Returns:
        ``(transcription, full_text)``: the multi-line report and the
        space-separated running text (each piece is followed by one space).
    """
    report = []

    # Language detection summary.
    for segment in language_segments:
        report.append(f"Language: {segment['language']}\n")
        report.append(f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n")

    # Per-segment transcription / translation.
    report.append("Transcription with language detection:\n\n")
    spoken = []
    for segment in final_segments:
        report.append(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}):\n")
        report.append(f"Original: {segment['text']}\n")
        if translate and 'translated' in segment:
            report.append(f"Translated: {segment['translated']}\n")
            spoken.append(segment['translated'])
        else:
            spoken.append(segment['text'])
        report.append("\n")

    # Every piece keeps its trailing space so the text matches what
    # downstream consumers have always received.
    full_text = "".join(piece + " " for piece in spoken)
    return "".join(report), full_text


@spaces.GPU(duration=60)
def process_audio(audio_file, translate=False):
    """Transcribe (and optionally translate) an audio file for the UI.

    Args:
        audio_file: Path of the audio file to process.
        translate: Whether translation is requested; forwarded to the
            processor and used when formatting the output.

    Returns:
        ``(transcription, full_text)``: a formatted report with language and
        timing information, and the plain text of all segments.

    Raises:
        gr.Error: If no audio file was supplied or processing failed.
    """
    # Fail fast with a clear message instead of letting a missing upload
    # surface as an opaque error from inside the processor.
    if not audio_file:
        raise gr.Error("Please provide an audio file before processing.")

    try:
        processor = AudioProcessor()
        language_segments, final_segments = processor.process_audio(audio_file, translate)
        return _format_segments(language_segments, final_segments, translate)
    except Exception as e:
        # Log with traceback and keep the original exception chained so the
        # root cause is not lost behind the UI-facing error.
        logger.exception("Audio processing failed: %s", e)
        raise gr.Error(f"Processing failed: {str(e)}") from e


@spaces.GPU(duration=60)
def summarize_text(text):
    """Summarize text.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary, or a human-readable message when there is
        nothing to summarize, the model could not be loaded, or
        summarization failed.
    """
    # Nothing to do for empty input; skip the (expensive) model load.
    if not text or not text.strip():
        return "No text available to summarize."

    try:
        summarizer = load_summarization_model()
        if summarizer is None:
            return "Summarization model could not be loaded."

        # truncation=True clips inputs longer than the model's maximum input
        # length instead of failing on long transcripts.
        result = summarizer(
            text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        return result[0]['summary_text']
    except Exception as e:
        # Keep the UI responsive with a generic message; the traceback goes
        # to the log.
        logger.exception("Summarization failed: %s", e)
        return "Error occurred during summarization."


@spaces.GPU(duration=60)
def answer_question(context, question):
    """Answer questions about the text.

    Args:
        context: The text the answer should be based on.
        question: The user's question.

    Returns:
        The model's answer as a string, or a human-readable message when an
        input is missing, the model could not be loaded, or generation
        failed.
    """
    # Validate inputs before paying for a model load.
    if not question or not question.strip():
        return "Please enter a question."
    if not context or not context.strip():
        return "No transcription available to answer questions about."

    try:
        qa_pipeline = load_qa_model()
        if qa_pipeline is None:
            return "Q&A model could not be loaded."

        messages = [
            {"role": "system", "content": "You are a helpful assistant who can answer questions based on the given context."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
        ]

        generated = qa_pipeline(messages, max_new_tokens=256)[0]['generated_text']

        # For chat-style (list of messages) input the pipeline returns the
        # whole conversation as a list of message dicts; the answer is the
        # content of the last (assistant) message, not the list itself.
        if isinstance(generated, list) and generated:
            last_message = generated[-1]
            if isinstance(last_message, dict):
                return str(last_message.get('content', ''))
            return str(last_message)
        return str(generated)
    except Exception as e:
        logger.exception("Q&A failed: %s", e)
        return f"Error occurred during Q&A process: {str(e)}"


# Build the Gradio interface. Components are created inside nested
# Blocks/Row/Column context managers, so the order and nesting of the
# statements below is what defines the page; do not reorder them.
with gr.Blocks() as iface:
    gr.Markdown("# Automatic Speech Recognition for Indic Languages")
    
    # First row: audio input and controls in one column, results in the other.
    with gr.Row():
        with gr.Column():
            # type="filepath": process_audio receives this component's value
            # as its audio_file argument.
            audio_input = gr.Audio(type="filepath")
            translate_checkbox = gr.Checkbox(label="Enable Translation")
            process_button = gr.Button("Process Audio")
        
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription/Translation", lines=10)
            # Also used as the input for the summarize and Q&A handlers below.
            full_text_output = gr.Textbox(label="Full Text", lines=5)
    
    # Second row: summarization in one column, question answering in the other.
    with gr.Row():
        with gr.Column():
            summarize_button = gr.Button("Summarize")
            summary_output = gr.Textbox(label="Summary", lines=3)
            
        with gr.Column():
            question_input = gr.Textbox(label="Ask a question about the transcription")
            answer_button = gr.Button("Get Answer")
            answer_output = gr.Textbox(label="Answer", lines=3)
    
    # Event handlers: each button calls one function, passing the listed
    # input components' values and writing the result(s) to the outputs.
    process_button.click(
        process_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[transcription_output, full_text_output]
    )
    
    summarize_button.click(
        summarize_text,
        inputs=[full_text_output],
        outputs=[summary_output]
    )
    
    answer_button.click(
        answer_question,
        inputs=[full_text_output, question_input],
        outputs=[answer_output]
    )
    
    # Static system/feature information. The f-string is evaluated once, when
    # this module builds the UI, so the CUDA status shown reflects that moment.
    gr.Markdown(f"""
    ## System Information
    - Device: {"CUDA" if torch.cuda.is_available() else "CPU"}
    - CUDA Available: {"Yes" if torch.cuda.is_available() else "No"}
    
    ## Features
    - Automatic language detection
    - High-quality transcription using MMS
    - Optional translation to English
    - Text summarization
    - Question answering
    """)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    # server_port=None: no explicit port is requested here; presumably Gradio
    # then picks its default port. TODO confirm against the Gradio launch() docs.
    iface.launch(server_port=None)