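"""Real-time AI Speech Analyzer.

Streams microphone audio through Faster Whisper for transcription, periodically
scores the accumulated text for AI-generated content (a RoBERTa-based detector
combined with a simple linguistic heuristic), and serves the result through a
Gradio web interface.

Assumed dependencies (pip package names): gradio, faster-whisper, numpy,
transformers, torch. Versions are not pinned here.
"""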
import os
import time

import gradio as gr
import numpy as np
import torch
from faster_whisper import WhisperModel
from transformers import pipeline

class WebAITranscriber:
    def __init__(self):
        # Check if CUDA is available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if self.device == "cuda" else "int8"
        
        # Initialize Whisper Model with optimized settings
        print(f"Initializing Whisper Model on {self.device}...")
        self.model = WhisperModel(
            "base",                                   # Whisper model size
            device=self.device,
            compute_type=self.compute_type,
            cpu_threads=min(os.cpu_count() or 1, 4),  # cap the CPU thread count
            download_root=None,                       # use the default cache directory
            local_files_only=False                    # allow downloading the model if needed
        )

        # Optimize model settings
        self.model_settings = {
            'beam_size': 1,  # Reduced beam size for speed
            'best_of': 1,    # Take first result
            'temperature': 0, # Reduce randomness
            'compression_ratio_threshold': 2.4,
            'condition_on_previous_text': True,
            'no_speech_threshold': 0.6,
            'initial_prompt': None
        }
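        # These decoding options are forwarded unchanged to
        # WhisperModel.transcribe() via **self.model_settings.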

        # AI Detection Markers (optimized for speed)
        self.ai_markers = {
            'formal_phrases': [
                'moreover', 'furthermore', 'consequently', 
                'in conclusion', 'it is worth noting'
            ],
            'ai_disclaimers': [
                'as an ai', 'i want to be clear', 
                'it is important to note'
            ]
        }

        # Initialize the AI-text detector; an integer device index
        # (0 = first GPU, -1 = CPU) works across transformers versions
        print("Initializing AI Detection...")
        self.ai_detector = pipeline(
            'text-classification',
            model='roberta-base-openai-detector',
            device=0 if torch.cuda.is_available() else -1
        )
        
        # Buffering / analysis cadence settings
        self.min_analysis_words = 10   # minimum words before an analysis pass
        self.max_buffer_size = 1000    # maximum transcript length kept on screen (words)
        self.analysis_interval = 3     # minimum seconds between analyses
        self.last_analysis_time = time.time()
        self.transcript_buffer = []    # unused here; per-session text lives in the Gradio state

    def process_realtime_audio(self, audio, state):
        """Process one streamed audio chunk and return the updated transcript and state."""
        if audio is None:
            return "", state

        try:
            # Initialize per-session state if needed
            if state is None:
                state = {"full_transcript": "", "buffer": []}

            # Gradio streams (sample_rate, samples); faster-whisper expects a
            # mono float32 waveform sampled at 16 kHz
            sample_rate, samples = audio
            if samples.ndim > 1:                      # downmix stereo to mono
                samples = samples.mean(axis=1)
            samples = samples.astype(np.float32)
            if samples.size == 0:
                return state["full_transcript"], state
            if np.abs(samples).max() > 1.0:           # int16 PCM from the browser
                samples = samples / 32768.0
            if sample_rate != 16000:                  # simple linear resampling to 16 kHz
                target_len = int(len(samples) * 16000 / sample_rate)
                samples = np.interp(
                    np.linspace(0, len(samples), num=target_len, endpoint=False),
                    np.arange(len(samples)),
                    samples,
                )

            # Transcribe the chunk with the low-latency decoding settings
            segments, _ = self.model.transcribe(
                samples,
                language="en",       # fixing the language avoids language detection
                vad_filter=True,     # skip silence via voice activity detection
                **self.model_settings
            )

            current_transcript = " ".join(segment.text.strip() for segment in segments)

            if not current_transcript.strip():
                return state["full_transcript"], state

            # Update state
            state["full_transcript"] = (state["full_transcript"] + " " + current_transcript).strip()
            state["buffer"].append(current_transcript)

            # Run the AI analysis once enough new text has accumulated and the
            # minimum interval between analyses has elapsed
            current_time = time.time()
            buffer_text = " ".join(state["buffer"])
            word_count = len(buffer_text.split())

            if (word_count >= self.min_analysis_words and
                    (current_time - self.last_analysis_time) >= self.analysis_interval):
                classification, probability, confidence = self.analyze_ai_content(buffer_text)
                analysis_result = (
                    f"\n\n---AI Analysis---\n"
                    f"Classification: {classification}\n"
                    f"Probability: {probability:.2f}\n"
                    f"Confidence: {confidence}\n---\n"
                )
                state["full_transcript"] += analysis_result
                state["buffer"] = []
                self.last_analysis_time = current_time

            # Keep the displayed transcript from growing without bound
            transcript_words = state["full_transcript"].split()
            if len(transcript_words) > self.max_buffer_size:
                state["full_transcript"] = " ".join(transcript_words[-self.max_buffer_size:])

            return state["full_transcript"], state

        except Exception as e:
            return f"Error processing audio: {str(e)}", state

    def analyze_ai_content(self, text):
        """Optimized AI content analysis"""
        if not text or len(text.split()) < self.min_analysis_words:
            return "Insufficient text", 0.0, "None"

        try:
            # Transformer-based detection: the pipeline returns a label and a
            # score; convert that into the probability that the text is
            # machine-generated (the roberta-base-openai-detector model labels
            # AI-generated text "Fake" and human text "Real").
            roberta_result = self.ai_detector(text[:512])[0]  # limit text length for speed
            if roberta_result['label'].lower() == 'fake':
                ai_prob = roberta_result['score']
            else:
                ai_prob = 1.0 - roberta_result['score']

            # Quick linguistic analysis
            linguistic_score = self.analyze_linguistic_patterns(text)

            # Average the two signals into a final score
            final_score = (ai_prob + linguistic_score) / 2

            # Fast classification
            if final_score > 0.7:
                return "AI Generated", final_score, "High"
            elif final_score > 0.5:
                return "Likely AI", final_score, "Medium"
            elif final_score > 0.3:
                return "Possibly AI", final_score, "Low"
            return "Likely Human", final_score, "High"

        except Exception as e:
            print(f"Analysis error: {e}")
            return "Analysis Error", 0.0, "None"

    def analyze_linguistic_patterns(self, text):
        """Lightweight heuristic based on AI-marker phrases and lexical diversity."""
        text_lower = text.lower()
        # Count how many marker phrases appear at least once in the text
        ai_phrase_count = sum(1 for category in self.ai_markers.values()
                              for phrase in category if phrase in text_lower)

        words = text.split()
        unique_words = len(set(words))
        total_words = max(len(words), 1)  # guard against empty input

        # Each marker phrase adds 0.3; low lexical diversity (many repeated
        # words) adds up to 0.4; the combined score is capped at 1.0.
        return min((ai_phrase_count * 0.3) + (1 - (unique_words / total_words)) * 0.4, 1.0)

def create_gradio_interface():
    transcriber = WebAITranscriber()

    # Create the interface
    with gr.Blocks(title="Real-time AI Speech Analyzer") as interface:
        gr.Markdown("""
        # Real-time AI Speech Analyzer
        This app uses Faster Whisper for real-time speech recognition and AI detection.
        """)
        
        with gr.Tab("Real-time Analysis"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        sources=["microphone"],  # Updated from 'source' to 'sources'
                        streaming=True,
                        type="numpy",
                        label="🎤 Speak into your microphone"
                    )
                    gr.Markdown("""
                    ### Tips for best performance:
                    - Speak clearly and at a moderate pace
                    - Minimize background noise
                    - Wait a few seconds for initial processing
                    """)
                with gr.Column():
                    realtime_output = gr.Textbox(
                        label="Real-time Transcript and Analysis",
                        lines=15,
                        max_lines=30
                    )

            # Per-session state holding the running transcript and text buffer
            session_state = gr.State()

            audio_input.stream(
                transcriber.process_realtime_audio,
                inputs=[audio_input, session_state],
                outputs=[realtime_output, session_state],
                show_progress=False
            )

        gr.Markdown("""
        ### Technical Details:
        - Using Faster Whisper for optimized speech recognition
        - Real-time AI content analysis
        - Automatic voice activity detection
        - Optimized for low-latency processing
        """)

    return interface

# Launch the app
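# Note: launch() accepts Gradio's standard options, e.g. share=True for a
# temporary public link or server_port=7860 to pin the port.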
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()
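
# --- Optional offline sanity check (a minimal sketch, not part of the app) ---
# This is an illustrative assumption about how the same pipeline could be
# exercised on a pre-recorded file instead of the live microphone;
# "sample.wav" is a hypothetical path to a short English recording.
#
#   transcriber = WebAITranscriber()
#   segments, _ = transcriber.model.transcribe("sample.wav", language="en",
#                                              **transcriber.model_settings)
#   text = " ".join(segment.text for segment in segments)
#   print(transcriber.analyze_ai_content(text))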