"""Real-time speech transcription with a heuristic AI-generated-text check.

Microphone audio is streamed from a Gradio UI, transcribed with Faster
Whisper, and periodically scored by a RoBERTa-based detector combined with a
small phrase/repetition heuristic.
"""

import os
import re
import statistics
import time

import gradio as gr
import numpy as np
import torch
from faster_whisper import WhisperModel
from textblob import TextBlob
from transformers import pipeline


class WebAITranscriber:
    """Transcribe streamed audio and annotate the transcript with AI scores."""

    # Sample rate (Hz) the audio is resampled to before transcription.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self):
        """Load the Whisper model and the AI-text classifier."""
        # Prefer the GPU (with half precision) when CUDA is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if torch.cuda.is_available() else "int8"

        print(f"Initializing Whisper Model on {self.device}...")
        self.model = WhisperModel(
            "base",
            device=self.device,
            compute_type=self.compute_type,
            # os.cpu_count() may return None, which min() cannot compare.
            cpu_threads=min(os.cpu_count() or 1, 4),
            download_root=None,  # Use the default cache directory
            local_files_only=False,  # Allow downloading if needed
        )

        # Decoding options forwarded verbatim to WhisperModel.transcribe().
        self.model_settings = {
            'beam_size': 1,  # Reduced beam size for speed
            'best_of': 1,  # Take first result
            'temperature': 0,  # Reduce randomness
            'compression_ratio_threshold': 2.4,
            'condition_on_previous_text': True,
            'no_speech_threshold': 0.6,
            'initial_prompt': None,
        }

        # Phrases whose presence raises the linguistic AI score.
        self.ai_markers = {
            'formal_phrases': [
                'moreover',
                'furthermore',
                'consequently',
                'in conclusion',
                'it is worth noting',
            ],
            'ai_disclaimers': [
                'as an ai',
                'i want to be clear',
                'it is important to note',
            ],
        }

        print("Initializing AI Detection...")
        self.ai_detector = pipeline(
            'text-classification',
            model='roberta-base-openai-detector',
            device=self.device,
        )

        self.min_analysis_words = 10  # Minimum words before analysis
        self.max_buffer_size = 1000  # Maximum transcript size in words
        self.analysis_interval = 3  # Minimum seconds between analyses
        self.last_analysis_time = time.time()
        self.transcript_buffer = []

    @staticmethod
    def _prepare_audio(sample_rate, samples, target_rate=TARGET_SAMPLE_RATE):
        """Return *samples* as a mono float32 waveform at *target_rate* Hz.

        Integer PCM is scaled into [-1, 1], multi-channel input is averaged,
        and the signal is resampled by linear interpolation. The
        interpolation applies no anti-aliasing filter; it is a
        dependency-free approximation, not a high-quality resampler.
        """
        data = np.asarray(samples)
        if data.size == 0:
            return np.zeros(0, dtype=np.float32)

        if np.issubdtype(data.dtype, np.integer):
            limits = np.iinfo(data.dtype)
            data = data.astype(np.float32) / max(abs(limits.min), limits.max)
        else:
            data = data.astype(np.float32)

        if data.ndim > 1:
            # Assumes the layout is (samples, channels) -- TODO confirm
            # against the Gradio version in use.
            data = data.mean(axis=1)

        if sample_rate and sample_rate != target_rate:
            out_len = int(round(data.shape[0] * target_rate / sample_rate))
            if out_len == 0:
                return np.zeros(0, dtype=np.float32)
            source_times = np.arange(data.shape[0]) / sample_rate
            target_times = np.arange(out_len) / target_rate
            data = np.interp(target_times, source_times, data)

        return np.ascontiguousarray(data, dtype=np.float32)

    @staticmethod
    def _keep_last_words(text, limit):
        """Return the tail of *text* holding at most *limit* words.

        The cut is made at a word boundary and the retained tail is returned
        unchanged, so newlines inside it are preserved.
        """
        if limit <= 0:
            return ""
        words = list(re.finditer(r"\S+", text))
        if len(words) <= limit:
            return text
        return text[words[-limit].start():]

    @staticmethod
    def _ai_probability(result):
        """Convert one classifier result into a probability of AI text.

        The pipeline reports the score of the *predicted* label, so a
        confident human verdict has to be inverted. Labels other than
        "real"/"human" are treated as the AI class.
        """
        score = float(result['score'])
        label = str(result.get('label', '')).strip().lower()
        if label in ('real', 'human'):
            return 1.0 - score
        return score

    def process_realtime_audio(self, audio, state=None):
        """Transcribe one streamed audio chunk and update the session state.

        Args:
            audio: ``(sample_rate, samples)`` tuple from the Gradio audio
                component, or ``None`` when no audio is available.
            state: Per-session dict with ``full_transcript`` and ``buffer``
                keys, or ``None`` on the first call.

        Returns:
            ``(text, state)`` where *text* is the transcript to display, or
            an error message if processing failed.
        """
        if state is None:
            state = {"full_transcript": "", "buffer": [], "pending_analysis": False}

        if audio is None:
            # Keep showing what was already transcribed.
            return state["full_transcript"], state

        try:
            sample_rate, samples = audio
            waveform = self._prepare_audio(sample_rate, samples)
            if waveform.size == 0:
                return state["full_transcript"], state

            segments, _ = self.model.transcribe(
                waveform,
                language="en",  # Specify language for faster processing
                vad_filter=True,  # Use Voice Activity Detection
                **self.model_settings,
            )

            pieces = (segment.text.strip() for segment in segments)
            current_transcript = " ".join(piece for piece in pieces if piece)
            if not current_transcript:
                return state["full_transcript"], state

            transcript = state["full_transcript"]
            separator = " " if transcript and not transcript[-1].isspace() else ""
            state["full_transcript"] = transcript + separator + current_transcript
            state["buffer"].append(current_transcript)

            # Analyse only when enough new words have accumulated and the
            # minimum interval since the previous analysis has elapsed.
            current_time = time.time()
            buffer_text = " ".join(state["buffer"])
            word_count = len(buffer_text.split())
            interval_elapsed = (
                current_time - self.last_analysis_time >= self.analysis_interval
            )

            if word_count >= self.min_analysis_words and interval_elapsed:
                classification, probability, confidence = self.analyze_ai_content(
                    buffer_text
                )
                state["full_transcript"] += (
                    f"\n\n---AI Analysis---\nClassification: {classification}"
                    f"\nProbability: {probability:.2f}"
                    f"\nConfidence: {confidence}\n---\n"
                )
                state["buffer"] = []
                self.last_analysis_time = current_time

            # Bound the displayed transcript itself; the buffer is reset on
            # every analysis, so its size says nothing about transcript size.
            state["full_transcript"] = self._keep_last_words(
                state["full_transcript"], self.max_buffer_size
            )

            return state["full_transcript"], state

        except Exception as e:
            # UI callback boundary: report the failure instead of crashing
            # the stream.
            return f"Error processing audio: {e}", state

    def analyze_ai_content(self, text):
        """Classify *text* as AI- or human-written.

        Returns:
            ``(classification, score, confidence)``. Text shorter than
            ``min_analysis_words`` yields ``("Insufficient text", 0.0,
            "None")``; a failed analysis yields ``("Analysis Error", 0.0,
            "None")``.
        """
        if not text or len(text.split()) < self.min_analysis_words:
            return "Insufficient text", 0.0, "None"

        try:
            # Only the first 512 characters are classified, for speed.
            roberta_result = self.ai_detector(text[:512])[0]
            ai_prob = self._ai_probability(roberta_result)

            linguistic_score = self.analyze_linguistic_patterns(text)
            final_score = (ai_prob + linguistic_score) / 2

            if final_score > 0.7:
                return "AI Generated", final_score, "High"
            if final_score > 0.5:
                return "Likely AI", final_score, "Medium"
            if final_score > 0.3:
                return "Possibly AI", final_score, "Low"
            return "Likely Human", final_score, "High"

        except Exception as e:
            print(f"Analysis error: {e}")
            return "Analysis Error", 0.0, "None"

    def analyze_linguistic_patterns(self, text):
        """Score *text* in [0, 1] from marker phrases and word repetition.

        Each marker phrase found adds 0.3; the share of repeated words adds
        up to 0.4. Text without any words scores 0.0.
        """
        words = text.split()
        if not words:
            return 0.0

        text_lower = text.lower()
        ai_phrase_count = sum(
            1
            for phrases in self.ai_markers.values()
            for phrase in phrases
            if phrase in text_lower
        )
        repetition = 1 - len(set(words)) / len(words)
        return min(ai_phrase_count * 0.3 + repetition * 0.4, 1.0)


def create_gradio_interface():
    """Build the Gradio Blocks UI wired to a new WebAITranscriber."""
    transcriber = WebAITranscriber()

    with gr.Blocks(title="Real-time AI Speech Analyzer") as interface:
        gr.Markdown(
            "# Real-time AI Speech Analyzer\n"
            "This app uses Faster Whisper for real-time speech recognition "
            "and AI detection.\n"
        )

        with gr.Tab("Real-time Analysis"):
            # Per-session transcript state, passed into and returned from
            # every streaming callback.
            session_state = gr.State(None)

            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        streaming=True,
                        type="numpy",
                        label="🎤 Speak into your microphone",
                    )
                    gr.Markdown(
                        "### Tips for best performance:\n"
                        "- Speak clearly and at a moderate pace\n"
                        "- Minimize background noise\n"
                        "- Wait a few seconds for initial processing\n"
                    )

                with gr.Column():
                    realtime_output = gr.Textbox(
                        label="Real-time Transcript and Analysis",
                        lines=15,
                        max_lines=30,
                    )

            # The callback takes (audio, state) and returns (text, state),
            # so the state component appears in both lists.
            audio_input.stream(
                transcriber.process_realtime_audio,
                inputs=[audio_input, session_state],
                outputs=[realtime_output, session_state],
                show_progress=False,
            )

        gr.Markdown(
            "### Technical Details:\n"
            "- Using Faster Whisper for optimized speech recognition\n"
            "- Real-time AI content analysis\n"
            "- Automatic voice activity detection\n"
            "- Optimized for low-latency processing\n"
        )

    return interface


# Launch the app
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()