# Hugging Face Space page residue (not code) — kept as a comment so the module parses:
# RealSanjay — "Update app.py" — commit 9ef4d1e (verified)
import gradio as gr
from faster_whisper import WhisperModel
import numpy as np
import os
import statistics
from transformers import pipeline
from textblob import TextBlob
import torch
import time
class WebAITranscriber:
    """Real-time speech transcription with heuristic AI-content detection.

    Transcribes streaming microphone audio with faster-whisper, then
    periodically scores the accumulated text with a RoBERTa-based
    detector combined with simple linguistic heuristics.
    """

    def __init__(self):
        # Prefer GPU when present; int8 keeps CPU inference responsive.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if self.device == "cuda" else "int8"

        # Initialize Whisper model with latency-oriented settings.
        print(f"Initializing Whisper Model on {self.device}...")
        self.model = WhisperModel(
            "base",  # model size specified directly as a string
            device=self.device,
            compute_type=self.compute_type,
            # Bug fix: os.cpu_count() can return None; fall back to 1 before capping.
            cpu_threads=min(os.cpu_count() or 1, 4),
            download_root=None,      # use default cache directory
            local_files_only=False,  # allow downloading if needed
        )

        # Decoding options tuned for speed over maximum accuracy.
        self.model_settings = {
            'beam_size': 1,        # reduced beam size for speed
            'best_of': 1,          # take first result
            'temperature': 0,      # deterministic decoding
            'compression_ratio_threshold': 2.4,
            'condition_on_previous_text': True,
            'no_speech_threshold': 0.6,
            'initial_prompt': None,
        }

        # Phrases that correlate with machine-generated prose.
        self.ai_markers = {
            'formal_phrases': [
                'moreover', 'furthermore', 'consequently',
                'in conclusion', 'it is worth noting'
            ],
            'ai_disclaimers': [
                'as an ai', 'i want to be clear',
                'it is important to note'
            ]
        }

        # Initialize the AI detector. Transformers pipelines take an int
        # device index (0 = first GPU, -1 = CPU); passing the "cuda"/"cpu"
        # string only works on newer library versions.
        print("Initializing AI Detection...")
        self.ai_detector = pipeline(
            'text-classification',
            model='roberta-base-openai-detector',
            device=0 if self.device == "cuda" else -1,
        )

        # Buffering / throttling knobs for the periodic AI analysis.
        self.min_analysis_words = 10   # minimum words before analysis
        self.max_buffer_size = 1000    # cap on retained transcript words
        self.analysis_interval = 3     # minimum seconds between analyses
        # NOTE(review): this throttle is shared across all browser sessions;
        # per-session throttling would need to live in the Gradio state dict.
        self.last_analysis_time = time.time()
        self.transcript_buffer = []

    def process_realtime_audio(self, audio, state):
        """Transcribe one streamed audio chunk and update session state.

        Args:
            audio: Gradio numpy audio tuple ``(sample_rate, samples)``, or None.
            state: per-session dict with keys ``full_transcript``, ``buffer``
                and ``pending_analysis``; None on the first call.

        Returns:
            Tuple of (text to display, updated state).
        """
        if audio is None:
            return "", state
        try:
            if state is None:
                state = {"full_transcript": "", "buffer": [], "pending_analysis": False}

            _sample_rate, samples = audio
            # Bug fix: Gradio streams int16 PCM, but faster-whisper expects
            # float32 samples in [-1.0, 1.0]; the raw array produced garbage.
            # NOTE(review): faster-whisper also assumes 16 kHz input —
            # confirm the browser stream rate or resample if it differs.
            if samples.dtype != np.float32:
                samples = samples.astype(np.float32) / 32768.0

            segments, _ = self.model.transcribe(
                samples,
                language="en",    # fixed language avoids detection overhead
                vad_filter=True,  # Voice Activity Detection skips silence
                **self.model_settings
            )

            current_transcript = " ".join(segment.text for segment in segments).strip()
            if not current_transcript:
                return state["full_transcript"], state

            state["full_transcript"] += " " + current_transcript
            state["buffer"].append(current_transcript)

            # Run the (comparatively slow) AI analysis only when enough new
            # words have accumulated and the throttle interval has elapsed.
            now = time.time()
            buffer_text = " ".join(state["buffer"])
            if (len(buffer_text.split()) >= self.min_analysis_words and
                    now - self.last_analysis_time >= self.analysis_interval):
                classification, probability, confidence = self.analyze_ai_content(buffer_text)
                state["full_transcript"] += (
                    f"\n\n---AI Analysis---\nClassification: {classification}"
                    f"\nProbability: {probability:.2f}\nConfidence: {confidence}\n---\n"
                )
                state["buffer"] = []
                self.last_analysis_time = now

            # Bug fix: trim by the transcript's own word count — the original
            # compared the *buffer* word count, so trimming rarely fired.
            transcript_words = state["full_transcript"].split()
            if len(transcript_words) > self.max_buffer_size:
                state["full_transcript"] = " ".join(transcript_words[-self.max_buffer_size:])

            return state["full_transcript"], state
        except Exception as e:
            # Best-effort: surface the failure in the transcript box rather
            # than crashing the streaming loop.
            return f"Error processing audio: {str(e)}", state

    def analyze_ai_content(self, text):
        """Score *text* for AI authorship.

        Returns:
            Tuple of (classification label, probability in [0, 1],
            confidence label).
        """
        if not text or len(text.split()) < self.min_analysis_words:
            return "Insufficient text", 0.0, "None"
        try:
            # Truncate for speed; the detector's context window is limited.
            result = self.ai_detector(text[:512])[0]
            # Bug fix: the pipeline score belongs to the *predicted* label
            # ('Real' or 'Fake'); invert it when the prediction is 'Real'
            # so ai_prob is always P(AI-generated).
            if result['label'].lower() == 'fake':
                ai_prob = result['score']
            else:
                ai_prob = 1.0 - result['score']

            linguistic_score = self.analyze_linguistic_patterns(text)
            final_score = (ai_prob + linguistic_score) / 2

            if final_score > 0.7:
                return "AI Generated", final_score, "High"
            if final_score > 0.5:
                return "Likely AI", final_score, "Medium"
            if final_score > 0.3:
                return "Possibly AI", final_score, "Low"
            return "Likely Human", final_score, "High"
        except Exception as e:
            print(f"Analysis error: {e}")
            return "Analysis Error", 0.0, "None"

    def analyze_linguistic_patterns(self, text):
        """Heuristic score in [0, 1] from marker phrases and word repetition."""
        words = text.split()
        if not words:
            # Bug fix: the original divided by zero on empty/whitespace text.
            return 0.0
        text_lower = text.lower()
        marker_hits = sum(
            1
            for phrases in self.ai_markers.values()
            for phrase in phrases
            if phrase in text_lower
        )
        repetition = 1 - len(set(words)) / len(words)
        return min(marker_hits * 0.3 + repetition * 0.4, 1.0)
def create_gradio_interface():
    """Build the Gradio Blocks UI wired to a shared WebAITranscriber.

    Returns:
        The constructed ``gr.Blocks`` interface (not yet launched).
    """
    transcriber = WebAITranscriber()

    # Create the interface
    with gr.Blocks(title="Real-time AI Speech Analyzer") as interface:
        gr.Markdown("""
        # Real-time AI Speech Analyzer
        This app uses Faster Whisper for real-time speech recognition and AI detection.
        """)
        with gr.Tab("Real-time Analysis"):
            # Per-session transcription state (full transcript + pending buffer).
            session_state = gr.State(None)
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        sources=["microphone"],  # Updated from 'source' to 'sources'
                        streaming=True,
                        type="numpy",
                        label="🎤 Speak into your microphone"
                    )
                    gr.Markdown("""
                    ### Tips for best performance:
                    - Speak clearly and at a moderate pace
                    - Minimize background noise
                    - Wait a few seconds for initial processing
                    """)
                with gr.Column():
                    realtime_output = gr.Textbox(
                        label="Real-time Transcript and Analysis",
                        lines=15,
                        max_lines=30
                    )
            # Bug fix: process_realtime_audio takes (audio, state) and returns
            # (transcript, state); the original wiring omitted the state
            # component entirely, so the callback signature never matched.
            audio_input.stream(
                transcriber.process_realtime_audio,
                inputs=[audio_input, session_state],
                outputs=[realtime_output, session_state],
                show_progress=False
            )
        gr.Markdown("""
        ### Technical Details:
        - Using Faster Whisper for optimized speech recognition
        - Real-time AI content analysis
        - Automatic voice activity detection
        - Optimized for low-latency processing
        """)
    return interface
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()