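"""Real-time AI Speech Analyzer.

Gradio app that streams microphone audio through Faster Whisper for
transcription and periodically runs a RoBERTa-based detector over the
accumulated transcript to flag likely AI-generated speech.
"""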
import gradio as gr
from faster_whisper import WhisperModel
import numpy as np
import os
from transformers import pipeline
import torch
import time
class WebAITranscriber:
def __init__(self):
# Check if CUDA is available
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.compute_type = "float16" if torch.cuda.is_available() else "int8"
# Initialize Whisper Model with optimized settings
print(f"Initializing Whisper Model on {self.device}...")
self.model = WhisperModel(
"base", # Fixed: Directly specify the model size as a string
device=self.device,
compute_type=self.compute_type,
            cpu_threads=min(os.cpu_count() or 4, 4),  # Cap threads; os.cpu_count() may return None
download_root=None, # Use default cache directory
local_files_only=False # Allow downloading if needed
)
# Optimize model settings
self.model_settings = {
'beam_size': 1, # Reduced beam size for speed
'best_of': 1, # Take first result
'temperature': 0, # Reduce randomness
'compression_ratio_threshold': 2.4,
'condition_on_previous_text': True,
'no_speech_threshold': 0.6,
'initial_prompt': None
}
# AI Detection Markers (optimized for speed)
self.ai_markers = {
'formal_phrases': [
'moreover', 'furthermore', 'consequently',
'in conclusion', 'it is worth noting'
],
'ai_disclaimers': [
'as an ai', 'i want to be clear',
'it is important to note'
]
}
# Initialize AI Detector with optimized settings
print("Initializing AI Detection...")
        self.ai_detector = pipeline(
            'text-classification',
            model='roberta-base-openai-detector',
            device=0 if self.device == "cuda" else -1  # integer device index works across transformers versions
        )
# Optimized buffer settings
self.min_analysis_words = 10 # Minimum words before analysis
self.max_buffer_size = 1000 # Maximum buffer size in words
self.analysis_interval = 3 # Minimum seconds between analyses
self.last_analysis_time = time.time()
self.transcript_buffer = []
def process_realtime_audio(self, audio, state):
"""Process real-time audio with optimized settings"""
if audio is None:
return "", state
try:
# Initialize state if needed
if state is None:
state = {"full_transcript": "", "buffer": [], "pending_analysis": False}
            # Gradio's "numpy" audio type delivers a (sample_rate, samples) tuple;
            # faster-whisper expects a float32 mono waveform at 16 kHz, so normalise
            # int16 samples first (resample beforehand if the microphone rate differs).
            sample_rate, samples = audio
            if samples.dtype == np.int16:
                samples = samples.astype(np.float32) / 32768.0
            if samples.ndim > 1:
                samples = samples.mean(axis=1)  # down-mix to mono
            segments, _ = self.model.transcribe(
                samples,
                language="en",  # Specify language for faster processing
                vad_filter=True,  # Use Voice Activity Detection
                **self.model_settings
            )
# Process segments
current_transcript = ""
for segment in segments:
current_transcript += segment.text + " "
if not current_transcript.strip():
return state["full_transcript"], state
# Update state
state["full_transcript"] += " " + current_transcript
state["buffer"].append(current_transcript)
# Check if we should perform analysis
current_time = time.time()
buffer_text = " ".join(state["buffer"])
word_count = len(buffer_text.split())
if (word_count >= self.min_analysis_words and
(current_time - self.last_analysis_time) >= self.analysis_interval):
# Perform AI analysis
if len(buffer_text.strip()) > 0:
classification, probability, confidence = self.analyze_ai_content(buffer_text)
analysis_result = f"\n\n---AI Analysis---\nClassification: {classification}\nProbability: {probability:.2f}\nConfidence: {confidence}\n---\n"
state["full_transcript"] += analysis_result
state["buffer"] = []
self.last_analysis_time = current_time
            # Trim the displayed transcript if it grows too large
            transcript_words = state["full_transcript"].split()
            if len(transcript_words) > self.max_buffer_size:
                state["full_transcript"] = " ".join(transcript_words[-self.max_buffer_size:])
return state["full_transcript"], state
except Exception as e:
return f"Error processing audio: {str(e)}", state
def analyze_ai_content(self, text):
"""Optimized AI content analysis"""
if not text or len(text.split()) < self.min_analysis_words:
return "Insufficient text", 0.0, "None"
try:
            # RoBERTa detector (text truncated for speed); its labels are "Fake"/"Real",
            # so convert the winning label's score into the probability of AI-generated text.
            roberta_result = self.ai_detector(text[:512])[0]
            is_fake = roberta_result['label'].lower() == 'fake'
            ai_prob = roberta_result['score'] if is_fake else 1.0 - roberta_result['score']
# Quick linguistic analysis
linguistic_score = self.analyze_linguistic_patterns(text)
# Calculate final score
final_score = (ai_prob + linguistic_score) / 2
# Fast classification
if final_score > 0.7:
return "AI Generated", final_score, "High"
elif final_score > 0.5:
return "Likely AI", final_score, "Medium"
elif final_score > 0.3:
return "Possibly AI", final_score, "Low"
return "Likely Human", final_score, "High"
except Exception as e:
print(f"Analysis error: {e}")
return "Analysis Error", 0.0, "None"
def analyze_linguistic_patterns(self, text):
"""Optimized linguistic pattern analysis"""
text_lower = text.lower()
ai_phrase_count = sum(1 for category in self.ai_markers.values()
for phrase in category if phrase in text_lower)
unique_words = len(set(text.split()))
total_words = len(text.split())
return min((ai_phrase_count * 0.3) + (1 - (unique_words / total_words)) * 0.4, 1.0)
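
# A minimal resampling sketch, assuming the browser microphone does not already
# record at the 16 kHz Faster Whisper expects. The helper name `resample_to_16k`
# and the plain-numpy linear interpolation are illustrative additions, not part
# of the original app; a dedicated resampler (e.g. torchaudio) would give higher quality.
def resample_to_16k(samples: np.ndarray, source_rate: int, target_rate: int = 16000) -> np.ndarray:
    """Linearly resample a 1-D float32 waveform to `target_rate`."""
    if source_rate == target_rate:
        return samples.astype(np.float32)
    duration = samples.shape[0] / source_rate
    target_length = int(round(duration * target_rate))
    source_times = np.linspace(0.0, duration, num=samples.shape[0], endpoint=False)
    target_times = np.linspace(0.0, duration, num=target_length, endpoint=False)
    return np.interp(target_times, source_times, samples).astype(np.float32)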
def create_gradio_interface():
transcriber = WebAITranscriber()
# Create the interface
with gr.Blocks(title="Real-time AI Speech Analyzer") as interface:
        gr.Markdown("""
        # Real-time AI Speech Analyzer
        This app uses Faster Whisper for real-time speech recognition and a RoBERTa-based detector to flag AI-generated content.
        """)
with gr.Tab("Real-time Analysis"):
with gr.Row():
with gr.Column():
audio_input = gr.Audio(
sources=["microphone"], # Updated from 'source' to 'sources'
streaming=True,
type="numpy",
label="🎤 Speak into your microphone"
)
gr.Markdown("""
### Tips for best performance:
- Speak clearly and at a moderate pace
- Minimize background noise
- Wait a few seconds for initial processing
""")
with gr.Column():
realtime_output = gr.Textbox(
label="Real-time Transcript and Analysis",
lines=15,
max_lines=30
)
            # A gr.State value carries the transcript/buffer dict between
            # streaming callbacks, matching process_realtime_audio's signature.
            transcript_state = gr.State(value=None)
            audio_input.stream(
                transcriber.process_realtime_audio,
                inputs=[audio_input, transcript_state],
                outputs=[realtime_output, transcript_state],
                show_progress=False
            )
gr.Markdown("""
### Technical Details:
- Using Faster Whisper for optimized speech recognition
- Real-time AI content analysis
- Automatic voice activity detection
- Optimized for low-latency processing
""")
return interface
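
# A minimal smoke test for the analysis pipeline, independent of the Gradio UI.
# This is a sketch only: the function name and sample sentence are illustrative,
# not part of the original app, and it is never called automatically; run it by
# hand (e.g. from a REPL) to sanity-check the detector without a microphone.
def run_analysis_smoke_test():
    transcriber = WebAITranscriber()
    sample = (
        "Moreover, it is important to note that this sentence deliberately "
        "uses several of the formal phrases the detector treats as AI markers."
    )
    classification, probability, confidence = transcriber.analyze_ai_content(sample)
    print(f"Classification: {classification} (probability {probability:.2f}, confidence {confidence})")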
# Launch the app
if __name__ == "__main__":
interface = create_gradio_interface()
    interface.launch()