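"""Real-time AI Speech Analyzer.

Streams microphone audio through Faster Whisper for transcription, periodically
scores the accumulated text for AI-generated content (a RoBERTa-based detector
combined with a simple linguistic heuristic), and serves the result through a
Gradio web interface.

Assumed dependencies (pip package names): gradio, faster-whisper, numpy,
transformers, torch. Versions are not pinned here.
"""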
import os
import time

import gradio as gr
import numpy as np
import torch
from faster_whisper import WhisperModel
from transformers import pipeline

class WebAITranscriber:
    def __init__(self):
        # Check if CUDA is available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if self.device == "cuda" else "int8"
        
        # Initialize Whisper Model with optimized settings
        print(f"Initializing Whisper Model on {self.device}...")
        self.model = WhisperModel(
            "base",                                   # Whisper model size
            device=self.device,
            compute_type=self.compute_type,
            cpu_threads=min(os.cpu_count() or 1, 4),  # cap the CPU thread count
            download_root=None,                       # use the default cache directory
            local_files_only=False                    # allow downloading the model if needed
        )

        # Optimize model settings
        self.model_settings = {
            'beam_size': 1,  # Reduced beam size for speed
            'best_of': 1,    # Take first result
            'temperature': 0, # Reduce randomness
            'compression_ratio_threshold': 2.4,
            'condition_on_previous_text': True,
            'no_speech_threshold': 0.6,
            'initial_prompt': None
        }
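        # These decoding options are forwarded unchanged to
        # WhisperModel.transcribe() via **self.model_settings.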

        # AI Detection Markers (optimized for speed)
        self.ai_markers = {
            'formal_phrases': [
                'moreover', 'furthermore', 'consequently', 
                'in conclusion', 'it is worth noting'
            ],
            'ai_disclaimers': [
                'as an ai', 'i want to be clear', 
                'it is important to note'
            ]
        }

        # Initialize the AI-text detector; an integer device index
        # (0 = first GPU, -1 = CPU) works across transformers versions
        print("Initializing AI Detection...")
        self.ai_detector = pipeline(
            'text-classification',
            model='roberta-base-openai-detector',
            device=0 if torch.cuda.is_available() else -1
        )
        
        # Buffering / analysis cadence settings
        self.min_analysis_words = 10   # minimum words before an analysis pass
        self.max_buffer_size = 1000    # maximum transcript length kept on screen (words)
        self.analysis_interval = 3     # minimum seconds between analyses
        self.last_analysis_time = time.time()
        self.transcript_buffer = []    # unused here; per-session text lives in the Gradio state

    def process_realtime_audio(self, audio, state):
        """Process one streamed audio chunk and return the updated transcript and state."""
        if audio is None:
            return "", state

        try:
            # Initialize per-session state if needed
            if state is None:
                state = {"full_transcript": "", "buffer": []}

            # Gradio streams (sample_rate, samples); faster-whisper expects a
            # mono float32 waveform sampled at 16 kHz
            sample_rate, samples = audio
            if samples.ndim > 1:                      # downmix stereo to mono
                samples = samples.mean(axis=1)
            samples = samples.astype(np.float32)
            if samples.size == 0:
                return state["full_transcript"], state
            if np.abs(samples).max() > 1.0:           # int16 PCM from the browser
                samples = samples / 32768.0
            if sample_rate != 16000:                  # simple linear resampling to 16 kHz
                target_len = int(len(samples) * 16000 / sample_rate)
                samples = np.interp(
                    np.linspace(0, len(samples), num=target_len, endpoint=False),
                    np.arange(len(samples)),
                    samples,
                )

            # Transcribe the chunk with the low-latency decoding settings
            segments, _ = self.model.transcribe(
                samples,
                language="en",       # fixing the language avoids language detection
                vad_filter=True,     # skip silence via voice activity detection
                **self.model_settings
            )

            current_transcript = " ".join(segment.text.strip() for segment in segments)

            if not current_transcript.strip():
                return state["full_transcript"], state

            # Update state
            state["full_transcript"] = (state["full_transcript"] + " " + current_transcript).strip()
            state["buffer"].append(current_transcript)

            # Run the AI analysis once enough new text has accumulated and the
            # minimum interval between analyses has elapsed
            current_time = time.time()
            buffer_text = " ".join(state["buffer"])
            word_count = len(buffer_text.split())

            if (word_count >= self.min_analysis_words and
                    (current_time - self.last_analysis_time) >= self.analysis_interval):
                classification, probability, confidence = self.analyze_ai_content(buffer_text)
                analysis_result = (
                    f"\n\n---AI Analysis---\n"
                    f"Classification: {classification}\n"
                    f"Probability: {probability:.2f}\n"
                    f"Confidence: {confidence}\n---\n"
                )
                state["full_transcript"] += analysis_result
                state["buffer"] = []
                self.last_analysis_time = current_time

            # Keep the displayed transcript from growing without bound
            transcript_words = state["full_transcript"].split()
            if len(transcript_words) > self.max_buffer_size:
                state["full_transcript"] = " ".join(transcript_words[-self.max_buffer_size:])

            return state["full_transcript"], state

        except Exception as e:
            return f"Error processing audio: {str(e)}", state

    def analyze_ai_content(self, text):
        """Optimized AI content analysis"""
        if not text or len(text.split()) < self.min_analysis_words:
            return "Insufficient text", 0.0, "None"

        try:
            # Transformer-based detection: the pipeline returns a label and a
            # score; convert that into the probability that the text is
            # machine-generated (the roberta-base-openai-detector model labels
            # AI-generated text "Fake" and human text "Real").
            roberta_result = self.ai_detector(text[:512])[0]  # limit text length for speed
            if roberta_result['label'].lower() == 'fake':
                ai_prob = roberta_result['score']
            else:
                ai_prob = 1.0 - roberta_result['score']

            # Quick linguistic analysis
            linguistic_score = self.analyze_linguistic_patterns(text)

            # Average the two signals into a final score
            final_score = (ai_prob + linguistic_score) / 2

            # Fast classification
            if final_score > 0.7:
                return "AI Generated", final_score, "High"
            elif final_score > 0.5:
                return "Likely AI", final_score, "Medium"
            elif final_score > 0.3:
                return "Possibly AI", final_score, "Low"
            return "Likely Human", final_score, "High"

        except Exception as e:
            print(f"Analysis error: {e}")
            return "Analysis Error", 0.0, "None"

    def analyze_linguistic_patterns(self, text):
        """Lightweight heuristic based on AI-marker phrases and lexical diversity."""
        text_lower = text.lower()
        # Count how many marker phrases appear at least once in the text
        ai_phrase_count = sum(1 for category in self.ai_markers.values()
                              for phrase in category if phrase in text_lower)

        words = text.split()
        unique_words = len(set(words))
        total_words = max(len(words), 1)  # guard against empty input

        # Each marker phrase adds 0.3; low lexical diversity (many repeated
        # words) adds up to 0.4; the combined score is capped at 1.0.
        return min((ai_phrase_count * 0.3) + (1 - (unique_words / total_words)) * 0.4, 1.0)

def create_gradio_interface():
    transcriber = WebAITranscriber()

    # Create the interface
    with gr.Blocks(title="Real-time AI Speech Analyzer") as interface:
        gr.Markdown("""
        # Real-time AI Speech Analyzer
        This app uses Faster Whisper for real-time speech recognition and AI detection.
        """)
        
        with gr.Tab("Real-time Analysis"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        sources=["microphone"],  # Updated from 'source' to 'sources'
                        streaming=True,
                        type="numpy",
                        label="🎤 Speak into your microphone"
                    )
                    gr.Markdown("""
                    ### Tips for best performance:
                    - Speak clearly and at a moderate pace
                    - Minimize background noise
                    - Wait a few seconds for initial processing
                    """)
                with gr.Column():
                    realtime_output = gr.Textbox(
                        label="Real-time Transcript and Analysis",
                        lines=15,
                        max_lines=30
                    )

            # Per-session state holding the running transcript and text buffer
            session_state = gr.State()

            audio_input.stream(
                transcriber.process_realtime_audio,
                inputs=[audio_input, session_state],
                outputs=[realtime_output, session_state],
                show_progress=False
            )

        gr.Markdown("""
        ### Technical Details:
        - Using Faster Whisper for optimized speech recognition
        - Real-time AI content analysis
        - Automatic voice activity detection
        - Optimized for low-latency processing
        """)

    return interface

# Launch the app
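# Note: launch() accepts Gradio's standard options, e.g. share=True for a
# temporary public link or server_port=7860 to pin the port.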
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()
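
# --- Optional offline sanity check (a minimal sketch, not part of the app) ---
# This is an illustrative assumption about how the same pipeline could be
# exercised on a pre-recorded file instead of the live microphone;
# "sample.wav" is a hypothetical path to a short English recording.
#
#   transcriber = WebAITranscriber()
#   segments, _ = transcriber.model.transcribe("sample.wav", language="en",
#                                              **transcriber.model_settings)
#   text = " ".join(segment.text for segment in segments)
#   print(transcriber.analyze_ai_content(text))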