Spaces:
Running
on
T4
Running
on
T4
Commit
·
5f58cac
1
Parent(s):
6cba8bb
Stop recording
Browse files- app.py +30 -21
- requirements.txt +1 -1
app.py
CHANGED
@@ -7,10 +7,7 @@ import numpy as np
|
|
7 |
import requests
|
8 |
import traceback
|
9 |
from dataclasses import dataclass
|
10 |
-
from pathlib import Path
|
11 |
import io
|
12 |
-
import wave
|
13 |
-
import tempfile
|
14 |
from pydub import AudioSegment
|
15 |
import librosa
|
16 |
from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
|
@@ -87,7 +84,15 @@ def warm_up():
|
|
87 |
warm_up()
|
88 |
|
89 |
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
"""Take in the stream, determine if a pause happened"""
|
92 |
|
93 |
temp_audio = audio
|
@@ -95,6 +100,11 @@ def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
|
|
95 |
dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
|
96 |
duration = len(audio) / sampling_rate
|
97 |
|
|
|
|
|
|
|
|
|
|
|
98 |
print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
|
99 |
|
100 |
return (duration - dur_vad) > 0.5
|
@@ -102,14 +112,14 @@ def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
|
|
102 |
|
103 |
def speaking(audio: np.ndarray, sampling_rate: int):
|
104 |
audio_buffer = io.BytesIO()
|
105 |
-
|
106 |
-
|
107 |
audio.tobytes(),
|
108 |
frame_rate=sampling_rate,
|
109 |
-
sample_width=
|
110 |
-
channels=(1 if len(
|
111 |
)
|
112 |
-
|
113 |
|
114 |
with open("input_audio.wav", "wb") as f:
|
115 |
f.write(audio_buffer.getvalue())
|
@@ -144,12 +154,6 @@ def speaking(audio: np.ndarray, sampling_rate: int):
|
|
144 |
|
145 |
|
146 |
|
147 |
-
@dataclass
|
148 |
-
class AppState:
|
149 |
-
stream: np.ndarray | None = None
|
150 |
-
sampling_rate: int = 0
|
151 |
-
pause_detected: bool = False
|
152 |
-
|
153 |
|
154 |
def process_audio(audio: tuple, state: AppState):
|
155 |
if state.stream is None:
|
@@ -158,22 +162,22 @@ def process_audio(audio: tuple, state: AppState):
|
|
158 |
else:
|
159 |
state.stream = np.concatenate((state.stream, audio[1]))
|
160 |
|
161 |
-
pause_detected = determine_pause(state.stream, state.sampling_rate)
|
162 |
state.pause_detected = pause_detected
|
163 |
|
164 |
-
if state.pause_detected:
|
165 |
return gr.Audio(recording=False), state
|
166 |
return None, state
|
167 |
|
168 |
|
169 |
def response(state: AppState):
|
170 |
if not state.pause_detected:
|
171 |
-
return None,
|
172 |
|
173 |
for mp3_bytes in speaking(state.stream, state.sampling_rate):
|
174 |
-
yield
|
175 |
|
176 |
-
yield
|
177 |
|
178 |
|
179 |
with gr.Blocks() as demo:
|
@@ -196,7 +200,12 @@ with gr.Blocks() as demo:
|
|
196 |
respond = input_audio.stop_recording(
|
197 |
response,
|
198 |
[state],
|
199 |
-
[
|
|
|
|
|
|
|
|
|
|
|
200 |
)
|
201 |
cancel = gr.Button("Stop Conversation", variant="stop")
|
202 |
cancel.click(lambda: AppState(), None, [state], cancels=[respond])
|
|
|
7 |
import requests
|
8 |
import traceback
|
9 |
from dataclasses import dataclass
|
|
|
10 |
import io
|
|
|
|
|
11 |
from pydub import AudioSegment
|
12 |
import librosa
|
13 |
from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
|
|
|
84 |
warm_up()
|
85 |
|
86 |
|
87 |
+
@dataclass
|
88 |
+
class AppState:
|
89 |
+
stream: np.ndarray | None = None
|
90 |
+
sampling_rate: int = 0
|
91 |
+
pause_detected: bool = False
|
92 |
+
started_talking = False
|
93 |
+
|
94 |
+
|
95 |
+
def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
|
96 |
"""Take in the stream, determine if a pause happened"""
|
97 |
|
98 |
temp_audio = audio
|
|
|
100 |
dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
|
101 |
duration = len(audio) / sampling_rate
|
102 |
|
103 |
+
if dur_vad > 0.5 and not state.started_talking:
|
104 |
+
print("started talking")
|
105 |
+
state.started_talking = True
|
106 |
+
return False
|
107 |
+
|
108 |
print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
|
109 |
|
110 |
return (duration - dur_vad) > 0.5
|
|
|
112 |
|
113 |
def speaking(audio: np.ndarray, sampling_rate: int):
|
114 |
audio_buffer = io.BytesIO()
|
115 |
+
|
116 |
+
segment = AudioSegment(
|
117 |
audio.tobytes(),
|
118 |
frame_rate=sampling_rate,
|
119 |
+
sample_width=audio.dtype.itemsize,
|
120 |
+
channels=(1 if len(audio.shape) == 1 else audio.shape[1]),
|
121 |
)
|
122 |
+
segment.export(audio_buffer, format="wav")
|
123 |
|
124 |
with open("input_audio.wav", "wb") as f:
|
125 |
f.write(audio_buffer.getvalue())
|
|
|
154 |
|
155 |
|
156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
def process_audio(audio: tuple, state: AppState):
|
159 |
if state.stream is None:
|
|
|
162 |
else:
|
163 |
state.stream = np.concatenate((state.stream, audio[1]))
|
164 |
|
165 |
+
pause_detected = determine_pause(state.stream, state.sampling_rate, state)
|
166 |
state.pause_detected = pause_detected
|
167 |
|
168 |
+
if state.pause_detected and state.started_talking:
|
169 |
return gr.Audio(recording=False), state
|
170 |
return None, state
|
171 |
|
172 |
|
173 |
def response(state: AppState):
    """Stream the reply audio for the recorded utterance.

    This is a generator: every yielded item is an ``(audio, state)`` pair.

    Args:
        state: Conversation state holding the recorded ``stream``, its
            ``sampling_rate`` and the ``pause_detected`` flag.

    Yields:
        ``(mp3_bytes, state)`` for each chunk produced by ``speaking``,
        followed by ``(None, AppState())`` to reset the state. When no pause
        was detected, only the reset pair is yielded.
    """
    if not state.pause_detected:
        # Because this function contains `yield`, it is a generator, and a
        # `return <value>` here would be swallowed as the StopIteration
        # value instead of being emitted. Yield the reset pair explicitly.
        yield None, AppState()
        return

    for mp3_bytes in speaking(state.stream, state.sampling_rate):
        yield mp3_bytes, state

    # Reply finished: clear the audio output and start from a fresh state.
    yield None, AppState()
|
181 |
|
182 |
|
183 |
with gr.Blocks() as demo:
|
|
|
200 |
respond = input_audio.stop_recording(
|
201 |
response,
|
202 |
[state],
|
203 |
+
[output_audio, state]
|
204 |
+
)
|
205 |
+
output_audio.stop(
|
206 |
+
lambda: gr.Audio(recording=True),
|
207 |
+
None,
|
208 |
+
[input_audio]
|
209 |
)
|
210 |
cancel = gr.Button("Stop Conversation", variant="stop")
|
211 |
cancel.click(lambda: AppState(), None, [state], cancels=[respond])
|
requirements.txt
CHANGED
@@ -11,7 +11,7 @@ streamlit==1.37.1
|
|
11 |
pydub==0.25.1
|
12 |
onnxruntime==1.19.0
|
13 |
# numpy==1.26.3
|
14 |
-
https://gradio-builds.s3.amazonaws.com/
|
15 |
fastapi==0.112.4
|
16 |
librosa==0.10.2.post1
|
17 |
flask==3.0.3
|
|
|
11 |
pydub==0.25.1
|
12 |
onnxruntime==1.19.0
|
13 |
# numpy==1.26.3
|
14 |
+
https://gradio-builds.s3.amazonaws.com/cffe9a7ab7f71e76d7214dc57c6278ffaf5bcdf9/gradio-5.0.0b1-py3-none-any.whl
|
15 |
fastapi==0.112.4
|
16 |
librosa==0.10.2.post1
|
17 |
flask==3.0.3
|