freddyaboulton HF staff committed on
Commit
5f58cac
·
1 Parent(s): 6cba8bb

Stop recording

Browse files
Files changed (2) hide show
  1. app.py +30 -21
  2. requirements.txt +1 -1
app.py CHANGED
@@ -7,10 +7,7 @@ import numpy as np
7
  import requests
8
  import traceback
9
  from dataclasses import dataclass
10
- from pathlib import Path
11
  import io
12
- import wave
13
- import tempfile
14
  from pydub import AudioSegment
15
  import librosa
16
  from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
@@ -87,7 +84,15 @@ def warm_up():
87
  warm_up()
88
 
89
 
90
- def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
 
 
 
 
 
 
 
 
91
  """Take in the stream, determine if a pause happened"""
92
 
93
  temp_audio = audio
@@ -95,6 +100,11 @@ def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
95
  dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
96
  duration = len(audio) / sampling_rate
97
 
 
 
 
 
 
98
  print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
99
 
100
  return (duration - dur_vad) > 0.5
@@ -102,14 +112,14 @@ def determine_pause(audio: np.ndarray, sampling_rate: int) -> bool:
102
 
103
  def speaking(audio: np.ndarray, sampling_rate: int):
104
  audio_buffer = io.BytesIO()
105
-
106
- audio = AudioSegment(
107
  audio.tobytes(),
108
  frame_rate=sampling_rate,
109
- sample_width=data.dtype.itemsize,
110
- channels=(1 if len(data.shape) == 1 else data.shape[1]),
111
  )
112
- file = audio.export(audio_buffer, format="wav")
113
 
114
  with open("input_audio.wav", "wb") as f:
115
  f.write(audio_buffer.getvalue())
@@ -144,12 +154,6 @@ def speaking(audio: np.ndarray, sampling_rate: int):
144
 
145
 
146
 
147
- @dataclass
148
- class AppState:
149
- stream: np.ndarray | None = None
150
- sampling_rate: int = 0
151
- pause_detected: bool = False
152
-
153
 
154
  def process_audio(audio: tuple, state: AppState):
155
  if state.stream is None:
@@ -158,22 +162,22 @@ def process_audio(audio: tuple, state: AppState):
158
  else:
159
  state.stream = np.concatenate((state.stream, audio[1]))
160
 
161
- pause_detected = determine_pause(state.stream, state.sampling_rate)
162
  state.pause_detected = pause_detected
163
 
164
- if state.pause_detected:
165
  return gr.Audio(recording=False), state
166
  return None, state
167
 
168
 
169
  def response(state: AppState):
170
  if not state.pause_detected:
171
- return None, None, AppState()
172
 
173
  for mp3_bytes in speaking(state.stream, state.sampling_rate):
174
- yield None, mp3_bytes, state
175
 
176
- yield gr.Audio(recording=True), None, AppState()
177
 
178
 
179
  with gr.Blocks() as demo:
@@ -196,7 +200,12 @@ with gr.Blocks() as demo:
196
  respond = input_audio.stop_recording(
197
  response,
198
  [state],
199
- [input_audio, output_audio, state]
 
 
 
 
 
200
  )
201
  cancel = gr.Button("Stop Conversation", variant="stop")
202
  cancel.click(lambda: AppState(), None, [state], cancels=[respond])
 
7
  import requests
8
  import traceback
9
  from dataclasses import dataclass
 
10
  import io
 
 
11
  from pydub import AudioSegment
12
  import librosa
13
  from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
 
84
  warm_up()
85
 
86
 
87
+ @dataclass
88
+ class AppState:
89
+ stream: np.ndarray | None = None
90
+ sampling_rate: int = 0
91
+ pause_detected: bool = False
92
+ started_talking = False
93
+
94
+
95
+ def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
96
  """Take in the stream, determine if a pause happened"""
97
 
98
  temp_audio = audio
 
100
  dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
101
  duration = len(audio) / sampling_rate
102
 
103
+ if dur_vad > 0.5 and not state.started_talking:
104
+ print("started talking")
105
+ state.started_talking = True
106
+ return False
107
+
108
  print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
109
 
110
  return (duration - dur_vad) > 0.5
 
112
 
113
  def speaking(audio: np.ndarray, sampling_rate: int):
114
  audio_buffer = io.BytesIO()
115
+
116
+ segment = AudioSegment(
117
  audio.tobytes(),
118
  frame_rate=sampling_rate,
119
+ sample_width=audio.dtype.itemsize,
120
+ channels=(1 if len(audio.shape) == 1 else audio.shape[1]),
121
  )
122
+ segment.export(audio_buffer, format="wav")
123
 
124
  with open("input_audio.wav", "wb") as f:
125
  f.write(audio_buffer.getvalue())
 
154
 
155
 
156
 
 
 
 
 
 
 
157
 
158
  def process_audio(audio: tuple, state: AppState):
159
  if state.stream is None:
 
162
  else:
163
  state.stream = np.concatenate((state.stream, audio[1]))
164
 
165
+ pause_detected = determine_pause(state.stream, state.sampling_rate, state)
166
  state.pause_detected = pause_detected
167
 
168
+ if state.pause_detected and state.started_talking:
169
  return gr.Audio(recording=False), state
170
  return None, state
171
 
172
 
173
  def response(state: AppState):
174
  if not state.pause_detected:
175
+ return None, AppState()
176
 
177
  for mp3_bytes in speaking(state.stream, state.sampling_rate):
178
+ yield mp3_bytes, state
179
 
180
+ yield None, AppState()
181
 
182
 
183
  with gr.Blocks() as demo:
 
200
  respond = input_audio.stop_recording(
201
  response,
202
  [state],
203
+ [output_audio, state]
204
+ )
205
+ output_audio.stop(
206
+ lambda: gr.Audio(recording=True),
207
+ None,
208
+ [input_audio]
209
  )
210
  cancel = gr.Button("Stop Conversation", variant="stop")
211
  cancel.click(lambda: AppState(), None, [state], cancels=[respond])
requirements.txt CHANGED
@@ -11,7 +11,7 @@ streamlit==1.37.1
11
  pydub==0.25.1
12
  onnxruntime==1.19.0
13
  # numpy==1.26.3
14
- https://gradio-builds.s3.amazonaws.com/e3011b3b19ee8f7b7fc2dbba848d56a0b30b6cdb/gradio-5.0.0b1-py3-none-any.whl
15
  fastapi==0.112.4
16
  librosa==0.10.2.post1
17
  flask==3.0.3
 
11
  pydub==0.25.1
12
  onnxruntime==1.19.0
13
  # numpy==1.26.3
14
+ https://gradio-builds.s3.amazonaws.com/cffe9a7ab7f71e76d7214dc57c6278ffaf5bcdf9/gradio-5.0.0b1-py3-none-any.whl
15
  fastapi==0.112.4
16
  librosa==0.10.2.post1
17
  flask==3.0.3