Spaces:
Build error
Build error
NikolaSelic
committed on
Commit
·
fc830e2
1
Parent(s):
6ee68da
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
import io
import os
from datetime import datetime, timedelta, timezone
from queue import Queue
from sys import platform
from tempfile import NamedTemporaryFile
from time import sleep

import speech_recognition as sr
import torch
import whisper
+
def main():
|
16 |
+
parser = argparse.ArgumentParser()
|
17 |
+
parser.add_argument(
|
18 |
+
"--model",
|
19 |
+
default="medium",
|
20 |
+
help="Model to use",
|
21 |
+
choices=["tiny", "base", "small", "medium", "large"],
|
22 |
+
)
|
23 |
+
parser.add_argument(
|
24 |
+
"--non_english", action="store_true", help="Don't use the english model."
|
25 |
+
)
|
26 |
+
parser.add_argument(
|
27 |
+
"--energy_threshold",
|
28 |
+
default=1000,
|
29 |
+
help="Energy level for mic to detect.",
|
30 |
+
type=int,
|
31 |
+
)
|
32 |
+
parser.add_argument(
|
33 |
+
"--record_timeout",
|
34 |
+
default=2,
|
35 |
+
help="How real time the recording is in seconds.",
|
36 |
+
type=float,
|
37 |
+
)
|
38 |
+
parser.add_argument(
|
39 |
+
"--phrase_timeout",
|
40 |
+
default=3,
|
41 |
+
help="How much empty space between recordings before we "
|
42 |
+
"consider it a new line in the transcription.",
|
43 |
+
type=float,
|
44 |
+
)
|
45 |
+
if "linux" in platform:
|
46 |
+
parser.add_argument(
|
47 |
+
"--default_microphone",
|
48 |
+
default="pulse",
|
49 |
+
help="Default microphone name for SpeechRecognition. "
|
50 |
+
"Run this with 'list' to view available Microphones.",
|
51 |
+
type=str,
|
52 |
+
)
|
53 |
+
args = parser.parse_args()
|
54 |
+
|
55 |
+
# The last time a recording was retreived from the queue.
|
56 |
+
phrase_time = None
|
57 |
+
# Current raw audio bytes.
|
58 |
+
last_sample = bytes()
|
59 |
+
# Thread safe Queue for passing data from the threaded recording callback.
|
60 |
+
data_queue = Queue()
|
61 |
+
# We use SpeechRecognizer to record our audio because it has a nice feauture where it can detect when speech ends.
|
62 |
+
recorder = sr.Recognizer()
|
63 |
+
recorder.energy_threshold = args.energy_threshold
|
64 |
+
# Definitely do this, dynamic energy compensation lowers the energy threshold dramtically to a point where the SpeechRecognizer never stops recording.
|
65 |
+
recorder.dynamic_energy_threshold = False
|
66 |
+
|
67 |
+
# Important for linux users.
|
68 |
+
# Prevents permanent application hang and crash by using the wrong Microphone
|
69 |
+
if "linux" in platform:
|
70 |
+
mic_name = args.default_microphone
|
71 |
+
if not mic_name or mic_name == "list":
|
72 |
+
print("Available microphone devices are: ")
|
73 |
+
for index, name in enumerate(sr.Microphone.list_microphone_names()):
|
74 |
+
print(f'Microphone with name "{name}" found')
|
75 |
+
return
|
76 |
+
else:
|
77 |
+
for index, name in enumerate(sr.Microphone.list_microphone_names()):
|
78 |
+
if mic_name in name:
|
79 |
+
source = sr.Microphone(sample_rate=16000, device_index=index)
|
80 |
+
break
|
81 |
+
else:
|
82 |
+
source = sr.Microphone(sample_rate=16000)
|
83 |
+
|
84 |
+
# Load / Download model
|
85 |
+
model = args.model
|
86 |
+
if args.model != "large" and not args.non_english:
|
87 |
+
model = model + ".en"
|
88 |
+
audio_model = whisper.load_model(model)
|
89 |
+
|
90 |
+
record_timeout = args.record_timeout
|
91 |
+
phrase_timeout = args.phrase_timeout
|
92 |
+
|
93 |
+
temp_file = NamedTemporaryFile().name
|
94 |
+
transcription = [""]
|
95 |
+
|
96 |
+
with source:
|
97 |
+
recorder.adjust_for_ambient_noise(source)
|
98 |
+
|
99 |
+
def record_callback(_, audio: sr.AudioData) -> None:
|
100 |
+
"""
|
101 |
+
Threaded callback function to recieve audio data when recordings finish.
|
102 |
+
audio: An AudioData containing the recorded bytes.
|
103 |
+
"""
|
104 |
+
# Grab the raw bytes and push it into the thread safe queue.
|
105 |
+
data = audio.get_raw_data()
|
106 |
+
data_queue.put(data)
|
107 |
+
|
108 |
+
# Create a background thread that will pass us raw audio bytes.
|
109 |
+
# We could do this manually but SpeechRecognizer provides a nice helper.
|
110 |
+
recorder.listen_in_background(
|
111 |
+
source, record_callback, phrase_time_limit=record_timeout
|
112 |
+
)
|
113 |
+
|
114 |
+
# Cue the user that we're ready to go.
|
115 |
+
print("Model loaded.\n")
|
116 |
+
|
117 |
+
while True:
|
118 |
+
try:
|
119 |
+
now = datetime.utcnow()
|
120 |
+
# Pull raw recorded audio from the queue.
|
121 |
+
if not data_queue.empty():
|
122 |
+
phrase_complete = False
|
123 |
+
# If enough time has passed between recordings, consider the phrase complete.
|
124 |
+
# Clear the current working audio buffer to start over with the new data.
|
125 |
+
if phrase_time and now - phrase_time > timedelta(
|
126 |
+
seconds=phrase_timeout
|
127 |
+
):
|
128 |
+
last_sample = bytes()
|
129 |
+
phrase_complete = True
|
130 |
+
# This is the last time we received new audio data from the queue.
|
131 |
+
phrase_time = now
|
132 |
+
|
133 |
+
# Concatenate our current audio data with the latest audio data.
|
134 |
+
while not data_queue.empty():
|
135 |
+
data = data_queue.get()
|
136 |
+
last_sample += data
|
137 |
+
|
138 |
+
# Use AudioData to convert the raw data to wav data.
|
139 |
+
audio_data = sr.AudioData(
|
140 |
+
last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH
|
141 |
+
)
|
142 |
+
wav_data = io.BytesIO(audio_data.get_wav_data())
|
143 |
+
|
144 |
+
# Write wav data to the temporary file as bytes.
|
145 |
+
with open(temp_file, "w+b") as f:
|
146 |
+
f.write(wav_data.read())
|
147 |
+
|
148 |
+
# Read the transcription.
|
149 |
+
result = audio_model.transcribe(
|
150 |
+
temp_file, fp16=torch.cuda.is_available()
|
151 |
+
)
|
152 |
+
text = result["text"].strip()
|
153 |
+
|
154 |
+
# If we detected a pause between recordings, add a new item to our transcripion.
|
155 |
+
# Otherwise edit the existing one.
|
156 |
+
if phrase_complete:
|
157 |
+
transcription.append(text)
|
158 |
+
else:
|
159 |
+
transcription[-1] = text
|
160 |
+
|
161 |
+
# Clear the console to reprint the updated transcription.
|
162 |
+
os.system("cls" if os.name == "nt" else "clear")
|
163 |
+
for line in transcription:
|
164 |
+
print(line)
|
165 |
+
# Flush stdout.
|
166 |
+
print("", end="", flush=True)
|
167 |
+
|
168 |
+
# Infinite loops are bad for processors, must sleep.
|
169 |
+
sleep(0.25)
|
170 |
+
except KeyboardInterrupt:
|
171 |
+
break
|
172 |
+
|
173 |
+
print("\n\nTranscription:")
|
174 |
+
for line in transcription:
|
175 |
+
print(line)
|
# Standard script entry-point guard: run the transcription loop only when
# executed directly, not when imported as a module.
if __name__ == "__main__":
    main()