NikolaSelic committed on
Commit
fc830e2
·
1 Parent(s): 6ee68da

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -0
app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import io
3
+ import os
4
+ import speech_recognition as sr
5
+ import whisper
6
+ import torch
7
+
8
+ from datetime import datetime, timedelta
9
+ from queue import Queue
10
+ from tempfile import NamedTemporaryFile
11
+ from time import sleep
12
+ from sys import platform
13
+
14
+
15
def main():
    """Transcribe microphone audio to the console in near real time.

    Parses the command-line options, opens a microphone through
    SpeechRecognition, loads the requested Whisper model, and then loops:
    recorded chunks are drained from a queue, appended to the audio of the
    current phrase, re-transcribed, and the whole transcript is redrawn.
    A gap longer than ``--phrase_timeout`` seconds between chunks starts a
    new transcript line. Ctrl+C ends the loop and prints the final
    transcript.

    Exits through ``parser.error`` (``SystemExit``) when, on Linux, no
    microphone name contains the ``--default_microphone`` value.
    """
    # Function-scope import: the file-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        default="medium",
        help="Model to use",
        choices=["tiny", "base", "small", "medium", "large"],
    )
    parser.add_argument(
        "--non_english", action="store_true", help="Don't use the english model."
    )
    parser.add_argument(
        "--energy_threshold",
        default=1000,
        help="Energy level for mic to detect.",
        type=int,
    )
    parser.add_argument(
        "--record_timeout",
        default=2,
        help="How real time the recording is in seconds.",
        type=float,
    )
    parser.add_argument(
        "--phrase_timeout",
        default=3,
        help="How much empty space between recordings before we "
        "consider it a new line in the transcription.",
        type=float,
    )
    if "linux" in platform:
        parser.add_argument(
            "--default_microphone",
            default="pulse",
            help="Default microphone name for SpeechRecognition. "
            "Run this with 'list' to view available Microphones.",
            type=str,
        )
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Raw audio bytes of the phrase currently being transcribed.
    last_sample = b""
    # Thread-safe queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # SpeechRecognition's Recognizer does the recording because it can detect
    # when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Keep this off: dynamic energy compensation lowers the threshold so far
    # that the recognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for Linux users: picking the wrong microphone can hang or
    # crash the application, so the device is selected by name.
    if "linux" in platform:
        mic_name = args.default_microphone
        mic_names = sr.Microphone.list_microphone_names()
        if not mic_name or mic_name == "list":
            print("Available microphone devices are: ")
            for name in mic_names:
                print(f'Microphone with name "{name}" found')
            return
        source = None
        for index, name in enumerate(mic_names):
            if mic_name in name:
                source = sr.Microphone(sample_rate=16000, device_index=index)
                break
        if source is None:
            # Previously this case left ``source`` unbound and crashed later
            # with a NameError; fail early with an actionable message instead.
            parser.error(
                f'No microphone matching "{mic_name}" was found. '
                "Run with '--default_microphone list' to view available "
                "microphones."
            )
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / download the model. Every size except "large" has an
    # English-only ".en" variant, used unless --non_english is given.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout

    transcription = [""]

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """Receive a finished recording on the background thread.

        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread-safe queue.
        data_queue.put(audio.get_raw_data())

    # Start a background thread that passes us raw audio bytes. Keep the
    # returned stop function so the listener can be shut down on exit.
    stop_listening = recorder.listen_in_background(
        source, record_callback, phrase_time_limit=record_timeout
    )

    # Create the scratch file and close it right away so it can be reopened
    # by name on every platform; it is removed in the ``finally`` below.
    with NamedTemporaryFile(suffix=".wav", delete=False) as scratch:
        temp_file = scratch.name

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    try:
        while True:
            now = datetime.now(timezone.utc)
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the
                # phrase complete and restart the working audio buffer.
                if phrase_time and now - phrase_time > timedelta(
                    seconds=phrase_timeout
                ):
                    last_sample = b""
                    phrase_complete = True
                # This is the last time we received new audio from the queue.
                phrase_time = now

                # Append everything currently queued to the phrase audio.
                chunks = []
                while not data_queue.empty():
                    chunks.append(data_queue.get())
                last_sample += b"".join(chunks)

                # Use AudioData to convert the raw samples to WAV bytes and
                # write them to the scratch file.
                audio_data = sr.AudioData(
                    last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH
                )
                with open(temp_file, "w+b") as f:
                    f.write(audio_data.get_wav_data())

                # Read the transcription.
                result = audio_model.transcribe(
                    temp_file, fp16=torch.cuda.is_available()
                )
                text = result["text"].strip()

                # After a pause between recordings, add a new item to the
                # transcription; otherwise update the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system("cls" if os.name == "nt" else "clear")
                for line in transcription:
                    print(line)
                # Flush stdout.
                print("", end="", flush=True)

            # Sleep on every pass so an empty queue does not busy-spin a core.
            sleep(0.25)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop; fall through to the summary.
        pass
    finally:
        stop_listening(wait_for_stop=False)
        try:
            os.remove(temp_file)
        except OSError:
            # Best-effort cleanup: the file may already be gone.
            pass

    print("\n\nTranscription:")
    for line in transcription:
        print(line)
176
+
177
+
178
# Script entry point: start the transcriber only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()