Thiago Hersan committed on
Commit · 5c641bc
0 Parent(s):
initial commit

Browse files
- .gitattributes +10 -0
- .gitignore +3 -0
- README.md +10 -0
- app.py +37 -0
- audio/plain_01.wav +3 -0
- audio/plain_02.wav +3 -0
- audio/plain_03.wav +3 -0
- audio/plain_04.wav +3 -0
- audio/plain_04b.wav +3 -0
- audio/plain_05.wav +3 -0
- audio/secret_01.wav +3 -0
- audio/secret_02.wav +3 -0
- audio/secret_03.wav +3 -0
- audio/secret_04.wav +3 -0
- requirements.txt +3 -0
- whisper.ipynb +94 -0
.gitattributes ADDED
@@ -0,0 +1,10 @@
+audio/plain_03.wav filter=lfs diff=lfs merge=lfs -text
+audio/plain_04.wav filter=lfs diff=lfs merge=lfs -text
+audio/plain_05.wav filter=lfs diff=lfs merge=lfs -text
+audio/secret_01.wav filter=lfs diff=lfs merge=lfs -text
+audio/secret_02.wav filter=lfs diff=lfs merge=lfs -text
+audio/plain_02.wav filter=lfs diff=lfs merge=lfs -text
+audio/plain_04b.wav filter=lfs diff=lfs merge=lfs -text
+audio/secret_03.wav filter=lfs diff=lfs merge=lfs -text
+audio/secret_04.wav filter=lfs diff=lfs merge=lfs -text
+audio/plain_01.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+.DS_S*
+__pycache__/
+gradio_cached_examples/
README.md ADDED
@@ -0,0 +1,10 @@
+---
+title: 9103H 2024F whisper-base-en-gradio
+emoji: ππ
+colorFrom: indigo
+colorTo: yellow
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+---
app.py ADDED
@@ -0,0 +1,37 @@
+import gradio as gr
+import numpy as np
+
+from librosa import resample
+from transformers import pipeline
+
+pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", chunk_length_s=30)
+
+def transcribe(audio_in):
+  orig_sr, samples = audio_in
+  min_s, max_s = min(samples), max(samples)
+  range_in = (max_s - min_s)
+  samples_scl = np.array(samples) / range_in
+  min_scl = min_s / range_in
+  samples_f = 2.0 * (samples_scl - min_scl) - 1.0
+  resamples = resample(samples_f, orig_sr=orig_sr, target_sr=16000)
+  prediction = pipe(resamples.copy(), batch_size=8)
+  return prediction["text"].strip().lower()
+
+
+with gr.Blocks() as demo:
+  gr.Markdown("""
+  # 9103H 2024F Audio Transcription.
+  ## API for [whisper-base.en](https://huggingface.co/openai/whisper-base.en) english model\
+  to help check [HW03](https://github.com/DM-GY-9103-2024F-H/HW03) exercises.
+  """)
+
+  gr.Interface(
+    transcribe,
+    inputs=gr.Audio(type="numpy"),
+    outputs="text",
+    cache_examples=True,
+    examples=[["./audio/plain_01.wav"]]
+  )
+
+if __name__ == "__main__":
+  demo.launch()
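Because `transcribe` is wrapped in a `gr.Interface`, the Space can also be queried programmatically once deployed. A minimal sketch using `gradio_client`; the Space id below is hypothetical (substitute the real one), and `/predict` is the default endpoint name a `gr.Interface` registers:

```python
# pip install gradio_client
from gradio_client import Client, handle_file

# hypothetical Space id -- substitute the actual one
client = Client("thiagohersan/9103H-2024F-whisper-base-en-gradio")

# upload a local wav file and get back the lowercased transcription
result = client.predict(handle_file("./audio/plain_01.wav"), api_name="/predict")
print(result)
```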
audio/plain_01.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4db5391e8429e21d7c19f05c6d551e01fe168186c91d1debb055c0305e8f84f
+size 176440

audio/plain_02.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4222969e675d59440f5fec9dc7dd1fa83f4901a5b370f4fb116cdced83bfdc4f
+size 453704

audio/plain_03.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58b18ff12c5ab02b3886669aebe0dfa2181006f65b8c7ae271bea60cdc0c9f19
+size 308740

audio/plain_04.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6165df34e92ab966590fe5bd33b5afdae568ed2d26726bbaea142622805d0445
+size 439442

audio/plain_04b.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72aad56c76fa0e1e3d0c16dd41bfbf62ecc406ed216711c16b9b2b852464b1be
+size 5974316

audio/plain_05.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64aa9b09942a77e68820468e91af7ba00008ed520c03db90dad36292048feb31
+size 441040

audio/secret_01.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b76b42a98c06aaeb1ac65ea7cf50063a76708d4e52466dc59684eab677857ae1
+size 176440

audio/secret_02.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be2e88e04a8a28b8bcb848a7c5a08f50fb5835ad0b1e5436d03eac18e16ef656
+size 453704

audio/secret_03.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0555b2ef8e1c29c24cd31eb622c146d028dba3b3bc0bab5467e3325d5811d769
+size 308740

audio/secret_04.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd460839bcd94d5f51f2b641fa6b557561525ecdf04ab9c18e157d8bdf99b9f8
+size 5712216
requirements.txt ADDED
@@ -0,0 +1,3 @@
+librosa
+torch
+transformers
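Note that `gradio` itself is not pinned here: on Spaces it is installed from the `sdk` / `sdk_version` fields in README.md. For a local run, a rough smoke test of the pinned stack might look like the sketch below (this file is not part of the repo; it assumes ffmpeg is available, which the transformers pipeline uses to decode audio passed by path):

```python
# pip install librosa torch transformers gradio
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
print(pipe("./audio/plain_01.wav")["text"])  # decoding by path requires ffmpeg
```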
whisper.ipynb ADDED
@@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import wave\n",
+    "\n",
+    "from librosa import resample\n",
+    "from IPython.display import Audio\n",
+    "from transformers import pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def open_wave(wav_filename):\n",
+    "  with wave.open(wav_filename, mode=\"rb\") as wav_in:\n",
+    "    if wav_in.getsampwidth() != 2:\n",
+    "      raise Exception(\"Input not 16-bit\")\n",
+    "\n",
+    "    nchannels = wav_in.getnchannels()\n",
+    "    nframes = wav_in.getnframes()\n",
+    "    nsamples = nchannels * nframes\n",
+    "    xb = wav_in.readframes(nframes)\n",
+    "    b_np = np.frombuffer(xb, dtype=np.int16) / nchannels\n",
+    "    samples = [int(sum(b_np[b0 : b0 + nchannels])) for b0 in range(0, nsamples, nchannels)]\n",
+    "\n",
+    "    return (samples, wav_in.getframerate())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base\", chunk_length_s=30)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def transcribe(samples, orig_sr=44100, target_sr=16000):\n",
+    "  min_s, max_s = min(samples), max(samples)\n",
+    "  samples_f = 2.0 * (np.array(samples) - min_s) / (max_s - min_s) - 1.0\n",
+    "  resamples = resample(samples_f, orig_sr=orig_sr, target_sr=target_sr)\n",
+    "  prediction = pipe(resamples.copy(), batch_size=8)\n",
+    "  return prediction[\"text\"].strip().lower()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "samples, sr = open_wave(\"./audio/plain_01.wav\")\n",
+    "display(Audio(samples, rate=sr))\n",
+    "transcribe(samples, sr)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gradio",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
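The normalization step shared by both `transcribe` implementations is a plain min-max rescale that maps the integer samples onto the `[-1.0, 1.0]` float range the Whisper pipeline expects. A quick numeric check of that formula; the sample values here are made up for illustration:

```python
import numpy as np

# toy 16-bit-style sample values (illustrative only)
samples = [-32768, -16384, 0, 16384, 32767]
min_s, max_s = min(samples), max(samples)

# same scaling as in transcribe(): linear map of [min_s, max_s] onto [-1.0, 1.0]
samples_f = 2.0 * (np.array(samples) - min_s) / (max_s - min_s) - 1.0

print(samples_f.min(), samples_f.max())  # -1.0 1.0
```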