Thiago Hersan committed on
Commit
5c641bc
·
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ audio/plain_03.wav filter=lfs diff=lfs merge=lfs -text
2
+ audio/plain_04.wav filter=lfs diff=lfs merge=lfs -text
3
+ audio/plain_05.wav filter=lfs diff=lfs merge=lfs -text
4
+ audio/secret_01.wav filter=lfs diff=lfs merge=lfs -text
5
+ audio/secret_02.wav filter=lfs diff=lfs merge=lfs -text
6
+ audio/plain_02.wav filter=lfs diff=lfs merge=lfs -text
7
+ audio/plain_04b.wav filter=lfs diff=lfs merge=lfs -text
8
+ audio/secret_03.wav filter=lfs diff=lfs merge=lfs -text
9
+ audio/secret_04.wav filter=lfs diff=lfs merge=lfs -text
10
+ audio/plain_01.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .DS_S*
2
+ __pycache__/
3
+ gradio_cached_examples/
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: 9103H 2024F whisper-base-en-gradio
3
+ emoji: 🔊📝
4
+ colorFrom: indigo
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+
4
+ from librosa import resample
5
+ from transformers import pipeline
6
+
7
# Speech-recognition pipeline, created once when the module is imported and
# reused by every transcription request. It loads the English-only Whisper
# "base" checkpoint; chunk_length_s=30 is forwarded to the pipeline as its
# chunk length in seconds.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    chunk_length_s=30,
)
8
+
9
def transcribe(audio_in):
    """Transcribe a recording to lower-case text with the module-level ``pipe``.

    Args:
        audio_in: ``(sample_rate, samples)`` pair. ``samples`` may be a 1-D
            sequence/array of PCM values or a 2-D array, which is assumed to be
            laid out as ``(frames, channels)`` (the layout gradio documents for
            multi-channel audio -- confirm if another caller is added).
            ``None`` is accepted and treated as "no recording".

    Returns:
        The recognised text, stripped of surrounding whitespace and
        lower-cased. An empty string is returned when there is nothing to
        transcribe: no input, zero samples, or a constant (silent) signal.
    """
    if audio_in is None:
        return ""

    orig_sr, samples = audio_in

    # Work in float64 from the start. Integer PCM scalars (e.g. int16) wrap
    # around when subtracted, so ``max - min`` on the raw dtype can come out
    # negative for loud audio and flip/mis-scale the whole signal.
    samples = np.asarray(samples, dtype=np.float64)
    if samples.size == 0:
        return ""

    # Mix multi-channel audio down to mono: the normalisation and resampling
    # below both expect a single 1-D signal.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    min_s = samples.min()
    range_in = samples.max() - min_s
    if range_in == 0:
        # A constant signal carries no speech and would divide by zero below.
        return ""

    # Rescale so the quietest sample maps to -1.0 and the loudest to +1.0.
    samples_f = 2.0 * (samples - min_s) / range_in - 1.0

    # The model is fed 16 kHz audio regardless of the recording's sample rate.
    resamples = resample(samples_f, orig_sr=orig_sr, target_sr=16000)
    prediction = pipe(resamples.copy(), batch_size=8)
    return prediction["text"].strip().lower()
19
+
20
+
21
# Web UI: a Markdown header followed by a ready-made
# "audio in -> transcribe -> text out" interface.
with gr.Blocks() as demo:
    # The header is built from adjacent string literals with an explicit
    # space between "model" and "to", so the second heading stays on one
    # line without depending on how a continuation line happens to be
    # indented inside a triple-quoted string.
    gr.Markdown(
        "\n"
        "# 9103H 2024F Audio Transcription.\n"
        "## API for [whisper-base.en](https://huggingface.co/openai/whisper-base.en) english model "
        "to help check [HW03](https://github.com/DM-GY-9103-2024F-H/HW03) exercises.\n"
    )

    gr.Interface(
        transcribe,
        # type="numpy" hands transcribe the (sample_rate, samples) pair it unpacks.
        inputs=gr.Audio(type="numpy"),
        outputs="text",
        # Run the bundled example once up front and serve its cached result.
        cache_examples=True,
        examples=[["./audio/plain_01.wav"]],
    )

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
audio/plain_01.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4db5391e8429e21d7c19f05c6d551e01fe168186c91d1debb055c0305e8f84f
3
+ size 176440
audio/plain_02.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4222969e675d59440f5fec9dc7dd1fa83f4901a5b370f4fb116cdced83bfdc4f
3
+ size 453704
audio/plain_03.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58b18ff12c5ab02b3886669aebe0dfa2181006f65b8c7ae271bea60cdc0c9f19
3
+ size 308740
audio/plain_04.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6165df34e92ab966590fe5bd33b5afdae568ed2d26726bbaea142622805d0445
3
+ size 439442
audio/plain_04b.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72aad56c76fa0e1e3d0c16dd41bfbf62ecc406ed216711c16b9b2b852464b1be
3
+ size 5974316
audio/plain_05.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64aa9b09942a77e68820468e91af7ba00008ed520c03db90dad36292048feb31
3
+ size 441040
audio/secret_01.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76b42a98c06aaeb1ac65ea7cf50063a76708d4e52466dc59684eab677857ae1
3
+ size 176440
audio/secret_02.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be2e88e04a8a28b8bcb848a7c5a08f50fb5835ad0b1e5436d03eac18e16ef656
3
+ size 453704
audio/secret_03.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0555b2ef8e1c29c24cd31eb622c146d028dba3b3bc0bab5467e3325d5811d769
3
+ size 308740
audio/secret_04.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd460839bcd94d5f51f2b641fa6b557561525ecdf04ab9c18e157d8bdf99b9f8
3
+ size 5712216
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ librosa
2
+ torch
3
+ transformers
whisper.ipynb ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "import wave\n",
11
+ "\n",
12
+ "from librosa import resample\n",
13
+ "from IPython.display import Audio\n",
14
+ "from transformers import pipeline"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "def open_wave(wav_filename):\n",
24
+ " with wave.open(wav_filename, mode=\"rb\") as wav_in:\n",
25
+ " if wav_in.getsampwidth() != 2:\n",
26
+ " raise Exception(\"Input not 16-bit\")\n",
27
+ "\n",
28
+ " nchannels = wav_in.getnchannels()\n",
29
+ " nframes = wav_in.getnframes()\n",
30
+ " nsamples = nchannels * nframes\n",
31
+ " xb = wav_in.readframes(nframes)\n",
32
+ " b_np = np.frombuffer(xb, dtype=np.int16) / nchannels\n",
33
+ " samples = [int(sum(b_np[b0 : b0 + nchannels])) for b0 in range(0, nsamples, nchannels)]\n",
34
+ "\n",
35
+ " return (samples, wav_in.getframerate())"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "pipe = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base\", chunk_length_s=30)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "def transcribe(samples, orig_sr=44100, target_sr=16000):\n",
54
+ " min_s, max_s = min(samples), max(samples)\n",
55
+ " samples_f = 2.0 * (np.array(samples) - min_s) / (max_s - min_s) - 1.0\n",
56
+ " resamples = resample(samples_f, orig_sr=orig_sr, target_sr=target_sr)\n",
57
+ " prediction = pipe(resamples.copy(), batch_size=8)\n",
58
+ " return prediction[\"text\"].strip().lower()"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "samples, sr = open_wave(\"./audio/plain_01.wav\")\n",
68
+ "display(Audio(samples, rate=sr))\n",
69
+ "transcribe(samples, sr)"
70
+ ]
71
+ }
72
+ ],
73
+ "metadata": {
74
+ "kernelspec": {
75
+ "display_name": "gradio",
76
+ "language": "python",
77
+ "name": "python3"
78
+ },
79
+ "language_info": {
80
+ "codemirror_mode": {
81
+ "name": "ipython",
82
+ "version": 3
83
+ },
84
+ "file_extension": ".py",
85
+ "mimetype": "text/x-python",
86
+ "name": "python",
87
+ "nbconvert_exporter": "python",
88
+ "pygments_lexer": "ipython3",
89
+ "version": "3.9.18"
90
+ }
91
+ },
92
+ "nbformat": 4,
93
+ "nbformat_minor": 2
94
+ }