JanDalhuysen dwarkesh committed on
Commit
79df181
·
0 Parent(s):

Duplicate from dwarkesh/whisper-speaker-recognition

Browse files

Co-authored-by: Dwarkesh Patel <[email protected]>

Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +109 -0
  4. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Whisper Speaker Recognition
3
+ emoji: 🌖
4
+ colorFrom: gray
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.12.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: dwarkesh/whisper-speaker-recognition
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import gradio as gr
3
+ import datetime
4
+
5
+ import subprocess
6
+
7
+ import torch
8
+ import pyannote.audio
9
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
10
+
11
+ from pyannote.audio import Audio
12
+ from pyannote.core import Segment
13
+
14
+ import wave
15
+ import contextlib
16
+
17
+ from sklearn.cluster import AgglomerativeClustering
18
+ import numpy as np
19
+
20
# Whisper checkpoint used for transcription; loaded once when the module is imported.
model = whisper.load_model("large-v2")

# Speaker-embedding model; placed on the GPU when CUDA is available, otherwise on the CPU.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=(
        torch.device('cuda')
        if torch.cuda.is_available()
        else torch.device('cpu')
    ),
)
25
+
26
def transcribe(audio, num_speakers):
    """Transcribe an audio file and label each segment with a speaker.

    Args:
        audio: Path of the uploaded audio file.
        num_speakers: Requested number of speakers; rounded and clamped to
            the range ``1 .. number of segments`` before clustering.

    Returns:
        The formatted transcript, or an error message string when the file
        could not be converted or is longer than four hours.
    """
    path, error = convert_to_wav(audio)
    if error is not None:
        return error

    duration = get_duration(path)
    if duration > 4 * 60 * 60:
        return "Audio duration too long"

    segments = model.transcribe(path)["segments"]

    if len(segments) == 1:
        # A single segment cannot be clustered; label it directly.
        segments[0]['speaker'] = 'SPEAKER 1'
    elif len(segments) > 1:
        num_speakers = min(max(round(num_speakers), 1), len(segments))
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    # With no segments at all (e.g. silent audio) there is nothing to label
    # or cluster; get_output then yields an empty transcript instead of the
    # clustering call failing on an empty embedding matrix.
    return get_output(segments)
46
+
47
def convert_to_wav(path):
    """Make sure the audio at ``path`` is available as a ``.wav`` file.

    Files whose name already ends in ``.wav`` (case-insensitive) are returned
    untouched. Anything else is converted with ``ffmpeg`` into a sibling file
    that has the same base name and a ``.wav`` extension.

    Args:
        path: Path of the input audio file.

    Returns:
        A ``(path, error)`` tuple. On success ``path`` is the WAV file and
        ``error`` is ``None``. On failure ``path`` is the original input and
        ``error`` is a human-readable message.
    """
    # Local import: the module's import header does not bring in ``os``.
    import os.path

    if path.lower().endswith('.wav'):
        return path, None

    # splitext only strips a real extension of the final path component, so
    # extension-less files and directories containing dots are handled.
    new_path = os.path.splitext(path)[0] + '.wav'
    try:
        # check=True turns a non-zero ffmpeg exit status into an exception;
        # OSError covers a missing or non-executable ffmpeg binary.
        subprocess.run(
            ['ffmpeg', '-y', '-i', path, new_path],
            stdin=subprocess.DEVNULL,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return path, 'Error: Could not convert file to .wav'
    return new_path, None
56
+
57
def get_duration(path):
    """Return the length, in seconds, of the WAV file at ``path``."""
    wav_file = wave.open(path, 'r')
    try:
        frame_count = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
    finally:
        # Always release the file handle, even if reading the header fails.
        wav_file.close()
    return frame_count / float(frame_rate)
62
+
63
def make_embeddings(path, segments, duration):
    """Build one embedding row per transcript segment.

    Args:
        path: Path of the audio file the segments refer to.
        segments: Sequence of segment dicts, passed one by one to
            ``segment_embedding``.
        duration: Total audio duration in seconds, forwarded to
            ``segment_embedding``.

    Returns:
        An array of shape ``(len(segments), 192)`` in which NaN values have
        been replaced via ``np.nan_to_num``.
    """
    # 192 columns per row; presumably the embedding model's output width.
    matrix = np.zeros((len(segments), 192))
    for row, seg in enumerate(segments):
        matrix[row] = segment_embedding(path, seg, duration)
    cleaned = np.nan_to_num(matrix)
    return cleaned
68
+
69
# Shared pyannote Audio helper used to crop segments out of the audio file.
audio = Audio()


def segment_embedding(path, segment, duration):
    """Compute the speaker embedding for a single transcript segment.

    Args:
        path: Path of the audio file.
        segment: Dict with ``"start"`` and ``"end"`` timestamps.
        duration: Total audio duration, used as an upper bound for the end
            timestamp.

    Returns:
        Whatever ``embedding_model`` returns for the cropped waveform.
    """
    # Whisper overshoots the end timestamp in the last segment
    clip_end = min(duration, segment["end"])
    window = Segment(segment["start"], clip_end)
    waveform, _sample_rate = audio.crop(path, window)
    # waveform[None] adds a leading axis; presumably the batch dimension the
    # embedding model expects (TODO confirm).
    return embedding_model(waveform[None])
78
+
79
def add_speaker_labels(segments, embeddings, num_speakers):
    """Cluster the embeddings and store a speaker label on each segment.

    Args:
        segments: Segment dicts; each gets a ``"speaker"`` key set in place.
        embeddings: One embedding row per segment, in the same order.
        num_speakers: Number of clusters to form.
    """
    fitted = AgglomerativeClustering(num_speakers).fit(embeddings)
    # Cluster ids are shifted by one so the displayed numbering starts at 1.
    for segment, label in zip(segments, fitted.labels_):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)
84
+
85
def time(secs):
    """Return ``secs`` rounded to whole seconds as a ``datetime.timedelta``."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
87
+
88
def get_output(segments):
    """Render labelled segments as a speaker-grouped transcript.

    A header line ``"<speaker> <start time>"`` is written whenever the
    speaker changes (and before the first segment), followed by the texts of
    that speaker's consecutive segments separated by single spaces.

    Args:
        segments: Segment dicts with ``"speaker"``, ``"start"`` and
            ``"text"`` keys.

    Returns:
        The transcript as one string; empty when ``segments`` is empty.
    """
    parts = []
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            if i != 0:
                parts.append('\n\n')
            parts.append(
                segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
            )
        text = segment["text"]
        # Drop only an actual leading space; slicing off the first character
        # unconditionally would eat a real character when there is none.
        if text.startswith(' '):
            text = text[1:]
        parts.append(text + ' ')
    return ''.join(parts)
97
+
98
# Build the web UI: an audio upload plus a speaker count in, the transcript
# out. The interface is launched as soon as the module is executed.
gr.Interface(
    title='Whisper with Speaker Recognition',
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="upload", type="filepath"),
        gr.inputs.Number(default=2, label="Number of Speakers"),
    ],
    outputs=[gr.outputs.Textbox(label='Transcript')],
).launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/pyannote/pyannote-audio
2
+ git+https://github.com/openai/whisper.git
3
+ gradio