NitishBorthakur committed
Commit f14265e
1 Parent(s): 931ac37

Create app.py

Files changed (1): app.py +304 -0
app.py ADDED
@@ -0,0 +1,304 @@
import gradio as gr
import librosa
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from speechbrain.inference.speaker import EncoderClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from sklearn.preprocessing import normalize
import os
from cryptography.fernet import Fernet
import pickle

# --- Configuration using Environment Variables ---
encrypted_file_path = os.environ.get("SPEAKER_EMBEDDINGS_FILE")
metadata_file = os.environ.get("METADATA_FILE")
visualization_method = os.environ.get("VISUALIZATION_METHOD", "pca")
max_length = 5 * 16000  # 5 seconds of audio at a 16 kHz sampling rate
num_closest_speakers = 20  # how many nearest speakers to show in the plot
pca_dim = 50  # target PCA dimensionality used before t-SNE

# --- Check for Missing Environment Variables ---
if not encrypted_file_path:
    raise ValueError("SPEAKER_EMBEDDINGS_FILE environment variable is not set.")
if not metadata_file:
    raise ValueError("METADATA_FILE environment variable is not set.")
# --- Check for valid visualization method ---
if visualization_method not in ["pca", "tsne"]:
    raise ValueError("Invalid VISUALIZATION_METHOD. Choose 'pca' or 'tsne'.")

# --- Debugging: Check Environment Variables ---
# Report configuration state without printing the secret key itself
# (the earlier checks guarantee the paths are set at this point).
print(f"DECRYPTION_KEY set: {bool(os.getenv('DECRYPTION_KEY'))}")
print(f"SPEAKER_EMBEDDINGS_FILE: {encrypted_file_path}")
print(f"Encrypted file path exists: {os.path.exists(encrypted_file_path)}")

# --- Decryption ---
key = os.getenv("DECRYPTION_KEY")
if not key:
    raise ValueError(
        "Decryption key is missing. Ensure DECRYPTION_KEY is set in the environment variables."
    )

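# Note: Fernet expects a 32-byte url-safe base64-encoded key; one can be
# generated once, offline, with Fernet.generate_key() and stored as a
# repository secret rather than committed to code.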
fernet = Fernet(key.encode("utf-8"))

# --- Sample Audio Files ---
sample_audio_dir = "sample_audio"
sample_audio_files = [
    "Bob_Barker.mp3",
    "Howie_Mandel.m4a",
    "Katherine_Jenkins.mp3",
]

# --- Load Embeddings and Metadata ---
try:
    with open(encrypted_file_path, "rb") as encrypted_file:
        encrypted_data = encrypted_file.read()

    decrypted_data_bytes = fernet.decrypt(encrypted_data)

    # Deserialize using pickle.loads()
    speaker_embeddings = pickle.loads(decrypted_data_bytes)

    print("Speaker embeddings loaded successfully!")

except FileNotFoundError:
    raise FileNotFoundError(
        f"Could not find encrypted embeddings file at: {encrypted_file_path}"
    )
except Exception as e:
    # Chain the original exception so the underlying traceback is preserved.
    raise RuntimeError(f"Error during decryption or loading embeddings: {e}") from e

df = pd.read_csv(metadata_file, delimiter="\t")

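# The decrypted pickle is assumed to map each VoxCeleb1 speaker ID to a list
# of per-utterance x-vector embeddings; the loop below normalizes them all to
# NumPy arrays.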
# --- Convert Embeddings to NumPy Arrays ---
for spk_id, embeddings in speaker_embeddings.items():
    speaker_embeddings[spk_id] = [np.array(embedding) for embedding in embeddings]

# --- Speaker ID to Name Mapping ---
speaker_id_to_name = dict(zip(df["VoxCeleb1 ID"], df["VGGFace1 ID"]))

# --- Load SpeechBrain Classifier ---
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)

# --- Function to Calculate Average Embedding (Centroid) ---
def calculate_average_embedding(embeddings):
    avg_embedding = np.mean(embeddings, axis=0)
    return normalize(avg_embedding.reshape(1, -1)).flatten()

# --- Precompute Speaker Centroids ---
speaker_centroids = {
    spk_id: calculate_average_embedding(embeddings)
    for spk_id, embeddings in speaker_embeddings.items()
}

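# Note: each centroid is the L2-normalized mean of one speaker's utterance
# embeddings, so a single cosine comparison against a centroid approximates
# comparing the new voice against all of that speaker's utterances at once.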
# --- Function to Prepare Data for Visualization ---
def prepare_data_for_visualization(speaker_centroids, closest_speaker_ids):
    all_embeddings = [
        centroid
        for speaker_id, centroid in speaker_centroids.items()
        if speaker_id in closest_speaker_ids
    ]
    all_speaker_ids = [
        speaker_id
        for speaker_id in speaker_centroids
        if speaker_id in closest_speaker_ids
    ]
    return np.array(all_embeddings), np.array(all_speaker_ids)

# --- Function to Reduce Dimensionality ---
def reduce_dimensionality(all_embeddings, method="tsne", perplexity=5, pca_dim=50):
    if method == "pca":
        reducer = PCA(n_components=2)
    elif method == "tsne":
        # PCA components cannot exceed the number of samples or features, so
        # cap pca_dim (e.g. 20 closest speakers allow at most 20 components).
        n_components = min(pca_dim, *all_embeddings.shape)
        pca_reducer = PCA(n_components=n_components)
        all_embeddings = pca_reducer.fit_transform(all_embeddings)
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    else:
        raise ValueError("Invalid method. Choose 'pca' or 'tsne'.")
    reduced_embeddings = reducer.fit_transform(all_embeddings)
    return reducer, reduced_embeddings

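# Note: running PCA before t-SNE is a common denoising/speed step; t-SNE has
# no transform() for unseen points, which is why the visualization below
# re-fits it whenever a new voice must be placed on the map.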
# --- Function to Get Speaker Name from ID ---
def get_speaker_name(speaker_id):
    return speaker_id_to_name.get(speaker_id, f"Unknown ({speaker_id})")

# --- Function to Generate Visualization ---
def generate_visualization(
    pca_reducer,
    reduced_embeddings,
    all_embeddings,
    all_speaker_ids,
    new_embedding,
    predicted_speaker_id,
    visualization_method,
    perplexity,
    pca_dim,
):
    if visualization_method == "pca":
        new_embedding_reduced = pca_reducer.transform(new_embedding.reshape(1, -1))
    elif visualization_method == "tsne":
        # t-SNE cannot project unseen points, so re-fit on the original
        # high-dimensional embeddings together with the new voice. (The 2-D
        # t-SNE coordinates cannot be stacked with a full-dimensional
        # embedding, so the high-dimensional all_embeddings are used here.)
        combined_embeddings = np.vstack(
            [all_embeddings, new_embedding.reshape(1, -1)]
        )
        n_components = min(pca_dim, *combined_embeddings.shape)
        combined_embeddings = PCA(n_components=n_components).fit_transform(
            combined_embeddings
        )
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        combined_reduced = reducer.fit_transform(combined_embeddings)
        reduced_embeddings = combined_reduced[:-1]
        new_embedding_reduced = combined_reduced[-1].reshape(1, -1)
    else:
        raise ValueError("Invalid visualization method.")

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=reduced_embeddings[:, 0],
            y=reduced_embeddings[:, 1],
            mode="markers",
            marker=dict(color="blue", size=8, opacity=0.5),
            text=[get_speaker_name(speaker_id) for speaker_id in all_speaker_ids],
            name="Other Speakers",
        )
    )

    if predicted_speaker_id in all_speaker_ids:
        predicted_speaker_index = list(all_speaker_ids).index(predicted_speaker_id)
        fig.add_trace(
            go.Scatter(
                x=[reduced_embeddings[predicted_speaker_index, 0]],
                y=[reduced_embeddings[predicted_speaker_index, 1]],
                mode="markers",
                marker=dict(
                    color="green",
                    size=10,
                    symbol="circle",
                    line=dict(color="black", width=2),
                ),
                name=get_speaker_name(predicted_speaker_id),
                text=[get_speaker_name(predicted_speaker_id)],
            )
        )

    fig.add_trace(
        go.Scatter(
            x=new_embedding_reduced[:, 0],
            y=new_embedding_reduced[:, 1],
            mode="markers",
            marker=dict(color="red", size=12, symbol="star"),
            name="New Voice",
            text=["New Voice"],
        )
    )

    fig.update_layout(
        title=f"Dimensionality Reduction of Speaker Embeddings using {visualization_method.upper()}",
        xaxis_title="Component 1",
        yaxis_title="Component 2",
        legend=dict(x=0, y=1, traceorder="normal", orientation="h"),
        hovermode="closest",
    )
    return fig

# --- Main Function ---
def identify_voice_and_visualize_with_averaging(audio_file, perplexity=5):
    try:
        if isinstance(audio_file, str):
            signal, fs = librosa.load(audio_file, sr=16000)
        elif isinstance(audio_file, np.ndarray):
            signal = audio_file
            fs = 16000
        else:
            raise ValueError(
                "Invalid audio input. Must be a file path or a NumPy array."
            )

        # Truncate to at most 5 seconds, then zero-pad to exactly max_length
        # samples (padding with a negative length would crop implicitly, so
        # truncate explicitly first).
        signal = signal[:max_length]
        signal_tensor = torch.tensor(signal, dtype=torch.float32).unsqueeze(0)
        signal_tensor = torch.nn.functional.pad(
            signal_tensor, (0, max_length - signal_tensor.shape[1])
        )

        user_embedding = classifier.encode_batch(signal_tensor).cpu().detach().numpy()
        user_embedding = normalize(
            user_embedding.squeeze(axis=(0, 1)).reshape(1, -1)
        ).flatten()

        similarity_scores = {
            spk_id: cosine_similarity(
                user_embedding.reshape(1, -1), centroid.reshape(1, -1)
            )[0][0]
            for spk_id, centroid in speaker_centroids.items()
        }

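        # Rank all speakers by cosine similarity (both vectors are
        # L2-normalized, so this is equivalent to a dot product) and keep
        # only the top matches for the plot.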
        closest_speaker_ids = sorted(
            similarity_scores, key=similarity_scores.get, reverse=True
        )[:num_closest_speakers]
        predicted_speaker_id = closest_speaker_ids[0]
        highest_similarity = similarity_scores[predicted_speaker_id]

        all_embeddings, all_speaker_ids = prepare_data_for_visualization(
            speaker_centroids, closest_speaker_ids
        )
        reducer, reduced_embeddings = reduce_dimensionality(
            all_embeddings,
            method=visualization_method,
            perplexity=perplexity,
            pca_dim=pca_dim,
        )

        predicted_speaker_name = get_speaker_name(predicted_speaker_id)
        similarity_percentage = round(highest_similarity * 100, 2)

        visualization = generate_visualization(
            reducer,
            reduced_embeddings,
            all_embeddings,
            all_speaker_ids,
            user_embedding,
            predicted_speaker_id,
            visualization_method,
            perplexity,
            pca_dim,
        )

        result_text = (
            f"The voice resembles speaker: {predicted_speaker_name} "
            f"with a similarity of {similarity_percentage:.2f}%"
        )
        return result_text, visualization

    except Exception as e:
        return f"Error during processing: {e}", None

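# A minimal local sanity check (assuming the bundled sample files exist):
#   text, fig = identify_voice_and_visualize_with_averaging(
#       os.path.join(sample_audio_dir, sample_audio_files[0])
#   )
#   print(text)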
# --- Gradio Interface ---
# Create a directory for caching examples if it doesn't exist
# (unused while cache_examples=False, but kept for when caching is enabled).
cache_dir = "examples_cache"
os.makedirs(cache_dir, exist_ok=True)

# Define the Gradio interface
iface = gr.Interface(
    fn=identify_voice_and_visualize_with_averaging,
    inputs=gr.Audio(type="filepath", label="Input Audio"),
    outputs=["text", gr.Plot()],
    title="Discover Your Celebrity Voice Twin!",
    description=(
        "Record your voice or upload an audio file, and see your celebrity "
        "match! Not ready to record? Try our sample voices to see how it works!"
    ),
    cache_examples=False,
    examples_per_page=3,
    examples=[[os.path.join(sample_audio_dir, f)] for f in sample_audio_files],
)

# Launch the interface (share=True only matters for local runs; a hosted
# Space serves the app directly).
iface.launch(debug=True, share=True)