keithhon committed
Commit 101845c · 1 Parent(s): 1955797

Upload toolbox/ui.py with huggingface_hub

Files changed (1)
  1. toolbox/ui.py +611 -0
toolbox/ui.py ADDED
@@ -0,0 +1,611 @@
+ import matplotlib.pyplot as plt
+ from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
+ from matplotlib.figure import Figure
+ from PyQt5.QtCore import Qt, QStringListModel
+ from PyQt5.QtWidgets import *
+ from encoder.inference import plot_embedding_as_heatmap
+ from toolbox.utterance import Utterance
+ from pathlib import Path
+ from typing import List, Set
+ import sounddevice as sd
+ import soundfile as sf
+ import numpy as np
+ # from sklearn.manifold import TSNE  # You can try with TSNE if you like, I prefer UMAP
+ from time import sleep
+ import umap
+ import sys
+ from warnings import filterwarnings, warn
+ filterwarnings("ignore")
+
+
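+ # Distinct RGB colors (scaled to [0, 1]); each speaker in the plots is assigned one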
+ colormap = np.array([
+     [0, 127, 70],
+     [255, 0, 0],
+     [255, 217, 38],
+     [0, 135, 255],
+     [165, 0, 165],
+     [255, 167, 255],
+     [97, 142, 151],
+     [0, 255, 255],
+     [255, 96, 38],
+     [142, 76, 0],
+     [33, 0, 127],
+     [0, 0, 0],
+     [183, 183, 183],
+     [76, 255, 0],
+ ], dtype=float) / 255  # np.float was removed in NumPy 1.24; the builtin float is equivalent
+
+ default_text = \
+     "Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \
+     "yourself.\nOnce its embedding has been created, you can synthesize any text written here.\n" \
+     "The synthesizer expects to generate " \
+     "outputs that are somewhere between 5 and 12 seconds.\nTo mark breaks, write a new line. " \
+     "Each line will be treated separately.\nThen, they are joined together to make the final " \
+     "spectrogram. Use the vocoder to generate audio.\nThe vocoder generates almost in constant " \
+     "time, so it will be more time efficient for longer inputs like this one.\nOn the left you " \
+     "have the embedding projections. Load or record more utterances to see them.\nIf you have " \
+     "at least 2 or 3 utterances from the same speaker, a cluster should form.\nSynthesized " \
+     "utterances are of the same color as the speaker whose voice was used, but they're " \
+     "represented with a cross."
+
+
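+ # Main window of the SV2TTS toolbox. It owns the QApplication, the dataset browser,
+ # the embedding/spectrogram plots, the UMAP projection view and the generation controls.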
+ class UI(QDialog):
+     min_umap_points = 4
+     max_log_lines = 5
+     max_saved_utterances = 20
+
+     def draw_utterance(self, utterance: Utterance, which):
+         self.draw_spec(utterance.spec, which)
+         self.draw_embed(utterance.embed, utterance.name, which)
+
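+     # Draws the embedding heatmap on the "current" or "generated" axes;
+     # passing embed=None clears the plot instead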
+     def draw_embed(self, embed, name, which):
+         embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
+         embed_ax.figure.suptitle("" if embed is None else name)
+
+         ## Embedding
+         # Clear the plot
+         if len(embed_ax.images) > 0:
+             embed_ax.images[0].colorbar.remove()
+         embed_ax.clear()
+
+         # Draw the embed
+         if embed is not None:
+             plot_embedding_as_heatmap(embed, embed_ax)
+             embed_ax.set_title("embedding")
+         embed_ax.set_aspect("equal", "datalim")
+         embed_ax.set_xticks([])
+         embed_ax.set_yticks([])
+         embed_ax.figure.canvas.draw()
+
+     def draw_spec(self, spec, which):
+         _, spec_ax = self.current_ax if which == "current" else self.gen_ax
+
+         ## Spectrogram
+         # Draw the spectrogram
+         spec_ax.clear()
+         if spec is not None:
+             im = spec_ax.imshow(spec, aspect="auto", interpolation="none")
+             # spec_ax.figure.colorbar(mappable=im, shrink=0.65, orientation="horizontal",
+             #                         spec_ax=spec_ax)
+             spec_ax.set_title("mel spectrogram")
+
+         spec_ax.set_xticks([])
+         spec_ax.set_yticks([])
+         spec_ax.figure.canvas.draw()
+         if which != "current":
+             self.vocode_button.setDisabled(spec is None)
+
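+     # Projects all loaded embeddings to 2D with UMAP and scatters them, one color per
+     # speaker; synthesized utterances (names containing "_gen_") are drawn as crosses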
+     def draw_umap_projections(self, utterances: Set[Utterance]):
+         self.umap_ax.clear()
+
+         speakers = np.unique([u.speaker_name for u in utterances])
+         colors = {speaker_name: colormap[i] for i, speaker_name in enumerate(speakers)}
+         embeds = [u.embed for u in utterances]
+
+         # Display a message if there aren't enough points
+         if len(utterances) < self.min_umap_points:
+             self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" %
+                               (self.min_umap_points - len(utterances)),
+                               horizontalalignment='center', fontsize=15)
+             self.umap_ax.set_title("")
+
+         # Compute the projections
+         else:
+             if not self.umap_hot:
+                 self.log(
+                     "Drawing UMAP projections for the first time, this will take a few seconds.")
+                 self.umap_hot = True
+
+             reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine")
+             # reducer = TSNE()
+             projections = reducer.fit_transform(embeds)
+
+             speakers_done = set()
+             for projection, utterance in zip(projections, utterances):
+                 color = colors[utterance.speaker_name]
+                 mark = "x" if "_gen_" in utterance.name else "o"
+                 label = None if utterance.speaker_name in speakers_done else utterance.speaker_name
+                 speakers_done.add(utterance.speaker_name)
+                 self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark,
+                                      label=label)
+             # self.umap_ax.set_title("UMAP projections")
+             self.umap_ax.legend(prop={'size': 10})
+
+         # Draw the plot
+         self.umap_ax.set_aspect("equal", "datalim")
+         self.umap_ax.set_xticks([])
+         self.umap_ax.set_yticks([])
+         self.umap_ax.figure.canvas.draw()
+
+     def save_audio_file(self, wav, sample_rate):
+         dialog = QFileDialog()
+         dialog.setDefaultSuffix(".wav")
+         fpath, _ = dialog.getSaveFileName(
+             parent=self,
+             caption="Select a path to save the audio file",
+             filter="Audio Files (*.flac *.wav)"
+         )
+         if fpath:
+             # Default format is wav
+             if Path(fpath).suffix == "":
+                 fpath += ".wav"
+             sf.write(fpath, wav, sample_rate)
+
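+     # Probes every device known to sounddevice and keeps those that support the given
+     # sample rate; the first valid input device becomes the recording device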
+     def setup_audio_devices(self, sample_rate):
+         input_devices = []
+         output_devices = []
+         for device in sd.query_devices():
+             # Check if valid input
+             try:
+                 sd.check_input_settings(device=device["name"], samplerate=sample_rate)
+                 input_devices.append(device["name"])
+             except Exception:
+                 pass
+
+             # Check if valid output
+             try:
+                 sd.check_output_settings(device=device["name"], samplerate=sample_rate)
+                 output_devices.append(device["name"])
+             except Exception as e:
+                 # Log a warning only if the device is not an input
+                 if not device["name"] in input_devices:
+                     warn("Unsupported output device %s for the sample rate: %d \nError: %s"
+                          % (device["name"], sample_rate, str(e)))
+
+         if len(input_devices) == 0:
+             self.log("No audio input device detected. Recording may not work.")
+             self.audio_in_device = None
+         else:
+             self.audio_in_device = input_devices[0]
+
+         if len(output_devices) == 0:
+             self.log("No supported output audio devices were found! Audio output may not work.")
+             self.audio_out_devices_cb.addItems(["None"])
+             self.audio_out_devices_cb.setDisabled(True)
+         else:
+             self.audio_out_devices_cb.clear()
+             self.audio_out_devices_cb.addItems(output_devices)
+             self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device)
+
+         self.set_audio_device()
+
+     def set_audio_device(self):
+         output_device = self.audio_out_devices_cb.currentText()
+         if output_device == "None":
+             output_device = None
+
+         # If None, sounddevice queries portaudio
+         sd.default.device = (self.audio_in_device, output_device)
+
+     def play(self, wav, sample_rate):
+         try:
+             sd.stop()
+             sd.play(wav, sample_rate)
+         except Exception as e:
+             print(e)
+             self.log("Error in audio playback. Try selecting a different audio output device.")
+             self.log("Your device must be connected before you start the toolbox.")
+
+     def stop(self):
+         sd.stop()
+
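+     # Records `duration` seconds of mono audio from the default input device, updating
+     # the loading bar while the non-blocking recording runs, then waits for it to finish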
+     def record_one(self, sample_rate, duration):
+         self.record_button.setText("Recording...")
+         self.record_button.setDisabled(True)
+
+         self.log("Recording %d seconds of audio" % duration)
+         sd.stop()
+         try:
+             # sd.rec expects an integer frame count
+             wav = sd.rec(int(duration * sample_rate), sample_rate, 1)
+         except Exception as e:
+             print(e)
+             self.log("Could not record anything. Is your recording device enabled?")
+             self.log("Your device must be connected before you start the toolbox.")
+             return None
+
+         for i in np.arange(0, duration, 0.1):
+             self.set_loading(i, duration)
+             sleep(0.1)
+         self.set_loading(duration, duration)
+         sd.wait()
+
+         self.log("Done recording.")
+         self.record_button.setText("Record")
+         self.record_button.setDisabled(False)
+
+         return wav.squeeze()
+
+     @property
+     def current_dataset_name(self):
+         return self.dataset_box.currentText()
+
+     @property
+     def current_speaker_name(self):
+         return self.speaker_box.currentText()
+
+     @property
+     def current_utterance_name(self):
+         return self.utterance_box.currentText()
+
+     def browse_file(self):
+         fpath = QFileDialog().getOpenFileName(
+             parent=self,
+             caption="Select an audio file",
+             filter="Audio Files (*.mp3 *.flac *.wav *.m4a)"
+         )
+         return Path(fpath[0]) if fpath[0] != "" else ""
+
+     @staticmethod
+     def repopulate_box(box, items, random=False):
+         """
+         Resets a box and adds a list of items. Pass a list of (item, data) pairs instead
+         to attach data to the items.
+         """
+         box.blockSignals(True)
+         box.clear()
+         for item in items:
+             item = list(item) if isinstance(item, tuple) else [item]
+             box.addItem(str(item[0]), *item[1:])
+         if len(items) > 0:
+             box.setCurrentIndex(np.random.randint(len(items)) if random else 0)
+         box.setDisabled(len(items) == 0)
+         box.blockSignals(False)
+
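+     # Refreshes the dataset/speaker/utterance boxes from `level` downwards (0: datasets,
+     # 1: speakers, 2: utterances), disabling the browser when no recognized dataset exists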
+     def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int,
+                          random=True):
+         # Select a random dataset
+         if level <= 0:
+             if datasets_root is not None:
+                 datasets = [datasets_root.joinpath(d) for d in recognized_datasets]
+                 datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
+                 self.browser_load_button.setDisabled(len(datasets) == 0)
+             if datasets_root is None or len(datasets) == 0:
+                 msg = "Warning: you d" + ("id not pass a root directory for datasets as argument"
+                                           if datasets_root is None else
+                                           "o not have any of the recognized datasets"
+                                           " in %s" % datasets_root)
+                 self.log(msg)
+                 msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
+                        "can still use the toolbox by recording samples yourself." % \
+                        ("\n\t".join(recognized_datasets))
+                 print(msg, file=sys.stderr)
+
+                 self.random_utterance_button.setDisabled(True)
+                 self.random_speaker_button.setDisabled(True)
+                 self.random_dataset_button.setDisabled(True)
+                 self.utterance_box.setDisabled(True)
+                 self.speaker_box.setDisabled(True)
+                 self.dataset_box.setDisabled(True)
+                 self.browser_load_button.setDisabled(True)
+                 self.auto_next_checkbox.setDisabled(True)
+                 return
+             self.repopulate_box(self.dataset_box, datasets, random)
+
+         # Select a random speaker
+         if level <= 1:
+             speakers_root = datasets_root.joinpath(self.current_dataset_name)
+             speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()]
+             self.repopulate_box(self.speaker_box, speaker_names, random)
+
+         # Select a random utterance
+         if level <= 2:
+             utterances_root = datasets_root.joinpath(
+                 self.current_dataset_name,
+                 self.current_speaker_name
+             )
+             utterances = []
+             for extension in ['mp3', 'flac', 'wav', 'm4a']:
+                 utterances.extend(Path(utterances_root).glob("**/*.%s" % extension))
+             utterances = [fpath.relative_to(utterances_root) for fpath in utterances]
+             self.repopulate_box(self.utterance_box, utterances, random)
+
+     def browser_select_next(self):
+         index = (self.utterance_box.currentIndex() + 1) % len(self.utterance_box)
+         self.utterance_box.setCurrentIndex(index)
+
+     @property
+     def current_encoder_fpath(self):
+         return self.encoder_box.itemData(self.encoder_box.currentIndex())
+
+     @property
+     def current_synthesizer_fpath(self):
+         return self.synthesizer_box.itemData(self.synthesizer_box.currentIndex())
+
+     @property
+     def current_vocoder_fpath(self):
+         return self.vocoder_box.itemData(self.vocoder_box.currentIndex())
+
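+     # Fills the model boxes with the *.pt checkpoints found in each models directory;
+     # Griffin-Lim is always appended as a parameter-free fallback vocoder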
+     def populate_models(self, encoder_models_dir: Path, synthesizer_models_dir: Path,
+                         vocoder_models_dir: Path):
+         # Encoder
+         encoder_fpaths = list(encoder_models_dir.glob("*.pt"))
+         if len(encoder_fpaths) == 0:
+             raise Exception("No encoder models found in %s" % encoder_models_dir)
+         self.repopulate_box(self.encoder_box, [(f.stem, f) for f in encoder_fpaths])
+
+         # Synthesizer
+         synthesizer_fpaths = list(synthesizer_models_dir.glob("**/*.pt"))
+         if len(synthesizer_fpaths) == 0:
+             raise Exception("No synthesizer models found in %s" % synthesizer_models_dir)
+         self.repopulate_box(self.synthesizer_box, [(f.stem, f) for f in synthesizer_fpaths])
+
+         # Vocoder
+         vocoder_fpaths = list(vocoder_models_dir.glob("**/*.pt"))
+         vocoder_items = [(f.stem, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)]
+         self.repopulate_box(self.vocoder_box, vocoder_items)
+
+     @property
+     def selected_utterance(self):
+         return self.utterance_history.itemData(self.utterance_history.currentIndex())
+
+     def register_utterance(self, utterance: Utterance):
+         self.utterance_history.blockSignals(True)
+         self.utterance_history.insertItem(0, utterance.name, utterance)
+         self.utterance_history.setCurrentIndex(0)
+         self.utterance_history.blockSignals(False)
+
+         if len(self.utterance_history) > self.max_saved_utterances:
+             self.utterance_history.removeItem(self.max_saved_utterances)
+
+         self.play_button.setDisabled(False)
+         self.generate_button.setDisabled(False)
+         self.synthesize_button.setDisabled(False)
+
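+     # Maintains a rolling log capped at max_log_lines entries and mirrors it into the
+     # log window; "append" and "overwrite" modes edit the last line instead of adding one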
+     def log(self, line, mode="newline"):
+         if mode == "newline":
+             self.logs.append(line)
+             if len(self.logs) > self.max_log_lines:
+                 del self.logs[0]
+         elif mode == "append":
+             self.logs[-1] += line
+         elif mode == "overwrite":
+             self.logs[-1] = line
+         log_text = '\n'.join(self.logs)
+
+         self.log_window.setText(log_text)
+         self.app.processEvents()
+
+     def set_loading(self, value, maximum=1):
+         # Qt progress bars take ints; value may be a float fraction of `maximum`
+         self.loading_bar.setValue(int(value * 100))
+         self.loading_bar.setMaximum(int(maximum * 100))
+         self.loading_bar.setTextVisible(value != 0)
+         self.app.processEvents()
+
+     def populate_gen_options(self, seed, trim_silences):
+         if seed is not None:
+             self.random_seed_checkbox.setChecked(True)
+             self.seed_textbox.setText(str(seed))
+             self.seed_textbox.setEnabled(True)
+         else:
+             self.random_seed_checkbox.setChecked(False)
+             self.seed_textbox.setText(str(0))
+             self.seed_textbox.setEnabled(False)
+
+         if not trim_silences:
+             self.trim_silences_checkbox.setChecked(False)
+             self.trim_silences_checkbox.setDisabled(True)
+
+     def update_seed_textbox(self):
+         # The seed box is only editable when a fixed seed is requested
+         self.seed_textbox.setEnabled(self.random_seed_checkbox.isChecked())
+
+     def reset_interface(self):
+         self.draw_embed(None, None, "current")
+         self.draw_embed(None, None, "generated")
+         self.draw_spec(None, "current")
+         self.draw_spec(None, "generated")
+         self.draw_umap_projections(set())
+         self.set_loading(0)
+         self.play_button.setDisabled(True)
+         self.generate_button.setDisabled(True)
+         self.synthesize_button.setDisabled(True)
+         self.vocode_button.setDisabled(True)
+         self.replay_wav_button.setDisabled(True)
+         self.export_wav_button.setDisabled(True)
+         for _ in range(self.max_log_lines):
+             self.log("")
+
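+     # Builds the interface: a grid whose top row holds the browser (left) and the
+     # generation controls (right), and whose bottom row holds the UMAP projections
+     # (left) and the embedding/spectrogram visualizations (right)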
+     def __init__(self):
+         ## Initialize the application
+         self.app = QApplication(sys.argv)
+         super().__init__(None)
+         self.setWindowTitle("SV2TTS toolbox")
+
+
+         ## Main layouts
+         # Root
+         root_layout = QGridLayout()
+         self.setLayout(root_layout)
+
+         # Browser
+         browser_layout = QGridLayout()
+         root_layout.addLayout(browser_layout, 0, 0, 1, 2)
+
+         # Generation
+         gen_layout = QVBoxLayout()
+         root_layout.addLayout(gen_layout, 0, 2, 1, 2)
+
+         # Projections
+         self.projections_layout = QVBoxLayout()
+         root_layout.addLayout(self.projections_layout, 1, 0, 1, 1)
+
+         # Visualizations
+         vis_layout = QVBoxLayout()
+         root_layout.addLayout(vis_layout, 1, 1, 1, 3)
+
+
+         ## Projections
+         # UMap
+         fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0")
+         fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98)
+         self.projections_layout.addWidget(FigureCanvas(fig))
+         self.umap_hot = False
+         self.clear_button = QPushButton("Clear")
+         self.projections_layout.addWidget(self.clear_button)
+
+
+         ## Browser
+         # Dataset, speaker and utterance selection
+         i = 0
+         self.dataset_box = QComboBox()
+         browser_layout.addWidget(QLabel("<b>Dataset</b>"), i, 0)
+         browser_layout.addWidget(self.dataset_box, i + 1, 0)
+         self.speaker_box = QComboBox()
+         browser_layout.addWidget(QLabel("<b>Speaker</b>"), i, 1)
+         browser_layout.addWidget(self.speaker_box, i + 1, 1)
+         self.utterance_box = QComboBox()
+         browser_layout.addWidget(QLabel("<b>Utterance</b>"), i, 2)
+         browser_layout.addWidget(self.utterance_box, i + 1, 2)
+         self.browser_load_button = QPushButton("Load")
+         browser_layout.addWidget(self.browser_load_button, i + 1, 3)
+         i += 2
+
+         # Random buttons
+         self.random_dataset_button = QPushButton("Random")
+         browser_layout.addWidget(self.random_dataset_button, i, 0)
+         self.random_speaker_button = QPushButton("Random")
+         browser_layout.addWidget(self.random_speaker_button, i, 1)
+         self.random_utterance_button = QPushButton("Random")
+         browser_layout.addWidget(self.random_utterance_button, i, 2)
+         self.auto_next_checkbox = QCheckBox("Auto select next")
+         self.auto_next_checkbox.setChecked(True)
+         browser_layout.addWidget(self.auto_next_checkbox, i, 3)
+         i += 1
+
+         # Utterance box
+         browser_layout.addWidget(QLabel("<b>Use embedding from:</b>"), i, 0)
+         self.utterance_history = QComboBox()
+         browser_layout.addWidget(self.utterance_history, i, 1, 1, 3)
+         i += 1
+
+         # Browse, record and playback buttons
+         self.browser_browse_button = QPushButton("Browse")
+         browser_layout.addWidget(self.browser_browse_button, i, 0)
+         self.record_button = QPushButton("Record")
+         browser_layout.addWidget(self.record_button, i, 1)
+         self.play_button = QPushButton("Play")
+         browser_layout.addWidget(self.play_button, i, 2)
+         self.stop_button = QPushButton("Stop")
+         browser_layout.addWidget(self.stop_button, i, 3)
+         i += 1
+
+
+         # Model and audio output selection
+         self.encoder_box = QComboBox()
+         browser_layout.addWidget(QLabel("<b>Encoder</b>"), i, 0)
+         browser_layout.addWidget(self.encoder_box, i + 1, 0)
+         self.synthesizer_box = QComboBox()
+         browser_layout.addWidget(QLabel("<b>Synthesizer</b>"), i, 1)
+         browser_layout.addWidget(self.synthesizer_box, i + 1, 1)
+         self.vocoder_box = QComboBox()
+         browser_layout.addWidget(QLabel("<b>Vocoder</b>"), i, 2)
+         browser_layout.addWidget(self.vocoder_box, i + 1, 2)
+
+         self.audio_out_devices_cb = QComboBox()
+         browser_layout.addWidget(QLabel("<b>Audio Output</b>"), i, 3)
+         browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 3)
+         i += 2
+
+         # Replay & save audio
+         browser_layout.addWidget(QLabel("<b>Toolbox Output:</b>"), i, 0)
+         self.waves_cb = QComboBox()
+         self.waves_cb_model = QStringListModel()
+         self.waves_cb.setModel(self.waves_cb_model)
+         self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting")
+         browser_layout.addWidget(self.waves_cb, i, 1)
+         self.replay_wav_button = QPushButton("Replay")
+         self.replay_wav_button.setToolTip("Replay the last generated vocoder output")
+         browser_layout.addWidget(self.replay_wav_button, i, 2)
+         self.export_wav_button = QPushButton("Export")
+         self.export_wav_button.setToolTip("Save the last generated vocoder output to disk as a wav file")
+         browser_layout.addWidget(self.export_wav_button, i, 3)
+         i += 1
+
+
+         ## Embed & spectrograms
+         vis_layout.addStretch()
+
+         gridspec_kw = {"width_ratios": [1, 4]}
+         fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
+                                             gridspec_kw=gridspec_kw)
+         fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
+         vis_layout.addWidget(FigureCanvas(fig))
+
+         fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
+                                         gridspec_kw=gridspec_kw)
+         fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
+         vis_layout.addWidget(FigureCanvas(fig))
+
+         for ax in self.current_ax.tolist() + self.gen_ax.tolist():
+             ax.set_facecolor("#F0F0F0")
+             for side in ["top", "right", "bottom", "left"]:
+                 ax.spines[side].set_visible(False)
+
+
+         ## Generation
+         self.text_prompt = QPlainTextEdit(default_text)
+         gen_layout.addWidget(self.text_prompt, stretch=1)
+
+         self.generate_button = QPushButton("Synthesize and vocode")
+         gen_layout.addWidget(self.generate_button)
+
+         layout = QHBoxLayout()
+         self.synthesize_button = QPushButton("Synthesize only")
+         layout.addWidget(self.synthesize_button)
+         self.vocode_button = QPushButton("Vocode only")
+         layout.addWidget(self.vocode_button)
+         gen_layout.addLayout(layout)
+
+         layout_seed = QGridLayout()
+         self.random_seed_checkbox = QCheckBox("Random seed:")
+         self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
+         layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
+         self.seed_textbox = QLineEdit()
+         self.seed_textbox.setMaximumWidth(80)
+         layout_seed.addWidget(self.seed_textbox, 0, 1)
+         self.trim_silences_checkbox = QCheckBox("Enhance vocoder output")
+         self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
+                                                " This feature requires `webrtcvad` to be installed.")
+         layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
+         gen_layout.addLayout(layout_seed)
+
+         self.loading_bar = QProgressBar()
+         gen_layout.addWidget(self.loading_bar)
+
+         self.log_window = QLabel()
+         self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft)
+         gen_layout.addWidget(self.log_window)
+         self.logs = []
+         gen_layout.addStretch()
+
+
+         ## Set the size of the window and of the elements
+         max_size = QDesktopWidget().availableGeometry(self).size() * 0.8
+         self.resize(max_size)
+
+         ## Finalize the display
+         self.reset_interface()
+         self.show()
+
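+     # Hands control to Qt: exec_() blocks until the window is closed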
+     def start(self):
+         self.app.exec_()
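
A minimal driver sketch (an assumption for illustration, not part of this commit; the real toolbox wires the UI's widgets to the encoder, synthesizer and vocoder before starting the loop):

    from toolbox.ui import UI

    ui = UI()    # constructs the QApplication, builds and shows the window
    ui.start()   # blocks in QApplication.exec_() until the window is closed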