Upload toolbox/ui.py with huggingface_hub
toolbox/ui.py +611 -0
toolbox/ui.py
ADDED
@@ -0,0 +1,611 @@
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from PyQt5.QtCore import Qt, QStringListModel
from PyQt5.QtWidgets import *
from encoder.inference import plot_embedding_as_heatmap
from toolbox.utterance import Utterance
from pathlib import Path
from typing import List, Set
import sounddevice as sd
import soundfile as sf
import numpy as np
# from sklearn.manifold import TSNE  # You can try with TSNE if you like, I prefer UMAP
from time import sleep
import umap
import sys
from warnings import filterwarnings, warn
filterwarnings("ignore")

colormap = np.array([
    [0, 127, 70],
    [255, 0, 0],
    [255, 217, 38],
    [0, 135, 255],
    [165, 0, 165],
    [255, 167, 255],
    [97, 142, 151],
    [0, 255, 255],
    [255, 96, 38],
    [142, 76, 0],
    [33, 0, 127],
    [0, 0, 0],
    [183, 183, 183],
    [76, 255, 0],
], dtype=float) / 255  # builtin float: np.float was removed in recent NumPy releases
default_text = \
    "Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \
    "yourself.\nOnce its embedding has been created, you can synthesize any text written here.\n" \
    "The synthesizer expects to generate " \
    "outputs that are somewhere between 5 and 12 seconds.\nTo mark breaks, write a new line. " \
    "Each line will be treated separately.\nThen, they are joined together to make the final " \
    "spectrogram. Use the vocoder to generate audio.\nThe vocoder generates almost in constant " \
    "time, so it will be more time efficient for longer inputs like this one.\nOn the left you " \
    "have the embedding projections. Load or record more utterances to see them.\nIf you have " \
    "at least 2 or 3 utterances from the same speaker, a cluster should form.\nSynthesized " \
    "utterances are of the same color as the speaker whose voice was used, but they're " \
    "represented with a cross."
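# The main window of the SV2TTS toolbox: a dataset/utterance browser, embedding and
# mel-spectrogram plots, UMAP projections of the embeddings, and text-to-speech generation
# controls. This class only builds the widgets; most button callbacks are connected elsewhere.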
class UI(QDialog):
    min_umap_points = 4
    max_log_lines = 5
    max_saved_utterances = 20

    def draw_utterance(self, utterance: Utterance, which):
        self.draw_spec(utterance.spec, which)
        self.draw_embed(utterance.embed, utterance.name, which)

    def draw_embed(self, embed, name, which):
        embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
        embed_ax.figure.suptitle("" if embed is None else name)

        ## Embedding
        # Clear the plot
        if len(embed_ax.images) > 0:
            embed_ax.images[0].colorbar.remove()
        embed_ax.clear()

        # Draw the embed
        if embed is not None:
            plot_embedding_as_heatmap(embed, embed_ax)
            embed_ax.set_title("embedding")
        embed_ax.set_aspect("equal", "datalim")
        embed_ax.set_xticks([])
        embed_ax.set_yticks([])
        embed_ax.figure.canvas.draw()

    def draw_spec(self, spec, which):
        _, spec_ax = self.current_ax if which == "current" else self.gen_ax

        ## Spectrogram
        # Draw the spectrogram
        spec_ax.clear()
        if spec is not None:
            im = spec_ax.imshow(spec, aspect="auto", interpolation="none")
            # spec_ax.figure.colorbar(mappable=im, shrink=0.65, orientation="horizontal",
            #                         spec_ax=spec_ax)
            spec_ax.set_title("mel spectrogram")

        spec_ax.set_xticks([])
        spec_ax.set_yticks([])
        spec_ax.figure.canvas.draw()
        if which != "current":
            self.vocode_button.setDisabled(spec is None)
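    # Project all known utterance embeddings to 2-D with UMAP so that utterances from the same
    # speaker form visible clusters; synthesized utterances reuse the speaker's color but are
    # drawn with a cross marker.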
    def draw_umap_projections(self, utterances: Set[Utterance]):
        self.umap_ax.clear()

        speakers = np.unique([u.speaker_name for u in utterances])
        colors = {speaker_name: colormap[i] for i, speaker_name in enumerate(speakers)}
        embeds = [u.embed for u in utterances]

        # Display a message if there aren't enough points
        if len(utterances) < self.min_umap_points:
            self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" %
                              (self.min_umap_points - len(utterances)),
                              horizontalalignment='center', fontsize=15)
            self.umap_ax.set_title("")

        # Compute the projections
        else:
            if not self.umap_hot:
                self.log(
                    "Drawing UMAP projections for the first time, this will take a few seconds.")
                self.umap_hot = True

            reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine")
            # reducer = TSNE()
            projections = reducer.fit_transform(embeds)

            speakers_done = set()
            for projection, utterance in zip(projections, utterances):
                color = colors[utterance.speaker_name]
                mark = "x" if "_gen_" in utterance.name else "o"
                label = None if utterance.speaker_name in speakers_done else utterance.speaker_name
                speakers_done.add(utterance.speaker_name)
                self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark,
                                     label=label)
            # self.umap_ax.set_title("UMAP projections")
            self.umap_ax.legend(prop={'size': 10})

        # Draw the plot
        self.umap_ax.set_aspect("equal", "datalim")
        self.umap_ax.set_xticks([])
        self.umap_ax.set_yticks([])
        self.umap_ax.figure.canvas.draw()
    def save_audio_file(self, wav, sample_rate):
        dialog = QFileDialog()
        dialog.setDefaultSuffix(".wav")
        fpath, _ = dialog.getSaveFileName(
            parent=self,
            caption="Select a path to save the audio file",
            filter="Audio Files (*.flac *.wav)"
        )
        if fpath:
            # Default format is wav
            if Path(fpath).suffix == "":
                fpath += ".wav"
            sf.write(fpath, wav, sample_rate)
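    # Query sounddevice/PortAudio for devices that support the working sample rate: the first
    # valid input device is used for recording, and the valid output devices are exposed in a
    # combo box.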
    def setup_audio_devices(self, sample_rate):
        input_devices = []
        output_devices = []
        for device in sd.query_devices():
            # Check if valid input
            try:
                sd.check_input_settings(device=device["name"], samplerate=sample_rate)
                input_devices.append(device["name"])
            except:
                pass

            # Check if valid output
            try:
                sd.check_output_settings(device=device["name"], samplerate=sample_rate)
                output_devices.append(device["name"])
            except Exception as e:
                # Log a warning only if the device is not an input
                if not device["name"] in input_devices:
                    warn("Unsupported output device %s for the sample rate: %d \nError: %s" % (device["name"], sample_rate, str(e)))

        if len(input_devices) == 0:
            self.log("No audio input device detected. Recording may not work.")
            self.audio_in_device = None
        else:
            self.audio_in_device = input_devices[0]

        if len(output_devices) == 0:
            self.log("No supported output audio devices were found! Audio output may not work.")
            self.audio_out_devices_cb.addItems(["None"])
            self.audio_out_devices_cb.setDisabled(True)
        else:
            self.audio_out_devices_cb.clear()
            self.audio_out_devices_cb.addItems(output_devices)
            self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device)

        self.set_audio_device()

    def set_audio_device(self):
        output_device = self.audio_out_devices_cb.currentText()
        if output_device == "None":
            output_device = None

        # If None, sounddevice queries portaudio
        sd.default.device = (self.audio_in_device, output_device)
    def play(self, wav, sample_rate):
        try:
            sd.stop()
            sd.play(wav, sample_rate)
        except Exception as e:
            print(e)
            self.log("Error in audio playback. Try selecting a different audio output device.")
            self.log("Your device must be connected before you start the toolbox.")

    def stop(self):
        sd.stop()

    def record_one(self, sample_rate, duration):
        self.record_button.setText("Recording...")
        self.record_button.setDisabled(True)

        self.log("Recording %d seconds of audio" % duration)
        sd.stop()
        try:
            wav = sd.rec(duration * sample_rate, sample_rate, 1)
        except Exception as e:
            print(e)
            self.log("Could not record anything. Is your recording device enabled?")
            self.log("Your device must be connected before you start the toolbox.")
            return None

        for i in np.arange(0, duration, 0.1):
            self.set_loading(i, duration)
            sleep(0.1)
        self.set_loading(duration, duration)
        sd.wait()

        self.log("Done recording.")
        self.record_button.setText("Record")
        self.record_button.setDisabled(False)

        return wav.squeeze()
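    # Convenience accessors for the dataset browser's current selection.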
    @property
    def current_dataset_name(self):
        return self.dataset_box.currentText()

    @property
    def current_speaker_name(self):
        return self.speaker_box.currentText()

    @property
    def current_utterance_name(self):
        return self.utterance_box.currentText()

    def browse_file(self):
        fpath = QFileDialog().getOpenFileName(
            parent=self,
            caption="Select an audio file",
            filter="Audio Files (*.mp3 *.flac *.wav *.m4a)"
        )
        return Path(fpath[0]) if fpath[0] != "" else ""

    @staticmethod
    def repopulate_box(box, items, random=False):
        """
        Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join
        data to the items
        """
        box.blockSignals(True)
        box.clear()
        for item in items:
            item = list(item) if isinstance(item, tuple) else [item]
            box.addItem(str(item[0]), *item[1:])
        if len(items) > 0:
            box.setCurrentIndex(np.random.randint(len(items)) if random else 0)
        box.setDisabled(len(items) == 0)
        box.blockSignals(False)
    def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int,
                         random=True):
        # Select a random dataset
        if level <= 0:
            if datasets_root is not None:
                datasets = [datasets_root.joinpath(d) for d in recognized_datasets]
                datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
                self.browser_load_button.setDisabled(len(datasets) == 0)
            if datasets_root is None or len(datasets) == 0:
                msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \
                    if datasets_root is None else "o not have any of the recognized datasets" \
                    " in %s" % datasets_root)
                self.log(msg)
                msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
                       "can still use the toolbox by recording samples yourself." % \
                       ("\n\t".join(recognized_datasets))
                print(msg, file=sys.stderr)

                self.random_utterance_button.setDisabled(True)
                self.random_speaker_button.setDisabled(True)
                self.random_dataset_button.setDisabled(True)
                self.utterance_box.setDisabled(True)
                self.speaker_box.setDisabled(True)
                self.dataset_box.setDisabled(True)
                self.browser_load_button.setDisabled(True)
                self.auto_next_checkbox.setDisabled(True)
                return
            self.repopulate_box(self.dataset_box, datasets, random)

        # Select a random speaker
        if level <= 1:
            speakers_root = datasets_root.joinpath(self.current_dataset_name)
            speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()]
            self.repopulate_box(self.speaker_box, speaker_names, random)

        # Select a random utterance
        if level <= 2:
            utterances_root = datasets_root.joinpath(
                self.current_dataset_name,
                self.current_speaker_name
            )
            utterances = []
            for extension in ['mp3', 'flac', 'wav', 'm4a']:
                utterances.extend(Path(utterances_root).glob("**/*.%s" % extension))
            utterances = [fpath.relative_to(utterances_root) for fpath in utterances]
            self.repopulate_box(self.utterance_box, utterances, random)

    def browser_select_next(self):
        index = (self.utterance_box.currentIndex() + 1) % len(self.utterance_box)
        self.utterance_box.setCurrentIndex(index)
    @property
    def current_encoder_fpath(self):
        return self.encoder_box.itemData(self.encoder_box.currentIndex())

    @property
    def current_synthesizer_fpath(self):
        return self.synthesizer_box.itemData(self.synthesizer_box.currentIndex())

    @property
    def current_vocoder_fpath(self):
        return self.vocoder_box.itemData(self.vocoder_box.currentIndex())

    def populate_models(self, encoder_models_dir: Path, synthesizer_models_dir: Path,
                        vocoder_models_dir: Path):
        # Encoder
        encoder_fpaths = list(encoder_models_dir.glob("*.pt"))
        if len(encoder_fpaths) == 0:
            raise Exception("No encoder models found in %s" % encoder_models_dir)
        self.repopulate_box(self.encoder_box, [(f.stem, f) for f in encoder_fpaths])

        # Synthesizer
        synthesizer_fpaths = list(synthesizer_models_dir.glob("**/*.pt"))
        if len(synthesizer_fpaths) == 0:
            raise Exception("No synthesizer models found in %s" % synthesizer_models_dir)
        self.repopulate_box(self.synthesizer_box, [(f.stem, f) for f in synthesizer_fpaths])

        # Vocoder
        vocoder_fpaths = list(vocoder_models_dir.glob("**/*.pt"))
        vocoder_items = [(f.stem, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)]
        self.repopulate_box(self.vocoder_box, vocoder_items)

    @property
    def selected_utterance(self):
        return self.utterance_history.itemData(self.utterance_history.currentIndex())

    def register_utterance(self, utterance: Utterance):
        self.utterance_history.blockSignals(True)
        self.utterance_history.insertItem(0, utterance.name, utterance)
        self.utterance_history.setCurrentIndex(0)
        self.utterance_history.blockSignals(False)

        if len(self.utterance_history) > self.max_saved_utterances:
            self.utterance_history.removeItem(self.max_saved_utterances)

        self.play_button.setDisabled(False)
        self.generate_button.setDisabled(False)
        self.synthesize_button.setDisabled(False)
    def log(self, line, mode="newline"):
        if mode == "newline":
            self.logs.append(line)
            if len(self.logs) > self.max_log_lines:
                del self.logs[0]
        elif mode == "append":
            self.logs[-1] += line
        elif mode == "overwrite":
            self.logs[-1] = line
        log_text = '\n'.join(self.logs)

        self.log_window.setText(log_text)
        self.app.processEvents()

    def set_loading(self, value, maximum=1):
        # Cast to int: QProgressBar.setValue/setMaximum expect integers, but value may be a float
        self.loading_bar.setValue(int(value * 100))
        self.loading_bar.setMaximum(int(maximum * 100))
        self.loading_bar.setTextVisible(value != 0)
        self.app.processEvents()

    def populate_gen_options(self, seed, trim_silences):
        if seed is not None:
            self.random_seed_checkbox.setChecked(True)
            self.seed_textbox.setText(str(seed))
            self.seed_textbox.setEnabled(True)
        else:
            self.random_seed_checkbox.setChecked(False)
            self.seed_textbox.setText(str(0))
            self.seed_textbox.setEnabled(False)

        if not trim_silences:
            self.trim_silences_checkbox.setChecked(False)
            self.trim_silences_checkbox.setDisabled(True)

    def update_seed_textbox(self):
        if self.random_seed_checkbox.isChecked():
            self.seed_textbox.setEnabled(True)
        else:
            self.seed_textbox.setEnabled(False)

    def reset_interface(self):
        self.draw_embed(None, None, "current")
        self.draw_embed(None, None, "generated")
        self.draw_spec(None, "current")
        self.draw_spec(None, "generated")
        self.draw_umap_projections(set())
        self.set_loading(0)
        self.play_button.setDisabled(True)
        self.generate_button.setDisabled(True)
        self.synthesize_button.setDisabled(True)
        self.vocode_button.setDisabled(True)
        self.replay_wav_button.setDisabled(True)
        self.export_wav_button.setDisabled(True)
        [self.log("") for _ in range(self.max_log_lines)]
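    # Build the whole interface: layouts, matplotlib canvases and Qt widgets. The QApplication
    # is created here and only runs once start() is called.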
    def __init__(self):
        ## Initialize the application
        self.app = QApplication(sys.argv)
        super().__init__(None)
        self.setWindowTitle("SV2TTS toolbox")

        ## Main layouts
        # Root
        root_layout = QGridLayout()
        self.setLayout(root_layout)

        # Browser
        browser_layout = QGridLayout()
        root_layout.addLayout(browser_layout, 0, 0, 1, 2)

        # Generation
        gen_layout = QVBoxLayout()
        root_layout.addLayout(gen_layout, 0, 2, 1, 2)

        # Projections
        self.projections_layout = QVBoxLayout()
        root_layout.addLayout(self.projections_layout, 1, 0, 1, 1)

        # Visualizations
        vis_layout = QVBoxLayout()
        root_layout.addLayout(vis_layout, 1, 1, 1, 3)

        ## Projections
        # UMap
        fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0")
        fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98)
        self.projections_layout.addWidget(FigureCanvas(fig))
        self.umap_hot = False
        self.clear_button = QPushButton("Clear")
        self.projections_layout.addWidget(self.clear_button)

        ## Browser
        # Dataset, speaker and utterance selection
        i = 0
        self.dataset_box = QComboBox()
        browser_layout.addWidget(QLabel("<b>Dataset</b>"), i, 0)
        browser_layout.addWidget(self.dataset_box, i + 1, 0)
        self.speaker_box = QComboBox()
        browser_layout.addWidget(QLabel("<b>Speaker</b>"), i, 1)
        browser_layout.addWidget(self.speaker_box, i + 1, 1)
        self.utterance_box = QComboBox()
        browser_layout.addWidget(QLabel("<b>Utterance</b>"), i, 2)
        browser_layout.addWidget(self.utterance_box, i + 1, 2)
        self.browser_load_button = QPushButton("Load")
        browser_layout.addWidget(self.browser_load_button, i + 1, 3)
        i += 2

        # Random buttons
        self.random_dataset_button = QPushButton("Random")
        browser_layout.addWidget(self.random_dataset_button, i, 0)
        self.random_speaker_button = QPushButton("Random")
        browser_layout.addWidget(self.random_speaker_button, i, 1)
        self.random_utterance_button = QPushButton("Random")
        browser_layout.addWidget(self.random_utterance_button, i, 2)
        self.auto_next_checkbox = QCheckBox("Auto select next")
        self.auto_next_checkbox.setChecked(True)
        browser_layout.addWidget(self.auto_next_checkbox, i, 3)
        i += 1

        # Utterance box
        browser_layout.addWidget(QLabel("<b>Use embedding from:</b>"), i, 0)
        self.utterance_history = QComboBox()
        browser_layout.addWidget(self.utterance_history, i, 1, 1, 3)
        i += 1

        # Random & next utterance buttons
        self.browser_browse_button = QPushButton("Browse")
        browser_layout.addWidget(self.browser_browse_button, i, 0)
        self.record_button = QPushButton("Record")
        browser_layout.addWidget(self.record_button, i, 1)
        self.play_button = QPushButton("Play")
        browser_layout.addWidget(self.play_button, i, 2)
        self.stop_button = QPushButton("Stop")
        browser_layout.addWidget(self.stop_button, i, 3)
        i += 1

        # Model and audio output selection
        self.encoder_box = QComboBox()
        browser_layout.addWidget(QLabel("<b>Encoder</b>"), i, 0)
        browser_layout.addWidget(self.encoder_box, i + 1, 0)
        self.synthesizer_box = QComboBox()
        browser_layout.addWidget(QLabel("<b>Synthesizer</b>"), i, 1)
        browser_layout.addWidget(self.synthesizer_box, i + 1, 1)
        self.vocoder_box = QComboBox()
        browser_layout.addWidget(QLabel("<b>Vocoder</b>"), i, 2)
        browser_layout.addWidget(self.vocoder_box, i + 1, 2)

        self.audio_out_devices_cb = QComboBox()
        browser_layout.addWidget(QLabel("<b>Audio Output</b>"), i, 3)
        browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 3)
        i += 2

        # Replay & Save Audio
        browser_layout.addWidget(QLabel("<b>Toolbox Output:</b>"), i, 0)
        self.waves_cb = QComboBox()
        self.waves_cb_model = QStringListModel()
        self.waves_cb.setModel(self.waves_cb_model)
        self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting")
        browser_layout.addWidget(self.waves_cb, i, 1)
        self.replay_wav_button = QPushButton("Replay")
        self.replay_wav_button.setToolTip("Replay last generated vocoder")
        browser_layout.addWidget(self.replay_wav_button, i, 2)
        self.export_wav_button = QPushButton("Export")
        self.export_wav_button.setToolTip("Save last generated vocoder audio in filesystem as a wav file")
        browser_layout.addWidget(self.export_wav_button, i, 3)
        i += 1

        ## Embed & spectrograms
        vis_layout.addStretch()

        gridspec_kw = {"width_ratios": [1, 4]}
        fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
                                            gridspec_kw=gridspec_kw)
        fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
        vis_layout.addWidget(FigureCanvas(fig))

        fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
                                        gridspec_kw=gridspec_kw)
        fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
        vis_layout.addWidget(FigureCanvas(fig))

        for ax in self.current_ax.tolist() + self.gen_ax.tolist():
            ax.set_facecolor("#F0F0F0")
            for side in ["top", "right", "bottom", "left"]:
                ax.spines[side].set_visible(False)

        ## Generation
        self.text_prompt = QPlainTextEdit(default_text)
        gen_layout.addWidget(self.text_prompt, stretch=1)

        self.generate_button = QPushButton("Synthesize and vocode")
        gen_layout.addWidget(self.generate_button)

        layout = QHBoxLayout()
        self.synthesize_button = QPushButton("Synthesize only")
        layout.addWidget(self.synthesize_button)
        self.vocode_button = QPushButton("Vocode only")
        layout.addWidget(self.vocode_button)
        gen_layout.addLayout(layout)

        layout_seed = QGridLayout()
        self.random_seed_checkbox = QCheckBox("Random seed:")
        self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
        layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
        self.seed_textbox = QLineEdit()
        self.seed_textbox.setMaximumWidth(80)
        layout_seed.addWidget(self.seed_textbox, 0, 1)
        self.trim_silences_checkbox = QCheckBox("Enhance vocoder output")
        self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
                                               " This feature requires `webrtcvad` to be installed.")
        layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
        gen_layout.addLayout(layout_seed)

        self.loading_bar = QProgressBar()
        gen_layout.addWidget(self.loading_bar)

        self.log_window = QLabel()
        self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft)
        gen_layout.addWidget(self.log_window)
        self.logs = []
        gen_layout.addStretch()

        ## Set the size of the window and of the elements
        max_size = QDesktopWidget().availableGeometry(self).size() * 0.8
        self.resize(max_size)

        ## Finalize the display
        self.reset_interface()
        self.show()

    def start(self):
        self.app.exec_()