Spaces:
Sleeping
Sleeping
Hugo Flores Garcia
committed on
Commit
·
2f3fb32
1
Parent(s):
3346920
interface
Browse files
- .gitignore +2 -1
- demo.py +27 -5
- vampnet/interface.py +23 -12
- vampnet/mask.py +4 -4
.gitignore
CHANGED
@@ -176,4 +176,5 @@ lyrebird-audio-codec
|
|
176 |
samples-*/**
|
177 |
|
178 |
gradio-outputs/
|
179 |
-
models/
|
|
|
|
176 |
samples-*/**
|
177 |
|
178 |
gradio-outputs/
|
179 |
+
models/
|
180 |
+
samples*/
|
demo.py
CHANGED
@@ -130,8 +130,6 @@ def _vamp(data, return_mask=False):
|
|
130 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
131 |
out_dir.mkdir()
|
132 |
sig = at.AudioSignal(data[input_audio])
|
133 |
-
#pitch shift input
|
134 |
-
sig = sig.shift_pitch(data[input_pitch_shift])
|
135 |
|
136 |
# TODO: random pitch shift of segments in the signal to prompt! window size should be a parameter, pitch shift width should be a parameter
|
137 |
|
@@ -160,10 +158,20 @@ def _vamp(data, return_mask=False):
|
|
160 |
mask = pmask.mask_or(
|
161 |
mask, pmask.onset_mask(sig, z, interface, width=data[onset_mask_width])
|
162 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
# these should be the last two mask ops
|
164 |
mask = pmask.dropout(mask, data[dropout])
|
165 |
mask = pmask.codebook_unmask(mask, ncc)
|
166 |
|
|
|
167 |
print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[temp]}, use coarse2fine {data[use_coarse2fine]}")
|
168 |
# save the mask as a txt file
|
169 |
np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
|
@@ -322,6 +330,18 @@ with gr.Blocks() as demo:
|
|
322 |
value=5,
|
323 |
)
|
324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
with gr.Accordion("extras ", open=False):
|
326 |
n_conditioning_codebooks = gr.Number(
|
327 |
label="number of conditioning codebooks. probably 0",
|
@@ -355,14 +375,14 @@ with gr.Blocks() as demo:
|
|
355 |
temp = gr.Slider(
|
356 |
label="temperature",
|
357 |
minimum=0.0,
|
358 |
-
maximum=
|
359 |
value=0.8
|
360 |
)
|
361 |
|
362 |
with gr.Accordion("sampling settings", open=False):
|
363 |
typical_filtering = gr.Checkbox(
|
364 |
label="typical filtering ",
|
365 |
-
value=
|
366 |
)
|
367 |
typical_mass = gr.Slider(
|
368 |
label="typical mass (should probably stay between 0.1 and 0.5)",
|
@@ -440,7 +460,9 @@ with gr.Blocks() as demo:
|
|
440 |
typical_filtering,
|
441 |
typical_mass,
|
442 |
typical_min_tokens,
|
443 |
-
checkpoint_key
|
|
|
|
|
444 |
}
|
445 |
|
446 |
# connect widgets
|
|
|
130 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
131 |
out_dir.mkdir()
|
132 |
sig = at.AudioSignal(data[input_audio])
|
|
|
|
|
133 |
|
134 |
# TODO: random pitch shift of segments in the signal to prompt! window size should be a parameter, pitch shift width should be a parameter
|
135 |
|
|
|
158 |
mask = pmask.mask_or(
|
159 |
mask, pmask.onset_mask(sig, z, interface, width=data[onset_mask_width])
|
160 |
)
|
161 |
+
if data[beat_mask_width] > 0:
|
162 |
+
beat_mask = interface.make_beat_mask(
|
163 |
+
sig,
|
164 |
+
before_beat_s=(data[beat_mask_width]/1000)/2,
|
165 |
+
after_beat_s=(data[beat_mask_width]/1000)/2,
|
166 |
+
mask_upbeats=not data[beat_mask_downbeats],
|
167 |
+
)
|
168 |
+
mask = pmask.mask_and(mask, beat_mask)
|
169 |
+
|
170 |
# these should be the last two mask ops
|
171 |
mask = pmask.dropout(mask, data[dropout])
|
172 |
mask = pmask.codebook_unmask(mask, ncc)
|
173 |
|
174 |
+
|
175 |
print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[temp]}, use coarse2fine {data[use_coarse2fine]}")
|
176 |
# save the mask as a txt file
|
177 |
np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
|
|
|
330 |
value=5,
|
331 |
)
|
332 |
|
333 |
+
beat_mask_width = gr.Slider(
|
334 |
+
label="beat mask width (in milliseconds)",
|
335 |
+
minimum=0,
|
336 |
+
maximum=200,
|
337 |
+
value=0,
|
338 |
+
)
|
339 |
+
beat_mask_downbeats = gr.Checkbox(
|
340 |
+
label="beat mask downbeats only?",
|
341 |
+
value=False
|
342 |
+
)
|
343 |
+
|
344 |
+
|
345 |
with gr.Accordion("extras ", open=False):
|
346 |
n_conditioning_codebooks = gr.Number(
|
347 |
label="number of conditioning codebooks. probably 0",
|
|
|
375 |
temp = gr.Slider(
|
376 |
label="temperature",
|
377 |
minimum=0.0,
|
378 |
+
maximum=3.0,
|
379 |
value=0.8
|
380 |
)
|
381 |
|
382 |
with gr.Accordion("sampling settings", open=False):
|
383 |
typical_filtering = gr.Checkbox(
|
384 |
label="typical filtering ",
|
385 |
+
value=False
|
386 |
)
|
387 |
typical_mass = gr.Slider(
|
388 |
label="typical mass (should probably stay between 0.1 and 0.5)",
|
|
|
460 |
typical_filtering,
|
461 |
typical_mass,
|
462 |
typical_min_tokens,
|
463 |
+
checkpoint_key,
|
464 |
+
beat_mask_width,
|
465 |
+
beat_mask_downbeats
|
466 |
}
|
467 |
|
468 |
# connect widgets
|
vampnet/interface.py
CHANGED
@@ -265,7 +265,12 @@ class Interface(torch.nn.Module):
|
|
265 |
if invert:
|
266 |
mask = 1 - mask
|
267 |
|
268 |
-
|
|
|
|
|
|
|
|
|
|
|
269 |
|
270 |
def coarse_to_fine(
|
271 |
self,
|
@@ -349,26 +354,32 @@ if __name__ == "__main__":
|
|
349 |
coarse_ckpt="./models/spotdl/coarse.pth",
|
350 |
coarse2fine_ckpt="./models/spotdl/c2f.pth",
|
351 |
codec_ckpt="./models/spotdl/codec.pth",
|
352 |
-
device="cuda"
|
|
|
353 |
)
|
354 |
|
355 |
-
|
|
|
356 |
|
357 |
z = interface.encode(sig)
|
358 |
|
359 |
-
mask = linear_random(z, 1.0)
|
360 |
-
mask = mask_and(
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
|
|
|
|
|
|
|
|
367 |
)
|
368 |
# mask = dropout(mask, 0.0)
|
369 |
# mask = codebook_unmask(mask, 0)
|
370 |
|
371 |
-
|
372 |
zv, mask_z = interface.coarse_vamp(
|
373 |
z,
|
374 |
mask=mask,
|
|
|
265 |
if invert:
|
266 |
mask = 1 - mask
|
267 |
|
268 |
+
mask = mask[None, None, :].bool().long()
|
269 |
+
if self.c2f is not None:
|
270 |
+
mask = mask.repeat(1, self.c2f.n_codebooks, 1)
|
271 |
+
else:
|
272 |
+
mask = mask.repeat(1, self.coarse.n_codebooks, 1)
|
273 |
+
return mask
|
274 |
|
275 |
def coarse_to_fine(
|
276 |
self,
|
|
|
354 |
coarse_ckpt="./models/spotdl/coarse.pth",
|
355 |
coarse2fine_ckpt="./models/spotdl/c2f.pth",
|
356 |
codec_ckpt="./models/spotdl/codec.pth",
|
357 |
+
device="cuda",
|
358 |
+
wavebeat_ckpt="./models/wavebeat.pth"
|
359 |
)
|
360 |
|
361 |
+
|
362 |
+
sig = at.AudioSignal.zeros(duration=10, sample_rate=44100)
|
363 |
|
364 |
z = interface.encode(sig)
|
365 |
|
366 |
+
# mask = linear_random(z, 1.0)
|
367 |
+
# mask = mask_and(
|
368 |
+
# mask, periodic_mask(
|
369 |
+
# z,
|
370 |
+
# 32,
|
371 |
+
# 1,
|
372 |
+
# random_roll=True
|
373 |
+
# )
|
374 |
+
# )
|
375 |
+
|
376 |
+
mask = interface.make_beat_mask(
|
377 |
+
sig, 0.0, 0.075
|
378 |
)
|
379 |
# mask = dropout(mask, 0.0)
|
380 |
# mask = codebook_unmask(mask, 0)
|
381 |
|
382 |
+
breakpoint()
|
383 |
zv, mask_z = interface.coarse_vamp(
|
384 |
z,
|
385 |
mask=mask,
|
vampnet/mask.py
CHANGED
@@ -26,9 +26,9 @@ def apply_mask(
|
|
26 |
mask: torch.Tensor,
|
27 |
mask_token: int
|
28 |
):
|
29 |
-
assert mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
|
30 |
-
assert mask.shape == x.shape, "mask must be same shape as x"
|
31 |
-
assert mask.dtype == torch.long, "mask must be long dtype"
|
32 |
assert ~torch.any(mask > 1), "mask must be binary"
|
33 |
assert ~torch.any(mask < 0), "mask must be binary"
|
34 |
|
@@ -163,7 +163,7 @@ def mask_or(
|
|
163 |
mask1: torch.Tensor,
|
164 |
mask2: torch.Tensor
|
165 |
):
|
166 |
-
assert mask1.shape == mask2.shape, "masks must be same shape"
|
167 |
assert mask1.max() <= 1, "mask1 must be binary"
|
168 |
assert mask2.max() <= 1, "mask2 must be binary"
|
169 |
assert mask1.min() >= 0, "mask1 must be binary"
|
|
|
26 |
mask: torch.Tensor,
|
27 |
mask_token: int
|
28 |
):
|
29 |
+
assert mask.ndim == 3, "mask must be (batch, n_codebooks, seq), but got {mask.ndim}"
|
30 |
+
assert mask.shape == x.shape, f"mask must be same shape as x, but got {mask.shape} and {x.shape}"
|
31 |
+
assert mask.dtype == torch.long, "mask must be long dtype, but got {mask.dtype}"
|
32 |
assert ~torch.any(mask > 1), "mask must be binary"
|
33 |
assert ~torch.any(mask < 0), "mask must be binary"
|
34 |
|
|
|
163 |
mask1: torch.Tensor,
|
164 |
mask2: torch.Tensor
|
165 |
):
|
166 |
+
assert mask1.shape == mask2.shape, f"masks must be same shape, but got {mask1.shape} and {mask2.shape}"
|
167 |
assert mask1.max() <= 1, "mask1 must be binary"
|
168 |
assert mask2.max() <= 1, "mask2 must be binary"
|
169 |
assert mask1.min() >= 0, "mask1 must be binary"
|