Spaces:

descript
/

vampnet

Sleeping

App Files Files Community

Hugo Flores committed on Mar 22, 2023

Commit

b54865d

1 Parent(s): 275afd0

interface

Browse files

Files changed (6) hide show

requirements.txt +0 -1
setup.py +1 -2
vampnet/__init__.py +1 -1
vampnet/enchilada.py +0 -179
vampnet/interface.py +332 -0
vampnet/modules/base.py +28 -3

requirements.txt CHANGED Viewed

@@ -26,5 +26,4 @@ jupyter-client==6.1.12
 tensorboardX
 gradio
 einops
-flash-attn
 frechet_audio_distance

 tensorboardX
 gradio
 einops
 frechet_audio_distance

setup.py CHANGED Viewed

@@ -20,7 +20,7 @@ setup(
     description="Generative Music Modeling.",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    author="Hugo Flores García",
     author_email="[email protected]",
     url="https://github.com/descriptinc/lyrebird-vampnet",
     license="MIT",
@@ -37,7 +37,6 @@ setup(
         "google-cloud-logging==2.2.0",
         "torchmetrics>=0.7.3",
         "einops",
-        "flash-attn",
         "frechet_audio_distance"
     ],
 )

     description="Generative Music Modeling.",
     long_description=long_description,
     long_description_content_type="text/markdown",
+    author="Hugo Flores García, Prem Seetharaman",
     author_email="[email protected]",
     url="https://github.com/descriptinc/lyrebird-vampnet",
     license="MIT",
         "google-cloud-logging==2.2.0",
         "torchmetrics>=0.7.3",
         "einops",
         "frechet_audio_distance"
     ],
 )

vampnet/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from . import modules
 from . import scheduler
-from . import enchilada
 __version__ = "0.0.1"

 from . import modules
 from . import scheduler
+from .interface import Interface
 __version__ = "0.0.1"

vampnet/enchilada.py DELETED Viewed

@@ -1,179 +0,0 @@
-import os
-from pathlib import Path
-import torch
-from audiotools import AudioSignal
-from .modules.transformer import VampNet
-from lac.model.lac import LAC
-class TheWholeEnchilada:
-    def __init__(
-        self,
-        coarse_ckpt: str,
-        coarse2fine_ckpt: str,
-        codec_ckpt: str,
-        device: str = "cpu",
-    ):
-        self.codec = LAC.load(Path(codec_ckpt))
-        self.codec.eval()
-        self.codec.to(device)
-        self.coarse = VampNet.load(location=Path(coarse_ckpt), map_location="cpu")
-        self.coarse.to(device)
-        self.coarse.eval()
-        self.coarse2fine = VampNet.load(
-            location=Path(coarse2fine_ckpt), map_location="cpu"
-        )
-        # FIXME
-        print(
-            f"WARNING: PATCHING coarse2fine seq_len to 288, for backwards compatibility with a specific jazzpop model. it used to be {self.coarse2fine.seq_len}"
-        )
-        self.coarse2fine.seq_len = 288
-        self.coarse2fine.to(device)
-        self.coarse2fine.eval()
-        self.device = device
-    def seconds_to_tokens(self, seconds: float):
-        return int(seconds * self.codec.sample_rate / self.codec.hop_length)
-    def to(self, device):
-        self.device = device
-        self.coarse.to(device)
-        self.coarse2fine.to(device)
-        self.codec.to(device)
-        return self
-    def encode(self, signal: AudioSignal):
-        with torch.inference_mode():
-            # coarse z
-            cz = self.codec.encode(signal.samples, signal.sample_rate)["codes"]
-        return cz
-    def vamp(
-        self,
-        signal,
-        prefix_dur_s: float = 1.25,
-        suffix_dur_s: float = 1.25,
-        downsample_hint: bool = True,
-        downsample_factor: int = 4,
-        num_loops: int = 3,
-        **kwargs,
-    ):
-        """
-        Loop imputation of a signal.
-        """
-        signal.to(self.device).resample(self.codec.sample_rate).to_mono()
-        z = self.encode(signal)
-        cz = z[:, : self.coarse.n_codebooks, :].clone()
-        original_cz = cz.clone()
-        seq_len = original_cz.shape[-1]
-        assert (
-            seq_len == self.coarse.seq_len
-        ), f"expected seq_len {self.coarse.seq_len}, got {seq_len} for token sequence length. Is your signal the same duration as the model was trained with? "
-        vamp_hop_s = prefix_dur_s
-        vamp_hop = self.seconds_to_tokens(vamp_hop_s)
-        cmask = torch.ones_like(cz)
-        if downsample_hint:
-            # downsample by factor of 4
-            for i in range(cmask.shape[-1]):
-                if i % downsample_factor == 0:
-                    cmask[:, :, i] = 0
-        if prefix_dur_s > 0:
-            prefix_len = self.seconds_to_tokens(prefix_dur_s)
-            cmask[:, :, :prefix_len] = 0
-            print(f"prefix_len: {prefix_len}")
-        else:
-            prefix_len = 0
-        if suffix_dur_s > 0:
-            suffix_len = self.seconds_to_tokens(suffix_dur_s)
-            cmask[:, :, -suffix_len:] = 0
-            print(f"suffix_len: {suffix_len}")
-        else:
-            suffix_len = 0
-        prefix_z = cz[:, :, :prefix_len]
-        coarse_vamp = [prefix_z.clone()]
-        for i in range(num_loops):
-            sampled_cz = self.coarse.sample(
-                codec=self.codec,
-                time_steps=seq_len,
-                mask=cmask,
-                start_tokens=cz,
-                return_signal=False,
-                **kwargs,
-            )
-            new_prefix = sampled_cz[:, :, prefix_len : prefix_len + vamp_hop]
-            coarse_vamp.append(new_prefix.clone())
-            # replace the prefix in cz with the new prefix
-            # don't worry about a copy of the prefix still being
-            # in the mask area, since that will be masked out
-            cz[:, :, :vamp_hop] = new_prefix.clone()
-            print("to append and to prefix")
-        # we're done, so add the suffix
-        coarse_vamp.append(sampled_cz[:, :, prefix_len + vamp_hop :])
-        # concatenate the vamps
-        coarse_vamp = torch.cat(coarse_vamp, dim=-1)
-        # add a layer of
-        fine_prefix = z[:, self.coarse.n_codebooks :, :prefix_len]
-        fine_suffix = z[:, self.coarse.n_codebooks :, -suffix_len:]
-        fine_vamp = torch.randint(
-            0,
-            self.coarse2fine.vocab_size,
-            (
-                coarse_vamp.shape[0],
-                self.coarse2fine.n_predict_codebooks,
-                coarse_vamp.shape[-1],
-            ),
-        ).to(self.device)
-        fine_vamp[:, :, :prefix_len] = fine_prefix
-        fine_vamp[:, :, -suffix_len:] = fine_suffix
-        vamp_z = torch.cat([coarse_vamp, fine_vamp], dim=1)
-        # now we sample from the coarse2fine model
-        # to get the fine details
-        start_pos = 0
-        c2f_vamp = []
-        while start_pos < vamp_z.shape[-1]:
-            end_pos = min(start_pos + self.coarse2fine.seq_len, vamp_z.shape[-1])
-            c2fz = vamp_z[:, :, start_pos:end_pos]
-            self.coarse2fine: VampNet
-            sampled_c2fz = self.coarse2fine.sample(
-                codec=self.codec,
-                start_tokens=c2fz,
-                return_signal=False,
-                mask=None,
-            )
-            c2f_vamp.append(sampled_c2fz)
-            start_pos += self.coarse2fine.seq_len
-        c2f_vamp = torch.cat(c2f_vamp, dim=-1)
-        # make it a signal
-        vamp_signal = self.coarse2fine.to_signal(c2f_vamp, self.codec)
-        return {
-            "full": vamp_signal,
-            "coarse": self.coarse.to_signal(coarse_vamp, self.codec),
-        }

vampnet/interface.py ADDED Viewed

	@@ -0,0 +1,332 @@

+import os
+from pathlib import Path
+import math
+import torch
+from audiotools import AudioSignal
+from .modules.transformer import VampNet
+from lac.model.lac import LAC
+class Interface:
+    def __init__(
+        self,
+        coarse_ckpt: str,
+        coarse2fine_ckpt: str,
+        codec_ckpt: str,
+        device: str = "cpu",
+        coarse_chunk_size_s: int =  5,
+        coarse2fine_chunk_size_s: int =  3,
+    ):
+        """Load the codec and the two VampNet models and move them to `device`.
+
+        Args:
+            coarse_ckpt: path to the coarse VampNet checkpoint.
+            coarse2fine_ckpt: path to the coarse-to-fine VampNet checkpoint.
+            codec_ckpt: path to the LAC codec checkpoint.
+            device: device that all three models are moved to.
+            coarse_chunk_size_s: chunk size in seconds, stored on the coarse
+                model as `chunk_size_s`.
+            coarse2fine_chunk_size_s: chunk size in seconds, stored on the
+                coarse-to-fine model as `chunk_size_s`.
+        """
+        # codec: loaded, switched to eval mode, then moved to the target device
+        self.codec = LAC.load(Path(codec_ckpt))
+        self.codec.eval()
+        self.codec.to(device)
+        # both VampNet checkpoints are loaded with map_location="cpu" and only
+        # afterwards moved to `device`
+        self.coarse = VampNet.load(location=Path(coarse_ckpt), map_location="cpu")
+        self.coarse.to(device)
+        self.coarse.eval()
+        # chunk_size_s is attached to the model object here; it is read back by
+        # the vamp / coarse_to_fine methods of this class
+        self.coarse.chunk_size_s = coarse_chunk_size_s
+        self.c2f = VampNet.load(
+            location=Path(coarse2fine_ckpt), map_location="cpu"
+        )
+        self.c2f.to(device)
+        self.c2f.eval()
+        self.c2f.chunk_size_s = coarse2fine_chunk_size_s
+        self.device = device
+    def s2t(self, seconds: float):
+        """Convert a duration in seconds to a token count.
+
+        Computed as `seconds * codec.sample_rate / codec.hop_length`,
+        truncated with `int()`.
+        """
+        return int(seconds * self.codec.sample_rate / self.codec.hop_length)
+    def to(self, device):
+        """Move the coarse model, the coarse-to-fine model and the codec to `device`.
+
+        Also records `device` on the interface. Returns `self` so calls can be
+        chained.
+        """
+        self.device = device
+        self.coarse.to(device)
+        self.c2f.to(device)
+        self.codec.to(device)
+        return self
+    def to_signal(self, z: torch.Tensor):
+        """Decode the codes `z` by delegating to `self.coarse.to_signal(z, self.codec)`."""
+        return self.coarse.to_signal(z, self.codec)
+    @torch.inference_mode()
+    def encode(self, signal: AudioSignal):
+        """Encode an audio signal into codec codes.
+
+        Works on a clone of `signal` that is moved to `self.device`, resampled
+        to the codec's sample rate and converted to mono. Returns the "codes"
+        entry of the codec's `encode` output.
+        """
+        signal = signal.clone().to(self.device).resample(self.codec.sample_rate).to_mono()
+        z = self.codec.encode(signal.samples, signal.sample_rate)["codes"]
+        return z
+    def coarse_to_fine(
+        self,
+        coarse_z: torch.Tensor,
+        **kwargs
+    ):
+        """Run the coarse-to-fine model over `coarse_z` chunk by chunk.
+
+        `coarse_z` is indexed as (batch, codebooks, time). The time axis is
+        zero-padded up to a multiple of the c2f chunk length, and zero-filled
+        codebooks are appended until there are `self.c2f.n_codebooks` of them.
+        Each chunk is then passed to `self.c2f.sample`, the results are
+        concatenated along time, and the output is trimmed back to the
+        original length.
+
+        NOTE(review): `**kwargs` is accepted but never used in this body; it
+        is not forwarded to `self.c2f.sample`. Confirm whether that is intended.
+        """
+        length = coarse_z.shape[-1]
+        chunk_len = self.s2t(self.c2f.chunk_size_s)
+        n_chunks = math.ceil(coarse_z.shape[-1] / chunk_len)
+        # zero pad the time (last) axis on the right up to a multiple of chunk_len
+        if length % chunk_len != 0:
+            pad_len = chunk_len - (length % chunk_len)
+            coarse_z = torch.nn.functional.pad(coarse_z, (0, pad_len))
+        # append all-zero codebooks (dim=1) so the input has c2f.n_codebooks rows
+        n_codebooks_to_append = self.c2f.n_codebooks - coarse_z.shape[1]
+        if n_codebooks_to_append > 0:
+            coarse_z = torch.cat([
+                coarse_z,
+                torch.zeros(coarse_z.shape[0], n_codebooks_to_append, coarse_z.shape[-1]).long().to(self.device)
+            ], dim=1)
+        fine_z = []
+        for i in range(n_chunks):
+            # i-th window of chunk_len time steps
+            chunk = coarse_z[:, :, i * chunk_len : (i + 1) * chunk_len]
+            chunk = self.c2f.sample(
+                codec=self.codec,
+                time_steps=chunk_len,
+                start_tokens=chunk,
+                return_signal=False,
+            )
+            fine_z.append(chunk)
+        fine_z = torch.cat(fine_z, dim=-1)
+        # drop the padded time steps added above
+        return fine_z[:, :, :length].clone()
+    def coarse_vamp(
+        self,
+        signal,
+        prefix_dur_s: float = 1.25,
+        suffix_dur_s: float = 1.25,
+        num_loops: int = 3,
+        mode="impute",
+        downsample_factor: int = None,
+        debug=False,
+        **kwargs
+    ):
+        """Repeatedly sample coarse codes between a prefix and a suffix.
+
+        The signal is encoded and its first `self.coarse.n_codebooks` codebooks
+        are kept. On each of `num_loops` rounds the coarse model samples a
+        sequence, and the region between the prefix and the suffix is taken
+        as the newly generated codes. Those codes are accumulated and also
+        used to build the input of the next round.
+
+        Args:
+            signal: audio passed to `encode`; its `duration` must equal the
+                coarse chunk size (asserted).
+            prefix_dur_s: prefix length in seconds (converted with `s2t`).
+            suffix_dur_s: suffix length in seconds (converted with `s2t`).
+            num_loops: number of sampling rounds.
+            mode: how each round's generated codes are distributed:
+                "impute" splits them in half (first half joins the prefix
+                side, second half the suffix side); "continue" puts all of
+                them on the prefix side; "reverse-continue" puts all of them
+                on the suffix side.
+            downsample_factor: forwarded to `self.coarse.add_noise`.
+            debug: if true, print progress and call `.cpu().widget()` on
+                decoded intermediate codes (presumably a notebook audio
+                widget; confirm against audiotools).
+            **kwargs: forwarded to `self.coarse.sample`.
+
+        Returns:
+            All prefix-side pieces followed by all suffix-side pieces,
+            concatenated along the last (time) axis.
+
+        Raises:
+            ValueError: if `mode` is not one of the three supported values.
+        """
+        z = self.encode(signal)
+        # NOTE(review): exact equality on a duration, checked only after the
+        # signal has already been encoded.
+        assert signal.duration == self.coarse.chunk_size_s, "signal duration must match coarse chunk size for now"
+        # coarse z
+        cz = z[:, : self.coarse.n_codebooks, :].clone()
+        c_seq_len = cz.shape[-1]
+        n_prefix = self.s2t(prefix_dur_s)
+        n_suffix = self.s2t(suffix_dur_s)
+        # NOTE(review): unlike coarse_vamp_v2, there is no check here that
+        # n_prefix + n_suffix < c_seq_len.
+        # we'll keep the final codes sequence here
+        # (the suffix slice uses c_seq_len-n_suffix so that n_suffix == 0 gives
+        # an empty slice)
+        c_vamp = {
+            'prefix': [cz[:, :, :n_prefix].clone()],
+            'suffix': [cz[:, :, c_seq_len-n_suffix:].clone()]
+        }
+        _cz = cz.clone()
+        for _ in range(num_loops):
+            # add noise
+            # cz_masked is only used for the debug output below; the sampler
+            # receives _cz together with cz_mask
+            cz_masked, cz_mask = self.coarse.add_noise(
+                _cz, r=0.0,
+                n_prefix=n_prefix,
+                n_suffix=n_suffix,
+                downsample_factor=downsample_factor
+            )
+            if debug:
+                print("tokens to infer")
+                self.to_signal(cz_masked).cpu().widget()
+            # sample!
+            cz_sampled = self.coarse.sample(
+                codec=self.codec,
+                time_steps=self.s2t(self.coarse.chunk_size_s),
+                start_tokens=_cz,
+                mask=cz_mask,
+                return_signal=False,
+                **kwargs
+            )
+            if debug:
+                print("tokens sampled")
+                self.to_signal(cz_sampled).cpu().widget()
+            # the region between the prefix and the suffix
+            cz_imputed = cz_sampled[:, :, n_prefix:c_seq_len-n_suffix].clone()
+            if mode == "impute":
+                # split the imputed codes into two halves
+                cz_imputed_a = cz_imputed[:, :, : cz_imputed.shape[-1] // 2].clone()
+                cz_imputed_b = cz_imputed[:, :, cz_imputed.shape[-1] // 2 :].clone()
+            elif mode == "continue":
+                cz_imputed_a = cz_imputed[:, :, : cz_imputed.shape[-1]].clone()
+                cz_imputed_b = _cz[:, :, :0].clone() # empty
+            elif mode == "reverse-continue":
+                cz_imputed_a = _cz[:, :, :0].clone() # empty
+                cz_imputed_b = cz_imputed[:, :, : cz_imputed.shape[-1]].clone()
+            else:
+                raise ValueError(f"mode {mode} not supported")
+            if debug:
+                # add to our c_vamp
+                if cz_imputed_a.shape[-1] > 0:
+                    print("new_prefix added")
+                    self.to_signal(cz_imputed_a).cpu().widget()
+                if cz_imputed_b.shape[-1] >  0:
+                    print("new_suffix added")
+                    self.to_signal(cz_imputed_b).cpu().widget()
+            # prefix pieces grow to the right, suffix pieces grow to the left
+            c_vamp['prefix'].append(cz_imputed_a.clone())
+            c_vamp['suffix'].insert(0, cz_imputed_b.clone())
+            # next round's input: [a | zeros | b], padded back to c_seq_len
+            n_to_insert = c_seq_len - (cz_imputed_a.shape[-1] + cz_imputed_b.shape[-1])
+            to_insert = torch.zeros(cz_imputed_a.shape[0], cz_imputed_a.shape[1], n_to_insert).long().to(self.device)
+            _cz = torch.cat([cz_imputed_a, to_insert, cz_imputed_b], dim=-1)
+            if debug:
+                print("tokens to infer next round (area to insert in the middle)")
+                self.to_signal(_cz).cpu().widget()
+        prefix_codes = torch.cat(c_vamp['prefix'], dim=-1)
+        suffix_codes = torch.cat(c_vamp['suffix'], dim=-1)
+        c_vamp = torch.cat([prefix_codes, suffix_codes], dim=-1)
+        return c_vamp
+    def coarse_vamp_v2(
+        self,
+        signal,
+        prefix_dur_s: float = 1.25,
+        suffix_dur_s: float = 1.25,
+        num_loops: int = 3,
+        downsample_factor: int = None,
+        debug=False,
+        **kwargs
+    ):
+        """Repeatedly sample coarse codes, keeping prefix/suffix sizes fixed.
+
+        Like `coarse_vamp`, but the next round's prefix and suffix are always
+        `n_prefix` / `n_suffix` tokens long, cut out of the sampled sequence.
+        The mask returned by `add_noise` is carried over from round to round.
+
+        Args:
+            signal: audio passed to `encode`; its `duration` must equal the
+                coarse chunk size (asserted).
+            prefix_dur_s: prefix length in seconds (converted with `s2t`).
+            suffix_dur_s: suffix length in seconds (converted with `s2t`).
+            num_loops: number of sampling rounds.
+            downsample_factor: forwarded to `self.coarse.add_noise`.
+            debug: if true, print progress and call `.cpu().widget()` on
+                decoded intermediate codes (presumably a notebook audio
+                widget; confirm against audiotools).
+            **kwargs: forwarded to `self.coarse.sample`.
+
+        Returns:
+            All prefix-side pieces followed by all suffix-side pieces,
+            concatenated along the last (time) axis.
+
+        NOTE(review): when both `n_prefix` and `n_suffix` are 0, none of the
+        three branches in the loop runs, so `cz_new_prefix` / `cz_new_suffix`
+        are never assigned and the `n_to_insert` line fails.
+        """
+        z = self.encode(signal)
+        # NOTE(review): exact equality on a duration, checked only after the
+        # signal has already been encoded.
+        assert signal.duration == self.coarse.chunk_size_s, "signal duration must match coarse chunk size for now"
+        # coarse z
+        cz = z[:, : self.coarse.n_codebooks, :].clone()
+        c_seq_len = cz.shape[-1]
+        n_prefix = self.s2t(prefix_dur_s)
+        n_suffix = self.s2t(suffix_dur_s)
+        assert n_prefix + n_suffix < c_seq_len, "prefix and suffix must be smaller than the chunk size"
+        # we'll keep the final codes sequence here
+        c_vamp = {
+            'prefix': [cz[:, :, :n_prefix].clone()],
+            'suffix': [cz[:, :, c_seq_len-n_suffix:].clone()]
+        }
+        _cz = cz.clone()
+        # no mask on the first round; later rounds pass the previous mask back in
+        cz_mask = None
+        for _ in range(num_loops):
+            # add noise
+            # cz_masked is only used for the debug output below
+            cz_masked, cz_mask = self.coarse.add_noise(
+                _cz, r=0.0,
+                n_prefix=n_prefix,
+                n_suffix=n_suffix,
+                downsample_factor=downsample_factor,
+                mask=cz_mask
+            )
+            if debug:
+                print("tokens to infer")
+                self.to_signal(cz_masked).cpu().widget()
+            # sample!
+            if debug:
+                print(f"mask: {cz_mask[:,0,:]}")
+                print(f"z: {_cz[:,0,:]}")
+            cz_sampled = self.coarse.sample(
+                codec=self.codec,
+                time_steps=self.s2t(self.coarse.chunk_size_s),
+                start_tokens=_cz,
+                mask=cz_mask,
+                return_signal=False,
+                **kwargs
+            )
+            if debug:
+                print("tokens sampled")
+                self.to_signal(cz_sampled).cpu().widget()
+            # the z that was generated
+            cz_generated = cz_sampled[:, :, n_prefix:c_seq_len-n_suffix].clone()
+            n_generated = cz_generated.shape[-1]
+            # create the new prefix and suffix
+            # we'll make sure that the number of prefix and suffix
+            # tokens is the same as the original
+            # but we do want to advance the sequence as much as we can
+            if n_prefix > 0 and n_suffix > 0:
+                # we have both prefix and suffix, so we'll split the generated
+                # codes in two halves
+                # new prefix: n_prefix tokens of cz_sampled starting at index
+                # n_generated // 2
+                prefix_start_idx = n_generated // 2
+                prefix_stop_idx = prefix_start_idx + n_prefix
+                assert prefix_start_idx >= 0, "internal error"
+                # new suffix: n_suffix tokens of cz_sampled starting at the
+                # midpoint of the generated region
+                suffix_start_idx = n_prefix + n_generated // 2
+                suffix_stop_idx = suffix_start_idx + n_suffix
+                assert suffix_stop_idx <= cz_sampled.shape[-1], "internal error"
+                cz_new_prefix = cz_sampled[:, :, prefix_start_idx:prefix_stop_idx].clone()
+                cz_new_suffix = cz_sampled[:, :, suffix_start_idx:suffix_stop_idx].clone()
+                # first half of the generated codes joins the prefix side,
+                # second half the suffix side
+                c_vamp['prefix'].append(cz_generated[:,:,:n_generated//2])
+                c_vamp['suffix'].insert(0, cz_generated[:,:,n_generated//2:])
+            elif n_prefix > 0:
+                # we only have a prefix
+                # new prefix: n_prefix tokens of cz_sampled starting at index
+                # n_generated; the new suffix is empty
+                prefix_start_idx = n_generated
+                prefix_stop_idx = prefix_start_idx + n_prefix
+                cz_new_prefix = cz_sampled[:, :, prefix_start_idx:prefix_stop_idx].clone()
+                cz_new_suffix = _cz[:, :, :0].clone()
+                c_vamp['prefix'].append(cz_generated)
+            elif n_suffix > 0:
+                # we only have a suffix, so everything starting at 0 is generated
+                suffix_stop_idx = max(n_generated, n_suffix)
+                suffix_start_idx = suffix_stop_idx - n_suffix
+                cz_new_prefix = _cz[:, :, :0].clone()
+                cz_new_suffix = cz_sampled[:, :, suffix_start_idx:suffix_stop_idx].clone()
+                c_vamp['suffix'].insert(0, cz_generated)
+            # NOTE(review): no else branch; see the docstring note about
+            # n_prefix == n_suffix == 0.
+            # next round's input: [new prefix | zeros | new suffix], length c_seq_len
+            n_to_insert = c_seq_len - (cz_new_prefix.shape[-1] + cz_new_suffix.shape[-1])
+            to_insert = torch.zeros(cz_new_prefix.shape[0], cz_new_prefix.shape[1], n_to_insert).long().to(self.device)
+            _cz = torch.cat([cz_new_prefix, to_insert, cz_new_suffix], dim=-1)
+            # mark the zero-filled middle region with 1 and OR it into the mask
+            # carried to the next round (sum -> bool -> long keeps values in {0, 1})
+            to_insert_mask = torch.zeros_like(_cz).long().to(self.device)
+            to_insert_mask[:, :, cz_new_prefix.shape[-1]:cz_new_prefix.shape[-1]+n_to_insert] = 1
+            cz_mask = (cz_mask + to_insert_mask).bool().long()
+            if debug:
+                print("tokens to infer next round (area to insert in the middle)")
+                self.to_signal(_cz).cpu().widget()
+        prefix_codes = torch.cat(c_vamp['prefix'], dim=-1)
+        suffix_codes = torch.cat(c_vamp['suffix'], dim=-1)
+        c_vamp = torch.cat([prefix_codes, suffix_codes], dim=-1)
+        return c_vamp

vampnet/modules/base.py CHANGED Viewed

@@ -24,6 +24,9 @@ def gumbel_sample(t, temperature=1.0, dim=-1):
     return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim=dim)
 class VampBase(at.ml.BaseModel):
     def forward(self, x: torch.Tensor, r: torch.Tensor):
         raise NotImplementedError
@@ -36,20 +39,40 @@ class VampBase(at.ml.BaseModel):
         mask: Optional[torch.Tensor] = None,
         n_prefix: Optional[torch.Tensor] = None,
         n_suffix: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
         if mask is None:
             r = self.gamma(r)[:, None, None]
             probs = torch.ones_like(x) * r
             # if we have a prefix or suffix, set their mask prob to 0
             if n_prefix is not None:
                 for i, n in enumerate(n_prefix):
-                    probs[i, :, :n] = 0.0
             if n_suffix is not None:
                 for i, n in enumerate(n_suffix):
-                    probs[i, :, -n:] = 0.0
             mask = torch.bernoulli(probs)
             mask = mask.round().long()
@@ -347,7 +370,9 @@ class VampBase(at.ml.BaseModel):
                 if num_to_keep > 0:
                     probs = logits.softmax(dim=-1)
-                    keep_probs = F.one_hot(z, self.vocab_size)[:, :, :]
                     probs = rearrange(
                         probs, "b (t c) p -> b c t p", c=n_infer_codebooks

     return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim=dim)
+def scalar_to_batch_tensor(x, batch_size):
+    """Wrap `x` in a tensor and repeat it `batch_size` times.
+
+    For a scalar `x` this gives a 1-D tensor of length `batch_size` whose
+    entries all equal `x`.
+    """
+    return torch.tensor(x).repeat(batch_size)
 class VampBase(at.ml.BaseModel):
     def forward(self, x: torch.Tensor, r: torch.Tensor):
         raise NotImplementedError
         mask: Optional[torch.Tensor] = None,
         n_prefix: Optional[torch.Tensor] = None,
         n_suffix: Optional[torch.Tensor] = None,
+        downsample_factor: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
         if mask is None:
+            if not isinstance(r, torch.Tensor):
+                r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device)
             r = self.gamma(r)[:, None, None]
             probs = torch.ones_like(x) * r
             # if we have a prefix or suffix, set their mask prob to 0
             if n_prefix is not None:
+                if not isinstance(n_prefix, torch.Tensor):
+                    n_prefix = scalar_to_batch_tensor(n_prefix, x.shape[0]).to(x.device)
                 for i, n in enumerate(n_prefix):
+                    if n > 0:
+                        probs[i, :, :n] = 0.0
             if n_suffix is not None:
+                if not isinstance(n_suffix, torch.Tensor):
+                    n_suffix = scalar_to_batch_tensor(n_suffix, x.shape[0]).to(x.device)
                 for i, n in enumerate(n_suffix):
+                    if n > 0:
+                        probs[i, :, -n:] = 0.0
+            # if we have a downsample factor, set the mask prob to 0
+            if downsample_factor is not None:
+                if not isinstance(downsample_factor, torch.Tensor):
+                    downsample_factor = scalar_to_batch_tensor(downsample_factor, x.shape[0])
+                for i, factor in enumerate(downsample_factor):
+                    if factor == 0:
+                        continue
+                    for j in range(probs.shape[-1]):
+                        if j % factor == 0:
+                            probs[i, :, j] = 0.0
             mask = torch.bernoulli(probs)
             mask = mask.round().long()
                 if num_to_keep > 0:
                     probs = logits.softmax(dim=-1)
+                    # do mod self.vocab_size to make sure we don't sample from the mask token
+                    # in case the mask token was in the og z
+                    keep_probs = F.one_hot(z%self.vocab_size, self.vocab_size)[:, :, :]
                     probs = rearrange(
                         probs, "b (t c) p -> b c t p", c=n_infer_codebooks