Hugo Flores Garcia committed
Commit d98455c · 1 Parent(s): c068a29

cleanup
Files changed:
- scripts/exp/train.py +0 -6
- scripts/utils/vamp_folder.py +51 -81
- vampnet/interface.py +1 -1
- vampnet/modules/transformer.py +33 -248
scripts/exp/train.py
CHANGED
@@ -342,8 +342,6 @@ def train(
         dtype = torch.bfloat16 if accel.amp else None
         with accel.autocast(dtype=dtype):
             z_hat = model(z_mask_latent, r)
-            # for mask mode
-            z_hat = vn.add_truth_to_logits(z, z_hat, mask)
 
         target = codebook_flatten(
             z[:, vn.n_conditioning_codebooks :, :],
@@ -414,8 +412,6 @@ def train(
         z_mask_latent = vn.embedding.from_codes(z_mask, codec)
 
         z_hat = model(z_mask_latent, r)
-        # for mask mode
-        z_hat = vn.add_truth_to_logits(z, z_hat, mask)
 
         target = codebook_flatten(
             z[:, vn.n_conditioning_codebooks :, :],
@@ -573,8 +569,6 @@ def train(
         z_mask_latent = vn.embedding.from_codes(z_mask, codec)
 
         z_hat = model(z_mask_latent, r)
-        # for mask mode
-        z_hat = vn.add_truth_to_logits(z, z_hat, mask)
 
         z_pred = torch.softmax(z_hat, dim=1).argmax(dim=1)
         z_pred = codebook_unflatten(z_pred, n_c=vn.n_predict_codebooks)
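The call deleted here is `VampNet.add_truth_to_logits`, which this commit also removes from `vampnet/modules/transformer.py` (last file below). It overwrote the model's logits with one-hot ground truth at unmasked positions before the loss was computed, so after this change the loss sees the raw model logits everywhere. A simplified, self-contained sketch of what the removed step did (the conditioning-codebook slicing from the original is omitted; the toy tensors and the explicit `vocab_size` argument are illustrative, not from the repo):

import torch
import torch.nn.functional as F
from einops import rearrange

def add_truth_to_logits(z_true, z_hat, mask, vocab_size):
    # keep model logits where mask == 1, substitute one-hot ground truth where mask == 0
    truth = F.one_hot(z_true, vocab_size)                       # (b, c, t, vocab)
    mask = mask[:, :, :, None].expand(-1, -1, -1, vocab_size)   # broadcast over vocab
    z_hat = rearrange(z_hat, "b p (t c) -> b c t p", c=z_true.shape[1])
    z_hat = z_hat * mask + truth * (1 - mask)
    return rearrange(z_hat, "b c t p -> b p (t c)")

# toy example: batch 1, 2 codebooks, 4 timesteps, vocabulary of 8
z_true = torch.randint(0, 8, (1, 2, 4))
z_hat = torch.randn(1, 8, 2 * 4)
mask = torch.randint(0, 2, (1, 2, 4))
print(add_truth_to_logits(z_true, z_hat, mask, vocab_size=8).shape)  # torch.Size([1, 8, 8])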
scripts/utils/vamp_folder.py
CHANGED
@@ -9,10 +9,13 @@ from tqdm import tqdm
 import torch
 
 from vampnet.interface import Interface
+from vampnet import mask as pmask
 import audiotools as at
 
 Interface: Interface = argbind.bind(Interface)
 
+
+
 def calculate_bitrate(
     interface, num_codebooks,
     downsample_factor
@@ -38,29 +41,19 @@ def coarse2fine(sig, interface):
     z = interface.coarse_to_fine(z)
     return interface.to_signal(z)
 
-def coarse2fine_argmax(sig, interface):
-    z = interface.encode(sig)
-    z = z[:, :interface.c2f.n_conditioning_codebooks, :]
-
-    z = interface.coarse_to_fine(z,
-        sample="argmax", sampling_steps=1,
-        temperature=1.0
-    )
-    return interface.to_signal(z)
-
 class CoarseCond:
 
-    def __init__(self,
-        self.
+    def __init__(self, num_conditioning_codebooks, downsample_factor):
+        self.num_conditioning_codebooks = num_conditioning_codebooks
         self.downsample_factor = downsample_factor
 
     def __call__(self, sig, interface):
-
-
-
-
-        )
+        z = interface.encode(sig)
+        mask = pmask.full_mask(z)
+        mask = pmask.codebook_unmask(mask, self.num_conditioning_codebooks)
+        mask = pmask.periodic_mask(mask, self.downsample_factor)
 
+        zv = interface.coarse_vamp(z, mask)
         zv = interface.coarse_to_fine(zv)
         return interface.to_signal(zv)
 
@@ -97,24 +90,24 @@ def opus(sig, interface, bitrate=128):
 
 def mask_ratio_1_step(ratio=1.0):
     def wrapper(sig, interface):
-
-
-
+        z = interface.encode(sig)
+        mask = pmask.linear_random(z, ratio)
         zv = interface.coarse_vamp(
-
-
+            z,
+            mask,
             sampling_steps=1,
-            intensity=intensity
         )
 
         return interface.to_signal(zv)
     return wrapper
 
def num_sampling_steps(num_steps=1):
-    def wrapper(sig, interface):
+    def wrapper(sig, interface: Interface):
+        z = interface.encode(sig)
+        mask = pmask.periodic_mask(z, 16)
         zv = interface.coarse_vamp(
-
-
+            z,
+            mask,
             sampling_steps=num_steps,
         )
 
@@ -130,9 +123,9 @@ def beat_mask(ctx_time):
             after_beat_s=ctx_time,
             invert=True
         )
+        z = interface.encode(sig)
         zv = interface.coarse_vamp(
-
-            ext_mask=beat_mask,
+            z, beat_mask,
         )
 
         zv = interface.coarse_to_fine(zv)
@@ -140,17 +133,28 @@ def beat_mask(ctx_time):
     return wrapper
 
 def inpaint(ctx_time):
-    def wrapper(sig, interface):
-
-
-        prefix_dur_s=ctx_time,
-        suffix_dur_s=ctx_time,
-        )
+    def wrapper(sig, interface: Interface):
+        z = interface.encode(sig)
+        mask = pmask.inpaint(z, interface.s2t(ctx_time), interface.s2t(ctx_time))
 
+        zv = interface.coarse_vamp(z, mask)
         zv = interface.coarse_to_fine(zv)
+
         return interface.to_signal(zv)
     return wrapper
 
+def token_noise(noise_amt):
+    def wrapper(sig, interface: Interface):
+        z = interface.encode(sig)
+        mask = pmask.random(z, noise_amt)
+        z = torch.where(
+            mask,
+            torch.randint_like(z, 0, interface.coarse.vocab_size),
+            z
+        )
+        return interface.to_signal(z)
+    return wrapper
+
 EXP_REGISTRY = {}
 
 EXP_REGISTRY["gen-compression"] = {
@@ -158,62 +162,27 @@ EXP_REGISTRY["gen-compression"] = {
     "reconstructed": reconstructed,
     "coarse2fine": coarse2fine,
     **{
-        f"{n}_codebooks_downsampled_{x}x": CoarseCond(
+        f"{n}_codebooks_downsampled_{x}x": CoarseCond(num_conditioning_codebooks=n, downsample_factor=x)
         for (n, x) in (
-            (
-            (2, 2), # 2 codebooks, downsampled 2x
-            (1, None), # 1 codebook, no downsampling
+            (1, 1), # 1 codebook, no downsampling
             (4, 4), # 4 codebooks, downsampled 4x
-            (1, 2), # 1 codebook, downsampled 2x,
-            (4, 6), # 4 codebooks, downsampled 6x
-            (4, 8), # 4 codebooks, downsampled 8x
             (4, 16), # 4 codebooks, downsampled 16x
             (4, 32), # 4 codebooks, downsampled 16x
         )
     },
+    **{
+        f"token_noise_{x}": mask_ratio_1_step(ratio=x)
+        for x in [0.25, 0.5, 0.75]
+    },
 
 }
 
-EXP_REGISTRY["opus-jazzpop"] = {
-    f"opus_{bitrate}": lambda sig, interface: opus(sig, interface, bitrate=bitrate)
-    for bitrate in [5620, 1875, 1250, 625]
-}
-
-EXP_REGISTRY["opus-spotdl"] = {
-    f"opus_{bitrate}": lambda sig, interface: opus(sig, interface, bitrate=bitrate)
-    for bitrate in [8036, 2296, 1148, 574]
-}
-
-EXP_REGISTRY["opus-baseline"] = {
-    f"opus_{bitrate}": lambda sig, interface: opus(sig, interface, bitrate=bitrate)
-    for bitrate in [8000, 12000, 16000]
-}
-
-EXP_REGISTRY["c2f"] = {
-    "baseline": baseline,
-    "reconstructed": reconstructed,
-    "coarse2fine": coarse2fine,
-    "coarse2fine_argmax": coarse2fine_argmax,
-}
-
-EXP_REGISTRY["token-noise"] = {
-    f"token_noise_{r}": token_noise(r) for r in [0.25, 0.5, 0.75, 1.0]
-}
-
-EXP_REGISTRY["mask-ratio"] = {
-    "codec": reconstructed,
-    **{f"mask_ratio_{r}": mask_ratio_1_step(r) for r in [0.25, 0.5, 0.75, 0.9]}
-}
 
 EXP_REGISTRY["sampling-steps"] = {
-    "codec": reconstructed,
-    **{f"steps_{n}": num_sampling_steps(n) for n in [1, 4, 12,
+    # "codec": reconstructed,
+    **{f"steps_{n}": num_sampling_steps(n) for n in [1, 4, 12, 36, 64, 72]},
 }
 
-EXP_REGISTRY["baseline"] = {
-    "baseline": baseline,
-    "codec": reconstructed,
-}
 
 EXP_REGISTRY["musical-sampling"] = {
     "baseline": baseline,
@@ -226,12 +195,13 @@ EXP_REGISTRY["musical-sampling"] = {
 @argbind.bind(without_prefix=True)
 def main(
     sources=[
-        "/
+        "/media/CHONK/hugo/spotdl/audio-test",
     ],
     output_dir: str = "./samples",
-    max_excerpts: int =
-    exp_type: str = "
+    max_excerpts: int = 2000,
+    exp_type: str = "gen-compression",
     seed: int = 0,
+    ext: str = [".mp3"],
 ):
     at.util.seed(seed)
     interface = Interface()
@@ -241,7 +211,7 @@ def main(
 
     from audiotools.data.datasets import AudioLoader, AudioDataset
 
-    loader = AudioLoader(sources=sources, shuffle_state=seed)
+    loader = AudioLoader(sources=sources, shuffle_state=seed, ext=ext)
     dataset = AudioDataset(loader,
         sample_rate=interface.codec.sample_rate,
         duration=interface.coarse.chunk_size_s,
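The rewritten wrappers in this file all share one pattern: encode the signal, build a mask with the new `vampnet.mask` helpers (imported as `pmask`), regenerate the masked coarse tokens, then refine and decode. A minimal sketch of that pattern assembled from the hunks above; the function name and the example `downsample_factor` are illustrative, and the `codebook_unmask` step used by `CoarseCond` is omitted here:

from vampnet.interface import Interface
from vampnet import mask as pmask

def periodic_vamp(sig, interface: Interface, downsample_factor: int = 16):
    # encode audio into codec tokens
    z = interface.encode(sig)
    # start fully masked, then keep a periodic subset of timesteps as conditioning
    # (mirrors what CoarseCond does above)
    mask = pmask.full_mask(z)
    mask = pmask.periodic_mask(mask, downsample_factor)
    # regenerate the masked tokens with the coarse model, refine, and decode
    zv = interface.coarse_vamp(z, mask)
    zv = interface.coarse_to_fine(zv)
    return interface.to_signal(zv)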
vampnet/interface.py
CHANGED
@@ -321,7 +321,7 @@ class Interface(torch.nn.Module):
         cz_masked, mask = apply_mask(cz, mask, self.coarse.mask_token)
         cz_masked = cz_masked[:, : self.coarse.n_codebooks, :]
 
-        gen_fn = gen_fn or self.coarse.
+        gen_fn = gen_fn or self.coarse.generate
         c_vamp = gen_fn(
             codec=self.codec,
             time_steps=cz.shape[-1],
vampnet/modules/transformer.py
CHANGED
@@ -572,173 +572,13 @@ class VampNet(at.ml.BaseModel):
 
         return signal
 
-    def add_truth_to_logits(
-        self,
-        z_true,
-        z_hat,
-        mask,
-    ):
-        z_true = z_true[:, self.n_conditioning_codebooks :, :]
-        mask = mask[:, self.n_conditioning_codebooks :, :]
-
-        truth = F.one_hot(z_true, self.vocab_size)
-        mask = mask[:, :, :, None].expand(-1, -1, -1, self.vocab_size)
-        z_hat = rearrange(
-            z_hat,
-            "b p (t c) -> b c t p",
-            c=self.n_codebooks - self.n_conditioning_codebooks,
-        )
-
-        z_hat = z_hat * mask + truth * (1 - mask)
-
-        z_hat = rearrange(z_hat, "b c t p -> b p (t c)")
-
-        return z_hat
-
-
-    @torch.no_grad()
-    def sample(
-        self,
-        codec,
-        time_steps: int = 300,
-        sampling_steps: int = 36,
-        start_tokens: Optional[torch.Tensor] = None,
-        mask: Optional[torch.Tensor] = None,
-        temperature: Union[float, Tuple[float, float]] = 0.8,
-        top_k: int = None,
-        sample: str = "gumbel",
-        typical_filtering=True,
-        typical_mass=0.2,
-        typical_min_tokens=1,
-        return_signal=True,
-    ):
-        if isinstance(temperature, float):
-            temperature = torch.tensor(temperature).repeat(sampling_steps)
-        elif isinstance(temperature, tuple):
-            assert len(temperature) == 2
-            l, h = temperature
-            temperature = torch.linspace(l, h, sampling_steps)
-        else:
-            raise TypeError(f"invalid type for temperature")
-
-        z = start_tokens
-
-        if z is None:
-            z = torch.full((1, self.n_codebooks, time_steps), self.mask_token).to(
-                self.device
-            )
-
-        if mask is None:
-            mask = torch.ones_like(z).to(self.device).int()
-            mask[:, : self.n_conditioning_codebooks, :] = 0.0
-        if mask.ndim == 2:
-            mask = mask[:, None, :].repeat(1, z.shape[1], 1)
-
-        # figure out which timesteps we're keeping
-        keep_mask = 1 - mask
-
-        # any conditioning codebook levels need to be in the keep mask
-        # if self.n_conditioning_codebooks > 0:
-        #     cond_mask = torch.ones(z.shape[0], self.n_conditioning_codebooks, z.shape[-1]).to(z.device)
-        #     keep_mask = torch.cat([cond_mask, keep_mask], dim=1)
-
-        # flatten
-        keep_mask = codebook_flatten(keep_mask)
-
-        # our r steps
-        r_steps = torch.linspace(0, 1, sampling_steps + 1)[1:].to(self.device)
-
-        # how many tokens did we keep on init?
-        num_kept_on_init = keep_mask.sum()
-
-        # how many codebooks are we inferring vs conditioning on?
-        n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
-
-        for i in range(sampling_steps):
-            # our current temperature
-            tmpt = temperature[i]
-
-            # our current schedule step
-            r = r_steps[i : i + 1]
-
-            with torch.inference_mode():
-                # mask our z
-                keep_mask_unflat = codebook_unflatten(keep_mask, n_c=self.n_codebooks)
-                z_masked = z.masked_fill(~keep_mask_unflat.bool(), self.mask_token)
-
-                # get latents
-                latents = self.embedding.from_codes(z_masked, codec)
-
-                # infer from latents
-                logits = self.forward(latents, r)
-                logits = logits.permute(0, 2, 1)  # b, seq, prob
-
-                # the schedule determines how many samples to keep
-                num_tokens_to_infer = (z.shape[-1] * z.shape[-2]) - num_kept_on_init
-                num_to_keep = num_kept_on_init + int(
-                    num_tokens_to_infer * (_gamma(1 - r))
-                )
-
-                # figure out which logits we wanna keep
-                if num_to_keep > 0:
-                    probs = logits.softmax(dim=-1)
-
-                    # do mod self.vocab_size to make sure we don't sample from the mask token
-                    # in case the mask token was in the og z
-                    keep_probs = F.one_hot(z%self.vocab_size, self.vocab_size)[:, :, :]
-
-                    probs = rearrange(
-                        probs, "b (t c) p -> b c t p", c=n_infer_codebooks
-                    )
-                    probs = torch.cat(
-                        [keep_probs[:, : self.n_conditioning_codebooks, ...], probs],
-                        dim=1,
-                    )
-
-                    keep_probs = rearrange(
-                        keep_probs, "b c t p -> b (t c) p", c=self.n_codebooks
-                    )
-                    probs = rearrange(probs, "b c t p -> b (t c) p", c=self.n_codebooks)
-
-                    keep_prob_mask = keep_mask.unsqueeze(-1).repeat(
-                        1, 1, self.vocab_size
-                    )
-                    probs = (keep_prob_mask.long() * keep_probs) + (
-                        1 - keep_prob_mask.long()
-                    ) * probs
-
-                    highest_probs = probs.max(dim=-1, keepdim=False)[0]
-                    v, _ = highest_probs.topk(num_to_keep, dim=-1)
-
-                    keep_mask = torch.ones_like(keep_mask).bool().clone()
-                    keep_mask[highest_probs < v[..., [-1]]] = 0
-
-                    logits = torch.log(probs)
-
-                z_inferred = sample_from_logits(
-                    logits=logits,
-                    top_k=top_k,
-                    temperature=tmpt,
-                    sample=sample,
-                    typical_filtering=typical_filtering,
-                    typical_mass=typical_mass,
-                    typical_min_tokens=typical_min_tokens,
-                )
-
-                z = codebook_unflatten(z_inferred, n_c=self.n_codebooks)
-
-
-        if return_signal:
-            return self.to_signal(z, codec)
-        else:
-            return z
 
     @torch.no_grad()
     def generate(
         self,
         codec,
         time_steps: int = 300,
-        sampling_steps: int =
+        sampling_steps: int = 24,
         start_tokens: Optional[torch.Tensor] = None,
         mask: Optional[torch.Tensor] = None,
         temperature: Union[float, Tuple[float, float]] = 8.0,
@@ -747,7 +587,7 @@ class VampNet(at.ml.BaseModel):
         typical_min_tokens=1,
         return_signal=True,
     ):
-        logging.
+        logging.debug(f"beginning generation with {sampling_steps} steps")
 
         #####################
         # resolve temperature #
@@ -761,7 +601,7 @@ class VampNet(at.ml.BaseModel):
         else:
             raise TypeError(f"invalid type for temperature")
 
-        logging.
+        logging.debug(f"temperature: {temperature}")
 
 
         #####################
@@ -774,7 +614,7 @@ class VampNet(at.ml.BaseModel):
                 self.device
             )
 
-        logging.
+        logging.debug(f"created z with shape {z.shape}")
 
 
         #################
@@ -788,7 +628,7 @@ class VampNet(at.ml.BaseModel):
             mask = mask[:, None, :].repeat(1, z.shape[1], 1)
         # init_mask = mask.clone()
 
-        logging.
+        logging.debug(f"created mask with shape {mask.shape}")
 
 
         ###########
@@ -796,38 +636,38 @@ class VampNet(at.ml.BaseModel):
         ##########
        # apply the mask to z
         z_masked = z.masked_fill(mask.bool(), self.mask_token)
-        # logging.
+        # logging.debug(f"z_masked: {z_masked}")
 
        # how many mask tokens to begin with?
         num_mask_tokens_at_start = (z_masked == self.mask_token).sum()
-        logging.
+        logging.debug(f"num mask tokens at start: {num_mask_tokens_at_start}")
 
        # our r steps
         r_steps = torch.linspace(1e-10, 1, sampling_steps+1)[1:].to(self.device)
-        logging.
+        logging.debug(f"r steps: {r_steps}")
 
        # how many codebooks are we inferring vs conditioning on?
         n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
-        logging.
+        logging.debug(f"n infer codebooks: {n_infer_codebooks}")
 
         #################
         # begin sampling #
         #################
 
         for i in range(sampling_steps):
-            logging.
+            logging.debug(f"step {i} of {sampling_steps}")
 
            # our current temperature
             tmpt = temperature[i]
-            logging.
+            logging.debug(f"temperature: {tmpt}")
 
            # our current schedule step
             r = r_steps[i : i + 1]
-            logging.
+            logging.debug(f"r: {r}")
 
            # get latents
             latents = self.embedding.from_codes(z_masked, codec)
-            logging.
+            logging.debug(f"computed latents with shape: {latents.shape}")
 
 
            # infer from latents
@@ -841,12 +681,12 @@ class VampNet(at.ml.BaseModel):
             )
 
 
-            logging.
+            logging.debug(f"permuted logits with shape: {logits.shape}")
 
 
            # logits2probs
             probs = torch.softmax(logits, dim=-1)
-            logging.
+            logging.debug(f"computed probs with shape: {probs.shape}")
 
 
            # sample from logits with multinomial sampling
@@ -857,7 +697,7 @@ class VampNet(at.ml.BaseModel):
 
             sampled_z = rearrange(sampled_z, "(b seq)-> b seq", b=b)
             probs = rearrange(probs, "(b seq) prob -> b seq prob", b=b)
-            logging.
+            logging.debug(f"sampled z with shape: {sampled_z.shape}")
 
 
            # flatten z_masked and mask, so we can deal with the sampling logic
@@ -868,12 +708,12 @@ class VampNet(at.ml.BaseModel):
             mask = (z_masked == self.mask_token).int()
 
            # update the mask, remove conditioning codebooks from the mask
-            logging.
+            logging.debug(f"updated mask with shape: {mask.shape}")
            # add z back into sampled z where the mask was false
             sampled_z = torch.where(
                 mask.bool(), sampled_z, z_masked
             )
-            logging.
+            logging.debug(f"added z back into sampled z with shape: {sampled_z.shape}")
 
 
            # get the confidences: which tokens did we sample?
@@ -891,7 +731,7 @@ class VampNet(at.ml.BaseModel):
 
            # get the num tokens to mask, according to the schedule
             num_to_mask = torch.floor(_gamma(r) * num_mask_tokens_at_start).unsqueeze(1).long()
-            logging.
+            logging.debug(f"num to mask: {num_to_mask}")
 
             num_to_mask = torch.maximum(
                 torch.tensor(1),
@@ -911,17 +751,17 @@ class VampNet(at.ml.BaseModel):
             z_masked = torch.where(
                 mask.bool(), self.mask_token, sampled_z
             )
-            logging.
+            logging.debug(f"updated z_masked with shape: {z_masked.shape}")
 
             z_masked = codebook_unflatten(z_masked, n_infer_codebooks)
             mask = codebook_unflatten(mask, n_infer_codebooks)
-            logging.
+            logging.debug(f"unflattened z_masked with shape: {z_masked.shape}")
 
            # add conditioning codebooks back to z_masked
             z_masked = torch.cat(
                 (z[:, :self.n_conditioning_codebooks, :], z_masked), dim=1
             )
-            logging.
+            logging.debug(f"added conditioning codebooks back to z_masked with shape: {z_masked.shape}")
 
 
        # add conditioning codebooks back to sampled_z
@@ -930,7 +770,7 @@ class VampNet(at.ml.BaseModel):
             (z[:, :self.n_conditioning_codebooks, :], sampled_z), dim=1
         )
 
-        logging.
+        logging.debug(f"finished sampling")
 
         if return_signal:
             return self.to_signal(sampled_z, codec)
@@ -945,28 +785,28 @@ def mask_by_random_topk(num_to_mask: int, probs: torch.Tensor, temperature: floa
        probs (torch.Tensor): probabilities for each sampled event, shape (batch, seq)
        temperature (float, optional): temperature. Defaults to 1.0.
     """
-    logging.
-    logging.
-    logging.
-    logging.
-    logging.
+    logging.debug(f"masking by random topk")
+    logging.debug(f"num to mask: {num_to_mask}")
+    logging.debug(f"probs shape: {probs.shape}")
+    logging.debug(f"temperature: {temperature}")
+    logging.debug("")
 
     confidence = torch.log(probs) + temperature * gumbel_noise_like(probs)
-    logging.
+    logging.debug(f"confidence shape: {confidence.shape}")
 
     sorted_confidence, sorted_idx = confidence.sort(dim=-1)
-    logging.
-    logging.
+    logging.debug(f"sorted confidence shape: {sorted_confidence.shape}")
+    logging.debug(f"sorted idx shape: {sorted_idx.shape}")
 
    # get the cut off threshold, given the mask length
     cut_off = torch.take_along_dim(
         sorted_confidence, num_to_mask, axis=-1
     )
-    logging.
+    logging.debug(f"cut off shape: {cut_off.shape}")
 
    # mask out the tokens
     mask = confidence < cut_off
-    logging.
+    logging.debug(f"mask shape: {mask.shape}")
 
     return mask
 
@@ -999,61 +839,6 @@ def typical_filter(
     logits = rearrange(x_flat, "(b t) l -> b t l", t=nt)
     return logits
 
-def sample_from_logits(
-    logits,
-    top_k: int = None,
-    temperature: float = 1.0,
-    sample: str = "multinomial",
-    typical_filtering=False,
-    typical_mass=0.2,
-    typical_min_tokens=1,
-):
-    # add temperature
-    logits = logits / temperature
-
-    # add topk
-    if top_k is not None and typical_filtering == False:
-        v, topk_idx = logits.topk(top_k)
-        logits[logits < v[..., [-1]]] = -float("inf")
-
-    if typical_filtering:
-        assert top_k is None
-        nb, nt, _ = logits.shape
-        x_flat = rearrange(logits, "b t l -> (b t ) l")
-        x_flat_norm = torch.nn.functional.log_softmax(x_flat, dim=-1)
-        x_flat_norm_p = torch.exp(x_flat_norm)
-        entropy = -(x_flat_norm * x_flat_norm_p).nansum(-1, keepdim=True)
-
-        c_flat_shifted = torch.abs((-x_flat_norm) - entropy)
-        c_flat_sorted, x_flat_indices = torch.sort(c_flat_shifted, descending=False)
-        x_flat_cumsum = (
-            x_flat.gather(-1, x_flat_indices).softmax(dim=-1).cumsum(dim=-1)
-        )
-
-        last_ind = (x_flat_cumsum < typical_mass).sum(dim=-1)
-        sorted_indices_to_remove = c_flat_sorted > c_flat_sorted.gather(
-            1, last_ind.view(-1, 1)
-        )
-        if typical_min_tokens > 1:
-            sorted_indices_to_remove[..., :typical_min_tokens] = 0
-        indices_to_remove = sorted_indices_to_remove.scatter(
-            1, x_flat_indices, sorted_indices_to_remove
-        )
-        x_flat = x_flat.masked_fill(indices_to_remove, -float("Inf"))
-        logits = rearrange(x_flat, "(b t) l -> b t l", t=nt)
-
-    if sample == "multinomial":
-        probs = torch.softmax(logits, dim=-1)
-        inferred = torch.stack([pr.multinomial(1).squeeze(-1) for pr in probs])
-    elif sample == "argmax":
-        inferred = torch.softmax(logits, dim=-1).argmax(dim=-1)
-    elif sample == "gumbel":
-        inferred = gumbel_sample(logits, dim=-1)
-    else:
-        raise ValueError(f"invalid sampling method: {sample}")
-
-    return inferred
-
 
 if __name__ == "__main__":
     # import argbind
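With `sample` and `sample_from_logits` gone, `VampNet.generate` is the single sampling entry point left in this module, and `Interface.coarse_vamp` now falls back to it (see the `vampnet/interface.py` hunk above). A sketch of a direct call, using only keyword arguments visible in the signature above; `model`, `codec`, and the token tensor `z` are assumed to exist already, and the mask choice is a hypothetical example:

import torch

# Assumed to be already constructed elsewhere: `model` (a loaded VampNet), its `codec`,
# and `z`, a (batch, n_codebooks, time) tensor of codec tokens.
mask = torch.ones_like(z).int()                  # 1 = regenerate, 0 = keep (here: resample everything)
mask[:, :model.n_conditioning_codebooks, :] = 0  # never resample conditioning codebooks

with torch.no_grad():
    out = model.generate(
        codec=codec,
        time_steps=z.shape[-1],
        sampling_steps=24,       # the new default set in this commit
        start_tokens=z,
        mask=mask,
        temperature=8.0,         # default from the signature above
        return_signal=True,      # decode via to_signal instead of returning tokens
    )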