Spaces:
Sleeping
Sleeping
Hugo Flores Garcia
committed on
Commit
·
3f6f517
1
Parent(s):
75a7169
critical sampling fix, two demos for comparing old and new sampling
Browse files- conf/generated/bulgarian-tv-choir/c2f.yml +15 -0
- conf/generated/bulgarian-tv-choir/coarse.yml +8 -0
- conf/generated/bulgarian-tv-choir/interface.yml +7 -0
- conf/generated/panchos/c2f.yml +15 -0
- conf/generated/panchos/coarse.yml +8 -0
- conf/generated/panchos/interface.yml +7 -0
- conf/generated/titi-monkey/c2f.yml +15 -0
- conf/generated/titi-monkey/coarse.yml +8 -0
- conf/generated/titi-monkey/interface.yml +7 -0
- conf/interface/spotdl.yml +1 -1
- demo-new.py +518 -0
- demo.py +65 -5
- scripts/exp/train.py +6 -12
- scripts/utils/augment.py +53 -0
- vampnet/interface.py +46 -32
- vampnet/mask.py +1 -1
- vampnet/modules/transformer.py +288 -32
conf/generated/bulgarian-tv-choir/c2f.yml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$include:
|
2 |
+
- conf/lora/lora.yml
|
3 |
+
AudioDataset.duration: 3.0
|
4 |
+
AudioDataset.loudness_cutoff: -40.0
|
5 |
+
VampNet.embedding_dim: 1280
|
6 |
+
VampNet.n_codebooks: 14
|
7 |
+
VampNet.n_conditioning_codebooks: 4
|
8 |
+
VampNet.n_heads: 20
|
9 |
+
VampNet.n_layers: 16
|
10 |
+
fine_tune: true
|
11 |
+
fine_tune_checkpoint: ./models/spotdl/c2f.pth
|
12 |
+
save_path: ./runs/bulgarian-tv-choir/c2f
|
13 |
+
train/AudioLoader.sources: &id001
|
14 |
+
- /media/CHONK/hugo/loras/bulgarian-female-tv-choir/
|
15 |
+
val/AudioLoader.sources: *id001
|
conf/generated/bulgarian-tv-choir/coarse.yml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$include:
|
2 |
+
- conf/lora/lora.yml
|
3 |
+
fine_tune: true
|
4 |
+
fine_tune_checkpoint: ./models/spotdl/coarse.pth
|
5 |
+
save_path: ./runs/bulgarian-tv-choir/coarse
|
6 |
+
train/AudioLoader.sources: &id001
|
7 |
+
- /media/CHONK/hugo/loras/bulgarian-female-tv-choir/
|
8 |
+
val/AudioLoader.sources: *id001
|
conf/generated/bulgarian-tv-choir/interface.yml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AudioLoader.sources:
|
2 |
+
- - /media/CHONK/hugo/loras/bulgarian-female-tv-choir/
|
3 |
+
Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
|
4 |
+
Interface.coarse2fine_lora_ckpt: ./runs/bulgarian-tv-choir/c2f/latest/lora.pth
|
5 |
+
Interface.coarse_ckpt: ./models/spotdl/coarse.pth
|
6 |
+
Interface.coarse_lora_ckpt: ./runs/bulgarian-tv-choir/coarse/latest/lora.pth
|
7 |
+
Interface.codec_ckpt: ./models/spotdl/codec.pth
|
conf/generated/panchos/c2f.yml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$include:
|
2 |
+
- conf/lora/lora.yml
|
3 |
+
AudioDataset.duration: 3.0
|
4 |
+
AudioDataset.loudness_cutoff: -40.0
|
5 |
+
VampNet.embedding_dim: 1280
|
6 |
+
VampNet.n_codebooks: 14
|
7 |
+
VampNet.n_conditioning_codebooks: 4
|
8 |
+
VampNet.n_heads: 20
|
9 |
+
VampNet.n_layers: 16
|
10 |
+
fine_tune: true
|
11 |
+
fine_tune_checkpoint: ./models/spotdl/c2f.pth
|
12 |
+
save_path: ./runs/panchos/c2f
|
13 |
+
train/AudioLoader.sources: &id001
|
14 |
+
- /media/CHONK/hugo/loras/panchos/
|
15 |
+
val/AudioLoader.sources: *id001
|
conf/generated/panchos/coarse.yml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$include:
|
2 |
+
- conf/lora/lora.yml
|
3 |
+
fine_tune: true
|
4 |
+
fine_tune_checkpoint: ./models/spotdl/coarse.pth
|
5 |
+
save_path: ./runs/panchos/coarse
|
6 |
+
train/AudioLoader.sources: &id001
|
7 |
+
- /media/CHONK/hugo/loras/panchos/
|
8 |
+
val/AudioLoader.sources: *id001
|
conf/generated/panchos/interface.yml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AudioLoader.sources:
|
2 |
+
- - /media/CHONK/hugo/loras/panchos/
|
3 |
+
Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
|
4 |
+
Interface.coarse2fine_lora_ckpt: ./runs/panchos/c2f/latest/lora.pth
|
5 |
+
Interface.coarse_ckpt: ./models/spotdl/coarse.pth
|
6 |
+
Interface.coarse_lora_ckpt: ./runs/panchos/coarse/latest/lora.pth
|
7 |
+
Interface.codec_ckpt: ./models/spotdl/codec.pth
|
conf/generated/titi-monkey/c2f.yml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$include:
|
2 |
+
- conf/lora/lora.yml
|
3 |
+
AudioDataset.duration: 3.0
|
4 |
+
AudioDataset.loudness_cutoff: -40.0
|
5 |
+
VampNet.embedding_dim: 1280
|
6 |
+
VampNet.n_codebooks: 14
|
7 |
+
VampNet.n_conditioning_codebooks: 4
|
8 |
+
VampNet.n_heads: 20
|
9 |
+
VampNet.n_layers: 16
|
10 |
+
fine_tune: true
|
11 |
+
fine_tune_checkpoint: ./models/spotdl/c2f.pth
|
12 |
+
save_path: ./runs/titi-monkey/c2f
|
13 |
+
train/AudioLoader.sources: &id001
|
14 |
+
- /media/CHONK/hugo/loras/titi-monkey.mp3
|
15 |
+
val/AudioLoader.sources: *id001
|
conf/generated/titi-monkey/coarse.yml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$include:
|
2 |
+
- conf/lora/lora.yml
|
3 |
+
fine_tune: true
|
4 |
+
fine_tune_checkpoint: ./models/spotdl/coarse.pth
|
5 |
+
save_path: ./runs/titi-monkey/coarse
|
6 |
+
train/AudioLoader.sources: &id001
|
7 |
+
- /media/CHONK/hugo/loras/titi-monkey.mp3
|
8 |
+
val/AudioLoader.sources: *id001
|
conf/generated/titi-monkey/interface.yml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AudioLoader.sources:
|
2 |
+
- - /media/CHONK/hugo/loras/titi-monkey.mp3
|
3 |
+
Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
|
4 |
+
Interface.coarse2fine_lora_ckpt: ./runs/titi-monkey/c2f/latest/lora.pth
|
5 |
+
Interface.coarse_ckpt: ./models/spotdl/coarse.pth
|
6 |
+
Interface.coarse_lora_ckpt: ./runs/titi-monkey/coarse/latest/lora.pth
|
7 |
+
Interface.codec_ckpt: ./models/spotdl/codec.pth
|
conf/interface/spotdl.yml
CHANGED
@@ -7,6 +7,6 @@ Interface.coarse2fine_chunk_size_s: 3
|
|
7 |
|
8 |
|
9 |
AudioLoader.sources:
|
10 |
-
# - /media/CHONK/hugo/spotdl/subsets/jazz-blues
|
11 |
- /media/CHONK/null
|
12 |
|
|
|
7 |
|
8 |
|
9 |
AudioLoader.sources:
|
10 |
+
# - /media/CHONK/hugo/spotdl/subsets/jazz-blues/
|
11 |
- /media/CHONK/null
|
12 |
|
demo-new.py
ADDED
@@ -0,0 +1,518 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import Tuple
|
3 |
+
import yaml
|
4 |
+
import tempfile
|
5 |
+
import uuid
|
6 |
+
from dataclasses import dataclass, asdict
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import audiotools as at
|
10 |
+
import argbind
|
11 |
+
|
12 |
+
import gradio as gr
|
13 |
+
from vampnet.interface import Interface
|
14 |
+
from vampnet import mask as pmask
|
15 |
+
|
16 |
+
import logging
|
17 |
+
logger = logging.getLogger()
|
18 |
+
logger.setLevel(logging.CRITICAL)
|
19 |
+
|
20 |
+
Interface = argbind.bind(Interface)
|
21 |
+
AudioLoader = argbind.bind(at.data.datasets.AudioLoader)
|
22 |
+
|
23 |
+
conf = argbind.parse_args()
|
24 |
+
|
25 |
+
with argbind.scope(conf):
|
26 |
+
interface = Interface()
|
27 |
+
loader = AudioLoader()
|
28 |
+
print(f"interface device is {interface.device}")
|
29 |
+
|
30 |
+
dataset = at.data.datasets.AudioDataset(
|
31 |
+
loader,
|
32 |
+
sample_rate=interface.codec.sample_rate,
|
33 |
+
duration=interface.coarse.chunk_size_s,
|
34 |
+
n_examples=5000,
|
35 |
+
without_replacement=True,
|
36 |
+
)
|
37 |
+
|
38 |
+
|
39 |
+
checkpoints = {
|
40 |
+
"spotdl": {
|
41 |
+
"coarse": "./models/spotdl/coarse.pth",
|
42 |
+
"c2f": "./models/spotdl/c2f.pth",
|
43 |
+
"codec": "./models/spotdl/codec.pth",
|
44 |
+
"full_ckpt": True
|
45 |
+
},
|
46 |
+
"berta": {
|
47 |
+
"coarse": "./models/finetuned/berta-goldman-speech/coarse.pth",
|
48 |
+
"c2f": "./models/finetuned/berta-goldman-speech/c2f.pth",
|
49 |
+
"codec": "./model/spotdl/codec.pth",
|
50 |
+
"full_ckpt": True
|
51 |
+
},
|
52 |
+
"xeno-canto-2": {
|
53 |
+
"coarse": "./models/finetuned/xeno-canto-2/coarse.pth",
|
54 |
+
"c2f": "./models/finetuned/xeno-canto-2/c2f.pth",
|
55 |
+
"codec": "./models/spotdl/codec.pth",
|
56 |
+
"full_ckpt": True
|
57 |
+
},
|
58 |
+
"panchos": {
|
59 |
+
"coarse": "./models/finetuned/panchos/coarse.pth",
|
60 |
+
"c2f": "./models/finetuned/panchos/c2f.pth",
|
61 |
+
"codec": "./models/spotdl/codec.pth",
|
62 |
+
"full_ckpt": False
|
63 |
+
},
|
64 |
+
"tv-choir": {
|
65 |
+
"coarse": "./models/finetuned/tv-choir/coarse.pth",
|
66 |
+
"c2f": "./models/finetuned/tv-choir/c2f.pth",
|
67 |
+
"codec": "./models/spotdl/codec.pth",
|
68 |
+
"full_ckpt": False
|
69 |
+
},
|
70 |
+
"titi": {
|
71 |
+
"coarse": "./models/finetuned/titi/coarse.pth",
|
72 |
+
"c2f": "./models/finetuned/titi/c2f.pth",
|
73 |
+
"codec": "./models/spotdl/codec.pth",
|
74 |
+
"full_ckpt": False
|
75 |
+
},
|
76 |
+
"titi-clean": {
|
77 |
+
"coarse": "./models/finetuned/titi-clean/coarse.pth",
|
78 |
+
"c2f": "./models/finetuned/titi-clean/c2f.pth",
|
79 |
+
"codec": "./models/spotdl/codec.pth",
|
80 |
+
"full_ckpt": False
|
81 |
+
}
|
82 |
+
}
|
83 |
+
interface.checkpoint_key = "spotdl"
|
84 |
+
|
85 |
+
|
86 |
+
OUT_DIR = Path("gradio-outputs")
|
87 |
+
OUT_DIR.mkdir(exist_ok=True, parents=True)
|
88 |
+
|
89 |
+
|
90 |
+
def load_audio(file):
|
91 |
+
print(file)
|
92 |
+
filepath = file.name
|
93 |
+
sig = at.AudioSignal.salient_excerpt(
|
94 |
+
filepath,
|
95 |
+
duration=interface.coarse.chunk_size_s
|
96 |
+
)
|
97 |
+
sig = interface.preprocess(sig)
|
98 |
+
|
99 |
+
out_dir = OUT_DIR / "tmp" / str(uuid.uuid4())
|
100 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
101 |
+
sig.write(out_dir / "input.wav")
|
102 |
+
return sig.path_to_file
|
103 |
+
|
104 |
+
|
105 |
+
def load_random_audio():
|
106 |
+
index = np.random.randint(0, len(dataset))
|
107 |
+
sig = dataset[index]["signal"]
|
108 |
+
sig = interface.preprocess(sig)
|
109 |
+
|
110 |
+
out_dir = OUT_DIR / "tmp" / str(uuid.uuid4())
|
111 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
112 |
+
sig.write(out_dir / "input.wav")
|
113 |
+
return sig.path_to_file
|
114 |
+
|
115 |
+
|
116 |
+
def _vamp(data, return_mask=False):
|
117 |
+
|
118 |
+
# if our checkpoint key is different, we need to load a new checkpoint
|
119 |
+
if data[checkpoint_key] != interface.checkpoint_key:
|
120 |
+
print(f"loading checkpoint {data[checkpoint_key]}")
|
121 |
+
interface.lora_load(
|
122 |
+
checkpoints[data[checkpoint_key]]["coarse"],
|
123 |
+
checkpoints[data[checkpoint_key]]["c2f"],
|
124 |
+
checkpoints[data[checkpoint_key]]["full_ckpt"],
|
125 |
+
)
|
126 |
+
interface.checkpoint_key = data[checkpoint_key]
|
127 |
+
|
128 |
+
out_dir = OUT_DIR / str(uuid.uuid4())
|
129 |
+
out_dir.mkdir()
|
130 |
+
sig = at.AudioSignal(data[input_audio])
|
131 |
+
#pitch shift input
|
132 |
+
sig = sig.shift_pitch(data[input_pitch_shift])
|
133 |
+
|
134 |
+
# TODO: random pitch shift of segments in the signal to prompt! window size should be a parameter, pitch shift width should be a parameter
|
135 |
+
|
136 |
+
z = interface.encode(sig)
|
137 |
+
|
138 |
+
ncc = data[n_conditioning_codebooks]
|
139 |
+
|
140 |
+
# build the mask
|
141 |
+
mask = pmask.linear_random(z, data[rand_mask_intensity])
|
142 |
+
mask = pmask.mask_and(
|
143 |
+
mask, pmask.inpaint(
|
144 |
+
z,
|
145 |
+
interface.s2t(data[prefix_s]),
|
146 |
+
interface.s2t(data[suffix_s])
|
147 |
+
)
|
148 |
+
)
|
149 |
+
mask = pmask.mask_and(
|
150 |
+
mask, pmask.periodic_mask(
|
151 |
+
z,
|
152 |
+
data[periodic_p],
|
153 |
+
data[periodic_w],
|
154 |
+
random_roll=True
|
155 |
+
)
|
156 |
+
)
|
157 |
+
if data[onset_mask_width] > 0:
|
158 |
+
mask = pmask.mask_or(
|
159 |
+
mask, pmask.onset_mask(sig, z, interface, width=data[onset_mask_width])
|
160 |
+
)
|
161 |
+
# these should be the last two mask ops
|
162 |
+
mask = pmask.dropout(mask, data[dropout])
|
163 |
+
mask = pmask.codebook_unmask(mask, ncc)
|
164 |
+
|
165 |
+
print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[init_temp]}, final temp {data[final_temp]}, use coarse2fine {data[use_coarse2fine]}")
|
166 |
+
# save the mask as a txt file
|
167 |
+
np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
|
168 |
+
|
169 |
+
# if data[topk] is not None:
|
170 |
+
# top_k = data[topk] if data[topk] > 0 else None
|
171 |
+
# else:
|
172 |
+
# top_k = None
|
173 |
+
|
174 |
+
zv, mask_z = interface.coarse_vamp(
|
175 |
+
z,
|
176 |
+
mask=mask,
|
177 |
+
sampling_steps=data[num_steps],
|
178 |
+
temperature=(data[init_temp]*10, data[final_temp]*10),
|
179 |
+
return_mask=True,
|
180 |
+
# sample=data[sampling_strategy],
|
181 |
+
typical_filtering=data[typical_filtering],
|
182 |
+
typical_mass=data[typical_mass],
|
183 |
+
typical_min_tokens=data[typical_min_tokens],
|
184 |
+
# top_k=top_k,
|
185 |
+
gen_fn=interface.coarse.generate,
|
186 |
+
)
|
187 |
+
|
188 |
+
if use_coarse2fine:
|
189 |
+
zv = interface.coarse_to_fine(zv)
|
190 |
+
|
191 |
+
sig = interface.to_signal(zv).cpu()
|
192 |
+
print("done")
|
193 |
+
|
194 |
+
|
195 |
+
|
196 |
+
sig.write(out_dir / "output.wav")
|
197 |
+
|
198 |
+
if return_mask:
|
199 |
+
mask = interface.to_signal(mask_z).cpu()
|
200 |
+
mask.write(out_dir / "mask.wav")
|
201 |
+
return sig.path_to_file, mask.path_to_file
|
202 |
+
else:
|
203 |
+
return sig.path_to_file
|
204 |
+
|
205 |
+
def vamp(data):
|
206 |
+
return _vamp(data, return_mask=True)
|
207 |
+
|
208 |
+
def api_vamp(data):
|
209 |
+
return _vamp(data, return_mask=False)
|
210 |
+
|
211 |
+
def save_vamp(data):
|
212 |
+
out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
|
213 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
214 |
+
|
215 |
+
sig_in = at.AudioSignal(data[input_audio])
|
216 |
+
sig_out = at.AudioSignal(data[output_audio])
|
217 |
+
|
218 |
+
sig_in.write(out_dir / "input.wav")
|
219 |
+
sig_out.write(out_dir / "output.wav")
|
220 |
+
|
221 |
+
_data = {
|
222 |
+
"init_temp": data[init_temp],
|
223 |
+
"final_temp": data[final_temp],
|
224 |
+
"prefix_s": data[prefix_s],
|
225 |
+
"suffix_s": data[suffix_s],
|
226 |
+
"rand_mask_intensity": data[rand_mask_intensity],
|
227 |
+
"num_steps": data[num_steps],
|
228 |
+
"notes": data[notes_text],
|
229 |
+
"periodic_period": data[periodic_p],
|
230 |
+
"periodic_width": data[periodic_w],
|
231 |
+
"n_conditioning_codebooks": data[n_conditioning_codebooks],
|
232 |
+
"use_coarse2fine": data[use_coarse2fine],
|
233 |
+
"stretch_factor": data[stretch_factor],
|
234 |
+
}
|
235 |
+
|
236 |
+
# save with yaml
|
237 |
+
with open(out_dir / "data.yaml", "w") as f:
|
238 |
+
yaml.dump(_data, f)
|
239 |
+
|
240 |
+
import zipfile
|
241 |
+
zip_path = out_dir.with_suffix(".zip")
|
242 |
+
with zipfile.ZipFile(zip_path, "w") as zf:
|
243 |
+
for file in out_dir.iterdir():
|
244 |
+
zf.write(file, file.name)
|
245 |
+
|
246 |
+
return f"saved! your save code is {out_dir.stem}", zip_path
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
with gr.Blocks() as demo:
|
251 |
+
|
252 |
+
with gr.Row():
|
253 |
+
with gr.Column():
|
254 |
+
use_coarse2fine = gr.Checkbox(
|
255 |
+
label="use coarse2fine",
|
256 |
+
value=True
|
257 |
+
)
|
258 |
+
|
259 |
+
manual_audio_upload = gr.File(
|
260 |
+
label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
|
261 |
+
file_types=["audio"]
|
262 |
+
)
|
263 |
+
load_random_audio_button = gr.Button("or load random audio")
|
264 |
+
|
265 |
+
input_audio = gr.Audio(
|
266 |
+
label="input audio",
|
267 |
+
interactive=False,
|
268 |
+
type="filepath",
|
269 |
+
)
|
270 |
+
|
271 |
+
audio_mask = gr.Audio(
|
272 |
+
label="audio mask (listen to this to hear the mask hints)",
|
273 |
+
interactive=False,
|
274 |
+
type="filepath",
|
275 |
+
)
|
276 |
+
|
277 |
+
# connect widgets
|
278 |
+
load_random_audio_button.click(
|
279 |
+
fn=load_random_audio,
|
280 |
+
inputs=[],
|
281 |
+
outputs=[ input_audio]
|
282 |
+
)
|
283 |
+
|
284 |
+
manual_audio_upload.change(
|
285 |
+
fn=load_audio,
|
286 |
+
inputs=[manual_audio_upload],
|
287 |
+
outputs=[ input_audio]
|
288 |
+
)
|
289 |
+
|
290 |
+
# mask settings
|
291 |
+
with gr.Column():
|
292 |
+
|
293 |
+
input_pitch_shift = gr.Slider(
|
294 |
+
label="input pitch shift (semitones)",
|
295 |
+
minimum=-36,
|
296 |
+
maximum=36,
|
297 |
+
step=1,
|
298 |
+
value=0,
|
299 |
+
)
|
300 |
+
|
301 |
+
rand_mask_intensity = gr.Slider(
|
302 |
+
label="random mask intensity. (If this is less than 1, scatters prompts throughout the audio, should be between 0.9 and 1.0)",
|
303 |
+
minimum=0.0,
|
304 |
+
maximum=1.0,
|
305 |
+
value=1.0
|
306 |
+
)
|
307 |
+
|
308 |
+
periodic_p = gr.Slider(
|
309 |
+
label="periodic prompt (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
|
310 |
+
minimum=0,
|
311 |
+
maximum=128,
|
312 |
+
step=1,
|
313 |
+
value=3,
|
314 |
+
)
|
315 |
+
periodic_w = gr.Slider(
|
316 |
+
label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
|
317 |
+
minimum=1,
|
318 |
+
maximum=20,
|
319 |
+
step=1,
|
320 |
+
value=1,
|
321 |
+
)
|
322 |
+
|
323 |
+
onset_mask_width = gr.Slider(
|
324 |
+
label="onset mask width (steps, 1 step ~= 10milliseconds)",
|
325 |
+
minimum=0,
|
326 |
+
maximum=20,
|
327 |
+
step=1,
|
328 |
+
value=5,
|
329 |
+
)
|
330 |
+
|
331 |
+
with gr.Accordion("extras ", open=False):
|
332 |
+
n_conditioning_codebooks = gr.Number(
|
333 |
+
label="number of conditioning codebooks. probably 0",
|
334 |
+
value=0,
|
335 |
+
precision=0,
|
336 |
+
)
|
337 |
+
|
338 |
+
stretch_factor = gr.Slider(
|
339 |
+
label="time stretch factor",
|
340 |
+
minimum=0,
|
341 |
+
maximum=64,
|
342 |
+
step=1,
|
343 |
+
value=1,
|
344 |
+
)
|
345 |
+
|
346 |
+
|
347 |
+
with gr.Accordion("prefix/suffix hints", open=False):
|
348 |
+
prefix_s = gr.Slider(
|
349 |
+
label="prefix hint length (seconds)",
|
350 |
+
minimum=0.0,
|
351 |
+
maximum=10.0,
|
352 |
+
value=0.0
|
353 |
+
)
|
354 |
+
suffix_s = gr.Slider(
|
355 |
+
label="suffix hint length (seconds)",
|
356 |
+
minimum=0.0,
|
357 |
+
maximum=10.0,
|
358 |
+
value=0.0
|
359 |
+
)
|
360 |
+
|
361 |
+
with gr.Accordion("temperature settings", open=False):
|
362 |
+
init_temp = gr.Slider(
|
363 |
+
label="initial temperature (should probably stay between 0.6 and 1)",
|
364 |
+
minimum=0.0,
|
365 |
+
maximum=1.5,
|
366 |
+
value=0.8
|
367 |
+
)
|
368 |
+
final_temp = gr.Slider(
|
369 |
+
label="final temperature (should probably stay between 0.7 and 2)",
|
370 |
+
minimum=0.0,
|
371 |
+
maximum=2.0,
|
372 |
+
value=0.8
|
373 |
+
)
|
374 |
+
|
375 |
+
with gr.Accordion("sampling settings", open=False):
|
376 |
+
sampling_strategy = gr.Radio(
|
377 |
+
label="sampling strategy",
|
378 |
+
choices=["gumbel", "multinomial"],
|
379 |
+
value="gumbel"
|
380 |
+
)
|
381 |
+
typical_filtering = gr.Checkbox(
|
382 |
+
label="typical filtering (cannot be used with topk)",
|
383 |
+
value=False
|
384 |
+
)
|
385 |
+
typical_mass = gr.Slider(
|
386 |
+
label="typical mass (should probably stay between 0.1 and 0.5)",
|
387 |
+
minimum=0.01,
|
388 |
+
maximum=0.99,
|
389 |
+
value=0.2
|
390 |
+
)
|
391 |
+
typical_min_tokens = gr.Slider(
|
392 |
+
label="typical min tokens (should probably stay between 1 and 256)",
|
393 |
+
minimum=1,
|
394 |
+
maximum=256,
|
395 |
+
step=1,
|
396 |
+
value=1
|
397 |
+
)
|
398 |
+
|
399 |
+
|
400 |
+
|
401 |
+
|
402 |
+
num_steps = gr.Slider(
|
403 |
+
label="number of steps (should normally be between 12 and 36)",
|
404 |
+
minimum=1,
|
405 |
+
maximum=128,
|
406 |
+
step=1,
|
407 |
+
value=36
|
408 |
+
)
|
409 |
+
|
410 |
+
dropout = gr.Slider(
|
411 |
+
label="mask dropout",
|
412 |
+
minimum=0.0,
|
413 |
+
maximum=1.0,
|
414 |
+
step=0.01,
|
415 |
+
value=0.0
|
416 |
+
)
|
417 |
+
|
418 |
+
|
419 |
+
# mask settings
|
420 |
+
with gr.Column():
|
421 |
+
checkpoint_key = gr.Radio(
|
422 |
+
label="checkpoint",
|
423 |
+
choices=list(checkpoints.keys()),
|
424 |
+
value="spotdl"
|
425 |
+
)
|
426 |
+
vamp_button = gr.Button("vamp!!!")
|
427 |
+
output_audio = gr.Audio(
|
428 |
+
label="output audio",
|
429 |
+
interactive=False,
|
430 |
+
type="filepath"
|
431 |
+
)
|
432 |
+
|
433 |
+
|
434 |
+
|
435 |
+
# with gr.Column():
|
436 |
+
# with gr.Accordion(label="beat unmask (how much time around the beat should be hinted?)"):
|
437 |
+
# use_beats = gr.Checkbox(
|
438 |
+
# label="use beat hints (helps the output stick to the beat structure of the input)",
|
439 |
+
# value=False
|
440 |
+
# )
|
441 |
+
|
442 |
+
# snap_to_beats = gr.Checkbox(
|
443 |
+
# label="trim to beat markers (uncheck if the output audio is too short.)",
|
444 |
+
# value=True
|
445 |
+
# )
|
446 |
+
|
447 |
+
# beat_unmask_dur = gr.Slider(
|
448 |
+
# label="duration",
|
449 |
+
# minimum=0.0,
|
450 |
+
# maximum=3.0,
|
451 |
+
# value=0.07
|
452 |
+
# )
|
453 |
+
|
454 |
+
|
455 |
+
notes_text = gr.Textbox(
|
456 |
+
label="type any notes about the generated audio here",
|
457 |
+
value="",
|
458 |
+
interactive=True
|
459 |
+
)
|
460 |
+
save_button = gr.Button("save vamp")
|
461 |
+
download_file = gr.File(
|
462 |
+
label="vamp to download will appear here",
|
463 |
+
interactive=False
|
464 |
+
)
|
465 |
+
use_as_input_button = gr.Button("use output as input")
|
466 |
+
|
467 |
+
thank_you = gr.Markdown("")
|
468 |
+
|
469 |
+
|
470 |
+
_inputs = {
|
471 |
+
input_audio,
|
472 |
+
num_steps,
|
473 |
+
init_temp, final_temp,
|
474 |
+
prefix_s, suffix_s,
|
475 |
+
rand_mask_intensity,
|
476 |
+
periodic_p, periodic_w,
|
477 |
+
n_conditioning_codebooks,
|
478 |
+
dropout,
|
479 |
+
use_coarse2fine,
|
480 |
+
stretch_factor,
|
481 |
+
onset_mask_width,
|
482 |
+
input_pitch_shift,
|
483 |
+
sampling_strategy,
|
484 |
+
typical_filtering,
|
485 |
+
typical_mass,
|
486 |
+
typical_min_tokens,
|
487 |
+
# topk,
|
488 |
+
checkpoint_key
|
489 |
+
}
|
490 |
+
|
491 |
+
# connect widgets
|
492 |
+
vamp_button.click(
|
493 |
+
fn=vamp,
|
494 |
+
inputs=_inputs,
|
495 |
+
outputs=[output_audio, audio_mask],
|
496 |
+
)
|
497 |
+
|
498 |
+
api_vamp_button = gr.Button("api vamp")
|
499 |
+
api_vamp_button.click(
|
500 |
+
fn=api_vamp,
|
501 |
+
inputs=_inputs,
|
502 |
+
outputs=[output_audio],
|
503 |
+
api_name="vamp"
|
504 |
+
)
|
505 |
+
|
506 |
+
use_as_input_button.click(
|
507 |
+
fn=lambda x: x,
|
508 |
+
inputs=[output_audio],
|
509 |
+
outputs=[input_audio]
|
510 |
+
)
|
511 |
+
|
512 |
+
save_button.click(
|
513 |
+
fn=save_vamp,
|
514 |
+
inputs=_inputs | {notes_text, output_audio},
|
515 |
+
outputs=[thank_you, download_file]
|
516 |
+
)
|
517 |
+
|
518 |
+
demo.launch(share=True, enable_queue=False, debug=True, server_name="0.0.0.0")
|
demo.py
CHANGED
@@ -32,6 +32,47 @@ dataset = at.data.datasets.AudioDataset(
|
|
32 |
)
|
33 |
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
OUT_DIR = Path("gradio-outputs")
|
36 |
OUT_DIR.mkdir(exist_ok=True, parents=True)
|
37 |
|
@@ -63,6 +104,19 @@ def load_random_audio():
|
|
63 |
|
64 |
|
65 |
def _vamp(data, return_mask=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
67 |
out_dir.mkdir()
|
68 |
sig = at.AudioSignal(data[input_audio])
|
@@ -229,8 +283,8 @@ with gr.Blocks() as demo:
|
|
229 |
|
230 |
input_pitch_shift = gr.Slider(
|
231 |
label="input pitch shift (semitones)",
|
232 |
-
minimum=-
|
233 |
-
maximum=
|
234 |
step=1,
|
235 |
value=0,
|
236 |
)
|
@@ -247,7 +301,7 @@ with gr.Blocks() as demo:
|
|
247 |
minimum=0,
|
248 |
maximum=128,
|
249 |
step=1,
|
250 |
-
value=
|
251 |
)
|
252 |
periodic_w = gr.Slider(
|
253 |
label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
|
@@ -262,7 +316,7 @@ with gr.Blocks() as demo:
|
|
262 |
minimum=0,
|
263 |
maximum=20,
|
264 |
step=1,
|
265 |
-
value=
|
266 |
)
|
267 |
|
268 |
with gr.Accordion("extras ", open=False):
|
@@ -361,6 +415,11 @@ with gr.Blocks() as demo:
|
|
361 |
|
362 |
# mask settings
|
363 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
364 |
vamp_button = gr.Button("vamp!!!")
|
365 |
output_audio = gr.Audio(
|
366 |
label="output audio",
|
@@ -423,6 +482,7 @@ with gr.Blocks() as demo:
|
|
423 |
typical_mass,
|
424 |
typical_min_tokens,
|
425 |
topk,
|
|
|
426 |
}
|
427 |
|
428 |
# connect widgets
|
@@ -452,4 +512,4 @@ with gr.Blocks() as demo:
|
|
452 |
outputs=[thank_you, download_file]
|
453 |
)
|
454 |
|
455 |
-
demo.launch(share=True, enable_queue=False, debug=True)
|
|
|
32 |
)
|
33 |
|
34 |
|
35 |
+
checkpoints = {
|
36 |
+
"spotdl": {
|
37 |
+
"coarse": "./models/spotdl/coarse.pth",
|
38 |
+
"c2f": "./models/spotdl/c2f.pth",
|
39 |
+
"codec": "./models/spotdl/codec.pth",
|
40 |
+
"full_ckpt": True
|
41 |
+
},
|
42 |
+
"berta": {
|
43 |
+
"coarse": "./models/finetuned/berta-goldman-speech/coarse.pth",
|
44 |
+
"c2f": "./models/finetuned/berta-goldman-speech/c2f.pth",
|
45 |
+
"codec": "./model/spotdl/codec.pth",
|
46 |
+
"full_ckpt": True
|
47 |
+
},
|
48 |
+
"xeno-canto-2": {
|
49 |
+
"coarse": "./models/finetuned/xeno-canto-2/coarse.pth",
|
50 |
+
"c2f": "./models/finetuned/xeno-canto-2/c2f.pth",
|
51 |
+
"codec": "./models/spotdl/codec.pth",
|
52 |
+
"full_ckpt": True
|
53 |
+
},
|
54 |
+
"panchos": {
|
55 |
+
"coarse": "./models/finetuned/panchos/coarse.pth",
|
56 |
+
"c2f": "./models/finetuned/panchos/c2f.pth",
|
57 |
+
"codec": "./models/spotdl/codec.pth",
|
58 |
+
"full_ckpt": False
|
59 |
+
},
|
60 |
+
"tv-choir": {
|
61 |
+
"coarse": "./models/finetuned/tv-choir/coarse.pth",
|
62 |
+
"c2f": "./models/finetuned/tv-choir/c2f.pth",
|
63 |
+
"codec": "./models/spotdl/codec.pth",
|
64 |
+
"full_ckpt": False
|
65 |
+
},
|
66 |
+
"titi": {
|
67 |
+
"coarse": "./models/finetuned/titi/coarse.pth",
|
68 |
+
"c2f": "./models/finetuned/titi/c2f.pth",
|
69 |
+
"codec": "./models/spotdl/codec.pth",
|
70 |
+
"full_ckpt": False
|
71 |
+
}
|
72 |
+
}
|
73 |
+
interface.checkpoint_key = "spotdl"
|
74 |
+
|
75 |
+
|
76 |
OUT_DIR = Path("gradio-outputs")
|
77 |
OUT_DIR.mkdir(exist_ok=True, parents=True)
|
78 |
|
|
|
104 |
|
105 |
|
106 |
def _vamp(data, return_mask=False):
|
107 |
+
|
108 |
+
# if our checkpoint key is different, we need to load a new checkpoint
|
109 |
+
if data[checkpoint_key] != interface.checkpoint_key:
|
110 |
+
print(f"loading checkpoint {data[checkpoint_key]}")
|
111 |
+
interface.lora_load(
|
112 |
+
checkpoints[data[checkpoint_key]]["coarse"],
|
113 |
+
checkpoints[data[checkpoint_key]]["c2f"],
|
114 |
+
checkpoints[data[checkpoint_key]]["full_ckpt"],
|
115 |
+
reset=(data[checkpoint_key] == "spotdl")
|
116 |
+
)
|
117 |
+
interface.checkpoint_key = data[checkpoint_key]
|
118 |
+
|
119 |
+
|
120 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
121 |
out_dir.mkdir()
|
122 |
sig = at.AudioSignal(data[input_audio])
|
|
|
283 |
|
284 |
input_pitch_shift = gr.Slider(
|
285 |
label="input pitch shift (semitones)",
|
286 |
+
minimum=-36,
|
287 |
+
maximum=36,
|
288 |
step=1,
|
289 |
value=0,
|
290 |
)
|
|
|
301 |
minimum=0,
|
302 |
maximum=128,
|
303 |
step=1,
|
304 |
+
value=3,
|
305 |
)
|
306 |
periodic_w = gr.Slider(
|
307 |
label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
|
|
|
316 |
minimum=0,
|
317 |
maximum=20,
|
318 |
step=1,
|
319 |
+
value=5,
|
320 |
)
|
321 |
|
322 |
with gr.Accordion("extras ", open=False):
|
|
|
415 |
|
416 |
# mask settings
|
417 |
with gr.Column():
|
418 |
+
checkpoint_key = gr.Radio(
|
419 |
+
label="checkpoint",
|
420 |
+
choices=list(checkpoints.keys()),
|
421 |
+
value="spotdl"
|
422 |
+
)
|
423 |
vamp_button = gr.Button("vamp!!!")
|
424 |
output_audio = gr.Audio(
|
425 |
label="output audio",
|
|
|
482 |
typical_mass,
|
483 |
typical_min_tokens,
|
484 |
topk,
|
485 |
+
checkpoint_key
|
486 |
}
|
487 |
|
488 |
# connect widgets
|
|
|
512 |
outputs=[thank_you, download_file]
|
513 |
)
|
514 |
|
515 |
+
demo.launch(share=True, enable_queue=False, debug=True, server_name="0.0.0.0")
|
scripts/exp/train.py
CHANGED
@@ -353,12 +353,9 @@ def train(
|
|
353 |
mask[:, vn.n_conditioning_codebooks :, :],
|
354 |
)
|
355 |
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
output["loss"] = criterion(z_hat, t_masked)
|
360 |
-
else:
|
361 |
-
output["loss"] = criterion(z_hat, target)
|
362 |
|
363 |
self._metrics(
|
364 |
vn=vn,
|
@@ -429,12 +426,9 @@ def train(
|
|
429 |
)
|
430 |
|
431 |
output = {}
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
output["loss"] = criterion(z_hat, t_masked)
|
436 |
-
else:
|
437 |
-
output["loss"] = criterion(z_hat, target)
|
438 |
|
439 |
self._metrics(
|
440 |
vn=vn,
|
|
|
353 |
mask[:, vn.n_conditioning_codebooks :, :],
|
354 |
)
|
355 |
|
356 |
+
# replace target with ignore index for masked tokens
|
357 |
+
t_masked = target.masked_fill(~flat_mask.bool(), IGNORE_INDEX)
|
358 |
+
output["loss"] = criterion(z_hat, t_masked)
|
|
|
|
|
|
|
359 |
|
360 |
self._metrics(
|
361 |
vn=vn,
|
|
|
426 |
)
|
427 |
|
428 |
output = {}
|
429 |
+
# replace target with ignore index for masked tokens
|
430 |
+
t_masked = target.masked_fill(~flat_mask.bool(), IGNORE_INDEX)
|
431 |
+
output["loss"] = criterion(z_hat, t_masked)
|
|
|
|
|
|
|
432 |
|
433 |
self._metrics(
|
434 |
vn=vn,
|
scripts/utils/augment.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
import audiotools as at
|
4 |
+
from audiotools import AudioSignal
|
5 |
+
|
6 |
+
import argbind
|
7 |
+
import tqdm
|
8 |
+
|
9 |
+
|
10 |
+
from pedalboard import (
|
11 |
+
Compressor, Gain, Chorus, LadderFilter, Phaser, Convolution, Reverb, Pedalboard
|
12 |
+
)
|
13 |
+
from pedalboard.io import AudioFile
|
14 |
+
|
15 |
+
# Read in a whole file, resampling to our desired sample rate:
|
16 |
+
samplerate = 44100.0
|
17 |
+
with AudioFile('guitar-input.wav').resampled_to(samplerate) as f:
|
18 |
+
audio = f.read(f.frames)
|
19 |
+
|
20 |
+
# Make a pretty interesting sounding guitar pedalboard:
|
21 |
+
board = Pedalboard([
|
22 |
+
Compressor(threshold_db=-50, ratio=25),
|
23 |
+
Gain(gain_db=30),
|
24 |
+
Chorus(),
|
25 |
+
LadderFilter(mode=LadderFilter.Mode.HPF12, cutoff_hz=900),
|
26 |
+
Phaser(),
|
27 |
+
Convolution("./guitar_amp.wav", 1.0),
|
28 |
+
Reverb(room_size=0.25),
|
29 |
+
])
|
30 |
+
|
31 |
+
|
32 |
+
@argbind.bind(without_prefix=True)
|
33 |
+
def augment(
|
34 |
+
audio_folder: Path,
|
35 |
+
dest_folder: Path,
|
36 |
+
n_augmentations: int = 10,
|
37 |
+
):
|
38 |
+
"""
|
39 |
+
Augment a folder of audio files by applying audiotools and pedalboard transforms.
|
40 |
+
|
41 |
+
The dest foler will contain a folder for each of the clean dataset's files.
|
42 |
+
Under each of these folders, there will be a clean file and many augmented files.
|
43 |
+
"""
|
44 |
+
|
45 |
+
audio_files = at.util.find_audio(audio_folder)
|
46 |
+
|
47 |
+
for audio_file in tqdm.tqdm(audio_files):
|
48 |
+
subtree = dest_folder / audio_file.relative_to(audio_folder).parent
|
49 |
+
subdir = subtree / audio_file.stem
|
50 |
+
subdir.mkdir(parents=True, exist_ok=True)
|
51 |
+
|
52 |
+
# apply pedalboard transforms
|
53 |
+
for i in range(n_augmentations):
|
vampnet/interface.py
CHANGED
@@ -97,17 +97,36 @@ class Interface(torch.nn.Module):
|
|
97 |
|
98 |
def lora_load(
|
99 |
self,
|
100 |
-
|
101 |
-
|
|
|
102 |
):
|
103 |
-
if
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
|
113 |
def s2t(self, seconds: float):
|
@@ -290,6 +309,7 @@ class Interface(torch.nn.Module):
|
|
290 |
z,
|
291 |
mask,
|
292 |
return_mask=False,
|
|
|
293 |
**kwargs
|
294 |
):
|
295 |
# coarse z
|
@@ -301,7 +321,8 @@ class Interface(torch.nn.Module):
|
|
301 |
cz_masked, mask = apply_mask(cz, mask, self.coarse.mask_token)
|
302 |
cz_masked = cz_masked[:, : self.coarse.n_codebooks, :]
|
303 |
|
304 |
-
|
|
|
305 |
codec=self.codec,
|
306 |
time_steps=cz.shape[-1],
|
307 |
start_tokens=cz,
|
@@ -310,8 +331,6 @@ class Interface(torch.nn.Module):
|
|
310 |
**kwargs
|
311 |
)
|
312 |
|
313 |
-
# replace the mask token in cz_masked with random tokens
|
314 |
-
# so that we can decode it
|
315 |
if return_mask:
|
316 |
return c_vamp, cz_masked
|
317 |
|
@@ -320,53 +339,48 @@ class Interface(torch.nn.Module):
|
|
320 |
|
321 |
if __name__ == "__main__":
|
322 |
import audiotools as at
|
|
|
|
|
|
|
|
|
323 |
|
324 |
interface = Interface(
|
325 |
coarse_ckpt="./models/spotdl/coarse.pth",
|
326 |
coarse2fine_ckpt="./models/spotdl/c2f.pth",
|
327 |
codec_ckpt="./models/spotdl/codec.pth",
|
328 |
-
device="
|
329 |
)
|
330 |
|
331 |
-
sig = at.AudioSignal('
|
332 |
|
333 |
z = interface.encode(sig)
|
334 |
|
335 |
-
mask = linear_random(z, 0
|
336 |
-
print(mask)
|
337 |
-
mask = mask_and(
|
338 |
-
mask, inpaint(
|
339 |
-
z,
|
340 |
-
interface.s2t(3),
|
341 |
-
interface.s2t(3)
|
342 |
-
)
|
343 |
-
)
|
344 |
-
print(mask)
|
345 |
mask = mask_and(
|
346 |
mask, periodic_mask(
|
347 |
z,
|
348 |
-
|
349 |
1,
|
350 |
random_roll=True
|
351 |
)
|
352 |
)
|
353 |
-
mask = dropout(mask, 0.0)
|
354 |
-
mask = codebook_unmask(mask, 0)
|
355 |
|
356 |
|
357 |
zv, mask_z = interface.coarse_vamp(
|
358 |
z,
|
359 |
mask=mask,
|
360 |
-
sampling_steps=
|
361 |
-
temperature=
|
362 |
-
return_mask=True
|
|
|
363 |
)
|
364 |
|
365 |
use_coarse2fine = False
|
366 |
if use_coarse2fine:
|
367 |
zv = interface.coarse_to_fine(zv)
|
368 |
|
369 |
-
print(mask_z)
|
370 |
mask = interface.to_signal(mask_z).cpu()
|
371 |
|
372 |
sig = interface.to_signal(zv).cpu()
|
|
|
97 |
|
98 |
def lora_load(
|
99 |
self,
|
100 |
+
coarse_ckpt: str = None,
|
101 |
+
c2f_ckpt: str = None,
|
102 |
+
full_ckpts: bool = False,
|
103 |
):
|
104 |
+
if full_ckpts:
|
105 |
+
if coarse_ckpt is not None:
|
106 |
+
self.coarse = _load_model(
|
107 |
+
ckpt=coarse_ckpt,
|
108 |
+
device=self.device,
|
109 |
+
chunk_size_s=self.coarse.chunk_size_s,
|
110 |
+
)
|
111 |
+
if c2f_ckpt is not None:
|
112 |
+
self.c2f = _load_model(
|
113 |
+
ckpt=c2f_ckpt,
|
114 |
+
device=self.device,
|
115 |
+
chunk_size_s=self.c2f.chunk_size_s,
|
116 |
+
)
|
117 |
+
else:
|
118 |
+
if coarse_ckpt is not None:
|
119 |
+
self.coarse.to("cpu")
|
120 |
+
state_dict = torch.load(coarse_ckpt, map_location="cpu")
|
121 |
+
|
122 |
+
self.coarse.load_state_dict(state_dict, strict=False)
|
123 |
+
self.coarse.to(self.device)
|
124 |
+
if c2f_ckpt is not None:
|
125 |
+
self.c2f.to("cpu")
|
126 |
+
state_dict = torch.load(c2f_ckpt, map_location="cpu")
|
127 |
+
|
128 |
+
self.c2f.load_state_dict(state_dict, strict=False)
|
129 |
+
self.c2f.to(self.device)
|
130 |
|
131 |
|
132 |
def s2t(self, seconds: float):
|
|
|
309 |
z,
|
310 |
mask,
|
311 |
return_mask=False,
|
312 |
+
gen_fn=None,
|
313 |
**kwargs
|
314 |
):
|
315 |
# coarse z
|
|
|
321 |
cz_masked, mask = apply_mask(cz, mask, self.coarse.mask_token)
|
322 |
cz_masked = cz_masked[:, : self.coarse.n_codebooks, :]
|
323 |
|
324 |
+
gen_fn = gen_fn or self.coarse.sample
|
325 |
+
c_vamp = gen_fn(
|
326 |
codec=self.codec,
|
327 |
time_steps=cz.shape[-1],
|
328 |
start_tokens=cz,
|
|
|
331 |
**kwargs
|
332 |
)
|
333 |
|
|
|
|
|
334 |
if return_mask:
|
335 |
return c_vamp, cz_masked
|
336 |
|
|
|
339 |
|
340 |
if __name__ == "__main__":
|
341 |
import audiotools as at
|
342 |
+
import logging
|
343 |
+
logger = logging.getLogger()
|
344 |
+
logger.setLevel(logging.INFO)
|
345 |
+
torch.set_printoptions(threshold=10000)
|
346 |
|
347 |
interface = Interface(
|
348 |
coarse_ckpt="./models/spotdl/coarse.pth",
|
349 |
coarse2fine_ckpt="./models/spotdl/c2f.pth",
|
350 |
codec_ckpt="./models/spotdl/codec.pth",
|
351 |
+
device="cuda"
|
352 |
)
|
353 |
|
354 |
+
sig = at.AudioSignal('introspection ii-1.mp3', duration=10)
|
355 |
|
356 |
z = interface.encode(sig)
|
357 |
|
358 |
+
mask = linear_random(z, 1.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
mask = mask_and(
|
360 |
mask, periodic_mask(
|
361 |
z,
|
362 |
+
32,
|
363 |
1,
|
364 |
random_roll=True
|
365 |
)
|
366 |
)
|
367 |
+
# mask = dropout(mask, 0.0)
|
368 |
+
# mask = codebook_unmask(mask, 0)
|
369 |
|
370 |
|
371 |
zv, mask_z = interface.coarse_vamp(
|
372 |
z,
|
373 |
mask=mask,
|
374 |
+
sampling_steps=36,
|
375 |
+
temperature=6.0,
|
376 |
+
return_mask=True,
|
377 |
+
# gen_fn=interface.coarse.generate
|
378 |
)
|
379 |
|
380 |
use_coarse2fine = False
|
381 |
if use_coarse2fine:
|
382 |
zv = interface.coarse_to_fine(zv)
|
383 |
|
|
|
384 |
mask = interface.to_signal(mask_z).cpu()
|
385 |
|
386 |
sig = interface.to_signal(zv).cpu()
|
vampnet/mask.py
CHANGED
@@ -6,7 +6,7 @@ from audiotools import AudioSignal
|
|
6 |
from .util import scalar_to_batch_tensor
|
7 |
|
8 |
def _gamma(r):
|
9 |
-
return (r * torch.pi / 2).cos()
|
10 |
|
11 |
def _invgamma(y):
|
12 |
if not torch.is_tensor(y):
|
|
|
6 |
from .util import scalar_to_batch_tensor
|
7 |
|
8 |
def _gamma(r):
|
9 |
+
return (r * torch.pi / 2).cos().clamp(1e-10, 1.0)
|
10 |
|
11 |
def _invgamma(y):
|
12 |
if not torch.is_tensor(y):
|
vampnet/modules/transformer.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import math
|
|
|
2 |
from typing import Optional, Tuple, Union
|
3 |
|
4 |
import numpy as np
|
@@ -19,17 +20,17 @@ from ..mask import _gamma
|
|
19 |
|
20 |
LORA_R = 8
|
21 |
|
22 |
-
def log(t, eps=1e-20):
|
23 |
-
|
24 |
|
25 |
|
26 |
-
def
|
27 |
-
noise = torch.zeros_like(t).uniform_(
|
28 |
-
return -log(-log(noise))
|
29 |
|
30 |
|
31 |
def gumbel_sample(t, temperature=1.0, dim=-1):
|
32 |
-
return ((t / max(temperature, 1e-10)) +
|
33 |
|
34 |
|
35 |
class RMSNorm(nn.Module):
|
@@ -477,23 +478,16 @@ class VampNet(at.ml.BaseModel):
|
|
477 |
self.flash_attn = flash_attn
|
478 |
self.noise_mode = noise_mode
|
479 |
|
480 |
-
|
481 |
-
special_tokens = ["MASK"]
|
482 |
-
elif noise_mode == "random":
|
483 |
-
special_tokens = None
|
484 |
-
else:
|
485 |
-
raise ValueError(f"Unknown noise mode: {noise_mode}")
|
486 |
|
487 |
self.embedding = CodebookEmbedding(
|
488 |
latent_dim=latent_dim,
|
489 |
n_codebooks=n_codebooks,
|
490 |
vocab_size=vocab_size,
|
491 |
emb_dim=embedding_dim,
|
492 |
-
special_tokens=
|
493 |
)
|
494 |
-
|
495 |
-
if noise_mode == "mask":
|
496 |
-
self.mask_token = self.embedding.special_idxs["MASK"]
|
497 |
|
498 |
self.transformer = TransformerStack(
|
499 |
d_model=embedding_dim,
|
@@ -584,23 +578,20 @@ class VampNet(at.ml.BaseModel):
|
|
584 |
z_hat,
|
585 |
mask,
|
586 |
):
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
z_hat
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
)
|
598 |
|
599 |
-
|
600 |
|
601 |
-
|
602 |
-
else:
|
603 |
-
raise ValueError(f"invalid noise mode for adding truth to logits {self.noise_mode}")
|
604 |
|
605 |
return z_hat
|
606 |
|
@@ -742,6 +733,272 @@ class VampNet(at.ml.BaseModel):
|
|
742 |
else:
|
743 |
return z
|
744 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
745 |
def sample_from_logits(
|
746 |
logits,
|
747 |
top_k: int = None,
|
@@ -798,7 +1055,6 @@ def sample_from_logits(
|
|
798 |
return inferred
|
799 |
|
800 |
|
801 |
-
|
802 |
if __name__ == "__main__":
|
803 |
# import argbind
|
804 |
from .layers import num_params
|
|
|
1 |
import math
|
2 |
+
import logging
|
3 |
from typing import Optional, Tuple, Union
|
4 |
|
5 |
import numpy as np
|
|
|
20 |
|
21 |
LORA_R = 8
|
22 |
|
23 |
+
# def log(t, eps=1e-20):
|
24 |
+
# return torch.log(t + eps)
|
25 |
|
26 |
|
27 |
+
def gumbel_noise_like(t):
|
28 |
+
noise = torch.zeros_like(t).uniform_(1e-20, 1)
|
29 |
+
return -torch.log(-torch.log(noise))
|
30 |
|
31 |
|
32 |
def gumbel_sample(t, temperature=1.0, dim=-1):
|
33 |
+
return ((t / max(temperature, 1e-10)) + gumbel_noise_like(t)).argmax(dim=dim)
|
34 |
|
35 |
|
36 |
class RMSNorm(nn.Module):
|
|
|
478 |
self.flash_attn = flash_attn
|
479 |
self.noise_mode = noise_mode
|
480 |
|
481 |
+
assert self.noise_mode == "mask", "deprecated"
|
|
|
|
|
|
|
|
|
|
|
482 |
|
483 |
self.embedding = CodebookEmbedding(
|
484 |
latent_dim=latent_dim,
|
485 |
n_codebooks=n_codebooks,
|
486 |
vocab_size=vocab_size,
|
487 |
emb_dim=embedding_dim,
|
488 |
+
special_tokens=["MASK"],
|
489 |
)
|
490 |
+
self.mask_token = self.embedding.special_idxs["MASK"]
|
|
|
|
|
491 |
|
492 |
self.transformer = TransformerStack(
|
493 |
d_model=embedding_dim,
|
|
|
578 |
z_hat,
|
579 |
mask,
|
580 |
):
|
581 |
+
z_true = z_true[:, self.n_conditioning_codebooks :, :]
|
582 |
+
mask = mask[:, self.n_conditioning_codebooks :, :]
|
583 |
+
|
584 |
+
truth = F.one_hot(z_true, self.vocab_size)
|
585 |
+
mask = mask[:, :, :, None].expand(-1, -1, -1, self.vocab_size)
|
586 |
+
z_hat = rearrange(
|
587 |
+
z_hat,
|
588 |
+
"b p (t c) -> b c t p",
|
589 |
+
c=self.n_codebooks - self.n_conditioning_codebooks,
|
590 |
+
)
|
|
|
591 |
|
592 |
+
z_hat = z_hat * mask + truth * (1 - mask)
|
593 |
|
594 |
+
z_hat = rearrange(z_hat, "b c t p -> b p (t c)")
|
|
|
|
|
595 |
|
596 |
return z_hat
|
597 |
|
|
|
733 |
else:
|
734 |
return z
|
735 |
|
736 |
+
@torch.no_grad()
|
737 |
+
def generate(
|
738 |
+
self,
|
739 |
+
codec,
|
740 |
+
time_steps: int = 300,
|
741 |
+
sampling_steps: int = 36,
|
742 |
+
start_tokens: Optional[torch.Tensor] = None,
|
743 |
+
mask: Optional[torch.Tensor] = None,
|
744 |
+
temperature: Union[float, Tuple[float, float]] = 0.8,
|
745 |
+
typical_filtering=False,
|
746 |
+
typical_mass=0.2,
|
747 |
+
typical_min_tokens=1,
|
748 |
+
return_signal=True,
|
749 |
+
):
|
750 |
+
logging.info(f"beginning generation with {sampling_steps} steps")
|
751 |
+
|
752 |
+
#####################
|
753 |
+
# resolve temperature #
|
754 |
+
#####################
|
755 |
+
if isinstance(temperature, float):
|
756 |
+
temperature = torch.tensor(temperature).repeat(sampling_steps)
|
757 |
+
elif isinstance(temperature, tuple):
|
758 |
+
assert len(temperature) == 2
|
759 |
+
l, h = temperature
|
760 |
+
temperature = torch.linspace(l, h, sampling_steps)
|
761 |
+
else:
|
762 |
+
raise TypeError(f"invalid type for temperature")
|
763 |
+
|
764 |
+
logging.info(f"temperature: {temperature}")
|
765 |
+
|
766 |
+
|
767 |
+
#####################
|
768 |
+
# resolve initial z #
|
769 |
+
#####################
|
770 |
+
z = start_tokens
|
771 |
+
|
772 |
+
if z is None:
|
773 |
+
z = torch.full((1, self.n_codebooks, time_steps), self.mask_token).to(
|
774 |
+
self.device
|
775 |
+
)
|
776 |
+
|
777 |
+
logging.info(f"created z with shape {z.shape}")
|
778 |
+
|
779 |
+
|
780 |
+
#################
|
781 |
+
# resolve mask #
|
782 |
+
#################
|
783 |
+
|
784 |
+
if mask is None:
|
785 |
+
mask = torch.ones_like(z).to(self.device).int()
|
786 |
+
mask[:, : self.n_conditioning_codebooks, :] = 0.0
|
787 |
+
if mask.ndim == 2:
|
788 |
+
mask = mask[:, None, :].repeat(1, z.shape[1], 1)
|
789 |
+
# init_mask = mask.clone()
|
790 |
+
|
791 |
+
logging.info(f"created mask with shape {mask.shape}")
|
792 |
+
|
793 |
+
|
794 |
+
###########
|
795 |
+
# set up #
|
796 |
+
##########
|
797 |
+
# apply the mask to z
|
798 |
+
z_masked = z.masked_fill(mask.bool(), self.mask_token)
|
799 |
+
# logging.info(f"z_masked: {z_masked}")
|
800 |
+
|
801 |
+
# how many mask tokens to begin with?
|
802 |
+
num_mask_tokens_at_start = (z_masked == self.mask_token).sum()
|
803 |
+
logging.info(f"num mask tokens at start: {num_mask_tokens_at_start}")
|
804 |
+
|
805 |
+
# our r steps
|
806 |
+
r_steps = torch.linspace(1e-10, 1, sampling_steps+1)[1:].to(self.device)
|
807 |
+
logging.info(f"r steps: {r_steps}")
|
808 |
+
|
809 |
+
# how many codebooks are we inferring vs conditioning on?
|
810 |
+
n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
|
811 |
+
logging.info(f"n infer codebooks: {n_infer_codebooks}")
|
812 |
+
|
813 |
+
#################
|
814 |
+
# begin sampling #
|
815 |
+
#################
|
816 |
+
|
817 |
+
for i in range(sampling_steps):
|
818 |
+
logging.info(f"step {i} of {sampling_steps}")
|
819 |
+
|
820 |
+
# our current temperature
|
821 |
+
tmpt = temperature[i]
|
822 |
+
logging.info(f"temperature: {tmpt}")
|
823 |
+
|
824 |
+
# our current schedule step
|
825 |
+
r = r_steps[i : i + 1]
|
826 |
+
logging.info(f"r: {r}")
|
827 |
+
|
828 |
+
# get latents
|
829 |
+
latents = self.embedding.from_codes(z_masked, codec)
|
830 |
+
logging.info(f"computed latents with shape: {latents.shape}")
|
831 |
+
|
832 |
+
|
833 |
+
# infer from latents
|
834 |
+
# NOTE: this collapses the codebook dimension into the sequence dimension
|
835 |
+
logits = self.forward(latents, r) # b, prob, seq
|
836 |
+
logits = logits.permute(0, 2, 1) # b, seq, prob
|
837 |
+
if typical_filtering:
|
838 |
+
typical_filter(logits,
|
839 |
+
typical_mass=typical_mass,
|
840 |
+
typical_min_tokens=typical_min_tokens
|
841 |
+
)
|
842 |
+
|
843 |
+
|
844 |
+
logging.info(f"permuted logits with shape: {logits.shape}")
|
845 |
+
|
846 |
+
|
847 |
+
# logits2probs
|
848 |
+
probs = torch.softmax(logits, dim=-1)
|
849 |
+
logging.info(f"computed probs with shape: {probs.shape}")
|
850 |
+
|
851 |
+
# flatten z_masked and mask, so we can deal with the sampling logic
|
852 |
+
# we'll unflatten them at the end of the loop for the next forward pass
|
853 |
+
z_masked = codebook_flatten(z_masked)
|
854 |
+
|
855 |
+
# sample from logits with multinomial sampling
|
856 |
+
b = probs.shape[0]
|
857 |
+
probs = rearrange(probs, "b seq prob -> (b seq) prob")
|
858 |
+
|
859 |
+
|
860 |
+
|
861 |
+
sampled_z = torch.multinomial(probs, 1).squeeze(-1)
|
862 |
+
|
863 |
+
sampled_z = rearrange(sampled_z, "(b seq)-> b seq", b=b)
|
864 |
+
probs = rearrange(probs, "(b seq) prob -> b seq prob", b=b)
|
865 |
+
logging.info(f"sampled z with shape: {sampled_z.shape}")
|
866 |
+
|
867 |
+
# update the mask
|
868 |
+
mask = (z_masked == self.mask_token).int()
|
869 |
+
logging.info(f"updated mask with shape: {mask.shape}")
|
870 |
+
|
871 |
+
# add z back into sampled z where the mask was false
|
872 |
+
sampled_z = torch.where(
|
873 |
+
mask.bool(), sampled_z, z_masked
|
874 |
+
)
|
875 |
+
logging.info(f"added z back into sampled z with shape: {sampled_z.shape}")
|
876 |
+
|
877 |
+
|
878 |
+
# get the confidences: which tokens did we sample?
|
879 |
+
selected_probs = (
|
880 |
+
torch.take_along_dim(
|
881 |
+
probs, sampled_z.long().unsqueeze(-1),
|
882 |
+
dim=-1
|
883 |
+
).squeeze(-1)
|
884 |
+
)
|
885 |
+
|
886 |
+
# ignore any tokens that weren't masked
|
887 |
+
selected_probs = torch.where(
|
888 |
+
mask.bool(), selected_probs, torch.inf
|
889 |
+
)
|
890 |
+
|
891 |
+
# get the num tokens to mask, according to the schedule
|
892 |
+
num_to_mask = torch.floor(_gamma(r) * num_mask_tokens_at_start).unsqueeze(1).long()
|
893 |
+
logging.info(f"num to mask: {num_to_mask}")
|
894 |
+
|
895 |
+
num_to_mask = torch.maximum(
|
896 |
+
torch.tensor(1),
|
897 |
+
torch.minimum(
|
898 |
+
mask.sum(dim=-1, keepdim=True) - 1,
|
899 |
+
num_to_mask
|
900 |
+
)
|
901 |
+
)
|
902 |
+
|
903 |
+
|
904 |
+
# get our new mask
|
905 |
+
# print(tmpt * (1-_gamma(r)))
|
906 |
+
mask = mask_by_random_topk(
|
907 |
+
num_to_mask, selected_probs, tmpt * (1-r)
|
908 |
+
)
|
909 |
+
|
910 |
+
# print(f"most confident tokens: ")
|
911 |
+
# print(torch.take_along_dim(
|
912 |
+
# sampled_z, selected_probs.argsort(descending=False), dim=-1)
|
913 |
+
# )
|
914 |
+
# print(sampled_z[~mask.bool()])
|
915 |
+
|
916 |
+
|
917 |
+
# update the mask
|
918 |
+
z_masked = torch.where(
|
919 |
+
mask.bool(), self.mask_token, sampled_z
|
920 |
+
)
|
921 |
+
logging.info(f"updated z_masked with shape: {z_masked.shape}")
|
922 |
+
|
923 |
+
|
924 |
+
z_masked = codebook_unflatten(z_masked, self.n_codebooks)
|
925 |
+
mask = codebook_unflatten(mask, self.n_codebooks)
|
926 |
+
logging.info(f"unflattened z_masked with shape: {z_masked.shape}")
|
927 |
+
|
928 |
+
|
929 |
+
logging.info(f"updated z_masked with shape: {z_masked.shape}")
|
930 |
+
|
931 |
+
|
932 |
+
logging.info(f"finished sampling")
|
933 |
+
z = codebook_unflatten(sampled_z, self.n_codebooks)
|
934 |
+
|
935 |
+
if return_signal:
|
936 |
+
return self.to_signal(z, codec)
|
937 |
+
else:
|
938 |
+
return z
|
939 |
+
|
940 |
+
|
941 |
+
def mask_by_random_topk(num_to_mask: int, probs: torch.Tensor, temperature: float = 1.0):
|
942 |
+
"""
|
943 |
+
Args:
|
944 |
+
num_to_mask (int): number of tokens to mask
|
945 |
+
probs (torch.Tensor): probabilities for each sampled event, shape (batch, seq)
|
946 |
+
temperature (float, optional): temperature. Defaults to 1.0.
|
947 |
+
"""
|
948 |
+
logging.info(f"masking by random topk")
|
949 |
+
logging.info(f"num to mask: {num_to_mask}")
|
950 |
+
logging.info(f"probs shape: {probs.shape}")
|
951 |
+
logging.info(f"temperature: {temperature}")
|
952 |
+
logging.info("")
|
953 |
+
|
954 |
+
confidence = torch.log(probs) + temperature * gumbel_noise_like(probs)
|
955 |
+
logging.info(f"confidence shape: {confidence.shape}")
|
956 |
+
|
957 |
+
sorted_confidence, sorted_idx = confidence.sort(dim=-1)
|
958 |
+
logging.info(f"sorted confidence shape: {sorted_confidence.shape}")
|
959 |
+
logging.info(f"sorted idx shape: {sorted_idx.shape}")
|
960 |
+
|
961 |
+
# get the cut off threshold, given the mask length
|
962 |
+
cut_off = torch.take_along_dim(
|
963 |
+
sorted_confidence, num_to_mask, axis=-1
|
964 |
+
)
|
965 |
+
logging.info(f"cut off shape: {cut_off.shape}")
|
966 |
+
|
967 |
+
# mask out the tokens
|
968 |
+
mask = confidence < cut_off
|
969 |
+
logging.info(f"mask shape: {mask.shape}")
|
970 |
+
|
971 |
+
return mask
|
972 |
+
|
973 |
+
def typical_filter(
|
974 |
+
logits,
|
975 |
+
typical_mass: float = 0.95,
|
976 |
+
typical_min_tokens: int = 1,):
|
977 |
+
nb, nt, _ = logits.shape
|
978 |
+
x_flat = rearrange(logits, "b t l -> (b t ) l")
|
979 |
+
x_flat_norm = torch.nn.functional.log_softmax(x_flat, dim=-1)
|
980 |
+
x_flat_norm_p = torch.exp(x_flat_norm)
|
981 |
+
entropy = -(x_flat_norm * x_flat_norm_p).nansum(-1, keepdim=True)
|
982 |
+
|
983 |
+
c_flat_shifted = torch.abs((-x_flat_norm) - entropy)
|
984 |
+
c_flat_sorted, x_flat_indices = torch.sort(c_flat_shifted, descending=False)
|
985 |
+
x_flat_cumsum = (
|
986 |
+
x_flat.gather(-1, x_flat_indices).softmax(dim=-1).cumsum(dim=-1)
|
987 |
+
)
|
988 |
+
|
989 |
+
last_ind = (x_flat_cumsum < typical_mass).sum(dim=-1)
|
990 |
+
sorted_indices_to_remove = c_flat_sorted > c_flat_sorted.gather(
|
991 |
+
1, last_ind.view(-1, 1)
|
992 |
+
)
|
993 |
+
if typical_min_tokens > 1:
|
994 |
+
sorted_indices_to_remove[..., :typical_min_tokens] = 0
|
995 |
+
indices_to_remove = sorted_indices_to_remove.scatter(
|
996 |
+
1, x_flat_indices, sorted_indices_to_remove
|
997 |
+
)
|
998 |
+
x_flat = x_flat.masked_fill(indices_to_remove, -float("Inf"))
|
999 |
+
logits = rearrange(x_flat, "(b t) l -> b t l", t=nt)
|
1000 |
+
return logits
|
1001 |
+
|
1002 |
def sample_from_logits(
|
1003 |
logits,
|
1004 |
top_k: int = None,
|
|
|
1055 |
return inferred
|
1056 |
|
1057 |
|
|
|
1058 |
if __name__ == "__main__":
|
1059 |
# import argbind
|
1060 |
from .layers import num_params
|