UDiffText

Build error

App Files Files Community

ZYMPKU committed on Dec 13, 2023

Commit

ed25868

1 Parent(s): 0b659a7

v1

Browse files

Files changed (15) hide show

app.py +49 -8
checkpoints/{st-step=100000+la-step=100000-v2.ckpt → st-step=100000+la-step=100000-v1.ckpt} +2 -2
configs/demo.yaml +2 -2
configs/test/textdesign_sd_2.yaml +28 -17
sgm/modules/__init__.py +1 -1
sgm/modules/attention.py +621 -61
sgm/modules/diffusionmodules/__init__.py +1 -1
sgm/modules/diffusionmodules/guiders.py +33 -4
sgm/modules/diffusionmodules/loss.py +1 -58
sgm/modules/diffusionmodules/openaimodel.py +1641 -195
sgm/modules/diffusionmodules/sampling.py +222 -5
sgm/modules/diffusionmodules/sampling_utils.py +4 -1
sgm/modules/diffusionmodules/wrappers.py +2 -2
sgm/modules/encoders/modules.py +50 -43
util.py +1 -9

app.py CHANGED Viewed

@@ -8,10 +8,56 @@ from omegaconf import OmegaConf
 from contextlib import nullcontext
 from pytorch_lightning import seed_everything
 from os.path import join as ospj
 from util import *
 def predict(cfgs, model, sampler, batch):
     context = nullcontext if cfgs.aae_enabled else torch.no_grad
@@ -58,15 +104,8 @@ def demo_predict(input_blk, text, num_samples, steps, scale, seed, show_detail):
     image = input_blk["image"]
     mask = input_blk["mask"]
-    image = cv2.resize(image, (cfgs.W, cfgs.H))
-    mask = cv2.resize(mask, (cfgs.W, cfgs.H))
-    mask = (mask == 0).astype(np.int32)
-    image = torch.from_numpy(image.transpose(2,0,1)).to(dtype=torch.float32) / 127.5 - 1.0
-    mask = torch.from_numpy(mask.transpose(2,0,1)).to(dtype=torch.float32).mean(dim=0, keepdim=True)
-    masked = image * mask
-    mask = 1 - mask
     seg_mask = torch.cat((torch.ones(len(text)), torch.zeros(cfgs.seq_len-len(text))))
@@ -131,6 +170,7 @@ if __name__ == "__main__":
     model = init_model(cfgs)
     sampler = init_sampling(cfgs)
     global_index = 0
     block = gr.Blocks().queue()
     with block:
@@ -161,6 +201,7 @@ if __name__ == "__main__":
             with gr.Column():
                 input_blk = gr.Image(source='upload', tool='sketch', type="numpy", label="Input", height=512)
                 text = gr.Textbox(label="Text to render: (1~12 characters)", info="the text you want to render at the masked region")
                 run_button = gr.Button(variant="primary")

 from contextlib import nullcontext
 from pytorch_lightning import seed_everything
 from os.path import join as ospj
+from random import randint
+from torchvision.utils import save_image
+from torchvision.transforms import Resize
 from util import *
+def process(image, mask):
+    """Crop the sketch input to a square around the masked area and tensorize it.
+
+    The crop side equals the shorter image dimension. Along the longer
+    dimension the window is positioned around the centre of the mask's
+    bounding box and clamped so it stays inside the image.
+
+    Args:
+        image: array indexed as (row, column, channel).
+            NOTE(review): presumably the uint8 numpy image from the Gradio
+            sketch widget -- confirm against the caller.
+        mask: array with the same leading two axes as ``image``; only its
+            first channel is used. Assumes painted pixels are 255 -- TODO confirm.
+
+    Returns:
+        Tuple ``(image, mask, masked)`` of float32 tensors after the
+        module-level ``resize`` transform, where ``image`` has been mapped
+        through ``x / 127.5 - 1.0`` and ``masked = image * (1 - mask)``.
+
+    Raises:
+        gr.Error: if the mask does not contain exactly one contour, or if its
+            bounding box is longer than the crop side along the cropped axis.
+    """
+    img_h, img_w = image.shape[:2]
+    # Keep one channel and integer-divide by 255 (255 -> 1, values below 255 -> 0).
+    mask = mask[...,:1]//255
+    contours, _ = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    if len(contours) != 1: raise gr.Error("One masked area only!")
+    # Bounding box of the single masked region and its centre.
+    m_x, m_y, m_w, m_h = cv2.boundingRect(contours[0])
+    c_x, c_y = m_x + m_w//2, m_y + m_h//2
+    if img_w > img_h:
+        # Landscape input: keep full height, pick an img_h-wide column window.
+        if m_w > img_h: raise gr.Error("Illegal mask area!")
+        if c_x < img_w - c_x:
+            # Centre is in the left half: clamp the window at the left border.
+            c_l = max(0, c_x - img_h//2)
+            c_r = c_l + img_h
+        else:
+            # Centre is in the right half: clamp the window at the right border.
+            c_r = min(img_w, c_x + img_h//2)
+            c_l = c_r - img_h
+        image = image[:,c_l:c_r,:]
+        mask = mask[:,c_l:c_r,:]
+    else:
+        # Portrait or square input: keep full width, pick an img_w-tall row window.
+        if m_h > img_w: raise gr.Error("Illegal mask area!")
+        if c_y < img_h - c_y:
+            # Centre is in the top half: clamp the window at the top border.
+            c_t = max(0, c_y - img_w//2)
+            c_b = c_t + img_w
+        else:
+            # Centre is in the bottom half: clamp the window at the bottom border.
+            c_b = min(img_h, c_y + img_w//2)
+            c_t = c_b - img_w
+        image = image[c_t:c_b,:,:]
+        mask = mask[c_t:c_b,:,:]
+    # Channels-first float tensors; the image is rescaled with x / 127.5 - 1.0.
+    image = torch.from_numpy(image.transpose(2,0,1)).to(dtype=torch.float32) / 127.5 - 1.0
+    mask = torch.from_numpy(mask.transpose(2,0,1)).to(dtype=torch.float32)
+    # `resize` is not defined in this function; it is a module-level name
+    # (this commit assigns it in the `__main__` section). A batch axis is
+    # added for the call and removed again afterwards.
+    image = resize(image[None])[0]
+    mask = resize(mask[None])[0]
+    # Zero out the image wherever the mask is 1.
+    masked = image * (1 - mask)
+    return image, mask, masked
 def predict(cfgs, model, sampler, batch):
     context = nullcontext if cfgs.aae_enabled else torch.no_grad
     image = input_blk["image"]
     mask = input_blk["mask"]
+    image, mask, masked = process(image, mask)
     seg_mask = torch.cat((torch.ones(len(text)), torch.zeros(cfgs.seq_len-len(text))))
     model = init_model(cfgs)
     sampler = init_sampling(cfgs)
     global_index = 0
+    resize = Resize((cfgs.H, cfgs.W))
     block = gr.Blocks().queue()
     with block:
             with gr.Column():
                 input_blk = gr.Image(source='upload', tool='sketch', type="numpy", label="Input", height=512)
+                gr.Markdown("Notice: please draw horizontally to indicate only **one** masked area.")
                 text = gr.Textbox(label="Text to render: (1~12 characters)", info="the text you want to render at the masked region")
                 run_button = gr.Button(variant="primary")

checkpoints/{st-step=100000+la-step=100000-v2.ckpt → st-step=100000+la-step=100000-v1.ckpt} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b87a307ed6e240208b415166e88c0f3e6467ec9330836d70c6d662f423bfbc15
-size 4173692086

 version https://git-lfs.github.com/spec/v1
+oid sha256:edea71eb83b6be72c33ef787a7122a810a7b9257bf97a276ef322707d5769878
+size 6148465904

configs/demo.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 type: "demo"
 # path
-load_ckpt_path: "./checkpoints/st-step=100000+la-step=100000-v2.ckpt"
 model_cfg_path: "./configs/test/textdesign_sd_2.yaml"
 # param
@@ -15,7 +15,7 @@ channel: 4 # AE latent channel
 factor: 8 # AE downsample factor
 scale: [4.0, 0.0] # content scale, style scale
 noise_iters: 10
-force_uc_zero_embeddings: ["ref", "label"]
 aae_enabled: False
 detailed: False

 type: "demo"
 # path
+load_ckpt_path: "./checkpoints/st-step=100000+la-step=100000-v1.ckpt"
 model_cfg_path: "./configs/test/textdesign_sd_2.yaml"
 # param
 factor: 8 # AE downsample factor
 scale: [4.0, 0.0] # content scale, style scale
 noise_iters: 10
+force_uc_zero_embeddings: ["label"]
 aae_enabled: False
 detailed: False

configs/test/textdesign_sd_2.yaml CHANGED Viewed

@@ -1,8 +1,6 @@
 model:
   target: sgm.models.diffusion.DiffusionEngine
   params:
-    opt_keys:
-      - t_attn
     input_key: image
     scale_factor: 0.18215
     disable_first_stage_autocast: True
@@ -20,45 +18,54 @@ model:
           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
     network_config:
-      target: sgm.modules.diffusionmodules.openaimodel.UnifiedUNetModel
       params:
         in_channels: 9
         out_channels: 4
         ctrl_channels: 0
         model_channels: 320
         attention_resolutions: [4, 2, 1]
-        save_attn_type: [t_attn]
-        save_attn_layers: [output_blocks.6.1]
         num_res_blocks: 2
         channel_mult: [1, 2, 4, 4]
         num_head_channels: 64
         use_linear_in_transformer: True
         transformer_depth: 1
-        t_context_dim: 2048
     conditioner_config:
       target: sgm.modules.GeneralConditioner
       params:
         emb_models:
-          # textual crossattn cond
           - is_trainable: False
-            emb_key: t_crossattn
-            ucg_rate: 0.1
             input_key: label
             target: sgm.modules.encoders.modules.LabelEncoder
             params:
               max_len: 12
               emb_dim: 2048
               n_heads: 8
               n_trans_layers: 12
-              ckpt_path: ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
           # concat cond
           - is_trainable: False
             input_key: mask
-            target: sgm.modules.encoders.modules.SpatialRescaler
-            params:
-              in_channels: 1
-              multiplier: 0.125
           - is_trainable: False
             input_key: masked
             target: sgm.modules.encoders.modules.LatentEncoder
@@ -88,7 +95,6 @@ model:
     first_stage_config:
       target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
       params:
-        ckpt_path: ./checkpoints/AEs/AE_inpainting_2.safetensors
         embed_dim: 4
         monitor: val/rec_loss
         ddconfig:
@@ -111,11 +117,16 @@ model:
       params:
         seq_len: 12
         kernel_size: 3
-        gaussian_sigma: 1.0
         min_attn_size: 16
-        lambda_local_loss: 0.01
         lambda_ocr_loss: 0.001
         ocr_enabled: False
         sigma_sampler_config:
           target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling

 model:
   target: sgm.models.diffusion.DiffusionEngine
   params:
     input_key: image
     scale_factor: 0.18215
     disable_first_stage_autocast: True
           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
     network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetAddModel
       params:
+        use_checkpoint: False
         in_channels: 9
         out_channels: 4
         ctrl_channels: 0
         model_channels: 320
         attention_resolutions: [4, 2, 1]
+        attn_type: add_attn
+        attn_layers:
+          - output_blocks.6.1
         num_res_blocks: 2
         channel_mult: [1, 2, 4, 4]
         num_head_channels: 64
+        use_spatial_transformer: True
         use_linear_in_transformer: True
         transformer_depth: 1
+        context_dim: 0
+        add_context_dim: 2048
+        legacy: False
     conditioner_config:
       target: sgm.modules.GeneralConditioner
       params:
         emb_models:
+          # crossattn cond
+          # - is_trainable: False
+          #   input_key: txt
+          #   target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+          #   params:
+          #     arch: ViT-H-14
+          #     version: ./checkpoints/encoders/OpenCLIP/ViT-H-14/open_clip_pytorch_model.bin
+          #     layer: penultimate
+          # add crossattn cond
           - is_trainable: False
             input_key: label
             target: sgm.modules.encoders.modules.LabelEncoder
             params:
+              is_add_embedder: True
               max_len: 12
               emb_dim: 2048
               n_heads: 8
               n_trans_layers: 12
+              ckpt_path: ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt # ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
           # concat cond
           - is_trainable: False
             input_key: mask
+            target: sgm.modules.encoders.modules.IdentityEncoder
           - is_trainable: False
             input_key: masked
             target: sgm.modules.encoders.modules.LatentEncoder
     first_stage_config:
       target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
       params:
         embed_dim: 4
         monitor: val/rec_loss
         ddconfig:
       params:
         seq_len: 12
         kernel_size: 3
+        gaussian_sigma: 0.5
         min_attn_size: 16
+        lambda_local_loss: 0.02
         lambda_ocr_loss: 0.001
         ocr_enabled: False
+        predictor_config:
+          target: sgm.modules.predictors.model.ParseqPredictor
+          params:
+            ckpt_path: "./checkpoints/predictors/parseq-bb5792a6.pt"
         sigma_sampler_config:
           target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling

sgm/modules/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .encoders.modules import GeneralConditioner
 UNCONDITIONAL_CONFIG = {
     "target": "sgm.modules.GeneralConditioner",

+from .encoders.modules import GeneralConditioner, DualConditioner
 UNCONDITIONAL_CONFIG = {
     "target": "sgm.modules.GeneralConditioner",

sgm/modules/attention.py CHANGED Viewed

@@ -5,15 +5,53 @@ from typing import Any, Optional
 import torch
 import torch.nn.functional as F
 from einops import rearrange, repeat
 from torch import nn, einsum
 try:
     import xformers
     import xformers.ops
     XFORMERS_IS_AVAILABLE = True
 except:
     XFORMERS_IS_AVAILABLE = False
-    print("No module 'xformers'.")
 def exists(val):
@@ -108,6 +146,51 @@ class LinearAttention(nn.Module):
         return self.to_out(out)
 class CrossAttention(nn.Module):
     def __init__(
         self,
@@ -115,7 +198,8 @@ class CrossAttention(nn.Module):
         context_dim=None,
         heads=8,
         dim_head=64,
-        dropout=0.0
     ):
         super().__init__()
         inner_dim = dim_head * heads
@@ -128,38 +212,60 @@ class CrossAttention(nn.Module):
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_out = zero_module(
-            nn.Sequential(
-                nn.Linear(inner_dim, query_dim),
-                nn.Dropout(dropout)
-            )
-        )
         self.attn_map_cache = None
     def forward(
         self,
         x,
-        context=None
     ):
         h = self.heads
         q = self.to_q(x)
         context = default(context, x)
         k = self.to_k(context)
         v = self.to_v(context)
         q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
         ## old
         sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         del q, k
         # attention, what we cannot get enough of
-        if sim.shape[-1] > 1:
-            sim = sim.softmax(dim=-1) # softmax on token dim
-        else:
-            sim = sim.sigmoid() # sigmoid on pixel dim
         # save attn_map
         if self.attn_map_cache is not None:
@@ -170,7 +276,20 @@ class CrossAttention(nn.Module):
         out = einsum('b i j, b j d -> b i d', sim, v)
         out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
         return self.to_out(out)
@@ -263,6 +382,10 @@ class MemoryEfficientCrossAttention(nn.Module):
 class BasicTransformerBlock(nn.Module):
     def __init__(
         self,
@@ -270,78 +393,169 @@ class BasicTransformerBlock(nn.Module):
         n_heads,
         d_head,
         dropout=0.0,
-        t_context_dim=None,
-        v_context_dim=None,
-        gated_ff=True
     ):
         super().__init__()
-        # self-attention
         self.attn1 = MemoryEfficientCrossAttention(
             query_dim=dim,
             heads=n_heads,
             dim_head=d_head,
             dropout=dropout,
-            context_dim=None
-        )
-        # textual cross-attention
-        if t_context_dim is not None and t_context_dim > 0:
-            self.t_attn = CrossAttention(
                 query_dim=dim,
-                context_dim=t_context_dim,
                 heads=n_heads,
                 dim_head=d_head,
-                dropout=dropout
-            )
-            self.t_norm = nn.LayerNorm(dim)
-        # visual cross-attention
-        if v_context_dim is not None and v_context_dim > 0:
-            self.v_attn = CrossAttention(
                 query_dim=dim,
-                context_dim=v_context_dim,
                 heads=n_heads,
                 dim_head=d_head,
-                dropout=dropout
-            )
-            self.v_norm = nn.LayerNorm(dim)
         self.norm1 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-    def forward(self, x, t_context=None, v_context=None):
         x = (
             self.attn1(
                 self.norm1(x),
-                context=None
             )
             + x
         )
-        if hasattr(self, "t_attn"):
             x = (
-                self.t_attn(
-                    self.t_norm(x),
-                    context=t_context
                 )
                 + x
             )
-        if hasattr(self, "v_attn"):
             x = (
-                self.v_attn(
-                    self.v_norm(x),
-                    context=v_context
                 )
                 + x
             )
         x = self.ff(self.norm3(x)) + x
         return x
-class SpatialTransformer(nn.Module):
     """
     Transformer block for image-like data.
     First, project the input (aka embedding)
@@ -358,12 +572,36 @@ class SpatialTransformer(nn.Module):
         d_head,
         depth=1,
         dropout=0.0,
-        t_context_dim=None,
-        v_context_dim=None,
-        use_linear=False
     ):
         super().__init__()
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
@@ -381,8 +619,12 @@ class SpatialTransformer(nn.Module):
                     n_heads,
                     d_head,
                     dropout=dropout,
-                    t_context_dim=t_context_dim,
-                    v_context_dim=v_context_dim
                 )
                 for d in range(depth)
             ]
@@ -392,11 +634,14 @@ class SpatialTransformer(nn.Module):
                 nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
             )
         else:
             self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
         self.use_linear = use_linear
-    def forward(self, x, t_context=None, v_context=None):
         b, c, h, w = x.shape
         x_in = x
         x = self.norm(x)
@@ -406,11 +651,326 @@ class SpatialTransformer(nn.Module):
         if self.use_linear:
             x = self.proj_in(x)
         for i, block in enumerate(self.transformer_blocks):
-            x = block(x, t_context=t_context, v_context=v_context)
         if self.use_linear:
             x = self.proj_out(x)
         x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
         if not self.use_linear:
             x = self.proj_out(x)
-        return x + x_in

 import torch
 import torch.nn.functional as F
 from einops import rearrange, repeat
+from packaging import version
 from torch import nn, einsum
+# Feature-detect PyTorch's scaled-dot-product-attention (SDP) support, which
+# this module only enables for torch >= 2.0.0.
+if version.parse(torch.__version__) >= version.parse("2.0.0"):
+    SDP_IS_AVAILABLE = True
+    from torch.backends.cuda import SDPBackend, sdp_kernel
+    # Flag sets keyed by the requested backend; the `None` key enables all
+    # three kernels. Intended to be splatted into `sdp_kernel(...)`, as in the
+    # commented-out SDP path of CrossAttention.forward.
+    BACKEND_MAP = {
+        SDPBackend.MATH: {
+            "enable_math": True,
+            "enable_flash": False,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.FLASH_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": True,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.EFFICIENT_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": False,
+            "enable_mem_efficient": True,
+        },
+        None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
+    }
+else:
+    from contextlib import nullcontext
+    SDP_IS_AVAILABLE = False
+    # Stand-ins so the names `sdp_kernel` and `BACKEND_MAP` still exist on
+    # older PyTorch versions.
+    sdp_kernel = nullcontext
+    BACKEND_MAP = {}
+    print(
+        f"No SDP backend available, likely because you are running in pytorch versions < 2.0. In fact, "
+        f"you are using PyTorch {torch.__version__}. You might want to consider upgrading."
+    )
 try:
     import xformers
     import xformers.ops
     XFORMERS_IS_AVAILABLE = True
 except:
     XFORMERS_IS_AVAILABLE = False
+    print("no module 'xformers'. Processing without...")
+from .diffusionmodules.util import checkpoint
 def exists(val):
         return self.to_out(out)
+class SpatialSelfAttention(nn.Module):
+    """Single-head self-attention over the spatial positions of a feature map.
+
+    Queries, keys, values and the output projection are all 1x1 convolutions
+    that keep the channel count at ``in_channels``. The attended result is
+    added back onto the input (residual connection).
+    """
+    def __init__(self, in_channels):
+        """Build the normalisation layer and the four 1x1 convolutions.
+
+        Args:
+            in_channels: channel count of the input, preserved by every layer.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        # `Normalize` is defined elsewhere in this module (not shown in this hunk).
+        self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.k = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.v = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.proj_out = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+    def forward(self, x):
+        """Apply attention across all h*w positions and return ``x + attended``.
+
+        Args:
+            x: 4-D tensor unpacked as (b, c, h, w).
+
+        Returns:
+            Tensor with the same layout as ``x``.
+        """
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        b, c, h, w = q.shape
+        q = rearrange(q, "b c h w -> b (h w) c")
+        k = rearrange(k, "b c h w -> b c (h w)")
+        # w_[b, i, j]: dot product of query position i with key position j.
+        w_ = torch.einsum("bij,bjk->bik", q, k)
+        # Scale the logits by c ** -0.5 before the softmax.
+        w_ = w_ * (int(c) ** (-0.5))
+        # Normalise over the key axis (dim 2).
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+        # attend to values
+        v = rearrange(v, "b c h w -> b c (h w)")
+        # Transpose to (key, query) so the einsum below contracts over keys.
+        w_ = rearrange(w_, "b i j -> b j i")
+        h_ = torch.einsum("bij,bjk->bik", v, w_)
+        h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
+        h_ = self.proj_out(h_)
+        # Residual connection with the un-normalised input.
+        return x + h_
 class CrossAttention(nn.Module):
     def __init__(
         self,
         context_dim=None,
         heads=8,
         dim_head=64,
+        dropout=0.0,
+        backend=None,
     ):
         super().__init__()
         inner_dim = dim_head * heads
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_out = zero_module(nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        ))
+        self.backend = backend
         self.attn_map_cache = None
     def forward(
         self,
         x,
+        context=None,
+        mask=None,
+        additional_tokens=None,
+        n_times_crossframe_attn_in_self=0,
     ):
         h = self.heads
+        if additional_tokens is not None:
+            # get the number of masked tokens at the beginning of the output sequence
+            n_tokens_to_mask = additional_tokens.shape[1]
+            # add additional token
+            x = torch.cat([additional_tokens, x], dim=1)
         q = self.to_q(x)
         context = default(context, x)
         k = self.to_k(context)
         v = self.to_v(context)
+        if n_times_crossframe_attn_in_self:
+            # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
+            assert x.shape[0] % n_times_crossframe_attn_in_self == 0
+            n_cp = x.shape[0] // n_times_crossframe_attn_in_self
+            k = repeat(
+                k[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
+            )
+            v = repeat(
+                v[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
+            )
         q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
         ## old
         sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         del q, k
+        if exists(mask):
+            mask = rearrange(mask, 'b ... -> b (...)')
+            max_neg_value = -torch.finfo(sim.dtype).max
+            mask = repeat(mask, 'b j -> (b h) () j', h=h)
+            sim.masked_fill_(~mask, max_neg_value)
         # attention, what we cannot get enough of
+        sim = sim.softmax(dim=-1)
         # save attn_map
         if self.attn_map_cache is not None:
         out = einsum('b i j, b j d -> b i d', sim, v)
         out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
+        ## new
+        # with sdp_kernel(**BACKEND_MAP[self.backend]):
+        #     # print("dispatching into backend", self.backend, "q/k/v shape: ", q.shape, k.shape, v.shape)
+        #     out = F.scaled_dot_product_attention(
+        #         q, k, v, attn_mask=mask
+        #     )  # scale is dim_head ** -0.5 per default
+        # del q, k, v
+        # out = rearrange(out, "b h n d -> b n (h d)", h=h)
+        if additional_tokens is not None:
+            # remove additional token
+            out = out[:, n_tokens_to_mask:]
         return self.to_out(out)
 class BasicTransformerBlock(nn.Module):
+    ATTENTION_MODES = {
+        "softmax": CrossAttention,  # vanilla attention
+        "softmax-xformers": MemoryEfficientCrossAttention,  # ampere
+    }
     def __init__(
         self,
         n_heads,
         d_head,
         dropout=0.0,
+        context_dim=None,
+        add_context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+        disable_self_attn=False,
+        attn_mode="softmax",
+        sdp_backend=None,
     ):
         super().__init__()
+        assert attn_mode in self.ATTENTION_MODES
+        if attn_mode != "softmax" and not XFORMERS_IS_AVAILABLE:
+            print(
+                f"Attention mode '{attn_mode}' is not available. Falling back to native attention. "
+                f"This is not a problem in Pytorch >= 2.0. FYI, you are running with PyTorch version {torch.__version__}"
+            )
+            attn_mode = "softmax"
+        elif attn_mode == "softmax" and not SDP_IS_AVAILABLE:
+            print(
+                "We do not support vanilla attention anymore, as it is too expensive. Sorry."
+            )
+            if not XFORMERS_IS_AVAILABLE:
+                assert (
+                    False
+                ), "Please install xformers via e.g. 'pip install xformers==0.0.16'"
+            else:
+                print("Falling back to xformers efficient attention.")
+                attn_mode = "softmax-xformers"
+        attn_cls = self.ATTENTION_MODES[attn_mode]
+        if version.parse(torch.__version__) >= version.parse("2.0.0"):
+            assert sdp_backend is None or isinstance(sdp_backend, SDPBackend)
+        else:
+            assert sdp_backend is None
+        self.disable_self_attn = disable_self_attn
         self.attn1 = MemoryEfficientCrossAttention(
             query_dim=dim,
             heads=n_heads,
             dim_head=d_head,
             dropout=dropout,
+            context_dim=context_dim if self.disable_self_attn else None,
+            backend=sdp_backend,
+        )  # is a self-attention if not self.disable_self_attn
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        if context_dim is not None and context_dim > 0:
+            self.attn2 = attn_cls(
                 query_dim=dim,
+                context_dim=context_dim,
                 heads=n_heads,
                 dim_head=d_head,
+                dropout=dropout,
+                backend=sdp_backend,
+            )  # is self-attn if context is none
+        if add_context_dim is not None and add_context_dim > 0:
+            self.add_attn = attn_cls(
                 query_dim=dim,
+                context_dim=add_context_dim,
                 heads=n_heads,
                 dim_head=d_head,
+                dropout=dropout,
+                backend=sdp_backend,
+            )  # is self-attn if context is none
+            self.add_norm = nn.LayerNorm(dim)
         self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+    def forward(
+        self, x, context=None, add_context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
+    ):
+        kwargs = {"x": x}
+        if context is not None:
+            kwargs.update({"context": context})
+        if additional_tokens is not None:
+            kwargs.update({"additional_tokens": additional_tokens})
+        if n_times_crossframe_attn_in_self:
+            kwargs.update(
+                {"n_times_crossframe_attn_in_self": n_times_crossframe_attn_in_self}
+            )
+        return checkpoint(
+            self._forward, (x, context, add_context), self.parameters(), self.checkpoint
+        )
+    def _forward(
+        self, x, context=None, add_context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
+    ):
         x = (
             self.attn1(
                 self.norm1(x),
+                context=context if self.disable_self_attn else None,
+                additional_tokens=additional_tokens,
+                n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self
+                if not self.disable_self_attn
+                else 0,
             )
             + x
         )
+        if hasattr(self, "attn2"):
             x = (
+                self.attn2(
+                    self.norm2(x), context=context, additional_tokens=additional_tokens
                 )
                 + x
             )
+        if hasattr(self, "add_attn"):
             x = (
+                self.add_attn(
+                    self.add_norm(x), context=add_context, additional_tokens=additional_tokens
                 )
                 + x
             )
         x = self.ff(self.norm3(x)) + x
+        return x
+class BasicTransformerSingleLayerBlock(nn.Module):
+    """Transformer block with one attention layer followed by a feed-forward layer.
+
+    Both sub-layers are pre-normalised with ``LayerNorm`` and wrapped in a
+    residual connection. The attention class is selected by ``attn_mode``.
+    """
+    # Maps the `attn_mode` string to the attention implementation to instantiate.
+    ATTENTION_MODES = {
+        "softmax": CrossAttention,  # vanilla attention
+        "softmax-xformers": MemoryEfficientCrossAttention  # on the A100s not quite as fast as the above version
+        # (todo might depend on head_dim, check, falls back to semi-optimized kernels for dim!=[16,32,64,128])
+    }
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+        attn_mode="softmax",
+    ):
+        """Create the attention layer, feed-forward layer and their norms.
+
+        Args:
+            dim: feature width of the block (attention ``query_dim``, norm size).
+            n_heads: number of attention heads.
+            d_head: width of each head.
+            dropout: dropout rate forwarded to the attention and feed-forward layers.
+            context_dim: forwarded to the attention layer as ``context_dim``.
+            gated_ff: forwarded to ``FeedForward`` as ``glu``.
+            checkpoint: flag stored on the block and passed to the ``checkpoint``
+                helper in ``forward``.
+            attn_mode: key into ``ATTENTION_MODES``.
+        """
+        super().__init__()
+        assert attn_mode in self.ATTENTION_MODES
+        attn_cls = self.ATTENTION_MODES[attn_mode]
+        self.attn1 = attn_cls(
+            query_dim=dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+            context_dim=context_dim,
+        )
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+    def forward(self, x, context=None):
+        """Run ``_forward`` through the imported ``checkpoint`` helper."""
+        # NOTE(review): `checkpoint` here is the module-level helper imported
+        # from .diffusionmodules.util; presumably it applies gradient
+        # checkpointing when self.checkpoint is true -- confirm in that module.
+        return checkpoint(
+            self._forward, (x, context), self.parameters(), self.checkpoint
+        )
+    def _forward(self, x, context=None):
+        """Apply pre-norm attention and pre-norm feed-forward, each with a residual."""
+        x = self.attn1(self.norm1(x), context=context) + x
+        x = self.ff(self.norm2(x)) + x
         return x
+class  SpatialTransformer(nn.Module):
     """
     Transformer block for image-like data.
     First, project the input (aka embedding)
         d_head,
         depth=1,
         dropout=0.0,
+        context_dim=None,
+        add_context_dim=None,
+        disable_self_attn=False,
+        use_linear=False,
+        attn_type="softmax",
+        use_checkpoint=True,
+        # sdp_backend=SDPBackend.FLASH_ATTENTION
+        sdp_backend=None,
     ):
         super().__init__()
+        # print(
+        #     f"constructing {self.__class__.__name__} of depth {depth} w/ {in_channels} channels and {n_heads} heads"
+        # )
+        from omegaconf import ListConfig
+        if exists(context_dim) and not isinstance(context_dim, (list, ListConfig)):
+            context_dim = [context_dim]
+        if exists(context_dim) and isinstance(context_dim, list):
+            if depth != len(context_dim):
+                # print(
+                #     f"WARNING: {self.__class__.__name__}: Found context dims {context_dim} of depth {len(context_dim)}, "
+                #     f"which does not match the specified 'depth' of {depth}. Setting context_dim to {depth * [context_dim[0]]} now."
+                # )
+                # depth does not match context dims.
+                assert all(
+                    map(lambda x: x == context_dim[0], context_dim)
+                ), "need homogenous context_dim to match depth automatically"
+                context_dim = depth * [context_dim[0]]
+        elif context_dim is None:
+            context_dim = [None] * depth
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
                     n_heads,
                     d_head,
                     dropout=dropout,
+                    context_dim=context_dim[d],
+                    add_context_dim=add_context_dim,
+                    disable_self_attn=disable_self_attn,
+                    attn_mode=attn_type,
+                    checkpoint=use_checkpoint,
+                    sdp_backend=sdp_backend,
                 )
                 for d in range(depth)
             ]
                 nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
             )
         else:
+            # self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
             self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
         self.use_linear = use_linear
+    def forward(self, x, context=None, add_context=None):
+        # note: if no context is given, cross-attention defaults to self-attention
+        if not isinstance(context, list):
+            context = [context]
         b, c, h, w = x.shape
         x_in = x
         x = self.norm(x)
         if self.use_linear:
             x = self.proj_in(x)
         for i, block in enumerate(self.transformer_blocks):
+            if i > 0 and len(context) == 1:
+                i = 0  # use same context for each block
+            x = block(x, context=context[i], add_context=add_context)
         if self.use_linear:
             x = self.proj_out(x)
         x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
         if not self.use_linear:
             x = self.proj_out(x)
+        return x + x_in
+def benchmark_attn():
+    """Time and profile ``F.scaled_dot_product_attention`` for each SDP backend.
+    Builds random float16 query/key/value tensors once, then prints a
+    ``torch.utils.benchmark`` timing and a 25-call profiler table for the
+    default dispatch and for the math, flash-attention and memory-efficient
+    kernels. The flash and memory-efficient sections report a ``RuntimeError``
+    as "not supported" and move on. Returns nothing; results go to stdout.
+    """
+    # See the PyTorch scaled-dot-product-attention tutorial:
+    # https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html
+    import torch.nn.functional as F
+    import torch.utils.benchmark as benchmark
+    from torch.backends.cuda import SDPBackend, sdp_kernel
+    from torch.profiler import ProfilerActivity, profile, record_function
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
+        """Return the mean runtime of ``f(*args, **kwargs)`` in microseconds."""
+        t0 = benchmark.Timer(
+            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
+        )
+        return t0.blocked_autorange().mean * 1e6
+    # Hyper-parameters of the benchmark input.
+    batch_size = 32
+    max_sequence_len = 1024
+    num_heads = 32
+    embed_dimension = 32
+    dtype = torch.float16
+    qkv_shape = (batch_size, num_heads, max_sequence_len, embed_dimension)
+    # Three independent random tensors of identical shape/dtype/device.
+    query, key, value = (
+        torch.rand(qkv_shape, device=device, dtype=dtype) for _ in range(3)
+    )
+    print("q/k/v shape:", query.shape, key.shape, value.shape)
+    # Keyword arguments for ``sdp_kernel`` that enable exactly one backend each.
+    backend_map = {
+        SDPBackend.MATH: {
+            "enable_math": True,
+            "enable_flash": False,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.FLASH_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": True,
+            "enable_mem_efficient": False,
+        },
+        SDPBackend.EFFICIENT_ATTENTION: {
+            "enable_math": False,
+            "enable_flash": False,
+            "enable_mem_efficient": True,
+        },
+    }
+    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+    def time_sdpa():
+        """Time one attention call under whatever backend is currently enabled."""
+        return benchmark_torch_function_in_microseconds(
+            F.scaled_dot_product_attention, query, key, value
+        )
+    def profile_sdpa(label):
+        """Profile 25 attention calls under ``label`` and print the summary table."""
+        with profile(
+            activities=activities, record_shapes=False, profile_memory=True
+        ) as prof:
+            with record_function(label):
+                for _ in range(25):
+                    F.scaled_dot_product_attention(query, key, value)
+        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+    print(f"The default implementation runs in {time_sdpa():.3f} microseconds")
+    profile_sdpa("Default detailed stats")
+    with sdp_kernel(**backend_map[SDPBackend.MATH]):
+        # The timing must run inside the context manager; outside it the
+        # default dispatch would be measured again instead of the math kernel.
+        print(f"The math implementation runs in {time_sdpa():.3f} microseconds")
+        profile_sdpa("Math implmentation stats")
+    with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
+        try:
+            print(
+                f"The flash attention implementation runs in {time_sdpa():.3f} microseconds"
+            )
+            # Profiling stays inside the try: an unsupported backend raises
+            # the same RuntimeError here as it does during timing.
+            profile_sdpa("FlashAttention stats")
+        except RuntimeError:
+            print("FlashAttention is not supported. See warnings for reasons.")
+    with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
+        try:
+            print(
+                f"The memory efficient implementation runs in {time_sdpa():.3f} microseconds"
+            )
+            profile_sdpa("EfficientAttention stats")
+        except RuntimeError:
+            print("EfficientAttention is not supported. See warnings for reasons.")
+def run_model(model, x, context):
+    """Call ``model`` with ``x`` and ``context`` and return whatever it returns."""
+    output = model(x, context)
+    return output
+def benchmark_transformer_blocks():
+    """Compare a native-SDP ``SpatialTransformer`` with an xformers-attention one.
+    Both models are timed on the same random input under ``torch.autocast``,
+    then each is profiled for 25 forward passes; profiler tables and peak
+    CUDA memory are printed. Returns nothing.
+    """
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    import torch.utils.benchmark as benchmark
+    def mean_runtime_us(fn, *args, **kwargs):
+        """Return the mean runtime of ``fn(*args, **kwargs)`` in microseconds."""
+        timer = benchmark.Timer(
+            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": fn}
+        )
+        return timer.blocked_autorange().mean * 1e6
+    use_ckpt = True
+    use_compile = False
+    batch_size = 32
+    h, w = 64, 64
+    context_len = 77
+    embed_dimension = 1024
+    context_dim = 1024
+    d_head = 64
+    transformer_depth = 4
+    n_heads = embed_dimension // d_head
+    dtype = torch.float16
+    # Settings shared by both models; only the attention implementation differs.
+    shared_kwargs = dict(
+        context_dim=context_dim,
+        use_linear=True,
+        use_checkpoint=use_ckpt,
+        depth=transformer_depth,
+    )
+    model_native = SpatialTransformer(
+        embed_dimension,
+        n_heads,
+        d_head,
+        attn_type="softmax",
+        sdp_backend=SDPBackend.FLASH_ATTENTION,
+        **shared_kwargs,
+    ).to(device)
+    model_efficient_attn = SpatialTransformer(
+        embed_dimension,
+        n_heads,
+        d_head,
+        attn_type="softmax-xformers",
+        **shared_kwargs,
+    ).to(device)
+    if use_compile and not use_ckpt:
+        print("compiling models")
+        model_native = torch.compile(model_native)
+        model_efficient_attn = torch.compile(model_efficient_attn)
+    x = torch.rand(batch_size, embed_dimension, h, w, device=device, dtype=dtype)
+    c = torch.rand(batch_size, context_len, context_dim, device=device, dtype=dtype)
+    from torch.profiler import ProfilerActivity, profile, record_function
+    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+    banner = 75 * "+"
+    def profile_model(model, title, label, memory_note):
+        """Print a banner, profile 25 forward passes and report peak CUDA memory."""
+        print(banner)
+        print(title)
+        print(banner)
+        torch.cuda.reset_peak_memory_stats()
+        with profile(
+            activities=activities, record_shapes=False, profile_memory=True
+        ) as prof:
+            with record_function(label):
+                for _ in range(25):
+                    model(x, c)
+        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+        print(torch.cuda.max_memory_allocated() * 1e-9, memory_note)
+    with torch.autocast("cuda"):
+        native_us = mean_runtime_us(model_native.forward, x, c)
+        print(f"The native model runs in {native_us:.3f} microseconds")
+        efficient_us = mean_runtime_us(model_efficient_attn.forward, x, c)
+        print(f"The efficientattn model runs in {efficient_us:.3f} microseconds")
+        profile_model(
+            model_native, "NATIVE", "NativeAttention stats", "GB used by native block"
+        )
+        profile_model(
+            model_efficient_attn, "Xformers", "xformers stats", "GB used by xformers block"
+        )
+def test01():
+    """Check a 1x1 ``Conv2d`` against a ``Linear`` layer sharing its weights.
+    Prints parameter counts, weight shapes and the mean/shape of both outputs
+    for the same random input. Moves modules and data to CUDA.
+    """
+    # conv1x1 vs linear
+    from ..util import count_params
+    pointwise_conv = nn.Conv2d(3, 32, kernel_size=1).cuda()
+    print(count_params(pointwise_conv))
+    dense = torch.nn.Linear(3, 32).cuda()
+    print(count_params(dense))
+    print(pointwise_conv.weight.shape)
+    # use same initialization: drop the two trailing size-1 kernel dims
+    shared_weight = pointwise_conv.weight.squeeze(-1).squeeze(-1)
+    dense.weight = torch.nn.Parameter(shared_weight)
+    dense.bias = torch.nn.Parameter(pointwise_conv.bias)
+    print(dense.weight.shape)
+    x = torch.randn(11, 3, 64, 64).cuda()
+    tokens = rearrange(x, "b c h w -> b (h w) c").contiguous()
+    print(tokens.shape)
+    # Linear consumes the flattened tokens, the conv consumes the image layout.
+    for layer, layer_input in ((dense, tokens), (pointwise_conv, x)):
+        result = layer(layer_input)
+        print(result.mean(), result.shape)
+    print("done with test01.\n")
+def test02():
+    """Time one forward pass of ``BasicTransformerBlock`` for two attention modes.
+    Runs the "softmax" block first, printing the message of any ``RuntimeError``
+    (likely out-of-memory) instead of failing, then the "flash-cosine" block.
+    For each successful pass the output shape and elapsed seconds are printed.
+    Modules and inputs are moved to CUDA.
+    """
+    # try cosine flash attention
+    import time
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cudnn.benchmark = True
+    print("testing cosine flash attention...")
+    DIM = 1024
+    SEQLEN = 4096
+    BS = 16
+    def build_block(attn_mode):
+        """Build the benchmark block on CUDA with the given attention mode."""
+        return BasicTransformerBlock(
+            dim=DIM,
+            n_heads=16,
+            d_head=64,
+            dropout=0.0,
+            context_dim=None,
+            attn_mode=attn_mode,
+        ).cuda()
+    def timed_forward(model):
+        """Run one forward pass on fresh random input and print shape and seconds."""
+        x = torch.randn(BS, SEQLEN, DIM).cuda()
+        # CUDA work is queued asynchronously; synchronize on both sides so the
+        # interval covers the actual compute rather than just the kernel launch.
+        torch.cuda.synchronize()
+        tic = time.perf_counter()
+        y = model(x)
+        torch.cuda.synchronize()
+        toc = time.perf_counter()
+        print(y.shape, toc - tic)
+    print(" softmax (vanilla) first...")
+    model = build_block("softmax")
+    try:
+        timed_forward(model)
+    except RuntimeError as e:
+        # likely oom
+        print(str(e))
+    print("\n now flash-cosine...")
+    # Deliberately not guarded: a failure of the mode under test should surface.
+    model = build_block("flash-cosine")
+    timed_forward(model)
+    print("done with test02.\n")
+if __name__ == "__main__":
+    # Manual entry point: uncomment the check or benchmark you want to run.
+    # test01()
+    # test02()
+    # test03()  # NOTE(review): no test03 is visible in this part of the file; confirm it exists before enabling
+    # benchmark_attn()
+    benchmark_transformer_blocks()
+    print("done.")

sgm/modules/diffusionmodules/__init__.py CHANGED Viewed

@@ -2,6 +2,6 @@ from .denoiser import Denoiser
 from .discretizer import Discretization
 from .loss import StandardDiffusionLoss
 from .model import Model, Encoder, Decoder
-from .openaimodel import UnifiedUNetModel
 from .sampling import BaseDiffusionSampler
 from .wrappers import OpenAIWrapper

 from .discretizer import Discretization
 from .loss import StandardDiffusionLoss
 from .model import Model, Encoder, Decoder
+from .openaimodel import UNetModel
 from .sampling import BaseDiffusionSampler
 from .wrappers import OpenAIWrapper

sgm/modules/diffusionmodules/guiders.py CHANGED Viewed

@@ -11,8 +11,8 @@ class VanillaCFG:
     """
     def __init__(self, scale, dyn_thresh_config=None):
-        self.scale_value = scale
         self.dyn_thresh = instantiate_from_config(
             default(
                 dyn_thresh_config,
@@ -24,14 +24,15 @@ class VanillaCFG:
     def __call__(self, x, sigma):
         x_u, x_c = x.chunk(2)
-        x_pred = self.dyn_thresh(x_u, x_c, self.scale_value)
         return x_pred
     def prepare_inputs(self, x, s, c, uc):
         c_out = dict()
         for k in c:
-            if k in ["vector", "t_crossattn", "v_crossattn", "concat"]:
                 c_out[k] = torch.cat((uc[k], c[k]), 0)
             else:
                 assert c[k] == uc[k]
@@ -39,6 +40,34 @@ class VanillaCFG:
         return torch.cat([x] * 2), torch.cat([s] * 2), c_out
 class IdentityGuider:
     def __call__(self, x, sigma):
         return x

     """
     def __init__(self, scale, dyn_thresh_config=None):
+        scale_schedule = lambda scale, sigma: scale  # independent of step
+        self.scale_schedule = partial(scale_schedule, scale)
         self.dyn_thresh = instantiate_from_config(
             default(
                 dyn_thresh_config,
     def __call__(self, x, sigma):
         x_u, x_c = x.chunk(2)
+        scale_value = self.scale_schedule(sigma)
+        x_pred = self.dyn_thresh(x_u, x_c, scale_value)
         return x_pred
     def prepare_inputs(self, x, s, c, uc):
         c_out = dict()
         for k in c:
+            if k in ["vector", "crossattn", "add_crossattn", "concat"]:
                 c_out[k] = torch.cat((uc[k], c[k]), 0)
             else:
                 assert c[k] == uc[k]
         return torch.cat([x] * 2), torch.cat([s] * 2), c_out
+class DualCFG:
+    """Guider that batches two unconditional branches with one conditional branch.
+    ``prepare_inputs`` triples the batch in the order (uc_1, uc_2, c);
+    ``__call__`` splits a tensor back into those three chunks and hands them,
+    together with ``scale``, to the configured ``DualThresholding`` object.
+    """
+    def __init__(self, scale):
+        self.scale = scale
+        thresholding_config = {
+            "target": "sgm.modules.diffusionmodules.sampling_utils.DualThresholding"
+        }
+        self.dyn_thresh = instantiate_from_config(thresholding_config)
+    def __call__(self, x, sigma):
+        # sigma is part of the guider call signature but is not used here.
+        uncond_first, uncond_second, cond = x.chunk(3)
+        return self.dyn_thresh(uncond_first, uncond_second, cond, self.scale)
+    def prepare_inputs(self, x, s, c, uc_1, uc_2):
+        # Conditioning entries under these keys are stacked along dim 0.
+        batched_keys = ("vector", "crossattn", "concat", "add_crossattn")
+        c_out = {}
+        for key in c:
+            if key not in batched_keys:
+                # Any other entry is passed through once and must match uc_1.
+                assert c[key] == uc_1[key]
+                c_out[key] = c[key]
+                continue
+            c_out[key] = torch.cat((uc_1[key], uc_2[key], c[key]), 0)
+        tripled_x = torch.cat([x] * 3)
+        tripled_s = torch.cat([s] * 3)
+        return tripled_x, tripled_s, c_out
 class IdentityGuider:
     def __call__(self, x, sigma):
         return x

sgm/modules/diffusionmodules/loss.py CHANGED Viewed

@@ -78,9 +78,7 @@ class FullLoss(StandardDiffusionLoss):
         min_attn_size=16,
         lambda_local_loss=0.0,
         lambda_ocr_loss=0.0,
-        lambda_style_loss=0.0,
         ocr_enabled = False,
-        style_enabled = False,
         predictor_config = None,
         *args, **kwarg
     ):
@@ -93,9 +91,7 @@ class FullLoss(StandardDiffusionLoss):
         self.min_attn_size = min_attn_size
         self.lambda_local_loss = lambda_local_loss
         self.lambda_ocr_loss = lambda_ocr_loss
-        self.lambda_style_loss = lambda_style_loss
-        self.style_enabled = style_enabled
         self.ocr_enabled = ocr_enabled
         if ocr_enabled:
             self.predictor = instantiate_from_config(predictor_config)
@@ -152,15 +148,9 @@ class FullLoss(StandardDiffusionLoss):
             ocr_loss = self.get_ocr_loss(model_output, batch["r_bbox"], batch["label"], first_stage_model, scaler)
             ocr_loss = ocr_loss.mean()
-        if self.style_enabled:
-            style_loss = self.get_style_local_loss(network.diffusion_model.attn_map_cache, batch["mask"])
-            style_loss = style_loss.mean()
         loss = diff_loss + self.lambda_local_loss * local_loss
         if self.ocr_enabled:
             loss += self.lambda_ocr_loss * ocr_loss
-        if self.style_enabled:
-            loss += self.lambda_style_loss * style_loss
         loss_dict = {
             "loss/diff_loss": diff_loss,
@@ -170,8 +160,6 @@ class FullLoss(StandardDiffusionLoss):
         if self.ocr_enabled:
             loss_dict["loss/ocr_loss"] = ocr_loss
-        if self.style_enabled:
-            loss_dict["loss/style_loss"] = style_loss
         return loss, loss_dict
@@ -196,9 +184,6 @@ class FullLoss(StandardDiffusionLoss):
         for item in attn_map_cache:
-            name = item["name"]
-            if not name.endswith("t_attn"): continue
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
@@ -241,9 +226,6 @@ class FullLoss(StandardDiffusionLoss):
         for item in attn_map_cache:
-            name = item["name"]
-            if not name.endswith("t_attn"): continue
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
@@ -252,7 +234,7 @@ class FullLoss(StandardDiffusionLoss):
             seg_l = seg_mask.shape[1]
-            bh, n, l = attn_map.shape # bh: batch size * heads / n: pixel length(h*w) / l: token length
             attn_map = attn_map.reshape((-1, heads, n, l)) # b, h, n, l
             assert seg_l <= l
@@ -283,43 +265,4 @@ class FullLoss(StandardDiffusionLoss):
         loss = loss / count
-        return loss
-    def get_style_local_loss(self, attn_map_cache, mask):
-        loss = 0
-        count = 0
-        for item in attn_map_cache:
-            name = item["name"]
-            if not name.endswith("v_attn"): continue
-            heads = item["heads"]
-            size = item["size"]
-            attn_map = item["attn_map"]
-            if size < self.min_attn_size: continue
-            bh, n, l = attn_map.shape # bh: batch size * heads / n: pixel length(h*w) / l: token length
-            attn_map = attn_map.reshape((-1, heads, n, l)) # b, h, n, l
-            attn_map = attn_map.permute(0, 1, 3, 2) # b, h, l, n
-            attn_map = attn_map.mean(dim = 1) # b, l, n
-            mask_map = F.interpolate(mask, (size, size))
-            mask_map = mask_map.reshape((-1, l, n)) # b, l, n
-            n_mask_map = 1 - mask_map
-            p_loss = (mask_map * attn_map).sum(dim = -1) / (mask_map.sum(dim = -1) + 1e-5) # b, l
-            n_loss = (n_mask_map * attn_map).sum(dim = -1) / (n_mask_map.sum(dim = -1) + 1e-5) # b, l
-            p_loss = p_loss.mean(dim = -1)
-            n_loss = n_loss.mean(dim = -1)
-            f_loss = n_loss - p_loss # b,
-            loss += f_loss
-            count += 1
-        loss = loss / count
         return loss

         min_attn_size=16,
         lambda_local_loss=0.0,
         lambda_ocr_loss=0.0,
         ocr_enabled = False,
         predictor_config = None,
         *args, **kwarg
     ):
         self.min_attn_size = min_attn_size
         self.lambda_local_loss = lambda_local_loss
         self.lambda_ocr_loss = lambda_ocr_loss
         self.ocr_enabled = ocr_enabled
         if ocr_enabled:
             self.predictor = instantiate_from_config(predictor_config)
             ocr_loss = self.get_ocr_loss(model_output, batch["r_bbox"], batch["label"], first_stage_model, scaler)
             ocr_loss = ocr_loss.mean()
         loss = diff_loss + self.lambda_local_loss * local_loss
         if self.ocr_enabled:
             loss += self.lambda_ocr_loss * ocr_loss
         loss_dict = {
             "loss/diff_loss": diff_loss,
         if self.ocr_enabled:
             loss_dict["loss/ocr_loss"] = ocr_loss
         return loss, loss_dict
         for item in attn_map_cache:
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
         for item in attn_map_cache:
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
             seg_l = seg_mask.shape[1]
+            bh, n, l = attn_map.shape # bh: batch size * heads / n : pixel length(h*w) / l: token length
             attn_map = attn_map.reshape((-1, heads, n, l)) # b, h, n, l
             assert seg_l <= l
         loss = loss / count
         return loss

sgm/modules/diffusionmodules/openaimodel.py CHANGED Viewed

@@ -1,4 +1,7 @@
 from abc import abstractmethod
 from typing import Iterable
 import numpy as np
@@ -10,6 +13,7 @@ from einops import rearrange
 from ...modules.attention import SpatialTransformer
 from ...modules.diffusionmodules.util import (
     avg_pool_nd,
     conv_nd,
     linear,
     normalization,
@@ -19,14 +23,47 @@ from ...modules.diffusionmodules.util import (
 from ...util import default, exists
-class Timestep(nn.Module):
-    def __init__(self, dim):
         super().__init__()
-        self.dim = dim
-    def forward(self, t):
-        return timestep_embedding(t, self.dim)
 class TimestepBlock(nn.Module):
     """
@@ -50,14 +87,19 @@ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
         self,
         x,
         emb,
-        t_context=None,
-        v_context=None
     ):
         for layer in self:
             if isinstance(layer, TimestepBlock):
                 x = layer(x, emb)
             elif isinstance(layer, SpatialTransformer):
-                x = layer(x, t_context, v_context)
             else:
                 x = layer(x)
         return x
@@ -102,6 +144,22 @@ class Upsample(nn.Module):
         return x
 class Downsample(nn.Module):
     """
     A downsampling layer with an optional convolution.
@@ -149,6 +207,17 @@ class Downsample(nn.Module):
 class ResBlock(TimestepBlock):
     """
     A residual block that can optionally change the number of channels.
     """
     def __init__(
@@ -160,11 +229,12 @@ class ResBlock(TimestepBlock):
         use_conv=False,
         use_scale_shift_norm=False,
         dims=2,
         up=False,
         down=False,
         kernel_size=3,
         exchange_temb_dims=False,
-        skip_t_emb=False
     ):
         super().__init__()
         self.channels = channels
@@ -172,6 +242,7 @@ class ResBlock(TimestepBlock):
         self.dropout = dropout
         self.out_channels = out_channels or channels
         self.use_conv = use_conv
         self.use_scale_shift_norm = use_scale_shift_norm
         self.exchange_temb_dims = exchange_temb_dims
@@ -240,6 +311,17 @@ class ResBlock(TimestepBlock):
             self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
     def forward(self, x, emb):
         if self.updown:
             in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
             h = in_rest(x)
@@ -267,42 +349,233 @@ class ResBlock(TimestepBlock):
             h = self.out_layers(h)
         return self.skip_connection(x) + h
-import seaborn as sns
-import matplotlib.pyplot as plt
-class UnifiedUNetModel(nn.Module):
     def __init__(
         self,
         in_channels,
-        ctrl_channels,
         model_channels,
         out_channels,
         num_res_blocks,
         attention_resolutions,
         dropout=0,
         channel_mult=(1, 2, 4, 8),
-        save_attn_type=None,
-        save_attn_layers=[],
         conv_resample=True,
         dims=2,
-        use_label=None,
         num_heads=-1,
         num_head_channels=-1,
         num_heads_upsample=-1,
         use_scale_shift_norm=False,
         resblock_updown=False,
-        transformer_depth=1,
-        t_context_dim=None,
-        v_context_dim=None,
         num_attention_blocks=None,
         use_linear_in_transformer=False,
         adm_in_channels=None,
-        transformer_depth_middle=None
     ):
         super().__init__()
         if num_heads_upsample == -1:
             num_heads_upsample = num_heads
@@ -318,39 +591,106 @@ class UnifiedUNetModel(nn.Module):
             ), "Either num_heads or num_head_channels has to be set"
         self.in_channels = in_channels
-        self.ctrl_channels = ctrl_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
-        transformer_depth = len(channel_mult) * [transformer_depth]
-        transformer_depth_middle = default(transformer_depth_middle, transformer_depth[-1])
-        self.num_res_blocks = len(channel_mult) * [num_res_blocks]
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
         self.conv_resample = conv_resample
-        self.use_label = use_label
         self.num_heads = num_heads
         self.num_head_channels = num_head_channels
         self.num_heads_upsample = num_heads_upsample
         time_embed_dim = model_channels * 4
-        self.time_embed = nn.Sequential(
-            linear(model_channels, time_embed_dim),
-            nn.SiLU(),
-            linear(time_embed_dim, time_embed_dim),
         )
-        if self.use_label is not None:
-            self.label_emb = nn.Sequential(
-                nn.Sequential(
-                    linear(adm_in_channels, time_embed_dim),
-                    nn.SiLU(),
-                    linear(time_embed_dim, time_embed_dim),
                 )
-            )
         self.input_blocks = nn.ModuleList(
             [
@@ -359,26 +699,6 @@ class UnifiedUNetModel(nn.Module):
                 )
             ]
         )
-        if self.ctrl_channels > 0:
-            self.ctrl_block = TimestepEmbedSequential(
-                conv_nd(dims, ctrl_channels, 16, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 16, 16, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 16, 32, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 32, 32, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 32, 96, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 96, 96, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 96, 256, 3, padding=1),
-                nn.SiLU(),
-                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
-            )
         self._feature_size = model_channels
         input_block_chans = [model_channels]
         ch = model_channels
@@ -386,13 +706,16 @@ class UnifiedUNetModel(nn.Module):
         for level, mult in enumerate(channel_mult):
             for nr in range(self.num_res_blocks[level]):
                 layers = [
-                    ResBlock(
-                        ch,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=mult * model_channels,
-                        dims=dims,
-                        use_scale_shift_norm=use_scale_shift_norm
                     )
                 ]
                 ch = mult * model_channels
@@ -402,19 +725,45 @@ class UnifiedUNetModel(nn.Module):
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
                     if (
                         not exists(num_attention_blocks)
                         or nr < num_attention_blocks[level]
                     ):
                         layers.append(
-                            SpatialTransformer(
-                                ch,
-                                num_heads,
-                                dim_head,
-                                depth=transformer_depth[level],
-                                t_context_dim=t_context_dim,
-                                v_context_dim=v_context_dim,
-                                use_linear=use_linear_in_transformer
                             )
                         )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
@@ -424,14 +773,17 @@ class UnifiedUNetModel(nn.Module):
                 out_ch = ch
                 self.input_blocks.append(
                     TimestepEmbedSequential(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            down=True
                         )
                         if resblock_updown
                         else Downsample(
@@ -449,33 +801,54 @@ class UnifiedUNetModel(nn.Module):
         else:
             num_heads = ch // num_head_channels
             dim_head = num_head_channels
         self.middle_block = TimestepEmbedSequential(
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_scale_shift_norm=use_scale_shift_norm
-            ),
-            SpatialTransformer(  # always uses a self-attn
-                ch,
-                num_heads,
-                dim_head,
-                depth=transformer_depth_middle,
-                t_context_dim=t_context_dim,
-                v_context_dim=v_context_dim,
-                use_linear=use_linear_in_transformer
             ),
-            ResBlock(
-                ch,
-                time_embed_dim,
-                dropout,
-                dims=dims,
-                use_scale_shift_norm=use_scale_shift_norm
             )
         )
         self._feature_size += ch
         self.output_blocks = nn.ModuleList([])
@@ -483,13 +856,16 @@ class UnifiedUNetModel(nn.Module):
             for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
                 layers = [
-                    ResBlock(
-                        ch + ich,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=model_channels * mult,
-                        dims=dims,
-                        use_scale_shift_norm=use_scale_shift_norm
                     )
                 ]
                 ch = model_channels * mult
@@ -499,32 +875,61 @@ class UnifiedUNetModel(nn.Module):
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
                     if (
                         not exists(num_attention_blocks)
                         or i < num_attention_blocks[level]
                     ):
                         layers.append(
-                            SpatialTransformer(
-                                ch,
-                                num_heads,
-                                dim_head,
-                                depth=transformer_depth[level],
-                                t_context_dim=t_context_dim,
-                                v_context_dim=v_context_dim,
-                                use_linear=use_linear_in_transformer
                             )
                         )
                 if level and i == self.num_res_blocks[level]:
                     out_ch = ch
                     layers.append(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            up=True
                         )
                         if resblock_updown
                         else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
@@ -533,92 +938,1133 @@ class UnifiedUNetModel(nn.Module):
                 self.output_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
-        self.out = nn.Sequential(
-            normalization(ch),
-            nn.SiLU(),
-            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1))
         )
-        # cache attn map
-        self.attn_type = save_attn_type
-        self.attn_layers = save_attn_layers
-        self.attn_map_cache = []
-        for name, module in self.named_modules():
-            if any([name.endswith(attn_type) for attn_type in self.attn_type]):
-                item = {"name": name, "heads": module.heads, "size": None, "attn_map": None}
-                self.attn_map_cache.append(item)
-                module.attn_map_cache = item
-    def clear_attn_map(self):
-        for item in self.attn_map_cache:
-            if item["attn_map"] is not None:
-                del item["attn_map"]
-                item["attn_map"] = None
-    def save_attn_map(self, attn_type="t_attn", save_name="temp", tokens=""):
-        attn_maps = []
-        for item in self.attn_map_cache:
-            name = item["name"]
-            if any([name.startswith(block) for block in self.attn_layers]) and name.endswith(attn_type):
-                heads = item["heads"]
-                attn_maps.append(item["attn_map"].detach().cpu())
-        attn_map = th.stack(attn_maps, dim=0)
-        attn_map = th.mean(attn_map, dim=0)
-        # attn_map: bh * n * l
-        bh, n, l = attn_map.shape # bh: batch size * heads / n : pixel length(h*w) / l: token length
-        attn_map = attn_map.reshape((-1,heads,n,l)).mean(dim=1)
-        b = attn_map.shape[0]
-        h = w = int(n**0.5)
-        attn_map = attn_map.permute(0,2,1).reshape((b,l,h,w)).numpy()
-        attn_map_i = attn_map[-1]
-        l = attn_map_i.shape[0]
-        fig = plt.figure(figsize=(12, 8), dpi=300)
-        for j in range(12):
-            if j >= l: break
-            ax = fig.add_subplot(3, 4, j+1)
-            sns.heatmap(attn_map_i[j], square=True, xticklabels=False, yticklabels=False)
-            if j < len(tokens):
-                ax.set_title(tokens[j])
-        fig.savefig(f"temp/attn_map/attn_map_{save_name}.png")
-        plt.close()
-        return attn_map_i
-    def forward(self, x, timesteps=None, t_context=None, v_context=None, y=None, **kwargs):
         assert (y is not None) == (
-            self.use_label is not None
         ), "must specify y if and only if the model is class-conditional"
-        self.clear_attn_map()
         hs = []
         t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
         emb = self.time_embed(t_emb)
-        if self.use_label is not None:
             assert y.shape[0] == x.shape[0]
             emb = emb + self.label_emb(y)
         h = x
-        if self.ctrl_channels > 0:
-            in_h, add_h = th.split(h, [self.in_channels, self.ctrl_channels], dim=1)
         for i, module in enumerate(self.input_blocks):
             if self.ctrl_channels > 0 and i == 0:
-                h = module(in_h, emb, t_context, v_context) + self.ctrl_block(add_h, emb, t_context, v_context)
             else:
-                h = module(h, emb, t_context, v_context)
             hs.append(h)
-        h = self.middle_block(h, emb, t_context, v_context)
         for i, module in enumerate(self.output_blocks):
             h = th.cat([h, hs.pop()], dim=1)
-            h = module(h, emb, t_context, v_context)
         h = h.type(x.dtype)
         return self.out(h)

+import os
+import math
 from abc import abstractmethod
+from functools import partial
 from typing import Iterable
 import numpy as np
 from ...modules.attention import SpatialTransformer
 from ...modules.diffusionmodules.util import (
     avg_pool_nd,
+    checkpoint,
     conv_nd,
     linear,
     normalization,
 from ...util import default, exists
+# Dummy replacements: no-op stand-ins applied by convert_to_fp16 / convert_to_fp32.
+def convert_module_to_f16(x):
+    """No-op placeholder: takes a module, leaves it untouched, returns None."""
+    return None
+def convert_module_to_f32(x):
+    """No-op placeholder: takes a module, leaves it untouched, returns None."""
+    return None
+## go
+class AttentionPool2d(nn.Module):
+    """
+    Attention pooling over a flattened feature map.
+    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
+    :param spacial_dim: sizes the positional embedding, which holds
+        spacial_dim**2 + 1 positions.
+    :param embed_dim: channel count of the incoming features.
+    :param num_heads_channels: channels per head; the number of heads is
+        embed_dim // num_heads_channels.
+    :param output_dim: channels of the pooled output; embed_dim if unset.
+    """
+    def __init__(
+        self,
+        spacial_dim: int,
+        embed_dim: int,
+        num_heads_channels: int,
+        output_dim: int = None,
+    ):
         super().__init__()
+        num_positions = spacial_dim**2 + 1
+        init_scale = embed_dim**0.5
+        self.positional_embedding = nn.Parameter(
+            th.randn(embed_dim, num_positions) / init_scale
+        )
+        pooled_channels = output_dim or embed_dim
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, pooled_channels, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+    def forward(self, x):
+        """Flatten, prepend the mean token, attend, and return position 0."""
+        batch, channels, *_ = x.shape
+        tokens = x.reshape(batch, channels, -1)  # NC(HW)
+        mean_token = tokens.mean(dim=-1, keepdim=True)
+        tokens = th.cat([mean_token, tokens], dim=-1)  # NC(HW+1)
+        position = self.positional_embedding[None, :, :].to(tokens.dtype)
+        tokens = tokens + position  # NC(HW+1)
+        attended = self.attention(self.qkv_proj(tokens))
+        # index 0 is the prepended mean token
+        return self.c_proj(attended)[:, :, 0]
 class TimestepBlock(nn.Module):
     """
         self,
         x,
         emb,
+        context=None,
+        add_context=None,
+        skip_time_mix=False,
+        time_context=None,
+        num_video_frames=None,
+        time_context_cat=None,
+        use_crossframe_attention_in_spatial_layers=False,
     ):
         for layer in self:
             if isinstance(layer, TimestepBlock):
                 x = layer(x, emb)
             elif isinstance(layer, SpatialTransformer):
+                x = layer(x, context, add_context)
             else:
                 x = layer(x)
         return x
         return x
+class TransposedUpsample(nn.Module):
+    """
+    Learned 2x upsampling without padding.
+    :param channels: channels in the input.
+    :param out_channels: channels in the output; falls back to `channels`.
+    :param ks: kernel size of the stride-2 transposed convolution.
+    """
+    def __init__(self, channels, out_channels=None, ks=5):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels if out_channels else channels
+        self.up = nn.ConvTranspose2d(
+            in_channels=self.channels,
+            out_channels=self.out_channels,
+            kernel_size=ks,
+            stride=2,
+        )
+    def forward(self, x):
+        """Run the transposed convolution on `x` and return its output."""
+        upsampled = self.up(x)
+        return upsampled
 class Downsample(nn.Module):
     """
     A downsampling layer with an optional convolution.
 class ResBlock(TimestepBlock):
     """
     A residual block that can optionally change the number of channels.
+    :param channels: the number of input channels.
+    :param emb_channels: the number of timestep embedding channels.
+    :param dropout: the rate of dropout.
+    :param out_channels: if specified, the number of out channels.
+    :param use_conv: if True and out_channels is specified, use a spatial
+        convolution instead of a smaller 1x1 convolution to change the
+        channels in the skip connection.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param use_checkpoint: if True, use gradient checkpointing on this module.
+    :param up: if True, use this block for upsampling.
+    :param down: if True, use this block for downsampling.
     """
     def __init__(
         use_conv=False,
         use_scale_shift_norm=False,
         dims=2,
+        use_checkpoint=False,
         up=False,
         down=False,
         kernel_size=3,
         exchange_temb_dims=False,
+        skip_t_emb=False,
     ):
         super().__init__()
         self.channels = channels
         self.dropout = dropout
         self.out_channels = out_channels or channels
         self.use_conv = use_conv
+        self.use_checkpoint = use_checkpoint
         self.use_scale_shift_norm = use_scale_shift_norm
         self.exchange_temb_dims = exchange_temb_dims
             self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
     def forward(self, x, emb):
+        """
+        Apply the block to a Tensor, conditioned on a timestep embedding.
+        :param x: an [N x C x ...] Tensor of features.
+        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        return checkpoint(
+            self._forward, (x, emb), self.parameters(), self.use_checkpoint
+        )
+    def _forward(self, x, emb):
         if self.updown:
             in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
             h = in_rest(x)
             h = self.out_layers(h)
         return self.skip_connection(x) + h
+class AttentionBlock(nn.Module):
+    """
+    An attention block that allows spatial positions to attend to each other.
+    Originally ported from here, but adapted to the N-d case.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+    :param channels: channels of the input (and output) features.
+    :param num_heads: head count, used only when num_head_channels is -1.
+    :param num_head_channels: channels per head; when not -1 the head count
+        is channels // num_head_channels.
+    :param use_checkpoint: stored on the module; `forward` does not read it.
+    :param use_new_attention_order: pick QKVAttention instead of
+        QKVAttentionLegacy.
+    """
+    def __init__(
+        self,
+        channels,
+        num_heads=1,
+        num_head_channels=-1,
+        use_checkpoint=False,
+        use_new_attention_order=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        if num_head_channels != -1:
+            assert (
+                channels % num_head_channels == 0
+            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+            self.num_heads = channels // num_head_channels
+        else:
+            self.num_heads = num_heads
+        self.use_checkpoint = use_checkpoint
+        self.norm = normalization(channels)
+        self.qkv = conv_nd(1, channels, channels * 3, 1)
+        # new order: split qkv before split heads; legacy: split heads before split qkv
+        attention_cls = QKVAttention if use_new_attention_order else QKVAttentionLegacy
+        self.attention = attention_cls(self.num_heads)
+        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+    def forward(self, x, **kwargs):
+        """Run `_forward` through `checkpoint`; extra kwargs are ignored."""
+        # TODO add crossframe attention and use mixed checkpoint
+        # The checkpoint flag is hard-coded to True here rather than taken
+        # from self.use_checkpoint.
+        # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
+        params = self.parameters()
+        return checkpoint(self._forward, (x,), params, True)
+        # return pt_checkpoint(self._forward, x)  # pytorch
+    def _forward(self, x):
+        """Attend over flattened positions and add the result residually."""
+        batch, channels, *spatial = x.shape
+        flat = x.reshape(batch, channels, -1)
+        attended = self.attention(self.qkv(self.norm(flat)))
+        residual = self.proj_out(attended)
+        return (flat + residual).reshape(batch, channels, *spatial)
+def count_flops_attn(model, _x, y):
+    """
+    Op counter for an attention operation, for use with the `thop` package.
+    Adds the matmul op count derived from the shape of `y[0]` to
+    `model.total_ops`; `_x` is not used.
+    Meant to be used like:
+        macs, params = thop.profile(
+            model,
+            inputs=(inputs, timestamps),
+            custom_ops={QKVAttention: QKVAttention.count_flops},
+        )
+    """
+    batch, channels, *spatial_dims = y[0].shape
+    positions = int(np.prod(spatial_dims))
+    # Two matmuls of equal cost: one builds the weight matrix, the other
+    # combines the value vectors.
+    ops_per_matmul = batch * (positions**2) * channels
+    model.total_ops += th.DoubleTensor([2 * ops_per_matmul])
+class QKVAttentionLegacy(nn.Module):
+    """
+    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
+    """
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        """
+        Apply QKV attention, splitting into heads before splitting q, k, v.
+        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+        :return: an [N x (H * C) x T] tensor after attention.
+        """
+        batch, width, length = qkv.shape
+        heads = self.n_heads
+        assert width % (3 * heads) == 0
+        head_ch = width // (3 * heads)
+        per_head = qkv.reshape(batch * heads, head_ch * 3, length)
+        q, k, v = per_head.split(head_ch, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(head_ch))
+        # Scaling q and k separately is more stable with f16 than dividing afterwards
+        logits = th.einsum("bct,bcs->bts", q * scale, k * scale)
+        probs = th.softmax(logits.float(), dim=-1).type(logits.dtype)
+        attended = th.einsum("bts,bcs->bct", probs, v)
+        return attended.reshape(batch, -1, length)
+    @staticmethod
+    def count_flops(model, _x, y):
+        """Forward to `count_flops_attn` with the same arguments."""
+        return count_flops_attn(model, _x, y)
+class QKVAttention(nn.Module):
+    """
+    A module which performs QKV attention and splits in a different order.
+    """
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        """
+        Apply QKV attention, splitting q, k, v before splitting into heads.
+        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
+        :return: an [N x (H * C) x T] tensor after attention.
+        """
+        batch, width, length = qkv.shape
+        heads = self.n_heads
+        assert width % (3 * heads) == 0
+        head_ch = width // (3 * heads)
+        q, k, v = qkv.chunk(3, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(head_ch))
+        # Scaling q and k separately is more stable with f16 than dividing afterwards
+        q_heads = (q * scale).view(batch * heads, head_ch, length)
+        k_heads = (k * scale).view(batch * heads, head_ch, length)
+        logits = th.einsum("bct,bcs->bts", q_heads, k_heads)
+        probs = th.softmax(logits.float(), dim=-1).type(logits.dtype)
+        v_heads = v.reshape(batch * heads, head_ch, length)
+        attended = th.einsum("bts,bcs->bct", probs, v_heads)
+        return attended.reshape(batch, -1, length)
+    @staticmethod
+    def count_flops(model, _x, y):
+        """Forward to `count_flops_attn` with the same arguments."""
+        return count_flops_attn(model, _x, y)
+class Timestep(nn.Module):
+    """Module wrapper that calls `timestep_embedding` with a fixed `dim`."""
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+    def forward(self, t):
+        """Return `timestep_embedding(t, self.dim)`."""
+        embedding = timestep_embedding(t, self.dim)
+        return embedding
+class UNetModel(nn.Module):
+    """
+    The full UNet model with attention and timestep embedding.
+    :param in_channels: channels in the input Tensor.
+    :param model_channels: base channel count for the model.
+    :param out_channels: channels in the output Tensor.
+    :param num_res_blocks: number of residual blocks per downsample.
+    :param attention_resolutions: a collection of downsample rates at which
+        attention will take place. May be a set, list, or tuple.
+        For example, if this contains 4, then at 4x downsampling, attention
+        will be used.
+    :param dropout: the dropout probability.
+    :param channel_mult: channel multiplier for each level of the UNet.
+    :param conv_resample: if True, use learned convolutions for upsampling and
+        downsampling.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param num_classes: if specified (as an int), then this model will be
+        class-conditional with `num_classes` classes.
+    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
+    :param num_heads: the number of attention heads in each attention layer.
+    :param num_heads_channels: if specified, ignore num_heads and instead use
+                               a fixed channel width per attention head.
+    :param num_heads_upsample: works with num_heads to set a different number
+                               of heads for upsampling. Deprecated.
+    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+    :param resblock_updown: use residual blocks for up/downsampling.
+    :param use_new_attention_order: use a different attention pattern for potentially
+                                    increased efficiency.
+    """
     def __init__(
         self,
         in_channels,
         model_channels,
         out_channels,
         num_res_blocks,
         attention_resolutions,
         dropout=0,
         channel_mult=(1, 2, 4, 8),
         conv_resample=True,
         dims=2,
+        num_classes=None,
+        use_checkpoint=False,
+        use_fp16=False,
         num_heads=-1,
         num_head_channels=-1,
         num_heads_upsample=-1,
         use_scale_shift_norm=False,
         resblock_updown=False,
+        use_new_attention_order=False,
+        use_spatial_transformer=False,  # custom transformer support
+        transformer_depth=1,  # custom transformer support
+        context_dim=None,  # custom transformer support
+        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
+        legacy=True,
+        disable_self_attentions=None,
         num_attention_blocks=None,
+        disable_middle_self_attn=False,
         use_linear_in_transformer=False,
+        spatial_transformer_attn_type="softmax",
         adm_in_channels=None,
+        use_fairscale_checkpoint=False,
+        offload_to_cpu=False,
+        transformer_depth_middle=None,
     ):
         super().__init__()
+        from omegaconf.listconfig import ListConfig
+        if use_spatial_transformer:
+            assert (
+                context_dim is not None
+            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
+        if context_dim is not None:
+            assert (
+                use_spatial_transformer
+            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
+            if type(context_dim) == ListConfig:
+                context_dim = list(context_dim)
         if num_heads_upsample == -1:
             num_heads_upsample = num_heads
             ), "Either num_heads or num_head_channels has to be set"
         self.in_channels = in_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
+        if isinstance(transformer_depth, int):
+            transformer_depth = len(channel_mult) * [transformer_depth]
+        elif isinstance(transformer_depth, ListConfig):
+            transformer_depth = list(transformer_depth)
+        transformer_depth_middle = default(
+            transformer_depth_middle, transformer_depth[-1]
+        )
+        if isinstance(num_res_blocks, int):
+            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+        else:
+            if len(num_res_blocks) != len(channel_mult):
+                raise ValueError(
+                    "provide num_res_blocks either as an int (globally constant) or "
+                    "as a list/tuple (per-level) with the same length as channel_mult"
+                )
+            self.num_res_blocks = num_res_blocks
+        # self.num_res_blocks = num_res_blocks
+        if disable_self_attentions is not None:
+            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
+            assert len(disable_self_attentions) == len(channel_mult)
+        if num_attention_blocks is not None:
+            assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(
+                map(
+                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i],
+                    range(len(num_attention_blocks)),
+                )
+            )
+            print(
+                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                f"attention will still not be set."
+            )  # todo: convert to warning
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
         self.conv_resample = conv_resample
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+        if use_fp16:
+            print("WARNING: use_fp16 was dropped and has no effect anymore.")
+        # self.dtype = th.float16 if use_fp16 else th.float32
         self.num_heads = num_heads
         self.num_head_channels = num_head_channels
         self.num_heads_upsample = num_heads_upsample
+        self.predict_codebook_ids = n_embed is not None
+        assert use_fairscale_checkpoint != use_checkpoint or not (
+            use_checkpoint or use_fairscale_checkpoint
+        )
+        self.use_fairscale_checkpoint = False
+        checkpoint_wrapper_fn = (
+            partial(checkpoint_wrapper, offload_to_cpu=offload_to_cpu)
+            if self.use_fairscale_checkpoint
+            else lambda x: x
+        )
         time_embed_dim = model_channels * 4
+        self.time_embed = checkpoint_wrapper_fn(
+            nn.Sequential(
+                linear(model_channels, time_embed_dim),
+                nn.SiLU(),
+                linear(time_embed_dim, time_embed_dim),
+            )
         )
+        if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "timestep":
+                self.label_emb = checkpoint_wrapper_fn(
+                    nn.Sequential(
+                        Timestep(model_channels),
+                        nn.Sequential(
+                            linear(model_channels, time_embed_dim),
+                            nn.SiLU(),
+                            linear(time_embed_dim, time_embed_dim),
+                        ),
+                    )
                 )
+            elif self.num_classes == "sequential":
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    )
+                )
+            else:
+                raise ValueError()
         self.input_blocks = nn.ModuleList(
             [
                 )
             ]
         )
         self._feature_size = model_channels
         input_block_chans = [model_channels]
         ch = model_channels
         for level, mult in enumerate(channel_mult):
             for nr in range(self.num_res_blocks[level]):
                 layers = [
+                    checkpoint_wrapper_fn(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=mult * model_channels,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                        )
                     )
                 ]
                 ch = mult * model_channels
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
+                    if legacy:
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
                     if (
                         not exists(num_attention_blocks)
                         or nr < num_attention_blocks[level]
                     ):
                         layers.append(
+                            checkpoint_wrapper_fn(
+                                AttentionBlock(
+                                    ch,
+                                    use_checkpoint=use_checkpoint,
+                                    num_heads=num_heads,
+                                    num_head_channels=dim_head,
+                                    use_new_attention_order=use_new_attention_order,
+                                )
+                            )
+                            if not use_spatial_transformer
+                            else checkpoint_wrapper_fn(
+                                SpatialTransformer(
+                                    ch,
+                                    num_heads,
+                                    dim_head,
+                                    depth=transformer_depth[level],
+                                    context_dim=context_dim,
+                                    disable_self_attn=disabled_sa,
+                                    use_linear=use_linear_in_transformer,
+                                    attn_type=spatial_transformer_attn_type,
+                                    use_checkpoint=use_checkpoint,
+                                )
                             )
                         )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
                 out_ch = ch
                 self.input_blocks.append(
                     TimestepEmbedSequential(
+                        checkpoint_wrapper_fn(
+                            ResBlock(
+                                ch,
+                                time_embed_dim,
+                                dropout,
+                                out_channels=out_ch,
+                                dims=dims,
+                                use_checkpoint=use_checkpoint,
+                                use_scale_shift_norm=use_scale_shift_norm,
+                                down=True,
+                            )
                         )
                         if resblock_updown
                         else Downsample(
         else:
             num_heads = ch // num_head_channels
             dim_head = num_head_channels
+        if legacy:
+            # num_heads = 1
+            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
         self.middle_block = TimestepEmbedSequential(
+            checkpoint_wrapper_fn(
+                ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )
             ),
+            checkpoint_wrapper_fn(
+                AttentionBlock(
+                    ch,
+                    use_checkpoint=use_checkpoint,
+                    num_heads=num_heads,
+                    num_head_channels=dim_head,
+                    use_new_attention_order=use_new_attention_order,
+                )
             )
+            if not use_spatial_transformer
+            else checkpoint_wrapper_fn(
+                SpatialTransformer(  # always uses a self-attn
+                    ch,
+                    num_heads,
+                    dim_head,
+                    depth=transformer_depth_middle,
+                    context_dim=context_dim,
+                    disable_self_attn=disable_middle_self_attn,
+                    use_linear=use_linear_in_transformer,
+                    attn_type=spatial_transformer_attn_type,
+                    use_checkpoint=use_checkpoint,
+                )
+            ),
+            checkpoint_wrapper_fn(
+                ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )
+            ),
         )
         self._feature_size += ch
         self.output_blocks = nn.ModuleList([])
             for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
                 layers = [
+                    checkpoint_wrapper_fn(
+                        ResBlock(
+                            ch + ich,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=model_channels * mult,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                        )
                     )
                 ]
                 ch = model_channels * mult
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
+                    if legacy:
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
                     if (
                         not exists(num_attention_blocks)
                         or i < num_attention_blocks[level]
                     ):
                         layers.append(
+                            checkpoint_wrapper_fn(
+                                AttentionBlock(
+                                    ch,
+                                    use_checkpoint=use_checkpoint,
+                                    num_heads=num_heads_upsample,
+                                    num_head_channels=dim_head,
+                                    use_new_attention_order=use_new_attention_order,
+                                )
+                            )
+                            if not use_spatial_transformer
+                            else checkpoint_wrapper_fn(
+                                SpatialTransformer(
+                                    ch,
+                                    num_heads,
+                                    dim_head,
+                                    depth=transformer_depth[level],
+                                    context_dim=context_dim,
+                                    disable_self_attn=disabled_sa,
+                                    use_linear=use_linear_in_transformer,
+                                    attn_type=spatial_transformer_attn_type,
+                                    use_checkpoint=use_checkpoint,
+                                )
                             )
                         )
                 if level and i == self.num_res_blocks[level]:
                     out_ch = ch
                     layers.append(
+                        checkpoint_wrapper_fn(
+                            ResBlock(
+                                ch,
+                                time_embed_dim,
+                                dropout,
+                                out_channels=out_ch,
+                                dims=dims,
+                                use_checkpoint=use_checkpoint,
+                                use_scale_shift_norm=use_scale_shift_norm,
+                                up=True,
+                            )
                         )
                         if resblock_updown
                         else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                 self.output_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
+        self.out = checkpoint_wrapper_fn(
+            nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+            )
         )
+        if self.predict_codebook_ids:
+            self.id_predictor = checkpoint_wrapper_fn(
+                nn.Sequential(
+                    normalization(ch),
+                    conv_nd(dims, model_channels, n_embed, 1),
+                    # nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
+                )
+            )
+    def convert_to_fp16(self):
+        """
+        Convert the torso of the model to float16.
+        """
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+        self.output_blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        """
+        Convert the torso of the model to float32.
+        """
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+        self.output_blocks.apply(convert_module_to_f32)
+    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
+        """
+        Apply the model to an input batch.
+        :param x: an [N x C x ...] Tensor of inputs.
+        :param timesteps: a 1-D batch of timesteps.
+        :param context: conditioning plugged in via crossattn
+        :param y: an [N] Tensor of labels, if class-conditional.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
         assert (y is not None) == (
+            self.num_classes is not None
         ), "must specify y if and only if the model is class-conditional"
         hs = []
         t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
         emb = self.time_embed(t_emb)
+        if self.num_classes is not None:
             assert y.shape[0] == x.shape[0]
             emb = emb + self.label_emb(y)
+        # h = x.type(self.dtype)
         h = x
+        for i, module in enumerate(self.input_blocks):
+            h = module(h, emb, context)
+            hs.append(h)
+        h = self.middle_block(h, emb, context)
+        for i, module in enumerate(self.output_blocks):
+            h = th.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, context)
+        h = h.type(x.dtype)
+        if self.predict_codebook_ids:
+            assert False, "not supported anymore. what the f*** are you doing?"
+        else:
+            return self.out(h)
+class UNetModel(nn.Module):
+    """
+    The full UNet model with attention and timestep embedding.
+    :param in_channels: channels in the input Tensor.
+    :param model_channels: base channel count for the model.
+    :param out_channels: channels in the output Tensor.
+    :param num_res_blocks: number of residual blocks per downsample.
+    :param attention_resolutions: a collection of downsample rates at which
+        attention will take place. May be a set, list, or tuple.
+        For example, if this contains 4, then at 4x downsampling, attention
+        will be used.
+    :param dropout: the dropout probability.
+    :param channel_mult: channel multiplier for each level of the UNet.
+    :param conv_resample: if True, use learned convolutions for upsampling and
+        downsampling.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param num_classes: if specified (as an int), then this model will be
+        class-conditional with `num_classes` classes.
+    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
+    :param num_heads: the number of attention heads in each attention layer.
+    :param num_heads_channels: if specified, ignore num_heads and instead use
+                               a fixed channel width per attention head.
+    :param num_heads_upsample: works with num_heads to set a different number
+                               of heads for upsampling. Deprecated.
+    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+    :param resblock_updown: use residual blocks for up/downsampling.
+    :param use_new_attention_order: use a different attention pattern for potentially
+                                    increased efficiency.
+    """
+    def __init__(
+        self,
+        in_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        conv_resample=True,
+        dims=2,
+        num_classes=None,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=-1,
+        num_head_channels=-1,
+        num_heads_upsample=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+        use_spatial_transformer=False,  # custom transformer support
+        transformer_depth=1,  # custom transformer support
+        context_dim=None,  # custom transformer support
+        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
+        legacy=True,
+        disable_self_attentions=None,
+        num_attention_blocks=None,
+        disable_middle_self_attn=False,
+        use_linear_in_transformer=False,
+        spatial_transformer_attn_type="softmax",
+        adm_in_channels=None,
+        use_fairscale_checkpoint=False,
+        offload_to_cpu=False,
+        transformer_depth_middle=None,
+    ):
+        super().__init__()
+        from omegaconf.listconfig import ListConfig
+        if use_spatial_transformer:
+            assert (
+                context_dim is not None
+            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
+        if context_dim is not None:
+            assert (
+                use_spatial_transformer
+            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
+            if type(context_dim) == ListConfig:
+                context_dim = list(context_dim)
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+        if num_heads == -1:
+            assert (
+                num_head_channels != -1
+            ), "Either num_heads or num_head_channels has to be set"
+        if num_head_channels == -1:
+            assert (
+                num_heads != -1
+            ), "Either num_heads or num_head_channels has to be set"
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        if isinstance(transformer_depth, int):
+            transformer_depth = len(channel_mult) * [transformer_depth]
+        elif isinstance(transformer_depth, ListConfig):
+            transformer_depth = list(transformer_depth)
+        transformer_depth_middle = default(
+            transformer_depth_middle, transformer_depth[-1]
+        )
+        if isinstance(num_res_blocks, int):
+            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+        else:
+            if len(num_res_blocks) != len(channel_mult):
+                raise ValueError(
+                    "provide num_res_blocks either as an int (globally constant) or "
+                    "as a list/tuple (per-level) with the same length as channel_mult"
+                )
+            self.num_res_blocks = num_res_blocks
+        # self.num_res_blocks = num_res_blocks
+        if disable_self_attentions is not None:
+            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
+            assert len(disable_self_attentions) == len(channel_mult)
+        if num_attention_blocks is not None:
+            assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(
+                map(
+                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i],
+                    range(len(num_attention_blocks)),
+                )
+            )
+            print(
+                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                f"attention will still not be set."
+            )  # todo: convert to warning
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+        if use_fp16:
+            print("WARNING: use_fp16 was dropped and has no effect anymore.")
+        # self.dtype = th.float16 if use_fp16 else th.float32
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.num_heads_upsample = num_heads_upsample
+        self.predict_codebook_ids = n_embed is not None
+        assert use_fairscale_checkpoint != use_checkpoint or not (
+            use_checkpoint or use_fairscale_checkpoint
+        )
+        self.use_fairscale_checkpoint = False
+        checkpoint_wrapper_fn = (
+            partial(checkpoint_wrapper, offload_to_cpu=offload_to_cpu)
+            if self.use_fairscale_checkpoint
+            else lambda x: x
+        )
+        time_embed_dim = model_channels * 4
+        self.time_embed = checkpoint_wrapper_fn(
+            nn.Sequential(
+                linear(model_channels, time_embed_dim),
+                nn.SiLU(),
+                linear(time_embed_dim, time_embed_dim),
+            )
+        )
+        if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "timestep":
+                self.label_emb = checkpoint_wrapper_fn(
+                    nn.Sequential(
+                        Timestep(model_channels),
+                        nn.Sequential(
+                            linear(model_channels, time_embed_dim),
+                            nn.SiLU(),
+                            linear(time_embed_dim, time_embed_dim),
+                        ),
+                    )
+                )
+            elif self.num_classes == "sequential":
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    )
+                )
+            else:
+                raise ValueError()
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
+                )
+            ]
+        )
+        self._feature_size = model_channels
+        input_block_chans = [model_channels]
+        ch = model_channels
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for nr in range(self.num_res_blocks[level]):
+                layers = [
+                    checkpoint_wrapper_fn(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=mult * model_channels,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                        )
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+                    if legacy:
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+                    if (
+                        not exists(num_attention_blocks)
+                        or nr < num_attention_blocks[level]
+                    ):
+                        layers.append(
+                            checkpoint_wrapper_fn(
+                                AttentionBlock(
+                                    ch,
+                                    use_checkpoint=use_checkpoint,
+                                    num_heads=num_heads,
+                                    num_head_channels=dim_head,
+                                    use_new_attention_order=use_new_attention_order,
+                                )
+                            )
+                            if not use_spatial_transformer
+                            else checkpoint_wrapper_fn(
+                                SpatialTransformer(
+                                    ch,
+                                    num_heads,
+                                    dim_head,
+                                    depth=transformer_depth[level],
+                                    context_dim=context_dim,
+                                    disable_self_attn=disabled_sa,
+                                    use_linear=use_linear_in_transformer,
+                                    attn_type=spatial_transformer_attn_type,
+                                    use_checkpoint=use_checkpoint,
+                                )
+                            )
+                        )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        checkpoint_wrapper_fn(
+                            ResBlock(
+                                ch,
+                                time_embed_dim,
+                                dropout,
+                                out_channels=out_ch,
+                                dims=dims,
+                                use_checkpoint=use_checkpoint,
+                                use_scale_shift_norm=use_scale_shift_norm,
+                                down=True,
+                            )
+                        )
+                        if resblock_updown
+                        else Downsample(
+                            ch, conv_resample, dims=dims, out_channels=out_ch
+                        )
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+                self._feature_size += ch
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+        if legacy:
+            # num_heads = 1
+            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+        self.middle_block = TimestepEmbedSequential(
+            checkpoint_wrapper_fn(
+                ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )
+            ),
+            checkpoint_wrapper_fn(
+                AttentionBlock(
+                    ch,
+                    use_checkpoint=use_checkpoint,
+                    num_heads=num_heads,
+                    num_head_channels=dim_head,
+                    use_new_attention_order=use_new_attention_order,
+                )
+            )
+            if not use_spatial_transformer
+            else checkpoint_wrapper_fn(
+                SpatialTransformer(  # always uses a self-attn
+                    ch,
+                    num_heads,
+                    dim_head,
+                    depth=transformer_depth_middle,
+                    context_dim=context_dim,
+                    disable_self_attn=disable_middle_self_attn,
+                    use_linear=use_linear_in_transformer,
+                    attn_type=spatial_transformer_attn_type,
+                    use_checkpoint=use_checkpoint,
+                )
+            ),
+            checkpoint_wrapper_fn(
+                ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )
+            ),
+        )
+        self._feature_size += ch
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(self.num_res_blocks[level] + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    checkpoint_wrapper_fn(
+                        ResBlock(
+                            ch + ich,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=model_channels * mult,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                        )
+                    )
+                ]
+                ch = model_channels * mult
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+                    if legacy:
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+                    if (
+                        not exists(num_attention_blocks)
+                        or i < num_attention_blocks[level]
+                    ):
+                        layers.append(
+                            checkpoint_wrapper_fn(
+                                AttentionBlock(
+                                    ch,
+                                    use_checkpoint=use_checkpoint,
+                                    num_heads=num_heads_upsample,
+                                    num_head_channels=dim_head,
+                                    use_new_attention_order=use_new_attention_order,
+                                )
+                            )
+                            if not use_spatial_transformer
+                            else checkpoint_wrapper_fn(
+                                SpatialTransformer(
+                                    ch,
+                                    num_heads,
+                                    dim_head,
+                                    depth=transformer_depth[level],
+                                    context_dim=context_dim,
+                                    disable_self_attn=disabled_sa,
+                                    use_linear=use_linear_in_transformer,
+                                    attn_type=spatial_transformer_attn_type,
+                                    use_checkpoint=use_checkpoint,
+                                )
+                            )
+                        )
+                if level and i == self.num_res_blocks[level]:
+                    out_ch = ch
+                    layers.append(
+                        checkpoint_wrapper_fn(
+                            ResBlock(
+                                ch,
+                                time_embed_dim,
+                                dropout,
+                                out_channels=out_ch,
+                                dims=dims,
+                                use_checkpoint=use_checkpoint,
+                                use_scale_shift_norm=use_scale_shift_norm,
+                                up=True,
+                            )
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+        self.out = checkpoint_wrapper_fn(
+            nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+            )
+        )
+        if self.predict_codebook_ids:
+            self.id_predictor = checkpoint_wrapper_fn(
+                nn.Sequential(
+                    normalization(ch),
+                    conv_nd(dims, model_channels, n_embed, 1),
+                    # nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
+                )
+            )
+    def convert_to_fp16(self):
+        """
+        Convert the torso of the model to float16.
+        """
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+        self.output_blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        """
+        Convert the torso of the model to float32.
+        """
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+        self.output_blocks.apply(convert_module_to_f32)
+    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
+        """
+        Apply the model to an input batch.
+        :param x: an [N x C x ...] Tensor of inputs.
+        :param timesteps: a 1-D batch of timesteps.
+        :param context: conditioning plugged in via crossattn
+        :param y: an [N] Tensor of labels, if class-conditional.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        assert (y is not None) == (
+            self.num_classes is not None
+        ), "must specify y if and only if the model is class-conditional"
+        hs = []
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        emb = self.time_embed(t_emb)
+        if self.num_classes is not None:
+            assert y.shape[0] == x.shape[0]
+            emb = emb + self.label_emb(y)
+        # h = x.type(self.dtype)
+        h = x
+        for i, module in enumerate(self.input_blocks):
+            h = module(h, emb, context)
+            hs.append(h)
+        h = self.middle_block(h, emb, context)
+        for i, module in enumerate(self.output_blocks):
+            h = th.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, context)
+        h = h.type(x.dtype)
+        if self.predict_codebook_ids:
+            assert False, "not supported anymore. what the f*** are you doing?"
+        else:
+            return self.out(h)
+import seaborn as sns
+import matplotlib.pyplot as plt
+class UNetAddModel(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        ctrl_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        attn_type="attn2",
+        attn_layers=[],
+        conv_resample=True,
+        dims=2,
+        num_classes=None,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=-1,
+        num_head_channels=-1,
+        num_heads_upsample=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+        use_spatial_transformer=False,  # custom transformer support
+        transformer_depth=1,  # custom transformer support
+        context_dim=None,  # custom transformer support
+        add_context_dim=None,
+        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
+        legacy=True,
+        disable_self_attentions=None,
+        num_attention_blocks=None,
+        disable_middle_self_attn=False,
+        use_linear_in_transformer=False,
+        spatial_transformer_attn_type="softmax",
+        adm_in_channels=None,
+        use_fairscale_checkpoint=False,
+        offload_to_cpu=False,
+        transformer_depth_middle=None,
+    ):
+        super().__init__()
+        from omegaconf.listconfig import ListConfig
+        if use_spatial_transformer:
+            assert (
+                context_dim is not None
+            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
+        if context_dim is not None:
+            assert (
+                use_spatial_transformer
+            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
+            if type(context_dim) == ListConfig:
+                context_dim = list(context_dim)
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+        if num_heads == -1:
+            assert (
+                num_head_channels != -1
+            ), "Either num_heads or num_head_channels has to be set"
+        if num_head_channels == -1:
+            assert (
+                num_heads != -1
+            ), "Either num_heads or num_head_channels has to be set"
+        self.in_channels = in_channels
+        self.ctrl_channels = ctrl_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        if isinstance(transformer_depth, int):
+            transformer_depth = len(channel_mult) * [transformer_depth]
+        elif isinstance(transformer_depth, ListConfig):
+            transformer_depth = list(transformer_depth)
+        transformer_depth_middle = default(
+            transformer_depth_middle, transformer_depth[-1]
+        )
+        if isinstance(num_res_blocks, int):
+            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+        else:
+            if len(num_res_blocks) != len(channel_mult):
+                raise ValueError(
+                    "provide num_res_blocks either as an int (globally constant) or "
+                    "as a list/tuple (per-level) with the same length as channel_mult"
+                )
+            self.num_res_blocks = num_res_blocks
+        # self.num_res_blocks = num_res_blocks
+        if disable_self_attentions is not None:
+            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
+            assert len(disable_self_attentions) == len(channel_mult)
+        if num_attention_blocks is not None:
+            assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(
+                map(
+                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i],
+                    range(len(num_attention_blocks)),
+                )
+            )
+            print(
+                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                f"attention will still not be set."
+            )  # todo: convert to warning
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+        if use_fp16:
+            print("WARNING: use_fp16 was dropped and has no effect anymore.")
+        # self.dtype = th.float16 if use_fp16 else th.float32
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        self.num_heads_upsample = num_heads_upsample
+        self.predict_codebook_ids = n_embed is not None
+        assert use_fairscale_checkpoint != use_checkpoint or not (
+            use_checkpoint or use_fairscale_checkpoint
+        )
+        self.use_fairscale_checkpoint = False
+        checkpoint_wrapper_fn = (
+            partial(checkpoint_wrapper, offload_to_cpu=offload_to_cpu)
+            if self.use_fairscale_checkpoint
+            else lambda x: x
+        )
+        time_embed_dim = model_channels * 4
+        self.time_embed = checkpoint_wrapper_fn(
+            nn.Sequential(
+                linear(model_channels, time_embed_dim),
+                nn.SiLU(),
+                linear(time_embed_dim, time_embed_dim),
+            )
+        )
+        if self.num_classes is not None:
+            if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
+                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "timestep":
+                self.label_emb = checkpoint_wrapper_fn(
+                    nn.Sequential(
+                        Timestep(model_channels),
+                        nn.Sequential(
+                            linear(model_channels, time_embed_dim),
+                            nn.SiLU(),
+                            linear(time_embed_dim, time_embed_dim),
+                        ),
+                    )
+                )
+            elif self.num_classes == "sequential":
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    )
+                )
+            else:
+                raise ValueError()
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
+                )
+            ]
+        )
+        if self.ctrl_channels > 0:
+            self.add_input_block = TimestepEmbedSequential(
+                conv_nd(dims, ctrl_channels, 16, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 16, 16, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 16, 32, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 32, 32, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 32, 96, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 96, 96, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 96, 256, 3, padding=1),
+                nn.SiLU(),
+                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
+            )
+        self._feature_size = model_channels
+        input_block_chans = [model_channels]
+        ch = model_channels
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for nr in range(self.num_res_blocks[level]):
+                layers = [
+                    checkpoint_wrapper_fn(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=mult * model_channels,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                        )
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+                    if legacy:
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+                    if (
+                        not exists(num_attention_blocks)
+                        or nr < num_attention_blocks[level]
+                    ):
+                        layers.append(
+                            checkpoint_wrapper_fn(
+                                AttentionBlock(
+                                    ch,
+                                    use_checkpoint=use_checkpoint,
+                                    num_heads=num_heads,
+                                    num_head_channels=dim_head,
+                                    use_new_attention_order=use_new_attention_order,
+                                )
+                            )
+                            if not use_spatial_transformer
+                            else checkpoint_wrapper_fn(
+                                SpatialTransformer(
+                                    ch,
+                                    num_heads,
+                                    dim_head,
+                                    depth=transformer_depth[level],
+                                    context_dim=context_dim,
+                                    add_context_dim=add_context_dim,
+                                    disable_self_attn=disabled_sa,
+                                    use_linear=use_linear_in_transformer,
+                                    attn_type=spatial_transformer_attn_type,
+                                    use_checkpoint=use_checkpoint,
+                                )
+                            )
+                        )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        checkpoint_wrapper_fn(
+                            ResBlock(
+                                ch,
+                                time_embed_dim,
+                                dropout,
+                                out_channels=out_ch,
+                                dims=dims,
+                                use_checkpoint=use_checkpoint,
+                                use_scale_shift_norm=use_scale_shift_norm,
+                                down=True,
+                            )
+                        )
+                        if resblock_updown
+                        else Downsample(
+                            ch, conv_resample, dims=dims, out_channels=out_ch
+                        )
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+                self._feature_size += ch
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+        if legacy:
+            # num_heads = 1
+            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+        self.middle_block = TimestepEmbedSequential(
+            checkpoint_wrapper_fn(
+                ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )
+            ),
+            checkpoint_wrapper_fn(
+                AttentionBlock(
+                    ch,
+                    use_checkpoint=use_checkpoint,
+                    num_heads=num_heads,
+                    num_head_channels=dim_head,
+                    use_new_attention_order=use_new_attention_order,
+                )
+            )
+            if not use_spatial_transformer
+            else checkpoint_wrapper_fn(
+                SpatialTransformer(  # always uses a self-attn
+                    ch,
+                    num_heads,
+                    dim_head,
+                    depth=transformer_depth_middle,
+                    context_dim=context_dim,
+                    add_context_dim=add_context_dim,
+                    disable_self_attn=disable_middle_self_attn,
+                    use_linear=use_linear_in_transformer,
+                    attn_type=spatial_transformer_attn_type,
+                    use_checkpoint=use_checkpoint,
+                )
+            ),
+            checkpoint_wrapper_fn(
+                ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )
+            ),
+        )
+        self._feature_size += ch
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(self.num_res_blocks[level] + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    checkpoint_wrapper_fn(
+                        ResBlock(
+                            ch + ich,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=model_channels * mult,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                        )
+                    )
+                ]
+                ch = model_channels * mult
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+                    if legacy:
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
+                    if exists(disable_self_attentions):
+                        disabled_sa = disable_self_attentions[level]
+                    else:
+                        disabled_sa = False
+                    if (
+                        not exists(num_attention_blocks)
+                        or i < num_attention_blocks[level]
+                    ):
+                        layers.append(
+                            checkpoint_wrapper_fn(
+                                AttentionBlock(
+                                    ch,
+                                    use_checkpoint=use_checkpoint,
+                                    num_heads=num_heads_upsample,
+                                    num_head_channels=dim_head,
+                                    use_new_attention_order=use_new_attention_order,
+                                )
+                            )
+                            if not use_spatial_transformer
+                            else checkpoint_wrapper_fn(
+                                SpatialTransformer(
+                                    ch,
+                                    num_heads,
+                                    dim_head,
+                                    depth=transformer_depth[level],
+                                    context_dim=context_dim,
+                                    add_context_dim=add_context_dim,
+                                    disable_self_attn=disabled_sa,
+                                    use_linear=use_linear_in_transformer,
+                                    attn_type=spatial_transformer_attn_type,
+                                    use_checkpoint=use_checkpoint,
+                                )
+                            )
+                        )
+                if level and i == self.num_res_blocks[level]:
+                    out_ch = ch
+                    layers.append(
+                        checkpoint_wrapper_fn(
+                            ResBlock(
+                                ch,
+                                time_embed_dim,
+                                dropout,
+                                out_channels=out_ch,
+                                dims=dims,
+                                use_checkpoint=use_checkpoint,
+                                use_scale_shift_norm=use_scale_shift_norm,
+                                up=True,
+                            )
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+        self.out = checkpoint_wrapper_fn(
+            nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+            )
+        )
+        if self.predict_codebook_ids:
+            self.id_predictor = checkpoint_wrapper_fn(
+                nn.Sequential(
+                    normalization(ch),
+                    conv_nd(dims, model_channels, n_embed, 1),
+                    # nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
+                )
+            )
+        # cache attn map
+        self.attn_type = attn_type
+        self.attn_layers = attn_layers
+        self.attn_map_cache = []
+        for name, module in self.named_modules():
+            if name.endswith(self.attn_type):
+                item = {"name": name, "heads": module.heads, "size": None, "attn_map": None}
+                self.attn_map_cache.append(item)
+                module.attn_map_cache = item
+    def clear_attn_map(self):
+        for item in self.attn_map_cache:
+            if item["attn_map"] is not None:
+                del item["attn_map"]
+                item["attn_map"] = None
+    def save_attn_map(self, save_name="temp", tokens=""):
+        attn_maps = []
+        for item in self.attn_map_cache:
+            name = item["name"]
+            if any([name.startswith(block) for block in self.attn_layers]):
+                heads = item["heads"]
+                attn_maps.append(item["attn_map"].detach().cpu())
+        attn_map = th.stack(attn_maps, dim=0)
+        attn_map = th.mean(attn_map, dim=0)
+        # attn_map: bh * n * l
+        bh, n, l = attn_map.shape # bh: batch size * heads / n : pixel length(h*w) / l: token length
+        attn_map = attn_map.reshape((-1,heads,n,l)).mean(dim=1)
+        b = attn_map.shape[0]
+        h = w = int(n**0.5)
+        attn_map = attn_map.permute(0,2,1).reshape((b,l,h,w)).numpy()
+        attn_map_i = attn_map[-1]
+        l = attn_map_i.shape[0]
+        fig = plt.figure(figsize=(12, 8), dpi=300)
+        for j in range(12):
+            if j >= l: break
+            ax = fig.add_subplot(3, 4, j+1)
+            sns.heatmap(attn_map_i[j], square=True, xticklabels=False, yticklabels=False)
+            if j < len(tokens):
+                ax.set_title(tokens[j])
+        fig.savefig(f"temp/attn_map/attn_map_{save_name}.png")
+        plt.close()
+        return attn_map_i
+    def forward(self, x, timesteps=None, context=None, add_context=None, y=None, **kwargs):
+        """
+        Apply the model to an input batch.
+        :param x: an [N x C x ...] Tensor of inputs. When ``self.ctrl_channels > 0``
+            it is split along dim 1 into ``self.in_channels`` model channels
+            followed by ``self.ctrl_channels`` control channels.
+        :param timesteps: a 1-D batch of timesteps.
+        :param context: conditioning plugged in via crossattn
+        :param add_context: additional conditioning, passed to every block next
+            to ``context``.
+        :param y: an [N] Tensor of labels, if class-conditional.
+        :param kwargs: accepted but not used by this method.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        assert (y is not None) == (
+            self.num_classes is not None
+        ), "must specify y if and only if the model is class-conditional"
+        # Drop attention maps cached by a previous call before running again.
+        self.clear_attn_map()
+        # Outputs of the input blocks, consumed in reverse by the output blocks.
+        hs = []
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        emb = self.time_embed(t_emb)
+        if self.num_classes is not None:
+            assert y.shape[0] == x.shape[0]
+            # The label embedding is added to the timestep embedding.
+            emb = emb + self.label_emb(y)
+        # h = x.type(self.dtype)
+        h = x
+        if self.ctrl_channels > 0:
+            # Separate the model input from the extra control channels.
+            in_h, add_h = th.split(h, [self.in_channels, self.ctrl_channels], dim=1)
         for i, module in enumerate(self.input_blocks):
             if self.ctrl_channels > 0 and i == 0:
+                # First block only: sum the regular input path and the control path.
+                h = module(in_h, emb, context, add_context) + self.add_input_block(add_h, emb, context, add_context)
             else:
+                h = module(h, emb, context, add_context)
             hs.append(h)
+        h = self.middle_block(h, emb, context, add_context)
         for i, module in enumerate(self.output_blocks):
             h = th.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, context, add_context)
         h = h.type(x.dtype)
         return self.out(h)

sgm/modules/diffusionmodules/sampling.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import Dict, Union
 import imageio
 import torch
 import numpy as np
 import torch.nn.functional as F
 from omegaconf import ListConfig, OmegaConf
@@ -251,15 +252,47 @@ class EulerEDMSampler(EDMSampler):
         return x
-    def save_segment_map(self, attn_maps, tokens=None, save_name=None):
         sections = []
         for i in range(len(tokens)):
             attn_map = attn_maps[i]
             sections.append(attn_map)
         section = np.stack(sections)
-        np.save(f"./temp/seg_map/seg_{save_name}.npy", section)
     def get_init_noise(self, cfgs, model, cond, batch, uc=None):
@@ -343,7 +376,8 @@ class EulerEDMSampler(EDMSampler):
             local_loss = torch.zeros(1)
         if save_attn:
             attn_map = model.model.diffusion_model.save_attn_map(save_name=name, tokens=batch["label"][0])
-            self.save_segment_map(attn_map, tokens=batch["label"][0], save_name=name)
         d = to_d(x, sigma_hat, denoised)
         dt = append_dims(next_sigma - sigma_hat, x.ndim)
@@ -376,7 +410,7 @@ class EulerEDMSampler(EDMSampler):
             alpha = 20 * np.sqrt(scales[i])
             update = aae_enabled
-            save_loss = aae_enabled
             save_attn = detailed and (i == (num_sigmas-1)//2)
             save_inter = aae_enabled
@@ -412,12 +446,195 @@ class EulerEDMSampler(EDMSampler):
                 inter = inter.cpu().numpy().transpose(1, 2, 0) * 255
                 inters.append(inter.astype(np.uint8))
-        # print(f"Local losses: {local_losses}")
         if len(inters) > 0:
             imageio.mimsave(f"./temp/inters/{name}.gif", inters, 'GIF', duration=0.02)
         return x
 class HeunEDMSampler(EDMSampler):

 import imageio
 import torch
+import json
 import numpy as np
 import torch.nn.functional as F
 from omegaconf import ListConfig, OmegaConf
         return x
+    def create_pascal_label_colormap(self):
+        """
+        PASCAL VOC 分割数据集的类别标签颜色映射label colormap
+        返回:
+            可视化分割结果的颜色映射Colormap
+        """
+        colormap = np.zeros((256, 3), dtype=int)
+        ind = np.arange(256, dtype=int)
+        for shift in reversed(range(8)):
+            for channel in range(3):
+                colormap[:, channel] |= ((ind >> channel) & 1) << shift
+            ind >>= 3
+        return colormap
+    def save_segment_map(self, image, attn_maps, tokens=None, save_name=None):
+        colormap = self.create_pascal_label_colormap()
+        H, W = image.shape[-2:]
+        image_ = image*0.3
         sections = []
         for i in range(len(tokens)):
             attn_map = attn_maps[i]
+            attn_map_t = np.tile(attn_map[None], (1,3,1,1)) # b, 3, h, w
+            attn_map_t = torch.from_numpy(attn_map_t)
+            attn_map_t = F.interpolate(attn_map_t, (W, H))
+            color = torch.from_numpy(colormap[i+1][None,:,None,None] / 255.0)
+            colored_attn_map = attn_map_t * color
+            colored_attn_map = colored_attn_map.to(device=image_.device)
+            image_ += colored_attn_map*0.7
             sections.append(attn_map)
         section = np.stack(sections)
+        np.save(f"temp/seg_map/seg_{save_name}.npy", section)
+        save_image(image_, f"temp/seg_map/seg_{save_name}.png", normalize=True)
     def get_init_noise(self, cfgs, model, cond, batch, uc=None):
             local_loss = torch.zeros(1)
         if save_attn:
             attn_map = model.model.diffusion_model.save_attn_map(save_name=name, tokens=batch["label"][0])
+            denoised_decode = model.decode_first_stage(denoised) if denoised_decode is None else denoised_decode
+            self.save_segment_map(denoised_decode, attn_map, tokens=batch["label"][0], save_name=name)
         d = to_d(x, sigma_hat, denoised)
         dt = append_dims(next_sigma - sigma_hat, x.ndim)
             alpha = 20 * np.sqrt(scales[i])
             update = aae_enabled
+            save_loss = detailed
             save_attn = detailed and (i == (num_sigmas-1)//2)
             save_inter = aae_enabled
                 inter = inter.cpu().numpy().transpose(1, 2, 0) * 255
                 inters.append(inter.astype(np.uint8))
+        print(f"Local losses: {local_losses}")
         if len(inters) > 0:
             imageio.mimsave(f"./temp/inters/{name}.gif", inters, 'GIF', duration=0.02)
         return x
+class EulerEDMDualSampler(EulerEDMSampler):
+    """Euler EDM sampler variant that carries two unconditional conditionings
+    (``uc_1`` and ``uc_2``) next to ``cond`` through every step."""
+    def prepare_sampling_loop(self, x, cond, uc_1=None, uc_2=None, num_steps=None):
+        """
+        Build the sigma schedule and scale the initial latent.
+
+        ``x`` is multiplied in place by ``sqrt(1 + sigmas[0] ** 2)``. ``uc_1``
+        and ``uc_2`` go through ``default(..., cond)`` (presumably falling back
+        to ``cond`` when not given; confirm against ``default``).
+
+        :return: tuple ``(x, s_in, sigmas, num_sigmas, cond, uc_1, uc_2)`` where
+            ``s_in`` is a vector of ones with one entry per sample in ``x``.
+        """
+        sigmas = self.discretization(
+            self.num_steps if num_steps is None else num_steps, device=self.device
+        )
+        uc_1 = default(uc_1, cond)
+        uc_2 = default(uc_2, cond)
+        x *= torch.sqrt(1.0 + sigmas[0] ** 2.0)
+        num_sigmas = len(sigmas)
+        s_in = x.new_ones([x.shape[0]])
+        return x, s_in, sigmas, num_sigmas, cond, uc_1, uc_2
+    def denoise(self, x, model, sigma, cond, uc_1, uc_2):
+        """Run the denoiser on the guider-prepared inputs and pass the result
+        through the guider."""
+        denoised = model.denoiser(model.model, *self.guider.prepare_inputs(x, sigma, cond, uc_1, uc_2))
+        denoised = self.guider(denoised, sigma)
+        return denoised
+    def get_init_noise(self, cfgs, model, cond, batch, uc_1=None, uc_2=None):
+        """
+        Pick an initial noise by trying ``cfgs.noise_iters`` random candidates.
+
+        Each candidate is run through a short sampling loop (``num_steps=2``)
+        with ``save_loss=True``; the candidate whose last recorded local loss is
+        smallest is returned. With ``cfgs.noise_iters == 0`` a fresh random
+        noise is returned without any evaluation.
+
+        :return: noise tensor of shape ``(cfgs.batch_size, cfgs.channel,
+            H // cfgs.factor, W // cfgs.factor)`` on CUDA device ``cfgs.gpu``.
+        """
+        H, W = batch["target_size_as_tuple"][0]
+        shape = (cfgs.batch_size, cfgs.channel, int(H) // cfgs.factor, int(W) // cfgs.factor)
+        randn = torch.randn(shape).to(torch.device("cuda", index=cfgs.gpu))
+        # Work on a copy: prepare_sampling_loop scales x in place.
+        x = randn.clone()
+        xs = []
+        # NOTE(review): verbose is forced to True afterwards, not restored to
+        # its previous value.
+        self.verbose = False
+        for _ in range(cfgs.noise_iters):
+            x, s_in, sigmas, num_sigmas, cond, uc_1, uc_2 = self.prepare_sampling_loop(
+                x, cond, uc_1, uc_2, num_steps=2
+            )
+            # Only the supervision entries read by sampler_step are forwarded.
+            superv = {
+                "mask": batch["mask"] if "mask" in batch else None,
+                "seg_mask": batch["seg_mask"] if "seg_mask" in batch else None
+            }
+            local_losses = []
+            for i in self.get_sigma_gen(num_sigmas):
+                gamma = (
+                    min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
+                    if self.s_tmin <= sigmas[i] <= self.s_tmax
+                    else 0.0
+                )
+                x, inter, local_loss = self.sampler_step(
+                    s_in * sigmas[i],
+                    s_in * sigmas[i + 1],
+                    model,
+                    x,
+                    cond,
+                    superv,
+                    uc_1,
+                    uc_2,
+                    gamma,
+                    save_loss=True
+                )
+                local_losses.append(local_loss.item())
+            # Keep the unscaled noise together with the loss of the last step.
+            xs.append((randn, local_losses[-1]))
+            # Draw the next candidate (one extra draw happens after the last
+            # iteration as well).
+            randn = torch.randn(shape).to(torch.device("cuda", index=cfgs.gpu))
+            x = randn.clone()
+        self.verbose = True
+        # Ascending by loss: the best candidate ends up first.
+        xs.sort(key = lambda x: x[-1])
+        if len(xs) > 0:
+            print(f"Init local loss: Best {xs[0][1]} Worst {xs[-1][1]}")
+            x = xs[0][0]
+        return x
+    def sampler_step(self, sigma, next_sigma, model, x, cond, batch=None, uc_1=None, uc_2=None,
+                     gamma=0.0, alpha=0, iter_enabled=False, thres=None, update=False,
+                     name=None, save_loss=False, save_attn=False, save_inter=False):
+        """
+        Perform one Euler step from ``sigma`` to ``next_sigma``.
+
+        :param batch: mapping read for ``"mask"`` and ``"seg_mask"`` when
+            ``save_loss`` is set and for ``"label"`` when ``save_attn`` is set.
+        :param update: if true, ``x`` is first refined by ``attend_and_excite``.
+        :param save_loss: compute the local loss from the cached attention maps.
+        :param save_attn: save the attention map and the segment-map overlay.
+        :param save_inter: decode the denoised latent and return it.
+        :return: tuple ``(euler_step, denoised_decode, local_loss)``;
+            ``denoised_decode`` is ``None`` unless it was computed and
+            ``local_loss`` is ``torch.zeros(1)`` when ``save_loss`` is false.
+        """
+        sigma_hat = sigma * (gamma + 1.0)
+        if gamma > 0:
+            # Add noise to raise the noise level of x from sigma to sigma_hat.
+            eps = torch.randn_like(x) * self.s_noise
+            x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5
+        if update:
+            x = self.attend_and_excite(x, model, sigma_hat, cond, batch, alpha, iter_enabled, thres)
+        denoised = self.denoise(x, model, sigma_hat, cond, uc_1, uc_2)
+        denoised_decode = model.decode_first_stage(denoised) if save_inter else None
+        if save_loss:
+            local_loss = model.loss_fn.get_min_local_loss(model.model.diffusion_model.attn_map_cache, batch["mask"], batch["seg_mask"])
+            # Keeps the trailing third of the entries (when the count is a
+            # multiple of 3); presumably the part that belongs to `cond` in the
+            # tripled batch -- TODO confirm against guider.prepare_inputs.
+            local_loss = local_loss[-local_loss.shape[0]//3:]
+        else:
+            local_loss = torch.zeros(1)
+        if save_attn:
+            # NOTE(review): requires a diffusion model whose save_attn_map
+            # accepts a `save_single` keyword -- verify against the model class.
+            attn_map = model.model.diffusion_model.save_attn_map(save_name=name, save_single=True)
+            # Reuse the decoded image if save_inter already produced it.
+            denoised_decode = model.decode_first_stage(denoised) if denoised_decode is None else denoised_decode
+            self.save_segment_map(denoised_decode, attn_map, tokens=batch["label"][0], save_name=name)
+        d = to_d(x, sigma_hat, denoised)
+        dt = append_dims(next_sigma - sigma_hat, x.ndim)
+        euler_step = self.euler_step(x, d, dt)
+        return euler_step, denoised_decode, local_loss
+    def __call__(self, model, x, cond, batch=None, uc_1=None, uc_2=None, num_steps=None, init_step=0,
+                 name=None, aae_enabled=False, detailed=False):
+        """
+        Run the full sampling loop and return the final latent.
+
+        :param batch: mapping that must provide ``"name"``; it is also passed
+            on to ``sampler_step``.
+        :param init_step: forwarded to ``get_sigma_gen`` as ``init_step``.
+        :param name: overwritten by ``batch["name"][0]``; the passed value is
+            not used.
+        :param aae_enabled: turns on the attend-and-excite update, the local
+            loss and the decoding of intermediates at every step.
+        :param detailed: save the attention map at the middle step.
+        :return: the latent after the last step. When intermediates were
+            decoded they are also written to ``./temp/inters/{name}.gif``.
+        """
+        x, s_in, sigmas, num_sigmas, cond, uc_1, uc_2 = self.prepare_sampling_loop(
+            x, cond, uc_1, uc_2, num_steps
+        )
+        # NOTE(review): fails when batch is None although that is the default.
+        name = batch["name"][0]
+        inters = []
+        local_losses = []
+        # Linear decay from 1 to 0 across the steps; drives alpha below.
+        scales = np.linspace(start=1.0, stop=0, num=num_sigmas)
+        # Steps 5, 9, ..., 25 enable the iterative refinement, each with its
+        # own threshold taken from thres_lst.
+        iter_lst = np.linspace(start=5, stop=25, num=6, dtype=np.int32)
+        thres_lst = np.linspace(start=-0.5, stop=-0.8, num=6)
+        for i in self.get_sigma_gen(num_sigmas, init_step=init_step):
+            gamma = (
+                min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
+                if self.s_tmin <= sigmas[i] <= self.s_tmax
+                else 0.0
+            )
+            alpha = 20 * np.sqrt(scales[i])
+            update = aae_enabled
+            save_loss = aae_enabled
+            # Attention maps are saved once, at the middle step.
+            save_attn = detailed and (i == (num_sigmas-1)//2)
+            save_inter = aae_enabled
+            if i in iter_lst:
+                iter_enabled = True
+                thres = thres_lst[list(iter_lst).index(i)]
+            else:
+                iter_enabled = False
+                thres = 0.0
+            x, inter, local_loss = self.sampler_step(
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                model,
+                x,
+                cond,
+                batch,
+                uc_1,
+                uc_2,
+                gamma,
+                alpha=alpha,
+                iter_enabled=iter_enabled,
+                thres=thres,
+                update=update,
+                name=name,
+                save_loss=save_loss,
+                save_attn=save_attn,
+                save_inter=save_inter
+            )
+            local_losses.append(local_loss.item())
+            if inter is not None:
+                # Map the first decoded sample from [-1, 1] to a uint8 HWC frame.
+                inter = torch.clamp((inter + 1.0) / 2.0, min=0.0, max=1.0)[0]
+                inter = inter.cpu().numpy().transpose(1, 2, 0) * 255
+                inters.append(inter.astype(np.uint8))
+        print(f"Local losses: {local_losses}")
+        if len(inters) > 0:
+            imageio.mimsave(f"./temp/inters/{name}.gif", inters, 'GIF', duration=0.1)
+        return x
 class HeunEDMSampler(EDMSampler):

sgm/modules/diffusionmodules/sampling_utils.py CHANGED Viewed

@@ -7,7 +7,10 @@ from ...util import append_dims
 class NoDynamicThresholding:
     def __call__(self, uncond, cond, scale):
         return uncond + scale * (cond - uncond)
 def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
     if order - 1 > i:

 class NoDynamicThresholding:
+    """Combine an unconditional and a conditional value with a guidance scale."""
     def __call__(self, uncond, cond, scale):
+        # Start from uncond and move `scale` times along (cond - uncond).
         return uncond + scale * (cond - uncond)
+class DualThresholding: # Dual condition CFG (from instructPix2Pix)
+    def __call__(self, uncond_1, uncond_2, cond, scale):
+        return uncond_1 + scale[0] * (uncond_2 - uncond_1) + scale[1] * (cond - uncond_2)
 def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
     if order - 1 > i:

sgm/modules/diffusionmodules/wrappers.py CHANGED Viewed

@@ -28,8 +28,8 @@ class OpenAIWrapper(IdentityWrapper):
         return self.diffusion_model(
             x,
             timesteps=t,
-            t_context=c.get("t_crossattn", None),
-            v_context=c.get("v_crossattn", None),
             y=c.get("vector", None),
             **kwargs
         )

         return self.diffusion_model(
             x,
             timesteps=t,
+            context=c.get("crossattn", None),
+            add_context=c.get("add_crossattn", None),
             y=c.get("vector", None),
             **kwargs
         )

sgm/modules/encoders/modules.py CHANGED Viewed

@@ -14,7 +14,6 @@ from transformers import (
     ByT5Tokenizer,
     CLIPTextModel,
     CLIPTokenizer,
-    CLIPVisionModel,
     T5EncoderModel,
     T5Tokenizer,
 )
@@ -39,19 +38,18 @@ import pytorch_lightning as pl
 from torchvision import transforms
 from timm.models.vision_transformer import VisionTransformer
 from safetensors.torch import load_file as load_safetensors
-from torchvision.utils import save_image
 # disable warning
 from transformers import logging
 logging.set_verbosity_error()
 class AbstractEmbModel(nn.Module):
-    def __init__(self):
         super().__init__()
         self._is_trainable = None
         self._ucg_rate = None
         self._input_key = None
-        self._emb_key = None
     @property
     def is_trainable(self) -> bool:
@@ -65,10 +63,6 @@ class AbstractEmbModel(nn.Module):
     def input_key(self) -> str:
         return self._input_key
-    @property
-    def emb_key(self) -> str:
-        return self._emb_key
     @is_trainable.setter
     def is_trainable(self, value: bool):
         self._is_trainable = value
@@ -81,10 +75,6 @@ class AbstractEmbModel(nn.Module):
     def input_key(self, value: str):
         self._input_key = value
-    @emb_key.setter
-    def emb_key(self, value: str):
-        self._emb_key = value
     @is_trainable.deleter
     def is_trainable(self):
         del self._is_trainable
@@ -97,13 +87,8 @@ class AbstractEmbModel(nn.Module):
     def input_key(self):
         del self._input_key
-    @emb_key.deleter
-    def emb_key(self):
-        del self._emb_key
 class GeneralConditioner(nn.Module):
     OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
     KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}
@@ -124,8 +109,7 @@ class GeneralConditioner(nn.Module):
                 f"Initialized embedder #{n}: {embedder.__class__.__name__} "
                 f"with {count_params(embedder, False)} params. Trainable: {embedder.is_trainable}"
             )
-            if "emb_key" in embconfig:
-                embedder.emb_key = embconfig["emb_key"]
             if "input_key" in embconfig:
                 embedder.input_key = embconfig["input_key"]
             elif "input_keys" in embconfig:
@@ -172,10 +156,13 @@ class GeneralConditioner(nn.Module):
             if not isinstance(emb_out, (list, tuple)):
                 emb_out = [emb_out]
             for emb in emb_out:
-                if embedder.emb_key is not None:
-                    out_key = embedder.emb_key
                 else:
                     out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
                 if embedder.ucg_rate > 0.0 and embedder.legacy_ucg_val is None:
                     emb = (
                         expand_dims_like(
@@ -217,6 +204,28 @@ class GeneralConditioner(nn.Module):
         return c, uc
 class InceptionV3(nn.Module):
     """Wrapper around the https://github.com/mseitzer/pytorch-fid inception
     port with an additional squeeze at the end"""
@@ -400,6 +409,7 @@ class FrozenCLIPEmbedder(AbstractEmbModel):
     def freeze(self):
         self.transformer = self.transformer.eval()
         for param in self.parameters():
             param.requires_grad = False
@@ -684,24 +694,24 @@ class FrozenOpenCLIPImageEmbedder(AbstractEmbModel):
         if self.output_tokens:
             z, tokens = z[0], z[1]
         z = z.to(image.dtype)
-        # if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
-        #     z = (
-        #         torch.bernoulli(
-        #             (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
-        #         )[:, None]
-        #         * z
-        #     )
-        #     if tokens is not None:
-        #         tokens = (
-        #             expand_dims_like(
-        #                 torch.bernoulli(
-        #                     (1.0 - self.ucg_rate)
-        #                     * torch.ones(tokens.shape[0], device=tokens.device)
-        #                 ),
-        #                 tokens,
-        #             )
-        #             * tokens
-        #         )
         if self.unsqueeze_dim:
             z = z[:, None, :]
         if self.output_tokens:
@@ -797,7 +807,7 @@ class FrozenCLIPT5Encoder(AbstractEmbModel):
         return [clip_z, t5_z]
-class SpatialRescaler(AbstractEmbModel):
     def __init__(
         self,
         n_stages=1,
@@ -836,9 +846,6 @@ class SpatialRescaler(AbstractEmbModel):
                 padding=kernel_size // 2,
             )
         self.wrap_video = wrap_video
-    def freeze(self):
-        pass
     def forward(self, x):
         if self.wrap_video and x.ndim == 5:

     ByT5Tokenizer,
     CLIPTextModel,
     CLIPTokenizer,
     T5EncoderModel,
     T5Tokenizer,
 )
 from torchvision import transforms
 from timm.models.vision_transformer import VisionTransformer
 from safetensors.torch import load_file as load_safetensors
 # disable warning
 from transformers import logging
 logging.set_verbosity_error()
 class AbstractEmbModel(nn.Module):
+    def __init__(self, is_add_embedder=False):
         super().__init__()
         self._is_trainable = None
         self._ucg_rate = None
         self._input_key = None
+        self.is_add_embedder = is_add_embedder
     @property
     def is_trainable(self) -> bool:
     def input_key(self) -> str:
         return self._input_key
     @is_trainable.setter
     def is_trainable(self, value: bool):
         self._is_trainable = value
     def input_key(self, value: str):
         self._input_key = value
     @is_trainable.deleter
     def is_trainable(self):
         del self._is_trainable
     def input_key(self):
         del self._input_key
 class GeneralConditioner(nn.Module):
     OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
     KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}
                 f"Initialized embedder #{n}: {embedder.__class__.__name__} "
                 f"with {count_params(embedder, False)} params. Trainable: {embedder.is_trainable}"
             )
             if "input_key" in embconfig:
                 embedder.input_key = embconfig["input_key"]
             elif "input_keys" in embconfig:
             if not isinstance(emb_out, (list, tuple)):
                 emb_out = [emb_out]
             for emb in emb_out:
+                if embedder.is_add_embedder:
+                    out_key = "add_crossattn"
                 else:
                     out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
+                if embedder.input_key == "mask":
+                    H, W = batch["image"].shape[-2:]
+                    emb = nn.functional.interpolate(emb, (H//8, W//8))
                 if embedder.ucg_rate > 0.0 and embedder.legacy_ucg_val is None:
                     emb = (
                         expand_dims_like(
         return c, uc
+class DualConditioner(GeneralConditioner):
+    def get_unconditional_conditioning(
+        self, batch_c, batch_uc_1=None, batch_uc_2=None, force_uc_zero_embeddings=None
+    ):
+        if force_uc_zero_embeddings is None:
+            force_uc_zero_embeddings = []
+        ucg_rates = list()
+        for embedder in self.embedders:
+            ucg_rates.append(embedder.ucg_rate)
+            embedder.ucg_rate = 0.0
+        c = self(batch_c)
+        uc_1 = self(batch_uc_1, force_uc_zero_embeddings) if batch_uc_1 is not None else None
+        uc_2 = self(batch_uc_2, force_uc_zero_embeddings[:1]) if batch_uc_2 is not None else None
+        for embedder, rate in zip(self.embedders, ucg_rates):
+            embedder.ucg_rate = rate
+        return c, uc_1, uc_2
 class InceptionV3(nn.Module):
     """Wrapper around the https://github.com/mseitzer/pytorch-fid inception
     port with an additional squeeze at the end"""
     def freeze(self):
         self.transformer = self.transformer.eval()
         for param in self.parameters():
             param.requires_grad = False
         if self.output_tokens:
             z, tokens = z[0], z[1]
         z = z.to(image.dtype)
+        if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
+            z = (
+                torch.bernoulli(
+                    (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
+                )[:, None]
+                * z
+            )
+            if tokens is not None:
+                tokens = (
+                    expand_dims_like(
+                        torch.bernoulli(
+                            (1.0 - self.ucg_rate)
+                            * torch.ones(tokens.shape[0], device=tokens.device)
+                        ),
+                        tokens,
+                    )
+                    * tokens
+                )
         if self.unsqueeze_dim:
             z = z[:, None, :]
         if self.output_tokens:
         return [clip_z, t5_z]
+class SpatialRescaler(nn.Module):
     def __init__(
         self,
         n_stages=1,
                 padding=kernel_size // 2,
             )
         self.wrap_video = wrap_video
     def forward(self, x):
         if self.wrap_video and x.ndim == 5:

util.py CHANGED Viewed

@@ -65,14 +65,6 @@ def prepare_batch(cfgs, batch):
         if isinstance(batch[key], torch.Tensor):
             batch[key] = batch[key].to(torch.device("cuda", index=cfgs.gpu))
-    batch_uc = deep_copy(batch)
-    if "ntxt" in batch:
-        batch_uc["txt"] = batch["ntxt"]
-    else:
-        batch_uc["txt"] = ["" for _ in range(len(batch["txt"]))]
-    if "label" in batch:
-        batch_uc["label"] = ["" for _ in range(len(batch["label"]))]
     return batch, batch_uc

         if isinstance(batch[key], torch.Tensor):
             batch[key] = batch[key].to(torch.device("cuda", index=cfgs.gpu))
+    batch_uc = batch
     return batch, batch_uc