tolgacangoz committed
Upload matryoshka.py

scheduler/matryoshka.py CHANGED (+21 -21)
@@ -1517,7 +1517,7 @@ class MatryoshkaTransformerBlock(nn.Module):
                 # **cross_attention_kwargs,
             )

-            attn_output_cond = attn_output_cond.permute(0, 2, 1).contiguous()
+            # attn_output_cond = attn_output_cond.permute(0, 2, 1).contiguous()
             attn_output_cond = self.proj_out(attn_output_cond)
             attn_output_cond = attn_output_cond.permute(0, 2, 1).reshape(batch_size, channels, *spatial_dims)
             hidden_states = hidden_states + attn_output_cond
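For reference, the lines kept around proj_out perform a channels-last to channels-first round trip: the attention output is projected as a (batch, seq_len, channels) token sequence, then permuted and reshaped back into a (batch, channels, *spatial_dims) feature map. Below is a minimal standalone sketch of that shape bookkeeping only, with made-up sizes and a plain nn.Linear standing in for self.proj_out (an assumption; the real module may differ). It is not the pipeline's actual code.

import torch
import torch.nn as nn

# Hypothetical sizes; only the shape bookkeeping is of interest here.
batch_size, channels, height, width = 2, 32, 8, 8
spatial_dims = (height, width)

# Attention output as a token sequence: (batch, seq_len, channels).
attn_output_cond = torch.randn(batch_size, height * width, channels)

proj_out = nn.Linear(channels, channels)  # stand-in for self.proj_out

# The linear projection acts on the trailing channel dimension.
attn_output_cond = proj_out(attn_output_cond)  # (batch, seq_len, channels)

# Back to a channels-first feature map, as in the hunk above.
attn_output_cond = attn_output_cond.permute(0, 2, 1).reshape(batch_size, channels, *spatial_dims)
print(attn_output_cond.shape)  # torch.Size([2, 32, 8, 8])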
@@ -1635,11 +1635,30 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
         # key = key.permute(0, 2, 1)
         # value = value.permute(0, 2, 1)

+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        if self_attention_output is None:
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
         if attn.norm_q is not None:
             query = attn.norm_q(query)
         if attn.norm_k is not None:
             key = attn.norm_k(key)

+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1 if F.scaled_dot_product_attention() is available
         # hidden_states = self.attention(
@@ -1649,31 +1668,12 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
         # mask=attention_mask,
         # num_heads=attn.heads,
         # )
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-        #query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        query = query.reshape(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        key = key.reshape(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.reshape(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        hidden_states = F.scaled_dot_product_attention(
-            query,
-            key,
-            value,
-            attn_mask=attention_mask,
-            dropout_p=attn.dropout,
-        )

         hidden_states = hidden_states.to(query.dtype)
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, height * width, channel)

         if self_attention_output is not None:
             hidden_states = hidden_states + self_attention_output
-
-        if not attn.pre_only:
-            # linear proj
-            hidden_states = attn.to_out[0](hidden_states)
-            # dropout
-            hidden_states = attn.to_out[1](hidden_states)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)

         if attn.residual_connection:
             hidden_states = hidden_states + residual
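For reference, the reworked processor path follows the usual PyTorch 2.x multi-head pattern: query/key/value are reshaped into per-head tensors, passed through F.scaled_dot_product_attention, and the heads are folded back afterwards, which is what the added transpose(1, 2).reshape(...) line in the hunk above does. A minimal sketch of that reshaping with hypothetical sizes follows (assumes torch >= 2.0; not the module's actual code).

import torch
import torch.nn.functional as F

# Hypothetical sizes; only the head reshaping around SDPA is illustrated.
batch_size, seq_len, heads, head_dim = 2, 64, 4, 32
inner_dim = heads * head_dim

query = torch.randn(batch_size, seq_len, inner_dim)
key = torch.randn(batch_size, seq_len, inner_dim)
value = torch.randn(batch_size, seq_len, inner_dim)

# (batch, seq_len, inner_dim) -> (batch, heads, seq_len, head_dim)
query = query.view(batch_size, -1, heads, head_dim).transpose(1, 2)
key = key.view(batch_size, -1, heads, head_dim).transpose(1, 2)
value = value.view(batch_size, -1, heads, head_dim).transpose(1, 2)

# Output of SDPA is (batch, heads, seq_len, head_dim).
hidden_states = F.scaled_dot_product_attention(
    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
)

# Fold the heads back: (batch, seq_len, heads * head_dim).
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, heads * head_dim)
print(hidden_states.shape)  # torch.Size([2, 64, 128])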