Dionyssos committed
Commit e83a997 · 1 Parent(s): a0ce150

DEBUG: streaming past key changes 47x during next-token calc

Files changed (3):
  1. audiocraft/lm.py +1 -1
  2. audiocraft/transformer.py +151 -127
  3. demo.py +2 -2
audiocraft/lm.py CHANGED
@@ -254,7 +254,7 @@ class LMModel(nn.Module):
         # so only 2 of the 4 self.linears() are used ?
         # Why is torch.stack on dim=1?
         logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1)  # [B, K, S, card]
-        print(f'{input_.shape=} {out.shape=} {cross_attention_input.shape=} {logits.shape=} FUSER LLM')
+        # print(f'{input_.shape=} {out.shape=} {cross_attention_input.shape=} {logits.shape=} FUSER LLM')
         # remove the prefix from the model outputs
         # if len(self.fuser.fuse2cond['prepend']) > 0:
         #     logits = logits[:, :, -S:]
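
Note on the comment above asking why torch.stack uses dim=1: a minimal sketch (the sizes below are illustrative assumptions, not the model config). Each of the K per-codebook linear heads maps the transformer output [B, S, D] to [B, S, card]; stacking those K tensors on dim=1 inserts the codebook axis right after the batch axis, which yields the [B, K, S, card] layout noted in the code.

    import torch
    import torch.nn as nn

    # Illustrative sizes only (assumptions, not the AudioGen config).
    B, S, D, card, K = 2, 5, 16, 1024, 4
    out = torch.randn(B, S, D)                                    # transformer output [B, S, D]
    linears = nn.ModuleList(nn.Linear(D, card) for _ in range(K))

    # Each linears[k](out) is [B, S, card]; stacking on dim=1 adds the codebook axis.
    logits = torch.stack([linears[k](out) for k in range(K)], dim=1)
    print(logits.shape)                                           # torch.Size([2, 4, 5, 1024])
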
audiocraft/transformer.py CHANGED
@@ -18,13 +18,7 @@ def set_efficient_attention_backend(backend: str = 'torch'):
 
 
 
-def _is_profiled():
-    # Return true if we are currently running with an xformers profiler activated.
-    try:
-        from xformers.profiler import profiler
-    except ImportError:
-        return False
-    return profiler._Profiler._CURRENT_PROFILER is not None
+
 
 
 def create_norm_fn(norm_type, dim, **kwargs):
@@ -69,35 +63,34 @@ class StreamingMultiheadAttention(nn.Module):
     def __init__(self,
                  embed_dim,
                  num_heads,
-                 dropout=0.0, bias: bool = True,
-                 causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
-                 memory_efficient: bool = False, attention_as_float32: bool = False,
+                 dropout=0.0,
+                 bias: bool = True,
+                 causal: bool = False,
+                 past_context: tp.Optional[int] = None,
+                 custom: bool = False,
+                 memory_efficient: bool = False,
+                 attention_as_float32: bool = False,
                  cross_attention: bool = False,
-                 qk_layer_norm: bool = False, kv_repeat: int = 1,
+                 qk_layer_norm: bool = False,
+                 kv_repeat: int = 1,
                  device=None, dtype=None):
         super().__init__()
         factory_kwargs = {'device': device, 'dtype': dtype}
         if past_context is not None:
             assert causal
-
         self.embed_dim = embed_dim
         self.causal = causal
         self.past_context = past_context
         self.memory_efficient = memory_efficient
         self.attention_as_float32 = attention_as_float32
-
         self.cross_attention = cross_attention
-
         self.num_heads = num_heads
         self.dropout = dropout
         self.kv_repeat = kv_repeat
         if cross_attention:
             assert not causal, "Causal cannot work with cross attention."
-
-
         if memory_efficient:
             _verify_xformers_memory_efficient_compat()
-
         self.custom = _is_custom(custom, memory_efficient)
         if self.custom:
             out_dim = embed_dim
@@ -116,18 +109,12 @@ class StreamingMultiheadAttention(nn.Module):
             if bias:
                 self.out_proj.bias.data.zero_()
         else:
-            assert not qk_layer_norm
-            assert kv_repeat == 1
-            self.mha = nn.MultiheadAttention(
-                embed_dim, num_heads, dropout=dropout, bias=bias, batch_first=True,
-                **factory_kwargs)
+            print('mha ini else')
         self.qk_layer_norm = qk_layer_norm
         if qk_layer_norm:
-            assert self.custom
-            assert kv_repeat == 1
-            ln_dim = embed_dim
-            self.q_layer_norm = nn.LayerNorm(ln_dim)
-            self.k_layer_norm = nn.LayerNorm(ln_dim)
+            print('QK norm')
+
+
 
     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
         if not self.custom:
@@ -137,13 +124,7 @@ class StreamingMultiheadAttention(nn.Module):
                 if prefix + key in state_dict:
                     state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
-
-
-
 
-
-
 
     def forward(self,
                 query,
@@ -152,7 +133,53 @@ class StreamingMultiheadAttention(nn.Module):
                 need_weights=False,
                 attn_mask=None,
                 is_causal=False):
-
+        # 2=cond/uncond
+        # 24=heads
+        # 1=seqlen
+        # 64=channel
+        #
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
+        # 43
+        # ____________
+        # SELF
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
+        # sa_ x.shape=torch.Size([2, 1, 1536])
+
+        # X
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
+        # 44
+        # ____________
+        # SELF
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
+        # sa_ x.shape=torch.Size([2, 1, 1536])
+
+        # X
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
+        # 45
+        # ____________
+        # SELF
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
+        # sa_ x.shape=torch.Size([2, 1, 1536])
+
+        # X
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
+        # 46
+        # ____________
+        # SELF
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
+        # sa_ x.shape=torch.Size([2, 1, 1536])
+
+        # X
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
+        # 47
+        # ____________
+        # SELF
+        # q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
+        # sa_ x.shape=torch.Size([2, 1, 1536])
+
+
+
+
         assert not is_causal, ("New param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")
         # print(f'{query.shape=} {key.shape=} {value.shape=} MHA')
@@ -167,9 +194,9 @@ class StreamingMultiheadAttention(nn.Module):
         custom_attn_mask = attn_mask is not None
 
         if self.custom:
-            # custom implementation
-            assert need_weights is False
-            assert key_padding_mask is None
+
+
+
             if self.cross_attention:
                 # print('\n\n\n\nCROSS\n\n\n\n')
 
@@ -178,9 +205,8 @@ class StreamingMultiheadAttention(nn.Module):
                 if self.in_proj_bias is None:
                     bias_q, bias_k, bias_v = None, None, None
                 else:
-                    bias_q = self.in_proj_bias[:dim]
-                    bias_k = self.in_proj_bias[dim: 2 * dim]
-                    bias_v = self.in_proj_bias[2 * dim:]
+                    print('no self proj bi')
+
                 q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
                 # print(f'{q.shape=} TRANSF FORW who concaten')
                 # todo: when streaming, we could actually save k, v and check the shape actually match.
@@ -191,18 +217,9 @@ class StreamingMultiheadAttention(nn.Module):
                     k = self.k_layer_norm(k)
 
                 q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
-                # print(f'{q.shape=} {k.shape=} {v.shape=} after rearrange')
+
             else:
-                # print('\n\n\n\nSELF\n\n\n\n')
-                #
-                # 47x Transformers selfattn followed by crossattn
-                #
-                # self-attn is on history? previous key or is it on only the last token?
-
-                if not _is_profiled():
-                    # profiling breaks that property somehow.
-                    assert query is key, "specialized implementation"
-                    assert value is key, "specialized implementation"
+
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
                     if time_dim == 2:
@@ -217,66 +234,50 @@ class StreamingMultiheadAttention(nn.Module):
                     q, k, v = ops.unbind(packed, dim=2)
                     # print(f'{q.shape=} {v.shape=} @L331 transformer.py')  # packed is bs=2
                 else:
-                    embed_dim = self.embed_dim
-                    per_head_dim = (embed_dim // self.num_heads)
-                    kv_heads = self.num_heads // self.kv_repeat
-                    q = projected[:, :, :embed_dim]
-                    start = embed_dim
-                    end = start + per_head_dim * kv_heads
-                    k = projected[:, :, start: end]
-                    v = projected[:, :, end:]
-                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
-                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
-                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
+
+                    print("ELSE kv rp")
 
             if self.qk_layer_norm is True:
-                assert self.kv_repeat == 1
-                q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
-                q = self.q_layer_norm(q)
-                k = self.k_layer_norm(k)
-                q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
-
+
+                print('QL lay norm')
 
             if self.kv_repeat > 1:
-                #
+
                 print('Expand repear 2')
 
             if self.attention_as_float32:
-                q, k, v = [x.float() for x in [q, k, v]]
+                print('AS FLOAT32')
+
             if self.memory_efficient:
                 if custom_attn_mask:
-                    # When using a custom attn mask:
-                    # Move to query's device, repeat for each sample, remove align8 padding
-                    seq_len = query.shape[1]
-                    attn_mask = attn_mask.to(q.dtype)
-                    attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
-                    attn_mask = attn_mask[..., :seq_len, :seq_len]
+
+                    print('CUSTOM ATTN MSK')
+
 
                 p = self.dropout if self.training else 0
                 if _efficient_attention_backend == 'torch':
+
+                    # print(f'{q.shape=} {k.shape=} {v.shape=} 90')
+                    print(f'{x.sum()=} {q.sum()=} {k.sum()=} {v.sum()=} 90 variation of qkv during 47')
+                    # the k.sum(), v.sum() change over the 47 transfs; how is that possible if self._sa
+                    # has q-len = 1.
+                    #
+                    #
+
                     x = torch.nn.functional.scaled_dot_product_attention(
                         q, k, v, is_causal=attn_mask is not None, dropout_p=p)
                 else:
-                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
+
+                    print('MHA OPS')
+
+
            else:
-                # We include the dot product as float32, for consistency
-                # with the other implementations that include that step
-                # as part of the attention. Note that when using `autocast`,
-                # the einsums would be done as bfloat16, but the softmax
-                # would be done as bfloat16, so `attention_as_float32` will
-                # extend a bit the range of operations done in float32,
-                # although this should make no difference.
-                q = q / q.shape[-1] ** 0.5
-                key_layout = layout.replace('t', 'k')
-                query_layout = layout
 
-                pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
-                if attn_mask is not None:
-                    pre_w = pre_w + attn_mask
-                w = torch.softmax(pre_w, dim=-1)
-                w = F.dropout(w, self.dropout, training=self.training).to(v)
-                # Key and value have the same format.
-                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
+                print('CONSISTENCY ')
+
+
+
+
            x = x.to(dtype)
            x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
            x = self.out_proj(x)
@@ -313,8 +314,9 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
             'memory_efficient': memory_efficient,
             'attention_as_float32': attention_as_float32,
         }
-        self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
-            causal=causal, past_context=past_context,
+        self.self_attn = StreamingMultiheadAttention(
+            causal=causal,
+            past_context=past_context,
             # rope=rope,
             qk_layer_norm=qk_layer_norm,
             kv_repeat=kv_repeat, **attn_kwargs, **factory_kwargs)  # type: ignore
@@ -336,6 +338,17 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
 
         self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
         self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
+
+    # ENVS....d4/lib/python3.10/site-packages/torch/nn/modules/transformer.py @TransformerEncoderLayer
+    def _sa_block(self, q, k, v):
+        x = self.self_attn(q,
+                           k,
+                           v,
+                           attn_mask=None,
+                           key_padding_mask=None,
+                           need_weights=False,
+                           is_causal=None)[0]
+        return self.dropout1(x)
 
     def _cross_attention_block(self,
                                src,
@@ -353,27 +366,37 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
                 cross_attention_src=None):
 
 
-        x = src
+
         if self.norm_first:
-            # print('selfattn', x.shape, src_mask, src_key_padding_mask)
-            x = x + self._sa_block(self.norm1(x),
-                                   src_mask,  # None
-                                   src_key_padding_mask  # None
-                                   )  # Internal nn
-            # print('crossattn', x.shape, cross_attention_src.shape)
+            print('selfattn')
+            history = self.norm1(src)
+            x = history[:, -1:, :]
+
+            # THIS IS COMPUTED with 1 timestep
+            # just before the call there is cat([past_k, k])
+            # Thus we just
+            x = x + self._sa_block(x,  # THIS should be square as the history is updated
+                                   # then the -1 item of history goes to the text x text
+                                   #
+                                   history,
+                                   history)
+            print('crossattn')
            if cross_attention_src is not None:
                x = x + self._cross_attention_block(
                    self.norm_cross(x),
                    cross_attention_src)
-                # selfattn torch.Size([2, 2, 1536]) None None NO 4D TOKEN!
-                # crossattn torch.Size([2, 2, 1536]) torch.Size([2, 4, 1536])
+
            else:
-                raise NotImplementedError  # all layers have a self & cross?
+                print('NOT IMPL')
+
+
            x = x + self._ff_block(self.norm2(x))
        else:
            print('NLAST')
-            # print('NT', x.shape)  # [1,2 ,1536]
+
        return x
+
+
 
 
 class StreamingTransformer(nn.Module):
@@ -422,6 +445,7 @@ class StreamingTransformer(nn.Module):
                 device=device, dtype=dtype, **kwargs))
 
         if self.checkpointing != 'none':
+            print('Checkpointing????????????')
             for layer in self.layers:
                 # see audiocraft/optim/fsdp.py, magic signal to indicate this requires fixing the
                 # backward hook inside of FSDP...
@@ -443,30 +467,30 @@ class StreamingTransformer(nn.Module):
 
         pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
         x = x + self.positional_scale * pos_emb
-        # UNTIL HERE BATCH=1
+
+
+
+        # 47x transformer layers for frozen history
+        # -> history is updated by self._sa() although its length is fixed
+        # -> the q that comes out of the text x text cross attn
+        #    is given as q to the next lay's self._sa() with updated history
+        # ->
+        # ->
        for _, lay in enumerate(self.layers):
-            # if _ < 2:
-            # L=0 [1,1,1536]
-            # L=1 [2,1,1536]
+            print(f'_________________\n{_}')
+            # 1 q = last_token x history x history
+            # 2 next_token = q x text x text
 
-            print(f'L={_} {args=} {kwargs["cross_attention_src"].shape=} {x.shape=} StreamTransf ForLoop')  # [2, 1, 1536] BATCH=2
-            # x = self._apply_layer(layer, x, *args, **kwargs)
-            # x = lay(x, **kwargs)
-            x = lay(x,
-                    cross_attention_src=kwargs["cross_attention_src"],
-                    src_mask=kwargs['src_mask'])
-            # concat old token to query oh not here is on lm generate
-            print('OUT OF Tall', x.shape)  # [1,2,1536] # why this gets filled with sequence 1,2...
-            # should be 1 query
+            # x preserves full history for self._sa(). After all transformers we return only last -1 tok
+            x, history = lay(
+                x,
+                history=history,  # only updated by self_attn (the cross sees only last token)
+                cross_attention_src=kwargs["cross_attention_src"],
+                src_mask=kwargs['src_mask']
+            )  # x : [bs, 24, 37, 64]
        return x
 
-    def make_optim_group(self):
-        group = {"params": list(self.parameters())}
-        if self.lr is not None:
-            group["lr"] = self.lr
-        if self.weight_decay is not None:
-            group["weight_decay"] = self.weight_decay
-        return group
+
 
 
 # special attention related function
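
Note on the debug question in the hunks above (how can k.sum() and v.sum() keep changing across the 47 self-attention calls of a single next-token step when the query length stays 1): a minimal sketch of a per-layer streaming K/V cache, offered as an assumption-level illustration rather than the audiocraft implementation. Batch 2 (cond/uncond), 24 heads, and head dim 64 are taken from the traces; the layer count is assumed. Every layer keeps its own cache, and each decoding step concatenates one new K/V timestep onto it (the cat([past_k, k]) mentioned in the comments), so the cached tensors differ per layer and grow per step even though q stays [2, 24, 1, 64].

    import torch
    import torch.nn.functional as F

    B, H, D = 2, 24, 64          # batch (cond/uncond), heads, head dim, as in the traces
    N_LAYERS = 48                # assumed layer count; the traces index up to 47

    # One K/V cache per layer, empty along the time axis to start with.
    caches = [{'k': torch.zeros(B, H, 0, D), 'v': torch.zeros(B, H, 0, D)}
              for _ in range(N_LAYERS)]

    def streaming_self_attn(layer, q, k_new, v_new):
        # Append the new timestep to this layer's cache, then attend with a length-1 query.
        c = caches[layer]
        c['k'] = torch.cat([c['k'], k_new], dim=2)
        c['v'] = torch.cat([c['v'], v_new], dim=2)
        return F.scaled_dot_product_attention(q, c['k'], c['v'])

    for step in range(3):                     # a few next-token steps
        for layer in range(N_LAYERS):
            q = torch.randn(B, H, 1, D)       # new token projection for this layer
            _ = streaming_self_attn(layer, q, q, q)
        # The cached K changes every step (and is different in every layer),
        # which is why k.sum() varies even though the query length is always 1.
        print(step, caches[0]['k'].shape, caches[0]['k'].sum().item())

If AudioGen's streaming attention follows this pattern, the changing sums in the printout would simply reflect the growing per-layer caches rather than any mutation of the length-1 query.
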
demo.py CHANGED
@@ -4,10 +4,10 @@ import numpy as np
 
 print('\n\n\n\n___________________')
 
-txt = 'dogs in street'
+txt = 'dogs in the street'
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=1.24)  # why is generating so long at 14 seconds
+sound_generator.set_generation_params(duration=.74)  # why is generating so long at 14 seconds
 
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7