Update model.py
model.py CHANGED
@@ -73,13 +73,12 @@ class AttentionBlock(nn.Module):
         ):  # workaround for masking bug in FA. This works because Wqkv does not have bias
             # and attention scores will be also automatically zeroed.
             u = u * padding_mask[..., None]
-        u = (
-            self.inner_mha_cls(
+        w = self.inner_mha_cls(
             self.pre_norm(u),
             inference_params=inference_params,
-            )
-            + u
         )
+        self.filter_output = w
+        u = w + u
         if type(padding_mask) == torch.Tensor:  # guard against bias
             u = u * padding_mask[..., None]
         u = self.mlp(self.post_norm(u)) + u
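In effect, the hunk unpacks the fused residual expression into an explicit intermediate: the attention output is computed into w, cached on the module as self.filter_output, and only then added back to the residual stream, so the pre-residual activations remain inspectable after forward(). Below is a minimal self-contained sketch of the same pattern; everything except the filter_output attribute name is an illustrative assumption (TinyBlock and the use of nn.MultiheadAttention stand in for the repo's AttentionBlock and inner_mha_cls).

import torch
import torch.nn as nn

class TinyBlock(nn.Module):
    """Illustrative stand-in for AttentionBlock, showing the caching pattern."""

    def __init__(self, d: int):
        super().__init__()
        self.pre_norm = nn.LayerNorm(d)
        # Stand-in mixer; the real model uses its own inner_mha_cls.
        self.inner_mha_cls = nn.MultiheadAttention(d, num_heads=4, batch_first=True)

    def forward(self, u: torch.Tensor) -> torch.Tensor:
        x = self.pre_norm(u)
        w, _ = self.inner_mha_cls(x, x, x)  # attention output, pre-residual
        self.filter_output = w              # cached on the module, as in the commit
        return w + u                        # residual add, unchanged behavior

block = TinyBlock(64)
u = torch.randn(2, 8, 64)                  # (batch, seq_len, hidden)
out = block(u)
print(block.filter_output.shape)           # torch.Size([2, 8, 64]), pre-residual activations

One side effect worth noting: because the tensor is stored as a module attribute, it stays alive (and, under autograd, keeps its graph reachable) until the next forward pass overwrites it, so this costs some memory in exchange for inspectability.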