Spaces:
Sleeping
Sleeping
File size: 1,781 Bytes
20239f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# Attention and Block variants that can additionally return the q, k, v tensors used by attention
import torch
from timm.models.vision_transformer import Attention, Block
import torch.nn.functional as F
from typing import Tuple
class AttentionWQKVReturn(Attention):
    """Attention layer that also returns the q, k and v tensors it used.

    Modifications relative to the parent ``Attention``:
    - ``forward`` returns a tuple ``(x, qkv)`` instead of just ``x``, where
      ``qkv`` holds q, k and v stacked along a new leading dimension.
    """

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run attention and return the output together with q, k, v.

        Args:
            x: Input of shape ``(B, N, C)``.

        Returns:
            A tuple ``(out, qkv)``:
            - ``out``: projected attention output of shape ``(B, N, C)``.
            - ``qkv``: tensor of shape ``(3, B, num_heads, N, head_dim)``
              holding q, k and v (q and k after ``q_norm`` / ``k_norm``).
              The returned q is never multiplied by ``self.scale``,
              regardless of which attention branch ran.
        """
        B, N, C = x.shape
        # (B, N, 3*C) -> (3, B, num_heads, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            # Scale into a temporary instead of rebinding ``q``: the returned
            # q must be the same tensor in the fused and non-fused branches.
            attn = (q * self.scale) @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        # Merge heads back: (B, num_heads, N, head_dim) -> (B, N, C)
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, torch.stack((q, k, v), dim=0)
class BlockWQKVReturn(Block):
    """Transformer block whose forward pass can also hand back the attention qkv.

    Modifications relative to the parent ``Block``:
    - ``self.attn`` is expected to return ``(output, qkv)`` (as
      ``AttentionWQKVReturn`` does) rather than a single tensor.
    - ``forward`` optionally returns that ``qkv`` alongside the block output.
    """

    def forward(self, x: torch.Tensor, return_qkv: bool = False) -> torch.Tensor | Tuple[torch.Tensor, torch.Tensor]:
        """Apply the attention and MLP residual branches.

        Args:
            x: Input tensor passed through the block.
            return_qkv: When true, also return the qkv tensor produced by
                ``self.attn``.

        Returns:
            The block output, or ``(output, qkv)`` when ``return_qkv`` is set.
        """
        # NOTE: adapted from timm.models.vision_transformer.Block.forward.
        attn_out, qkv = self.attn(self.norm1(x))

        # Attention residual branch.
        x = x + self.drop_path1(self.ls1(attn_out))

        # MLP residual branch.
        mlp_out = self.mlp(self.norm2(x))
        x = x + self.drop_path2(self.ls2(mlp_out))

        return (x, qkv) if return_qkv else x
|