Spaces:

alibabasglab
/

ClearVoice

Running on Zero

File size: 2,738 Bytes

8e8cd3e

from torch import nn
import torch

class SELayer(nn.Module):
    """
    Squeeze-and-Excitation (SE) layer with average- and max-pooled branches.

    The input is squeezed to one descriptor per channel twice: once with
    global average pooling and once with global max pooling. Each descriptor
    is passed through its own bottleneck MLP (Linear -> ReLU -> Linear ->
    Sigmoid) to produce per-channel gates. The two gates are summed and
    multiplied with the input, so the effective scale applied to every
    channel lies in the open interval (0, 2).

    Args:
        channel (int): The number of input channels.
        reduction (int, optional): Reduction ratio for the dimensionality
            of the intermediate representations. Default is 16. The hidden
            width is ``channel // reduction``, clamped to at least 1 so that
            ``channel < reduction`` does not produce an empty bottleneck.

    Raises:
        ValueError: If ``reduction`` is smaller than 1.
    """

    def __init__(self, channel: int, reduction: int = 16):
        super().__init__()
        if reduction < 1:
            raise ValueError(
                f"reduction must be a positive integer, got {reduction}"
            )

        # A plain `channel // reduction` is 0 when channel < reduction, which
        # yields zero-element Linear layers and a gate that ignores the
        # input entirely. Keep at least one hidden unit.
        hidden = max(1, channel // reduction)

        # Branch 1: global average pooling down to 1x1, then excitation MLP.
        # Sub-modules are created in the same order as before so that seeded
        # initialisation and state_dict keys stay unchanged.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.avg_pool_layer = self._make_excitation(channel, hidden)

        # Branch 2: global max pooling down to 1x1, then excitation MLP.
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.max_pool_layer = self._make_excitation(channel, hidden)

    @staticmethod
    def _make_excitation(channel: int, hidden: int) -> nn.Sequential:
        """Build one bottleneck MLP: Linear -> ReLU -> Linear -> Sigmoid."""
        return nn.Sequential(
            nn.Linear(channel, hidden),   # Compress channel descriptor
            nn.ReLU(inplace=True),        # Non-linearity
            nn.Linear(hidden, channel),   # Expand back to one value per channel
            nn.Sigmoid(),                 # Squash to (0, 1) gate
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for the SE Layer.

        Args:
            x (Tensor): Input tensor of shape (B, C, H, W) where:
                B = batch size,
                C = number of channels,
                H = height,
                W = width.

        Returns:
            Tensor: Output tensor of the same shape as input `x` after
            applying the squeeze-and-excitation mechanism.
        """
        # Unpacking four sizes also enforces that the input is 4-D.
        b, c, _, _ = x.size()

        # Squeeze (B, C, H, W) -> (B, C), excite, then reshape the gate to
        # (B, C, 1, 1) so it broadcasts over the spatial dimensions.
        avg_gate = self.avg_pool_layer(self.avg_pool(x).view(b, c)).view(b, c, 1, 1)
        max_gate = self.max_pool_layer(self.max_pool(x).view(b, c)).view(b, c, 1, 1)

        # Scale the input features by the combined channel weights.
        return (avg_gate + max_gate) * x