Crystalcareai committed (verified)
Commit 61c9550 · 1 Parent(s): 6354236

Update modeling_gemmoe.py

Files changed (1):
  1. modeling_gemmoe.py (+11 -10)
modeling_gemmoe.py CHANGED
@@ -682,42 +682,43 @@ class GemmoeBlockSparseTop2MLP(GemmoeBlockSparseTop2MLP):
         super().__init__(*args, **kwargs)
 
 class GemmoeSparseMoeBlock(nn.Module):
-    """
-    This implementation is strictly equivalent to standard MoE with full capacity (no dropped tokens). It's faster since it formulates MoE operations in terms of block-sparse operations to accommodate imbalanced assignments of tokens to experts.
-    """
-
     def __init__(self, config):
         super().__init__()
         self.hidden_dim = config.hidden_size
         self.ffn_dim = config.intermediate_size
         self.num_experts = config.num_local_experts
-        self.top_k = config.num_experts_per_tok
+        self.top_k = 2
 
         # gating
         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
 
         self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
+
         # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.gate(hidden_states)
-
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
         topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
         topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
+
+        # we cast back to the input dtype
         topk_weight = topk_weight.to(hidden_states.dtype)
 
         hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
+
         y = torch.empty_like(hidden_states)
+
         flat_topk_idx = topk_idx.view(-1)
         for i in range(self.num_experts):
             expert = self.experts[i]
-            mask = flat_topk_idx == i
-            if mask.any():
-                y[mask] = expert(hidden_states[mask]).to(y.dtype)
+            expert_output = expert(hidden_states[flat_topk_idx == i])
+            y[flat_topk_idx == i] = expert_output.to(y.dtype)  # Cast expert_output to the same dtype as y
+
         y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+
         final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
         return final_hidden_states, router_logits
 
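
For reference, here is a minimal, self-contained sketch of the routing path as it stands after this commit. Plain nn.Linear layers stand in for GemmoeBlockSparseTop2MLP, and the batch, sequence, hidden, and expert sizes are invented for illustration; only the tensor manipulation mirrors the updated forward().

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)

batch_size, sequence_length, hidden_dim = 2, 4, 8
num_experts, top_k = 4, 2  # the commit hard-codes self.top_k = 2

gate = nn.Linear(hidden_dim, num_experts, bias=False)
experts = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_experts)])  # stand-in experts

hidden_states = torch.randn(batch_size, sequence_length, hidden_dim)
hidden_states = hidden_states.view(-1, hidden_dim)              # (tokens, hidden_dim)

# router_logits: (tokens, num_experts)
router_logits = gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)
topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
topk_weight = topk_weight.to(hidden_states.dtype)               # cast back to the input dtype

# Duplicate every token top_k times so each (token, expert) pair has its own row.
hidden_states = hidden_states.repeat_interleave(top_k, dim=0)   # (tokens * top_k, hidden_dim)
y = torch.empty_like(hidden_states)
flat_topk_idx = topk_idx.view(-1)                               # (tokens * top_k,)

for i in range(num_experts):
    # Rows routed to expert i; an empty selection simply yields an empty output.
    expert_output = experts[i](hidden_states[flat_topk_idx == i])
    y[flat_topk_idx == i] = expert_output.to(y.dtype)           # keep y's dtype (matters in fp16/bf16)

# Weight each expert output by its normalized routing weight, then sum over the top_k slots.
y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
print(final_hidden_states.shape)  # torch.Size([2, 4, 8])

Relative to the previous version of the block, the if mask.any(): guard is gone, so each expert is called unconditionally (an empty selection just produces an empty tensor), top_k is fixed at 2 instead of being read from config.num_experts_per_tok, and the forward return type is annotated as Tuple[torch.Tensor, torch.Tensor] to match the returned (final_hidden_states, router_logits) pair.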