Crystalcareai
committed
Update modeling_gemmoe.py
modeling_gemmoe.py CHANGED (+5 -10)
@@ -670,10 +670,9 @@ class GemmoeBlockSparseTop2MLP(nn.Module):
         self.act_fn = approx_gelu

     def forward(self, hidden_states):
-        hidden_states = hidden_states.to(torch.float32) # Cast to float32
         current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
-        current_hidden_states = self.w2(current_hidden_states
-        return current_hidden_states
+        current_hidden_states = self.w2(current_hidden_states)
+        return current_hidden_states.to(hidden_states.dtype)


 class GemmoeSparseMoeBlock(nn.Module):
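For context on the first hunk: the fp32 upcast at the top of the expert MLP is dropped, and the output is instead cast back to the input dtype on return. Below is a minimal runnable sketch of the resulting forward pass; the class name, layer sizes, and the tanh-approximated GELU are illustrative assumptions, only the forward body mirrors the diff.

import torch
import torch.nn as nn

# Assumed stand-in for the approx_gelu used in the file (tanh-approximated GELU).
approx_gelu = lambda x: nn.functional.gelu(x, approximate="tanh")

class Top2MLPSketch(nn.Module):
    # Hypothetical sizes purely for illustration.
    def __init__(self, hidden_size=16, intermediate_size=32):
        super().__init__()
        self.w1 = nn.Linear(hidden_size, intermediate_size, bias=False)  # gate projection
        self.w3 = nn.Linear(hidden_size, intermediate_size, bias=False)  # up projection
        self.w2 = nn.Linear(intermediate_size, hidden_size, bias=False)  # down projection
        self.act_fn = approx_gelu

    def forward(self, hidden_states):
        # No fp32 upcast anymore: compute in the incoming dtype, then cast the
        # result back to that dtype (a no-op unless something changed it mid-flight).
        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
        current_hidden_states = self.w2(current_hidden_states)
        return current_hidden_states.to(hidden_states.dtype)

out = Top2MLPSketch()(torch.randn(4, 16))
print(out.shape)  # torch.Size([4, 16])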
@@ -694,15 +693,11 @@ class GemmoeSparseMoeBlock(nn.Module):
         hidden_states = hidden_states.view(-1, hidden_dim)

         # router_logits: (batch * sequence_length, n_experts)
-
-
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
+        router_logits = self.gate(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1)
         topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
         topk_weight /= topk_weight.sum(dim=-1, keepdim=True)

-        # we cast back to the input dtype
-        topk_weight = topk_weight.to(hidden_states.dtype)
-
         hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)

         y = torch.empty_like(hidden_states)
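The second hunk takes the router softmax in the logits' own dtype (the explicit dtype=torch.float32 and the later cast of topk_weight back to the input dtype are removed). A small shape walk-through of the routing path, with hypothetical sizes (4 tokens, 8 experts, top_k = 2) chosen only for illustration:

import torch
import torch.nn.functional as F

tokens, n_experts, top_k, hidden_dim = 4, 8, 2, 16
hidden_states = torch.randn(tokens, hidden_dim)
router_logits = torch.randn(tokens, n_experts)           # (tokens, n_experts)

# Softmax now runs in the logits' dtype; no fp32 upcast.
routing_weights = F.softmax(router_logits, dim=1)
topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)
topk_weight /= topk_weight.sum(dim=-1, keepdim=True)     # renormalize the two kept weights

# Each token is duplicated top_k times so each copy can be routed to one expert.
expanded = hidden_states.repeat_interleave(top_k, dim=0)
print(expanded.shape)  # torch.Size([8, 16]) -> tokens * top_k rows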
@@ -716,7 +711,7 @@ class GemmoeSparseMoeBlock(nn.Module):
         y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)

         final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
-        return final_hidden_states, router_logits
+        return final_hidden_states.to(hidden_states.dtype), router_logits.to(hidden_states.dtype)


 # Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with LLAMA->GEMMOE,Llama->Gemmoe
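The last hunk keeps the weighted top-2 combine unchanged and now casts both return values back to the dtype of hidden_states. A sketch of that combine step, continuing the hypothetical sizes above:

import torch

tokens, top_k, hidden_dim = 4, 2, 16
batch_size, sequence_length = 1, 4

# y holds one expert output per (token, expert-slot) row, in the layout
# produced by repeat_interleave; values here are random placeholders.
y = torch.randn(tokens * top_k, hidden_dim)
topk_weight = torch.rand(tokens, top_k)
topk_weight /= topk_weight.sum(dim=-1, keepdim=True)

# Weight each expert's output by its routing weight and sum over the two slots.
combined = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
final_hidden_states = combined.reshape(batch_size, sequence_length, hidden_dim)
print(final_hidden_states.shape)  # torch.Size([1, 4, 16])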