workaround for overflow
Browse files
Signed-off-by: wenhuach <[email protected]>
- config.json +1 -1
- modeling_deepseek.py +1 -1
config.json
CHANGED
@@ -79,7 +79,7 @@
|
|
79 |
"tie_word_embeddings": false,
|
80 |
"topk_group": 4,
|
81 |
"topk_method": "noaux_tc",
|
82 |
-
"torch_dtype": "
|
83 |
"transformers_version": "4.47.0",
|
84 |
"use_cache": true,
|
85 |
"v_head_dim": 128,
|
|
|
79 |
"tie_word_embeddings": false,
|
80 |
"topk_group": 4,
|
81 |
"topk_method": "noaux_tc",
|
82 |
+
"torch_dtype": "float16",
|
83 |
"transformers_version": "4.47.0",
|
84 |
"use_cache": true,
|
85 |
"v_head_dim": 128,
|
modeling_deepseek.py
CHANGED
@@ -386,7 +386,7 @@ class DeepseekV3MLP(nn.Module):
|
|
386 |
self.act_fn = ACT2FN[config.hidden_act]
|
387 |
|
388 |
def forward(self, x):
|
389 |
-
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
390 |
return down_proj
|
391 |
|
392 |
|
|
|
386 |
self.act_fn = ACT2FN[config.hidden_act]
|
387 |
|
388 |
def forward(self, x):
|
389 |
+
down_proj = self.down_proj(torch.clip(self.act_fn(self.gate_proj(x)) * self.up_proj(x),-65504,65504))
|
390 |
return down_proj
|
391 |
|
392 |
|