yairschiff committed on
Commit e3f8061 · verified · 1 Parent(s): 0600f8f

Ensure weights are tied for BiMamba (if applicable) when loaded from_pretrained

Files changed (1):
  1. modeling_caduceus.py +31 -4
modeling_caduceus.py CHANGED
@@ -1,5 +1,4 @@
 """Caduceus model for Hugging Face.
-
 """
 
 import inspect
@@ -46,7 +45,6 @@ def create_block(
     dtype=None,
 ):
     """Create Caduceus block.
-
     Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
     """
     if ssm_cfg is None:
@@ -121,7 +119,6 @@ class BiMambaWrapper(nn.Module):
 
     def forward(self, hidden_states, inference_params=None):
         """Bidirectional-enabled forward pass
-
         hidden_states: (B, L, D)
         Returns: same shape as hidden_states
         """
@@ -360,6 +357,24 @@ class Caduceus(CaduceusPreTrainedModel):
         factory_kwargs = {"device": device, "dtype": dtype}
         self.backbone = CaduceusMixerModel(config, **factory_kwargs, **kwargs)
 
+    def maybe_weight_tie_mamba(self):
+        if getattr(self.config, 'bidirectional', False) and getattr(self.config, 'bidirectional_weight_tie', False):
+            if getattr(self.config, 'rcps', False):
+                for layer in self.backbone.layers:
+                    layer.mixer.submodule.mamba_rev.in_proj.weight = layer.mixer.submodule.mamba_fwd.in_proj.weight
+                    layer.mixer.submodule.mamba_rev.in_proj.bias = layer.mixer.submodule.mamba_fwd.in_proj.bias
+                    layer.mixer.submodule.mamba_rev.out_proj.weight = layer.mixer.submodule.mamba_fwd.out_proj.weight
+                    layer.mixer.submodule.mamba_rev.out_proj.bias = layer.mixer.submodule.mamba_fwd.out_proj.bias
+            else:
+                for layer in self.backbone.layers:
+                    layer.mixer.mamba_rev.in_proj.weight = layer.mixer.mamba_fwd.in_proj.weight
+                    layer.mixer.mamba_rev.in_proj.bias = layer.mixer.mamba_fwd.in_proj.bias
+                    layer.mixer.mamba_rev.out_proj.weight = layer.mixer.mamba_fwd.out_proj.weight
+                    layer.mixer.mamba_rev.out_proj.bias = layer.mixer.mamba_fwd.out_proj.bias
+
+    def tie_weights(self):
+        self.maybe_weight_tie_mamba()
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -431,8 +446,12 @@ class CaduceusForMaskedLM(CaduceusPreTrainedModel):
             raise NotImplementedError("Setting output embeddings for RCPS LM is not supported.")
         self.lm_head = new_embeddings
 
+    def maybe_weight_tie_mamba(self):
+        self.caduceus.maybe_weight_tie_mamba()
+
     def tie_weights(self):
         """Tie weights, accounting for RCPS."""
+        self.maybe_weight_tie_mamba()
         if self.config.rcps:
             self.lm_head.set_weight(self.get_input_embeddings().weight)
         else:
@@ -445,7 +464,7 @@ class CaduceusForMaskedLM(CaduceusPreTrainedModel):
     def set_decoder(self, decoder):
         """Set decoder (backbone) for the model."""
         self.caduceus = decoder
-
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -536,6 +555,13 @@ class CaduceusForSequenceClassification(CaduceusPreTrainedModel):
         if self.pooling_strategy == "first":  # Use embedding of first token in the sequence
             return hidden_states.moveaxis(hidden_states, sequence_length_dim, 0)[0, ...]
 
+    def maybe_weight_tie_mamba(self):
+        self.caduceus.maybe_weight_tie_mamba()
+
+    def tie_weights(self):
+        self.maybe_weight_tie_mamba()
+        super().tie_weights()
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -543,6 +569,7 @@ class CaduceusForSequenceClassification(CaduceusPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[Tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
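
For reference, a minimal way to check that the fix takes effect after loading: PreTrainedModel.from_pretrained calls tie_weights() once the state dict is loaded, so with this commit the reverse-direction Mamba projections should end up as the same Parameter objects as the forward-direction ones. The sketch below is not part of the commit: the repo id is a placeholder, it assumes a non-RCPS checkpoint with bidirectional=True and bidirectional_weight_tie=True whose auto_map exposes CaduceusForMaskedLM, and the attribute path follows the diff above.

from transformers import AutoModelForMaskedLM

# Placeholder repo id; any Caduceus checkpoint with bidirectional weight tying
# (and rcps=False) should behave the same way.
model = AutoModelForMaskedLM.from_pretrained(
    "org/caduceus-checkpoint",  # hypothetical repo id
    trust_remote_code=True,     # loads the repo's custom modeling_caduceus.py
)

# from_pretrained invokes tie_weights() after loading, which now calls
# maybe_weight_tie_mamba(); forward and reverse projections should share Parameters.
layer = model.caduceus.backbone.layers[0]
assert layer.mixer.mamba_rev.in_proj.weight is layer.mixer.mamba_fwd.in_proj.weight
assert layer.mixer.mamba_rev.out_proj.weight is layer.mixer.mamba_fwd.out_proj.weight

Because the tying re-assigns Parameter references rather than copying tensors, the forward and reverse Mamba blocks keep sharing a single set of projection weights during any subsequent fine-tuning as well.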