update for transformers >= 4.29.1
Browse files — modeling_lsg_camembert.py (+16, -23)
modeling_lsg_camembert.py
CHANGED
@@ -188,19 +188,25 @@ class CausalAttentionProduct(nn.Module):
|
|
188 |
del key_layer
|
189 |
|
190 |
if attention_mask is not None:
|
191 |
-
# Apply the attention mask is (precomputed for all layers in CamembertModel forward() function)
|
192 |
-
attention_scores = attention_scores + attention_mask
|
193 |
-
|
194 |
# Add causal mask
|
195 |
causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
|
196 |
causal_mask = torch.tril(
|
197 |
torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
|
198 |
diagonal=-1
|
199 |
)
|
200 |
-
|
201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
|
|
203 |
del attention_mask
|
|
|
204 |
|
205 |
# Normalize the attention scores to probabilities.
|
206 |
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
@@ -974,6 +980,8 @@ class LSGCamembertPreTrainedModel(CamembertPreTrainedModel):
|
|
974 |
"""
|
975 |
|
976 |
config_class = LSGCamembertConfig
|
|
|
|
|
977 |
|
978 |
def _set_gradient_checkpointing(self, module, value=False):
|
979 |
if isinstance(module, (CamembertEncoder, LSGCamembertEncoder)):
|
@@ -986,8 +994,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, CamembertModel):
|
|
986 |
documentation alongside usage examples.
|
987 |
"""
|
988 |
|
989 |
-
|
990 |
-
|
991 |
|
992 |
def __init__(self, config, add_pooling_layer=True):
|
993 |
|
@@ -1025,9 +1032,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, CamembertModel):
|
|
1025 |
|
1026 |
class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, CamembertForCausalLM):
|
1027 |
|
1028 |
-
|
1029 |
-
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
|
1030 |
-
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
1031 |
|
1032 |
def __init__(self, config):
|
1033 |
|
@@ -1052,9 +1057,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, CamembertForMaskedLM)
|
|
1052 |
documentation alongside usage examples.
|
1053 |
"""
|
1054 |
|
1055 |
-
|
1056 |
-
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
|
1057 |
-
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
1058 |
|
1059 |
def __init__(self, config):
|
1060 |
|
@@ -1082,8 +1085,6 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, Camembe
|
|
1082 |
appropriate documentation alongside usage examples.
|
1083 |
"""
|
1084 |
|
1085 |
-
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
1086 |
-
|
1087 |
def __init__(self, config):
|
1088 |
|
1089 |
LSGCamembertPreTrainedModel.__init__(self, config)
|
@@ -1104,8 +1105,6 @@ class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, CamembertForMul
|
|
1104 |
appropriate documentation alongside usage examples.
|
1105 |
"""
|
1106 |
|
1107 |
-
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
1108 |
-
|
1109 |
def __init__(self, config):
|
1110 |
|
1111 |
LSGCamembertPreTrainedModel.__init__(self, config)
|
@@ -1124,9 +1123,6 @@ class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, CamembertF
|
|
1124 |
appropriate documentation alongside usage examples.
|
1125 |
"""
|
1126 |
|
1127 |
-
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
1128 |
-
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
1129 |
-
|
1130 |
def __init__(self, config):
|
1131 |
|
1132 |
LSGCamembertPreTrainedModel.__init__(self, config)
|
@@ -1150,9 +1146,6 @@ class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel, CamembertFor
|
|
1150 |
appropriate documentation alongside usage examples.
|
1151 |
"""
|
1152 |
|
1153 |
-
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
1154 |
-
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
1155 |
-
|
1156 |
def __init__(self, config):
|
1157 |
|
1158 |
LSGCamembertPreTrainedModel.__init__(self, config)
|
|
|
188 |
del key_layer
|
189 |
|
190 |
if attention_mask is not None:
|
|
|
|
|
|
|
191 |
# Add causal mask
|
192 |
causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
|
193 |
causal_mask = torch.tril(
|
194 |
torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
|
195 |
diagonal=-1
|
196 |
)
|
197 |
+
|
198 |
+
# Min value
|
199 |
+
dtype_min = torch.tensor(
|
200 |
+
torch.finfo(attention_scores.dtype).min, device=attention_scores.device, dtype=attention_scores.dtype
|
201 |
+
)
|
202 |
+
|
203 |
+
# Build causal + attention_mask
|
204 |
+
causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (attention_mask.size()[-1] - self.block_size, 0), value=0)
|
205 |
+
attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)
|
206 |
|
207 |
+
attention_scores = attention_scores + attention_mask
|
208 |
del attention_mask
|
209 |
+
del causal_mask
|
210 |
|
211 |
# Normalize the attention scores to probabilities.
|
212 |
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
|
|
980 |
"""
|
981 |
|
982 |
config_class = LSGCamembertConfig
|
983 |
+
base_model_prefix = "roberta"
|
984 |
+
supports_gradient_checkpointing = True
|
985 |
|
986 |
def _set_gradient_checkpointing(self, module, value=False):
|
987 |
if isinstance(module, (CamembertEncoder, LSGCamembertEncoder)):
|
|
|
994 |
documentation alongside usage examples.
|
995 |
"""
|
996 |
|
997 |
+
_no_split_modules = []
|
|
|
998 |
|
999 |
def __init__(self, config, add_pooling_layer=True):
|
1000 |
|
|
|
1032 |
|
1033 |
class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, CamembertForCausalLM):
|
1034 |
|
1035 |
+
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
|
|
|
|
|
1036 |
|
1037 |
def __init__(self, config):
|
1038 |
|
|
|
1057 |
documentation alongside usage examples.
|
1058 |
"""
|
1059 |
|
1060 |
+
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
|
|
|
|
|
1061 |
|
1062 |
def __init__(self, config):
|
1063 |
|
|
|
1085 |
appropriate documentation alongside usage examples.
|
1086 |
"""
|
1087 |
|
|
|
|
|
1088 |
def __init__(self, config):
|
1089 |
|
1090 |
LSGCamembertPreTrainedModel.__init__(self, config)
|
|
|
1105 |
appropriate documentation alongside usage examples.
|
1106 |
"""
|
1107 |
|
|
|
|
|
1108 |
def __init__(self, config):
|
1109 |
|
1110 |
LSGCamembertPreTrainedModel.__init__(self, config)
|
|
|
1123 |
appropriate documentation alongside usage examples.
|
1124 |
"""
|
1125 |
|
|
|
|
|
|
|
1126 |
def __init__(self, config):
|
1127 |
|
1128 |
LSGCamembertPreTrainedModel.__init__(self, config)
|
|
|
1146 |
appropriate documentation alongside usage examples.
|
1147 |
"""
|
1148 |
|
|
|
|
|
|
|
1149 |
def __init__(self, config):
|
1150 |
|
1151 |
LSGCamembertPreTrainedModel.__init__(self, config)
|