ccdv committed on
Commit
6e4b51f
·
1 Parent(s): acff012

small fix with torch.finfo

Browse files
Files changed (1) hide show
  1. modeling_lsg_camembert.py +90 -121
modeling_lsg_camembert.py CHANGED
@@ -1,5 +1,5 @@
1
  from logging import warn
2
- from transformers.models.roberta.modeling_roberta import *
3
  import torch
4
  import torch.nn as nn
5
  from transformers.models.camembert.configuration_camembert import CamembertConfig
@@ -156,7 +156,7 @@ class BaseAttentionProduct(nn.Module):
156
  # Apply the attention mask (precomputed for all layers in the CamembertModel forward() function)
157
  attention_scores = attention_scores + attention_mask
158
  del attention_mask
159
-
160
  # Normalize the attention scores to probabilities.
161
  attention_probs = nn.Softmax(dim=-1)(attention_scores)
162
 
@@ -198,7 +198,7 @@ class CausalAttentionProduct(nn.Module):
198
  diagonal=-1
199
  )
200
  causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
201
- attention_scores[..., -causal_shape[0]:, -causal_shape[1]:] = causal_mask
202
 
203
  del attention_mask
204
 
@@ -296,7 +296,7 @@ class LSGAttentionProduct(nn.Module):
296
  ).transpose(-1, -2)
297
  del sparse_mask
298
  del global_mask
299
-
300
  # expect (..., t, d) shape
301
  # Compute attention
302
  context_layer = self.attention(
@@ -391,7 +391,7 @@ class LSGAttentionProduct(nn.Module):
391
  return x.reshape(*x.size()[:-2], n_blocks, -1, d)
392
 
393
 
394
- class LSGCamembertEmbeddings(RobertaEmbeddings):
395
 
396
  def __init__(self, config):
397
  super().__init__(config)
@@ -447,7 +447,7 @@ class LSGCamembertEmbeddings(RobertaEmbeddings):
447
  return embeddings
448
 
449
 
450
- class LSGAttention(RobertaAttention):
451
 
452
  def __init__(self, config):
453
 
@@ -546,7 +546,8 @@ class LSGSelfAttention(BaseSelfAttention):
546
  keys = keys.sum(dim=-2) / (mask + 1e-6)
547
  values = values.sum(dim=-2) / (mask + 1e-6)
548
 
549
- mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
 
550
  return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
551
 
552
  def get_sparse_tokens_with_stride(self, keys, values, mask):
@@ -611,7 +612,8 @@ class LSGSelfAttention(BaseSelfAttention):
611
  keys /= mask + 1e-8
612
  values /= mask + 1e-8
613
 
614
- mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
 
615
 
616
  return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
617
 
@@ -879,7 +881,7 @@ class LSGSelfAttention(BaseSelfAttention):
879
  return x.reshape(n, h, -1, chunk_size, d)
880
 
881
 
882
- class LSGCamembertLayer(RobertaLayer):
883
 
884
  def __init__(self, config):
885
 
@@ -891,7 +893,7 @@ class LSGCamembertLayer(RobertaLayer):
891
  self.crossattention = LSGAttention(config)
892
 
893
 
894
- class LSGCamembertEncoder(RobertaEncoder):
895
 
896
  def __init__(self, config):
897
 
@@ -899,8 +901,73 @@ class LSGCamembertEncoder(RobertaEncoder):
899
 
900
  self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
 
903
- class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
 
 
 
 
 
 
904
  """
905
  An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
906
  models.
@@ -909,11 +976,11 @@ class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
909
  config_class = LSGCamembertConfig
910
 
911
  def _set_gradient_checkpointing(self, module, value=False):
912
- if isinstance(module, (RobertaEncoder, LSGCamembertEncoder)):
913
  module.gradient_checkpointing = value
914
 
915
 
916
- class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
917
  """
918
  This class overrides :class:`~transformers.CamembertModel`. Please check the superclass for the appropriate
919
  documentation alongside usage examples.
@@ -926,19 +993,9 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
926
 
927
  LSGCamembertPreTrainedModel.__init__(self, config)
928
 
929
- assert hasattr(config, "num_global_tokens")
930
- self.num_global_tokens = config.num_global_tokens
931
- self.pad_idx = config.pad_token_id
932
-
933
- assert hasattr(config, "block_size") and hasattr(config, "adaptive")
934
- self.block_size = config.block_size
935
- self.adaptive = config.adaptive
936
- self.mask_first_token = config.mask_first_token
937
- self.pool_with_global = config.pool_with_global
938
-
939
  self.embeddings = LSGCamembertEmbeddings(config)
940
  self.encoder = LSGCamembertEncoder(config)
941
- self.pooler = RobertaPooler(config) if add_pooling_layer else None
942
 
943
  if config.add_cross_attention:
944
  logger.warning(
@@ -948,94 +1005,6 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
948
  # Initialize weights and apply final processing
949
  self.post_init()
950
 
951
- def forward(
952
- self,
953
- input_ids=None,
954
- attention_mask=None,
955
- token_type_ids=None,
956
- position_ids=None,
957
- head_mask=None,
958
- inputs_embeds=None,
959
- encoder_hidden_states=None,
960
- encoder_attention_mask=None,
961
- past_key_values=None,
962
- use_cache=None,
963
- output_attentions=None,
964
- output_hidden_states=None,
965
- return_dict=None
966
- ):
967
-
968
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
969
- output_hidden_states = (
970
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
971
- )
972
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
973
-
974
- inputs_ = input_ids if input_ids is not None else inputs_embeds
975
- n, t = inputs_.size()[:2]
976
-
977
- if attention_mask is None:
978
- attention_mask = torch.ones(n, t, device=inputs_.device, dtype=inputs_.dtype)
979
- if self.mask_first_token:
980
- attention_mask[:,0] = 0
981
-
982
- b = self.block_size * 2
983
- pad = t % self.block_size
984
-
985
- # Check whether t is a multiple of block_size; pad the inputs if it is not
986
- if self.adaptive and t > b and pad > 0:
987
- pad_length = self.block_size - pad
988
- if input_ids is not None:
989
- input_ids = torch.nn.functional.pad(input_ids, (0, pad_length), value=self.pad_idx)
990
- else:
991
- inputs_embeds = torch.nn.functional.pad(inputs_embeds.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
992
-
993
- attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=0)
994
-
995
- if token_type_ids is not None:
996
- token_type_ids = torch.nn.functional.pad(token_type_ids, (0, pad_length), value=0)
997
- if position_ids is not None:
998
- position_ids = torch.nn.functional.pad(position_ids, (0, pad_length), value=0)
999
-
1000
- n, t_ = attention_mask.size()
1001
-
1002
- encoder_outputs = super().forward(
1003
- input_ids=input_ids,
1004
- attention_mask=attention_mask,
1005
- token_type_ids=token_type_ids,
1006
- position_ids=position_ids,
1007
- head_mask=head_mask,
1008
- inputs_embeds=inputs_embeds,
1009
- encoder_hidden_states=encoder_hidden_states,
1010
- encoder_attention_mask=encoder_attention_mask,
1011
- past_key_values=past_key_values,
1012
- use_cache=use_cache,
1013
- output_attentions=output_attentions,
1014
- output_hidden_states=output_hidden_states,
1015
- return_dict=return_dict
1016
- )
1017
-
1018
- sequence_output = encoder_outputs[0]
1019
- if self.pool_with_global:
1020
- sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
1021
-
1022
- diff = t - t_
1023
- n, _, d = sequence_output.size()
1024
- sequence_output = sequence_output[..., self.num_global_tokens:, :]
1025
-
1026
- # Adapt sequence to initial shape
1027
- if diff < 0:
1028
- sequence_output = sequence_output[:, :t]
1029
-
1030
- pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1031
-
1032
- if not return_dict:
1033
- return (sequence_output, pooled_output) + encoder_outputs[1:]
1034
-
1035
- encoder_outputs.last_hidden_state = sequence_output
1036
- encoder_outputs.pooler_output = pooled_output
1037
- return encoder_outputs
1038
-
1039
  def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1040
 
1041
  # Do not rely on original triangular mask from BERT/RoBERTa for causalLM
@@ -1054,7 +1023,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
1054
  return extended_attention_mask
1055
 
1056
 
1057
- class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1058
 
1059
  _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1060
  _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
@@ -1068,7 +1037,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1068
  logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
1069
 
1070
  self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1071
- self.lm_head = RobertaLMHead(config)
1072
 
1073
  # The LM head weights require special treatment only when they are tied with the word embeddings
1074
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1077,7 +1046,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1077
  self.post_init()
1078
 
1079
 
1080
- class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1081
  """
1082
  This class overrides :class:`~transformers.CamembertForMaskedLM`. Please check the superclass for the appropriate
1083
  documentation alongside usage examples.
@@ -1098,7 +1067,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1098
  )
1099
 
1100
  self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1101
- self.lm_head = RobertaLMHead(config)
1102
 
1103
  # The LM head weights require special treatment only when they are tied with the word embeddings
1104
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1107,7 +1076,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1107
  self.post_init()
1108
 
1109
 
1110
- class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, RobertaForSequenceClassification):
1111
  """
1112
  This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
1113
  appropriate documentation alongside usage examples.
@@ -1123,13 +1092,13 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, Roberta
1123
  self.config = config
1124
 
1125
  self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1126
- self.classifier = RobertaClassificationHead(config)
1127
 
1128
  # Initialize weights and apply final processing
1129
  self.post_init()
1130
 
1131
 
1132
- class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, RobertaForMultipleChoice):
1133
  """
1134
  This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
1135
  appropriate documentation alongside usage examples.
@@ -1149,7 +1118,7 @@ class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, RobertaForMulti
1149
  self.post_init()
1150
 
1151
 
1152
- class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, RobertaForTokenClassification):
1153
  """
1154
  This class overrides :class:`~transformers.CamembertForTokenClassification`. Please check the superclass for the
1155
  appropriate documentation alongside usage examples.
@@ -1175,7 +1144,7 @@ class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, RobertaFor
1175
  self.post_init()
1176
 
1177
 
1178
- class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel, RobertaForQuestionAnswering):
1179
  """
1180
  This class overrides :class:`~transformers.CamembertForQuestionAnswering`. Please check the superclass for the
1181
  appropriate documentation alongside usage examples.
 
1
  from logging import warn
2
+ from transformers.models.camembert.modeling_camembert import *
3
  import torch
4
  import torch.nn as nn
5
  from transformers.models.camembert.configuration_camembert import CamembertConfig
 
156
  # Apply the attention mask (precomputed for all layers in the CamembertModel forward() function)
157
  attention_scores = attention_scores + attention_mask
158
  del attention_mask
159
+
160
  # Normalize the attention scores to probabilities.
161
  attention_probs = nn.Softmax(dim=-1)(attention_scores)
162
 
 
198
  diagonal=-1
199
  )
200
  causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
201
+ attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]
202
 
203
  del attention_mask
204
 
 
296
  ).transpose(-1, -2)
297
  del sparse_mask
298
  del global_mask
299
+
300
  # expect (..., t, d) shape
301
  # Compute attention
302
  context_layer = self.attention(
 
391
  return x.reshape(*x.size()[:-2], n_blocks, -1, d)
392
 
393
 
394
+ class LSGCamembertEmbeddings(CamembertEmbeddings):
395
 
396
  def __init__(self, config):
397
  super().__init__(config)
 
447
  return embeddings
448
 
449
 
450
+ class LSGAttention(CamembertAttention):
451
 
452
  def __init__(self, config):
453
 
 
546
  keys = keys.sum(dim=-2) / (mask + 1e-6)
547
  values = values.sum(dim=-2) / (mask + 1e-6)
548
 
549
+ mask = (1. - mask.clamp(0, 1))
550
+ mask *= torch.finfo(mask.dtype).min
551
  return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
552
 
553
  def get_sparse_tokens_with_stride(self, keys, values, mask):
 
612
  keys /= mask + 1e-8
613
  values /= mask + 1e-8
614
 
615
+ mask = (1. - mask.clamp(0, 1))
616
+ mask *= torch.finfo(mask.dtype).min
617
 
618
  return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
619
 
 
881
  return x.reshape(n, h, -1, chunk_size, d)
882
 
883
 
884
+ class LSGCamembertLayer(CamembertLayer):
885
 
886
  def __init__(self, config):
887
 
 
893
  self.crossattention = LSGAttention(config)
894
 
895
 
896
+ class LSGCamembertEncoder(CamembertEncoder):
897
 
898
  def __init__(self, config):
899
 
 
901
 
902
  self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
903
 
904
+ assert hasattr(config, "num_global_tokens")
905
+ self.num_global_tokens = config.num_global_tokens
906
+ self.pad_idx = config.pad_token_id
907
+
908
+ assert hasattr(config, "block_size") and hasattr(config, "adaptive")
909
+ self.block_size = config.block_size
910
+ self.adaptive = config.adaptive
911
+ self.mask_first_token = config.mask_first_token
912
+ self.pool_with_global = config.pool_with_global
913
+
914
+ def forward(
915
+ self,
916
+ hidden_states: torch.Tensor,
917
+ attention_mask: Optional[torch.FloatTensor] = None,
918
+ head_mask: Optional[torch.FloatTensor] = None,
919
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
920
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
921
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
922
+ use_cache: Optional[bool] = None,
923
+ output_attentions: Optional[bool] = False,
924
+ output_hidden_states: Optional[bool] = False,
925
+ return_dict: Optional[bool] = True,
926
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
927
+
928
+ mask_value = torch.finfo(attention_mask.dtype).min
929
+ n, _, __, t = attention_mask.size()
930
+
931
+ if not (self.config.is_decoder and encoder_hidden_states is not None):
932
+ b = self.block_size * 2
933
+ pad = t % self.block_size
934
+
935
+ # Check whether t is a multiple of block_size; pad hidden_states and attention_mask if it is not
936
+ if self.adaptive and t > b and pad > 0:
937
+ pad_length = self.block_size - pad
938
+ hidden_states = torch.nn.functional.pad(hidden_states.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
939
+ attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=mask_value)
940
+
941
+ if self.mask_first_token:
942
+ attention_mask[..., 0] = mask_value
943
+
944
+ encoder_outputs = super().forward(
945
+ hidden_states=hidden_states,
946
+ attention_mask=attention_mask,
947
+ head_mask=head_mask,
948
+ encoder_hidden_states=encoder_hidden_states,
949
+ encoder_attention_mask=encoder_attention_mask,
950
+ past_key_values=past_key_values,
951
+ use_cache=use_cache,
952
+ output_attentions=output_attentions,
953
+ output_hidden_states=output_hidden_states,
954
+ return_dict=return_dict
955
+ )
956
+
957
+ sequence_output = encoder_outputs[0]
958
+ if self.pool_with_global:
959
+ sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
960
+
961
+ # Adapt sequence to initial shape
962
+ sequence_output = sequence_output[..., self.num_global_tokens: t + self.num_global_tokens, :]
963
 
964
+ if not return_dict:
965
+ return (sequence_output, ) + encoder_outputs[1:]
966
+
967
+ encoder_outputs.last_hidden_state = sequence_output
968
+ return encoder_outputs
969
+
970
+ class LSGCamembertPreTrainedModel(CamembertPreTrainedModel):
971
  """
972
  An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
973
  models.
 
976
  config_class = LSGCamembertConfig
977
 
978
  def _set_gradient_checkpointing(self, module, value=False):
979
+ if isinstance(module, (CamembertEncoder, LSGCamembertEncoder)):
980
  module.gradient_checkpointing = value
981
 
982
 
983
+ class LSGCamembertModel(LSGCamembertPreTrainedModel, CamembertModel):
984
  """
985
  This class overrides :class:`~transformers.CamembertModel`. Please check the superclass for the appropriate
986
  documentation alongside usage examples.
 
993
 
994
  LSGCamembertPreTrainedModel.__init__(self, config)
995
 
 
 
 
 
 
 
 
 
 
 
996
  self.embeddings = LSGCamembertEmbeddings(config)
997
  self.encoder = LSGCamembertEncoder(config)
998
+ self.pooler = CamembertPooler(config) if add_pooling_layer else None
999
 
1000
  if config.add_cross_attention:
1001
  logger.warning(
 
1005
  # Initialize weights and apply final processing
1006
  self.post_init()
1007
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
  def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1009
 
1010
  # Do not rely on original triangular mask from BERT/RoBERTa for causalLM
 
1023
  return extended_attention_mask
1024
 
1025
 
1026
+ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, CamembertForCausalLM):
1027
 
1028
  _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1029
  _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
 
1037
  logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
1038
 
1039
  self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1040
+ self.lm_head = CamembertLMHead(config)
1041
 
1042
  # The LM head weights require special treatment only when they are tied with the word embeddings
1043
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
 
1046
  self.post_init()
1047
 
1048
 
1049
+ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, CamembertForMaskedLM):
1050
  """
1051
  This class overrides :class:`~transformers.CamembertForMaskedLM`. Please check the superclass for the appropriate
1052
  documentation alongside usage examples.
 
1067
  )
1068
 
1069
  self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1070
+ self.lm_head = CamembertLMHead(config)
1071
 
1072
  # The LM head weights require special treatment only when they are tied with the word embeddings
1073
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
 
1076
  self.post_init()
1077
 
1078
 
1079
+ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, CamembertForSequenceClassification):
1080
  """
1081
  This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
1082
  appropriate documentation alongside usage examples.
 
1092
  self.config = config
1093
 
1094
  self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1095
+ self.classifier = CamembertClassificationHead(config)
1096
 
1097
  # Initialize weights and apply final processing
1098
  self.post_init()
1099
 
1100
 
1101
+ class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, CamembertForMultipleChoice):
1102
  """
1103
  This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
1104
  appropriate documentation alongside usage examples.
 
1118
  self.post_init()
1119
 
1120
 
1121
+ class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, CamembertForTokenClassification):
1122
  """
1123
  This class overrides :class:`~transformers.CamembertForTokenClassification`. Please check the superclass for the
1124
  appropriate documentation alongside usage examples.
 
1144
  self.post_init()
1145
 
1146
 
1147
+ class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel, CamembertForQuestionAnswering):
1148
  """
1149
  This class overrides :class:`~transformers.CamembertForQuestionAnswering`. Please check the superclass for the
1150
  appropriate documentation alongside usage examples.