Jackmin108 committed on
Commit
389ddad
·
1 Parent(s): ccb6a83

Init Model

Browse files
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Jackmin108/bert-base-uncased",
3
+ "architectures": [
4
+ "MyBertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_bert.MyBertConfig",
9
+ "AutoModelForMaskedLM": "modeling_bert.MyBertForMaskedLM",
10
+ "AutoModel": "modeling_bert.MyBertModel",
11
+ "AutoModelForSequenceClassification": "modeling_bert.MyBertForSequenceClassification"
12
+ },
13
+ "gradient_checkpointing": false,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 768,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "layer_norm_eps": 1e-12,
20
+ "max_position_embeddings": 512,
21
+ "model_type": "bert",
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 12,
24
+ "pad_token_id": 0,
25
+ "position_embedding_type": "absolute",
26
+ "transformers_version": "4.6.0.dev0",
27
+ "type_vocab_size": 2,
28
+ "use_cache": true,
29
+ "vocab_size": 30522
30
+ }
31
+
configuration_bert.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ BERT model configuration"""
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig
22
+ from transformers.utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
# Maps every stock BERT checkpoint identifier to the URL of its `config.json` on the Hugging Face Hub.
# All URLs share the same layout, so the table is derived from the ordered list of identifiers.
# See all BERT models at https://huggingface.co/models?filter=bert
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in (
        "bert-base-uncased",
        "bert-large-uncased",
        "bert-base-cased",
        "bert-large-cased",
        "bert-base-multilingual-uncased",
        "bert-base-multilingual-cased",
        "bert-base-chinese",
        "bert-base-german-cased",
        "bert-large-uncased-whole-word-masking",
        "bert-large-cased-whole-word-masking",
        "bert-large-uncased-whole-word-masking-finetuned-squad",
        "bert-large-cased-whole-word-masking-finetuned-squad",
        "bert-base-cased-finetuned-mrpc",
        "bert-base-german-dbmdz-cased",
        "bert-base-german-dbmdz-uncased",
        "cl-tohoku/bert-base-japanese",
        "cl-tohoku/bert-base-japanese-whole-word-masking",
        "cl-tohoku/bert-base-japanese-char",
        "cl-tohoku/bert-base-japanese-char-whole-word-masking",
        "TurkuNLP/bert-base-finnish-cased-v1",
        "TurkuNLP/bert-base-finnish-uncased-v1",
        "wietsedv/bert-base-dutch-cased",
    )
}
70
+
71
+
72
class MyBertConfig(PretrainedConfig):
    r"""
    Hyper-parameter container for the BERT variant defined in this repository.

    The constructor records every architecture setting as an attribute of the same name. `pad_token_id` and any
    keyword argument that is not listed below (for instance `is_decoder`) are handed to
    [`PretrainedConfig`]'s constructor. With all defaults the values match the `config.json` of a
    [bert-base-uncased](https://huggingface.co/bert-base-uncased) style model.

    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Number of distinct token ids the word embedding table can represent.
        hidden_size (`int`, *optional*, defaults to 768):
            Width of the encoder layers and of the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads in each attention layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Width of the "intermediate" (feed-forward) layer of the encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            Non-linear activation used in the encoder and pooler. As a string, `"gelu"`, `"relu"`, `"silu"` and
            `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout probability of the fully connected layers in the embeddings, encoder and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout ratio applied to the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            Longest sequence length the model may be used with (e.g. 512, 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 2):
            Number of distinct values accepted for `token_type_ids`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer used for the weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            Epsilon used by the layer normalization layers.
        pad_token_id (`int`, *optional*, defaults to 0):
            Id of the padding token; forwarded to [`PretrainedConfig`].
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
            One of `"absolute"`, `"relative_key"` or `"relative_key_query"`. For `"relative_key"` see
            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155);
            for `"relative_key_query"` see *Method 4* in [Improve Transformer Models with Better Relative Position
            Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/value attentions. Only relevant if
            `config.is_decoder=True`.
        classifier_dropout (`float`, *optional*):
            Dropout ratio of the classification head.

    Examples:

    ```python
    >>> # A bert-base-uncased style configuration
    >>> configuration = MyBertConfig()

    >>> # Any setting can be overridden by keyword
    >>> small = MyBertConfig(hidden_size=256, num_attention_heads=4, num_hidden_layers=4)
    ```"""

    model_type = "bert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        **kwargs,
    ):
        # The base class owns the padding id and every generic option passed through `kwargs`.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Encoder geometry and activation.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size

        # Regularisation.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        # Embedding tables, initialisation and normalisation.
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type

        # Decoding cache and task head.
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
178
+
179
+
180
class MyBertOnnxConfig(OnnxConfig):
    """Describes the model inputs and their dynamic axes for ONNX export."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return an ordered mapping from input name to its dynamic axes.

        The axes are `batch, choice, sequence` when `self.task` is `"multiple-choice"` and `batch, sequence`
        otherwise. The three inputs share one and the same axes dictionary.
        """
        axes = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if self.task == "multiple-choice"
            else {0: "batch", 1: "sequence"}
        )
        input_names = ("input_ids", "attention_mask", "token_type_ids")
        return OrderedDict((input_name, axes) for input_name in input_names)
modeling_bert.py ADDED
@@ -0,0 +1,1890 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch BERT model."""
17
+
18
+
19
+ import math
20
+ import os
21
+ import warnings
22
+ from dataclasses import dataclass
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.modeling_outputs import (
32
+ BaseModelOutputWithPastAndCrossAttentions,
33
+ BaseModelOutputWithPoolingAndCrossAttentions,
34
+ CausalLMOutputWithCrossAttentions,
35
+ MaskedLMOutput,
36
+ MultipleChoiceModelOutput,
37
+ NextSentencePredictorOutput,
38
+ QuestionAnsweringModelOutput,
39
+ SequenceClassifierOutput,
40
+ TokenClassifierOutput,
41
+ )
42
+ from transformers.modeling_utils import PreTrainedModel
43
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
44
+ from transformers.utils import (
45
+ ModelOutput,
46
+ add_code_sample_docstrings,
47
+ add_start_docstrings,
48
+ add_start_docstrings_to_model_forward,
49
+ logging,
50
+ replace_return_docstrings,
51
+ )
52
+ from .configuration_bert import MyBertConfig
53
+
54
+
55
+ logger = logging.get_logger(__name__)
56
+
57
# Checkpoint and configuration names used for documentation
# (presumably consumed by the docstring decorators further down the file; confirm against their call sites).
_CHECKPOINT_FOR_DOC = "bert-base-uncased"
_CONFIG_FOR_DOC = "MyBertConfig"

# TokenClassification docstring: example checkpoint, expected labels, expected loss
_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english"
# NOTE: the trailing space inside the string is part of the expected value.
_TOKEN_CLASS_EXPECTED_OUTPUT = "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] "
_TOKEN_CLASS_EXPECTED_LOSS = 0.01

# QuestionAnswering docstring: example checkpoint, expected answer, expected loss, target span
_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2"
_QA_EXPECTED_OUTPUT = "'a nice puppet'"
_QA_EXPECTED_LOSS = 7.41
_QA_TARGET_START_INDEX, _QA_TARGET_END_INDEX = 14, 15

# SequenceClassification docstring: example checkpoint, expected label, expected loss
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity"
_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'"
_SEQ_CLASS_EXPECTED_LOSS = 0.01
78
+
79
+
80
# Identifiers of the stock BERT checkpoints, grouped by publisher; the resulting order is significant.
# See all BERT models at https://huggingface.co/models?filter=bert
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    # Checkpoints published without an organisation prefix.
    *(
        f"bert-{variant}"
        for variant in (
            "base-uncased",
            "large-uncased",
            "base-cased",
            "large-cased",
            "base-multilingual-uncased",
            "base-multilingual-cased",
            "base-chinese",
            "base-german-cased",
            "large-uncased-whole-word-masking",
            "large-cased-whole-word-masking",
            "large-uncased-whole-word-masking-finetuned-squad",
            "large-cased-whole-word-masking-finetuned-squad",
            "base-cased-finetuned-mrpc",
            "base-german-dbmdz-cased",
            "base-german-dbmdz-uncased",
        )
    ),
    # Japanese checkpoints from cl-tohoku.
    *(
        f"cl-tohoku/bert-base-japanese{variant}"
        for variant in ("", "-whole-word-masking", "-char", "-char-whole-word-masking")
    ),
    # Finnish checkpoints from TurkuNLP.
    *(f"TurkuNLP/bert-base-finnish-{casing}-v1" for casing in ("cased", "uncased")),
    "wietsedv/bert-base-dutch-cased",
]
105
+
106
+
107
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Copy the variables of a TensorFlow checkpoint into the matching parameters of `model`.

    Each TensorFlow variable name is split on `/` and the resulting scopes are followed as attribute
    lookups starting from `model`; the parameter reached at the end receives the variable's value.

    Args:
        model: PyTorch module to fill in place.
        config: Not read by this function.
        tf_checkpoint_path: Path of the TensorFlow checkpoint; converted to an absolute path before reading.

    Returns:
        The `model` object that was passed in.

    Raises:
        ImportError: Re-raised (after an error is logged) when `re`, `numpy` or `tensorflow` cannot be imported.
        ValueError: When a target parameter and the checkpoint array have different shapes.
    """
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")

    # First pass: read every variable of the checkpoint into memory.
    checkpoint_weights = []
    for name, shape in tf.train.list_variables(tf_path):
        logger.info(f"Loading TF weight {name} with shape {shape}")
        checkpoint_weights.append((name, tf.train.load_variable(tf_path, name)))

    # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
    # which are not required for using pretrained model
    optimizer_scopes = ("adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step")

    # Second pass: resolve each variable to a parameter of `model` and overwrite its data.
    for variable_name, array in checkpoint_weights:
        name = variable_name.split("/")
        if any(scope in optimizer_scopes for scope in name):
            logger.info(f"Skipping {'/'.join(name)}")
            continue

        pointer = model
        for m_name in name:
            # "layer_3" style scopes are split into the attribute name and the index into it.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]

            attribute = scope_names[0]
            if attribute in ("kernel", "gamma", "output_weights"):
                pointer = pointer.weight
            elif attribute in ("output_bias", "beta"):
                pointer = pointer.bias
            elif attribute == "squad":
                pointer = pointer.classifier
            else:
                try:
                    pointer = getattr(pointer, attribute)
                except AttributeError:
                    # Only this scope is skipped: the walk resumes with the next scope from the same pointer.
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue

            if len(scope_names) >= 2:
                pointer = pointer[int(scope_names[1])]

        # `m_name` is now the last scope of the variable name.
        if m_name.endswith("_embeddings"):
            pointer = pointer.weight
        elif m_name == "kernel":
            array = np.transpose(array)

        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise

        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)

    return model
178
+
179
+
180
class MyBertEmbeddings(nn.Module):
    """Build the input representation from word, token-type and (optionally) absolute position embeddings.

    The summed embeddings go through layer normalization and then dropout.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        max_positions = config.max_position_embeddings

        self.word_embeddings = nn.Embedding(config.vocab_size, hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(max_positions, hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden_size)

        # The attribute is named `LayerNorm` (not snake_case) so that its name matches the TensorFlow
        # variable name and TensorFlow checkpoint files can be loaded.
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        # Default ids, shape (1, max_position_embeddings); registered as non-persistent buffers.
        default_position_ids = torch.arange(max_positions).expand((1, -1))
        self.register_buffer("position_ids", default_position_ids, persistent=False)
        self.register_buffer(
            "token_type_ids", torch.zeros(default_position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        """Embed a batch of token ids (or pre-computed word embeddings).

        Args:
            input_ids: Token ids; may be omitted when `inputs_embeds` is given.
            token_type_ids: Segment ids; defaults to the all-zero buffer cut to the sequence length.
            position_ids: Position ids; defaults to consecutive positions starting at `past_key_values_length`.
            inputs_embeds: Word embeddings used instead of looking up `input_ids`.
            past_key_values_length: Offset applied to the default position ids.

        Returns:
            The normalized embeddings after dropout.
        """
        input_shape = input_ids.size() if input_ids is not None else inputs_embeds.size()[:-1]
        seq_length = input_shape[1]

        if position_ids is None:
            first_position = past_key_values_length
            position_ids = self.position_ids[:, first_position : seq_length + first_position]

        # When token_type_ids is not supplied, fall back to the all-zero buffer registered in the constructor.
        # Using the buffer helps users who trace the model without passing token_type_ids (issue #5664).
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                token_type_ids = self.token_type_ids[:, :seq_length].expand(input_shape[0], seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
        if self.position_embedding_type == "absolute":
            embeddings += self.position_embeddings(position_ids)

        return self.dropout(self.LayerNorm(embeddings))
242
+
243
+
244
class MyBertSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional relative position scores.

    The same module serves as self-attention (keys and values come from `hidden_states`) and as
    cross-attention (keys and values come from `encoder_hidden_states`).
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        hidden_size = config.hidden_size
        indivisible = hidden_size % config.num_attention_heads != 0
        if indivisible and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # An explicit argument wins over the configuration, which in turn defaults to "absolute".
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            # One embedding per signed distance in [-(max - 1), max - 1].
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into heads and move the head axis in front of the sequence axis."""
        per_head_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(per_head_shape).permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Compute the attention output for `hidden_states`.

        Args:
            hidden_states: Input from which the queries are projected.
            attention_mask: Additive mask added to the scaled scores (ignored for cross-attention).
            head_mask: Multiplicative mask applied to the attention probabilities.
            encoder_hidden_states: When given, keys and values are projected from it (cross-attention).
            encoder_attention_mask: Additive mask used in place of `attention_mask` for cross-attention.
            past_key_value: Cached `(key, value)` pair; reused as-is for cross-attention, otherwise the new
                keys/values are appended to it along dimension 2.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            `(context,)`, followed by the attention probabilities when `output_attentions` is set, followed by
            the `(key, value)` pair when the module was configured as a decoder.
        """
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        # Cached states must be inspected before `past_key_value` is overwritten further down.
        use_cache = past_key_value is not None

        if encoder_hidden_states is not None:
            # Cross-attention: keys and values come from the encoder, and the mask must hide the
            # encoder's padding tokens.
            attention_mask = encoder_attention_mask
            if past_key_value is not None:
                # Reuse the cached cross-attention keys and values.
                key_layer, value_layer = past_key_value[0], past_key_value[1]
            else:
                key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
                value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            if past_key_value is not None:
                # Uni-directional self-attention: extend the cached keys and values with the new ones.
                key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
                value_layer = torch.cat([past_key_value[1], value_layer], dim=2)

        if self.is_decoder:
            # A decoder hands back the keys/values it used so that later calls can reuse them:
            # cross-attention reuses them unchanged, self-attention concatenates new states to them.
            # For bi-directional encoder self-attention `past_key_value` is always `None`.
            past_key_value = (key_layer, value_layer)

        # Raw attention scores: dot product between every query and every key.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            device = hidden_states.device
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=device).view(-1, 1)
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=device).view(1, -1)
            distance = position_ids_l - position_ids_r

            # Shift the distances so that they index into the embedding table.
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            query_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
            if self.position_embedding_type == "relative_key":
                attention_scores = attention_scores + query_position_scores
            else:
                key_position_scores = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + query_position_scores + key_position_scores

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The mask is additive (precomputed for all layers in the model's forward() function).
            attention_scores = attention_scores + attention_mask

        # Turn the scores into probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # Dropout here removes entire tokens to attend to, which might seem unusual but is taken
        # from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the values, then merge the heads back into one feature dimension.
        context_layer = torch.matmul(attention_probs, value_layer).permute(0, 2, 1, 3).contiguous()
        merged_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(merged_shape)

        outputs = (context_layer,)
        if output_attentions:
            outputs = outputs + (attention_probs,)
        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
376
+
377
+
378
class MyBertSelfOutput(nn.Module):
    """Output stage of the attention block: dense, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states`, apply dropout, add `input_tensor` and normalize the sum."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
390
+
391
+
392
class MyBertAttention(nn.Module):
    """Pairs the self-attention module with its output projection and supports head pruning."""

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = MyBertSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = MyBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the query/key/value and output projections."""
        if len(heads) == 0:
            return

        attn = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Shrink the three input projections, then the matching input dim of the output dense.
        for proj_name in ("query", "key", "value"):
            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the bookkeeping attributes in sync with the pruned projections.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run self-attention, then the output projection with `hidden_states` as residual.

        Returns a tuple whose first element is the projected attention output; every
        further element produced by the self-attention module is appended unchanged.
        """
        attn_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(attn_results[0], hidden_states)
        # Forward whatever extras (attention weights, caches) the attention module returned.
        return (projected,) + attn_results[1:]
439
+
440
+
441
class MyBertIntermediate(nn.Module):
    """Feed-forward expansion: dense projection to `intermediate_size` plus activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string names an entry of ACT2FN; anything else is used as the callable itself.
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the dense projection followed by the configured activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))
454
+
455
+
456
class MyBertOutput(nn.Module):
    """Feed-forward contraction: dense back to `hidden_size`, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states`, apply dropout, add `input_tensor` and normalize the sum."""
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)
468
+
469
+
470
class MyBertLayer(nn.Module):
    """One transformer layer: self-attention, optional cross-attention, then feed-forward."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = MyBertAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention and not self.is_decoder:
            raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
        if self.add_cross_attention:
            self.crossattention = MyBertAttention(config, position_embedding_type="absolute")
        self.intermediate = MyBertIntermediate(config)
        self.output = MyBertOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Apply the layer and return `(layer_output, *attention_extras[, present_key_value])`.

        The trailing `present_key_value` entry is only appended when the layer is a decoder.

        Raises:
            ValueError: if `encoder_hidden_states` is given to a decoder layer that was built
                without cross-attention.
        """
        # The first two entries of the cache belong to the self-attention.
        self_cache = None if past_key_value is None else past_key_value[:2]
        self_attn_out = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_cache,
        )
        attention_output = self_attn_out[0]

        if self.is_decoder:
            # A decoder's last self-attention output is its key/value cache.
            extras = self_attn_out[1:-1]
            present_key_value = self_attn_out[-1]
        else:
            extras = self_attn_out[1:]  # self-attention weights, if they were requested

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # The last two entries of the cache belong to the cross-attention.
            cross_cache = None if past_key_value is None else past_key_value[-2:]
            cross_attn_out = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_cache,
                output_attentions,
            )
            attention_output = cross_attn_out[0]
            extras = extras + cross_attn_out[1:-1]  # cross-attention weights, if requested

            # Extend the self-attention cache with the cross-attention cache.
            present_key_value = present_key_value + cross_attn_out[-1]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        result = (layer_output,) + extras

        if self.is_decoder:
            # Decoders hand their key/value cache back as the final element.
            result = result + (present_key_value,)
        return result

    def feed_forward_chunk(self, attention_output):
        """Feed-forward sub-layer applied to one chunk of the attention output."""
        return self.output(self.intermediate(attention_output), attention_output)
554
+
555
+
556
class MyBertEncoder(nn.Module):
    """Sequential stack of `config.num_hidden_layers` `MyBertLayer` modules."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([MyBertLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """Feed `hidden_states` through every layer in order.

        Args:
            hidden_states: input to the first layer; each layer's first output feeds the next.
            attention_mask: passed unchanged to every layer.
            head_mask: indexed per layer (`head_mask[i]`) when not `None`.
            encoder_hidden_states: passed unchanged to every layer.
            encoder_attention_mask: passed unchanged to every layer.
            past_key_values: indexed per layer (`past_key_values[i]`) when not `None`.
            use_cache: when truthy, the last output of each layer is collected as the new cache.
                Forced to `False` while training with gradient checkpointing.
            output_attentions: when truthy, collect each layer's attention outputs.
            output_hidden_states: when truthy, collect the input of every layer plus the final
                output.
            return_dict: when falsy, return a tuple of the non-`None` results instead of a
                `BaseModelOutputWithPastAndCrossAttentions`.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                # Bind this iteration's `past_key_value` as a default argument. A plain closure
                # would look the name up lazily, so a checkpoint recomputation that runs after
                # the loop has advanced would see a later layer's cache, not this layer's.
                def create_custom_forward(module, past_key_value=past_key_value):
                    def custom_forward(*inputs):
                        return module(*inputs, past_key_value, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                # A decoder layer returns its key/value cache as the last element.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    # NOTE(review): index 2 is only a cross-attention tensor when the layer
                    # actually ran cross-attention (i.e. `encoder_hidden_states` was given).
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            # Tuple form drops every result that was not requested.
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
652
+
653
+
654
class MyBertPooler(nn.Module):
    """Pools a sequence by passing its first token's hidden state through dense + tanh."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return the transformed hidden state found at index 0 of the second dimension."""
        # "Pooling" here just means picking the first token of every sequence.
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))
667
+
668
+
669
class MyBertPredictionHeadTransform(nn.Module):
    """Transform applied before the LM decoder: dense, activation, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        activation = config.hidden_act
        # A string names an entry of ACT2FN; anything else is used as the callable itself.
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply dense, the configured activation and LayerNorm, in that order."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)
684
+
685
+
686
class MyBertLMPredictionHead(nn.Module):
    """Language-modeling head: head transform followed by a projection onto the vocabulary."""

    def __init__(self, config):
        super().__init__()
        vocab_size = config.vocab_size
        self.transform = MyBertPredictionHeadTransform(config)

        # The projection is created without its own bias; a separate per-token bias
        # parameter is defined below and attached to it.
        self.decoder = nn.Linear(config.hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))

        # Link the two so that the bias is correctly resized with `resize_token_embeddings`.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return vocabulary scores for `hidden_states`."""
        return self.decoder(self.transform(hidden_states))
704
+
705
+
706
class MyBertOnlyMLMHead(nn.Module):
    """Wrapper exposing only the masked-LM prediction head under the name `predictions`."""

    def __init__(self, config):
        super().__init__()
        self.predictions = MyBertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        """Return the prediction scores computed from `sequence_output`."""
        return self.predictions(sequence_output)
714
+
715
+
716
class MyBertOnlyNSPHead(nn.Module):
    """Two-way linear classifier applied to the pooled output."""

    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        """Return the two sequence-relationship scores for `pooled_output`."""
        return self.seq_relationship(pooled_output)
724
+
725
+
726
class MyBertPreTrainingHeads(nn.Module):
    """Both pre-training heads: LM prediction and two-way sequence relationship."""

    def __init__(self, config):
        super().__init__()
        self.predictions = MyBertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        """Return `(prediction_scores, seq_relationship_score)`."""
        lm_scores = self.predictions(sequence_output)
        relationship_scores = self.seq_relationship(pooled_output)
        return lm_scores, relationship_scores
736
+
737
+
738
class MyBertPreTrainedModel(PreTrainedModel):
    """
    Abstract base that supplies weight initialization and a simple interface for downloading and
    loading pretrained models.
    """

    config_class = MyBertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Plain normal init; the TF version uses truncated_normal instead
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            pad = module.padding_idx
            if pad is not None:
                # The padding row is reset to zeros after the random init.
                module.weight.data[pad].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
        """Toggle gradient checkpointing on `module` when it is a `MyBertEncoder`."""
        if not isinstance(module, MyBertEncoder):
            return
        module.gradient_checkpointing = value
768
+
769
+
770
@dataclass
class MyBertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`MyBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss: the masked language modeling loss plus the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Scores of the language modeling head for each vocabulary token, before SoftMax.
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Scores of the next sequence prediction (classification) head for True/False
            continuation, before SoftMax.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: torch.FloatTensor = None
    seq_relationship_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
802
+
803
+
804
+ BERT_START_DOCSTRING = r"""
805
+
806
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
807
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
808
+ etc.)
809
+
810
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
811
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
812
+ and behavior.
813
+
814
+ Parameters:
815
+ config ([`BertConfig`]): Model configuration class with all the parameters of the model.
816
+ Initializing with a config file does not load the weights associated with the model, only the
817
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
818
+ """
819
+
820
+ BERT_INPUTS_DOCSTRING = r"""
821
+ Args:
822
+ input_ids (`torch.LongTensor` of shape `({0})`):
823
+ Indices of input sequence tokens in the vocabulary.
824
+
825
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
826
+ [`PreTrainedTokenizer.__call__`] for details.
827
+
828
+ [What are input IDs?](../glossary#input-ids)
829
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
830
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
831
+
832
+ - 1 for tokens that are **not masked**,
833
+ - 0 for tokens that are **masked**.
834
+
835
+ [What are attention masks?](../glossary#attention-mask)
836
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
837
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
838
+ 1]`:
839
+
840
+ - 0 corresponds to a *sentence A* token,
841
+ - 1 corresponds to a *sentence B* token.
842
+
843
+ [What are token type IDs?](../glossary#token-type-ids)
844
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
845
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
846
+ config.max_position_embeddings - 1]`.
847
+
848
+ [What are position IDs?](../glossary#position-ids)
849
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
850
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
851
+
852
+ - 1 indicates the head is **not masked**,
853
+ - 0 indicates the head is **masked**.
854
+
855
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
856
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
857
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
858
+ model's internal embedding lookup matrix.
859
+ output_attentions (`bool`, *optional*):
860
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
861
+ tensors for more detail.
862
+ output_hidden_states (`bool`, *optional*):
863
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
864
+ more detail.
865
+ return_dict (`bool`, *optional*):
866
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
867
+ """
868
+
869
+
870
@add_start_docstrings(
    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
    BERT_START_DOCSTRING,
)
class MyBertModel(MyBertPreTrainedModel):
    """

    The model can act as an encoder (self-attention only) or as a decoder, in which case a layer of cross-attention
    is added between the self-attention layers, following the architecture described in [Attention is all you
    need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion
    Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument
    and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward
    pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = MyBertEmbeddings(config)
        self.encoder = MyBertEncoder(config)

        # The pooler is optional; without it `forward` yields `None` as pooled output.
        self.pooler = MyBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_idx, head_ids in heads_to_prune.items():
            self.encoder.layer[layer_idx].attention.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        cfg = self.config

        # Arguments left as `None` fall back to the configuration.
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Caching is only meaningful for a decoder.
        if not cfg.is_decoder:
            use_cache = False
        elif use_cache is None:
            use_cache = cfg.use_cache

        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        batch_size, seq_length = input_shape
        device = inputs_embeds.device if input_ids is None else input_ids.device

        # Number of positions already covered by the cache (0 without a cache).
        past_key_values_length = 0 if past_key_values is None else past_key_values[0][0].shape[2]

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                # Reuse the buffer registered on the embeddings, cut to the current length.
                buffered = self.embeddings.token_type_ids[:, :seq_length]
                token_type_ids = buffered.expand(batch_size, seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        encoder_extended_attention_mask = None
        if cfg.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(
                    (encoder_batch_size, encoder_sequence_length), device=device
                )
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, cfg.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = None if self.pooler is None else self.pooler(sequence_output)

        if return_dict:
            return BaseModelOutputWithPoolingAndCrossAttentions(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                past_key_values=encoder_outputs.past_key_values,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
                cross_attentions=encoder_outputs.cross_attentions,
            )

        # Tuple form: sequence output, pooled output, then the encoder's remaining outputs.
        return (sequence_output, pooled_output) + encoder_outputs[1:]
1048
+
1049
+
1050
+ @add_start_docstrings(
1051
+ """
1052
+ Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
1053
+ sentence prediction (classification)` head.
1054
+ """,
1055
+ BERT_START_DOCSTRING,
1056
+ )
1057
class MyBertForPreTraining(MyBertPreTrainedModel):
    # Weight keys declared as tied for this model.
    # NOTE(review): the first key lacks the `cls.` prefix used by the second one - confirm this is intended.
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        # Default construction of the encoder (other heads in this file pass `add_pooling_layer=False`);
        # forward() reads both the sequence output and a pooled output from it.
        self.bert = MyBertModel(config)
        self.cls = MyBertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the decoder module of the masked-LM prediction head."""
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder module of the masked-LM prediction head with `new_embeddings`."""
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MyBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MyBertForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

            The total loss is only computed when both `labels` and `next_sentence_label` are provided.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer
        >>> import torch

        >>> # `MyBertForPreTraining` is the class defined in this module; it is not exported by `transformers`.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> model = MyBertForPreTraining.from_pretrained("bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first two encoder outputs feed the MLM head and the NSP head respectively.
        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        # Both label tensors are required; with only one of them no loss is returned.
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            # Tuple form: (loss?, prediction_scores, seq_relationship_score, *remaining encoder outputs).
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MyBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
1158
+
1159
+
1160
@add_start_docstrings(
    """MyBert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING
)
class MyBertLMHeadModel(MyBertPreTrainedModel):
    # Weight keys declared as tied for this model.
    # NOTE(review): the first key lacks the `cls.` prefix used by the second one - confirm this is intended.
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `MyBertLMHeadModel` as a standalone, add `is_decoder=True.`")

        # Only the per-token sequence output is used by the LM head, so the pooler is not built.
        self.bert = MyBertModel(config, add_pooling_layer=False)
        self.cls = MyBertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the decoder module of the LM prediction head."""
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder module of the LM prediction head with `new_embeddings`."""
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.vocab_size - 1]`. Passing `labels` forces `use_cache=False`.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.num_hidden_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            # Caching is switched off whenever a loss is requested.
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple form: (loss?, prediction_scores, *encoder outputs from index 2 on).
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs
    ):
        """Build the forward() keyword arguments for one generation step.

        A missing attention mask defaults to all ones over `input_ids`; when a cache is supplied only the last
        token of `input_ids` is kept. Extra `model_kwargs` are accepted and ignored.
        """
        input_shape = input_ids.shape
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    def _reorder_cache(self, past_key_values, beam_idx):
        """Re-index every cached tensor along dim 0 with `beam_idx`; returns a tuple of per-layer tuples."""
        # `index_select` needs the index on the same device as the tensor it indexes, so move `beam_idx`
        # to each cached tensor's device (a no-op when they already match).
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
1299
+
1300
+
1301
@add_start_docstrings("""MyBert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
class MyBertForMaskedLM(MyBertPreTrainedModel):
    # Weight keys declared as tied for this model.
    # NOTE(review): the first key lacks the `cls.` prefix used by the second one - confirm this is intended.
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `MyBertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        # Only the per-token sequence output is used by the MLM head, so the pooler is not built.
        self.bert = MyBertModel(config, add_pooling_layer=False)
        self.cls = MyBertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the decoder module of the MLM prediction head."""
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the decoder module of the MLM prediction head with `new_embeddings`."""
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'paris'",
        expected_loss=0.88,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # Positions labelled -100 are skipped: that is CrossEntropyLoss's default `ignore_index`.
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple form: (loss?, prediction_scores, *encoder outputs from index 2 on).
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        """Append one PAD token (with attention 0) to every sequence and return the forward() inputs.

        Args:
            input_ids: token ids; the first dimension is used as the batch size.
            attention_mask: mask matching `input_ids`; when omitted, an all-ones mask is used.
            **model_kwargs: accepted and ignored.

        Raises:
            ValueError: if `config.pad_token_id` is not set.
        """
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        #  add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        # `attention_mask` defaults to None in the signature; without this fallback the concatenation below
        # would fail on `None.new_zeros`. Mirrors `MyBertLMHeadModel.prepare_inputs_for_generation`.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}
1406
+
1407
+
1408
@add_start_docstrings(
    """MyBert Model with a `next sentence prediction (classification)` head on top.""",
    BERT_START_DOCSTRING,
)
class MyBertForNextSentencePrediction(MyBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Default construction of the encoder; forward() reads a pooled output from it (index 1).
        self.bert = MyBertModel(config)
        self.cls = MyBertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
            Only the deprecated `next_sentence_label` key is read; when present it overrides `labels` and a
            `FutureWarning` is emitted.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer
        >>> import torch

        >>> # `MyBertForNextSentencePrediction` is the class defined in this module; it is not exported by
        >>> # `transformers`.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> model = MyBertForNextSentencePrediction.from_pretrained("bert-base-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        """

        # Backward compatibility: the legacy keyword replaces `labels` when supplied.
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Two-way classification: the scores are viewed as (-1, 2).
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

        if not return_dict:
            # Tuple form: (loss?, seq_relationship_scores, *encoder outputs from index 2 on).
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
1508
+
1509
+
1510
@add_start_docstrings(
    """
    MyBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    BERT_START_DOCSTRING,
)
class MyBertForSequenceClassification(MyBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = MyBertModel(config)

        # The classifier-specific dropout rate wins when configured; otherwise reuse the hidden dropout rate.
        dropout_rate = config.classifier_dropout
        if dropout_rate is None:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the encoder outputs is the pooled representation fed to the classifier.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        loss = None
        if labels is not None:
            # On first use, infer the problem type from the label count and dtype and store it on the config.
            if self.config.problem_type is None:
                has_integer_labels = labels.dtype in (torch.long, torch.int)
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and has_integer_labels:
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: (loss?, logits, *encoder outputs from index 2 on).
        tuple_output = (logits,) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
1611
+
1612
+
1613
@add_start_docstrings(
    """
    MyBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    BERT_START_DOCSTRING,
)
class MyBertForMultipleChoice(MyBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = MyBertModel(config)

        # The classifier-specific dropout rate wins when configured; otherwise reuse the hidden dropout rate.
        dropout_rate = config.classifier_dropout
        if dropout_rate is None:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_rate)
        # One score per flattened (example, choice) row; rows are regrouped per example in forward().
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The number of choices is the second dimension of whichever input was supplied.
        choice_source = input_ids if input_ids is not None else inputs_embeds
        num_choices = choice_source.shape[1]

        def merge_leading_dims(tensor):
            # Collapse every dimension except the last one; None is passed through untouched.
            if tensor is None:
                return None
            return tensor.view(-1, tensor.size(-1))

        input_ids = merge_leading_dims(input_ids)
        attention_mask = merge_leading_dims(attention_mask)
        token_type_ids = merge_leading_dims(token_type_ids)
        position_ids = merge_leading_dims(position_ids)
        if inputs_embeds is not None:
            # Embeddings keep their last two dimensions.
            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))

        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the encoder outputs is the pooled representation fed to the classifier.
        flat_scores = self.classifier(self.dropout(encoder_outputs[1]))
        # Regroup the per-row scores so that each row holds the scores of one example's choices.
        reshaped_logits = flat_scores.view(-1, num_choices)

        loss = None
        if labels is not None:
            criterion = CrossEntropyLoss()
            loss = criterion(reshaped_logits, labels)

        if return_dict:
            return MultipleChoiceModelOutput(
                loss=loss,
                logits=reshaped_logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: (loss?, reshaped_logits, *encoder outputs from index 2 on).
        tuple_output = (reshaped_logits,) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
1705
+
1706
+
1707
@add_start_docstrings(
    """
    MyBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BERT_START_DOCSTRING,
)
class MyBertForTokenClassification(MyBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Only the per-token sequence output is classified, so the pooler is not built.
        self.bert = MyBertModel(config, add_pooling_layer=False)

        # The classifier-specific dropout rate wins when configured; otherwise reuse the hidden dropout rate.
        dropout_rate = config.classifier_dropout
        if dropout_rate is None:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
        expected_loss=_TOKEN_CLASS_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 0 of the encoder outputs is the per-token sequence output.
        logits = self.classifier(self.dropout(encoder_outputs[0]))

        loss = None
        if labels is not None:
            criterion = CrossEntropyLoss()
            # Flatten to one row per token before scoring against the flattened labels.
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: (loss?, logits, *encoder outputs from index 2 on).
        tuple_output = (logits,) + encoder_outputs[2:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output
1788
+
1789
+
1790
@add_start_docstrings(
    """
    MyBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BERT_START_DOCSTRING,
)
class MyBertForQuestionAnswering(MyBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Only the per-token sequence output is used, so the pooler is not built.
        self.bert = MyBertModel(config, add_pooling_layer=False)
        # forward() unpacks this layer's output into exactly two tensors (start, end).
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_QA,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        qa_target_start_index=_QA_TARGET_START_INDEX,
        qa_target_end_index=_QA_TARGET_END_INDEX,
        expected_output=_QA_EXPECTED_OUTPUT,
        expected_loss=_QA_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 0 of the encoder outputs is the per-token sequence output.
        span_logits = self.qa_outputs(encoder_outputs[0])
        # Split the last dimension into width-1 slices, then drop that dimension from each slice.
        start_slice, end_slice = span_logits.split(1, dim=-1)
        start_logits = start_slice.squeeze(-1).contiguous()
        end_logits = end_slice.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # sometimes the start/end positions are outside our model inputs, we ignore these terms:
            # positions are clamped to `ignored_index`, which the loss is then told to skip.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            criterion = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = criterion(start_logits, start_positions)
            end_loss = criterion(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: (loss?, start_logits, end_logits, *encoder outputs from index 2 on).
        tuple_output = (start_logits, end_logits) + encoder_outputs[2:]
        if total_loss is None:
            return tuple_output
        return (total_loss,) + tuple_output
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:097417381d6c7230bd9e3557456d726de6e83245ec8b24f529f60198a67b203a
3
+ size 440473133
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": true,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 8192,
8
+ "never_split": null,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff