RichardErkhov committed on
Commit
75d6e8b
·
verified ·
1 Parent(s): 5f9af11

uploaded model

Browse files
Files changed (1) hide show
  1. configuration_phi3_v.py +218 -0
configuration_phi3_v.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ Phi-3-V model configuration"""
17
+
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
+
23
# Module-level logger, created through the `logging` helper imported from `transformers.utils`.
logger = logging.get_logger(__name__)

# Maps each published Phi-3-V checkpoint id to the URL of its hosted `config.json`.
PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in (
        "microsoft/Phi-3-vision-128k-instruct",
        "microsoft/Phi-3.5-vision-instruct",
    )
}
29
+
30
+
31
class Phi3VConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Phi3VModel`]. It is used to instantiate a Phi-3-V
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the
    [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the Phi-3-V model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Phi3VModel`].
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for mlp outputs.
        embd_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model was trained with. This is used to determine the size of the
            original RoPE embeddings when using long scaling.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value used for the RMSNorm.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
            divided by the number of attention heads divided by 2.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 32000):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*, defaults to 32000):
            The id of the padding token.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If `None`, no sliding window is applied.
        embd_layer (`str`, *optional*, defaults to `"default"`):
            The embedding layer to use. Can be either `"default"` or `"image"`. "default" uses the standard embedding for text.

    Example:

    ```python
    >>> from transformers import Phi3VModel, Phi3VConfig

    >>> # Initializing a Phi-3-V style configuration
    >>> configuration = Phi3VConfig.from_pretrained("microsoft/Phi-3-vision-128k-instruct")

    >>> # Initializing a model from the configuration
    >>> model = Phi3VModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "phi3_v"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32064,
        hidden_size=3072,
        intermediate_size=8192,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attention_dropout=0.0,
        hidden_act="silu",
        max_position_embeddings=4096,
        original_max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        bos_token_id=1,
        eos_token_id=32000,
        pad_token_id=32000,
        sliding_window=None,
        embd_layer: str = "default",
        **kwargs,
    ):
        """
        Store the given hyper-parameters on the instance, validate `rope_scaling`, then forward the token ids,
        `tie_word_embeddings` and any extra keyword arguments to the parent constructor.

        Raises:
            ValueError: If `rope_scaling` is not `None` and fails `_rope_scaling_validation`.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # When unspecified, use as many key/value heads as attention heads.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # The validation reads `hidden_size` and `num_attention_heads`, so it must run after they are assigned.
        self._rope_scaling_validation()
        self.sliding_window = sliding_window
        self.embd_layer = embd_layer

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        A value of `None` is accepted as-is. Otherwise `rope_scaling` must be a dictionary with exactly three entries:
        a `type` of `"su"` or `"yarn"`, and `short_factor` / `long_factor` lists of numbers whose length equals
        `hidden_size // num_attention_heads // 2`.

        Raises:
            ValueError: If any of the above constraints is violated.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
                f"got {self.rope_scaling}"
            )

        rope_scaling_type = self.rope_scaling.get("type", None)
        # A missing `type` yields `None`, which is rejected by the membership test as well.
        if rope_scaling_type not in ("su", "yarn"):
            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")

        # Both factor lists obey the same rules; check them in the original order (short first, then long).
        for field_name in ("short_factor", "long_factor"):
            factor = self.rope_scaling.get(field_name, None)
            if not (isinstance(factor, list) and all(isinstance(x, (int, float)) for x in factor)):
                raise ValueError(f"`rope_scaling`'s {field_name} field must be a list of numbers, got {factor}")

            expected_length = self.hidden_size // self.num_attention_heads // 2
            if len(factor) != expected_length:
                raise ValueError(
                    f"`rope_scaling`'s {field_name} field must have length {expected_length}, got {len(factor)}"
                )