BAAI
/

ldwang committed on
Commit
2ea5426
·
1 Parent(s): 1f527cc

Upload configuration_aquila.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configuration_aquila.py +128 -0
configuration_aquila.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ Aquila model configuration"""
21
+
22
+ from transformers import PretrainedConfig
23
+
24
+
25
+
26
class AquilaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AquilaModel`]. It is used to instantiate an Aquila
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Aquila-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 100008):
            Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`AquilaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads. When left as `None`, it is set to `num_attention_heads` (kept for backward
            compatibility with configurations that do not define it).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id, forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            End-of-sequence token id, forwarded to [`PretrainedConfig`].
        pretraining_tp (`int`, *optional*, defaults to 1):
            Stored unchanged on the configuration. Presumably the tensor-parallelism degree used during pretraining;
            its exact use is defined by the modeling code, which is not part of this file.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings. Forwarded to [`PretrainedConfig`].
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Stored unchanged on the configuration. Presumably the base period of the rotary position embeddings;
            confirm against the modeling code.
        rope_scaling (*optional*):
            Stored unchanged on the configuration and not validated here. Presumably a rotary-embedding scaling
            specification consumed by the modeling code; confirm the expected format there.

    Example:

    ```python
    >>> from transformers import AquilaModel, AquilaConfig

    >>> # Initializing an Aquila aquila-7b style configuration
    >>> configuration = AquilaConfig()

    >>> # Initializing a model from the aquila-7b style configuration
    >>> model = AquilaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "aquila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=100008,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        **kwargs,
    ):
        """Store the architecture hyper-parameters and delegate the rest to `PretrainedConfig`.

        See the class docstring for the meaning and default of each argument. Any extra
        keyword arguments are passed through to `PretrainedConfig.__init__` untouched.
        """
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers

        # For backward compatibility: configurations that do not specify a
        # key/value head count get one key/value head per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads

        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # The model-specific attributes are set before the base initializer runs,
        # which receives the special-token ids, the embedding-tying flag and any
        # remaining keyword arguments.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )