zxdu20 committed on
Commit
bfb1a8f
·
1 Parent(s): 68873da

Slim embedding

Browse files
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "THUDM/chatglm-6b",
3
  "architectures": [
4
  "ChatGLMModel"
5
  ],
@@ -8,21 +8,23 @@
8
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
9
  "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
10
  },
11
- "bos_token_id": 150004,
12
- "eos_token_id": 150005,
13
- "pad_token_id": 20003,
14
  "hidden_size": 4096,
15
  "inner_hidden_size": 16384,
16
  "layernorm_epsilon": 1e-05,
 
17
  "max_sequence_length": 2048,
18
  "model_type": "chatglm",
19
  "num_attention_heads": 32,
20
  "num_layers": 28,
 
21
  "position_encoding_2d": true,
22
  "quantization_bit": 4,
23
  "quantization_embeddings": false,
24
  "torch_dtype": "float16",
25
- "transformers_version": "4.23.1",
26
  "use_cache": true,
27
- "vocab_size": 150528
28
- }
 
1
  {
2
+ "_name_or_path": "THUDM/chatglm-6b-int4",
3
  "architectures": [
4
  "ChatGLMModel"
5
  ],
 
8
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
9
  "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
10
  },
11
+ "bos_token_id": 130004,
12
+ "eos_token_id": 130005,
13
+ "gmask_token_id": 130001,
14
  "hidden_size": 4096,
15
  "inner_hidden_size": 16384,
16
  "layernorm_epsilon": 1e-05,
17
+ "mask_token_id": 130000,
18
  "max_sequence_length": 2048,
19
  "model_type": "chatglm",
20
  "num_attention_heads": 32,
21
  "num_layers": 28,
22
+ "pad_token_id": 3,
23
  "position_encoding_2d": true,
24
  "quantization_bit": 4,
25
  "quantization_embeddings": false,
26
  "torch_dtype": "float16",
27
+ "transformers_version": "4.27.1",
28
  "use_cache": true,
29
+ "vocab_size": 130528
30
+ }
configuration_chatglm.py CHANGED
@@ -66,6 +66,8 @@ class ChatGLMConfig(PretrainedConfig):
66
  use_cache=False,
67
  bos_token_id=150004,
68
  eos_token_id=150005,
 
 
69
  pad_token_id=0,
70
  max_sequence_length=2048,
71
  inner_hidden_size=16384,
@@ -87,6 +89,8 @@ class ChatGLMConfig(PretrainedConfig):
87
  self.bos_token_id = bos_token_id
88
  self.eos_token_id = eos_token_id
89
  self.pad_token_id = pad_token_id
 
 
90
  self.position_encoding_2d = position_encoding_2d
91
  self.quantization_bit = quantization_bit
92
  self.quantization_embeddings = quantization_embeddings
 
66
  use_cache=False,
67
  bos_token_id=150004,
68
  eos_token_id=150005,
69
+ mask_token_id=150000,
70
+ gmask_token_id=150001,
71
  pad_token_id=0,
72
  max_sequence_length=2048,
73
  inner_hidden_size=16384,
 
89
  self.bos_token_id = bos_token_id
90
  self.eos_token_id = eos_token_id
91
  self.pad_token_id = pad_token_id
92
+ self.mask_token_id = mask_token_id
93
+ self.gmask_token_id = gmask_token_id
94
  self.position_encoding_2d = position_encoding_2d
95
  self.quantization_bit = quantization_bit
96
  self.quantization_embeddings = quantization_embeddings
modeling_chatglm.py CHANGED
@@ -921,9 +921,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
921
 
922
 
923
  if position_ids is None:
924
- MASK, gMASK = 150000, 150001
925
- mask_token = MASK if MASK in input_ids else gMASK
926
- use_gmask = False if MASK in input_ids else True
927
 
928
  mask_positions = [seq.tolist().index(mask_token) for seq in input_ids]
929
  position_ids = self.get_position_ids(
@@ -1084,9 +1084,9 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
1084
  **kwargs
1085
  ) -> dict:
1086
  batch_size, seq_length = input_ids.shape
1087
- MASK, gMASK = 150000, 150001
1088
- mask_token = MASK if MASK in input_ids else gMASK
1089
- use_gmask = False if MASK in input_ids else True
1090
  seqs = input_ids.tolist()
1091
  mask_positions = [seq.index(mask_token) for seq in seqs]
1092
 
 
921
 
922
 
923
  if position_ids is None:
924
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
925
+ mask_token = gMASK if gMASK in input_ids else MASK
926
+ use_gmask = True if gMASK in input_ids else False
927
 
928
  mask_positions = [seq.tolist().index(mask_token) for seq in input_ids]
929
  position_ids = self.get_position_ids(
 
1084
  **kwargs
1085
  ) -> dict:
1086
  batch_size, seq_length = input_ids.shape
1087
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
1088
+ mask_token = gMASK if gMASK in input_ids else MASK
1089
+ use_gmask = True if gMASK in input_ids else False
1090
  seqs = input_ids.tolist()
1091
  mask_positions = [seq.index(mask_token) for seq in seqs]
1092
 
tokenization_chatglm.py CHANGED
@@ -48,11 +48,13 @@ class SPTokenizer:
48
  def __init__(
49
  self,
50
  vocab_file,
 
51
  max_blank_length=80,
52
  byte_fallback=True,
53
  ):
54
  assert vocab_file is not None
55
  self.vocab_file = vocab_file
 
56
  self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
57
  self.max_blank_length = max_blank_length
58
  self.byte_fallback = byte_fallback
@@ -70,10 +72,6 @@ class SPTokenizer:
70
  def get_tab_token():
71
  return f"<|tab|>"
72
 
73
- @property
74
- def num_image_tokens(self):
75
- return 20000
76
-
77
  @property
78
  def num_text_tokens(self):
79
  return self.text_tokenizer.num_tokens
@@ -178,6 +176,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
178
  mask_token='[MASK]',
179
  gmask_token='[gMASK]',
180
  padding_side="left",
 
181
  **kwargs
182
  ) -> None:
183
  super().__init__(
@@ -197,10 +196,16 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
197
  self.mask_token = mask_token
198
  self.gmask_token = gmask_token
199
 
200
- self.sp_tokenizer = SPTokenizer(vocab_file)
201
 
202
  """ Initialisation """
203
 
 
 
 
 
 
 
204
  @property
205
  def eop_token_id(self) -> Optional[int]:
206
  """
 
48
  def __init__(
49
  self,
50
  vocab_file,
51
+ num_image_tokens=20000,
52
  max_blank_length=80,
53
  byte_fallback=True,
54
  ):
55
  assert vocab_file is not None
56
  self.vocab_file = vocab_file
57
+ self.num_image_tokens = num_image_tokens
58
  self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
59
  self.max_blank_length = max_blank_length
60
  self.byte_fallback = byte_fallback
 
72
  def get_tab_token():
73
  return f"<|tab|>"
74
 
 
 
 
 
75
  @property
76
  def num_text_tokens(self):
77
  return self.text_tokenizer.num_tokens
 
176
  mask_token='[MASK]',
177
  gmask_token='[gMASK]',
178
  padding_side="left",
179
+ num_image_tokens=20000,
180
  **kwargs
181
  ) -> None:
182
  super().__init__(
 
196
  self.mask_token = mask_token
197
  self.gmask_token = gmask_token
198
 
199
+ self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
200
 
201
  """ Initialisation """
202
 
203
+ @property
204
+ def gmask_token_id(self) -> Optional[int]:
205
+ if self.gmask_token is None:
206
+ return None
207
+ return self.convert_tokens_to_ids(self.gmask_token)
208
+
209
  @property
210
  def eop_token_id(self) -> Optional[int]:
211
  """
tokenizer_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "name_or_path": "THUDM/chatglm-6b",
3
  "bos_token": "<sop>",
4
  "eop_token": "<eop>",
5
  "eos_token": "</s>",
@@ -10,6 +10,7 @@
10
  "remove_space": false,
11
  "do_lower_case": false,
12
  "tokenizer_class": "ChatGLMTokenizer",
 
13
  "auto_map": {
14
  "AutoTokenizer": [
15
  "tokenization_chatglm.ChatGLMTokenizer",
 
1
  {
2
+ "name_or_path": "THUDM/chatglm-6b-int4",
3
  "bos_token": "<sop>",
4
  "eop_token": "<eop>",
5
  "eos_token": "</s>",
 
10
  "remove_space": false,
11
  "do_lower_case": false,
12
  "tokenizer_class": "ChatGLMTokenizer",
13
+ "num_image_tokens": 0,
14
  "auto_map": {
15
  "AutoTokenizer": [
16
  "tokenization_chatglm.ChatGLMTokenizer",