chielo committed on
Commit
7ba7038
·
1 Parent(s): 9eae4b4

add ChatGLMTokenizerFast and ChatGLMTokenizerConverter

Browse files
Files changed (2) hide show
  1. tokenization_chatglm.py +251 -28
  2. tokenizer_config.json +2 -2
tokenization_chatglm.py CHANGED
@@ -1,13 +1,39 @@
1
  import json
2
  import os
3
- import torch
4
  from typing import List, Optional, Union, Dict
5
  from sentencepiece import SentencePieceProcessor
6
- from transformers import PreTrainedTokenizer
 
 
 
 
 
 
 
 
7
  from transformers.utils import logging, PaddingStrategy
8
  from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  class SPTokenizer:
12
  def __init__(self, model_path: str):
13
  # reload tokenizer
@@ -21,17 +47,29 @@ class SPTokenizer:
21
  self.pad_id: int = self.sp_model.unk_id()
22
  assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
23
 
24
- special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>",
25
- "<|observation|>"]
26
  self.special_tokens = {}
27
  self.index_special_tokens = {}
28
  for token in special_tokens:
29
  self.special_tokens[token] = self.n_words
30
  self.index_special_tokens[self.n_words] = token
31
  self.n_words += 1
32
-
33
- def tokenize(self, s: str):
34
- return self.sp_model.EncodeAsPieces(s)
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
37
  assert type(s) is str
@@ -70,27 +108,40 @@ class SPTokenizer:
70
  """Converts an index (integer) in a token (str) using the vocab."""
71
  if index in self.index_special_tokens:
72
  return self.index_special_tokens[index]
73
- if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
74
  return ""
75
  return self.sp_model.IdToPiece(index)
76
 
77
 
78
  class ChatGLMTokenizer(PreTrainedTokenizer):
79
- vocab_files_names = {"vocab_file": "tokenizer.model"}
80
 
 
81
  model_input_names = ["input_ids", "attention_mask", "position_ids"]
82
 
83
- def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
 
 
 
 
 
 
 
84
  self.name = "GLMTokenizer"
85
-
86
  self.vocab_file = vocab_file
87
  self.tokenizer = SPTokenizer(vocab_file)
88
  self.special_tokens = {
89
  "<bos>": self.tokenizer.bos_id,
90
  "<eos>": self.tokenizer.eos_id,
 
91
  "<pad>": self.tokenizer.pad_id
92
  }
93
- super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
 
 
 
 
 
 
94
 
95
  def get_command(self, token):
96
  if token in self.special_tokens:
@@ -100,24 +151,40 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
100
 
101
  @property
102
  def unk_token(self) -> str:
103
- return "<unk>"
104
 
105
  @property
106
  def pad_token(self) -> str:
107
- return "<unk>"
108
 
109
  @property
110
- def pad_token_id(self):
111
- return self.get_command("<pad>")
112
 
113
  @property
114
- def eos_token(self) -> str:
115
- return "</s>"
 
 
 
 
116
 
117
  @property
118
  def eos_token_id(self):
119
  return self.get_command("<eos>")
120
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  @property
122
  def vocab_size(self):
123
  return self.tokenizer.n_words
@@ -129,7 +196,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
129
  return vocab
130
 
131
  def _tokenize(self, text, **kwargs):
132
- return self.tokenizer.tokenize(text)
133
 
134
  def _convert_token_to_id(self, token):
135
  """ Converts a token (str) in an id using the vocab. """
@@ -171,8 +238,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
171
  return (vocab_file,)
172
 
173
  def get_prefix_tokens(self):
174
- prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
175
- return prefix_tokens
176
 
177
  def build_single_message(self, role, metadata, message):
178
  assert role in ["system", "user", "assistant", "observation"], role
@@ -195,7 +261,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
195
  return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
196
 
197
  def build_inputs_with_special_tokens(
198
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
199
  ) -> List[int]:
200
  """
201
  Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
@@ -220,12 +286,12 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
220
  return token_ids_0
221
 
222
  def _pad(
223
- self,
224
- encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
225
- max_length: Optional[int] = None,
226
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
227
- pad_to_multiple_of: Optional[int] = None,
228
- return_attention_mask: Optional[bool] = None,
229
  ) -> dict:
230
  """
231
  Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
@@ -281,3 +347,160 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
281
  encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
282
 
283
  return encoded_inputs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
+ import re
4
  from typing import List, Optional, Union, Dict
5
  from sentencepiece import SentencePieceProcessor
6
+ from transformers import AddedToken, PreTrainedTokenizer, PreTrainedTokenizerFast
7
+ from transformers.convert_slow_tokenizer import (
8
+ SLOW_TO_FAST_CONVERTERS,
9
+ SpmConverter,
10
+ decoders,
11
+ normalizers,
12
+ pre_tokenizers,
13
+ processors,
14
+ )
15
  from transformers.utils import logging, PaddingStrategy
16
  from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
17
 
18
 
19
+ logger = logging.get_logger(__name__)
20
+
21
# Special tokens that live outside the SentencePiece vocabulary. SPTokenizer
# hands out ids to them consecutively, in list order, starting right after the
# SentencePiece pieces -- so the order of this list must never change.
ADDITIONAL_SPECIAL_TOKENS = [
    "[MASK]", "[gMASK]", "[sMASK]",
    "<!sop!>", "<!eop!>",
    # role markers emitted as f"<|{role}|>" by the chat prompt builders
    "<|system|>", "<|user|>", "<|assistant|>", "<|observation|>",
]

# Tokens placed in front of every encoded sequence (used by
# ChatGLMTokenizer.get_prefix_tokens and by the fast tokenizer's template).
PREFIX_TOKENS = ["[gMASK]", "<!sop!>"]

# Marker string that the fast tokenizer's normalizer rewrites to "▁".
DUMMY_PREFIX_INDICATOR_FOR_FAST = "<!dummy-prefix!>"
35
+
36
+
37
  class SPTokenizer:
38
  def __init__(self, model_path: str):
39
  # reload tokenizer
 
47
  self.pad_id: int = self.sp_model.unk_id()
48
  assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
49
 
50
+ special_tokens = ADDITIONAL_SPECIAL_TOKENS
 
51
  self.special_tokens = {}
52
  self.index_special_tokens = {}
53
  for token in special_tokens:
54
  self.special_tokens[token] = self.n_words
55
  self.index_special_tokens[self.n_words] = token
56
  self.n_words += 1
57
+ self.role_special_token_expression = "|".join([re.escape(token) for token in special_tokens]) # for apply_chat_template
58
+
59
def tokenize(self, s: str, encode_special_tokens=False):
    """Split ``s`` into SentencePiece pieces.

    Args:
        s: Text to tokenize.
        encode_special_tokens: When true, every substring matching
            ``self.role_special_token_expression`` is emitted verbatim as a
            single token, and only the text between such matches is passed to
            SentencePiece. When false, the whole string goes to SentencePiece.

    Returns:
        The list of token strings, in input order.
    """
    encode_pieces = self.sp_model.EncodeAsPieces
    if not encode_special_tokens:
        return encode_pieces(s)

    pieces = []
    cursor = 0  # end offset of the last special token consumed so far
    for hit in re.finditer(self.role_special_token_expression, s):
        begin, end = hit.span()
        if begin > cursor:
            # plain text sitting between two special tokens
            pieces.extend(encode_pieces(s[cursor:begin]))
        pieces.append(hit.group())
        cursor = end
    if cursor < len(s):
        # trailing plain text after the final special token
        pieces.extend(encode_pieces(s[cursor:]))
    return pieces
73
 
74
  def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
75
  assert type(s) is str
 
108
  """Converts an index (integer) in a token (str) using the vocab."""
109
  if index in self.index_special_tokens:
110
  return self.index_special_tokens[index]
111
+ if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0 or index >= self.sp_model.vocab_size():
112
  return ""
113
  return self.sp_model.IdToPiece(index)
114
 
115
 
116
  class ChatGLMTokenizer(PreTrainedTokenizer):
 
117
 
118
+ vocab_files_names = {"vocab_file": "tokenizer.model"}
119
  model_input_names = ["input_ids", "attention_mask", "position_ids"]
120
 
121
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False,
             encode_special_tokens=False, **kwargs):
    """Create the slow tokenizer from a SentencePiece model file.

    Args:
        vocab_file: Path handed to ``SPTokenizer`` to load the model.
        padding_side: Forwarded to the base-class constructor.
        clean_up_tokenization_spaces: Forwarded to the base-class constructor.
        encode_special_tokens: Stored on the instance and later passed to
            ``SPTokenizer.tokenize`` by ``_tokenize``.
        **kwargs: Forwarded unchanged to the base-class constructor.
    """
    # NOTE(review): all instance state is assigned before super().__init__(),
    # presumably because the base constructor reads the token properties
    # defined on this class -- keep this ordering.
    self.name = "GLMTokenizer"
    self.vocab_file = vocab_file
    self.tokenizer = SPTokenizer(vocab_file)

    sp = self.tokenizer
    # "<unk>" and "<pad>" intentionally share the same id (sp.pad_id).
    self.special_tokens = dict(
        zip(
            ("<bos>", "<eos>", "<unk>", "<pad>"),
            (sp.bos_id, sp.eos_id, sp.pad_id, sp.pad_id),
        )
    )
    self.encode_special_tokens = encode_special_tokens

    base_kwargs = dict(
        padding_side=padding_side,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
    )
    super().__init__(**base_kwargs, **kwargs)
145
 
146
  def get_command(self, token):
147
  if token in self.special_tokens:
 
151
 
152
  @property
153
  def unk_token(self) -> str:
154
+ return self.tokenizer.sp_model.IdToPiece(self.get_command("<unk>"))
155
 
156
  @property
157
  def pad_token(self) -> str:
158
+ return self.tokenizer.sp_model.IdToPiece(self.get_command("<pad>"))
159
 
160
  @property
161
+ def eos_token(self) -> str:
162
+ return self.tokenizer.sp_model.IdToPiece(self.get_command("<eos>"))
163
 
164
  @property
165
+ def unk_token_id(self) -> int:
166
+ return self.get_command("<unk>")
167
+
168
+ @property
169
+ def pad_token_id(self) -> int:
170
+ return self.get_command("<pad>")
171
 
172
  @property
173
  def eos_token_id(self):
174
  return self.get_command("<eos>")
175
 
176
+ @unk_token.setter
177
+ def unk_token(self, value):
178
+ logger.warning("Setting unk_token is not supported, use the default one.")
179
+
180
+ @pad_token.setter
181
+ def pad_token(self, value):
182
+ logger.warning("Setting pad_token is not supported, use the default one.")
183
+
184
+ @eos_token.setter
185
+ def eos_token(self, value):
186
+ logger.warning("Setting eos_token is not supported, use the default one.")
187
+
188
  @property
189
  def vocab_size(self):
190
  return self.tokenizer.n_words
 
196
  return vocab
197
 
198
  def _tokenize(self, text, **kwargs):
199
+ return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
200
 
201
  def _convert_token_to_id(self, token):
202
  """ Converts a token (str) in an id using the vocab. """
 
238
  return (vocab_file,)
239
 
240
  def get_prefix_tokens(self):
241
+ return list(map(self.get_command, PREFIX_TOKENS))
 
242
 
243
  def build_single_message(self, role, metadata, message):
244
  assert role in ["system", "user", "assistant", "observation"], role
 
261
  return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
262
 
263
  def build_inputs_with_special_tokens(
264
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
265
  ) -> List[int]:
266
  """
267
  Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
 
286
  return token_ids_0
287
 
288
  def _pad(
289
+ self,
290
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
291
+ max_length: Optional[int] = None,
292
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
293
+ pad_to_multiple_of: Optional[int] = None,
294
+ return_attention_mask: Optional[bool] = None,
295
  ) -> dict:
296
  """
297
  Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
 
347
  encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
348
 
349
  return encoded_inputs
350
+
351
+
352
class ChatGLMTokenizerFast(PreTrainedTokenizerFast):
    """Fast tokenizer whose slow counterpart is ``ChatGLMTokenizer``.

    NOTE: multiple breaking changes, no backward-compatibility with the slow
    tokenizer; saving a slow tokenizer from this class is not supported.
    """

    slow_tokenizer_class = ChatGLMTokenizer

    # Slow-tokenizer file names first, then the fast-tokenizer ones on top.
    vocab_files_names = dict(ChatGLMTokenizer.vocab_files_names)
    vocab_files_names.update(PreTrainedTokenizerFast.vocab_files_names)

    def __init__(self, **kwargs):
        """Fill in default special tokens (unless the caller set them) and delegate."""
        defaults = (
            ("clean_up_tokenization_spaces", False),
            ("bos_token", "<s>"),
            ("eos_token", "</s>"),
            ("unk_token", "<unk>"),
            ("pad_token", "<unk>"),
        )
        for option, fallback in defaults:
            kwargs.setdefault(option, fallback)
        super().__init__(**kwargs)

    @property
    def dummy_prefix_indicator(self):
        """Marker string inserted before each text segment of a chat prompt."""
        return DUMMY_PREFIX_INDICATOR_FOR_FAST

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """Always ``False``: multiple breaking changes versus the slow tokenizer."""
        return False

    def save_pretrained(self, *args, **kwargs):
        """Warn that no slow tokenizer can be saved, then save as the base class does."""
        if not self.can_save_slow_tokenizer:
            cls_name = type(self).__name__
            logger.warning(
                f"{cls_name} does not support saving slow tokenizer. "
                "Saving it at the same directory may break the original tokenizer. "
                "Please keep a backup beforehand."
            )

        return super().save_pretrained(*args, **kwargs)

    def build_single_message_prompt(self, role, metadata, message):
        """Render one chat turn as ``<|role|>`` + metadata line + message.

        Both the metadata and the message are preceded by the dummy-prefix
        indicator. Raises ``AssertionError`` for an unknown ``role``.
        """
        assert role in ["system", "user", "assistant", "observation"], role
        marker = self.dummy_prefix_indicator
        return f"<|{role}|>{marker}{metadata}\n{marker}{message}"

    def build_chat_prompt(self, query, history=None, role="user", metadata=""):
        """Build the full prompt string for ``history`` followed by ``query``.

        Each history item is a mapping with ``"role"`` and ``"content"`` and an
        optional ``"metadata"``; a system item that has ``"tools"`` gets the
        tools appended to its content as indented JSON. The prompt ends with
        ``<|assistant|>``.
        """
        segments = []

        for turn in (history if history else []):
            text = turn["content"]

            if turn["role"] == "system" and "tools" in turn:
                tools_json = json.dumps(turn["tools"], indent=4, ensure_ascii=False)
                text += "\n" + tools_json

            turn_metadata = turn.get("metadata", "")
            segments.append(
                self.build_single_message_prompt(turn["role"], turn_metadata, text)
            )

        segments.append(self.build_single_message_prompt(role, metadata, query))
        segments.append("<|assistant|>")

        return "".join(segments)

    def build_chat_input(self, *args, **kwargs):
        """Encode the prompt from ``build_chat_prompt`` as a batch of one ("pt" tensors)."""
        prompt = self.build_chat_prompt(*args, **kwargs)
        return self.batch_encode_plus([prompt], return_tensors="pt")
422
+
423
+
424
+ ChatGLMTokenizer.register_for_auto_class()
425
+ ChatGLMTokenizerFast.register_for_auto_class()
426
+
427
+
428
class ChatGLMTokenizerConverter(SpmConverter):
    """Slow-to-fast converter that builds the fast tokenizer for ``ChatGLMTokenizer``."""

    handle_byte_fallback = True

    def normalizer(self, proto):
        """Replace the dummy-prefix marker and every space with "▁"."""
        dummy_prefix_rule = normalizers.Replace(
            pattern=DUMMY_PREFIX_INDICATOR_FOR_FAST, content="▁"
        )
        space_rule = normalizers.Replace(pattern=" ", content="▁")
        return normalizers.Sequence([dummy_prefix_rule, space_rule])

    def pre_tokenizer(self, replacement, add_prefix_space):
        """Split on the dummy-prefix marker, keeping it attached to what follows."""
        # NOTE: Metaspace is deliberately avoided here because it does not merge
        # consecutive spaces into one token: without it the result is ["▁▁"],
        # with it the result would be ["▁", "▁"].
        return pre_tokenizers.Split(DUMMY_PREFIX_INDICATOR_FOR_FAST, "merged_with_next")

    def decoder(self, replacement, add_prefix_space):
        """Decode with byte fallback first, then Metaspace."""
        stages = [
            decoders.ByteFallback(),
            decoders.Metaspace(replacement="▁", add_prefix_space=True),
        ]
        return decoders.Sequence(stages)

    def tokenizer(self, proto):
        """Build the base tokenizer, enable byte fallback and register special tokens.

        Raises ``AssertionError`` if "<unk>", "<s>", "</s>" do not have the
        ids 0, 1 and 2 respectively.
        """
        tok = super().tokenizer(proto)
        tok.model.byte_fallback = True

        base_tokens = ("<unk>", "<s>", "</s>")
        for expected_id, piece in enumerate(base_tokens):
            assert tok.token_to_id(piece) == expected_id

        added = [
            AddedToken(piece, special=True)
            for piece in [*base_tokens, *ADDITIONAL_SPECIAL_TOKENS]
        ]
        tok.add_special_tokens(added)

        return tok

    def converted(self):
        """Return the converted tokenizer with the prefix-token post-processor attached.

        Single sequences become ``<prefix tokens> $A``; pairs become
        ``<prefix tokens> $A $B:1 </s>:1``. The template processor is chained
        after any post-processor the base conversion already installed.
        """
        tok = super().converted()

        # Post processors
        prefix_ids = [tok.token_to_id(piece) for piece in PREFIX_TOKENS]
        assert not any(token_id is None for token_id in prefix_ids)
        prefix_template = " ".join(PREFIX_TOKENS)

        # frozenset drops duplicate (token, id) pairs before building the list
        template_special_tokens = list(frozenset(zip(PREFIX_TOKENS, prefix_ids)))

        if "</s>" not in PREFIX_TOKENS:
            # the pair template references </s>, so it must be declared as well
            eos_token_id = tok.token_to_id("</s>")
            assert eos_token_id is not None
            template_special_tokens.append(("</s>", eos_token_id))

        post = processors.TemplateProcessing(
            single=f"{prefix_template} $A",
            pair=f"{prefix_template} $A $B:1 </s>:1",
            special_tokens=template_special_tokens,
        )

        existing = tok.post_processor
        tok.post_processor = (
            post if existing is None else processors.Sequence([existing, post])
        )

        return tok
504
+
505
+
506
+ SLOW_TO_FAST_CONVERTERS[ChatGLMTokenizer.__name__] = ChatGLMTokenizerConverter
tokenizer_config.json CHANGED
@@ -7,7 +7,7 @@
7
  "auto_map": {
8
  "AutoTokenizer": [
9
  "tokenization_chatglm.ChatGLMTokenizer",
10
- null
11
- ]
12
  }
13
  }
 
7
  "auto_map": {
8
  "AutoTokenizer": [
9
  "tokenization_chatglm.ChatGLMTokenizer",
10
+ "tokenization_chatglm.ChatGLMTokenizerFast"
11
+ ]
12
  }
13
  }