Bo1015 committed on
Commit
153db58
·
verified ·
1 Parent(s): d519c0b

Delete tokenization_xtrimopglm.py

Browse files
Files changed (1) hide show
  1. tokenization_xtrimopglm.py +0 -140
tokenization_xtrimopglm.py DELETED
@@ -1,140 +0,0 @@
1
- """Tokenization classes for xTrimoPGLM."""
2
-
3
- import os
4
- from typing import List, Optional, Union, Dict, Any
5
- from torch import TensorType
6
- from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
7
- from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
8
-
9
# Maps the tokenizer's vocabulary-file argument name to its on-disk file name.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
10
-
11
-
12
def load_vocab_file(vocab_file: str) -> List[str]:
    """Read a vocabulary file and return its tokens in file order.

    The file is expected to hold one token per line. Leading and trailing
    whitespace is stripped from every line. Blank lines are kept (as empty
    strings) so that the position of every following token is unchanged.

    Args:
        vocab_file: Path to the plain-text vocabulary file.

    Returns:
        The list of tokens, one entry per line of the file.
    """
    # Explicit UTF-8 so the result does not depend on the platform's
    # locale-default encoding.
    with open(vocab_file, "r", encoding="utf-8") as f:
        return [line.strip() for line in f.read().splitlines()]
16
-
17
-
18
class xTrimoPGLMTokenizer(PreTrainedTokenizer):
    """
    Constructs a xTrimoPGLM tokenizer.

    The vocabulary is read from a plain-text file with one token per line;
    a token's line index is its id. ``_tokenize`` splits text on whitespace,
    and every vocabulary entry is additionally registered as a no-split token.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(
        self,
        vocab_file: str,
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        mask_token: str = "<mask>",
        eos_token: str = "<eos>",
        model_max_length: int = 2048,
        additional_special_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        """Load the vocabulary and initialise the base tokenizer.

        Args:
            vocab_file: Path to the one-token-per-line vocabulary file.
            unk_token: Token used for unknown tokens / ids.
            pad_token: Padding token.
            mask_token: Mask token.
            eos_token: End-of-sequence token; also used as the separator in
                ``build_inputs_with_special_tokens``.
            model_max_length: Forwarded to the base tokenizer.
            additional_special_tokens: Extra special tokens. When ``None``,
                a default list of xTrimoPGLM control tokens is used.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The lookup tables are built before calling the base constructor.
        # NOTE(review): presumably because the base constructor may already
        # query the vocabulary — confirm against the transformers version used.
        self.all_tokens = load_vocab_file(vocab_file)
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}

        if additional_special_tokens is None:
            additional_special_tokens = [
                '<pad>', '<mask>', '<gmask>', '<smask>', '<eod>',
                '<sop>', '<eop>', '<eos>', '<unk>',
            ]

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            model_max_length=model_max_length,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        # Register every vocabulary entry as a no-split token.
        # NOTE(review): relies on the base-class members
        # `unique_no_split_tokens` and `_update_trie` — confirm they exist
        # (with this signature) in the targeted transformers version.
        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index``, or ``unk_token`` if unknown."""
        return self._id_to_token.get(index, self.unk_token)

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id of ``token``, falling back to the id of ``unk_token``.

        The result is ``None`` when neither ``token`` nor ``unk_token`` is in
        the vocabulary.
        """
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Split ``text`` on runs of whitespace; ``kwargs`` are ignored."""
        return text.split()

    def get_vocab(self) -> dict:
        """Return a token -> id mapping: file vocabulary plus added tokens."""
        base_vocab = self._token_to_id.copy()
        base_vocab.update(self.added_tokens_encoder)
        return base_vocab

    def token_to_id(self, token: str) -> int:
        """Public alias of ``_convert_token_to_id``."""
        return self._convert_token_to_id(token)

    def id_to_token(self, index: int) -> str:
        """Public alias of ``_convert_id_to_token``."""
        return self._convert_id_to_token(index)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Append the EOS id after each sequence.

        Args:
            token_ids_0: Ids of the first sequence.
            token_ids_1: Optional ids of a second sequence.

        Returns:
            ``token_ids_0 + [eos]`` for a single sequence (or ``token_ids_0``
            unchanged when no EOS id is set), and
            ``token_ids_0 + [eos] + token_ids_1 + [eos]`` for a pair.

        Raises:
            ValueError: If a pair is given but no EOS id is set.
        """
        eos_id = self.eos_token_id
        if token_ids_1 is None:
            if eos_id is None:
                return token_ids_0
            return token_ids_0 + [eos_id]
        if eos_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        sep = [eos_id]
        return token_ids_0 + sep + token_ids_1 + sep  # Multiple inputs always have an EOS token

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write the vocabulary, one token per line, into ``save_directory``.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix; the file is named
                ``"<prefix>-tokenizer.model"`` when given, else
                ``"tokenizer.model"``.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "tokenizer.model",
        )
        # Explicit UTF-8 so the written file does not depend on the platform's
        # locale-default encoding.
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write("\n".join(self.all_tokens))
        return (vocab_file,)

    @property
    def vocab_size(self) -> int:
        """Number of tokens loaded from the vocabulary file."""
        return len(self.all_tokens)

    def apply_chat_template(
        self,
        query,
        add_generation_prompt: bool = True,
        tokenize: bool = True,
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_dict: bool = False,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        add_special_tokens: bool = True,
        **kwargs,
    ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
        """Prefix each query with the generation prompt and optionally encode.

        Args:
            query: A single string or a list of strings.
            add_generation_prompt: When true, prepend ``"<gmask><sop><eos>"``
                to every query.
            tokenize: When true, encode the prompts with
                ``batch_encode_plus``; otherwise return the prompt strings.
            padding, truncation, max_length, return_tensors: Forwarded to
                ``batch_encode_plus``.
            return_dict: When true (and ``tokenize`` is true), return the
                full encoding; otherwise only its ``"input_ids"``.
            tokenizer_kwargs, add_special_tokens, **kwargs: Accepted for
                signature compatibility; not used by this implementation
                (encoding always passes ``add_special_tokens=False``).

        Returns:
            The list of prompt strings, the encoded ``input_ids``, or the
            full encoding, depending on ``tokenize`` and ``return_dict``.
        """
        generation_prompt = "<gmask><sop><eos>"
        if isinstance(query, str):
            query = [query]

        if add_generation_prompt:
            prompt_query = []
            for each in query:
                # Kept as an assert to preserve the exception type callers
                # may see; note it is skipped when Python runs with -O.
                assert isinstance(each, str)
                prompt_query.append(generation_prompt + each)
        else:
            prompt_query = query

        if not tokenize:
            return prompt_query

        output = self.batch_encode_plus(
            prompt_query,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
            is_split_into_words=True,
            add_special_tokens=False,
        )
        if return_dict:
            return output
        return output["input_ids"]