jeduardogruiz committed on
Commit
d991264
·
verified ·
1 Parent(s): 637a7d6

Upload 7 files

Browse files
test_encoding.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Note that there are more actual tests, they're just not currently public :-)
2
+
3
+ from typing import Callable
4
+
5
+ import hypothesis
6
+ import hypothesis.strategies as st
7
+ import pytest
8
+
9
+ import tiktoken
10
+
11
+ from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES
12
+
13
+
14
def test_simple():
    """Smoke-test encode/decode on two encodings, then round-trip low token ids.

    For "gpt2" and "cl100k_base" the token ids of "hello world" and of
    "hello <|endoftext|>" (with all special tokens allowed) are pinned.
    Afterwards, for every listed encoding, token ids 0..9999 must survive a
    decode-to-bytes / encode-single-token round trip.
    """
    pinned = (
        ("gpt2", [31373, 995], [31373, 220, 50256]),
        ("cl100k_base", [15339, 1917], [15339, 220, 100257]),
    )
    for name, hello_world_ids, hello_eot_ids in pinned:
        enc = tiktoken.get_encoding(name)
        assert enc.encode("hello world") == hello_world_ids
        assert enc.decode(hello_world_ids) == "hello world"
        assert enc.encode("hello <|endoftext|>", allowed_special="all") == hello_eot_ids

    for name in tiktoken.list_encoding_names():
        enc = tiktoken.get_encoding(name)
        for token_id in range(10_000):
            token_bytes = enc.decode_single_token_bytes(token_id)
            assert enc.encode_single_token(token_bytes) == token_id
29
+
30
+
31
def test_simple_repeated():
    """Pin the gpt2 token ids for runs of 1 through 17 consecutive "0" characters."""
    enc = tiktoken.get_encoding("gpt2")

    # Entry k (0-based) is the expected encoding of "0" repeated k + 1 times.
    expected_by_run_length = [
        [15],
        [405],
        [830],
        [2388],
        [20483],
        [10535],
        [24598],
        [8269],
        [10535, 830],
        [8269, 405],
        [8269, 830],
        [8269, 2388],
        [8269, 20483],
        [8269, 10535],
        [8269, 24598],
        [25645],
        [8269, 10535, 830],
    ]
    for run_length, token_ids in enumerate(expected_by_run_length, start=1):
        assert enc.encode("0" * run_length) == token_ids
50
+
51
+
52
def test_simple_regex():
    """Pin cl100k_base token ids for inputs that exercise the splitting regex.

    The last three cases differ only in the whitespace that follows "today";
    each must be a distinct literal, otherwise two asserts would demand
    different token ids for the same input.
    """
    enc = tiktoken.get_encoding("cl100k_base")
    assert enc.encode("rer") == [38149]
    assert enc.encode("'rer") == [2351, 81]
    assert enc.encode("today\n ") == [31213, 198, 220]
    # One space between the two newlines.
    assert enc.encode("today\n \n") == [31213, 27907]
    # Two spaces between the two newlines (a different input from the line above).
    assert enc.encode("today\n  \n") == [31213, 14211]
59
+
60
+
61
def test_basic_encode():
    """Pin "hello world" token ids for three encodings, plus one "\\x85" input."""
    for name in ("r50k_base", "p50k_base"):
        assert tiktoken.get_encoding(name).encode("hello world") == [31373, 995]

    cl100k = tiktoken.get_encoding("cl100k_base")
    assert cl100k.encode("hello world") == [15339, 1917]
    assert cl100k.encode(" \x850") == [220, 126, 227, 15]
71
+
72
+
73
def test_encode_empty():
    """Encoding the empty string with r50k_base yields an empty token list."""
    token_ids = tiktoken.get_encoding("r50k_base").encode("")
    assert token_ids == []
76
+
77
+
78
def test_encode_bytes():
    """Pin the cl100k_base result of the private ``_encode_bytes`` on a raw byte string."""
    raw = b" \xec\x8b\xa4\xed"
    token_ids = tiktoken.get_encoding("cl100k_base")._encode_bytes(raw)
    assert token_ids == [62085]
81
+
82
+
83
def test_encode_surrogate_pairs():
    """Check how cl100k_base encodes surrogate code units in ``str`` input."""
    enc = tiktoken.get_encoding("cl100k_base")
    thumbs_up_ids = [9468, 239, 235]

    assert enc.encode("👍") == thumbs_up_ids
    # surrogate pair gets converted to codepoint
    assert enc.encode("\ud83d\udc4d") == thumbs_up_ids

    # lone surrogate just gets replaced
    replacement_ids = enc.encode("�")
    assert enc.encode("\ud83d") == replacement_ids
92
+
93
+
94
+ # ====================
95
+ # Roundtrip
96
+ # ====================
97
+
98
+
99
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_basic_roundtrip(make_enc):
    """Round-trip a handful of fixed strings through encode/decode.

    ``make_enc`` is a zero-argument factory returning the encoding under test.
    The samples deliberately vary leading and trailing whitespace (none, one
    space, two spaces) so that every sample is a distinct input.
    """
    enc = make_enc()
    for value in (
        "hello",
        "hello ",
        "hello  ",  # two trailing spaces
        " hello",
        " hello ",
        " hello  ",  # leading space plus two trailing spaces
        "hello world",
        "请考试我的软件!12345",
    ):
        assert value == enc.decode(enc.encode(value))
        assert value == enc.decode(enc.encode_ordinary(value))
114
+
115
+
116
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(text=st.text())
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_roundtrip(make_enc: Callable[[], tiktoken.Encoding], text):
    """Property test: decoding the encoding of arbitrary text returns that text.

    The example budget comes from MAX_EXAMPLES, matching the other Hypothesis
    tests in this file, so a single setting controls all of them.
    """
    enc = make_enc()

    assert text == enc.decode(enc.encode(text))
123
+
124
+
125
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_single_token_roundtrip(make_enc: Callable[[], tiktoken.Encoding]):
    """Every decodable token id below ``n_vocab`` must re-encode to itself.

    Ids for which ``decode_single_token_bytes`` raises ``KeyError`` are skipped.
    """
    enc = make_enc()

    for token_id in range(enc.n_vocab):
        try:
            raw = enc.decode_single_token_bytes(token_id)
        except KeyError:
            pass
        else:
            assert enc.encode_single_token(raw) == token_id
135
+
136
+
137
+ # ====================
138
+ # Special tokens
139
+ # ====================
140
+
141
+
142
def test_special_token():
    """Exercise ``allowed_special`` / ``disallowed_special`` handling on cl100k_base.

    First part: text containing special-token strings raises ``ValueError``
    unless ``disallowed_special=()`` is passed. Second part: a special token id
    appears in the output exactly when its string is in ``allowed_special``.
    """
    enc = tiktoken.get_encoding("cl100k_base")

    eot = enc.encode_single_token("<|endoftext|>")
    assert eot == enc.eot_token
    fip = enc.encode_single_token("<|fim_prefix|>")
    fim = enc.encode_single_token("<|fim_middle|>")

    text = "<|endoftext|> hello <|fim_prefix|>"
    assert eot not in enc.encode(text, disallowed_special=())
    rejecting_kwargs = (
        {},
        {"disallowed_special": "all"},
        {"disallowed_special": {"<|endoftext|>"}},
        {"disallowed_special": {"<|fim_prefix|>"}},
    )
    for kwargs in rejecting_kwargs:
        with pytest.raises(ValueError):
            enc.encode(text, **kwargs)

    text = "<|endoftext|> hello <|fim_prefix|> there <|fim_middle|>"
    # Each case: (encode keyword arguments, ids expected in the output);
    # the remaining special ids must be absent.
    cases = (
        ({"disallowed_special": ()}, ()),
        ({"allowed_special": "all", "disallowed_special": ()}, (eot, fip, fim)),
        ({"allowed_special": "all", "disallowed_special": "all"}, (eot, fip, fim)),
        ({"allowed_special": {"<|fim_prefix|>"}, "disallowed_special": ()}, (fip,)),
        ({"allowed_special": {"<|endoftext|>"}, "disallowed_special": ()}, (eot,)),
        ({"allowed_special": {"<|fim_middle|>"}, "disallowed_special": ()}, (fim,)),
    )
    for kwargs, expected_present in cases:
        tokens = enc.encode(text, **kwargs)
        for special_id in (eot, fip, fim):
            if special_id in expected_present:
                assert special_id in tokens
            else:
                assert special_id not in tokens
191
+
192
+
193
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(text=st.text())
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_special_ordinary(make_enc, text: str):
    """Property test: ``encode_ordinary`` equals ``encode`` with nothing disallowed."""
    enc = make_enc()
    ordinary_ids = enc.encode_ordinary(text)
    unrestricted_ids = enc.encode(text, disallowed_special=())
    assert ordinary_ids == unrestricted_ids
199
+
200
+
201
+ # ====================
202
+ # Batch encoding
203
+ # ====================
204
+
205
+
206
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_batch_encode(make_enc: Callable[[], tiktoken.Encoding]):
    """Batch encoders must agree with their one-at-a-time counterparts."""
    enc = make_enc()
    batches = (
        ["hello world"],
        ["hello world", "goodbye world"],
    )

    for batch in batches:
        assert enc.encode_batch(batch) == [enc.encode(item) for item in batch]

    for batch in batches:
        assert enc.encode_ordinary_batch(batch) == [enc.encode_ordinary(item) for item in batch]
220
+
221
+
222
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(batch=st.lists(st.text()))
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_batch_roundtrip(make_enc: Callable[[], tiktoken.Encoding], batch):
    """Property test: batch encode matches per-item encode and round-trips.

    The example budget comes from MAX_EXAMPLES, matching the other Hypothesis
    tests in this file, so a single setting controls all of them.
    """
    enc = make_enc()

    encoded = enc.encode_batch(batch)
    assert encoded == [enc.encode(t) for t in batch]
    decoded = enc.decode_batch(encoded)
    assert decoded == batch
test_helpers.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bisect
2
+ import functools
3
+ import os
4
+
5
+ import pytest
6
+
7
+ import tiktoken
8
+
9
# Integer read from the TIKTOKEN_MAX_EXAMPLES environment variable ("100" if unset).
MAX_EXAMPLES: int = int(os.environ.get("TIKTOKEN_MAX_EXAMPLES", "100"))

ENCODINGS = ["r50k_base", "cl100k_base"]
SOME_ENCODINGS = ["cl100k_base"]


def _encoding_params(names):
    """Build one ``pytest.param`` per name, holding a zero-argument encoding factory.

    Each param's ``id`` is the encoding name; the factory defers the
    ``tiktoken.get_encoding`` call until it is invoked.
    """
    params = []
    for name in names:
        factory = functools.partial(tiktoken.get_encoding, name)
        params.append(pytest.param(factory, id=name))
    return params


ENCODING_FACTORIES = _encoding_params(ENCODINGS)
SOME_ENCODING_FACTORIES = _encoding_params(SOME_ENCODINGS)
21
+
22
+
test_misc.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import sys
3
+
4
+ import tiktoken
5
+
6
+
7
def test_encoding_for_model():
    """Check the encoding name that ``encoding_for_model`` returns for four models."""
    expected_encoding = {
        "gpt2": "gpt2",
        "text-davinci-003": "p50k_base",
        "text-davinci-edit-001": "p50k_edit",
        "gpt-3.5-turbo-0301": "cl100k_base",
    }
    for model_name, encoding_name in expected_encoding.items():
        assert tiktoken.encoding_for_model(model_name).name == encoding_name
16
+
17
+
18
def test_optional_blobfile_dependency():
    """Importing tiktoken in a fresh interpreter must not load ``blobfile``.

    The check runs in a subprocess so modules already imported by the test
    process cannot affect ``sys.modules``.
    """
    script = """
import tiktoken
import sys
assert "blobfile" not in sys.modules
"""
    command = [sys.executable, "-c", script]
    subprocess.check_call(command)
test_offsets.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable
2
+
3
+ import hypothesis
4
+ import pytest
5
+ from hypothesis import strategies as st
6
+
7
+ import tiktoken
8
+
9
+ from .test_helpers import MAX_EXAMPLES, SOME_ENCODING_FACTORIES
10
+
11
+
12
+ def _common_prefix_len(a, b):
13
+ i = 0
14
+ while i < len(a) and i < len(b) and a[i] == b[i]:
15
+ i += 1
16
+ return i
17
+
18
+
19
def _token_offsets_reference(enc, tokens):
    """Reference computation of per-token offsets into the decoded text.

    The offset for token ``i`` is the length of the common prefix between the
    full strict decode of ``tokens`` and the decode of ``tokens[:i]`` with
    ``errors="ignore"``.
    """
    full_text = enc.decode(tokens, errors="strict")
    return [
        _common_prefix_len(full_text, enc.decode(tokens[:end], errors="ignore"))
        for end in range(len(tokens))
    ]
26
+
27
+
28
@pytest.mark.parametrize("make_enc", SOME_ENCODING_FACTORIES)
@hypothesis.given(data=st.data())
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_offsets(make_enc: Callable[[], tiktoken.Encoding], data):
    """Property test: ``decode_with_offsets`` agrees with the slow reference.

    Draws 1 to 20 token ids that exist in the encoding, normalises them through
    a decode/encode cycle, and compares the reported offsets with
    ``_token_offsets_reference``.
    """
    enc = make_enc()

    # Build the id set once so the filter below is a hash lookup rather than
    # a linear scan over two dict value views for every drawn integer.
    known_ids = set(enc._special_tokens.values())
    known_ids.update(enc._mergeable_ranks.values())

    tokens_st = st.lists(
        st.integers(0, enc.n_vocab - 1).filter(lambda x: x in known_ids),
        min_size=1,
        max_size=20,
    )
    tokens = data.draw(tokens_st)

    # This is a dumb hack to make sure that our tokens are a valid UTF-8 string
    # We could potentially drop this, see the TODO in decode_with_offsets
    tokens = enc.encode(enc.decode(tokens, errors="ignore"), allowed_special="all")
    assert enc.decode_with_offsets(tokens)[1] == _token_offsets_reference(enc, tokens)
47
+
48
+
49
def test_basic_offsets():
    """Pin ``decode_with_offsets`` output for a few fixed cl100k_base prompts.

    For each prompt the decoded text must equal the prompt and the offsets
    list must equal the pinned value.
    """
    enc = tiktoken.get_encoding("cl100k_base")

    # Each case: (prompt, extra keyword arguments for encode, expected offsets).
    cases = (
        ("hello world", {}, [0, 5]),
        (
            "hello world<|endoftext|> green cow",
            {"allowed_special": "all"},
            [0, 5, 11, 24, 30],
        ),
        (
            "我非常渴望与人工智能一起工作",
            {},
            [0, 1, 2, 3, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13],
        ),
        # contains the interesting tokens b'\xe0\xae\xbf\xe0\xae' and b'\xe0\xaf\x8d\xe0\xae'
        # in which \xe0 is the start of a 3-byte UTF-8 character
        (
            "நடிகர் சூர்யா",
            {},
            [0, 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 11, 12, 12],
        ),
        # contains the interesting token b'\xa0\xe9\x99\xa4'
        # in which \xe9 is the start of a 3-byte UTF-8 character and \xa0 is a continuation byte
        (" Ġ除", {}, [0, 1]),
    )
    for prompt, encode_kwargs, expected_offsets in cases:
        decoded, offsets = enc.decode_with_offsets(enc.encode(prompt, **encode_kwargs))
        assert decoded == prompt
        assert offsets == expected_offsets
test_simple_public.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import sys
3
+
4
+ import tiktoken
5
+
6
+
7
def test_simple():
    """Pin basic encode/decode results and round-trip token ids below 10,000."""
    # Note that there are more actual tests, they're just not currently public :-)
    gpt2 = tiktoken.get_encoding("gpt2")
    gpt2_hello = [31373, 995]
    assert gpt2.encode("hello world") == gpt2_hello
    assert gpt2.decode(gpt2_hello) == "hello world"
    assert gpt2.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]

    cl100k = tiktoken.get_encoding("cl100k_base")
    cl100k_hello = [15339, 1917]
    assert cl100k.encode("hello world") == cl100k_hello
    assert cl100k.decode(cl100k_hello) == "hello world"
    assert cl100k.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]

    for encoding in map(tiktoken.get_encoding, tiktoken.list_encoding_names()):
        for token_id in range(10_000):
            as_bytes = encoding.decode_single_token_bytes(token_id)
            assert encoding.encode_single_token(as_bytes) == token_id
23
+
24
+
25
def test_encoding_for_model():
    """Check the encoding name that ``encoding_for_model`` returns for four models."""
    model_to_encoding = (
        ("gpt2", "gpt2"),
        ("text-davinci-003", "p50k_base"),
        ("text-davinci-edit-001", "p50k_edit"),
        ("gpt-3.5-turbo-0301", "cl100k_base"),
    )
    for model, expected_name in model_to_encoding:
        resolved = tiktoken.encoding_for_model(model)
        assert resolved.name == expected_name
34
+
35
+
36
def test_optional_blobfile_dependency():
    """Run a fresh interpreter and assert that importing tiktoken leaves ``blobfile`` unloaded."""
    child_source = """
import tiktoken
import sys
assert "blobfile" not in sys.modules
"""
    # check_call raises CalledProcessError if the child's assertion fails.
    subprocess.check_call([sys.executable, "-c", child_source])
tokenizador.json ADDED
File without changes
tokenizer.json ADDED
File without changes