khulnasoft committed on
Commit
c37b750
·
verified ·
1 Parent(s): 26ec8ac

Create get_token_ids.py

Browse files
Files changed (1) hide show
  1. get_token_ids.py +92 -0
get_token_ids.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+
3
# Mapping of model names to their respective encodings.
#
# The inner literal records which named tiktoken encoding each model uses;
# the comprehension resolves every name through tiktoken.get_encoding so the
# lookup call lives in exactly one place. Key order matches the inner literal.
ENCODINGS = {
    model: tiktoken.get_encoding(encoding_name)
    for model, encoding_name in {
        "gpt-4": "cl100k_base",
        "gpt-3.5-turbo": "cl100k_base",
        "gpt-3.5-turbo-0301": "cl100k_base",
        "text-davinci-003": "p50k_base",
        "text-davinci-002": "p50k_base",
        "text-davinci-001": "r50k_base",
        "text-curie-001": "r50k_base",
        "text-babbage-001": "r50k_base",
        "text-ada-001": "r50k_base",
        "davinci": "r50k_base",
        "curie": "r50k_base",
        "babbage": "r50k_base",
        "ada": "r50k_base",
    }.items()
}
19
+
20
# Mapping of model names to their respective maximum context lengths.
# NOTE(review): the values are presumably counted in tokens -- confirm
# against the provider's model documentation before relying on them.
MAX_LENGTH = {
    "gpt-4": 8192,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-0301": 4096,
    "text-davinci-003": 4096,
    "text-davinci-002": 4096,
    "text-davinci-001": 2049,
    "text-curie-001": 2049,
    "text-babbage-001": 2049,
    "text-ada-001": 2049,
    "davinci": 2049,
    "curie": 2049,
    "babbage": 2049,
    "ada": 2049,
}
36
+
37
def count_tokens(model_name: str, text: str) -> int:
    """
    Count the number of tokens for a given model and text.

    Parameters:
    - model_name (str): The name of the model; must be a key of ENCODINGS.
    - text (str): The input text.

    Returns:
    - int: The number of tokens produced by encoding the text.

    Raises:
    - ValueError: If model_name is not a key of ENCODINGS.
    """
    try:
        encoding = ENCODINGS[model_name]
    except KeyError:
        # Report unknown models as ValueError, not a bare KeyError.
        raise ValueError(f"Model name '{model_name}' not found in encodings.") from None
    # NOTE(review): encode() is called with its default arguments; confirm
    # whether text containing special-token strings needs special handling.
    return len(encoding.encode(text))
51
+
52
def get_max_context_length(model_name: str) -> int:
    """
    Get the maximum context length for a given model.

    Parameters:
    - model_name (str): The name of the model; must be a key of MAX_LENGTH.

    Returns:
    - int: The maximum context length stored in MAX_LENGTH for the model.

    Raises:
    - ValueError: If model_name is not a key of MAX_LENGTH.
    """
    try:
        return MAX_LENGTH[model_name]
    except KeyError:
        # Report unknown models as ValueError, not a bare KeyError.
        raise ValueError(
            f"Model name '{model_name}' not found in max length dictionary."
        ) from None
65
+
66
def get_token_ids_for_text(model_name: str, text: str) -> list:
    """
    Get unique token IDs for a given text using the specified model's encoding.

    Parameters:
    - model_name (str): The name of the model; must be a key of ENCODINGS.
    - text (str): The input text.

    Returns:
    - list: The unique token IDs, sorted in ascending order.

    Raises:
    - ValueError: If model_name is not a key of ENCODINGS.
    """
    try:
        encoding = ENCODINGS[model_name]
    except KeyError:
        # Report unknown models as ValueError, not a bare KeyError.
        raise ValueError(f"Model name '{model_name}' not found in encodings.") from None
    # A set removes duplicates; sorting makes the output order reproducible
    # instead of depending on set iteration order.
    return sorted(set(encoding.encode(text)))
81
+
82
+ def get_token_ids_for_task_parsing(model_name):
83
+ """
84
+ Get unique token IDs for task parsing.
85
+
86
+ Parameters:
87
+ - model_name (str): The name of the model.
88
+
89
+ Returns:
90
+ - list: A list of unique token IDs for task parsing.
91
+ """
92
+ text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text