Spaces:
Sleeping
Sleeping
File size: 3,158 Bytes
6f61bb9 96a6d6a 6f61bb9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import os
from pathlib import Path
import tiktoken
import yaml
# Module-level tiktoken encoding shared by get_token() and
# get_first_n_tokens_and_remaining(); resolved once at import time.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
class GradioInputs:
    """
    DTO that bundles the raw "inputs" coming from gradio into one object,
    so downstream functions do not need a long positional signature.
    It will be converted in GradioMethodService.
    """

    def __init__(
        self,
        apikey_textbox,
        source_textbox,
        source_target_textbox,
        qa_textbox,
        gpt_model_textbox,
        language_textbox,
        chatbot,
        history,
    ):
        # Values of the textbox widgets, stored under the same names.
        self.apikey_textbox = apikey_textbox
        self.source_textbox = source_textbox
        self.source_target_textbox = source_target_textbox
        self.qa_textbox = qa_textbox
        self.gpt_model_textbox = gpt_model_textbox
        self.language_textbox = language_textbox

        # Conversation state, stored by reference (no copy is made).
        self.chatbot = chatbot
        self.history = history
class Prompt:
    """
    Three-part prompt container.

    The final prompt is "{prompt_prefix}{prompt_main}{prompt_suffix}".
    When the prompt is too long, {prompt_main} is the part that gets split
    into multiple pieces so each request fits the context length of the LLM.

    Example: for a Youtube-timestamped summary
        prompt_prefix: Youtube video type definitions, title
        prompt_main:   transcript (splittable)
        prompt_suffix: task description / constraints
    """

    def __init__(self, prompt_prefix, prompt_main, prompt_suffix):
        # Parts are stored untouched; this class does not assemble or split them.
        self.prompt_prefix = prompt_prefix
        self.prompt_main = prompt_main
        self.prompt_suffix = prompt_suffix
def get_project_root():
    """
    Return the directory two levels above this module's file as a Path.
    """
    this_file = Path(__file__)
    containing_dir = this_file.parent
    return containing_dir.parent
def get_config():
    """
    Load the project configuration.

    Reads `config/config.yaml` under the project root, then overlays the
    top-level keys of `config/config_secret.yaml` when that file exists.
    A file that is empty (parses to None) is treated as an empty mapping
    instead of breaking the merge.

    Returns:
        dict with the merged configuration; secret values win on key clashes.

    Raises:
        FileNotFoundError: if `config/config.yaml` itself is missing.
    """
    config_dir = os.path.join(get_project_root(), 'config')

    # NOTE(review): yaml.load with FullLoader is kept for compatibility with
    # the existing config files; both files must stay trusted, local input.
    # Consider yaml.safe_load if no Python-specific tags are needed.
    with open(os.path.join(config_dir, 'config.yaml'), encoding='utf-8') as f:
        config = yaml.load(f, Loader=yaml.FullLoader) or {}

    # Only the open() is guarded: a missing secret file is fine, but errors
    # while parsing or merging it should not be mistaken for "file absent".
    try:
        with open(os.path.join(config_dir, 'config_secret.yaml'), encoding='utf-8') as f:
            config_secret = yaml.load(f, Loader=yaml.FullLoader)
    except FileNotFoundError:
        config_secret = None  # okay to not have config_secret.yaml

    # An empty secret file parses to None; dict.update(None) would raise.
    if config_secret:
        config.update(config_secret)
    return config
def get_token(text: str):
    """
    Return how many tokens `text` encodes to with the module-level tokenizer.

    `disallowed_special=()` is passed so that special-token strings appearing
    in `text` do not make the encoder raise.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    return len(token_ids)
def get_first_n_tokens_and_remaining(text: str, n: int):
    """
    Split `text` at a token boundary.

    Returns a 2-tuple of strings: the decoded first `n` tokens and the decoded
    rest. If `text` has at most `n` tokens, the second string is empty.

    NOTE(review): the cut is made on token ids, not characters; a cut that
    lands inside a multi-byte character may not round-trip exactly - confirm
    with non-ASCII input.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    head_ids = token_ids[:n]
    tail_ids = token_ids[n:]
    head_text = tokenizer.decode(head_ids)
    tail_text = tokenizer.decode(tail_ids)
    return head_text, tail_text
def provide_text_with_css(text, color):
    """
    Wrap `text` in a colored, rounded HTML <span> badge.

    Recognized colors are "red", "green", "blue" and "yellow"; any other
    value returns `text` unchanged. `text` is interpolated as-is, without
    HTML escaping.
    """
    # (color name, CSS background-color, CSS text color)
    badge_palette = (
        ("red", "red", "white"),
        ("green", "#307530", "white"),
        ("blue", "#7b7bff", "white"),
        ("yellow", "yellow", "black"),
    )
    for name, background, foreground in badge_palette:
        if color == name:
            return (
                f'<span style="background-color: {background}; color: {foreground}; '
                f'padding: 3px; border-radius: 8px;">{text}</span>'
            )
    return text
if __name__ == '__main__':
    # Manual smoke test: ask for more tokens than the sample contains, so the
    # whole sentence should come back first and the remainder should be empty.
    # Other ad-hoc checks kept for reference:
    #   print(get_token("def get_token(text: str)"))
    #   print(get_token("<non-ASCII sample text>"))
    sample_text = "This is a string with some text to tokenize."
    print(get_first_n_tokens_and_remaining(sample_text, 30))
|