File size: 3,158 Bytes
6f61bb9
 
 
 
 
 
96a6d6a
6f61bb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
from pathlib import Path

import tiktoken
import yaml

# Module-level tiktoken encoding for the "gpt-3.5-turbo-16k" model. It is built
# once at import time and shared by get_token() and
# get_first_n_tokens_and_remaining() below.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")


class GradioInputs:
    """
    DTO that formalizes the format of the "inputs" coming from gradio, so that
    they can be passed around as one object instead of a long signature.
    It will be converted in GradioMethodService.

    Every constructor argument is stored, unchanged, on an attribute of the
    same name.
    """

    def __init__(
        self,
        apikey_textbox,
        source_textbox,
        source_target_textbox,
        qa_textbox,
        gpt_model_textbox,
        language_textbox,
        chatbot,
        history,
    ):
        # Values of the individual textbox inputs.
        self.apikey_textbox = apikey_textbox
        self.source_textbox = source_textbox
        self.source_target_textbox = source_target_textbox
        self.qa_textbox = qa_textbox
        self.gpt_model_textbox = gpt_model_textbox
        self.language_textbox = language_textbox

        # Conversation-related inputs.
        self.chatbot = chatbot
        self.history = history


class Prompt:
    """
    Three-part prompt container.

    The full prompt is assembled as "{prompt_prefix}{prompt_main}{prompt_suffix}".
    When the prompt is too long, {prompt_main} is the part that gets split into
    multiple pieces so each one fits the context length of the LLM.

    Example: for a Youtube-timestamped summary
        prompt_prefix: Youtube Video types definitions, Title
        prompt_main: transcript (splittable)
        prompt_suffix: task description / constraints
    """

    def __init__(self, prompt_prefix, prompt_main, prompt_suffix):
        # Fixed text placed before the main part.
        self.prompt_prefix = prompt_prefix
        # The splittable body of the prompt.
        self.prompt_main = prompt_main
        # Fixed text placed after the main part.
        self.prompt_suffix = prompt_suffix


def get_project_root() -> Path:
    """Return the parent of the directory that contains this file."""
    module_dir = Path(__file__).parent
    return module_dir.parent


def _load_yaml_mapping(path):
    """Parse the YAML file at *path*; an empty document is returned as ``{}``."""
    with open(path, encoding='utf-8') as f:
        # NOTE(review): FullLoader is kept from the original; these are local
        # project files. Switch to yaml.safe_load if they can ever be untrusted.
        data = yaml.load(f, Loader=yaml.FullLoader)
    # yaml.load yields None for an empty file, which would break dict.update().
    return {} if data is None else data


def get_config():
    """
    Load the project configuration.

    Reads ``config/config.yaml`` under the project root, then overlays the
    top-level keys of ``config/config_secret.yaml`` on top of it when that
    file exists. Keys in the secret file win on conflict.

    Returns:
        The merged configuration. An empty YAML file contributes no keys
        instead of raising.

    Raises:
        FileNotFoundError: if ``config/config.yaml`` itself is missing.
    """
    config_dir = get_project_root() / 'config'
    config = _load_yaml_mapping(config_dir / 'config.yaml')
    try:
        # Only the optional file read is guarded, so a FileNotFoundError
        # cannot be swallowed from anywhere else.
        config_secret = _load_yaml_mapping(config_dir / 'config_secret.yaml')
    except FileNotFoundError:
        return config  # okay to not have config_secret.yaml
    config.update(config_secret)
    return config


def get_token(text: str) -> int:
    """
    Return how many tokens *text* encodes to with the module-level tokenizer.

    ``disallowed_special=()`` is passed so that text which looks like a special
    token is encoded as ordinary text rather than rejected.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    return len(token_ids)


def get_first_n_tokens_and_remaining(text: str, n: int):
    """
    Split *text* after its first ``n`` tokens.

    Returns:
        A 2-tuple ``(head, remaining)``: the decoded text of the first ``n``
        tokens, and the decoded text of every token after them. When *text*
        has at most ``n`` tokens, the second slice is empty.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    head_ids, remaining_ids = token_ids[:n], token_ids[n:]
    # NOTE(review): a cut that lands inside a multi-byte character may not
    # round-trip exactly when each half is decoded on its own - confirm
    # against tiktoken's decode documentation.
    head_text = tokenizer.decode(head_ids)
    remaining_text = tokenizer.decode(remaining_ids)
    return head_text, remaining_text


def provide_text_with_css(text, color):
    """
    Wrap *text* in a rounded, colored HTML ``<span>`` badge.

    Supported values of *color* are "red", "green", "blue" and "yellow".
    Any other value returns *text* unchanged. *text* is interpolated as-is
    (no HTML escaping).
    """
    # (color name, background-color, text color)
    badge_styles = (
        ("red", "red", "white"),
        ("green", "#307530", "white"),
        ("blue", "#7b7bff", "white"),
        ("yellow", "yellow", "black"),
    )
    for name, background, foreground in badge_styles:
        if color == name:
            return f'<span style="background-color: {background}; color: {foreground}; padding: 3px; border-radius: 8px;">{text}</span>'
    return text


if __name__ == '__main__':
    # Manual smoke test. Other sample calls, kept for reference:
    #   print(get_token("def get_token(text: str)"))
    #   print(get_token("皆さんこんにちは"))  # Japanese sample text ("Hello, everyone")
    sample_text = "This is a string with some text to tokenize."
    print(get_first_n_tokens_and_remaining(sample_text, 30))