Spaces:

Symato
/

tomtat

Running

File size: 7,892 Bytes

#!/usr/bin/env python3
import utils; from utils import *
import os, sys, lzma, json, pprint, time, subprocess

# Backend selector, read from the `thinker` environment variable (default "405b").
thinker = os.environ.get("thinker", "405b")

# Sampling temperature, read from the `temperature` environment variable (default 0.1).
# 0.0 is the conservative setting (good for coding and correct syntax).
TEMPERATURE = float(os.environ.get("temperature", 0.1))

# Label of the active LLM host; this is the default value.
LLM_HOST = "gemini"
TKNZ_RATIO = 1

# Gemini model identifiers for the "pro" and "flash" variants.
GEMINI_MODEL = "gemini-1.5-pro-002"
FLASH_MODEL = "gemini-1.5-flash-002"

# Upper bound on generated tokens per request (8k).
MAX_OUTPUT_TOKENS = 8 * 1024

# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Prompting.ipynb
# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Streaming.ipynb
import google.generativeai as genai # pip install -U -q google-generativeai
# Log file that `chat` appends prompts, responses and exceptions to.
# `location__` is not defined in this file; presumably it comes from `from utils import *`.
llm_log_filename = f"{location__}/.cache/llm.log"

# SECURITY: the API key is read from the GOOGLE_API_KEY environment variable instead of
# being committed to the source file. The key that used to be hard-coded here has been
# exposed publicly and should be revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Client for the Gemini "pro" model, sharing the module-wide output limit and temperature.
GEMINI_CLIENT = genai.GenerativeModel(
    GEMINI_MODEL,
    generation_config=genai.GenerationConfig(
        max_output_tokens=MAX_OUTPUT_TOKENS,
        temperature=TEMPERATURE,
    ),
)

def chat(prompt, history=None, use_cache=False, stream=False):
    """Send `prompt` to GEMINI_CLIENT and return the resulting message list.

    Args:
        prompt: Text passed to `generate_content`.
        history: Optional list of earlier ``{"role": ..., "content": ...}`` messages.
            It is only prepended to the returned list; it is NOT sent to the model.
        use_cache: Accepted but not used by this function.
        stream: When true, return the result of
            ``generate_content(prompt, stream=True)`` directly; nothing is logged.

    Returns:
        For ``stream=False``: a new list made of `history`, the user message and the
        assistant message. For ``stream=True``: the client's streaming response.

    Raises:
        Exception: anything raised by the request (or by reading ``res.text``) is
            written to the log, printed, and re-raised unchanged.
    """
    if stream:
        return GEMINI_CLIENT.generate_content(prompt, stream=True)

    # A None default avoids sharing one list object between calls; concatenation
    # builds a new list, so the caller's `history` is never mutated.
    previous = [] if history is None else history
    messages = previous + [{"role": "user", "content": prompt}]  # fake history

    # Prompts and replies are arbitrary Unicode text, so the log is written as UTF-8
    # rather than with the locale's default encoding.
    with open(llm_log_filename, "at", encoding="utf-8") as f:
        f.write(f"\n- - - [ {GEMINI_MODEL} ] - - -\n\nPROMPT:\n{prompt}\n")

    try:
        res = GEMINI_CLIENT.generate_content(prompt, request_options={"timeout": 6000})
        with open(llm_log_filename, "at", encoding="utf-8") as f:
            # The raw response is logged first so it is kept even if `res.text` raises.
            f.write(f"\nRESPONSE:\n{res}\n")
            f.write(f"\nCONTENT:\n{res.text}\n")
        messages += [{"role": "assistant", "content": res.text}]
        return messages

    except Exception as e:
        with open(llm_log_filename, "at", encoding="utf-8") as f:
            f.write(f"\nEXCEPTION:\n{e}\n")
        print(f"\nEXCEPTION:\n{e}\n")
        raise  # bare raise keeps the original traceback


# Client for the Gemini "flash" model: 8k output tokens, module-wide temperature.
FLASH_CLIENT = genai.GenerativeModel(
    FLASH_MODEL,
    generation_config=genai.GenerationConfig(
        temperature=TEMPERATURE,
        max_output_tokens=8 * 1024,
    ),
)

# Disabled implementation that would route requests through FLASH_CLIENT:
# def flash_chat(prompt, history=[], use_cache=False, stream=False):
#         res = FLASH_CLIENT.generate_content(prompt)
#         return [{"role": "assistant", "content": res.text}]
# While the version above stays commented out, flash_chat is just another name for
# chat, so calls to it are served by GEMINI_CLIENT.
flash_chat = chat

def who_are_you():
    """Print the current LLM_HOST label twice on one line, colored red."""
    label = f"{RED}{LLM_HOST}{RESET}  "
    print(label + label)


# Fail fast on an unsupported backend: otherwise neither CTXLEN nor thinker_chat
# would be defined and the module would die later with an unrelated NameError.
if thinker not in ("gemini", "70b", "405b"):
    raise ValueError(f"Unsupported thinker {thinker!r}; expected 'gemini', '70b' or '405b'")

if thinker == "gemini": # gemini pro
    CTXLEN = 1024*64 # Gemini is easygoing here: a 128k or 1m ctxlen would both be OK
    thinker_chat = chat

elif thinker in ("70b", "405b"): # exact match; a substring test would also accept "" or "0b"
    cache_filename = f"{location__}/.cache/thinker.jsonl.xz"
    lock_filename = f"{location__}/.cache/thinker.lock"
    log_filename = f"{location__}/.cache/thinker.log"

    ## Load thinker_cache
    # The cache file stores line pairs: a JSON-encoded message list (the key)
    # followed by the JSON-encoded content generated for it.
    lines = []
    if os.path.exists(cache_filename):
        with lzma.open(cache_filename, "rt") as f: # `with` closes the handle after reading
            lines = list(f)
    if len(lines) % 2 != 0:
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        raise ValueError(f"{cache_filename}: expected an even number of lines, got {len(lines)}")
    thinker_cache = {
        key_line[:-1]: json.loads(value_line) # each line ends with \n, so [:-1] strips it
        for key_line, value_line in zip(lines[0::2], lines[1::2])
    }
    lines = None # Done loading

    # https://docs.together.ai/docs/chat-models#hosted-models
    # Each entry is "<model id> <context length>k"; it is split into its two parts below.
    model = {
        "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo  128k", # $3.50 / 1m tokens(*)
         "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo   128k", # $0.88 / 1m tokens(*)
    }[thinker]

    model, CTXLEN = model.strip().split()
    LLM_HOST = model

    CTXLEN = int(CTXLEN[:-1]) # drop the trailing "k"
    CTXLEN = min(CTXLEN, 64) # max 64k ctxlen
    CTXLEN = CTXLEN*1024 - MAX_OUTPUT_TOKENS # keep room for the generated tokens

    from together import Together
    # SECURITY: the API key is read from the TOGETHER_API_KEY environment variable (the
    # same variable the non-streaming request in thinker_chat already uses) instead of
    # being committed to the source file. The key that used to be hard-coded here has
    # been exposed publicly and should be revoked.
    together_client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
    ###
    stops = ["<|eot_id|>","<|eom_id|>","</answer>","</output>"]
    def _append_cache_entry(messages_jsonl, content):
        """Append one key/content line pair to the cache file while holding the lock file."""
        # O_CREAT | O_EXCL creates the lock atomically, so two writers cannot both pass
        # an "exists?" check and then both write. Up to 5 attempts, 0.2s apart (~1 second).
        for _ in range(5):
            try:
                os.close(os.open(lock_filename, os.O_CREAT | os.O_EXCL | os.O_WRONLY)) # lock
                break
            except FileExistsError: # someone else is writing, wait
                time.sleep(0.2)
        else:
            # Message (Vietnamese): "Locked for more than 1 second; {lock_filename} can be
            # deleted if this error keeps recurring". Raised explicitly because a bare
            # `assert` is stripped under `python -O`.
            raise AssertionError(f"Bị lock hơn 1 second, có thể xóa {lock_filename} nếu lỗi này lặp lại")

        try:
            # NOTE(review): uses the locale's default text encoding, like the code that
            # loads this file at import time; change both together if UTF-8 is required.
            with lzma.open(cache_filename,"at") as f: # write
                f.write(f"{messages_jsonl}\n{json.dumps(content, ensure_ascii=False)}\n")
        finally:
            # Always unlock, even if the write failed, so later calls are not blocked.
            try:
                os.remove(lock_filename) # unlock
            except FileNotFoundError:
                pass # the lock file was already removed (e.g. by hand)

    def thinker_chat(prompt, history=None, stream=False, use_cache=True, testing=False):
        """Ask the Together-hosted model, with an on-disk response cache.

        Args:
            prompt: User message text.
            history: Optional list of earlier ``{"role": ..., "content": ...}`` messages.
                Used for the non-streaming request and for the cache key; the streaming
                request sends only `prompt`.
            stream: When true, log the prompt and return the streaming result of
                ``together_client.chat.completions.create(..., stream=True)``. The cache
                is neither read nor written.
            use_cache: When true and this exact message list is cached, return the
                cached content instead of calling the API.
            testing: When true (and no cached content was used), skip the API call and
                use the placeholder "testing testing"; the placeholder is not cached.

        Returns:
            For ``stream=False``: a new list made of `history`, the user message and the
            assistant message. For ``stream=True``: the client's streaming response.

        Raises:
            AssertionError: the cache lock file stayed in place for about one second.
            Exception: anything raised by the API call is logged, printed and re-raised.
        """
        if stream:
            with open(log_filename, "at", encoding="utf-8") as f:
                f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            return together_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_OUTPUT_TOKENS,
                temperature=TEMPERATURE,
                top_p=0.7, top_k=50,
                repetition_penalty=1.2, stop=stops,
                stream=True
            )

        # A None default avoids sharing one list object between calls; concatenation
        # builds a new list, so the caller's `history` is never mutated.
        previous = [] if history is None else history
        messages = previous + [{"role": "user", "content": prompt}]
        # The JSON text of the whole conversation is the cache key.
        messages_jsonl = json.dumps(messages, ensure_ascii=False)
        cache_found = (messages_jsonl in thinker_cache)

        if use_cache and cache_found:
            print(f"{YELLOW}<<< cached content >>>{RESET}")
            content = thinker_cache[messages_jsonl]

        elif testing:
            print(f"{RED}<<< testing content >>>{RESET}")
            content = "testing testing"

        else:
            print(f"{GREEN}<<< fresh content >>>{RESET}")
            # Prompts and replies are arbitrary Unicode text, so the log is UTF-8.
            with open(log_filename, "at", encoding="utf-8") as f:
                f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            try:
                # NOTE: a fresh client is built from TOGETHER_API_KEY for every request
                # instead of reusing `together_client`.
                response = Together(api_key=os.environ.get('TOGETHER_API_KEY')).chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=MAX_OUTPUT_TOKENS,
                    temperature=TEMPERATURE,
                    top_p=0.7, top_k=50,
                    repetition_penalty=1.2, stop=stops,
                    logprobs=1, stream=False
                )
            except Exception as e:
                with open(log_filename, "at", encoding="utf-8") as f:
                    f.write(f"\nEXCEPTION:\n{e}\n")
                print(f"\nEXCEPTION:\n{e}\n")
                raise # bare raise keeps the original traceback

            content = response.choices[0].message.content
            with open(log_filename, "at", encoding="utf-8") as f:
                f.write(f"\nRESPONSE:\n{response}\n")
                f.write(f"\nCONTENT:\n{content}\n")

            thinker_cache[messages_jsonl] = content # update new generated content
            _append_cache_entry(messages_jsonl, content) # persist it for later runs

        messages += [{"role": "assistant", "content": content}]
        return messages


# Append the effective context length (in units of 1024 tokens) to the host label,
# then print the label.
LLM_HOST = LLM_HOST + f"__{round(CTXLEN/1024)}k_ctxlen"
who_are_you()



from prompts import summary_template
from prompts import contextual_template, clean_view_template

# Caching is enabled unless the `cache` environment variable is set to something other than "1".
USE_CACHE = os.environ.get("cache", "1") == "1"


def extract_keyphrases_figures_summary(text):
    """Fill `summary_template` with `text`, send it through `chat`, return the raw reply.

    Texts shorter than 80 characters are skipped and yield "". The input is echoed
    in green and the reply in magenta; the `chat` call is wrapped in
    `utils.reset_timer` / `utils.measure_time`.
    """
    timer_name = "extract_keyphrases_figures_summary"

    if len(text) < 80:
        return ""

    prompt = summary_template.format(text=text)
    print(f"{GREEN}{text}{RESET}")

    utils.reset_timer(timer=timer_name)
    messages = chat(prompt, use_cache=USE_CACHE)
    utils.measure_time("", timer=timer_name)

    # The last message in the returned list is the assistant's answer.
    raw = messages[-1]["content"]
    print(f"{MAGENTA}{raw}{RESET}")
    return raw


def gen_contextual(document, chunk):
    """Return the stripped `thinker_chat` reply to `contextual_template` filled with `document` and `chunk`."""
    messages = thinker_chat(
        contextual_template.format(document=document, chunk=chunk),
        use_cache=USE_CACHE,
    )
    # The last message in the returned list is the assistant's answer.
    return messages[-1]["content"].strip()


def gen_clean_view(document):
    """Return the stripped `chat` reply to `clean_view_template` filled with `document`."""
    messages = chat(
        clean_view_template.format(document=document),
        use_cache=USE_CACHE,
    )
    # The last message in the returned list is the assistant's answer.
    return messages[-1]["content"].strip()


if __name__ == "__main__":
    # Smoke test: send one question to the selected thinker and print the answer.
    # An optional first CLI argument names a file whose content is used as the question.
    filename = sys.argv[1] if len(sys.argv) > 1 else None  # no bare `except:` needed

    if filename:
        with open(filename, "rt") as f:  # `with` closes the file after reading
            q = f.read()
    else:
        q = "What's your name? Who created you?"

    utils.reset_timer()
    res = thinker_chat(q, use_cache=False)
    utils.measure_time(LLM_HOST + " ")

    print(f"{CYAN}{q}{RESET}", end="\n\n")
    print(res[-1]["content"])