# Hugging Face Space: Ibrahimarain/llama-2-13B
# Space status when captured: Runtime error
# File size: 5,476 Bytes

import llama_cpp
import llama_cpp.llama_tokenizer
from llama_cpp import Llama

import gradio as gr
from loguru import logger
import psutil
from ctransformers import AutoModelForCausalLM,AutoTokenizer


# Prompt scaffold using the [INST] / <<SYS>> tag layout. The ``{question}``
# placeholder is filled with the user's message through ``str.format``.
# Note the single trailing space after ``<</SYS>>``: it is part of the text.
prompt_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful assistant for a crowdfunding platform called GiveSendGo. "
    "Your goal is to gather essential information for campaign and generate a "
    "title and sample pitch of atleast 1000 words for the campaign.\n"
    "<</SYS>> \n"
    "\n"
    "{question} [/INST]\n"
)

# Hugging Face Hub repository that holds the GGUF weights, and the specific
# quantisation file to download from it. (An earlier local-path value for
# ``model_loc`` was immediately overwritten and has been dropped.)
model_loc = "TheBloke/Llama-2-13B-chat-GGUF"
model_file = "llama-2-13b-chat.Q5_K_M.gguf"

# predict() and generate() call ``llama.create_chat_completion_openai_v1``,
# which is a method of llama-cpp-python's ``Llama``. The ctransformers model
# that used to be created here does not provide that method, so every request
# failed; load the same weights through llama_cpp instead.
llama = Llama.from_pretrained(
    repo_id=model_loc,
    filename=model_file,
    n_ctx=4096,  # same 4096-token context window as before
    chat_format="llama-2",
    verbose=False,
)


# llama = llama_cpp.Llama.from_pretrained(
#     #repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
#     #filename="*q8_0.gguf",
#     mode_path=model_loc,
#     model_type="llama",
#     context_length=4096,
#     max_new_tokens=2048,
#     filename="llama-2-13b-chat.Q5_K_M.gguf",
#     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
#     verbose=False
# )

# llama = Llama(
# model_path=model_loc,
# max_tokens=4096,
# n_ctx=4096,
# verbose=False,
# )

# Build a stop marker from the prompt template: take the second-to-last
# non-blank line, keep the part before its first ":" and re-append the colon.
non_blank_lines = [line for line in prompt_template.splitlines() if line.strip()]
penultimate_line = non_blank_lines[-2]
stop_string = penultimate_line.split(":")[0] + ":"

logger.debug(f"{stop_string=}")

# Number of physical cores minus one, never less than 1.
# psutil.cpu_count() returns None when the core count cannot be determined;
# the previous ``psutil.cpu_count(...) - 1`` raised TypeError in that case,
# so an unknown count is now treated like a single core.
physical_cores = psutil.cpu_count(logical=False)
cpu_count: int = max(1, physical_cores - 1) if physical_cores else 1
logger.debug(f"{cpu_count=}")



# Model name passed as ``model=`` to ``llama.create_chat_completion_openai_v1``
# in predict() and generate().
# NOTE(review): the weights configured in this file are Llama-2, so this
# OpenAI-style name is presumably only a label — confirm it is not used to
# select a model.
model = "gpt-3.5-turbo"

def predict(message, history):
    """Stream a chat reply for the Gradio ``ChatInterface``.

    Args:
        message: The user's newest message.
        history: Earlier turns as ``(user_message, assistant_message)`` pairs;
            may be empty.

    Yields:
        The reply text accumulated so far, once for every streamed chunk that
        carries content.
    """
    # The system prompt is sent exactly once, at the front. It used to be
    # appended inside the history loop, so it was repeated for every past turn
    # and was missing entirely on the first message of a conversation.
    messages = [
        {"role": "system", "content": prompt_template.format(question=message)}
    ]

    for user_message, assistant_message in history or []:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})

    # The new message is appended after the loop so that it is sent even when
    # ``history`` is empty, and only once rather than after every past turn.
    messages.append({"role": "user", "content": message})

    response = llama.create_chat_completion_openai_v1(
        model=model,
        messages=messages,
        # Constrain the reply to a JSON object with these string fields.
        response_format={
            "type": "json_object",
            "schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "sample_pitch": {"type": "string"},
                    "amount": {"type": "string"},
                    "location": {"type": "string"},
                },
                "required": ["title", "sample_pitch", "amount", "location"],
            },
        },
        temperature=0.7,
        stream=True,
    )

    # Yield the growing text, not the individual delta, on every chunk.
    text = ""
    for chunk in response:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            yield text


def generate(message):
    """Produce a complete (non-streamed) JSON-constrained reply for ``message``.

    Args:
        message: The user's request text.

    Returns:
        The assistant's reply content on success. If anything raises, the
        exception is logged and a string describing it (``"exc=..."``) is
        returned instead, so callers always receive a value.
    """
    try:
        messages = [
            {"role": "system", "content": prompt_template.format(question=message)},
            {"role": "user", "content": message},
        ]

        response = llama.create_chat_completion_openai_v1(
            model=model,
            messages=messages,
            # Constrain the reply to a JSON object with these string fields.
            response_format={
                "type": "json_object",
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "sample_pitch": {"type": "string"},
                        "amount": {"type": "string"},
                        "location": {"type": "string"},
                    },
                    "required": ["title", "sample_pitch", "amount", "location"],
                },
            },
            temperature=0.7,
            stream=False,
        )

        logger.debug(f"{response}")

        # A non-streamed completion carries its text under ``message``;
        # ``delta`` exists only on streamed chunks, so reading it here raised.
        return response.choices[0].message.content

    except Exception as exc:
        # Best-effort boundary for the API endpoint: log the failure and hand
        # the description back. It was previously built but never returned.
        logger.error(exc)
        return f"{exc=}"

def predict_api(message):
    """Handler for the "api" button: run ``generate`` and label its result.

    Args:
        message: Text taken from the input box.

    Returns:
        The string ``"json: text="`` followed by the ``repr`` of whatever
        ``generate`` returned.
    """
    logger.debug("message=" + repr(message))
    text = generate(message)
    logger.debug("text::text=" + repr(text))

    return "json: text=" + repr(text)



# JavaScript handed to ``gr.Blocks(js=...)``: if the page URL does not already
# end with ``?__theme=dark``, it redirects to the same URL with that suffix.
js = (
    "function () {\n"
    "  gradioURL = window.location.href\n"
    "  if (!gradioURL.endsWith('?__theme=dark')) {\n"
    "    window.location.replace(gradioURL + '?__theme=dark');\n"
    "  }\n"
    "}"
)

# Stylesheet handed to ``gr.Blocks(css=...)``: hides the page footer and sets
# a 100% height rule.
# NOTE(review): ``full-height`` has no leading dot, so it is an element
# selector; presumably ``.full-height`` (a class) was intended — confirm.
css = (
    "\n"
    "footer {\n"
    "    visibility: hidden;\n"
    "}\n"
    "full-height {\n"
    "    height: 100%;\n"
    "}\n"
)

with gr.Blocks(css=css, js=js, theme=gr.themes.Soft()) as demo:
    # Chat UI backed by the streaming predict() generator.
    gr.ChatInterface(
        fn=predict,
        examples=[
            "What is the capital of France?",
            "Who was the first person on the moon?",
        ],
    )

    # Accordion created with visible=False; its button exposes predict_api
    # under the endpoint name "api".
    with gr.Accordion(label="For Chat/Translation API", open=False, visible=False):
        input_text = gr.Text()
        api_btn = gr.Button(value="Go", variant="primary")
        out_text = gr.Text()

    api_btn.click(
        fn=predict_api,
        inputs=input_text,
        outputs=out_text,
        api_name="api",
    )


if __name__ == "__main__":
    # Turn on request queuing, then start the server with debug=True and
    # share=True.
    queued_app = demo.queue()
    queued_app.launch(debug=True, share=True)