HW_5_calc / app.py
PetrovDE's picture
app change 4
7bb6f3c
raw
history blame
4.17 kB
import os
import sys
import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
from typing import Union
import re
class Prompter(object):
def generate_prompt(
self,
instruction: str,
label: Union[None, str] = None,
) -> str:
res = f"{instruction}\nAnswer: "
if label:
res = f"{res}{label}"
return res
def get_response(self, output: str) -> str:
return (
output.split("Answer:")[1]
.strip()
.replace("/", "\u00F7")
.replace("*", "\u00D7")
)
load_8bit = False # for Colab
base_model = "nickypro/tinyllama-15M"
lora_weights = "./chkp"
share_gradio = True
if torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
try:
if torch.backends.mps.is_available():
device = "mps"
except:
pass
prompter = Prompter()
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
if device == "cuda":
model = LlamaForCausalLM.from_pretrained(
base_model,
load_in_8bit=load_8bit,
torch_dtype=torch.float16,
device_map="auto",
)
model = PeftModel.from_pretrained(
model,
lora_weights,
torch_dtype=torch.float16,
device_map={"": 0},
)
elif device == "mps":
model = LlamaForCausalLM.from_pretrained(
base_model,
device_map={"": device},
torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(
model,
lora_weights,
device_map={"": device},
torch_dtype=torch.float16,
)
else:
model = LlamaForCausalLM.from_pretrained(
base_model, device_map={"": device}, low_cpu_mem_usage=True
)
model = PeftModel.from_pretrained(
model,
lora_weights,
device_map={"": device},
)
# if not load_8bit:
# model.half()
model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
model = torch.compile(model)
def evaluate(
instruction,
temperature=0.1,
top_p=0.75,
top_k=40,
num_beams=4,
max_new_tokens=15,
stream_output=True,
**kwargs,
):
prompt = prompter.generate_prompt(instruction)
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
generation_config = GenerationConfig(
temperature=temperature,
top_p=top_p,
top_k=top_k,
num_beams=num_beams,
**kwargs,
)
generate_params = {
"input_ids": input_ids,
"generation_config": generation_config,
"return_dict_in_generate": True,
"output_scores": True,
"max_new_tokens": max_new_tokens,
}
# Without streaming
with torch.no_grad():
generation_output = model.generate(
input_ids=input_ids,
generation_config=generation_config,
return_dict_in_generate=True,
output_scores=True,
max_new_tokens=max_new_tokens,
)
s = generation_output.sequences[0]
output = tokenizer.decode(s, skip_special_tokens=True).strip()
yield prompter.get_response(output)
gr.Interface(
fn=evaluate,
inputs=[
gr.components.Textbox(
lines=1,
label="Arithmetic",
placeholder="What is 63303235 + 20239503",
),
gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
gr.components.Slider(
minimum=1, maximum=1024, step=1, value=512, label="Max tokens"
),
],
outputs=[
gr.Textbox(
lines=5,
label="Output",
)
],
title="test model",
description="Это пример реализации из goat", # noqa: E501
).queue().launch(share=share_gradio)