import sys
from typing import Union

import torch
import gradio as gr
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer
# Base checkpoint: a 15M-parameter LLaMA-architecture model.
base_model = "nickypro/tinyllama-15M"
class Prompter(object):
    """Builds the prompt sent to the model and parses the generated answer."""

    def generate_prompt(
        self,
        instruction: str,
        label: Union[None, str] = None,
    ) -> str:
        res = f"{instruction}\nAnswer: "
        if label:
            res = f"{res}{label}"
        return res

    def get_response(self, output: str) -> str:
        # Text after "Answer:", with "/" and "*" displayed as the ÷ and × signs.
        return output.split("Answer:")[1].strip().replace("/", "\u00F7").replace("*", "\u00D7")
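# Example of the prompt format (assuming the LoRA adapter was trained on this layout):
#   Prompter().generate_prompt("2 + 3", "5")  ->  "2 + 3\nAnswer: 5"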
# Load the base model, then apply the LoRA adapter weights from the local
# fine-tuning checkpoint on top of it.
model = LlamaForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float32,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "checkpoint-16000",
    torch_dtype=torch.float32,
)
model.eval()

# torch.compile speeds up inference on PyTorch 2.x (not supported on Windows).
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)
# Use a standard LLaMA tokenizer; pad with token id 0 so padding stays distinct
# from eos, and left-pad because decoder-only models generate from the right.
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
def generate_answers(instructions, model, tokenizer):
    """Decode an answer for each instruction string."""
    prompter = Prompter()
    raw_answers = []
    for instruction in instructions:
        prompt = prompter.generate_prompt(instruction)
        inputs = tokenizer(prompt, return_tensors="pt")
        # Move inputs to the model's device (device_map="auto" may place it on GPU).
        input_ids = inputs["input_ids"].to(model.device)
        generation_output = model.generate(
            input_ids=input_ids,
            return_dict_in_generate=True,
            output_scores=True,
            pad_token_id=0,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=16,
        )
        s = generation_output.sequences[0]
        raw_answers.append(tokenizer.decode(s, skip_special_tokens=True).strip())
    return raw_answers
def evaluate(instruction):
    return generate_answers([instruction], model, tokenizer)[0]
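# Example call (output is illustrative; the decoded text includes the prompt itself):
#   evaluate("63303235 + 20239503")  ->  "63303235 + 20239503\nAnswer: 83542738"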
if __name__ == "__main__":
    gr.Interface(
        fn=evaluate,
        inputs=[
            gr.Textbox(
                lines=1,
                label="Arithmetic",
                placeholder="63303235 + 20239503",
            )
        ],
        outputs=[
            gr.Textbox(
                lines=5,
                label="Output",
            )
        ],
        title="Arithmetic LLaMA",
        description="A 15M-parameter LLaMA model fine-tuned on a+b arithmetic tasks.",
    ).queue().launch(server_name="0.0.0.0", share=True)
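# Assuming this file is saved as app.py, `python app.py` starts the demo; Gradio
# serves on port 7860 by default, and share=True also creates a temporary public link.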