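"""Gradio demo for a 15M-parameter LLaMA model with a fine-tuned PEFT adapter
for simple "a + b" arithmetic prompts."""
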
import sys
from typing import Union

import gradio as gr
import torch
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer


# 15M-parameter LLaMA-style base model; the fine-tuned adapter below is loaded on top of it.
base_model = "nickypro/tinyllama-15M"


class Prompter:
    """Builds the arithmetic prompt and extracts the answer from model output."""

    def generate_prompt(
        self,
        instruction: str,
        label: Union[None, str] = None,
    ) -> str:
        # Prompt format: "<instruction>\nAnswer: <label>" (label appended only when provided).
        res = f"{instruction}\nAnswer: "

        if label:
            res = f"{res}{label}"

        return res

    def get_response(self, output: str) -> str:
        # Keep the text after "Answer:" and display "/" and "*" as the ÷ and × symbols.
        return output.split("Answer:")[1].strip().replace("/", "\u00F7").replace("*", "\u00D7")

# Load the base model, then attach the fine-tuned adapter from a local checkpoint directory.
model = LlamaForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float32,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "checkpoint-16000",
    torch_dtype=torch.float32,
)

model.eval()
# torch.compile is available from PyTorch 2.0 onward and is skipped on Windows, where it is not supported.
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

# Standard LLaMA tokenizer; pad id 0 and left padding for decoder-only generation.
tokenizer = LlamaTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer')
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"


def generate_answers(instructions, model, tokenizer):
    prompter = Prompter()
    raw_answers = []

    for instruction in instructions:
        prompt = prompter.generate_prompt(instruction)
        inputs = tokenizer(prompt, return_tensors="pt")

        # Move inputs to the device the model was placed on by device_map="auto".
        input_ids = inputs["input_ids"].to(model.device)

        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                return_dict_in_generate=True,
                output_scores=True,
                pad_token_id=0,
                eos_token_id=tokenizer.eos_token_id,
                max_new_tokens=16,
            )
        s = generation_output.sequences[0]
        # The decoded sequence echoes the prompt followed by the generated answer.
        raw_answers.append(tokenizer.decode(s, skip_special_tokens=True).strip())

    return raw_answers


def evaluate(instruction):
    return generate_answers([instruction], model, tokenizer)[0]
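
# Hypothetical usage (the generated text depends on the trained adapter):
#   evaluate("12345 + 67890")
#   -> "12345 + 67890\nAnswer: 80235"   # decoded output includes the prompt plus the answer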


if __name__ == "__main__":
    gr.Interface(
        fn=evaluate,
        inputs=[
            gr.components.Textbox(
                lines=1,
                label="Arithmetic",
                placeholder="63303235 + 20239503",
            )
        ],
        outputs=[
            gr.Textbox(
                lines=5,
                label="Output",
            )
        ],
        title="Arithmetic LLaMA",
        description="A 15M-parameter LLaMA model fine-tuned on a + b arithmetic tasks.",
    ).queue().launch(server_name="0.0.0.0", share=True)
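
# To run the demo (assuming this script is saved as app.py): `python app.py`.
# Gradio listens on 0.0.0.0 (port 7860 by default); share=True also prints a
# temporary public share link.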