Spaces:
Running
on
Zero
Running
on
Zero
DongfuJiang
committed on
Commit
·
301c810
1
Parent(s):
83ba2c4
update
Browse files- app.py +107 -0
- requirements.txt +5 -0
- utils.py +85 -0
app.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import sys
|
3 |
+
import os
|
4 |
+
from datasets import load_dataset
|
5 |
+
from typing import List
|
6 |
+
import utils
|
7 |
+
|
8 |
+
|
9 |
+
DESCRIPTIONS = """# 🐯TIGERScore
|
10 |
+
|
11 |
+
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
|
12 |
+
|
13 |
+
### [**Website**](https://tiger-ai-lab.github.io/TIGERScore/) [**Paper**](https://arxiv.org/abs/2310.00752) [**Code**](https://github.com/TIGER-AI-Lab/TIGERScore) [**TIGERScore-7B**](https://huggingface.co/TIGER-Lab/TIGERScore-7B-V1.0) [**TIGERScore-13B**](https://huggingface.co/TIGER-Lab/TIGERScore-13B-V1.0)
|
14 |
+
|
15 |
+
"""
|
16 |
+
|
17 |
+
# Build the list of clickable examples shown under the input form.
# Rows come from the MetricInstruct training mix, shuffled with a fixed seed so
# every restart of the app offers the same examples.
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train_mix")
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
EXAMPLES = []
# Column order must match the `inputs` list passed to gr.Examples below.
fields = ["task", "instruction", "input_context", "hypo_output"]
print("Loading examples...")
for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
    # Check the bound before filtering: previously the `break` sat after the
    # `continue`, so a skipped row at the boundary let the scan run on past it.
    if i > 100:
        break
    # Keep only rows where every displayed field is non-empty.
    if all(ex[field] for field in fields):
        EXAMPLES.append([ex[field] for field in fields])
|
28 |
+
|
29 |
+
def tigerscore(task, input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
    """Gradio callback: evaluate one hypothesis output with TIGERScore.

    Forwards the four text inputs positionally and the three decoding
    settings as keyword arguments to ``utils.generate`` and returns whatever
    that call returns.
    """
    decoding_options = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }
    return utils.generate(
        task,
        input_context,
        generation_instruction,
        hypo_output,
        **decoding_options,
    )
|
36 |
+
|
37 |
+
def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
    """Echo a selected example row back into the input components.

    Args:
        task: Task name to select in the task dropdown.
        inst_textbox: Instruction text of the example.
        input_textbox: Input-context text of the example.
        hypo_output_textbox: Hypothesis-output text of the example.

    Returns:
        A 4-tuple: an update that sets the dropdown value to ``task``,
        followed by the three text values unchanged.
    """
    # gr.update is the component-agnostic update helper. The per-component
    # `gr.Dropdown.update` classmethod no longer exists in Gradio 4, and the
    # project's requirements do not pin a Gradio version.
    return gr.update(value=task), inst_textbox, input_textbox, hypo_output_textbox
|
39 |
+
|
40 |
+
# Initialize the model: load the "7b" TIGERScore checkpoint once at import
# time, before the UI is built. utils.load_tigerscore stores the model and
# tokenizer in utils' module globals, which utils.generate reads per request.
print("Loading TIGERScore model...")
utils.load_tigerscore("7b")
|
43 |
+
|
44 |
+
# Build the demo UI: input form, decoding options, output box, examples and
# citation, then start the queued app.
with gr.Blocks(theme='gradio/soft') as demo:
    gr.Markdown(DESCRIPTIONS)
    gr.Markdown("## TIGERScore Inputs")

    tasks_dropdown = gr.Dropdown(label="Task", choices=utils.tasks + ["other"], value="translation", show_label=True)
    inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
    input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
    hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)

    with gr.Row():
        clear_button = gr.Button('Clear', variant='primary')
        submit_button = gr.Button('Submit', variant='primary')

    with gr.Accordion(label='Advanced options', open=False):
        # Labels previously said "fuser", which does not describe this model.
        max_new_tokens = gr.Slider(
            label='Max new tokens TIGERScore can generate',
            minimum=256,
            maximum=1024,
            step=1,
            value=512,
        )
        temperature = gr.Slider(
            label='Temperature of TIGERScore generation',
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.7,
        )
        top_p = gr.Slider(
            label='Top-p of TIGERScore generation',
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        )

    gr.Markdown("## TIGERScore Outputs")
    evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)

    # Argument order follows tigerscore(task, input_context,
    # generation_instruction, hypo_output, ...): input context comes before
    # the instruction here even though the form shows them the other way round.
    submit_button.click(
        fn=tigerscore,
        inputs=[tasks_dropdown, input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
        outputs=evaluation_output_textbox,
    )

    # The Clear button used to have no handler at all; it now empties the
    # three free-text inputs and the previous evaluation output.
    clear_button.click(
        fn=lambda: ("", "", "", ""),
        inputs=None,
        outputs=[inst_textbox, input_textbox, hypo_output_textbox, evaluation_output_textbox],
    )

    batch_examples = gr.Examples(
        examples=EXAMPLES,
        fn=get_examples,
        cache_examples=True,
        examples_per_page=5,
        inputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
    )

    # Fenced so Markdown shows the BibTeX entry verbatim instead of reflowing
    # it; authors are joined with "and" as BibTeX requires.
    citations = gr.Markdown("""## Citation
```bibtex
@article{jiang2023TIGERScore,
  title={TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks},
  author={Dongfu Jiang and Yishan Li and Ge Zhang and Wenhao Huang and Bill Yuchen Lin and Wenhu Chen},
  journal={arXiv preprint arXiv:2310.00752},
  year={2023}
}
```""")

# Queue requests (at most 20 waiting) and start serving.
demo.queue(max_size=20).launch()
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers==4.33.2
|
2 |
+
datasets==2.10.0
|
3 |
+
torch
|
4 |
+
accelerate
|
5 |
+
gradio
|
utils.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
2 |
+
from string import Template
|
3 |
+
import torch
|
4 |
+
|
5 |
+
# Prompt pieces in string.Template syntax; generate() fills the ${...}
# placeholders. The wording is part of the model's input, so it must not be
# edited casually.

# Opening instruction line; ${task} is the task name.
FINETUNE_INST = (
    "You are evaluating errors in a model-generated output "
    "for a(an) ${task} task."
)

# Body of the prompt: the instruction/source/output triple followed by the
# fixed description of what each reported error must contain.
FINETUNE_INPUT = (
    "Task instruction: ${generation_instruction}\n"
    "Source: ${input_context}\n"
    "Model-generated Output: ${hypothesis_output}\n"
    "\n"
    "Based on the given task instruction and source, identify errors in this model-generated output.\n"
    "For each error you give in the response, please also elaborate the following information:\n"
    "- error location (the words that are wrong in the output)\n"
    "- error aspect it belongs to.\n"
    "- explanation why it's an error, and the correction suggestions.\n"
    '- severity of the error ("Major" or "Minor").\n'
    "- reduction of score (between 0.5 and 5 given the severity of the error)\n"
    "\n"
    "Your evaluation output:\n"
)
|
21 |
+
|
22 |
+
# Task names the demo offers; exposed to the UI as `utils.tasks`.
tasks = [
    "translation", "summarization", "data2text",
    "mathQA", "long-form QA", "instruction-following",
]

# Model size key -> Hugging Face Hub repository id of the checkpoint.
TIGERScore_model_map = {
    size: f"TIGER-Lab/TIGERScore-{size.upper()}-V1.0"
    for size in ("7b", "13b")
}

# Populated by load_tigerscore(); both stay None until it has been called.
tigerscore_model = tigerscore_tokenizer = None
|
37 |
+
|
38 |
+
def load_tigerscore(model_size):
    """Load a TIGERScore checkpoint and its tokenizer into module globals.

    Args:
        model_size: Key of ``TIGERScore_model_map`` ("7b" or "13b").

    Raises:
        ValueError: If ``model_size`` is not a known key.

    Side effects:
        Rebinds the module-level ``tigerscore_model`` and
        ``tigerscore_tokenizer``, which ``generate`` reads.
    """
    # Explicit check rather than `assert`: asserts are stripped under
    # `python -O`, which would turn a bad size into a bare KeyError.
    if model_size not in TIGERScore_model_map:
        raise ValueError(
            f"Unknown TIGERScore model size {model_size!r}; "
            f"expected one of {sorted(TIGERScore_model_map)}."
        )
    model_name = TIGERScore_model_map[model_size]

    global tigerscore_model, tigerscore_tokenizer
    tigerscore_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    tigerscore_tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,
    )
|
51 |
+
|
52 |
+
def generate(task, input_context, generation_instruction, hypo_output, **generate_kwargs):
    """Run TIGERScore on one instruction/source/output triple.

    Args:
        task: Task name substituted into ``FINETUNE_INST``.
        input_context: Source text the evaluated output was produced from.
        generation_instruction: Instruction given to the evaluated system.
        hypo_output: Model-generated output to be evaluated.
        **generate_kwargs: Extra or overriding keyword arguments for
            ``tigerscore_model.generate`` (they are applied on top of the
            defaults below, e.g. ``max_new_tokens``, ``temperature``, ``top_p``).

    Returns:
        The decoded text generated after the prompt, with special tokens
        skipped.

    Raises:
        RuntimeError: If ``load_tigerscore`` has not been called yet.
    """
    # Fail with a clear message instead of "'NoneType' object is not callable".
    if tigerscore_model is None or tigerscore_tokenizer is None:
        raise RuntimeError(
            "TIGERScore model is not loaded; call load_tigerscore() before generate()."
        )

    inst_part = Template(FINETUNE_INST).substitute(task=task)
    input_part = Template(FINETUNE_INPUT).substitute(
        generation_instruction=generation_instruction,
        input_context=input_context,
        hypothesis_output=hypo_output,
    )
    # Normalise surrounding whitespace so the prompt ends in exactly one newline.
    prompt = (inst_part + "\n" + input_part).strip("\n ") + "\n"

    encodings = tigerscore_tokenizer(prompt, return_tensors="pt")
    input_ids = encodings["input_ids"].to(tigerscore_model.device)
    attention_mask = encodings["attention_mask"].to(tigerscore_model.device)

    gen_params = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "max_new_tokens": 512,
        "do_sample": True,
        # NOTE(review): top_k=1 keeps only the single most likely token, so
        # sampling is presumably greedy in effect and temperature/top_p
        # overrides likely change nothing -- confirm this is intended.
        "top_k": 1,
        "num_return_sequences": 1,
    }
    gen_params.update(generate_kwargs)
    output = tigerscore_model.generate(**gen_params)

    # The returned sequence starts with the prompt tokens; decode only what
    # follows them.
    prompt_length = len(input_ids[0])
    return tigerscore_tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
|
77 |
+
|
78 |
+
if __name__ == "__main__":
    # Manual smoke test: score one German-to-English translation and print
    # the model's error analysis.
    # generate() reads the module-level model and tokenizer, which remain
    # None until load_tigerscore() runs; without this call the script crashed.
    load_tigerscore("7b")

    task = "translation"
    input_context = "Der künftige EM-Cheforganisator Philipp Lahm soll laut Grindel im DFB-Präsidium mitarbeiten."
    generation_instruction = "Translate the following text from German to English."
    hypo_output = "According to Grindel, the future head of the European Championships, Philipp Lahm, is to participate in the DFB Presidency."
    output = generate(task, input_context, generation_instruction, hypo_output)
    print(output)
|
85 |
+
|