Elster / app.py
davanstrien's picture
davanstrien HF staff
use Markdown for generated model response
0d2a813
raw
history blame
2.84 kB
import gradio as gr
import transformers
import torch
import json
from transformers import AutoTokenizer
import os
from huggingface_hub import login
import spaces
# Authenticate with the Hugging Face Hub using the token from the environment.
# NOTE(review): when HF_TOKEN is unset, None is passed to login(); confirm that
# this is acceptable in the deployment environment.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)
# Load the tokenizer and a bfloat16 text-generation pipeline for the model.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_special_tokens=True,
)
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device="cuda",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)
# Load the model configuration.
# The encoding is given explicitly so that decoding the JSON file does not
# depend on the host's locale default.
with open("model_configs.json", "r", encoding="utf-8") as f:
    model_configs = json.load(f)
# Settings for the model selected above; raises KeyError if it is not listed.
model_config = model_configs[model_id]
# Extract instruction: the prompt handed to the pipeline to make the model
# produce a user instruction.
extract_input = model_config["extract_input"]
# Token ids passed as `eos_token_id` to the pipeline: the tokenizer's own EOS
# token plus the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id]
terminators.append(tokenizer.convert_tokens_to_ids("<|eot_id|>"))
def _generate_completion(prompt):
    """Sample a continuation of *prompt* and return only the new text."""
    outputs = pipeline(
        prompt,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1,
        top_p=1,
    )
    # "generated_text" begins with the prompt itself, so slice it off.
    return outputs[0]["generated_text"][len(prompt):]


def _first_line(text):
    """Return the first non-blank line of *text*, or "" if there is none."""
    # Strip leading whitespace first: if the sample opens with a newline,
    # a plain split("\n")[0] would yield an empty instruction.
    return text.lstrip().split("\n", 1)[0].rstrip()


@spaces.GPU
def generate_instruction_response():
    """Generate a user instruction and the model's response to it.

    The model is first prompted with ``extract_input`` so that it writes the
    user instruction itself; the first line of that output is then wrapped in
    a user/assistant chat template and sent back to the model to obtain the
    response.

    Returns:
        A ``(user_message, assistant_response)`` pair of strings.
    """
    sanitized_instruction = _first_line(_generate_completion(extract_input))

    # NOTE(review): the template already starts with <|begin_of_text|>;
    # confirm the pipeline's tokenizer does not prepend a second BOS token.
    response_template = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{sanitized_instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""
    assistant_response = _generate_completion(response_template)

    user_message = sanitized_instruction
    return user_message, assistant_response
# Heading shown at the top of the Gradio page.
title = "Magpie demo"
# Markdown blurb shown under the title. Built from adjacent literals so each
# sentence sits on its own source line; the resulting string is unchanged.
description = (
    "\n"
    "This Gradio demo showcases the approach described in the Magpie paper. "
    "Magpie is a data synthesis pipeline that creates high-quality alignment data without relying on prompt engineering or seed questions. "
    "Instead, it generates instruction data by prompting aligned LLMs with a pre-query template. "
    "This method does not prompt the model with a question or starting query. "
    "Instead, it uses the model's pre-query template to generate instructions. "
    "Essentially, the model is given only the template until a user instruction starts, and then it generates the instruction and the response.\n"
    "In this demo, you can see how the model generates a user instruction and a model response.\n"
    "You can learn more about the approach [in the paper](https://huggingface.co/papers/2406.08464).\n"
)
# Build the Gradio UI: no input widgets, two outputs (the generated
# instruction as plain text, the generated response rendered as Markdown).
iface = gr.Interface(
    generate_instruction_response,
    inputs=[],
    outputs=[
        gr.Text(label="Generated User Instruction"),
        gr.Markdown(label="Generated Model Response"),
    ],
    description=description,
    title=title,
)
# Start serving the app.
iface.launch(debug=True)