|
import os
import subprocess

import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
|
# Install flash-attn at start-up, before the model below requests the
# "flash_attention_2" attention implementation.
#
# The child environment is the parent's environment plus one extra variable.
# Passing only the extra variable as `env` would replace the whole environment
# (PATH, HOME, virtualenv and proxy settings), so pip could be missing or
# misconfigured in the child process.
#
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells flash-attn's
# build to skip compiling CUDA kernels locally - confirm against flash-attn.
#
# The exit status is deliberately not checked (best effort, as before).
subprocess.run(
    ["pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)
|
|
|
# Registry of loaded models, keyed by Hugging Face model id.
# Each model is loaded with remote code enabled, an automatically chosen
# dtype and the flash-attention-2 implementation, then moved to CUDA and
# switched to eval mode.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        torch_dtype="auto",
        _attn_implementation="flash_attention_2",
    )
    .cuda()
    .eval()
    for checkpoint in ("microsoft/Phi-3.5-vision-instruct",)
}
|
|
|
# Registry of input processors, keyed by the same Hugging Face model ids
# used for the model registry.
processors = {
    checkpoint: AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in ("microsoft/Phi-3.5-vision-instruct",)
}
|
|
|
# Extra keyword arguments holding a bfloat16 dtype setting.
# NOTE(review): nothing in the visible code reads this dict - confirm whether
# it is still needed.
kwargs = {'torch_dtype': torch.bfloat16}
|
|
|
# Chat-template fragments used to assemble the prompt sent to the model:
# the marker opening the user turn, the marker opening the assistant turn,
# and the marker closing a turn.
user_prompt = "<|user|>\n"
assistant_prompt = "<|assistant|>\n"
prompt_suffix = "<|end|>\n"
|
|
|
@spaces.GPU
def run_example(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
    """Answer a question about an image with the selected model.

    Args:
        image: Image data accepted by ``PIL.Image.fromarray`` (presumably
            the NumPy array produced by the ``gr.Image`` input - confirm
            against the UI wiring).
        text_input: Question or instruction about the image. ``None`` is
            treated as an empty question.
        model_id: Key into the module-level ``models`` and ``processors``
            registries.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are removed before decoding).

    Raises:
        gr.Error: If no image was provided.
        KeyError: If ``model_id`` is not a key of ``models`` / ``processors``.
    """
    # Without this guard, a missing upload fails inside Image.fromarray with
    # an opaque error; gr.Error carries a readable message instead.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    model = models[model_id]
    processor = processors[model_id]

    # Avoid interpolating the literal text "None" into the prompt.
    question = "" if text_input is None else text_input
    prompt = f"{user_prompt}<|image_1|>\n{question}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response
|
|
|
# Custom stylesheet handed to gr.Blocks. Its single rule targets the element
# whose DOM id is "output": fixed height, scrollable overflow, light border.
css = """
  #output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""
|
|
|
# Build the demo UI: inputs (image, model choice, question, submit button)
# in the left column, the model's answer in the right column, and a small
# gallery of example inputs below them.
with gr.Blocks(css=css) as demo:
    gr.Markdown("## Phi-3.5 Vision Instruct Demo with Example Inputs")

    with gr.Tab(label="Phi-3.5 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(
                    choices=list(models),
                    label="Model",
                    value="microsoft/Phi-3.5-vision-instruct",
                )
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                # elem_id="output" gives this component the DOM id that the
                # "#output" rule in the stylesheet targets; without it the
                # rule matches nothing.
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        # Each example is [image file path, question]; clicking one fills the
        # image and question inputs.
        examples = [
            ["image1.jpeg", "What does this painting tell us explain in detail?"],
            ["image2.jpg", "What does this painting tell us explain in detail?"],
            ["image3.jpg", "Describe the scene in this picture."],
        ]

        gr.Examples(
            examples=examples,
            inputs=[input_img, text_input],
            examples_per_page=3,
        )

        # Run the model on submit and show its answer in the output box.
        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
|
|
|
|
|
# Enable request queuing with the queue's API routes closed, then start the
# server in debug mode with the API documentation page hidden.
demo.queue(api_open=False).launch(debug=True, show_api=False)