import os
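# Install the development transformers wheel bundled with this Space;
# PaliGemma 2 support requires a transformers build newer than 4.46.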
os.system('pip install ./transformers-4.47.0.dev0-py3-none-any.whl')
import gradio as gr
import PIL.Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
import torch
import spaces
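# LoRA fine-tune of PaliGemma 2 on VQAv2, and the base checkpoint it derives from.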
adapter_id = "merve/paligemma2-3b-vqav2"
model_id = "gv-hf/paligemma2-3b-pt-448"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
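# Weights come from the fine-tuned repo; the processor is loaded from the base
# checkpoint (the adapter repo presumably reuses the base preprocessing config).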
model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id).eval().to(device)
processor = PaliGemmaProcessor.from_pretrained(model_id)
###### Transformers Inference
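# @spaces.GPU requests a ZeroGPU device for the duration of each call.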
@spaces.GPU
def infer(
    text: str,
    image: PIL.Image.Image,
    max_new_tokens: int,
) -> str:
    # PaliGemma checkpoints are steered by a task prefix; "answer en " selects
    # English visual question answering.
    text = "answer en " + text
    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    result = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # The decoded sequence echoes the prompt, so strip it and keep only the answer.
    return result[0][len(text):].lstrip("\n")
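# Hypothetical local smoke test (not executed by the Space; assumes the example
# image shipped with this repo is present):
#   image = PIL.Image.open("./howto.jpg")
#   print(infer("What is the graphic about?", image, max_new_tokens=40))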
######## Demo
INTRO_TEXT = """## PaliGemma 2 demo\n\n
| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
| [Blogpost](https://huggingface.co/blog/paligemma)
| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
|\n\n
PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
model for transfer to a wide range of vision-language tasks such as image and short-video captioning, visual question
answering, text reading, object detection, and object segmentation.
\n\n
This Space runs a PaliGemma 2 model LoRA fine-tuned on VQAv2 by the Hugging Face team, with inference via transformers.
See the [Blogpost](https://huggingface.co/blog/paligemma2), the project
[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
\n\n
**This is an experimental research model.** Make sure to add appropriate guardrails when using this model in applications.
"""
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)
    with gr.Column():
        question = gr.Text(label="Question")
        image = gr.Image(label="Input Image", type="pil", height=500)
        caption_btn = gr.Button(value="Submit")
        text_output = gr.Text(label="Text Output")
        tokens = gr.Slider(
            label="Max New Tokens",
            info="Increase for longer generations.",
            minimum=20,
            maximum=160,
            value=80,
            step=10,
        )
        caption_inputs = [
            question,
            image,
            tokens,
        ]
        caption_outputs = [
            text_output,
        ]
        # Wire the button to the GPU-backed inference function.
        caption_btn.click(
            fn=infer,
            inputs=caption_inputs,
            outputs=caption_outputs,
        )
        examples = [
            ["What is the graphic about?", "./howto.jpg", 60],
            ["What is the password?", "./password.jpg", 20],
            ["Who is in this image?", "./examples_bowie.jpg", 80],
        ]
gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
gr.Examples(
examples=examples,
inputs=caption_inputs,
)
#########
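# Queue requests so concurrent users don't contend for the shared ZeroGPU device.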
if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)