import os

os.system('pip install ./transformers-4.47.0.dev0-py3-none-any.whl')

import gradio as gr
import PIL.Image
import transformers
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
import torch
import string
import functools
import re
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
import spaces


adapter_id = "merve/paligemma2-3b-vqav2"
model_id = "gv-hf/paligemma2-3b-pt-448"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id).eval().to(device)
processor = PaliGemmaProcessor.from_pretrained(model_id)

###### Transformers Inference
@spaces.GPU
def infer(
    text,
    image: PIL.Image.Image,
    max_new_tokens: int
) -> str:
    text = "answer en " + text
    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    with torch.inference_mode():
      generated_ids = model.generate(
          **inputs,
          max_new_tokens=max_new_tokens,
          do_sample=False
      )
    result = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return result[0][len(text):].lstrip("\n")


######## Demo

INTRO_TEXT = """## PaliGemma 2 demo\n\n
| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) 
| [Blogpost](https://huggingface.co/blog/paligemma) 
| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb) 
|\n\n
PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and 
built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343) 
vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile 
model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question 
answering, text reading, object detection and object segmentation.
\n\n
This space includes a model LoRA fine-tuned by the team at Hugging Face on VQAv2, inferred using transformers.
See the [Blogpost](https://huggingface.co/blog/paligemma2), the project  
[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
\n\n
**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
"""


with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)
    with gr.Column():
        question = gr.Text(label="Question")
        image = gr.Image(label="Input Image", type="pil", height=500)
        caption_btn = gr.Button(value="Submit")
        text_output = gr.Text(label="Text Output")
        
        tokens = gr.Slider(
            label="Max New Tokens",
            info="Set to larger for longer generation.",
            minimum=20,
            maximum=160,
            value=80,
            step=10,
        )
    
    caption_inputs = [
        question,
        image,
        tokens
        ]
    caption_outputs = [
        text_output
    ]
    caption_btn.click(
        fn=infer,
        inputs=caption_inputs,
        outputs=caption_outputs,
    )
    
    
    examples = [
        ["What is the graphic about?", "./howto.jpg", 60],
        ["What is the password", "./password.jpg", 20],
        ["Who is in this image?", "./examples_bowie.jpg", 80],
        ]
    gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
    
    gr.Examples(
        examples=examples,
        inputs=caption_inputs,
    )
#########

if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)