# Install the transformers library from the bundled development wheel (4.47.0.dev0)
import os

os.system("pip install ./transformers-4.47.0.dev0-py3-none-any.whl")

# Importing the requirements
import warnings

warnings.filterwarnings("ignore")

import gradio as gr

from src.app.response import caption_image

# Image, text query, and input parameters
image = gr.Image(type="pil", label="Image")
# text = gr.Textbox(label="Question", placeholder="Enter your question here")
max_new_tokens = gr.Slider(
    minimum=20, maximum=160, step=10, value=80, label="Max Tokens"
)

# Output for the interface
answer = gr.Textbox(label="Predicted answer", show_label=True, show_copy_button=True)

# Examples for the interface
examples = [
    ["images/cat.jpg", 80],
    ["images/dog.jpg", 80],
    ["images/bird.jpg", 160],
]

# Title, description, and article for the interface
title = "Visual Question Answering"
description = (
    "Gradio demo for the PaliGemma 2 vision-language understanding and generation model. "
    "The model can answer questions about images in natural language. "
    "To use it, upload an image, adjust the parameters or keep the default values, and "
    "click 'Submit', or click one of the examples to load them. "
    "You can read more at the links below."
)
article = ""

# Build and launch the interface
interface = gr.Interface(
    fn=caption_image,
    inputs=[image, max_new_tokens],
    outputs=answer,
    examples=examples,
    cache_examples=True,
    cache_mode="lazy",
    title=title,
    description=description,
    article=article,
    theme="Nymbo/Nymbo_Theme",
    flagging_mode="never",
)
interface.launch(debug=False)