"""Minimal Gradio demo for the ByteDance Sa2VA-4B vision-language model.

The script loads the model and tokenizer once at import time, then serves a
small web UI in which the user uploads an image, types an instruction and
receives the model's text answer.
"""
import os

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
model_path = "ByteDance/Sa2VA-4B"

# NOTE(review): `device_map="auto"` already places the weights on the
# available device(s); the trailing `.cuda()` is kept from the original
# script -- confirm it is still needed with the installed accelerate version.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
).eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)


def image_vision(image_input_path, prompt):
    """Run the model on one image and one text prompt.

    Args:
        image_input_path: Path of the image file to analyse.
        prompt: Instruction or question about the image.

    Returns:
        The value stored under ``"prediction"`` in the dict returned by
        ``model.predict_forward`` (the answer in text form).
    """
    # Open inside a context manager so the file handle is released;
    # `convert` returns a fully loaded, independent copy of the pixels.
    with Image.open(image_input_path) as raw_image:
        image = raw_image.convert('RGB')

    input_dict = {
        'image': image,
        'text': str(prompt),
        'past_text': '',
        'mask_prompts': None,
        'tokenizer': tokenizer,
    }

    return_dict = model.predict_forward(**input_dict)
    answer = return_dict["prediction"]  # the text format answer
    print(answer)
    # The original only printed the answer, so the UI never received it.
    return answer


def main_infer(image_input_path, prompt):
    """Gradio callback: validate the inputs and return the model's answer.

    Args:
        image_input_path: File path supplied by the image component, or
            ``None`` when no image has been uploaded.
        prompt: Text typed into the instruction box.

    Returns:
        The text answer produced by ``image_vision``.

    Raises:
        gr.Error: If no image was provided.
    """
    if image_input_path is None:
        # Surface a readable message in the UI instead of an opaque failure
        # from trying to open a missing file.
        raise gr.Error("Please provide an image.")
    return image_vision(image_input_path, prompt)


# Gradio UI
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("# Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos")
        with gr.Row():
            with gr.Column():
                # type="filepath" makes Gradio hand the callback a path on
                # disk, which is what `image_vision` opens with PIL (the
                # default would pass a numpy array instead).
                image_input = gr.Image(label="Image IN", type="filepath")
                with gr.Row():
                    instruction = gr.Textbox(label="Instruction")
                    submit_btn = gr.Button("SUbmit", scale=1)
            with gr.Column():
                output_res = gr.Textbox(label="Response")

    submit_btn.click(
        fn=main_infer,
        inputs=[image_input, instruction],
        outputs=[output_res],
    )

demo.queue().launch(show_api=False, show_error=True)