import gradio as gr
from PIL import Image
import moondream as md
from huggingface_hub import hf_hub_download

# Download model at runtime
model_path = hf_hub_download(
    repo_id="andito/moondream05",
    filename="moondream-0_5b-int8.mf",
)

model = md.vl(model=model_path)

def model_inference(input_dict, history):
    """Caption the uploaded image, or answer a question about it if one is given."""
    # Extract the image from the message, if present
    if input_dict.get("files"):
        image_path = input_dict["files"][0]
        # Gradio may hand files over as plain paths or as {"path": ...} dicts
        if isinstance(image_path, dict) and "path" in image_path:
            image_path = image_path["path"]
        image = Image.open(image_path)
        encoded_image = model.encode_image(image)

        # If there's a question, use query
        text = input_dict.get("text", "")
        if text not in ["", "Caption"]:
            response = model.query(encoded_image, text)["answer"]
        # Otherwise generate a caption
        else:
            response = model.caption(encoded_image)["caption"]

        return response
    else:
        return "Please provide an image to analyze."

examples = [
    [{"text": "Caption", "files": ["example_images/demo-1.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/demo-2.jpg"]}, []],
    [{"text": "What art era does this artwork belong to?", "files": ["example_images/rococo.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/rococo.jpg"]}, []],
    [{"text": "I'm planning a visit to this temple, give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/examples_wat_arun.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/aaron.jpeg"]}, []],
]

demo = gr.ChatInterface(
    fn=model_inference,
    title="Moondream 0.5B: The World's Smallest Vision-Language Model",
    description="Play with [Moondream 0.5B](https://huggingface.co/vikhyatk/moondream2) in this demo. To get started, upload an image and enter a prompt, or try one of the examples.",
    examples=examples,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="single"),
    stop_btn="Stop Generation",
    multimodal=True,
    additional_inputs=[],
    cache_examples=False,
)

demo.launch(debug=True)