import gradio as gr
from PIL import Image
import moondream as md
from huggingface_hub import hf_hub_download

# Download model at runtime
model_path = hf_hub_download(
    repo_id="andito/moondream05",
    filename="moondream-0_5b-int8.mf",
)

model = md.vl(model=model_path)

def model_inference(input_dict, history):
    """Caption the uploaded image, or answer a question about it if one is given."""
    # Extract the image from the message, if present
    if input_dict.get("files"):
        image_path = input_dict["files"][0]
        # Gradio may hand files over as plain paths or as {"path": ...} dicts
        if isinstance(image_path, dict) and "path" in image_path:
            image_path = image_path["path"]
        image = Image.open(image_path)
        encoded_image = model.encode_image(image)

        # If there's a question, use query
        text = input_dict.get("text", "")
        if text not in ["", "Caption"]:
            response = model.query(encoded_image, text)["answer"]
        # Otherwise generate a caption
        else:
            response = model.caption(encoded_image)["caption"]

        return response
    else:
        return "Please provide an image to analyze."

examples = [
    [{"text": "Caption", "files": ["example_images/demo-1.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/demo-2.jpg"]}, []],
    [{"text": "What art era does this artwork belong to?", "files": ["example_images/rococo.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/rococo.jpg"]}, []],
    [{"text": "I'm planning a visit to this temple, give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/examples_wat_arun.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/aaron.jpeg"]}, []],
]

demo = gr.ChatInterface(
    fn=model_inference,
    title="Moondream 0.5B: The World's Smallest Vision-Language Model",
    description="Play with [Moondream 0.5B](https://huggingface.co/vikhyatk/moondream2) in this demo. To get started, upload an image and enter a prompt, or try one of the examples.",
    examples=examples,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="single"),
    stop_btn="Stop Generation",
    multimodal=True,
    additional_inputs=[],
    cache_examples=False,
)

demo.launch(debug=True)