import gradio as gr
import torch
from PIL import Image
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    AutoTokenizer,
    TextStreamer,
)

# Load model and processor once at module import time.
MODEL_NAME = "RPW/NIH-1.2_Llama-3.2-11B-Vision-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fix: Llama-3.2-Vision is a multimodal model, so it needs AutoProcessor
# (which handles both the PIL image and the text prompt), not AutoTokenizer —
# a plain tokenizer cannot encode the image passed in generate_caption(),
# and its apply_chat_template defaults to returning token ids, not a string.
# The name `tokenizer` is kept so downstream code is unaffected.
tokenizer = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME).to(device)

# Inference function
def generate_caption(image: Image.Image, instruction: str) -> str:
    """Generate a model caption for *image* following *instruction*.

    Args:
        image: Input radiograph as a PIL image.
        instruction: Text prompt describing the captioning task.

    Returns:
        The decoded output sequence (note: the full sequence is decoded,
        so the prompt text is included in the returned string).
    """
    # Single-turn chat: one image placeholder plus the instruction text.
    messages = [{"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}]
    # NOTE(review): assumes `tokenizer` is the multimodal processor, whose
    # apply_chat_template returns a prompt string by default — confirm.
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(device)

    # Stream generated tokens to stdout as they arrive (prompt suppressed).
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    output = model.generate(
        **inputs, streamer=text_streamer,
        max_new_tokens=128,
        use_cache=True,
        # Fix: temperature/min_p are no-ops under the default greedy decoding;
        # sampling must be enabled for them to take effect.
        do_sample=True, temperature=1.5, min_p=0.1
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Gradio interface
def gradio_interface(image):
    """Gradio callback: caption an uploaded radiograph with a fixed prompt."""
    prompt = "You are an expert radiographer. Describe accurately what you see in this image."
    return generate_caption(image, prompt)

# Create Gradio interface
# Create the Gradio UI: radiograph image in, generated caption text out.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(),
    # NOTE(review): live=True re-runs inference on every input change; for a
    # heavyweight vision model consider live=False with an explicit submit.
    live=True,
    title="Radiograph Image Captioning",
    description="Upload a radiograph image, and the model will generate a caption describing it.",
)

# Fix: guard the launch so importing this module does not start a server.
if __name__ == "__main__":
    interface.launch()