import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor, TextStreamer

# Load model and processor (vision-language models need a processor, which
# wraps both the tokenizer and the image preprocessor, not a bare tokenizer)
MODEL_NAME = "RPW/NIH-1.2_Llama-3.2-11B-Vision-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
).to(device)


# Inference function
def generate_caption(image: Image.Image, instruction: str) -> str:
    # Build a chat-style prompt: an image placeholder followed by the instruction
    messages = [{"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction},
    ]}]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)

    # Stream tokens to the console as they are generated; do_sample=True is
    # required for temperature/min_p to actually take effect
    text_streamer = TextStreamer(processor.tokenizer, skip_prompt=True)
    output = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=128,
        use_cache=True,
        do_sample=True,
        temperature=1.5,
        min_p=0.1,
    )
    # Decode only the newly generated tokens so the prompt is not echoed back
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    return processor.decode(generated_tokens, skip_special_tokens=True)


# Gradio interface
def gradio_interface(image: Image.Image) -> str:
    if image is None:  # live=True can fire with no image (e.g. when cleared)
        return ""
    instruction = (
        "You are an expert radiographer. "
        "Describe accurately what you see in this image."
    )
    return generate_caption(image, instruction)


# Create Gradio interface
interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(),
    live=True,
    title="Radiograph Image Captioning",
    description="Upload a radiograph image, and the model will generate a caption describing it.",
)

# Launch the Gradio app
interface.launch()