"""Gradio demo that scores an image/caption pair with the flax-community/medclip model."""

import contextlib
import functools
import os
import sys
import tempfile

import gradio as gr
import jax
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import AutoTokenizer
import torch
from torchvision.io import ImageReadMode, read_image

# The model snapshot also contains the Python sources imported below, so it has
# to be downloaded and put on ``sys.path`` before those imports can resolve.
LOCAL_PATH = snapshot_download("flax-community/medclip")
sys.path.append(LOCAL_PATH)

from src.modeling_medclip import FlaxMedCLIP  # noqa: E402
from run_medclip import Transform  # noqa: E402


@functools.lru_cache(maxsize=8)
def _scripted_transform(image_size):
    """Return the TorchScript-compiled preprocessing transform for *image_size*.

    Scripting is comparatively expensive and depends only on the image size,
    so the compiled transform is cached instead of being rebuilt per request.
    """
    return torch.jit.script(Transform(image_size))


def prepare_image(image_path, model):
    """Load the image at *image_path* and turn it into model-ready pixel values.

    Args:
        image_path: Path of an image file readable by ``torchvision.io.read_image``.
        model: Model whose ``config.vision_config.image_size`` selects the
            preprocessing size.

    Returns:
        A NumPy array holding a batch of one preprocessed image, with axis 1 of
        the stacked tensor moved to the last position (presumably
        channels-first -> channels-last; confirm against ``Transform``).
    """
    image = read_image(image_path, mode=ImageReadMode.RGB)
    preprocess = _scripted_transform(model.config.vision_config.image_size)
    preprocessed_image = preprocess(image)
    pixel_values = torch.stack([preprocessed_image]).permute(0, 2, 3, 1).numpy()
    return pixel_values


def prepare_text(text, tokenizer):
    """Tokenize *text* with *tokenizer*, returning NumPy tensors."""
    return tokenizer(text, return_tensors="np")


def save_file_to_disk(uplaoded_file):
    """Write the uploaded image array to a fresh temporary JPEG file.

    A unique file is created for every call so that concurrent requests cannot
    overwrite each other's upload. The caller owns the file and is responsible
    for deleting it.

    Args:
        uplaoded_file: Image data accepted by ``PIL.Image.fromarray``.

    Returns:
        The path of the written JPEG file.
    """
    im = Image.fromarray(uplaoded_file)
    fd, temp_file = tempfile.mkstemp(suffix=".jpeg")
    # Only the unique path is needed; PIL reopens the file itself when saving.
    os.close(fd)
    try:
        im.save(temp_file)
    except Exception:
        # Do not leave an empty or partial file behind when encoding fails.
        with contextlib.suppress(OSError):
            os.remove(temp_file)
        raise
    return temp_file


def load_tokenizer_and_model():
    """Load the SciBERT tokenizer and the MedCLIP model from the local snapshot.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
    model = FlaxMedCLIP.from_pretrained(LOCAL_PATH)
    return tokenizer, model


def run_inference(image_path, text, model, tokenizer):
    """Score how well *text* matches the image stored at *image_path*.

    Args:
        image_path: Path of the image file to score.
        text: Caption to compare against the image.
        model: Model returned by ``load_tokenizer_and_model``.
        tokenizer: Tokenizer returned by ``load_tokenizer_and_model``.

    Returns:
        The sigmoid of the first ``logits_per_image`` entry, as a JAX scalar.
    """
    pixel_values = prepare_image(image_path, model)
    input_text = prepare_text(text, tokenizer)
    model_output = model(
        input_text["input_ids"],
        pixel_values,
        attention_mask=input_text["attention_mask"],
        train=False,
        return_dict=True,
    )
    logits = model_output["logits_per_image"]
    # One image and one caption are passed in, so only entry [0][0] is used.
    score = jax.nn.sigmoid(logits)[0][0]
    return score


# Loaded once at start-up and shared by every request.
tokenizer, model = load_tokenizer_and_model()


def score_image_caption_pair(uploaded_file, text_input):
    """Gradio callback: score an uploaded image against a caption.

    Args:
        uploaded_file: Image array supplied by the Gradio image input.
        text_input: Caption typed by the user.

    Returns:
        A ``{"Score": float}`` mapping for the Gradio label output.
    """
    local_image_path = save_file_to_disk(uploaded_file)
    try:
        score = run_inference(local_image_path, text_input, model, tokenizer).tolist()
    finally:
        # Best-effort cleanup of the per-request temporary file.
        with contextlib.suppress(OSError):
            os.remove(local_image_path)
    return {"Score": score}


# NOTE(review): ``gr.inputs.Image``, ``allow_flagging=False`` and
# ``allow_screenshot`` look like an older Gradio API; confirm the pinned version.
image = gr.inputs.Image(shape=(299, 299))
iface = gr.Interface(
    fn=score_image_caption_pair,
    inputs=[image, "text"],
    outputs=["label"],
    allow_flagging=False,
    allow_screenshot=False,
    title="Your personal TA",
    description=(
        " The \n"
        "purpose of this demo is to help medical students measure their diagnostic "
        "capabilities in purely academic settings. Under no circumstances should it be "
        "used to make a self-diagnosis or confront a real doctor. "
    ),
)
iface.launch()