import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
import spaces

# Dictionary of available models: name -> (checkpoint path, image size, model family)
MODELS = {
    "CLIP ViT-B/32": ("openai/clip-vit-base-patch32", 224, "clip"),
    "CLIP ViT-B/16": ("openai/clip-vit-base-patch16", 224, "clip"),
    "CLIP ViT-L/14": ("openai/clip-vit-large-patch14", 224, "clip"),
    "CLIP ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336, "clip"),
    "SigLIP SO400M/14-384": ("google/siglip-so400m-patch14-384", 384, "siglip"),
    "SigLIP Large/16-256": ("google/siglip-large-patch16-256", 256, "siglip"),
    "SigLIP SO400M/14-224": ("google/siglip-so400m-patch14-224", 224, "siglip"),
    "SigLIP Base/16-384": ("google/siglip-base-patch16-384", 384, "siglip"),
    "SigLIP Large/16-384": ("google/siglip-large-patch16-384", 384, "siglip"),
}

# Initialize models and processors
models = {}
processors = {}
for model_name, (model_path, _, model_type) in MODELS.items():
    if model_type == "clip":
        models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
        processors[model_name] = CLIPProcessor.from_pretrained(model_path)
    elif model_type == "siglip":
        models[model_name] = AutoModel.from_pretrained(model_path).to("cuda")
        processors[model_name] = AutoProcessor.from_pretrained(model_path)


@spaces.GPU
def calculate_score(image, text, model_name):
    labels = text.split(";")
    labels = [l.strip() for l in labels]
    labels = list(filter(None, labels))
    if len(labels) == 0:
        return dict()

    model = models[model_name]
    processor = processors[model_name]

    # Preprocess the image and text
    inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Calculate embeddings (both CLIP and SigLIP expose image_embeds / text_embeds)
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings
    image_embeds = F.normalize(image_embeds, p=2, dim=1)
    text_embeds = F.normalize(text_embeds, p=2, dim=1)

    # Calculate cosine similarity
    cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)

    # Ensure values are between 0 and 1
    cosine_similarities = torch.clamp(cosine_similarities, min=0, max=1)

    # Convert to numpy array
    similarities = cosine_similarities.cpu().numpy()

    results_dict = {label: float(score) for label, score in zip(labels, similarities)}
    return results_dict


with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP and SigLIP Score")
    gr.Markdown(
        "Calculate the score (cosine similarity) between the given image and text descriptions "
        "using different CLIP and SigLIP model variants."
    )

    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()

    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()), label="Model", value="CLIP ViT-B/16"
        )

    def process_inputs(image, text, model_name):
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)

    inputs = [image_input, text_input, model_dropdown]
    outputs = output_label

    image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
    text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
    model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; "
                "a cat is entering the matrix; a cat is entering the void",
                "CLIP ViT-B/16",
            ]
        ],
        fn=process_inputs,
        inputs=inputs,
        outputs=outputs,
    )

demo.launch()
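
# A minimal sketch of exercising the scoring function outside the Gradio UI,
# assuming the models above loaded onto a CUDA device and that a local image
# file such as "cat.jpg" exists (the same example image referenced above).
# Kept commented out so the script still only launches the demo when run.
#
# from PIL import Image
# scores = calculate_score(Image.open("cat.jpg"), "a cat sitting; a cat standing", "CLIP ViT-B/16")
# print(scores)  # e.g. {"a cat sitting": 0.27, "a cat standing": 0.21}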