"""Gradio demo: composed (image + text) retrieval over precomputed CLIP features."""

import os
import pickle
from typing import Optional

import gradio as gr
import numpy as np
import open_clip
import torch
from PIL import Image

MODEL_NAME = 'ViT-L-14'
FEATURES_FILE = 'features/patternnet_clip.pkl'
TOP_K = 5

# Load pre-trained model.
# create_model_and_transforms returns (model, train_transform, eval_transform);
# the third value is an image transform, NOT a tokenizer, so the tokenizer is
# fetched separately.
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained='openai'
)
tokenizer = open_clip.get_tokenizer(MODEL_NAME)
model.eval()  # inference only


def load_features(pickle_file):
    """Load and return the object stored in ``pickle_file``.

    The rest of this script reads the keys ``'feats'`` and ``'paths'`` from
    the returned object.

    SECURITY: unpickling executes arbitrary code embedded in the file. Only
    load feature files that you produced yourself or otherwise trust.
    """
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)


def _unit_rows(x):
    """Return ``x`` as float32 with every row scaled to unit L2 norm."""
    return torch.nn.functional.normalize(x.float(), dim=-1)


def calculate_similarity(
    image_features,
    text_feature,
    lambda_val=0.5,
    query_image_feature: Optional[torch.Tensor] = None,
):
    """Score every gallery embedding against the query.

    Args:
        image_features: Gallery embeddings, one row per image.
            Assumed to be (N, D) -- TODO confirm against the feature file.
        text_feature: Embedding of the text query, shape (1, D).
        lambda_val: Weight of the text term; the image term gets
            ``1 - lambda_val``.
        query_image_feature: Embedding of the query image, shape (1, D).
            When omitted, only the text term is returned.

    Returns:
        An (N, 1) tensor of cosine-similarity scores (higher is better).
    """
    # Normalise so both terms are cosine similarities on the same scale;
    # otherwise lambda_val would not be a meaningful mixing weight.
    gallery = _unit_rows(image_features)
    text_similarities = gallery @ _unit_rows(text_feature).T
    if query_image_feature is None:
        return text_similarities
    image_similarities = gallery @ _unit_rows(query_image_feature).T
    return (1 - lambda_val) * image_similarities + lambda_val * text_similarities


# Load precomputed features (kept on the CPU, like the model).
features = load_features(FEATURES_FILE)
image_features = torch.as_tensor(features['feats']).float()
image_paths = features['paths']


def image_text_retrieval(image, text, lambda_val, top_k=TOP_K):
    """Return the ``top_k`` gallery images best matching the composed query.

    Args:
        image: Query image as a PIL image, or ``None`` if none was uploaded.
        text: Text query; ``None`` or blank means "no text".
        lambda_val: Text weight in [0, 1] (see ``calculate_similarity``).
        top_k: Maximum number of images to return.

    Returns:
        A list of PIL images ordered from best to worst match.

    Raises:
        gr.Error: If neither an image nor a text query was provided.
    """
    has_text = bool(text and text.strip())
    if image is None and not has_text:
        raise gr.Error("Please provide a query image, a text query, or both.")

    # Encode whichever modalities are present; no gradients are needed.
    with torch.no_grad():
        image_feature = None
        if image is not None:
            image_feature = model.encode_image(preprocess(image).unsqueeze(0))
        text_feature = None
        if has_text:
            # The tokenizer already returns a batched (1, context) tensor
            # for a one-element list, so no extra unsqueeze is required.
            text_feature = model.encode_text(tokenizer([text]))

    if text_feature is None:
        # Image-only query: score the gallery against the image embedding.
        similarities = calculate_similarity(image_features, image_feature)
    else:
        similarities = calculate_similarity(
            image_features,
            text_feature,
            lambda_val,
            query_image_feature=image_feature,
        )

    # Scores are (N, 1): flatten so top-k ranks across gallery items, and
    # never request more results than the gallery holds.
    scores = similarities.flatten()
    k = min(int(top_k), scores.numel())
    top_indices = scores.topk(k).indices.tolist()

    # Copy each image into memory so the underlying file handle is released.
    top_images = []
    for idx in top_indices:
        with Image.open(image_paths[idx]) as img:
            top_images.append(img.copy())
    return top_images


# Create Gradio interface
def demo(image, text, lambda_val):
    """Gradio callback; delegates to ``image_text_retrieval``."""
    return image_text_retrieval(image, text, lambda_val)


iface = gr.Interface(
    fn=demo,
    inputs=[
        gr.Image(type="pil", label="Query Image"),
        gr.Textbox(lines=2, placeholder="Enter text query...", label="Text Query"),
        gr.Slider(minimum=0, maximum=1, value=0.5, label="Lambda Value (Image-Text Weight)"),
    ],
    outputs=gr.Gallery(label="Retrieved Images"),
    title="Composed Image Retrieval for Remote Sensing",
    description=(
        "Upload a query image, enter a text query, and adjust the lambda "
        "value to retrieve images based on both image and text inputs."
    ),
)

if __name__ == "__main__":
    iface.launch()