import os

import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import streamlit as st

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    # Standard ImageNet preprocessing: RGB conversion, bicubic resize,
    # tensor conversion, and normalization.
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    # Pick the tile grid (w, h) whose aspect ratio is closest to the image's;
    # on ties, prefer the larger grid when the image has enough pixels for it.
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # enumerate all candidate tile grids (i x j) with min_num <= i*j <= max_num
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1)
        for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image, then split it into image_size x image_size tiles
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


def prediction(model, image_file, question):
    # InternVL's chat format expects an <image> placeholder before the question
    question = f"<image>\n{question}"
    # set the max number of tiles in `max_num`
    pixel_values = load_image(image_file, max_num=12).to(torch.bfloat16)
    generation_config = dict(max_new_tokens=1024, do_sample=False)
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    return response


# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
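# A minimal sketch (not part of the original demo) of what the dynamic tiling
# produces, assuming a local file 'sample.jpg' exists (the filename is an
# assumption). A 896x448 landscape image maps to a 2x1 grid of 448x448 tiles
# plus a thumbnail, so load_image returns a [3, 3, 448, 448] tensor. It is
# commented out so the Streamlit app is unaffected; uncomment to try it:
#
# tiles = load_image('sample.jpg', input_size=448, max_num=12)
# print(tiles.shape)  # e.g. torch.Size([3, 3, 448, 448]) for 2 tiles + thumbnail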
path = 'Ramji/slake_vqa_internvl_demo'
intern_model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=False,
    trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Title of the Streamlit app
st.title("Image VQA")

# Step 1: Upload an image
st.header("Upload an Image")
uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

# Step 2: Input a question
st.header("Ask a Question")
question = st.text_input("Type your question here:")

# Step 3: Handle the uploaded image by saving it and reading its path
if uploaded_image is not None:
    # Make sure the directory exists, then write the image to a file
    os.makedirs("uploaded_images", exist_ok=True)
    image_path = os.path.join("uploaded_images", uploaded_image.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_image.getbuffer())

    # Read the image back from the saved file path and display it
    image = Image.open(image_path)
    st.image(image, caption="Uploaded Image", use_column_width=True)
    st.write(f"Image saved at: {image_path}")

# Step 4: Display the typed question
if question:
    st.write(f"Your question: **{question}**")

# Step 5: Run the InternVL model on the image and question and show the answer
if uploaded_image and question:
    st.write("Processing the image and question...")
    output = prediction(intern_model, image_path, question)
    st.write(f"Model output: {output}")
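# To launch the demo locally with the dependencies above installed
# (the filename 'app.py' is an assumption; substitute whatever this
# script is saved as):
#
#   streamlit run app.py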