Error processing images: 'MolmoProcessor' object is not callable
I have an issue using the Molmo models allenai/Molmo-7B-D-0924 and allenai/MolmoE-1B-0924 for my ICL and ZSL tasks. They continuously throw the following error:

TypeError                                 Traceback (most recent call last)
<ipython-input> in <cell line: 0>()
     39
     40 # Step 7: Process the images using the processor
---> 41 inputs = processor(images=example_images, return_tensors="pt", padding=True)
     42 target_input = processor(images=target_image, return_tensors="pt")

TypeError: 'MolmoProcessor' object is not callable

Could you please help me with this error?
@PadmajaVaishnavi Can you post your entire code snippet so that I can look into this?
Code:
# Install required libraries
!pip install transformers huggingface_hub scikit-learn pillow
import logging
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Set up logging
logging.basicConfig(level=logging.INFO)

# Define the model name
model_name = "allenai/MolmoE-1B-0924"

# Download the model checkpoint
logging.info(f"Downloading {model_name} model checkpoint...")
model_path = snapshot_download(model_name)

# Load the processor
logging.info("Loading processor...")
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

# Select GPU if available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model on the selected device
logging.info("Initializing model...")
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)
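# Note (editorial): on a Colab GPU it may be safer to load the weights in half
# precision to avoid running out of memory, e.g. by also passing
# torch_dtype="auto" to AutoModelForCausalLM.from_pretrained(...), as the
# Molmo model card does. This is a suggestion, not part of the original code.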
# Upload example images
from google.colab import files

logging.info("Please upload example images (ensure paths match captions):")
uploaded = files.upload()

# List of example images and their captions
example_images = [
    ("58658.jpg", "The screen is a tutorial screen having a large text element located at the top part of the screen."),
    ("59376.jpg", "A search app with a large background image located at the center part."),
    ("14085.jpg", "The interface looks like a maps app having a large background image component placed at the center part.")
]

# Upload the target image
logging.info("Please upload the target image for evaluation:")
target_uploaded = files.upload()
target_image_path = list(target_uploaded.keys())[0]

# Load the target image
target_image = Image.open(target_image_path)
# Function to compute the similarity score between a generated caption and an example caption
def compute_similarity(generated_caption, example_caption):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # Vectorize both captions
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([generated_caption, example_caption])
    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]
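# Quick sanity check with hypothetical captions (editorial addition, not from
# the dataset): near-duplicate captions should score close to 1, captions with
# no shared words score 0.
# print(compute_similarity("a large red button", "a red button"))   # ~0.7
# print(compute_similarity("a login screen", "a weather forecast")) # 0.0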
# Loop through the example images and captions
for img_name, caption in example_images:
    # Load the image
    image = Image.open(img_name)

    # Example text prompt
    text_prompt = "Please describe the content of this image."

    # Process the image and text
    logging.info(f"Processing image: {img_name} with caption: {caption}...")
    inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(device)

    # Perform inference
    logging.info("Performing In-context Learning (ICL) inference...")
    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Decode the output
    generated_caption = processor.decode(outputs[0], skip_special_tokens=True)

    # Compute the similarity score between the generated caption and the example caption
    similarity_score = compute_similarity(generated_caption, caption)

    logging.info(f"Generated Caption: {generated_caption}")
    logging.info(f"Example Caption: {caption}")
    logging.info(f"Cosine Similarity Score: {similarity_score}")

# Now, process the target image similarly
logging.info("Processing target image for evaluation...")
inputs = processor(images=target_image, text="Please describe the content of this image.", return_tensors="pt").to(device)

# Perform inference for the target image
logging.info("Performing inference for target image...")
with torch.no_grad():
    target_outputs = model.generate(**inputs)

# Decode the output for the target image
target_generated_caption = processor.decode(target_outputs[0], skip_special_tokens=True)
logging.info(f"Generated Caption for Target Image: {target_generated_caption}")
ERROR:
TypeError                                 Traceback (most recent call last)
<ipython-input> in <cell line: 0>()
     74 # Process the image and text
     75 logging.info(f"Processing image: {img_name} with caption: {caption}...")
---> 76 inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(device)
     77
     78 # Perform inference

TypeError: 'MolmoProcessor' object is not callable
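For reference: the Molmo processors are custom remote-code classes that do not implement __call__, which is exactly why calling processor(...) raises this TypeError. The usage example on the Molmo model card goes through processor.process() and model.generate_from_batch() instead. Below is a minimal sketch of that pattern for a single image, reusing the processor, model, and device objects defined above (the image filename is one of the hypothetical examples from the code):

from transformers import GenerationConfig

# Molmo's processor exposes process() instead of __call__
inputs = processor.process(
    images=[Image.open("58658.jpg")],
    text="Please describe the content of this image."
)

# Move the inputs to the model's device and add a batch dimension
inputs = {k: v.to(device).unsqueeze(0) for k, v in inputs.items()}

# Generate with Molmo's generate_from_batch helper
with torch.no_grad():
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

# Decode only the newly generated tokens (everything after the prompt)
generated_tokens = output[0, inputs["input_ids"].size(1):]
generated_caption = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(generated_caption)

If this works for a single image, the ICL loop and the target-image step above can be rewritten around the same pattern; the TF-IDF similarity scoring is unaffected.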