import asyncio

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


class ImageToText:
    """
    Class to handle Image-to-Text captioning using BLIP.
    """

    def __init__(self):
        # Initialize the processor and model
        print("Loading Image-to-Text model...")
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
        print("Image-to-Text model loaded successfully.")

    async def generate_caption(self, image):
        """
        Generate a descriptive caption for an uploaded image.

        Args:
            image (PIL.Image): The image to caption.

        Returns:
            str: The generated caption.
        """
        # Convert to RGB so grayscale/RGBA uploads don't break the processor,
        # then preprocess the image into model-ready tensors
        inputs = self.processor(image.convert("RGB"), return_tensors="pt")
        # Generation is CPU/GPU-bound and blocking, so run it in a worker
        # thread to avoid stalling the event loop of an async server
        out = await asyncio.to_thread(self.model.generate, **inputs)
        # Decode the generated token IDs into a human-readable string
        caption = self.processor.decode(out[0], skip_special_tokens=True)
        return caption
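
Since generate_caption is a coroutine, it has to be awaited from an event loop. A minimal sketch of how the class could be exercised standalone is shown below; the file name example.jpg is a placeholder, not part of the original code, and the first run will download the BLIP weights from the Hugging Face Hub.

import asyncio

from PIL import Image


async def main():
    # Loading the large BLIP checkpoint can take a while on first use
    captioner = ImageToText()
    image = Image.open("example.jpg")  # hypothetical local image path
    caption = await captioner.generate_caption(image)
    print(f"Caption: {caption}")


if __name__ == "__main__":
    asyncio.run(main())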