import asyncio

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


class ImageToText:
    """
    Class to handle Image-to-Text captioning using BLIP.
    """

    def __init__(self):
        # Initialize the processor and model
        print("Loading Image-to-Text model...")
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
        print("Image-to-Text model loaded successfully.")

    async def generate_caption(self, image):
        """
        Generate a descriptive caption for an uploaded image.

        Args:
            image (PIL.Image): The image to caption.

        Returns:
            str: The generated caption.
        """
        # Convert to RGB so grayscale/RGBA uploads don't break the processor,
        # then preprocess the image into model-ready tensors
        inputs = self.processor(image.convert("RGB"), return_tensors="pt")
        # Generation is CPU/GPU-bound and blocking, so run it in a worker
        # thread to avoid stalling the event loop of an async server
        out = await asyncio.to_thread(self.model.generate, **inputs)
        # Decode the generated token IDs into a human-readable string
        caption = self.processor.decode(out[0], skip_special_tokens=True)
        return caption
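
Since generate_caption is a coroutine, it has to be awaited from an event loop. A minimal sketch of how the class could be exercised standalone is shown below; the file name example.jpg is a placeholder, not part of the original code, and the first run will download the BLIP weights from the Hugging Face Hub.

import asyncio

from PIL import Image


async def main():
    # Loading the large BLIP checkpoint can take a while on first use
    captioner = ImageToText()
    image = Image.open("example.jpg")  # hypothetical local image path
    caption = await captioner.generate_caption(image)
    print(f"Caption: {caption}")


if __name__ == "__main__":
    asyncio.run(main())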