Spaces:

sitammeur
/

paligemma2-docci

Running on Zero

sitammeur committed on Dec 6, 2024

Commit

e05119b

verified ·

1 Parent(s): df22269

Update src/app/response.py

Files changed (1) hide show

src/app/response.py CHANGED Viewed

@@ -17,13 +17,14 @@ model, processor = load_model_and_processor(model_name, device)
 @spaces.GPU
-def caption_image(image: PIL.Image.Image, max_new_tokens: int) -> str:
     """
     Generates a caption based on the given image using the model.
     Args:
         - image (PIL.Image.Image): The input image to be processed.
         - max_new_tokens (int): The maximum number of new tokens to generate.
     Returns:
         str: The generated caption text.
@@ -35,22 +36,26 @@ def caption_image(image: PIL.Image.Image, max_new_tokens: int) -> str:
         # Prepare the inputs
         prompt = "caption en"
-        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
         # Generate the response
         with torch.inference_mode():
-            generated_ids = model.generate(
-                **inputs, max_new_tokens=max_new_tokens, do_sample=sampling
             )
-        # Decode the generated response
-        result = processor.batch_decode(generated_ids, skip_special_tokens=True)
         # Log the successful generation of the caption
         logging.info("Caption generated successfully.")
         # Return the generated caption
-        return result[0][len(prompt) :].lstrip("\n")
     # Handle exceptions that may occur during caption generation
     except Exception as e:

 @spaces.GPU
+def caption_image(image: PIL.Image.Image, max_new_tokens: int, sampling: bool) -> str:
     """
     Generates a caption based on the given image using the model.
     Args:
         - image (PIL.Image.Image): The input image to be processed.
         - max_new_tokens (int): The maximum number of new tokens to generate.
+        - sampling (bool): Whether to use sampling or not.
     Returns:
         str: The generated caption text.
         # Prepare the inputs
         prompt = "caption en"
+        model_inputs = (
+            processor(text=prompt, images=image, return_tensors="pt")
+            .to(torch.bfloat16)
+            .to(device)
+        )
+        input_len = model_inputs["input_ids"].shape[-1]
         # Generate the response
         with torch.inference_mode():
+            generation = model.generate(
+                **model_inputs, max_new_tokens=max_new_tokens, do_sample=sampling
             )
+            generation = generation[0][input_len:]
+            decoded = processor.decode(generation, skip_special_tokens=True)
         # Log the successful generation of the caption
         logging.info("Caption generated successfully.")
         # Return the generated caption
+        return decoded
     # Handle exceptions that may occur during caption generation
     except Exception as e: