Commit 9aecc37 · ighoshsubho committed · 0 parent(s)

Florence sam flux first commit
Files changed:
- .gitignore +3 -0
- README.md +12 -0
- app.py +121 -0
- requirements.txt +13 -0
- utils/florence.py +58 -0
- utils/sam.py +45 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
/venv
/.idea
/tmp
README.md
ADDED
@@ -0,0 +1,12 @@
---
title: Florence2 + SAM2 + FLUX
emoji: 🔥
colorFrom: purple
colorTo: green
sdk: gradio
sdk_version: 4.40.0
app_file: app.py
pinned: false
license: apache-2.0
---
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,121 @@
import torch
import numpy as np
from PIL import Image
from diffusers import FluxInpaintPipeline
from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, run_sam_inference
import gradio as gr
import supervision as sv

# Load models
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
FLUX_PIPE = FluxInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to(
    DEVICE)
FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_MODEL = load_sam_image_model(device=DEVICE)

COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
LABEL_ANNOTATOR = sv.LabelAnnotator(
    color=COLOR_PALETTE,
    color_lookup=sv.ColorLookup.INDEX,
    text_position=sv.Position.CENTER_OF_MASS,
    text_color=sv.Color.from_hex("#000000"),
    border_radius=5
)
MASK_ANNOTATOR = sv.MaskAnnotator(
    color=COLOR_PALETTE,
    color_lookup=sv.ColorLookup.INDEX
)


def visualize_detections(image, detections):
    output_image = image.copy()
    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
    output_image = BOX_ANNOTATOR.annotate(output_image, detections)
    output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
    return output_image


def detect_objects(image, text_prompt):
    # Use Florence for object detection
    _, result = run_florence_inference(
        model=FLORENCE_MODEL,
        processor=FLORENCE_PROCESSOR,
        device=DEVICE,
        image=image,
        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
        text=text_prompt
    )
    detections = sv.Detections.from_lmm(
        lmm=sv.LMM.FLORENCE_2,
        result=result,
        resolution_wh=image.size
    )

    # Use SAM to refine masks
    detections = run_sam_inference(SAM_MODEL, image, detections)
    return detections


def inpaint_selected_objects(image, detections, selected_indices, inpaint_prompt):
    mask = np.zeros(image.size[::-1], dtype=np.uint8)
    for idx in selected_indices:
        mask |= detections.mask[idx]

    mask_image = Image.fromarray(mask * 255)

    result = FLUX_PIPE(
        prompt=inpaint_prompt,
        image=image,
        mask_image=mask_image,
        num_inference_steps=30,
        strength=0.85,
    ).images[0]

    return result


def process_image(input_image, detection_prompt, inpaint_prompt, selected_objects):
    detections = detect_objects(input_image, detection_prompt)

    # Visualize detected objects
    detected_image = visualize_detections(input_image, detections)

    if selected_objects:
        selected_indices = [int(idx) for idx in selected_objects.split(',')]
        inpainted_image = inpaint_selected_objects(input_image, detections, selected_indices, inpaint_prompt)
        return detected_image, inpainted_image
    else:
        return detected_image, None


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Object Detection and Inpainting with FLUX, Florence, and SAM")
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image")
            detection_prompt = gr.Textbox(label="Detection Prompt", placeholder="Enter objects to detect")
            detect_button = gr.Button("Detect Objects")
        with gr.Column():
            detected_image = gr.Image(type="pil", label="Detected Objects")
            selected_objects = gr.Textbox(label="Selected Objects",
                                          placeholder="Enter indices of objects to inpaint (comma-separated)")
            inpaint_prompt = gr.Textbox(label="Inpainting Prompt", placeholder="Describe what to inpaint")
            inpaint_button = gr.Button("Inpaint Selected Objects")
            output_image = gr.Image(type="pil", label="Inpainted Result")

    detect_button.click(
        fn=lambda img, prompt: process_image(img, prompt, "", "")[0],
        inputs=[input_image, detection_prompt],
        outputs=detected_image
    )
    inpaint_button.click(
        fn=process_image,
        inputs=[input_image, detection_prompt, inpaint_prompt, selected_objects],
        outputs=[detected_image, output_image]
    )

demo.launch(debug=False, show_error=True)
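The glue between detection and generation in app.py is the mask union in inpaint_selected_objects: the boolean SAM masks of the selected indices are ORed into a single binary mask, which FLUX then treats as the region to repaint. Below is a minimal sketch of just that step, using made-up 4x4 toy masks in place of real detections.mask output (not part of the commit):

import numpy as np
from PIL import Image

h, w = 4, 4
object_masks = np.zeros((2, h, w), dtype=bool)  # stand-in for detections.mask
object_masks[0, 0:2, 0:2] = True                # toy object 0
object_masks[1, 2:4, 2:4] = True                # toy object 1

selected_indices = [0, 1]
mask = np.zeros((h, w), dtype=np.uint8)
for idx in selected_indices:
    mask |= object_masks[idx]                   # OR each boolean mask into the union

mask_image = Image.fromarray(mask * 255)        # white pixels = region FLUX repaints
print(mask)                                     # union of both toy masks
print(mask_image.mode, mask_image.size)         # L (4, 4)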
requirements.txt
ADDED
@@ -0,0 +1,13 @@
tqdm
einops
spaces
timm
transformers
samv2
gradio
supervision
opencv-python
pytest
torch
numpy
diffusers
utils/florence.py
ADDED
@@ -0,0 +1,58 @@
import os
from typing import Union, Any, Tuple, Dict
from unittest.mock import patch

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
from transformers.dynamic_module_utils import get_imports

FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'


def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
    """Workaround for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
    if not str(filename).endswith("/modeling_florence2.py"):
        return get_imports(filename)
    imports = get_imports(filename)
    imports.remove("flash_attn")
    return imports


def load_florence_model(
    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
) -> Tuple[Any, Any]:
    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint, trust_remote_code=True).to(device).eval()
        processor = AutoProcessor.from_pretrained(
            checkpoint, trust_remote_code=True)
        return model, processor


def run_florence_inference(
    model: Any,
    processor: Any,
    device: torch.device,
    image: Image,
    task: str,
    text: str = ""
) -> Tuple[str, Dict]:
    prompt = task + text
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=task, image_size=image.size)
    return generated_text, response
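run_florence_inference concatenates the task token and the free-text prompt into a single prompt string, generates with beam search, and returns both the raw decoded text and the processor's post-processed result dict. A stand-alone usage sketch; the image path and detection text below are placeholders, not part of the commit:

import torch
from PIL import Image

from utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, processor = load_florence_model(device=device)

image = Image.open("example.jpg")  # placeholder path
_, result = run_florence_inference(
    model=model,
    processor=processor,
    device=device,
    image=image,
    task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    text="a dog",  # placeholder detection text
)
# result is keyed by the task token, e.g. result["<OPEN_VOCABULARY_DETECTION>"]
# with the detected boxes and labels.
print(result)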
utils/sam.py
ADDED
@@ -0,0 +1,45 @@
from typing import Any

import numpy as np
import supervision as sv
import torch
from PIL import Image
from sam2.build_sam import build_sam2, build_sam2_video_predictor
from sam2.sam2_image_predictor import SAM2ImagePredictor

SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
SAM_CONFIG = "sam2_hiera_s.yaml"


def load_sam_image_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> SAM2ImagePredictor:
    model = build_sam2(config, checkpoint, device=device)
    return SAM2ImagePredictor(sam_model=model)


def load_sam_video_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> Any:
    return build_sam2_video_predictor(config, checkpoint, device=device)


def run_sam_inference(
    model: Any,
    image: Image,
    detections: sv.Detections
) -> sv.Detections:
    image = np.array(image.convert("RGB"))
    model.set_image(image)
    mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)

    # dirty fix; remove this later
    if len(mask.shape) == 4:
        mask = np.squeeze(mask)

    detections.mask = mask.astype(bool)
    return detections
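run_sam_inference prompts SAM 2 with the Florence boxes in detections.xyxy and overwrites detections.mask with one boolean mask per box. A stand-alone usage sketch; the checkpoint download, image path, and box coordinates below are assumptions, not part of the commit:

import numpy as np
import supervision as sv
import torch
from PIL import Image

from utils.sam import load_sam_image_model, run_sam_inference

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_model = load_sam_image_model(device=device)  # expects the checkpoint at SAM_CHECKPOINT

image = Image.open("example.jpg")  # placeholder path
# In the app these boxes come from Florence-2; here they are made up.
detections = sv.Detections(
    xyxy=np.array([[10.0, 10.0, 200.0, 200.0]]),
    class_id=np.array([0]),
)

detections = run_sam_inference(sam_model, image, detections)
print(detections.mask.shape)  # (1, H, W): one boolean mask per input box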