Spaces:

jiuface
/

flux-controlnet-inpainting

Running on Zero

App Files Files Community

jiuface committed on Sep 11, 2024

Commit

f19c22c

1 Parent(s): f1a7906

bugfix

Browse files

Files changed (16) hide show

app.py +32 -33
checkpoints/sam2_hiera_base_plus.pt +0 -3
checkpoints/sam2_hiera_large.pt +0 -3
checkpoints/sam2_hiera_small.pt +0 -3
checkpoints/sam2_hiera_tiny.pt +0 -3
configs/__init__.py +0 -0
configs/sam2_hiera_b+.yaml +0 -113
configs/sam2_hiera_l.yaml +0 -117
configs/sam2_hiera_s.yaml +0 -116
configs/sam2_hiera_t.yaml +0 -118
utils/__init__.py +0 -0
utils/__pycache__/__init__.cpython-310.pyc +0 -0
utils/__pycache__/florence.cpython-310.pyc +0 -0
utils/__pycache__/sam.cpython-310.pyc +0 -0
utils/florence.py +0 -54
utils/sam.py +0 -50

app.py CHANGED Viewed

@@ -21,13 +21,6 @@ from datetime import datetime
 from diffusers.utils import load_image
 import json
-from utils.florence import load_florence_model, run_florence_inference, \
-    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
-from utils.sam import load_sam_image_model, run_sam_inference
-import supervision as sv
 HF_TOKEN = os.environ.get("HF_TOKEN")
 login(token=HF_TOKEN)
@@ -44,9 +37,6 @@ taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).
 good_vae = AutoencoderKL.from_pretrained(base_model, subfolder="vae", torch_dtype=dtype).to(device)
 pipe = FluxInpaintPipeline.from_pretrained(base_model, torch_dtype=dtype, vae=taef1).to(device)
-# FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=device)
-# SAM_IMAGE_MODEL = load_sam_image_model(device=device)
 class calculateDuration:
     def __init__(self, activity_name=""):
@@ -148,6 +138,7 @@ def run_flux(
     strength_slider: float,
     num_inference_steps_slider: int,
     resolution_wh: Tuple[int, int],
 ) -> Image.Image:
     print("Running FLUX...")
     if lora_path and lora_weights:
@@ -194,18 +185,19 @@ def process(
     account_id: str,
     access_key: str,
     secret_key: str,
-    bucket:str
 ):
     result = {"status": "false", "message": ""}
     if not image_url:
         gr.Info("please enter image url for inpaiting")
         result["message"] = "invalid image url"
-        return json.dumps(result)
     if not inpainting_prompt_text:
         gr.Info("Please enter inpainting text prompt.")
         result["message"] = "invalid inpainting prompt"
-        return json.dumps(result)
     with calculateDuration("load image"):
@@ -215,7 +207,7 @@ def process(
     if not image or not mask:
         gr.Info("Please upload an image & mask by url.")
         result["message"] = "can not load image"
-        return json.dumps(result)
     # generate
     width, height = calculate_image_dimensions_for_flux(original_resolution_wh=image.size)
@@ -223,32 +215,37 @@ def process(
     mask = mask.resize((width, height), Image.LANCZOS)
     mask = process_mask(mask, mask_inflation=mask_inflation_slider, mask_blur=mask_blur_slider)
-    image = run_flux(
-        image=image,
-        mask=mask,
-        prompt=inpainting_prompt_text,
-        lora_path=lora_path,
-        lora_scale=lora_scale,
-        lora_weights=lora_weights,
-        seed_slicer=seed_slicer,
-        randomize_seed_checkbox=randomize_seed_checkbox,
-        strength_slider=strength_slider,
-        num_inference_steps_slider=num_inference_steps_slider,
-        resolution_wh=(width, height)
-    )
     if upload_to_r2:
         with calculateDuration("upload image"):
-            url = upload_image_to_r2(image, account_id, access_key, secret_key, bucket)
-            result = {"status": "success", "url": url}
     else:
         result = {"status": "success", "message": "Image generated but not uploaded"}
-    return json.dumps(result)
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
@@ -257,7 +254,7 @@ with gr.Blocks() as demo:
                     show_label=True,
                     max_lines=1,
                     placeholder="Enter image url for inpainting",
-                    container=False,
                 )
             mask_url = gr.Text(
@@ -371,6 +368,7 @@ with gr.Blocks() as demo:
                     secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here")
         with gr.Column():
             output_json_component = gr.Code(label="JSON Result", language="json")
     submit_button_component.click(
@@ -395,6 +393,7 @@ with gr.Blocks() as demo:
             bucket
         ],
         outputs=[
             output_json_component
         ]
     )

 from diffusers.utils import load_image
 import json
 HF_TOKEN = os.environ.get("HF_TOKEN")
 login(token=HF_TOKEN)
 good_vae = AutoencoderKL.from_pretrained(base_model, subfolder="vae", torch_dtype=dtype).to(device)
 pipe = FluxInpaintPipeline.from_pretrained(base_model, torch_dtype=dtype, vae=taef1).to(device)
 class calculateDuration:
     def __init__(self, activity_name=""):
     strength_slider: float,
     num_inference_steps_slider: int,
     resolution_wh: Tuple[int, int],
+    progress
 ) -> Image.Image:
     print("Running FLUX...")
     if lora_path and lora_weights:
     account_id: str,
     access_key: str,
     secret_key: str,
+    bucket:str,
+    progress=gr.Progress(track_tqdm=True)
 ):
     result = {"status": "false", "message": ""}
     if not image_url:
         gr.Info("please enter image url for inpaiting")
         result["message"] = "invalid image url"
+        return  None, json.dumps(result)
     if not inpainting_prompt_text:
         gr.Info("Please enter inpainting text prompt.")
         result["message"] = "invalid inpainting prompt"
+        return  None, json.dumps(result)
     with calculateDuration("load image"):
     if not image or not mask:
         gr.Info("Please upload an image & mask by url.")
         result["message"] = "can not load image"
+        return None, json.dumps(result)
     # generate
     width, height = calculate_image_dimensions_for_flux(original_resolution_wh=image.size)
     mask = mask.resize((width, height), Image.LANCZOS)
     mask = process_mask(mask, mask_inflation=mask_inflation_slider, mask_blur=mask_blur_slider)
+    try:
+        generated_image = run_flux(
+            image=image,
+            mask=mask,
+            prompt=inpainting_prompt_text,
+            lora_path=lora_path,
+            lora_scale=lora_scale,
+            lora_weights=lora_weights,
+            seed_slicer=seed_slicer,
+            randomize_seed_checkbox=randomize_seed_checkbox,
+            strength_slider=strength_slider,
+            num_inference_steps_slider=num_inference_steps_slider,
+            resolution_wh=(width, height),
+            progress=progress
+        )
+    except:
+        result["message"] = "generate image failed"
+        return None, json.dumps(result)
     if upload_to_r2:
         with calculateDuration("upload image"):
+            url = upload_image_to_r2(generated_image, account_id, access_key, secret_key, bucket)
+            result = {"status": "success", "message": "upload image success", "url": url}
     else:
         result = {"status": "success", "message": "Image generated but not uploaded"}
+    return generated_image, json.dumps(result)
 with gr.Blocks() as demo:
+    gr.Markdown("Flux inpaint with lora")
     with gr.Row():
         with gr.Column():
                     show_label=True,
                     max_lines=1,
                     placeholder="Enter image url for inpainting",
+                    container=False
                 )
             mask_url = gr.Text(
                     secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here")
         with gr.Column():
+            generated_image = gr.Image(label="Result", show_label=False)
             output_json_component = gr.Code(label="JSON Result", language="json")
     submit_button_component.click(
             bucket
         ],
         outputs=[
+            generated_image,
             output_json_component
         ]
     )

checkpoints/sam2_hiera_base_plus.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d0bb7f236400a49669ffdd1be617959a8b1d1065081789d7bbff88eded3a8071
-size 323493298

checkpoints/sam2_hiera_large.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b
-size 897952466

checkpoints/sam2_hiera_small.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:95949964d4e548409021d47b22712d5f1abf2564cc0c3c765ba599a24ac7dce3
-size 184309650

checkpoints/sam2_hiera_tiny.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:65b50056e05bcb13694174f51bb6da89c894b57b75ccdf0ba6352c597c5d1125
-size 155906050

configs/__init__.py DELETED Viewed

File without changes

configs/sam2_hiera_b+.yaml DELETED Viewed

@@ -1,113 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 112
-      num_heads: 2
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [896, 448, 224, 112]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False

configs/sam2_hiera_l.yaml DELETED Viewed

@@ -1,117 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 144
-      num_heads: 2
-      stages: [2, 6, 36, 4]
-      global_att_blocks: [23, 33, 43]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-      window_spec: [8, 4, 16, 8]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [1152, 576, 288, 144]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False

configs/sam2_hiera_s.yaml DELETED Viewed

@@ -1,116 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 96
-      num_heads: 1
-      stages: [1, 2, 11, 2]
-      global_att_blocks: [7, 10, 13]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [768, 384, 192, 96]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False

configs/sam2_hiera_t.yaml DELETED Viewed

@@ -1,118 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 96
-      num_heads: 1
-      stages: [1, 2, 7, 2]
-      global_att_blocks: [5, 7, 9]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [768, 384, 192, 96]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  # SAM decoder
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  # HieraT does not currently support compilation, should always be set to False
-  compile_image_encoder: False

utils/__init__.py DELETED Viewed

File without changes

utils/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (125 Bytes)

utils/__pycache__/florence.cpython-310.pyc DELETED Viewed

Binary file (2.31 kB)

utils/__pycache__/sam.cpython-310.pyc DELETED Viewed

Binary file (1.57 kB)

utils/florence.py DELETED Viewed

@@ -1,54 +0,0 @@
-import os
-from typing import Union, Any, Tuple, Dict
-from unittest.mock import patch
-import torch
-from PIL import Image
-from transformers import AutoModelForCausalLM, AutoProcessor
-from transformers.dynamic_module_utils import get_imports
-# FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
-FLORENCE_CHECKPOINT = "microsoft/Florence-2-large-ft"
-FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
-FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
-FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
-FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
-FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
-def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
-    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
-    if not str(filename).endswith("/modeling_florence2.py"):
-        return get_imports(filename)
-    imports = get_imports(filename)
-    imports.remove("flash_attn")
-    return imports
-def load_florence_model(
-    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
-) -> Tuple[Any, Any]:
-    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
-        model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device).eval()
-        processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
-        return model, processor
-def run_florence_inference(
-    model: Any,
-    processor: Any,
-    device: torch.device,
-    image: Image,
-    task: str,
-    text: str = None
-) -> Tuple[str, Dict]:
-    if text:
-        prompt = task + text
-    else:
-        prompt = task
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
-    generated_ids = model.generate(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3)
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-    response = processor.post_process_generation(generated_text, task=task, image_size=image.size)
-    print("run_florence_inference", "finish", generated_text, response)
-    return generated_text, response

utils/sam.py DELETED Viewed

@@ -1,50 +0,0 @@
-from typing import Any
-import numpy as np
-import supervision as sv
-import torch
-from PIL import Image
-from sam2.build_sam import build_sam2, build_sam2_video_predictor
-from sam2.sam2_image_predictor import SAM2ImagePredictor
-# SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
-# SAM_CONFIG = "sam2_hiera_s.yaml"
-SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
-SAM_CONFIG = "sam2_hiera_l.yaml"
-def load_sam_image_model(
-    device: torch.device,
-    config: str = SAM_CONFIG,
-    checkpoint: str = SAM_CHECKPOINT
-) -> SAM2ImagePredictor:
-    model = build_sam2(config, checkpoint, device=device)
-    return SAM2ImagePredictor(sam_model=model)
-def load_sam_video_model(
-    device: torch.device,
-    config: str = SAM_CONFIG,
-    checkpoint: str = SAM_CHECKPOINT
-) -> Any:
-    return build_sam2_video_predictor(config, checkpoint, device=device)
-def run_sam_inference(
-    model: Any,
-    image: Image,
-    detections: sv.Detections
-) -> sv.Detections:
-    image = np.array(image.convert("RGB"))
-    model.set_image(image)
-    # from left to right
-    bboxes = detections.xyxy
-    bboxes = sorted(bboxes, key=lambda bbox: bbox[0])
-    mask, score, _ = model.predict(box=bboxes, multimask_output=False)
-    # dirty fix; remove this later
-    if len(mask.shape) == 4:
-        mask = np.squeeze(mask)
-    detections.mask = mask.astype(bool)
-    return detections