Spaces:
Running
on
Zero
Running
on
Zero
bugfix
Browse files- app.py +32 -33
- checkpoints/sam2_hiera_base_plus.pt +0 -3
- checkpoints/sam2_hiera_large.pt +0 -3
- checkpoints/sam2_hiera_small.pt +0 -3
- checkpoints/sam2_hiera_tiny.pt +0 -3
- configs/__init__.py +0 -0
- configs/sam2_hiera_b+.yaml +0 -113
- configs/sam2_hiera_l.yaml +0 -117
- configs/sam2_hiera_s.yaml +0 -116
- configs/sam2_hiera_t.yaml +0 -118
- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/florence.cpython-310.pyc +0 -0
- utils/__pycache__/sam.cpython-310.pyc +0 -0
- utils/florence.py +0 -54
- utils/sam.py +0 -50
app.py
CHANGED
@@ -21,13 +21,6 @@ from datetime import datetime
|
|
21 |
from diffusers.utils import load_image
|
22 |
import json
|
23 |
|
24 |
-
from utils.florence import load_florence_model, run_florence_inference, \
|
25 |
-
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
|
26 |
-
from utils.sam import load_sam_image_model, run_sam_inference
|
27 |
-
import supervision as sv
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
32 |
|
33 |
login(token=HF_TOKEN)
|
@@ -44,9 +37,6 @@ taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).
|
|
44 |
good_vae = AutoencoderKL.from_pretrained(base_model, subfolder="vae", torch_dtype=dtype).to(device)
|
45 |
pipe = FluxInpaintPipeline.from_pretrained(base_model, torch_dtype=dtype, vae=taef1).to(device)
|
46 |
|
47 |
-
# FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=device)
|
48 |
-
# SAM_IMAGE_MODEL = load_sam_image_model(device=device)
|
49 |
-
|
50 |
|
51 |
class calculateDuration:
|
52 |
def __init__(self, activity_name=""):
|
@@ -148,6 +138,7 @@ def run_flux(
|
|
148 |
strength_slider: float,
|
149 |
num_inference_steps_slider: int,
|
150 |
resolution_wh: Tuple[int, int],
|
|
|
151 |
) -> Image.Image:
|
152 |
print("Running FLUX...")
|
153 |
if lora_path and lora_weights:
|
@@ -194,18 +185,19 @@ def process(
|
|
194 |
account_id: str,
|
195 |
access_key: str,
|
196 |
secret_key: str,
|
197 |
-
bucket:str
|
|
|
198 |
):
|
199 |
result = {"status": "false", "message": ""}
|
200 |
if not image_url:
|
201 |
gr.Info("please enter image url for inpaiting")
|
202 |
result["message"] = "invalid image url"
|
203 |
-
return json.dumps(result)
|
204 |
|
205 |
if not inpainting_prompt_text:
|
206 |
gr.Info("Please enter inpainting text prompt.")
|
207 |
result["message"] = "invalid inpainting prompt"
|
208 |
-
return json.dumps(result)
|
209 |
|
210 |
|
211 |
with calculateDuration("load image"):
|
@@ -215,7 +207,7 @@ def process(
|
|
215 |
if not image or not mask:
|
216 |
gr.Info("Please upload an image & mask by url.")
|
217 |
result["message"] = "can not load image"
|
218 |
-
return json.dumps(result)
|
219 |
|
220 |
# generate
|
221 |
width, height = calculate_image_dimensions_for_flux(original_resolution_wh=image.size)
|
@@ -223,32 +215,37 @@ def process(
|
|
223 |
mask = mask.resize((width, height), Image.LANCZOS)
|
224 |
mask = process_mask(mask, mask_inflation=mask_inflation_slider, mask_blur=mask_blur_slider)
|
225 |
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
240 |
if upload_to_r2:
|
241 |
with calculateDuration("upload image"):
|
242 |
-
url = upload_image_to_r2(
|
243 |
-
result = {"status": "success", "url": url}
|
244 |
else:
|
245 |
result = {"status": "success", "message": "Image generated but not uploaded"}
|
246 |
|
247 |
-
return json.dumps(result)
|
248 |
|
249 |
|
250 |
with gr.Blocks() as demo:
|
251 |
-
|
252 |
with gr.Row():
|
253 |
with gr.Column():
|
254 |
|
@@ -257,7 +254,7 @@ with gr.Blocks() as demo:
|
|
257 |
show_label=True,
|
258 |
max_lines=1,
|
259 |
placeholder="Enter image url for inpainting",
|
260 |
-
container=False
|
261 |
)
|
262 |
|
263 |
mask_url = gr.Text(
|
@@ -371,6 +368,7 @@ with gr.Blocks() as demo:
|
|
371 |
secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here")
|
372 |
|
373 |
with gr.Column():
|
|
|
374 |
output_json_component = gr.Code(label="JSON Result", language="json")
|
375 |
|
376 |
submit_button_component.click(
|
@@ -395,6 +393,7 @@ with gr.Blocks() as demo:
|
|
395 |
bucket
|
396 |
],
|
397 |
outputs=[
|
|
|
398 |
output_json_component
|
399 |
]
|
400 |
)
|
|
|
21 |
from diffusers.utils import load_image
|
22 |
import json
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
25 |
|
26 |
login(token=HF_TOKEN)
|
|
|
37 |
good_vae = AutoencoderKL.from_pretrained(base_model, subfolder="vae", torch_dtype=dtype).to(device)
|
38 |
pipe = FluxInpaintPipeline.from_pretrained(base_model, torch_dtype=dtype, vae=taef1).to(device)
|
39 |
|
|
|
|
|
|
|
40 |
|
41 |
class calculateDuration:
|
42 |
def __init__(self, activity_name=""):
|
|
|
138 |
strength_slider: float,
|
139 |
num_inference_steps_slider: int,
|
140 |
resolution_wh: Tuple[int, int],
|
141 |
+
progress
|
142 |
) -> Image.Image:
|
143 |
print("Running FLUX...")
|
144 |
if lora_path and lora_weights:
|
|
|
185 |
account_id: str,
|
186 |
access_key: str,
|
187 |
secret_key: str,
|
188 |
+
bucket:str,
|
189 |
+
progress=gr.Progress(track_tqdm=True)
|
190 |
):
|
191 |
result = {"status": "false", "message": ""}
|
192 |
if not image_url:
|
193 |
gr.Info("please enter image url for inpaiting")
|
194 |
result["message"] = "invalid image url"
|
195 |
+
return None, json.dumps(result)
|
196 |
|
197 |
if not inpainting_prompt_text:
|
198 |
gr.Info("Please enter inpainting text prompt.")
|
199 |
result["message"] = "invalid inpainting prompt"
|
200 |
+
return None, json.dumps(result)
|
201 |
|
202 |
|
203 |
with calculateDuration("load image"):
|
|
|
207 |
if not image or not mask:
|
208 |
gr.Info("Please upload an image & mask by url.")
|
209 |
result["message"] = "can not load image"
|
210 |
+
return None, json.dumps(result)
|
211 |
|
212 |
# generate
|
213 |
width, height = calculate_image_dimensions_for_flux(original_resolution_wh=image.size)
|
|
|
215 |
mask = mask.resize((width, height), Image.LANCZOS)
|
216 |
mask = process_mask(mask, mask_inflation=mask_inflation_slider, mask_blur=mask_blur_slider)
|
217 |
|
218 |
+
try:
|
219 |
+
generated_image = run_flux(
|
220 |
+
image=image,
|
221 |
+
mask=mask,
|
222 |
+
prompt=inpainting_prompt_text,
|
223 |
+
lora_path=lora_path,
|
224 |
+
lora_scale=lora_scale,
|
225 |
+
lora_weights=lora_weights,
|
226 |
+
seed_slicer=seed_slicer,
|
227 |
+
randomize_seed_checkbox=randomize_seed_checkbox,
|
228 |
+
strength_slider=strength_slider,
|
229 |
+
num_inference_steps_slider=num_inference_steps_slider,
|
230 |
+
resolution_wh=(width, height),
|
231 |
+
progress=progress
|
232 |
+
)
|
233 |
+
except:
|
234 |
+
result["message"] = "generate image failed"
|
235 |
+
return None, json.dumps(result)
|
236 |
+
|
237 |
if upload_to_r2:
|
238 |
with calculateDuration("upload image"):
|
239 |
+
url = upload_image_to_r2(generated_image, account_id, access_key, secret_key, bucket)
|
240 |
+
result = {"status": "success", "message": "upload image success", "url": url}
|
241 |
else:
|
242 |
result = {"status": "success", "message": "Image generated but not uploaded"}
|
243 |
|
244 |
+
return generated_image, json.dumps(result)
|
245 |
|
246 |
|
247 |
with gr.Blocks() as demo:
|
248 |
+
gr.Markdown("Flux inpaint with lora")
|
249 |
with gr.Row():
|
250 |
with gr.Column():
|
251 |
|
|
|
254 |
show_label=True,
|
255 |
max_lines=1,
|
256 |
placeholder="Enter image url for inpainting",
|
257 |
+
container=False
|
258 |
)
|
259 |
|
260 |
mask_url = gr.Text(
|
|
|
368 |
secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here")
|
369 |
|
370 |
with gr.Column():
|
371 |
+
generated_image = gr.Image(label="Result", show_label=False)
|
372 |
output_json_component = gr.Code(label="JSON Result", language="json")
|
373 |
|
374 |
submit_button_component.click(
|
|
|
393 |
bucket
|
394 |
],
|
395 |
outputs=[
|
396 |
+
generated_image,
|
397 |
output_json_component
|
398 |
]
|
399 |
)
|
checkpoints/sam2_hiera_base_plus.pt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d0bb7f236400a49669ffdd1be617959a8b1d1065081789d7bbff88eded3a8071
|
3 |
-
size 323493298
|
|
|
|
|
|
|
|
checkpoints/sam2_hiera_large.pt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b
|
3 |
-
size 897952466
|
|
|
|
|
|
|
|
checkpoints/sam2_hiera_small.pt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:95949964d4e548409021d47b22712d5f1abf2564cc0c3c765ba599a24ac7dce3
|
3 |
-
size 184309650
|
|
|
|
|
|
|
|
checkpoints/sam2_hiera_tiny.pt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:65b50056e05bcb13694174f51bb6da89c894b57b75ccdf0ba6352c597c5d1125
|
3 |
-
size 155906050
|
|
|
|
|
|
|
|
configs/__init__.py
DELETED
File without changes
|
configs/sam2_hiera_b+.yaml
DELETED
@@ -1,113 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 112
|
12 |
-
num_heads: 2
|
13 |
-
neck:
|
14 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
15 |
-
position_encoding:
|
16 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
17 |
-
num_pos_feats: 256
|
18 |
-
normalize: true
|
19 |
-
scale: null
|
20 |
-
temperature: 10000
|
21 |
-
d_model: 256
|
22 |
-
backbone_channel_list: [896, 448, 224, 112]
|
23 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
24 |
-
fpn_interp_model: nearest
|
25 |
-
|
26 |
-
memory_attention:
|
27 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
28 |
-
d_model: 256
|
29 |
-
pos_enc_at_input: true
|
30 |
-
layer:
|
31 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
32 |
-
activation: relu
|
33 |
-
dim_feedforward: 2048
|
34 |
-
dropout: 0.1
|
35 |
-
pos_enc_at_attn: false
|
36 |
-
self_attention:
|
37 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
38 |
-
rope_theta: 10000.0
|
39 |
-
feat_sizes: [32, 32]
|
40 |
-
embedding_dim: 256
|
41 |
-
num_heads: 1
|
42 |
-
downsample_rate: 1
|
43 |
-
dropout: 0.1
|
44 |
-
d_model: 256
|
45 |
-
pos_enc_at_cross_attn_keys: true
|
46 |
-
pos_enc_at_cross_attn_queries: false
|
47 |
-
cross_attention:
|
48 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
49 |
-
rope_theta: 10000.0
|
50 |
-
feat_sizes: [32, 32]
|
51 |
-
rope_k_repeat: True
|
52 |
-
embedding_dim: 256
|
53 |
-
num_heads: 1
|
54 |
-
downsample_rate: 1
|
55 |
-
dropout: 0.1
|
56 |
-
kv_in_dim: 64
|
57 |
-
num_layers: 4
|
58 |
-
|
59 |
-
memory_encoder:
|
60 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
61 |
-
out_dim: 64
|
62 |
-
position_encoding:
|
63 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
64 |
-
num_pos_feats: 64
|
65 |
-
normalize: true
|
66 |
-
scale: null
|
67 |
-
temperature: 10000
|
68 |
-
mask_downsampler:
|
69 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
70 |
-
kernel_size: 3
|
71 |
-
stride: 2
|
72 |
-
padding: 1
|
73 |
-
fuser:
|
74 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
75 |
-
layer:
|
76 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
77 |
-
dim: 256
|
78 |
-
kernel_size: 7
|
79 |
-
padding: 3
|
80 |
-
layer_scale_init_value: 1e-6
|
81 |
-
use_dwconv: True # depth-wise convs
|
82 |
-
num_layers: 2
|
83 |
-
|
84 |
-
num_maskmem: 7
|
85 |
-
image_size: 1024
|
86 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
87 |
-
sigmoid_scale_for_mem_enc: 20.0
|
88 |
-
sigmoid_bias_for_mem_enc: -10.0
|
89 |
-
use_mask_input_as_output_without_sam: true
|
90 |
-
# Memory
|
91 |
-
directly_add_no_mem_embed: true
|
92 |
-
# use high-resolution feature map in the SAM mask decoder
|
93 |
-
use_high_res_features_in_sam: true
|
94 |
-
# output 3 masks on the first click on initial conditioning frames
|
95 |
-
multimask_output_in_sam: true
|
96 |
-
# SAM heads
|
97 |
-
iou_prediction_use_sigmoid: True
|
98 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
99 |
-
use_obj_ptrs_in_encoder: true
|
100 |
-
add_tpos_enc_to_obj_ptrs: false
|
101 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
102 |
-
# object occlusion prediction
|
103 |
-
pred_obj_scores: true
|
104 |
-
pred_obj_scores_mlp: true
|
105 |
-
fixed_no_obj_ptr: true
|
106 |
-
# multimask tracking settings
|
107 |
-
multimask_output_for_tracking: true
|
108 |
-
use_multimask_token_for_obj_ptr: true
|
109 |
-
multimask_min_pt_num: 0
|
110 |
-
multimask_max_pt_num: 1
|
111 |
-
use_mlp_for_obj_ptr_proj: true
|
112 |
-
# Compilation flag
|
113 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs/sam2_hiera_l.yaml
DELETED
@@ -1,117 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 144
|
12 |
-
num_heads: 2
|
13 |
-
stages: [2, 6, 36, 4]
|
14 |
-
global_att_blocks: [23, 33, 43]
|
15 |
-
window_pos_embed_bkg_spatial_size: [7, 7]
|
16 |
-
window_spec: [8, 4, 16, 8]
|
17 |
-
neck:
|
18 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
19 |
-
position_encoding:
|
20 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
21 |
-
num_pos_feats: 256
|
22 |
-
normalize: true
|
23 |
-
scale: null
|
24 |
-
temperature: 10000
|
25 |
-
d_model: 256
|
26 |
-
backbone_channel_list: [1152, 576, 288, 144]
|
27 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
28 |
-
fpn_interp_model: nearest
|
29 |
-
|
30 |
-
memory_attention:
|
31 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
32 |
-
d_model: 256
|
33 |
-
pos_enc_at_input: true
|
34 |
-
layer:
|
35 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
36 |
-
activation: relu
|
37 |
-
dim_feedforward: 2048
|
38 |
-
dropout: 0.1
|
39 |
-
pos_enc_at_attn: false
|
40 |
-
self_attention:
|
41 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
42 |
-
rope_theta: 10000.0
|
43 |
-
feat_sizes: [32, 32]
|
44 |
-
embedding_dim: 256
|
45 |
-
num_heads: 1
|
46 |
-
downsample_rate: 1
|
47 |
-
dropout: 0.1
|
48 |
-
d_model: 256
|
49 |
-
pos_enc_at_cross_attn_keys: true
|
50 |
-
pos_enc_at_cross_attn_queries: false
|
51 |
-
cross_attention:
|
52 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
53 |
-
rope_theta: 10000.0
|
54 |
-
feat_sizes: [32, 32]
|
55 |
-
rope_k_repeat: True
|
56 |
-
embedding_dim: 256
|
57 |
-
num_heads: 1
|
58 |
-
downsample_rate: 1
|
59 |
-
dropout: 0.1
|
60 |
-
kv_in_dim: 64
|
61 |
-
num_layers: 4
|
62 |
-
|
63 |
-
memory_encoder:
|
64 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
65 |
-
out_dim: 64
|
66 |
-
position_encoding:
|
67 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
68 |
-
num_pos_feats: 64
|
69 |
-
normalize: true
|
70 |
-
scale: null
|
71 |
-
temperature: 10000
|
72 |
-
mask_downsampler:
|
73 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
74 |
-
kernel_size: 3
|
75 |
-
stride: 2
|
76 |
-
padding: 1
|
77 |
-
fuser:
|
78 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
79 |
-
layer:
|
80 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
81 |
-
dim: 256
|
82 |
-
kernel_size: 7
|
83 |
-
padding: 3
|
84 |
-
layer_scale_init_value: 1e-6
|
85 |
-
use_dwconv: True # depth-wise convs
|
86 |
-
num_layers: 2
|
87 |
-
|
88 |
-
num_maskmem: 7
|
89 |
-
image_size: 1024
|
90 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
91 |
-
sigmoid_scale_for_mem_enc: 20.0
|
92 |
-
sigmoid_bias_for_mem_enc: -10.0
|
93 |
-
use_mask_input_as_output_without_sam: true
|
94 |
-
# Memory
|
95 |
-
directly_add_no_mem_embed: true
|
96 |
-
# use high-resolution feature map in the SAM mask decoder
|
97 |
-
use_high_res_features_in_sam: true
|
98 |
-
# output 3 masks on the first click on initial conditioning frames
|
99 |
-
multimask_output_in_sam: true
|
100 |
-
# SAM heads
|
101 |
-
iou_prediction_use_sigmoid: True
|
102 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
103 |
-
use_obj_ptrs_in_encoder: true
|
104 |
-
add_tpos_enc_to_obj_ptrs: false
|
105 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
106 |
-
# object occlusion prediction
|
107 |
-
pred_obj_scores: true
|
108 |
-
pred_obj_scores_mlp: true
|
109 |
-
fixed_no_obj_ptr: true
|
110 |
-
# multimask tracking settings
|
111 |
-
multimask_output_for_tracking: true
|
112 |
-
use_multimask_token_for_obj_ptr: true
|
113 |
-
multimask_min_pt_num: 0
|
114 |
-
multimask_max_pt_num: 1
|
115 |
-
use_mlp_for_obj_ptr_proj: true
|
116 |
-
# Compilation flag
|
117 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs/sam2_hiera_s.yaml
DELETED
@@ -1,116 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 96
|
12 |
-
num_heads: 1
|
13 |
-
stages: [1, 2, 11, 2]
|
14 |
-
global_att_blocks: [7, 10, 13]
|
15 |
-
window_pos_embed_bkg_spatial_size: [7, 7]
|
16 |
-
neck:
|
17 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
18 |
-
position_encoding:
|
19 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
20 |
-
num_pos_feats: 256
|
21 |
-
normalize: true
|
22 |
-
scale: null
|
23 |
-
temperature: 10000
|
24 |
-
d_model: 256
|
25 |
-
backbone_channel_list: [768, 384, 192, 96]
|
26 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
27 |
-
fpn_interp_model: nearest
|
28 |
-
|
29 |
-
memory_attention:
|
30 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
31 |
-
d_model: 256
|
32 |
-
pos_enc_at_input: true
|
33 |
-
layer:
|
34 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
35 |
-
activation: relu
|
36 |
-
dim_feedforward: 2048
|
37 |
-
dropout: 0.1
|
38 |
-
pos_enc_at_attn: false
|
39 |
-
self_attention:
|
40 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
41 |
-
rope_theta: 10000.0
|
42 |
-
feat_sizes: [32, 32]
|
43 |
-
embedding_dim: 256
|
44 |
-
num_heads: 1
|
45 |
-
downsample_rate: 1
|
46 |
-
dropout: 0.1
|
47 |
-
d_model: 256
|
48 |
-
pos_enc_at_cross_attn_keys: true
|
49 |
-
pos_enc_at_cross_attn_queries: false
|
50 |
-
cross_attention:
|
51 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
52 |
-
rope_theta: 10000.0
|
53 |
-
feat_sizes: [32, 32]
|
54 |
-
rope_k_repeat: True
|
55 |
-
embedding_dim: 256
|
56 |
-
num_heads: 1
|
57 |
-
downsample_rate: 1
|
58 |
-
dropout: 0.1
|
59 |
-
kv_in_dim: 64
|
60 |
-
num_layers: 4
|
61 |
-
|
62 |
-
memory_encoder:
|
63 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
64 |
-
out_dim: 64
|
65 |
-
position_encoding:
|
66 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
67 |
-
num_pos_feats: 64
|
68 |
-
normalize: true
|
69 |
-
scale: null
|
70 |
-
temperature: 10000
|
71 |
-
mask_downsampler:
|
72 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
73 |
-
kernel_size: 3
|
74 |
-
stride: 2
|
75 |
-
padding: 1
|
76 |
-
fuser:
|
77 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
78 |
-
layer:
|
79 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
80 |
-
dim: 256
|
81 |
-
kernel_size: 7
|
82 |
-
padding: 3
|
83 |
-
layer_scale_init_value: 1e-6
|
84 |
-
use_dwconv: True # depth-wise convs
|
85 |
-
num_layers: 2
|
86 |
-
|
87 |
-
num_maskmem: 7
|
88 |
-
image_size: 1024
|
89 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
90 |
-
sigmoid_scale_for_mem_enc: 20.0
|
91 |
-
sigmoid_bias_for_mem_enc: -10.0
|
92 |
-
use_mask_input_as_output_without_sam: true
|
93 |
-
# Memory
|
94 |
-
directly_add_no_mem_embed: true
|
95 |
-
# use high-resolution feature map in the SAM mask decoder
|
96 |
-
use_high_res_features_in_sam: true
|
97 |
-
# output 3 masks on the first click on initial conditioning frames
|
98 |
-
multimask_output_in_sam: true
|
99 |
-
# SAM heads
|
100 |
-
iou_prediction_use_sigmoid: True
|
101 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
102 |
-
use_obj_ptrs_in_encoder: true
|
103 |
-
add_tpos_enc_to_obj_ptrs: false
|
104 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
105 |
-
# object occlusion prediction
|
106 |
-
pred_obj_scores: true
|
107 |
-
pred_obj_scores_mlp: true
|
108 |
-
fixed_no_obj_ptr: true
|
109 |
-
# multimask tracking settings
|
110 |
-
multimask_output_for_tracking: true
|
111 |
-
use_multimask_token_for_obj_ptr: true
|
112 |
-
multimask_min_pt_num: 0
|
113 |
-
multimask_max_pt_num: 1
|
114 |
-
use_mlp_for_obj_ptr_proj: true
|
115 |
-
# Compilation flag
|
116 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs/sam2_hiera_t.yaml
DELETED
@@ -1,118 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 96
|
12 |
-
num_heads: 1
|
13 |
-
stages: [1, 2, 7, 2]
|
14 |
-
global_att_blocks: [5, 7, 9]
|
15 |
-
window_pos_embed_bkg_spatial_size: [7, 7]
|
16 |
-
neck:
|
17 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
18 |
-
position_encoding:
|
19 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
20 |
-
num_pos_feats: 256
|
21 |
-
normalize: true
|
22 |
-
scale: null
|
23 |
-
temperature: 10000
|
24 |
-
d_model: 256
|
25 |
-
backbone_channel_list: [768, 384, 192, 96]
|
26 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
27 |
-
fpn_interp_model: nearest
|
28 |
-
|
29 |
-
memory_attention:
|
30 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
31 |
-
d_model: 256
|
32 |
-
pos_enc_at_input: true
|
33 |
-
layer:
|
34 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
35 |
-
activation: relu
|
36 |
-
dim_feedforward: 2048
|
37 |
-
dropout: 0.1
|
38 |
-
pos_enc_at_attn: false
|
39 |
-
self_attention:
|
40 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
41 |
-
rope_theta: 10000.0
|
42 |
-
feat_sizes: [32, 32]
|
43 |
-
embedding_dim: 256
|
44 |
-
num_heads: 1
|
45 |
-
downsample_rate: 1
|
46 |
-
dropout: 0.1
|
47 |
-
d_model: 256
|
48 |
-
pos_enc_at_cross_attn_keys: true
|
49 |
-
pos_enc_at_cross_attn_queries: false
|
50 |
-
cross_attention:
|
51 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
52 |
-
rope_theta: 10000.0
|
53 |
-
feat_sizes: [32, 32]
|
54 |
-
rope_k_repeat: True
|
55 |
-
embedding_dim: 256
|
56 |
-
num_heads: 1
|
57 |
-
downsample_rate: 1
|
58 |
-
dropout: 0.1
|
59 |
-
kv_in_dim: 64
|
60 |
-
num_layers: 4
|
61 |
-
|
62 |
-
memory_encoder:
|
63 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
64 |
-
out_dim: 64
|
65 |
-
position_encoding:
|
66 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
67 |
-
num_pos_feats: 64
|
68 |
-
normalize: true
|
69 |
-
scale: null
|
70 |
-
temperature: 10000
|
71 |
-
mask_downsampler:
|
72 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
73 |
-
kernel_size: 3
|
74 |
-
stride: 2
|
75 |
-
padding: 1
|
76 |
-
fuser:
|
77 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
78 |
-
layer:
|
79 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
80 |
-
dim: 256
|
81 |
-
kernel_size: 7
|
82 |
-
padding: 3
|
83 |
-
layer_scale_init_value: 1e-6
|
84 |
-
use_dwconv: True # depth-wise convs
|
85 |
-
num_layers: 2
|
86 |
-
|
87 |
-
num_maskmem: 7
|
88 |
-
image_size: 1024
|
89 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
90 |
-
# SAM decoder
|
91 |
-
sigmoid_scale_for_mem_enc: 20.0
|
92 |
-
sigmoid_bias_for_mem_enc: -10.0
|
93 |
-
use_mask_input_as_output_without_sam: true
|
94 |
-
# Memory
|
95 |
-
directly_add_no_mem_embed: true
|
96 |
-
# use high-resolution feature map in the SAM mask decoder
|
97 |
-
use_high_res_features_in_sam: true
|
98 |
-
# output 3 masks on the first click on initial conditioning frames
|
99 |
-
multimask_output_in_sam: true
|
100 |
-
# SAM heads
|
101 |
-
iou_prediction_use_sigmoid: True
|
102 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
103 |
-
use_obj_ptrs_in_encoder: true
|
104 |
-
add_tpos_enc_to_obj_ptrs: false
|
105 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
106 |
-
# object occlusion prediction
|
107 |
-
pred_obj_scores: true
|
108 |
-
pred_obj_scores_mlp: true
|
109 |
-
fixed_no_obj_ptr: true
|
110 |
-
# multimask tracking settings
|
111 |
-
multimask_output_for_tracking: true
|
112 |
-
use_multimask_token_for_obj_ptr: true
|
113 |
-
multimask_min_pt_num: 0
|
114 |
-
multimask_max_pt_num: 1
|
115 |
-
use_mlp_for_obj_ptr_proj: true
|
116 |
-
# Compilation flag
|
117 |
-
# HieraT does not currently support compilation, should always be set to False
|
118 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/__init__.py
DELETED
File without changes
|
utils/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (125 Bytes)
|
|
utils/__pycache__/florence.cpython-310.pyc
DELETED
Binary file (2.31 kB)
|
|
utils/__pycache__/sam.cpython-310.pyc
DELETED
Binary file (1.57 kB)
|
|
utils/florence.py
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from typing import Union, Any, Tuple, Dict
|
3 |
-
from unittest.mock import patch
|
4 |
-
|
5 |
-
import torch
|
6 |
-
from PIL import Image
|
7 |
-
from transformers import AutoModelForCausalLM, AutoProcessor
|
8 |
-
from transformers.dynamic_module_utils import get_imports
|
9 |
-
|
10 |
-
# FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
|
11 |
-
FLORENCE_CHECKPOINT = "microsoft/Florence-2-large-ft"
|
12 |
-
FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
|
13 |
-
FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
|
14 |
-
FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
|
15 |
-
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
|
16 |
-
FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
|
17 |
-
|
18 |
-
|
19 |
-
def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
|
20 |
-
"""Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
|
21 |
-
if not str(filename).endswith("/modeling_florence2.py"):
|
22 |
-
return get_imports(filename)
|
23 |
-
imports = get_imports(filename)
|
24 |
-
imports.remove("flash_attn")
|
25 |
-
return imports
|
26 |
-
|
27 |
-
|
28 |
-
def load_florence_model(
|
29 |
-
device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
|
30 |
-
) -> Tuple[Any, Any]:
|
31 |
-
with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
|
32 |
-
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device).eval()
|
33 |
-
processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
|
34 |
-
return model, processor
|
35 |
-
|
36 |
-
|
37 |
-
def run_florence_inference(
|
38 |
-
model: Any,
|
39 |
-
processor: Any,
|
40 |
-
device: torch.device,
|
41 |
-
image: Image,
|
42 |
-
task: str,
|
43 |
-
text: str = None
|
44 |
-
) -> Tuple[str, Dict]:
|
45 |
-
if text:
|
46 |
-
prompt = task + text
|
47 |
-
else:
|
48 |
-
prompt = task
|
49 |
-
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
|
50 |
-
generated_ids = model.generate(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3)
|
51 |
-
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
|
52 |
-
response = processor.post_process_generation(generated_text, task=task, image_size=image.size)
|
53 |
-
print("run_florence_inference", "finish", generated_text, response)
|
54 |
-
return generated_text, response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/sam.py
DELETED
@@ -1,50 +0,0 @@
|
|
1 |
-
from typing import Any
|
2 |
-
|
3 |
-
import numpy as np
|
4 |
-
import supervision as sv
|
5 |
-
import torch
|
6 |
-
from PIL import Image
|
7 |
-
from sam2.build_sam import build_sam2, build_sam2_video_predictor
|
8 |
-
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
9 |
-
|
10 |
-
# SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
|
11 |
-
# SAM_CONFIG = "sam2_hiera_s.yaml"
|
12 |
-
SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
|
13 |
-
SAM_CONFIG = "sam2_hiera_l.yaml"
|
14 |
-
|
15 |
-
|
16 |
-
def load_sam_image_model(
|
17 |
-
device: torch.device,
|
18 |
-
config: str = SAM_CONFIG,
|
19 |
-
checkpoint: str = SAM_CHECKPOINT
|
20 |
-
) -> SAM2ImagePredictor:
|
21 |
-
model = build_sam2(config, checkpoint, device=device)
|
22 |
-
return SAM2ImagePredictor(sam_model=model)
|
23 |
-
|
24 |
-
|
25 |
-
def load_sam_video_model(
|
26 |
-
device: torch.device,
|
27 |
-
config: str = SAM_CONFIG,
|
28 |
-
checkpoint: str = SAM_CHECKPOINT
|
29 |
-
) -> Any:
|
30 |
-
return build_sam2_video_predictor(config, checkpoint, device=device)
|
31 |
-
|
32 |
-
|
33 |
-
def run_sam_inference(
|
34 |
-
model: Any,
|
35 |
-
image: Image,
|
36 |
-
detections: sv.Detections
|
37 |
-
) -> sv.Detections:
|
38 |
-
image = np.array(image.convert("RGB"))
|
39 |
-
model.set_image(image)
|
40 |
-
# from left to right
|
41 |
-
bboxes = detections.xyxy
|
42 |
-
bboxes = sorted(bboxes, key=lambda bbox: bbox[0])
|
43 |
-
mask, score, _ = model.predict(box=bboxes, multimask_output=False)
|
44 |
-
|
45 |
-
# dirty fix; remove this later
|
46 |
-
if len(mask.shape) == 4:
|
47 |
-
mask = np.squeeze(mask)
|
48 |
-
|
49 |
-
detections.mask = mask.astype(bool)
|
50 |
-
return detections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|