jiuface committed
Commit f19c22c · 1 Parent(s): f1a7906
app.py CHANGED
@@ -21,13 +21,6 @@ from datetime import datetime
 from diffusers.utils import load_image
 import json
 
-from utils.florence import load_florence_model, run_florence_inference, \
-    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
-from utils.sam import load_sam_image_model, run_sam_inference
-import supervision as sv
-
-
-
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
 login(token=HF_TOKEN)
@@ -44,9 +37,6 @@ taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).
 good_vae = AutoencoderKL.from_pretrained(base_model, subfolder="vae", torch_dtype=dtype).to(device)
 pipe = FluxInpaintPipeline.from_pretrained(base_model, torch_dtype=dtype, vae=taef1).to(device)
 
-# FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=device)
-# SAM_IMAGE_MODEL = load_sam_image_model(device=device)
-
 
 class calculateDuration:
     def __init__(self, activity_name=""):
@@ -148,6 +138,7 @@ def run_flux(
     strength_slider: float,
     num_inference_steps_slider: int,
     resolution_wh: Tuple[int, int],
+    progress
 ) -> Image.Image:
     print("Running FLUX...")
     if lora_path and lora_weights:
@@ -194,18 +185,19 @@ def process(
     account_id: str,
     access_key: str,
     secret_key: str,
-    bucket:str
+    bucket:str,
+    progress=gr.Progress(track_tqdm=True)
 ):
     result = {"status": "false", "message": ""}
     if not image_url:
         gr.Info("please enter image url for inpaiting")
         result["message"] = "invalid image url"
-        return json.dumps(result)
+        return None, json.dumps(result)
 
     if not inpainting_prompt_text:
         gr.Info("Please enter inpainting text prompt.")
         result["message"] = "invalid inpainting prompt"
-        return json.dumps(result)
+        return None, json.dumps(result)
 
 
     with calculateDuration("load image"):
@@ -215,7 +207,7 @@ def process(
     if not image or not mask:
         gr.Info("Please upload an image & mask by url.")
         result["message"] = "can not load image"
-        return json.dumps(result)
+        return None, json.dumps(result)
 
     # generate
     width, height = calculate_image_dimensions_for_flux(original_resolution_wh=image.size)
@@ -223,32 +215,37 @@ def process(
     mask = mask.resize((width, height), Image.LANCZOS)
     mask = process_mask(mask, mask_inflation=mask_inflation_slider, mask_blur=mask_blur_slider)
 
-    image = run_flux(
-        image=image,
-        mask=mask,
-        prompt=inpainting_prompt_text,
-        lora_path=lora_path,
-        lora_scale=lora_scale,
-        lora_weights=lora_weights,
-        seed_slicer=seed_slicer,
-        randomize_seed_checkbox=randomize_seed_checkbox,
-        strength_slider=strength_slider,
-        num_inference_steps_slider=num_inference_steps_slider,
-        resolution_wh=(width, height)
-    )
-
+    try:
+        generated_image = run_flux(
+            image=image,
+            mask=mask,
+            prompt=inpainting_prompt_text,
+            lora_path=lora_path,
+            lora_scale=lora_scale,
+            lora_weights=lora_weights,
+            seed_slicer=seed_slicer,
+            randomize_seed_checkbox=randomize_seed_checkbox,
+            strength_slider=strength_slider,
+            num_inference_steps_slider=num_inference_steps_slider,
+            resolution_wh=(width, height),
+            progress=progress
+        )
+    except:
+        result["message"] = "generate image failed"
+        return None, json.dumps(result)
+
     if upload_to_r2:
         with calculateDuration("upload image"):
-            url = upload_image_to_r2(image, account_id, access_key, secret_key, bucket)
-            result = {"status": "success", "url": url}
+            url = upload_image_to_r2(generated_image, account_id, access_key, secret_key, bucket)
+            result = {"status": "success", "message": "upload image success", "url": url}
     else:
         result = {"status": "success", "message": "Image generated but not uploaded"}
 
-    return json.dumps(result)
+    return generated_image, json.dumps(result)
 
 
 with gr.Blocks() as demo:
-
+    gr.Markdown("Flux inpaint with lora")
     with gr.Row():
         with gr.Column():
 
@@ -257,7 +254,7 @@ with gr.Blocks() as demo:
                 show_label=True,
                 max_lines=1,
                 placeholder="Enter image url for inpainting",
-                container=False,
+                container=False
             )
 
             mask_url = gr.Text(
@@ -371,6 +368,7 @@ with gr.Blocks() as demo:
             secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here")
 
         with gr.Column():
+            generated_image = gr.Image(label="Result", show_label=False)
            output_json_component = gr.Code(label="JSON Result", language="json")
 
     submit_button_component.click(
@@ -395,6 +393,7 @@ with gr.Blocks() as demo:
             bucket
         ],
        outputs=[
+            generated_image,
            output_json_component
        ]
    )
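
For reference, the image-plus-JSON output pattern that the updated process() handler adopts above can be reproduced in isolation with the minimal sketch below; every name in it (run, prompt, result_image, result_json) is illustrative only and is not part of the Space's actual code.

import json

import gradio as gr
from PIL import Image


def run(prompt: str, progress=gr.Progress(track_tqdm=True)):
    # On validation failure, return None for the image slot plus a JSON error payload.
    if not prompt:
        return None, json.dumps({"status": "false", "message": "invalid prompt"})
    # Stand-in for the real generation step (run_flux in the diff above).
    image = Image.new("RGB", (256, 256), "gray")
    return image, json.dumps({"status": "success", "message": "ok"})


with gr.Blocks() as demo:
    prompt = gr.Text(label="Prompt")
    button = gr.Button("Run")
    result_image = gr.Image(label="Result", show_label=False)
    result_json = gr.Code(label="JSON Result", language="json")
    # A single handler feeds both output components, mirroring the commit's wiring.
    button.click(fn=run, inputs=[prompt], outputs=[result_image, result_json])

if __name__ == "__main__":
    demo.launch()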
checkpoints/sam2_hiera_base_plus.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d0bb7f236400a49669ffdd1be617959a8b1d1065081789d7bbff88eded3a8071
-size 323493298
checkpoints/sam2_hiera_large.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b
-size 897952466
checkpoints/sam2_hiera_small.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:95949964d4e548409021d47b22712d5f1abf2564cc0c3c765ba599a24ac7dce3
-size 184309650
checkpoints/sam2_hiera_tiny.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:65b50056e05bcb13694174f51bb6da89c894b57b75ccdf0ba6352c597c5d1125
-size 155906050
configs/__init__.py DELETED
File without changes
configs/sam2_hiera_b+.yaml DELETED
@@ -1,113 +0,0 @@
-# @package _global_
-
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 112
-      num_heads: 2
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [896, 448, 224, 112]
-      fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-
-  memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
-    out_dim: 64
-    position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-      num_pos_feats: 64
-      normalize: true
-      scale: null
-      temperature: 10000
-    mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
-      kernel_size: 3
-      stride: 2
-      padding: 1
-    fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
-      layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
-        dim: 256
-        kernel_size: 7
-        padding: 3
-        layer_scale_init_value: 1e-6
-        use_dwconv: True # depth-wise convs
-      num_layers: 2
-
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False
configs/sam2_hiera_l.yaml DELETED
@@ -1,117 +0,0 @@
-# @package _global_
-
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 144
-      num_heads: 2
-      stages: [2, 6, 36, 4]
-      global_att_blocks: [23, 33, 43]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-      window_spec: [8, 4, 16, 8]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [1152, 576, 288, 144]
-      fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-
-  memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
-    out_dim: 64
-    position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-      num_pos_feats: 64
-      normalize: true
-      scale: null
-      temperature: 10000
-    mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
-      kernel_size: 3
-      stride: 2
-      padding: 1
-    fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
-      layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
-        dim: 256
-        kernel_size: 7
-        padding: 3
-        layer_scale_init_value: 1e-6
-        use_dwconv: True # depth-wise convs
-      num_layers: 2
-
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False
configs/sam2_hiera_s.yaml DELETED
@@ -1,116 +0,0 @@
-# @package _global_
-
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 96
-      num_heads: 1
-      stages: [1, 2, 11, 2]
-      global_att_blocks: [7, 10, 13]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [768, 384, 192, 96]
-      fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-
-  memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
-    out_dim: 64
-    position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-      num_pos_feats: 64
-      normalize: true
-      scale: null
-      temperature: 10000
-    mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
-      kernel_size: 3
-      stride: 2
-      padding: 1
-    fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
-      layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
-        dim: 256
-        kernel_size: 7
-        padding: 3
-        layer_scale_init_value: 1e-6
-        use_dwconv: True # depth-wise convs
-      num_layers: 2
-
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False
configs/sam2_hiera_t.yaml DELETED
@@ -1,118 +0,0 @@
-# @package _global_
-
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 96
-      num_heads: 1
-      stages: [1, 2, 7, 2]
-      global_att_blocks: [5, 7, 9]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [768, 384, 192, 96]
-      fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-
-  memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
-    out_dim: 64
-    position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-      num_pos_feats: 64
-      normalize: true
-      scale: null
-      temperature: 10000
-    mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
-      kernel_size: 3
-      stride: 2
-      padding: 1
-    fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
-      layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
-        dim: 256
-        kernel_size: 7
-        padding: 3
-        layer_scale_init_value: 1e-6
-        use_dwconv: True # depth-wise convs
-      num_layers: 2
-
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  # SAM decoder
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  # HieraT does not currently support compilation, should always be set to False
-  compile_image_encoder: False
utils/__init__.py DELETED
File without changes
utils/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (125 Bytes)
 
utils/__pycache__/florence.cpython-310.pyc DELETED
Binary file (2.31 kB)
 
utils/__pycache__/sam.cpython-310.pyc DELETED
Binary file (1.57 kB)
 
utils/florence.py DELETED
@@ -1,54 +0,0 @@
-import os
-from typing import Union, Any, Tuple, Dict
-from unittest.mock import patch
-
-import torch
-from PIL import Image
-from transformers import AutoModelForCausalLM, AutoProcessor
-from transformers.dynamic_module_utils import get_imports
-
-# FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
-FLORENCE_CHECKPOINT = "microsoft/Florence-2-large-ft"
-FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
-FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
-FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
-FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
-FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
-
-
-def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
-    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
-    if not str(filename).endswith("/modeling_florence2.py"):
-        return get_imports(filename)
-    imports = get_imports(filename)
-    imports.remove("flash_attn")
-    return imports
-
-
-def load_florence_model(
-    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
-) -> Tuple[Any, Any]:
-    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
-        model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device).eval()
-        processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
-        return model, processor
-
-
-def run_florence_inference(
-    model: Any,
-    processor: Any,
-    device: torch.device,
-    image: Image,
-    task: str,
-    text: str = None
-) -> Tuple[str, Dict]:
-    if text:
-        prompt = task + text
-    else:
-        prompt = task
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
-    generated_ids = model.generate(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3)
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-    response = processor.post_process_generation(generated_text, task=task, image_size=image.size)
-    print("run_florence_inference", "finish", generated_text, response)
-    return generated_text, response
utils/sam.py DELETED
@@ -1,50 +0,0 @@
-from typing import Any
-
-import numpy as np
-import supervision as sv
-import torch
-from PIL import Image
-from sam2.build_sam import build_sam2, build_sam2_video_predictor
-from sam2.sam2_image_predictor import SAM2ImagePredictor
-
-# SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
-# SAM_CONFIG = "sam2_hiera_s.yaml"
-SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
-SAM_CONFIG = "sam2_hiera_l.yaml"
-
-
-def load_sam_image_model(
-    device: torch.device,
-    config: str = SAM_CONFIG,
-    checkpoint: str = SAM_CHECKPOINT
-) -> SAM2ImagePredictor:
-    model = build_sam2(config, checkpoint, device=device)
-    return SAM2ImagePredictor(sam_model=model)
-
-
-def load_sam_video_model(
-    device: torch.device,
-    config: str = SAM_CONFIG,
-    checkpoint: str = SAM_CHECKPOINT
-) -> Any:
-    return build_sam2_video_predictor(config, checkpoint, device=device)
-
-
-def run_sam_inference(
-    model: Any,
-    image: Image,
-    detections: sv.Detections
-) -> sv.Detections:
-    image = np.array(image.convert("RGB"))
-    model.set_image(image)
-    # from left to right
-    bboxes = detections.xyxy
-    bboxes = sorted(bboxes, key=lambda bbox: bbox[0])
-    mask, score, _ = model.predict(box=bboxes, multimask_output=False)
-
-    # dirty fix; remove this later
-    if len(mask.shape) == 4:
-        mask = np.squeeze(mask)
-
-    detections.mask = mask.astype(bool)
-    return detections