jadechoghari committed
Commit e352874 · verified · 1 parent: 32fc8c9

Create convert.py

Files changed (1): convert.py (+695, -0)
convert.py ADDED
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert RT-DETRv2 checkpoints from the original repository to the Hugging Face format."""

import argparse
import json
from pathlib import Path

import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision import transforms

from transformers import RTDetrImageProcessor
from modular_rtdetrv2 import RTDetrV2Config, RTDetrV2ForObjectDetection
from transformers.utils import logging


logging.set_verbosity_info()
logger = logging.get_logger(__name__)


def get_rt_detr_v2_config(model_name: str) -> RTDetrV2Config:
    config = RTDetrV2Config()

    config.num_labels = 80
    repo_id = "huggingface/label-files"
    filename = "coco-detection-mmdet-id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    if model_name == "rtdetr_v2_r18vd":
        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
        config.backbone_config.depths = [2, 2, 2, 2]
        config.backbone_config.layer_type = "basic"
        config.encoder_in_channels = [128, 256, 512]
        config.hidden_expansion = 0.5
        config.decoder_layers = 3
    elif model_name == "rtdetr_v2_r34vd":
        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
        config.backbone_config.depths = [3, 4, 6, 3]
        config.backbone_config.layer_type = "basic"
        config.encoder_in_channels = [128, 256, 512]
        config.hidden_expansion = 0.5
        config.decoder_layers = 4
    elif model_name == "rtdetr_v2_r50vd_m":
        config.hidden_expansion = 0.5
    elif model_name == "rtdetr_v2_r50vd":
        pass
    elif model_name == "rtdetr_v2_r101vd":
        config.backbone_config.depths = [3, 4, 23, 3]
        config.encoder_ffn_dim = 2048
        config.encoder_hidden_dim = 384
        config.decoder_in_channels = [384, 384, 384]

    return config


def create_rename_keys(config):
    # here we list all keys to be renamed (original name on the left, our name on the right)
    rename_keys = []

    # stem
    # fmt: off
    last_key = ["weight", "bias", "running_mean", "running_var"]

    for level in range(3):
        rename_keys.append((f"backbone.conv1.conv1_{level+1}.conv.weight", f"model.backbone.model.embedder.embedder.{level}.convolution.weight"))
        for last in last_key:
            rename_keys.append((f"backbone.conv1.conv1_{level+1}.norm.{last}", f"model.backbone.model.embedder.embedder.{level}.normalization.{last}"))

    for stage_idx in range(len(config.backbone_config.depths)):
        for layer_idx in range(config.backbone_config.depths[stage_idx]):
            # shortcut
            if layer_idx == 0:
                if stage_idx == 0:
                    rename_keys.append(
                        (
                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.weight",
                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.convolution.weight",
                        )
                    )
                    for last in last_key:
                        rename_keys.append(
                            (
                                f"backbone.res_layers.{stage_idx}.blocks.0.short.norm.{last}",
                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.normalization.{last}",
                            )
                        )
                else:
                    rename_keys.append(
                        (
                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.conv.weight",
                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.convolution.weight",
                        )
                    )
                    for last in last_key:
                        rename_keys.append(
                            (
                                f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.norm.{last}",
                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.normalization.{last}",
                            )
                        )

            rename_keys.append(
                (
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.conv.weight",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.convolution.weight",
                )
            )
            for last in last_key:
                rename_keys.append((
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.norm.{last}",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.normalization.{last}",
                ))

            rename_keys.append(
                (
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.conv.weight",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.convolution.weight",
                )
            )
            for last in last_key:
                rename_keys.append((
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.norm.{last}",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.normalization.{last}",
                ))

            # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/nn/backbone/presnet.py#L171
            if config.backbone_config.layer_type != "basic":
                rename_keys.append(
                    (
                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.conv.weight",
                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.convolution.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append((
                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.norm.{last}",
                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.normalization.{last}",
                    ))
    # fmt: on

    for i in range(config.encoder_layers):
        # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear1.weight",
                f"model.encoder.encoder.{i}.layers.0.fc1.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear1.bias",
                f"model.encoder.encoder.{i}.layers.0.fc1.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear2.weight",
                f"model.encoder.encoder.{i}.layers.0.fc2.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear2.bias",
                f"model.encoder.encoder.{i}.layers.0.fc2.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm1.weight",
                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm1.bias",
                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm2.weight",
                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm2.bias",
                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.bias",
            )
        )

    for j in range(0, 3):
        rename_keys.append((f"encoder.input_proj.{j}.conv.weight", f"model.encoder_input_proj.{j}.0.weight"))
        for last in last_key:
            rename_keys.append((f"encoder.input_proj.{j}.norm.{last}", f"model.encoder_input_proj.{j}.1.{last}"))

    block_levels = 4

    for i in range(len(config.encoder_in_channels) - 1):
        # encoder layers: hybridencoder parts
        for j in range(1, block_levels):
            rename_keys.append(
                (f"encoder.fpn_blocks.{i}.conv{j}.conv.weight", f"model.encoder.fpn_blocks.{i}.conv{j}.conv.weight")
            )
            for last in last_key:
                rename_keys.append(
                    (
                        f"encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
                        f"model.encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
                    )
                )

        rename_keys.append((f"encoder.lateral_convs.{i}.conv.weight", f"model.encoder.lateral_convs.{i}.conv.weight"))
        for last in last_key:
            rename_keys.append(
                (f"encoder.lateral_convs.{i}.norm.{last}", f"model.encoder.lateral_convs.{i}.norm.{last}")
            )

        for j in range(3):
            for k in range(1, 3):
                rename_keys.append(
                    (
                        f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                        f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append(
                        (
                            f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                            f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                        )
                    )

        for j in range(1, block_levels):
            rename_keys.append(
                (f"encoder.pan_blocks.{i}.conv{j}.conv.weight", f"model.encoder.pan_blocks.{i}.conv{j}.conv.weight")
            )
            for last in last_key:
                rename_keys.append(
                    (
                        f"encoder.pan_blocks.{i}.conv{j}.norm.{last}",
                        f"model.encoder.pan_blocks.{i}.conv{j}.norm.{last}",
                    )
                )

        for j in range(3):
            for k in range(1, 3):
                rename_keys.append(
                    (
                        f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                        f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append(
                        (
                            f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                            f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                        )
                    )

        rename_keys.append(
            (f"encoder.downsample_convs.{i}.conv.weight", f"model.encoder.downsample_convs.{i}.conv.weight")
        )
        for last in last_key:
            rename_keys.append(
                (f"encoder.downsample_convs.{i}.norm.{last}", f"model.encoder.downsample_convs.{i}.norm.{last}")
            )

    for i in range(config.decoder_layers):
        # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.self_attn.out_proj.weight",
                f"model.decoder.layers.{i}.self_attn.out_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.self_attn.out_proj.bias",
                f"model.decoder.layers.{i}.self_attn.out_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.weight",
                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.bias",
                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.weight",
                f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.bias",
                f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.value_proj.weight",
                f"model.decoder.layers.{i}.encoder_attn.value_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.value_proj.bias",
                f"model.decoder.layers.{i}.encoder_attn.value_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.output_proj.weight",
                f"model.decoder.layers.{i}.encoder_attn.output_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.output_proj.bias",
                f"model.decoder.layers.{i}.encoder_attn.output_proj.bias",
            )
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.num_points_scale",
                f"model.decoder.layers.{i}.encoder_attn.n_points_scale",
            )
        )
        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")
        )

    for i in range(config.decoder_layers):
        # decoder + class and bounding box heads
        rename_keys.append(
            (
                f"decoder.dec_score_head.{i}.weight",
                f"model.decoder.class_embed.{i}.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_score_head.{i}.bias",
                f"model.decoder.class_embed.{i}.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.0.weight",
                f"model.decoder.bbox_embed.{i}.layers.0.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.0.bias",
                f"model.decoder.bbox_embed.{i}.layers.0.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.1.weight",
                f"model.decoder.bbox_embed.{i}.layers.1.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.1.bias",
                f"model.decoder.bbox_embed.{i}.layers.1.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.2.weight",
                f"model.decoder.bbox_embed.{i}.layers.2.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.2.bias",
                f"model.decoder.bbox_embed.{i}.layers.2.bias",
            )
        )

    # decoder projection
    for i in range(len(config.decoder_in_channels)):
        rename_keys.append(
            (
                f"decoder.input_proj.{i}.conv.weight",
                f"model.decoder_input_proj.{i}.0.weight",
            )
        )
        for last in last_key:
            rename_keys.append(
                (
                    f"decoder.input_proj.{i}.norm.{last}",
                    f"model.decoder_input_proj.{i}.1.{last}",
                )
            )

    # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
    rename_keys.extend(
        [
            ("decoder.denoising_class_embed.weight", "model.denoising_class_embed.weight"),
            ("decoder.query_pos_head.layers.0.weight", "model.decoder.query_pos_head.layers.0.weight"),
            ("decoder.query_pos_head.layers.0.bias", "model.decoder.query_pos_head.layers.0.bias"),
            ("decoder.query_pos_head.layers.1.weight", "model.decoder.query_pos_head.layers.1.weight"),
            ("decoder.query_pos_head.layers.1.bias", "model.decoder.query_pos_head.layers.1.bias"),
            ("decoder.enc_output.proj.weight", "model.enc_output.0.weight"),
            ("decoder.enc_output.proj.bias", "model.enc_output.0.bias"),
            ("decoder.enc_output.norm.weight", "model.enc_output.1.weight"),
            ("decoder.enc_output.norm.bias", "model.enc_output.1.bias"),
            ("decoder.enc_score_head.weight", "model.enc_score_head.weight"),
            ("decoder.enc_score_head.bias", "model.enc_score_head.bias"),
            ("decoder.enc_bbox_head.layers.0.weight", "model.enc_bbox_head.layers.0.weight"),
            ("decoder.enc_bbox_head.layers.0.bias", "model.enc_bbox_head.layers.0.bias"),
            ("decoder.enc_bbox_head.layers.1.weight", "model.enc_bbox_head.layers.1.weight"),
            ("decoder.enc_bbox_head.layers.1.bias", "model.enc_bbox_head.layers.1.bias"),
            ("decoder.enc_bbox_head.layers.2.weight", "model.enc_bbox_head.layers.2.weight"),
            ("decoder.enc_bbox_head.layers.2.bias", "model.enc_bbox_head.layers.2.bias"),
        ]
    )

    return rename_keys


def rename_key(state_dict, old, new):
    try:
        val = state_dict.pop(old)
        state_dict[new] = val
    except Exception:
        # some source keys are absent from certain checkpoints; silently skip those
        pass


def read_in_q_k_v(state_dict, config):
    prefix = ""
    encoder_hidden_dim = config.encoder_hidden_dim

    # first: transformer encoder
    for i in range(config.encoder_layers):
        # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias)
        in_proj_weight = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_bias")
        # next, add query, keys and values (in that order) to the state dict
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.weight"] = in_proj_weight[
            :encoder_hidden_dim, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.bias"] = in_proj_bias[:encoder_hidden_dim]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.weight"] = in_proj_weight[
            encoder_hidden_dim : 2 * encoder_hidden_dim, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.bias"] = in_proj_bias[
            encoder_hidden_dim : 2 * encoder_hidden_dim
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.weight"] = in_proj_weight[
            -encoder_hidden_dim:, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.bias"] = in_proj_bias[-encoder_hidden_dim:]
    # next: transformer decoder (which is a bit more complex because it also includes cross-attention)
    for i in range(config.decoder_layers):
        # read in weights + bias of input projection layer of self-attention
        in_proj_weight = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_bias")
        # next, add query, keys and values (in that order) to the state dict
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]


# We will verify our results on an image of cute cats
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    im = Image.open(requests.get(url, stream=True).raw)

    return im


@torch.no_grad()
def convert_rt_detr_v2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, repo_id):
    """
    Copy/paste/tweak model's weights to our RT-DETRv2 structure.
    """

    # load default config
    config = get_rt_detr_v2_config(model_name)

    # load original model from torch hub
    model_name_to_checkpoint_url = {
        "rtdetr_v2_r18vd": "https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth",
        "rtdetr_v2_r34vd": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth",
        "rtdetr_v2_r50vd_m": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth",
        "rtdetr_v2_r50vd": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth",
        "rtdetr_v2_r101vd": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_6x_coco_from_paddle.pth",
    }
    logger.info(f"Converting model {model_name}...")
    state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[
        "ema"
    ]["module"]

    # rename keys
    for src, dest in create_rename_keys(config):
        rename_key(state_dict, src, dest)
    # query, key and value matrices need special treatment
    read_in_q_k_v(state_dict, config)
    # drop BatchNorm book-keeping buffers and duplicate the detection head weights at the top level for two-stage
    for key in state_dict.copy().keys():
        if key.endswith("num_batches_tracked"):
            del state_dict[key]
        # for two_stage
        if "bbox_embed" in key or ("class_embed" in key and "denoising_" not in key):
            state_dict[key.split("model.decoder.")[-1]] = state_dict[key]

    # the anchors and valid_mask buffers are static, so they are not required
    del state_dict["decoder.anchors"]
    del state_dict["decoder.valid_mask"]

    print("Renaming is done")

    # finally, create HuggingFace model and load state dict
    model = RTDetrV2ForObjectDetection(config)
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    # load image processor
    image_processor = RTDetrImageProcessor()

    # prepare image
    img = prepare_img()

    # preprocess image
    transformations = transforms.Compose(
        [
            transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
        ]
    )
    original_pixel_values = transformations(img).unsqueeze(0)  # insert batch dimension

    encoding = image_processor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    assert torch.allclose(original_pixel_values, pixel_values)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    pixel_values = pixel_values.to(device)

    # pass the image through the model
    outputs = model(pixel_values)

    if model_name == "rtdetr_v2_r18vd":
        expected_slice_logits = torch.tensor(
            [[-3.7045, -5.1913, -6.1787], [-4.0106, -9.3450, -5.2043], [-4.1287, -4.7463, -5.8634]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.2582, 0.5497, 0.4764], [0.1684, 0.1985, 0.2120], [0.7665, 0.4146, 0.4669]]
        )
    elif model_name == "rtdetr_v2_r34vd":
        expected_slice_logits = torch.tensor(
            [[-4.6108, -5.9453, -3.8505], [-3.8702, -6.1136, -5.5677], [-3.7790, -6.4538, -5.9449]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.1691, 0.1984, 0.2118], [0.2594, 0.5506, 0.4736], [0.7669, 0.4136, 0.4654]]
        )
    elif model_name == "rtdetr_v2_r50vd_m":
        expected_slice_logits = torch.tensor(
            [[-2.7453, -5.4595, -7.3702], [-3.1858, -5.3803, -7.9838], [-5.0293, -7.0083, -4.2888]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.7711, 0.4135, 0.4577], [0.2570, 0.5480, 0.4755], [0.1694, 0.1992, 0.2127]]
        )
    elif model_name == "rtdetr_v2_r50vd":
        expected_slice_logits = torch.tensor(
            [[-4.7881, -4.6754, -6.1624], [-5.4441, -6.6486, -4.3840], [-3.5455, -4.9318, -6.3544]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.2588, 0.5487, 0.4747], [0.5497, 0.2760, 0.0573], [0.7688, 0.4133, 0.4634]]
        )
    elif model_name == "rtdetr_v2_r101vd":
        expected_slice_logits = torch.tensor(
            [[-4.6162, -4.9189, -4.6656], [-4.4701, -4.4997, -4.9659], [-5.6641, -7.9000, -5.0725]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.7707, 0.4124, 0.4585], [0.2589, 0.5492, 0.4735], [0.1688, 0.1993, 0.2108]]
        )
    else:
        raise ValueError(f"Unknown rt_detr_v2_name: {model_name}")

    assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-3)
    assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-3)

    if pytorch_dump_folder_path is not None:
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # Upload model, image processor and config to the hub
        logger.info("Uploading PyTorch model and image processor to the hub...")
        config.push_to_hub(
            repo_id=repo_id,
            commit_message="Add config from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py",
        )
        model.push_to_hub(
            repo_id=repo_id,
            commit_message="Add model from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py",
        )
        image_processor.push_to_hub(
            repo_id=repo_id,
            commit_message="Add image processor from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py",
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="rtdetr_v2_r50vd",
        type=str,
        help="model_name of the checkpoint you'd like to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.")
    parser.add_argument(
        "--repo_id",
        type=str,
        help="repo_id where the model will be pushed to.",
    )
    args = parser.parse_args()
    convert_rt_detr_v2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.repo_id)
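
For reference, the conversion entry point above can also be driven programmatically rather than through the CLI. The snippet below is a minimal sketch, assuming the file is importable as `convert` and that the output directory name is purely illustrative; pushing to the Hub is disabled so no repo_id is needed.

# Hypothetical usage sketch: convert the default r50vd checkpoint and save it locally.
from convert import convert_rt_detr_v2_checkpoint

convert_rt_detr_v2_checkpoint(
    model_name="rtdetr_v2_r50vd",
    pytorch_dump_folder_path="rtdetr_v2_r50vd_converted",  # illustrative output folder
    push_to_hub=False,
    repo_id=None,
)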