import torch
import torch.nn as nn
from transformers import CLIPTokenizer, CLIPImageProcessor
from .arc2face_models import CLIPTextModelWrapper
from ConsistentID.lib.pipeline_ConsistentID import ConsistentIDPipeline
from .util import perturb_tensor, pad_image_obj_to_square, \
calc_stats, patch_clip_image_encoder_with_mask, CLIPVisionModelWithMask
from adaface.subj_basis_generator import SubjBasisGenerator
import torch.nn.functional as F
import numpy as np
import cv2
from PIL import Image
from insightface.app import FaceAnalysis
import os
from omegaconf.listconfig import ListConfig
# adaface_encoder_types can be a list of one or more encoder types.
# adaface_ckpt_paths can be one or a list of ckpt paths.
# adaface_encoder_cfg_scales is None, or a list of scales for the adaface encoder types.
def create_id2ada_prompt_encoder(adaface_encoder_types, adaface_ckpt_paths=None,
adaface_encoder_cfg_scales=None, enabled_encoders=None,
*args, **kwargs):
if len(adaface_encoder_types) == 1:
adaface_encoder_type = adaface_encoder_types[0]
adaface_ckpt_path = adaface_ckpt_paths[0] if adaface_ckpt_paths is not None else None
if adaface_encoder_type == 'arc2face':
id2ada_prompt_encoder = \
Arc2Face_ID2AdaPrompt(adaface_ckpt_path=adaface_ckpt_path,
*args, **kwargs)
elif adaface_encoder_type == 'consistentID':
id2ada_prompt_encoder = \
ConsistentID_ID2AdaPrompt(pipe=None,
adaface_ckpt_path=adaface_ckpt_path,
*args, **kwargs)
else:
id2ada_prompt_encoder = Joint_FaceID2AdaPrompt(adaface_encoder_types, adaface_ckpt_paths,
adaface_encoder_cfg_scales, enabled_encoders,
*args, **kwargs)
return id2ada_prompt_encoder
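# Example usage (a minimal, hedged sketch; the checkpoint and image paths below are
# hypothetical placeholders, not files shipped with this repo). Kept as a comment so
# that importing this module has no side effects:
#   encoder = create_id2ada_prompt_encoder(['arc2face'],
#                                          adaface_ckpt_paths=['models/adaface/example.ckpt'])
#   adaface_subj_embs = encoder.generate_adaface_embeddings(image_paths=['examples/face1.jpg'])
#   # adaface_subj_embs: [num_id_vecs, 768], i.e. [16, 768] for arc2face.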
class FaceID2AdaPrompt(nn.Module):
# To be initialized in derived classes.
def __init__(self, *args, **kwargs):
super().__init__()
# Initialize model components.
# These components of ConsistentID_ID2AdaPrompt will be shared with the teacher model.
# So we don't initialize them in the ctor(), but borrow them from the teacher model.
# These components of Arc2Face_ID2AdaPrompt will be initialized in its ctor().
self.clip_image_encoder = None
self.clip_preprocessor = None
self.face_app = None
self.text_to_image_prompt_encoder = None
self.tokenizer = None
self.dtype = kwargs.get('dtype', torch.float16)
# Load Img2Ada SubjectBasisGenerator.
self.subject_string = kwargs.get('subject_string', 'z')
self.adaface_ckpt_path = kwargs.get('adaface_ckpt_path', None)
self.subj_basis_generator = None
        # -1: use the default scale for the adaface encoder type,
        # i.e., 1 for arc2face and 6 for consistentID.
self.out_id_embs_cfg_scale = kwargs.get('out_id_embs_cfg_scale', -1)
self.is_training = kwargs.get('is_training', False)
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
self.extend_prompt2token_proj_attention_multiplier = kwargs.get('extend_prompt2token_proj_attention_multiplier', 1)
self.prompt2token_proj_ext_attention_perturb_ratio = kwargs.get('prompt2token_proj_ext_attention_perturb_ratio', 0.1)
# Set model behavior configurations.
self.gen_neg_img_prompt = False
self.clip_neg_features = None
self.use_clip_embs = False
self.do_contrast_clip_embs_on_bg_features = False
# num_id_vecs is the output embeddings of the ID2ImgPrompt module.
# If there's no static image suffix embeddings, then num_id_vecs is also
# the number of ada embeddings returned by the subject basis generator.
# num_id_vecs will be set in each derived class.
self.num_static_img_suffix_embs = kwargs.get('num_static_img_suffix_embs', 0)
print(f'{self.name} Adaface uses {self.num_id_vecs} ID image embeddings and {self.num_static_img_suffix_embs} fixed image embeddings as input.')
self.id_img_prompt_max_length = 77
self.face_id_dim = 512
# clip_embedding_dim: by default it's the OpenAI CLIP embedding dim.
# Could be overridden by derived classes.
self.clip_embedding_dim = 1024
self.output_dim = 768
def get_id2img_learnable_modules(self):
raise NotImplementedError
def load_id2img_learnable_modules(self, id2img_learnable_modules_state_dict_list):
id2img_prompt_encoder_learnable_modules = self.get_id2img_learnable_modules()
for module, state_dict in zip(id2img_prompt_encoder_learnable_modules, id2img_learnable_modules_state_dict_list):
module.load_state_dict(state_dict)
print(f'{len(id2img_prompt_encoder_learnable_modules)} ID2ImgPrompt encoder modules loaded.')
# init_subj_basis_generator() can only be called after the derived class is initialized,
# when self.num_id_vecs, self.num_static_img_suffix_embs and self.clip_embedding_dim have been set.
def init_subj_basis_generator(self):
self.subj_basis_generator = \
SubjBasisGenerator(num_id_vecs = self.num_id_vecs,
num_static_img_suffix_embs = self.num_static_img_suffix_embs,
bg_image_embedding_dim = self.clip_embedding_dim,
output_dim = self.output_dim,
placeholder_is_bg = False,
prompt2token_proj_grad_scale = 1,
bg_prompt_translator_has_to_out_proj=False)
def load_adaface_ckpt(self, adaface_ckpt_path):
ckpt = torch.load(adaface_ckpt_path, map_location='cpu')
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
if self.subject_string not in string_to_subj_basis_generator_dict:
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
breakpoint()
ckpt_subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
ckpt_subj_basis_generator.N_ID = self.num_id_vecs
# Since we directly use the subject basis generator object from the ckpt,
# fixing the number of static image suffix embeddings is much simpler.
# Otherwise if we want to load the subject basis generator from its state_dict,
# things are more complicated, see embedding manager's load().
ckpt_subj_basis_generator.N_SFX = self.num_static_img_suffix_embs
# obj_proj_in and pos_embs are for non-faces. So they are useless for human faces.
ckpt_subj_basis_generator.obj_proj_in = None
ckpt_subj_basis_generator.pos_embs = None
# Handle differences in num_static_img_suffix_embs between the current model and the ckpt.
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.num_static_img_suffix_embs, img_prompt_dim=self.output_dim)
# Fix missing variables in old ckpt.
ckpt_subj_basis_generator.patch_old_subj_basis_generator_ckpt()
self.subj_basis_generator.extend_prompt2token_proj_attention(\
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
ret = self.subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict(), strict=False)
print(f"{adaface_ckpt_path}: subject basis generator loaded for '{self.name}'.")
print(repr(ckpt_subj_basis_generator))
if ret is not None and len(ret.missing_keys) > 0:
print(f"Missing keys: {ret.missing_keys}")
if ret is not None and len(ret.unexpected_keys) > 0:
print(f"Unexpected keys: {ret.unexpected_keys}")
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
# If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
# extend subj_basis_generator again.
if self.extend_prompt2token_proj_attention_multiplier > 1:
            # During this extension, the added noise only affects the extra copies of the attention weights, since they are not in the ckpt.
# During training, prompt2token_proj_ext_attention_perturb_ratio == 0.1.
# During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
self.subj_basis_generator.extend_prompt2token_proj_attention(\
None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)
self.subj_basis_generator.freeze_prompt2token_proj()
@torch.no_grad()
def get_clip_neg_features(self, BS):
if self.clip_neg_features is None:
# neg_pixel_values: [1, 3, 224, 224]. clip_neg_features is invariant to the actual image.
neg_pixel_values = torch.zeros([1, 3, 224, 224], device=self.clip_image_encoder.device, dtype=self.dtype)
# Precompute CLIP negative features for the negative image prompt.
self.clip_neg_features = self.clip_image_encoder(neg_pixel_values, attn_mask=None, output_hidden_states=True).hidden_states[-2]
clip_neg_features = self.clip_neg_features.repeat(BS, 1, 1)
return clip_neg_features
# image_objs: a list of np array / tensor / Image objects of different sizes [Hi, Wi].
# If image_objs is a list of tensors, then each tensor should be [3, Hi, Wi].
# If image_objs is None, then image_paths should be provided,
# and image_objs will be loaded from image_paths.
# fg_masks: None, or a list of [Hi, Wi].
def extract_init_id_embeds_from_images(self, image_objs, image_paths, fg_masks=None,
size=(512, 512), calc_avg=False,
skip_non_faces=True, return_clip_embs=None,
do_contrast_clip_embs_on_bg_features=None,
verbose=False):
# If return_clip_embs or do_contrast_clip_embs_on_bg_features is not provided,
# then use their default values.
if return_clip_embs is None:
return_clip_embs = self.use_clip_embs
if do_contrast_clip_embs_on_bg_features is None:
do_contrast_clip_embs_on_bg_features = self.do_contrast_clip_embs_on_bg_features
# clip_image_encoder should be already put on GPU.
# So its .device is the device of its parameters.
device = self.clip_image_encoder.device
image_pixel_values = []
all_id_embs = []
faceless_img_count = 0
if image_objs is None and image_paths is not None:
image_objs = []
for image_path in image_paths:
image_obj = Image.open(image_path)
image_objs.append(image_obj)
print(f'Loaded {len(image_objs)} images from {image_paths[0]}...')
# image_objs could be a batch of images that have been collated into a tensor or np array.
# image_objs can also be a list of images.
# The code below that processes them one by one can be applied in both cases.
# If image_objs are a collated batch, processing them one by one will not add much overhead.
for idx, image_obj in enumerate(image_objs):
if return_clip_embs:
# input to clip_preprocessor: an image or a batch of images, each being PIL.Image.Image, numpy.ndarray,
# torch.Tensor, tf.Tensor or jax.ndarray.
# Different sizes of images are standardized to the same size 224*224.
clip_image_pixel_values = self.clip_preprocessor(images=image_obj, return_tensors="pt").pixel_values
image_pixel_values.append(clip_image_pixel_values)
# Convert tensor to numpy array.
if isinstance(image_obj, torch.Tensor):
image_obj = image_obj.cpu().numpy().transpose(1, 2, 0)
if isinstance(image_obj, np.ndarray):
image_obj = Image.fromarray(image_obj)
# Resize image_obj to (512, 512). The scheme is Image.NEAREST, to be consistent with
# PersonalizedBase dataset class.
image_obj, _, _ = pad_image_obj_to_square(image_obj)
image_np = np.array(image_obj.resize(size, Image.NEAREST))
face_info = self.face_app.get(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
if len(face_info) > 0:
                face_info = sorted(face_info, key=lambda x: (x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1]  # Only use the largest face (by bbox area).
# id_emb: [512,]
id_emb = torch.from_numpy(face_info.normed_embedding)
else:
faceless_img_count += 1
print(f'No face detected in {image_paths[idx]}.', end=' ')
if not skip_non_faces:
print('Replace with random face embedding.')
# During training, use a random tensor as the face embedding.
id_emb = torch.randn(512)
else:
print(f'Skip.')
continue
all_id_embs.append(id_emb)
if verbose:
print(f'{len(all_id_embs)} face images identified, {faceless_img_count} faceless images.')
# No face is detected in the input images.
if len(all_id_embs) == 0:
return faceless_img_count, None, None
# all_id_embs: [BS, 512].
all_id_embs = torch.stack(all_id_embs, dim=0).to(device=device, dtype=torch.float16)
if return_clip_embs:
# image_pixel_values: [BS, 3, 224, 224]
image_pixel_values = torch.cat(image_pixel_values, dim=0)
image_pixel_values = image_pixel_values.to(device=device, dtype=torch.float16)
if fg_masks is not None:
assert len(fg_masks) == len(image_objs)
# fg_masks is a list of masks.
if isinstance(fg_masks, (list, tuple)):
fg_masks2 = []
for fg_mask in fg_masks:
# fg_mask: [Hi, Wi]
# BUG: clip_preprocessor will do central crop on images. But fg_mask is not central cropped.
# If the ref image is not square, then the fg_mask will not match the image.
# TODO: crop fg_mask and images to square before calling extract_init_id_embeds_from_images().
# fg_mask2: [Hi, Wi] -> [1, 1, 224, 224]
fg_mask2 = torch.tensor(fg_mask, device=device, dtype=torch.float16).unsqueeze(0).unsqueeze(0)
fg_mask2 = F.interpolate(fg_mask2, size=image_pixel_values.shape[-2:], mode='bilinear', align_corners=False)
fg_masks2.append(fg_mask2)
# fg_masks2: [BS, 224, 224]
fg_masks2 = torch.cat(fg_masks2, dim=0).squeeze(1)
else:
# fg_masks is a collated batch of masks.
# The actual size doesn't matter,
# as fg_mask2 will be resized to the same size as image features
# (much smaller than image_pixel_values).
fg_masks2 = fg_masks.to(device=device, dtype=torch.float16).unsqueeze(1)
# F.interpolate() always return a copy, even if scale_factor=1. So we don't need to clone fg_masks2.
fg_masks2 = F.interpolate(fg_masks2, size=image_pixel_values.shape[-2:], mode='bilinear', align_corners=False)
fg_masks2 = fg_masks2.squeeze(1)
else:
# fg_mask2: [BS, 224, 224].
fg_masks2 = torch.ones_like(image_pixel_values[:, 0, :, :], device=device, dtype=torch.float16)
clip_neg_features = self.get_clip_neg_features(BS=image_pixel_values.shape[0])
with torch.no_grad():
# image_fg_features: [BS, 257, 1280]. 257: 16*16 (patch_embeds) + 1 (class_embeds).
image_fg_dict = self.clip_image_encoder(image_pixel_values, attn_mask=fg_masks2, output_hidden_states=True)
# attn_mask: [BS, 1, 257]
image_fg_features = image_fg_dict.hidden_states[-2]
if image_fg_dict.attn_mask is not None:
image_fg_features = image_fg_features * image_fg_dict.attn_mask
# A negative mask is used to extract the background features.
# If fg_masks is None, then fg_masks2 is all ones, and bg masks is all zeros.
# Therefore, all pixels are masked. The extracted image_bg_features will be
# meaningless in this case.
image_bg_dict = self.clip_image_encoder(image_pixel_values, attn_mask=1-fg_masks2, output_hidden_states=True)
image_bg_features = image_bg_dict.hidden_states[-2]
# Subtract the feature bias (null features) from the bg features, to highlight the useful bg features.
if do_contrast_clip_embs_on_bg_features:
image_bg_features = image_bg_features - clip_neg_features
if image_bg_dict.attn_mask is not None:
image_bg_features = image_bg_features * image_bg_dict.attn_mask
# clip_fgbg_features: [BS, 514, 1280]. 514 = 257*2.
# all_id_embs: [BS, 512].
clip_fgbg_features = torch.cat([image_fg_features, image_bg_features], dim=1)
else:
clip_fgbg_features = None
clip_neg_features = None
if calc_avg:
if return_clip_embs:
# clip_fgbg_features: [BS, 514, 1280] -> [1, 514, 1280].
# all_id_embs: [BS, 512] -> [1, 512].
clip_fgbg_features = clip_fgbg_features.mean(dim=0, keepdim=True)
clip_neg_features = clip_neg_features.mean(dim=0, keepdim=True)
debug = False
if debug and all_id_embs is not None:
print(image_paths)
calc_stats('all_id_embs', all_id_embs)
# Compute pairwise similarities of the embeddings.
all_id_embs = F.normalize(all_id_embs, p=2, dim=1)
pairwise_sim = torch.matmul(all_id_embs, all_id_embs.t())
print('pairwise_sim:', pairwise_sim)
top_dir = os.path.dirname(image_paths[0])
mean_emb_path = os.path.join(top_dir, "mean_emb.pt")
if os.path.exists(mean_emb_path):
mean_emb = torch.load(mean_emb_path)
sim_to_mean = torch.matmul(all_id_embs, mean_emb.t())
print('sim_to_mean:', sim_to_mean)
if all_id_embs is not None:
id_embs = all_id_embs.mean(dim=0, keepdim=True)
# Without normalization, id_embs.norm(dim=1) is ~0.9. So normalization doesn't have much effect.
id_embs = F.normalize(id_embs, p=2, dim=-1)
# id_embs is None only if insightface_app is None, i.e., disabled by the user.
else:
# Don't do average of all_id_embs.
id_embs = all_id_embs
return faceless_img_count, id_embs, clip_fgbg_features
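    # Example (a hedged sketch; the image paths are hypothetical):
    #   faceless_img_count, id_embs, clip_fgbg_features = \
    #       self.extract_init_id_embeds_from_images(None, ['face1.jpg', 'face2.jpg'],
    #                                               calc_avg=True, verbose=True)
    #   # id_embs: [1, 512] averaged, L2-normalized face ID embedding.
    #   # clip_fgbg_features: [1, 514, clip_embedding_dim] if return_clip_embs, else None.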
# This function should be implemented in derived classes.
# We don't plan to fine-tune the ID2ImgPrompt module. So disable the gradient computation.
def map_init_id_to_img_prompt_embs(self, init_id_embs,
clip_features=None,
called_for_neg_img_prompt=False):
raise NotImplementedError
# If init_id_embs/pre_clip_features is provided, then use the provided face embeddings.
# Otherwise, if image_paths/image_objs are provided, extract face embeddings from the images.
# Otherwise, we generate random face embeddings [id_batch_size, 512].
def get_img_prompt_embs(self, init_id_embs, pre_clip_features, image_paths, image_objs,
id_batch_size,
skip_non_faces=True,
avg_at_stage=None, # id_emb, img_prompt_emb, or None.
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
perturb_std=0.0,
verbose=False):
face_image_count = 0
device = self.clip_image_encoder.device
clip_neg_features = self.get_clip_neg_features(BS=id_batch_size)
if init_id_embs is None:
# Input images are not provided. Generate random face embeddings.
if image_paths is None and image_objs is None:
faceid_embeds_from_images = False
# Use random face embeddings as faceid_embeds. [BS, 512].
faceid_embeds = torch.randn(id_batch_size, 512).to(device=device, dtype=torch.float16)
# Since it's a batch of random IDs, the CLIP features are all zeros as a placeholder.
# Only ConsistentID_ID2AdaPrompt will use clip_fgbg_features and clip_neg_features.
# Experiments show that using random clip features yields much better images than using zeros.
clip_fgbg_features = torch.randn(id_batch_size, 514, 1280).to(device=device, dtype=torch.float16) \
if self.use_clip_embs else None
else:
# Extract face ID embeddings and CLIP features from the images.
faceid_embeds_from_images = True
faceless_img_count, faceid_embeds, clip_fgbg_features \
= self.extract_init_id_embeds_from_images( \
image_objs, image_paths=image_paths, size=(512, 512),
calc_avg=(avg_at_stage == 'id_emb'),
skip_non_faces=skip_non_faces,
verbose=verbose)
if image_paths is not None:
face_image_count = len(image_paths) - faceless_img_count
else:
face_image_count = len(image_objs) - faceless_img_count
else:
faceid_embeds_from_images = False
# Use the provided init_id_embs as faceid_embeds.
faceid_embeds = init_id_embs
if pre_clip_features is not None:
clip_fgbg_features = pre_clip_features
else:
clip_fgbg_features = None
if faceid_embeds.shape[0] == 1:
faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)
if clip_fgbg_features is not None:
clip_fgbg_features = clip_fgbg_features.repeat(id_batch_size, 1, 1)
# If skip_non_faces, then faceid_embeds won't be None.
# Otherwise, if faceid_embeds_from_images, and no face images are detected,
# then we return Nones.
if faceid_embeds is None:
return face_image_count, None, None, None
if perturb_at_stage == 'id_emb' and perturb_std > 0:
# If id_batch_size > 1, after adding noises, the id_batch_size embeddings will be different.
faceid_embeds = perturb_tensor(faceid_embeds, perturb_std, perturb_std_is_relative=True, keep_norm=True)
if self.name == 'consistentID' or self.name == 'jointIDs':
clip_fgbg_features = perturb_tensor(clip_fgbg_features, perturb_std, perturb_std_is_relative=True, keep_norm=True)
faceid_embeds = F.normalize(faceid_embeds, p=2, dim=-1)
# pos_prompt_embs, neg_prompt_embs: [BS, 77, 768] or [BS, 22, 768].
with torch.no_grad():
pos_prompt_embs = \
self.map_init_id_to_img_prompt_embs(faceid_embeds, clip_fgbg_features,
called_for_neg_img_prompt=False)
if avg_at_stage == 'img_prompt_emb':
pos_prompt_embs = pos_prompt_embs.mean(dim=0, keepdim=True)
faceid_embeds = faceid_embeds.mean(dim=0, keepdim=True)
if clip_fgbg_features is not None:
clip_fgbg_features = clip_fgbg_features.mean(dim=0, keepdim=True)
if perturb_at_stage == 'img_prompt_emb' and perturb_std > 0:
            # NOTE: for simplicity, pos_prompt_embs and pos_core_prompt_emb are perturbed independently.
            # This could cause inconsistency between pos_prompt_embs and pos_core_prompt_emb.
            # In practice this is not an issue, since we rarely use pos_prompt_embs and
            # pos_core_prompt_emb together.
pos_prompt_embs = perturb_tensor(pos_prompt_embs, perturb_std, perturb_std_is_relative=True, keep_norm=True)
# If faceid_embeds_from_images, and the prompt embeddings are already averaged, then
# we assume all images are from the same subject, and the batch dim of faceid_embeds is 1.
# So we need to repeat faceid_embeds.
if faceid_embeds_from_images and avg_at_stage is not None:
faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)
pos_prompt_embs = pos_prompt_embs.repeat(id_batch_size, 1, 1)
if clip_fgbg_features is not None:
clip_fgbg_features = clip_fgbg_features.repeat(id_batch_size, 1, 1)
if self.gen_neg_img_prompt:
# Never perturb the negative prompt embeddings.
with torch.no_grad():
neg_prompt_embs = \
self.map_init_id_to_img_prompt_embs(torch.zeros_like(faceid_embeds),
clip_neg_features,
called_for_neg_img_prompt=True)
return face_image_count, faceid_embeds, pos_prompt_embs, neg_prompt_embs
else:
return face_image_count, faceid_embeds, pos_prompt_embs, None
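    # Example (a hedged sketch): sample image prompt embeddings for random identities,
    # i.e. with no face embeddings, images or CLIP features provided.
    #   face_image_count, faceid_embeds, pos_prompt_embs, neg_prompt_embs = \
    #       self.get_img_prompt_embs(None, None, None, None, id_batch_size=2, avg_at_stage=None)
    #   # faceid_embeds: [2, 512]; pos_prompt_embs: [2, num_id_vecs, 768];
    #   # neg_prompt_embs is None unless self.gen_neg_img_prompt is True.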
    # get_batched_img_prompt_embs() is a wrapper of get_img_prompt_embs(),
    # which is convenient for batched training.
    # NOTE: it should only be called during training.
    # If init_id_embs is None, random face embeddings [BS, 512] are generated.
    # Returns face_image_count, faceid_embeds, pos_prompt_embs, neg_prompt_embs.
def get_batched_img_prompt_embs(self, batch_size, init_id_embs, pre_clip_features):
# pos_prompt_embs, neg_prompt_embs are generated without gradient computation.
# So we don't need to worry that the teacher model weights are updated.
return self.get_img_prompt_embs(init_id_embs=init_id_embs,
pre_clip_features=pre_clip_features,
image_paths=None,
image_objs=None,
id_batch_size=batch_size,
# During training, don't skip non-face images. Instead,
# setting skip_non_faces=False will replace them by random face embeddings.
skip_non_faces=False,
# We always assume the instances belong to different subjects.
# So never average the embeddings across instances.
avg_at_stage=None,
verbose=False)
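    # Example (a hedged sketch, training time): init_id_embs are cached face embeddings
    # [BS, 512] from the dataloader; pre_clip_features may be None.
    #   _, faceid_embeds, id2img_prompt_embs, neg_id2img_prompt_embs = \
    #       self.get_batched_img_prompt_embs(init_id_embs.shape[0], init_id_embs, None)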
# If img_prompt_embs is provided, we use it directly.
# Otherwise, if face_id_embs is provided, we use it to generate img_prompt_embs.
# Otherwise, if image_paths is provided, we extract face_id_embs from the images.
# image_paths: a list of image paths. image_folder: the parent folder name.
# avg_at_stage: 'id_emb', 'img_prompt_emb', or None.
# avg_at_stage == ada_prompt_emb usually produces the worst results.
# avg_at_stage == id_emb is slightly better than img_prompt_emb, but sometimes img_prompt_emb is better.
# p_dropout and return_zero_embs_for_dropped_encoders are only used by Joint_FaceID2AdaPrompt.
def generate_adaface_embeddings(self, image_paths, face_id_embs=None, img_prompt_embs=None,
p_dropout=0,
return_zero_embs_for_dropped_encoders=True,
avg_at_stage='id_emb', # id_emb, img_prompt_emb, or None.
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
perturb_std=0, enable_static_img_suffix_embs=False):
if (avg_at_stage is None) or avg_at_stage.lower() == 'none':
img_prompt_avg_at_stage = None
else:
img_prompt_avg_at_stage = avg_at_stage
if img_prompt_embs is None:
# Do averaging. So id_batch_size becomes 1 after averaging.
if img_prompt_avg_at_stage is not None:
id_batch_size = 1
else:
if face_id_embs is not None:
id_batch_size = face_id_embs.shape[0]
elif image_paths is not None:
id_batch_size = len(image_paths)
else:
id_batch_size = 1
# faceid_embeds: [BS, 512] is a batch of extracted face analysis embeddings. NOT used later.
# NOTE: If face_id_embs, image_paths and image_objs are all None,
# then get_img_prompt_embs() generates random faceid_embeds/img_prompt_embs,
# and each instance is different.
# Otherwise, if face_id_embs is provided, it's used.
# If not, image_paths/image_objs are used to extract face embeddings.
# img_prompt_embs is in the image prompt space.
# img_prompt_embs: [BS, 16/4, 768].
face_image_count, faceid_embeds, img_prompt_embs, neg_img_prompt_embs \
= self.get_img_prompt_embs(\
init_id_embs=face_id_embs,
pre_clip_features=None,
# image_folder is passed only for logging purpose.
# image_paths contains the paths of the images.
image_paths=image_paths, image_objs=None,
id_batch_size=id_batch_size,
perturb_at_stage=perturb_at_stage,
perturb_std=perturb_std,
avg_at_stage=img_prompt_avg_at_stage,
verbose=True)
if face_image_count == 0:
return None
# No matter whether avg_at_stage is id_emb or img_prompt_emb, we average img_prompt_embs.
elif avg_at_stage is not None and avg_at_stage.lower() != 'none':
# img_prompt_embs: [BS, 16/4, 768] -> [1, 16/4, 768].
img_prompt_embs = img_prompt_embs.mean(dim=0, keepdim=True)
# adaface_subj_embs: [BS, 16/4, 768].
adaface_subj_embs = \
self.subj_basis_generator(img_prompt_embs, clip_features=None, raw_id_embs=None,
out_id_embs_cfg_scale=self.out_id_embs_cfg_scale,
is_face=True,
enable_static_img_suffix_embs=enable_static_img_suffix_embs)
# During training, img_prompt_avg_at_stage is None, and BS >= 1.
# During inference, img_prompt_avg_at_stage is 'id_emb' or 'img_prompt_emb', and BS == 1.
if img_prompt_avg_at_stage is not None:
# adaface_subj_embs: [1, 16, 768] -> [16, 768]
adaface_subj_embs = adaface_subj_embs.squeeze(0)
return adaface_subj_embs
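    # Example (a hedged sketch; image paths are hypothetical): the typical inference call,
    # averaging the ID embeddings of several photos of the same subject.
    #   adaface_subj_embs = self.generate_adaface_embeddings(
    #       image_paths=['subj/1.jpg', 'subj/2.jpg'], avg_at_stage='id_emb')
    #   # adaface_subj_embs: [num_id_vecs, 768] in the SD text embedding space (output_dim=768).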
class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
def __init__(self, *args, **kwargs):
self.name = 'arc2face'
self.num_id_vecs = 16
super().__init__(*args, **kwargs)
self.clip_image_encoder = CLIPVisionModelWithMask.from_pretrained('openai/clip-vit-large-patch14')
self.clip_preprocessor = CLIPImageProcessor.from_pretrained('openai/clip-vit-large-patch14')
self.clip_image_encoder.eval()
if self.dtype == torch.float16:
self.clip_image_encoder.half()
print(f'CLIP image encoder loaded.')
'''
{'landmark_3d_68': <insightface.model_zoo.landmark.Landmark object at 0x7f8e3f0cc190>,
'landmark_2d_106': <insightface.model_zoo.landmark.Landmark object at 0x7f8e3f0cc2b0>,
'detection': <insightface.model_zoo.retinaface.RetinaFace object at 0x7f8e3f0cc100>,
'genderage': <insightface.model_zoo.attribute.Attribute object at 0x7f8e3f0cc1f0>,
'recognition': <insightface.model_zoo.arcface_onnx.ArcFaceONNX object at 0x7f8e3f0cc0d0>}
'''
        # Use the same model as ID2AdaPrompt does.
        # FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
        # Note there's a second "models" in the path.
        # NOTE: DON'T use CUDAExecutionProvider, as it will hang DDP training.
        # It seems that when insightface is loaded onto a GPU, it only resides on the first GPU,
        # and the process on the second GPU then fails to communicate with it, causing hanging.
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
providers=['CPUExecutionProvider'])
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
print(f'Face encoder loaded on CPU.')
self.text_to_image_prompt_encoder = CLIPTextModelWrapper.from_pretrained(
'models/arc2face', subfolder="encoder",
torch_dtype=self.dtype
)
self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
if self.out_id_embs_cfg_scale == -1:
self.out_id_embs_cfg_scale = 1
#### Arc2Face pipeline specific configs ####
self.gen_neg_img_prompt = False
# bg CLIP features are used by the bg subject basis generator.
self.use_clip_embs = True
self.do_contrast_clip_embs_on_bg_features = True
# self.num_static_img_suffix_embs is initialized in the parent class.
self.id_img_prompt_max_length = 22
self.clip_embedding_dim = 1024
self.init_subj_basis_generator()
if self.adaface_ckpt_path is not None:
self.load_adaface_ckpt(self.adaface_ckpt_path)
print(f"{self.name} ada prompt encoder initialized, "
f"ID vecs: {self.num_id_vecs}, static suffix: {self.num_static_img_suffix_embs}.")
# Arc2Face_ID2AdaPrompt never uses clip_features or called_for_neg_img_prompt.
def map_init_id_to_img_prompt_embs(self, init_id_embs,
clip_features=None,
called_for_neg_img_prompt=False):
'''
self.text_to_image_prompt_encoder: arc2face_models.py:CLIPTextModelWrapper instance.
init_id_embs: (N, 512) normalized Face ID embeddings.
'''
# arcface_token_id: 1014
arcface_token_id = self.tokenizer.encode("id", add_special_tokens=False)[0]
# This step should be quite fast, and there's no need to cache the input_ids.
input_ids = self.tokenizer(
"photo of a id person",
truncation=True,
padding="max_length",
# In Arc2Face_ID2AdaPrompt, id_img_prompt_max_length is 22.
            # Arc2Face's image prompt is meaningless in tokens other than the ID tokens.
max_length=self.id_img_prompt_max_length,
return_tensors="pt",
).input_ids.to(init_id_embs.device)
# input_ids: [1, 22] or [3, 22] (during training).
input_ids = input_ids.repeat(len(init_id_embs), 1)
init_id_embs = init_id_embs.to(self.dtype)
# face_embs_padded: [1, 512] -> [1, 768].
face_embs_padded = F.pad(init_id_embs, (0, self.text_to_image_prompt_encoder.config.hidden_size - init_id_embs.shape[-1]), "constant", 0)
# self.text_to_image_prompt_encoder(input_ids=input_ids, ...) is called twice. The first is only to get the token embeddings (the shallowest mapping).
# The second call does the ordinary CLIP text encoding pass.
token_embs = self.text_to_image_prompt_encoder(input_ids=input_ids, return_token_embs=True)
token_embs[input_ids==arcface_token_id] = face_embs_padded
prompt_embeds = self.text_to_image_prompt_encoder(
input_ids=input_ids,
input_token_embs=token_embs,
return_token_embs=False
)[0]
        # Restore the original dtype of prompt_embeds (float32 -> self.dtype, typically float16).
prompt_embeds = prompt_embeds.to(self.dtype)
# token 4: 'id' in "photo of a id person".
# 4:20 are the most important 16 embeddings that contain the subject's identity.
# [N, 22, 768] -> [N, 16, 768]
return prompt_embeds[:, 4:20]
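    # Example (a hedged sketch): map a normalized face ID embedding to Arc2Face image prompt embeddings.
    #   init_id_embs = F.normalize(torch.randn(1, 512), p=2, dim=-1).to(self.dtype)
    #   img_prompt_embs = self.map_init_id_to_img_prompt_embs(init_id_embs)
    #   # img_prompt_embs: [1, 16, 768], i.e. positions 4:20 of the encoded 22-token prompt.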
def get_id2img_learnable_modules(self):
return [ self.text_to_image_prompt_encoder ]
# ConsistentID_ID2AdaPrompt is a thin wrapper around ConsistentIDPipeline;
# its heavy-lifting modules are borrowed from the pipeline instead of being created here.
class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
def __init__(self, pipe=None, base_model_path="models/sd15-dste8-vae.safetensors",
*args, **kwargs):
self.name = 'consistentID'
self.num_id_vecs = 4
super().__init__(*args, **kwargs)
if pipe is None:
            # The base_model_path is kind of arbitrary, as the UNet and VAE in the model
            # are not used and are released (freed) right below to save memory.
            # Only the ConsistentID modules and bise_net are used.
assert base_model_path is not None, "base_model_path should be provided."
pipe = ConsistentIDPipeline.from_single_file(base_model_path)
pipe.load_ConsistentID_model(consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth")
pipe.to(dtype=self.dtype)
# Since the passed-in pipe is None, this should be called during inference,
# when the teacher ConsistentIDPipeline is not initialized.
            # Therefore, we release the UNet and VAE to save memory.
pipe.release_components(["unet", "vae"])
# Otherwise, we share the pipeline with the teacher.
# So we don't release the components.
self.pipe = pipe
self.face_app = pipe.face_app
# ConsistentID uses 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K'.
self.clip_image_encoder = patch_clip_image_encoder_with_mask(pipe.clip_encoder)
self.clip_preprocessor = pipe.clip_preprocessor
self.text_to_image_prompt_encoder = pipe.text_encoder
self.tokenizer = pipe.tokenizer
self.image_proj_model = pipe.image_proj_model
self.clip_image_encoder.eval()
self.image_proj_model.eval()
if self.dtype == torch.float16:
self.clip_image_encoder.half()
self.image_proj_model.half()
if self.out_id_embs_cfg_scale == -1:
self.out_id_embs_cfg_scale = 6
#### ConsistentID pipeline specific configs ####
# self.num_static_img_suffix_embs is initialized in the parent class.
self.gen_neg_img_prompt = True
self.use_clip_embs = True
self.do_contrast_clip_embs_on_bg_features = True
self.clip_embedding_dim = 1280
self.s_scale = 1.0
self.shortcut = False
self.init_subj_basis_generator()
if self.adaface_ckpt_path is not None:
self.load_adaface_ckpt(self.adaface_ckpt_path)
print(f"{self.name} ada prompt encoder initialized, "
f"ID vecs: {self.num_id_vecs}, static suffix: {self.num_static_img_suffix_embs}.")
def map_init_id_to_img_prompt_embs(self, init_id_embs,
clip_features=None,
called_for_neg_img_prompt=False):
assert init_id_embs is not None, "init_id_embs should be provided."
init_id_embs = init_id_embs.to(self.dtype)
clip_features = clip_features.to(self.dtype)
if not called_for_neg_img_prompt:
# clip_features: [BS, 514, 1280].
# clip_features is provided when the function is called within
# ConsistentID_ID2AdaPrompt:extract_init_id_embeds_from_images(), which is
# image_fg_features and image_bg_features concatenated at dim=1.
# Therefore, we split clip_image_double_embeds into image_fg_features and image_bg_features.
# image_bg_features is not used in ConsistentID_ID2AdaPrompt.
image_fg_features, image_bg_features = clip_features.chunk(2, dim=1)
# clip_image_embeds: [BS, 257, 1280].
clip_image_embeds = image_fg_features
else:
# clip_features is the negative image features. So we don't need to split it.
clip_image_embeds = clip_features
init_id_embs = torch.zeros_like(init_id_embs)
faceid_embeds = init_id_embs
# image_proj_model maps 1280-dim OpenCLIP embeddings to 768-dim face prompt embeddings.
# clip_image_embeds are used as queries to transform faceid_embeds.
# faceid_embeds -> kv, clip_image_embeds -> q
if faceid_embeds.shape[0] != clip_image_embeds.shape[0]:
breakpoint()
try:
global_id_embeds = self.image_proj_model(faceid_embeds, clip_image_embeds, shortcut=self.shortcut, scale=self.s_scale)
except:
breakpoint()
return global_id_embeds
def get_id2img_learnable_modules(self):
return [ self.image_proj_model ]
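    # Example (a hedged sketch; the checkpoint path is hypothetical): standalone construction
    # at inference time. With pipe=None, ConsistentIDPipeline is loaded internally and its
    # UNet/VAE are released to save memory. During distillation training, pass the teacher's
    # ConsistentIDPipeline as pipe so that face_app, the CLIP encoder and image_proj_model are shared.
    #   encoder = ConsistentID_ID2AdaPrompt(pipe=None,
    #                                       adaface_ckpt_path='models/adaface/example.ckpt')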
# A wrapper for combining multiple FaceID2AdaPrompt instances.
class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
def __init__(self, adaface_encoder_types, adaface_ckpt_paths,
out_id_embs_cfg_scales=None, enabled_encoders=None,
*args, **kwargs):
self.name = 'jointIDs'
assert len(adaface_encoder_types) > 0, "adaface_encoder_types should not be empty."
adaface_encoder_types2num_id_vecs = { 'arc2face': 16, 'consistentID': 4 }
self.encoders_num_id_vecs = [ adaface_encoder_types2num_id_vecs[encoder_type] \
for encoder_type in adaface_encoder_types ]
self.num_id_vecs = sum(self.encoders_num_id_vecs)
super().__init__(*args, **kwargs)
self.num_sub_encoders = len(adaface_encoder_types)
self.id2ada_prompt_encoders = nn.ModuleList()
self.encoders_num_static_img_suffix_embs = []
# TODO: apply adaface_encoder_cfg_scales to influence the final prompt embeddings.
# Now they are just placeholders.
if out_id_embs_cfg_scales is None:
            # -1: use the default scale for the adaface encoder type,
            # i.e., 1 for arc2face and 6 for consistentID.
self.out_id_embs_cfg_scales = [-1] * self.num_sub_encoders
else:
# Do not normalize the weights, and just use them as is.
self.out_id_embs_cfg_scales = out_id_embs_cfg_scales
# Note we don't pass the adaface_ckpt_paths to the base class, but instead,
# we load them once and for all in self.load_adaface_ckpt().
for i, encoder_type in enumerate(adaface_encoder_types):
kwargs['out_id_embs_cfg_scale'] = self.out_id_embs_cfg_scales[i]
if encoder_type == 'arc2face':
encoder = Arc2Face_ID2AdaPrompt(*args, **kwargs)
elif encoder_type == 'consistentID':
encoder = ConsistentID_ID2AdaPrompt(*args, **kwargs)
else:
breakpoint()
self.id2ada_prompt_encoders.append(encoder)
self.encoders_num_static_img_suffix_embs.append(encoder.num_static_img_suffix_embs)
self.num_static_img_suffix_embs = sum(self.encoders_num_static_img_suffix_embs)
# No need to set gen_neg_img_prompt, as we don't access it in this class, but rather
# in the derived classes.
# self.gen_neg_img_prompt = True
# self.use_clip_embs = True
# self.do_contrast_clip_embs_on_bg_features = True
self.face_id_dims = [encoder.face_id_dim for encoder in self.id2ada_prompt_encoders]
self.face_id_dim = sum(self.face_id_dims)
# Different adaface encoders may have different clip_embedding_dim.
# clip_embedding_dim is only used for bg subject basis generator.
# Here we use the joint clip embeddings of both OpenAI CLIP and laion CLIP.
# Therefore, the clip_embedding_dim is the sum of the clip_embedding_dims of all adaface encoders.
self.clip_embedding_dims = [encoder.clip_embedding_dim for encoder in self.id2ada_prompt_encoders]
self.clip_embedding_dim = sum(self.clip_embedding_dims)
# The ctors of the derived classes have already initialized encoder.subj_basis_generator.
# If subj_basis_generator expansion params are specified, they are equally applied to all adaface encoders.
# This self.subj_basis_generator is not meant to be called as self.subj_basis_generator(), but instead,
# it's used as a unified interface to save/load the subj_basis_generator of all adaface encoders.
self.subj_basis_generator = \
nn.ModuleList( [encoder.subj_basis_generator for encoder \
in self.id2ada_prompt_encoders] )
if adaface_ckpt_paths is not None:
self.load_adaface_ckpt(adaface_ckpt_paths)
print(f"{self.name} ada prompt encoder initialized with {self.num_sub_encoders} sub-encoders. "
f"ID vecs: {self.num_id_vecs}, static suffix embs: {self.num_static_img_suffix_embs}.")
if enabled_encoders is not None:
self.are_encoders_enabled = \
torch.tensor([True if encoder_type in enabled_encoders else False \
for encoder_type in adaface_encoder_types])
if not self.are_encoders_enabled.any():
print(f"All encoders are disabled, which shoudn't happen.")
breakpoint()
if self.are_encoders_enabled.sum() < self.num_sub_encoders:
disabled_encoders = [ encoder_type for i, encoder_type in enumerate(adaface_encoder_types) \
if not self.are_encoders_enabled[i] ]
print(f"{len(disabled_encoders)} encoders are disabled: {disabled_encoders}.")
else:
self.are_encoders_enabled = \
torch.tensor([True] * self.num_sub_encoders)
def load_adaface_ckpt(self, adaface_ckpt_paths):
# If only one adaface ckpt path is provided, then we assume it's the ckpt of the Joint_FaceID2AdaPrompt,
# so we dereference the list to get the actual path and load the subj_basis_generators of all adaface encoders.
if isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
if len(adaface_ckpt_paths) == 1 and self.num_sub_encoders > 1:
adaface_ckpt_paths = adaface_ckpt_paths[0]
if isinstance(adaface_ckpt_paths, str):
# This is only applicable to newest ckpts of Joint_FaceID2AdaPrompt, where
# the ckpt_subj_basis_generator is an nn.ModuleList of multiple subj_basis_generators.
# Therefore, no need to patch missing variables.
ckpt = torch.load(adaface_ckpt_paths, map_location='cpu')
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
if self.subject_string not in string_to_subj_basis_generator_dict:
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
breakpoint()
ckpt_subj_basis_generators = string_to_subj_basis_generator_dict[self.subject_string]
for i, subj_basis_generator in enumerate(self.subj_basis_generator):
ckpt_subj_basis_generator = ckpt_subj_basis_generators[i]
# Handle differences in num_static_img_suffix_embs between the current model and the ckpt.
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.encoders_num_static_img_suffix_embs[i],
img_prompt_dim=self.output_dim)
if subj_basis_generator.prompt2token_proj_attention_multipliers \
== [1] * 12:
subj_basis_generator.extend_prompt2token_proj_attention(\
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
elif subj_basis_generator.prompt2token_proj_attention_multipliers \
!= ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers:
raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")
assert subj_basis_generator.prompt2token_proj_attention_multipliers \
== ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
"Inconsistent prompt2token_proj_attention_multipliers."
subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
# If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
# extend subj_basis_generator again.
if self.extend_prompt2token_proj_attention_multiplier > 1:
                    # During this extension, the added noise only affects the extra copies of the attention weights, since they are not in the ckpt.
# During training, prompt2token_proj_ext_attention_perturb_ratio == 0.1.
# During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
subj_basis_generator.extend_prompt2token_proj_attention(\
None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)
subj_basis_generator.freeze_prompt2token_proj()
print(f"{adaface_ckpt_paths}: {len(self.subj_basis_generator)} subj_basis_generators loaded for {self.name}.")
elif isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
for i, ckpt_path in enumerate(adaface_ckpt_paths):
self.id2ada_prompt_encoders[i].load_adaface_ckpt(ckpt_path)
else:
breakpoint()
def extract_init_id_embeds_from_images(self, *args, **kwargs):
total_faceless_img_count = 0
all_id_embs = []
all_clip_fgbg_features = []
id_embs_shape = None
clip_fgbg_features_shape = None
# clip_image_encoder should be already put on GPU.
# So its .device is the device of its parameters.
device = self.id2ada_prompt_encoders[0].clip_image_encoder.device
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
faceless_img_count, id_embs, clip_fgbg_features = \
id2ada_prompt_encoder.extract_init_id_embeds_from_images(*args, **kwargs)
total_faceless_img_count += faceless_img_count
# id_embs: [BS, 512] or [1, 512] (if calc_avg == True), or None.
# id_embs has the same shape across all id2ada_prompt_encoders.
all_id_embs.append(id_embs)
# clip_fgbg_features: [BS, 514, 1280/1024] or [1, 514, 1280/1024] (if calc_avg == True), or None.
# clip_fgbg_features has the same shape except for the last dimension across all id2ada_prompt_encoders.
all_clip_fgbg_features.append(clip_fgbg_features)
if id_embs is not None:
id_embs_shape = id_embs.shape
if clip_fgbg_features is not None:
clip_fgbg_features_shape = clip_fgbg_features.shape
num_extracted_id_embs = 0
for i in range(len(all_id_embs)):
if all_id_embs[i] is not None:
# As calc_avg is the same for all id2ada_prompt_encoders,
# each id_embs and clip_fgbg_features should have the same shape, if they are not None.
if all_id_embs[i].shape != id_embs_shape:
print("Inconsistent ID embedding shapes.")
breakpoint()
else:
num_extracted_id_embs += 1
else:
all_id_embs[i] = torch.zeros(id_embs_shape, dtype=torch.float16, device=device)
clip_fgbg_features_shape2 = torch.Size(clip_fgbg_features_shape[:-1] + (self.clip_embedding_dims[i],))
if all_clip_fgbg_features[i] is not None:
if all_clip_fgbg_features[i].shape != clip_fgbg_features_shape2:
print("Inconsistent clip features shapes.")
breakpoint()
else:
all_clip_fgbg_features[i] = torch.zeros(clip_fgbg_features_shape2,
dtype=torch.float16, device=device)
# If at least one face encoder detects faces, then return the embeddings.
# Otherwise return None embeddings.
# It's possible that some face encoders detect faces, while others don't,
# since different face encoders use different face detection models.
if num_extracted_id_embs == 0:
return 0, None, None
all_id_embs = torch.cat(all_id_embs, dim=1)
# clip_fgbg_features: [BS, 514, 1280] or [BS, 514, 1024]. So we concatenate them along dim=2.
all_clip_fgbg_features = torch.cat(all_clip_fgbg_features, dim=2)
return total_faceless_img_count, all_id_embs, all_clip_fgbg_features
# init_id_embs, clip_features are never None.
def map_init_id_to_img_prompt_embs(self, init_id_embs,
clip_features=None,
called_for_neg_img_prompt=False):
if init_id_embs is None or clip_features is None:
breakpoint()
# each id_embs and clip_fgbg_features should have the same shape.
# If some of them were None, they have been replaced by zero embeddings.
all_init_id_embs = init_id_embs.split(self.face_id_dims, dim=1)
all_clip_features = clip_features.split(self.clip_embedding_dims, dim=2)
all_img_prompt_embs = []
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
img_prompt_embs = id2ada_prompt_encoder.map_init_id_to_img_prompt_embs(
all_init_id_embs[i], clip_features=all_clip_features[i],
called_for_neg_img_prompt=called_for_neg_img_prompt,
)
all_img_prompt_embs.append(img_prompt_embs)
all_img_prompt_embs = torch.cat(all_img_prompt_embs, dim=1)
return all_img_prompt_embs
# If init_id_embs/pre_clip_features is provided, then use the provided face embeddings.
# Otherwise, if image_paths/image_objs are provided, extract face embeddings from the images.
# Otherwise, we generate random face embeddings [id_batch_size, 512].
def get_img_prompt_embs(self, init_id_embs, pre_clip_features, *args, **kwargs):
face_image_counts = []
all_faceid_embeds = []
all_pos_prompt_embs = []
all_neg_prompt_embs = []
faceid_embeds_shape = None
# clip_image_encoder should be already put on GPU.
# So its .device is the device of its parameters.
device = self.id2ada_prompt_encoders[0].clip_image_encoder.device
# init_id_embs, pre_clip_features could be None. If they are None,
# we split them into individual vectors for each id2ada_prompt_encoder.
if init_id_embs is not None:
all_init_id_embs = init_id_embs.split(self.face_id_dims, dim=1)
else:
all_init_id_embs = [None] * self.num_sub_encoders
if pre_clip_features is not None:
all_pre_clip_features = pre_clip_features.split(self.clip_embedding_dims, dim=2)
else:
all_pre_clip_features = [None] * self.num_sub_encoders
faceid_embeds_shape = None
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
face_image_count, faceid_embeds, pos_prompt_embs, neg_prompt_embs = \
id2ada_prompt_encoder.get_img_prompt_embs(all_init_id_embs[i], all_pre_clip_features[i],
*args, **kwargs)
face_image_counts.append(face_image_count)
all_faceid_embeds.append(faceid_embeds)
all_pos_prompt_embs.append(pos_prompt_embs)
all_neg_prompt_embs.append(neg_prompt_embs)
# all faceid_embeds have the same shape across all id2ada_prompt_encoders.
# But pos_prompt_embs and neg_prompt_embs may have different number of ID embeddings.
if faceid_embeds is not None:
faceid_embeds_shape = faceid_embeds.shape
if faceid_embeds_shape is None:
return 0, None, None, None
# We take the maximum face_image_count among all adaface encoders.
face_image_count = max(face_image_counts)
        # Use faceid_embeds_shape rather than the last loop iteration's faceid_embeds, which may be None.
        BS = faceid_embeds_shape[0]
for i in range(len(all_faceid_embeds)):
if all_faceid_embeds[i] is not None:
if all_faceid_embeds[i].shape != faceid_embeds_shape:
print("Inconsistent face embedding shapes.")
breakpoint()
else:
all_faceid_embeds[i] = torch.zeros(faceid_embeds_shape, dtype=torch.float16, device=device)
N_ID = self.encoders_num_id_vecs[i]
if all_pos_prompt_embs[i] is None:
# Both pos_prompt_embs and neg_prompt_embs have N_ID == num_id_vecs embeddings.
all_pos_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
if all_neg_prompt_embs[i] is None:
all_neg_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
all_faceid_embeds = torch.cat(all_faceid_embeds, dim=1)
all_pos_prompt_embs = torch.cat(all_pos_prompt_embs, dim=1)
all_neg_prompt_embs = torch.cat(all_neg_prompt_embs, dim=1)
return face_image_count, all_faceid_embeds, all_pos_prompt_embs, all_neg_prompt_embs
# We don't need to implement get_batched_img_prompt_embs() since the interface
# is fully compatible with FaceID2AdaPrompt.get_batched_img_prompt_embs().
def generate_adaface_embeddings(self, image_paths, face_id_embs=None,
img_prompt_embs=None, p_dropout=0,
return_zero_embs_for_dropped_encoders=True,
*args, **kwargs):
# clip_image_encoder should be already put on GPU.
# So its .device is the device of its parameters.
device = self.id2ada_prompt_encoders[0].clip_image_encoder.device
is_emb_averaged = kwargs.get('avg_at_stage', None) is not None
BS = -1
if face_id_embs is not None:
BS = face_id_embs.shape[0]
all_face_id_embs = face_id_embs.split(self.face_id_dims, dim=1)
else:
all_face_id_embs = [None] * self.num_sub_encoders
if img_prompt_embs is not None:
BS = img_prompt_embs.shape[0] if BS == -1 else BS
if img_prompt_embs.shape[1] != self.num_id_vecs:
breakpoint()
all_img_prompt_embs = img_prompt_embs.split(self.encoders_num_id_vecs, dim=1)
else:
all_img_prompt_embs = [None] * self.num_sub_encoders
if image_paths is not None:
BS = len(image_paths) if BS == -1 else BS
if BS == -1:
breakpoint()
        # During training, p_dropout is 0.1. During inference, p_dropout is 0.
        # When there are two sub-encoders, the prob that at least one of them is dropped is
        # 1 - (1 - p_dropout)^2 = 0.19, and the prob that exactly one is dropped is
        # 2 * p_dropout * (1 - p_dropout) = 0.18.
if p_dropout > 0:
# self.are_encoders_enabled is a global mask.
# are_encoders_enabled is a local mask for each batch.
            # Each encoder is dropped independently with probability p_dropout.
            are_encoders_enabled = torch.rand(self.num_sub_encoders) > p_dropout
are_encoders_enabled = are_encoders_enabled & self.are_encoders_enabled
# We should at least enable one encoder.
if not are_encoders_enabled.any():
# Randomly enable an encoder with self.are_encoders_enabled[i] == True.
enabled_indices = torch.nonzero(self.are_encoders_enabled).squeeze(1)
sel_idx = torch.randint(0, len(enabled_indices), (1,)).item()
are_encoders_enabled[enabled_indices[sel_idx]] = True
else:
are_encoders_enabled = self.are_encoders_enabled
all_adaface_subj_embs = []
num_available_id_vecs = 0
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
if not are_encoders_enabled[i]:
adaface_subj_embs = None
print(f"Encoder {id2ada_prompt_encoder.name} is dropped.")
else:
                # ddpm.embedding_manager.train() -> id2ada_prompt_encoder.train() -> each sub-encoder's train()
                # -> each sub-encoder's subj_basis_generator.train().
# Therefore grad for the following call is enabled.
adaface_subj_embs = \
id2ada_prompt_encoder.generate_adaface_embeddings(image_paths,
all_face_id_embs[i],
all_img_prompt_embs[i],
*args, **kwargs)
# adaface_subj_embs: [16, 768] or [4, 768].
N_ID = self.encoders_num_id_vecs[i]
if adaface_subj_embs is None:
if not return_zero_embs_for_dropped_encoders:
continue
else:
subj_emb_shape = (N_ID, 768) if is_emb_averaged else (BS, N_ID, 768)
# adaface_subj_embs is zero-filled. So N_ID is not counted as available subject embeddings.
adaface_subj_embs = torch.zeros(subj_emb_shape, dtype=torch.float16, device=device)
all_adaface_subj_embs.append(adaface_subj_embs)
else:
all_adaface_subj_embs.append(adaface_subj_embs)
num_available_id_vecs += N_ID
# No faces are found in the images, so return None embeddings.
# We don't want to return an all-zero embedding, which is useless.
if num_available_id_vecs == 0:
return None
        # If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
        # during inference, we average across the batch dim:
        # all_adaface_subj_embs[0]: [16, 768]. all_adaface_subj_embs[1]: [4, 768].
        # all_adaface_subj_embs: [20, 768].
        # During training, we don't average across the batch dim:
        # all_adaface_subj_embs[0]: [BS, 16, 768]. all_adaface_subj_embs[1]: [BS, 4, 768].
        # all_adaface_subj_embs: [BS, 20, 768].
all_adaface_subj_embs = torch.cat(all_adaface_subj_embs, dim=-2)
return all_adaface_subj_embs
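    # Example (a hedged sketch; ckpt and image paths are hypothetical): build the joint encoder
    # through the factory and get the concatenated subject embeddings of both sub-encoders.
    #   encoder = create_id2ada_prompt_encoder(['arc2face', 'consistentID'],
    #                                          adaface_ckpt_paths=['models/adaface/joint.ckpt'])
    #   subj_embs = encoder.generate_adaface_embeddings(image_paths=['subj/1.jpg'])
    #   # subj_embs: [20, 768] == cat([16, 768] from arc2face, [4, 768] from consistentID).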
'''
# For ip-adapter distillation on objects. Strictly speaking, it's not face-to-image prompts, but
# CLIP/DINO visual features to image prompts.
class Objects_Vis2ImgPrompt(nn.Module):
def __init__(self):
self.dino_encoder = ViTModel.from_pretrained('facebook/dino-vits16')
self.dino_encoder.eval()
self.dino_encoder.half()
self.dino_preprocess = ViTFeatureExtractor.from_pretrained('facebook/dino-vits16')
print(f'DINO encoder loaded.')
'''