adaface-animate / adaface /face_id_to_ada_prompt.py
adaface-neurips
Allow dynamically changing base model style type, support anime style, upgrade adaface model
a29cf91
raw
history blame
66.2 kB
import torch
import torch.nn as nn
from transformers import CLIPTokenizer, CLIPImageProcessor
from .arc2face_models import CLIPTextModelWrapper
from ConsistentID.lib.pipeline_ConsistentID import ConsistentIDPipeline
from .util import perturb_tensor, pad_image_obj_to_square, \
calc_stats, patch_clip_image_encoder_with_mask, CLIPVisionModelWithMask
from adaface.subj_basis_generator import SubjBasisGenerator
import torch.nn.functional as F
import numpy as np
import cv2
from PIL import Image
from insightface.app import FaceAnalysis
import os
from omegaconf.listconfig import ListConfig
# adaface_encoder_types: a list of one or more encoder types ('arc2face', 'consistentID').
# adaface_ckpt_paths: None, a single ckpt path, or a list of ckpt paths.
# adaface_encoder_cfg_scales: None, or a list of scales for the adaface encoder types.
def create_id2ada_prompt_encoder(adaface_encoder_types, adaface_ckpt_paths=None,
                                 adaface_encoder_cfg_scales=None, enabled_encoders=None,
                                 *args, **kwargs):
    """Create the ID-to-AdaFace prompt encoder for the requested encoder type(s).

    With exactly one encoder type, the matching single encoder
    (Arc2Face_ID2AdaPrompt or ConsistentID_ID2AdaPrompt) is built and only the
    first checkpoint path is used. For any other number of encoder types a
    Joint_FaceID2AdaPrompt is built from all arguments.

    Extra positional and keyword arguments are forwarded to the encoder ctor.

    Raises:
        ValueError: if a single encoder type is given and it is not recognised.
    """
    if len(adaface_encoder_types) != 1:
        return Joint_FaceID2AdaPrompt(adaface_encoder_types, adaface_ckpt_paths,
                                      adaface_encoder_cfg_scales, enabled_encoders,
                                      *args, **kwargs)

    adaface_encoder_type = adaface_encoder_types[0]

    # adaface_ckpt_paths may be a bare path string; indexing it would yield
    # only its first character, so treat a string as the path itself.
    if adaface_ckpt_paths is None:
        adaface_ckpt_path = None
    elif isinstance(adaface_ckpt_paths, str):
        adaface_ckpt_path = adaface_ckpt_paths
    else:
        adaface_ckpt_path = adaface_ckpt_paths[0]

    if adaface_encoder_type == 'arc2face':
        return Arc2Face_ID2AdaPrompt(adaface_ckpt_path=adaface_ckpt_path,
                                     *args, **kwargs)
    if adaface_encoder_type == 'consistentID':
        return ConsistentID_ID2AdaPrompt(pipe=None,
                                         adaface_ckpt_path=adaface_ckpt_path,
                                         *args, **kwargs)

    # Previously an unknown type fell through and hit an unbound local variable.
    raise ValueError(
        f"Unknown adaface encoder type '{adaface_encoder_type}'. "
        "Supported types: 'arc2face', 'consistentID'.")
class FaceID2AdaPrompt(nn.Module):
# To be initialized in derived classes.
def __init__(self, *args, **kwargs):
super().__init__()
# Initialize model components.
# These components of ConsistentID_ID2AdaPrompt will be shared with the teacher model.
# So we don't initialize them in the ctor(), but borrow them from the teacher model.
# These components of Arc2Face_ID2AdaPrompt will be initialized in its ctor().
self.clip_image_encoder = None
self.clip_preprocessor = None
self.face_app = None
self.text_to_image_prompt_encoder = None
self.tokenizer = None
self.dtype = kwargs.get('dtype', torch.float16)
# Load Img2Ada SubjectBasisGenerator.
self.subject_string = kwargs.get('subject_string', 'z')
self.adaface_ckpt_path = kwargs.get('adaface_ckpt_path', None)
self.subj_basis_generator = None
# -1: use the default scale for the adaface encoder type.
# i.e., 6 for arc2face and 1 for consistentID.
self.out_id_embs_cfg_scale = kwargs.get('out_id_embs_cfg_scale', -1)
self.is_training = kwargs.get('is_training', False)
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
self.extend_prompt2token_proj_attention_multiplier = kwargs.get('extend_prompt2token_proj_attention_multiplier', 1)
self.prompt2token_proj_ext_attention_perturb_ratio = kwargs.get('prompt2token_proj_ext_attention_perturb_ratio', 0.1)
# Set model behavior configurations.
self.gen_neg_img_prompt = False
self.clip_neg_features = None
self.use_clip_embs = False
self.do_contrast_clip_embs_on_bg_features = False
# num_id_vecs is the output embeddings of the ID2ImgPrompt module.
# If there's no static image suffix embeddings, then num_id_vecs is also
# the number of ada embeddings returned by the subject basis generator.
# num_id_vecs will be set in each derived class.
self.num_static_img_suffix_embs = kwargs.get('num_static_img_suffix_embs', 0)
print(f'{self.name} Adaface uses {self.num_id_vecs} ID image embeddings and {self.num_static_img_suffix_embs} fixed image embeddings as input.')
self.id_img_prompt_max_length = 77
self.face_id_dim = 512
# clip_embedding_dim: by default it's the OpenAI CLIP embedding dim.
# Could be overridden by derived classes.
self.clip_embedding_dim = 1024
self.output_dim = 768
def get_id2img_learnable_modules(self):
    """Return the learnable ID2ImgPrompt modules; derived classes must override this."""
    raise NotImplementedError()
def load_id2img_learnable_modules(self, id2img_learnable_modules_state_dict_list):
    """Load state dicts into the modules returned by get_id2img_learnable_modules().

    State dicts are matched to modules by position. If the two lists differ in
    length, only the overlapping prefix is loaded (as before), but a warning is
    printed and the final message reports the number actually loaded.
    """
    modules = self.get_id2img_learnable_modules()
    state_dicts = list(id2img_learnable_modules_state_dict_list)
    if len(modules) != len(state_dicts):
        print(f'Warning: {len(modules)} ID2ImgPrompt encoder modules, '
              f'but {len(state_dicts)} state dicts were provided.')

    num_loaded = 0
    for module, state_dict in zip(modules, state_dicts):
        module.load_state_dict(state_dict)
        num_loaded += 1

    print(f'{num_loaded} ID2ImgPrompt encoder modules loaded.')
# Call init_subj_basis_generator() only once the derived class has set
# self.num_id_vecs, self.num_static_img_suffix_embs and self.clip_embedding_dim.
def init_subj_basis_generator(self):
    """Create a fresh SubjBasisGenerator and store it in self.subj_basis_generator."""
    generator_config = dict(
        num_id_vecs=self.num_id_vecs,
        num_static_img_suffix_embs=self.num_static_img_suffix_embs,
        bg_image_embedding_dim=self.clip_embedding_dim,
        output_dim=self.output_dim,
        placeholder_is_bg=False,
        prompt2token_proj_grad_scale=1,
        bg_prompt_translator_has_to_out_proj=False,
    )
    self.subj_basis_generator = SubjBasisGenerator(**generator_config)
def load_adaface_ckpt(self, adaface_ckpt_path):
    """Load the subject basis generator weights for self.subject_string from a ckpt.

    The checkpoint's "string_to_subj_basis_generator_dict" maps subject strings
    to subject basis generator objects. The matching object is patched to the
    current configuration and its state_dict is copied (non-strictly) into
    self.subj_basis_generator, which must already have been created by
    init_subj_basis_generator().

    Raises:
        KeyError: if the checkpoint has no entry for self.subject_string.
    """
    # NOTE(security): the ckpt holds pickled generator objects, not just tensors,
    # so torch.load() unpickles them. Only load checkpoints from trusted sources.
    ckpt = torch.load(adaface_ckpt_path, map_location='cpu')
    string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
    if self.subject_string not in string_to_subj_basis_generator_dict:
        # Fail with a clear error instead of dropping into the debugger.
        raise KeyError(f"Subject '{self.subject_string}' not found in the embedding manager "
                       f"of '{adaface_ckpt_path}'.")

    ckpt_subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
    ckpt_subj_basis_generator.N_ID = self.num_id_vecs
    # Since we directly use the subject basis generator object from the ckpt,
    # fixing the number of static image suffix embeddings is much simpler.
    # Otherwise if we want to load the subject basis generator from its state_dict,
    # things are more complicated, see embedding manager's load().
    ckpt_subj_basis_generator.N_SFX = self.num_static_img_suffix_embs
    # obj_proj_in and pos_embs are for non-faces. So they are useless for human faces.
    ckpt_subj_basis_generator.obj_proj_in = None
    ckpt_subj_basis_generator.pos_embs = None
    # Handle differences in num_static_img_suffix_embs between the current model and the ckpt.
    ckpt_subj_basis_generator.initialize_static_img_suffix_embs(
        self.num_static_img_suffix_embs, img_prompt_dim=self.output_dim)
    # Fix missing variables in old ckpt.
    ckpt_subj_basis_generator.patch_old_subj_basis_generator_ckpt()

    # Extend the current generator with the ckpt's multipliers (no perturbation)
    # before copying the ckpt weights into it.
    self.subj_basis_generator.extend_prompt2token_proj_attention(
        ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
    ret = self.subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict(), strict=False)
    print(f"{adaface_ckpt_path}: subject basis generator loaded for '{self.name}'.")
    print(repr(ckpt_subj_basis_generator))

    if ret is not None and len(ret.missing_keys) > 0:
        print(f"Missing keys: {ret.missing_keys}")
    if ret is not None and len(ret.unexpected_keys) > 0:
        print(f"Unexpected keys: {ret.unexpected_keys}")

    # extend_prompt2token_proj_attention_multiplier is an integer >= 1.
    # TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
    # If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
    # extend subj_basis_generator again.
    if self.extend_prompt2token_proj_attention_multiplier > 1:
        # During this extension, the added noise does change the extra copies of attention weights,
        # since they are not in the ckpt.
        # During training, prompt2token_proj_ext_attention_perturb_ratio == 0.1.
        # During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
        self.subj_basis_generator.extend_prompt2token_proj_attention(
            None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
            perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)

    self.subj_basis_generator.freeze_prompt2token_proj()
@torch.no_grad()
def get_clip_neg_features(self, BS):
    """Return the CLIP features of an all-zero image, repeated BS times along dim 0.

    The features are computed once with self.clip_image_encoder (second-to-last
    hidden state) and cached in self.clip_neg_features for later calls.
    """
    cached = self.clip_neg_features
    if cached is None:
        # The negative image prompt uses an all-zero [1, 3, 224, 224] image.
        blank_pixel_values = torch.zeros([1, 3, 224, 224],
                                         device=self.clip_image_encoder.device,
                                         dtype=self.dtype)
        encoder_out = self.clip_image_encoder(blank_pixel_values, attn_mask=None,
                                              output_hidden_states=True)
        cached = encoder_out.hidden_states[-2]
        self.clip_neg_features = cached

    return cached.repeat(BS, 1, 1)
# image_objs: a list of np array / tensor / Image objects of different sizes [Hi, Wi].
# If image_objs is a list of tensors, then each tensor should be [3, Hi, Wi].
# If image_objs is None, then image_paths should be provided,
# and image_objs will be loaded from image_paths.
# fg_masks: None, or a list of [Hi, Wi].
def extract_init_id_embeds_from_images(self, image_objs, image_paths, fg_masks=None,
                                       size=(512, 512), calc_avg=False,
                                       skip_non_faces=True, return_clip_embs=None,
                                       do_contrast_clip_embs_on_bg_features=None,
                                       verbose=False):
    """Extract face ID embeddings (and optionally CLIP fg/bg features) from images.

    For each image the largest detected face is used. Images without a face are
    either skipped (skip_non_faces=True) or replaced by a random ID embedding.
    CLIP pixel values and fg masks are kept only for the images whose ID
    embedding is kept, so all returned batches stay aligned.

    return_clip_embs / do_contrast_clip_embs_on_bg_features default to
    self.use_clip_embs / self.do_contrast_clip_embs_on_bg_features when None.

    Returns:
        (faceless_img_count, id_embs, clip_fgbg_features).
        id_embs and clip_fgbg_features are None if no ID embedding was kept;
        clip_fgbg_features is also None if return_clip_embs is False.
        With calc_avg=True both are averaged over the batch (batch size 1).
    """
    def bbox_area(face):
        # bbox is indexed as [x1, y1, x2, y2], as in the original sort key.
        bbox = face['bbox']
        return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

    # If return_clip_embs or do_contrast_clip_embs_on_bg_features is not provided,
    # then use their default values.
    if return_clip_embs is None:
        return_clip_embs = self.use_clip_embs
    if do_contrast_clip_embs_on_bg_features is None:
        do_contrast_clip_embs_on_bg_features = self.do_contrast_clip_embs_on_bg_features

    # clip_image_encoder should be already put on GPU.
    # So its .device is the device of its parameters.
    device = self.clip_image_encoder.device

    image_pixel_values = []
    all_id_embs = []
    # Indices (into image_objs) of the images whose ID embedding is kept.
    kept_indices = []
    faceless_img_count = 0

    if image_objs is None and image_paths is not None:
        image_objs = [Image.open(image_path) for image_path in image_paths]
        if len(image_paths) > 0:
            print(f'Loaded {len(image_objs)} images from {image_paths[0]}...')

    # image_objs could be a batch of images that have been collated into a tensor or np array.
    # image_objs can also be a list of images.
    # The code below that processes them one by one can be applied in both cases.
    for idx, image_obj in enumerate(image_objs):
        clip_image_pixel_values = None
        if return_clip_embs:
            # clip_preprocessor standardizes images of different sizes to 224*224.
            clip_image_pixel_values = self.clip_preprocessor(images=image_obj, return_tensors="pt").pixel_values

        # Convert tensor [3, H, W] to numpy array [H, W, 3].
        if isinstance(image_obj, torch.Tensor):
            image_obj = image_obj.cpu().numpy().transpose(1, 2, 0)
        if isinstance(image_obj, np.ndarray):
            image_obj = Image.fromarray(image_obj)
        # Pad to square, then resize to `size`. The scheme is Image.NEAREST,
        # to be consistent with PersonalizedBase dataset class.
        image_obj, _, _ = pad_image_obj_to_square(image_obj)
        image_np = np.array(image_obj.resize(size, Image.NEAREST))
        faces = self.face_app.get(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))

        if len(faces) > 0:
            # Only use the face with the largest bounding-box area.
            largest_face = sorted(faces, key=bbox_area)[-1]
            # id_emb: [512,]
            id_emb = torch.from_numpy(largest_face.normed_embedding)
        else:
            faceless_img_count += 1
            # image_paths may be None when image_objs are passed in directly.
            if image_paths is not None and idx < len(image_paths):
                image_desc = image_paths[idx]
            else:
                image_desc = f'image #{idx}'
            print(f'No face detected in {image_desc}.', end=' ')
            if skip_non_faces:
                print('Skip.')
                continue
            print('Replace with random face embedding.')
            # During training, use a random tensor as the face embedding.
            id_emb = torch.randn(512)

        all_id_embs.append(id_emb)
        kept_indices.append(idx)
        if return_clip_embs:
            image_pixel_values.append(clip_image_pixel_values)

    if verbose:
        print(f'{len(all_id_embs)} face images identified, {faceless_img_count} faceless images.')

    # No face is detected in the input images.
    if len(all_id_embs) == 0:
        return faceless_img_count, None, None

    # all_id_embs: [BS, 512].
    # NOTE(review): float16 is hard-coded here and below, while
    # get_clip_neg_features() uses self.dtype -- confirm for non-fp16 use.
    all_id_embs = torch.stack(all_id_embs, dim=0).to(device=device, dtype=torch.float16)

    if return_clip_embs:
        # image_pixel_values: [BS, 3, 224, 224]
        image_pixel_values = torch.cat(image_pixel_values, dim=0)
        image_pixel_values = image_pixel_values.to(device=device, dtype=torch.float16)

        if fg_masks is not None:
            assert len(fg_masks) == len(image_objs)
            # Drop the masks of skipped (faceless) images to stay aligned with
            # image_pixel_values.
            if len(kept_indices) != len(image_objs):
                if isinstance(fg_masks, (list, tuple)):
                    fg_masks = [fg_masks[i] for i in kept_indices]
                else:
                    fg_masks = fg_masks[kept_indices]

            # fg_masks is a list of masks.
            if isinstance(fg_masks, (list, tuple)):
                fg_masks2 = []
                for fg_mask in fg_masks:
                    # fg_mask: [Hi, Wi]
                    # BUG: clip_preprocessor will do central crop on images. But fg_mask is not central cropped.
                    # If the ref image is not square, then the fg_mask will not match the image.
                    # TODO: crop fg_mask and images to square before calling extract_init_id_embeds_from_images().
                    # fg_mask2: [Hi, Wi] -> [1, 1, 224, 224]
                    fg_mask2 = torch.tensor(fg_mask, device=device, dtype=torch.float16).unsqueeze(0).unsqueeze(0)
                    fg_mask2 = F.interpolate(fg_mask2, size=image_pixel_values.shape[-2:], mode='bilinear', align_corners=False)
                    fg_masks2.append(fg_mask2)
                # fg_masks2: [BS, 224, 224]
                fg_masks2 = torch.cat(fg_masks2, dim=0).squeeze(1)
            else:
                # fg_masks is a collated batch of masks.
                fg_masks2 = fg_masks.to(device=device, dtype=torch.float16).unsqueeze(1)
                # F.interpolate() always return a copy, even if scale_factor=1. So we don't need to clone fg_masks2.
                fg_masks2 = F.interpolate(fg_masks2, size=image_pixel_values.shape[-2:], mode='bilinear', align_corners=False)
                fg_masks2 = fg_masks2.squeeze(1)
        else:
            # fg_mask2: [BS, 224, 224].
            fg_masks2 = torch.ones_like(image_pixel_values[:, 0, :, :], device=device, dtype=torch.float16)

        clip_neg_features = self.get_clip_neg_features(BS=image_pixel_values.shape[0])

        with torch.no_grad():
            # image_fg_features: [BS, 257, 1280]. 257: 16*16 (patch_embeds) + 1 (class_embeds).
            image_fg_dict = self.clip_image_encoder(image_pixel_values, attn_mask=fg_masks2, output_hidden_states=True)
            # attn_mask: [BS, 1, 257]
            image_fg_features = image_fg_dict.hidden_states[-2]
            if image_fg_dict.attn_mask is not None:
                image_fg_features = image_fg_features * image_fg_dict.attn_mask

            # A negative mask is used to extract the background features.
            # If fg_masks is None, then fg_masks2 is all ones, and bg masks is all zeros.
            # Therefore, all pixels are masked. The extracted image_bg_features will be
            # meaningless in this case.
            image_bg_dict = self.clip_image_encoder(image_pixel_values, attn_mask=1-fg_masks2, output_hidden_states=True)
            image_bg_features = image_bg_dict.hidden_states[-2]
            # Subtract the feature bias (null features) from the bg features, to highlight the useful bg features.
            if do_contrast_clip_embs_on_bg_features:
                image_bg_features = image_bg_features - clip_neg_features
            if image_bg_dict.attn_mask is not None:
                image_bg_features = image_bg_features * image_bg_dict.attn_mask

        # clip_fgbg_features: [BS, 514, 1280]. 514 = 257*2.
        clip_fgbg_features = torch.cat([image_fg_features, image_bg_features], dim=1)
    else:
        clip_fgbg_features = None

    if not calc_avg:
        # Don't do average of all_id_embs.
        return faceless_img_count, all_id_embs, clip_fgbg_features

    if return_clip_embs:
        # clip_fgbg_features: [BS, 514, 1280] -> [1, 514, 1280].
        clip_fgbg_features = clip_fgbg_features.mean(dim=0, keepdim=True)

    debug = False
    if debug:
        print(image_paths)
        calc_stats('all_id_embs', all_id_embs)
        # Compute pairwise similarities of the embeddings.
        all_id_embs = F.normalize(all_id_embs, p=2, dim=1)
        pairwise_sim = torch.matmul(all_id_embs, all_id_embs.t())
        print('pairwise_sim:', pairwise_sim)
        top_dir = os.path.dirname(image_paths[0])
        mean_emb_path = os.path.join(top_dir, "mean_emb.pt")
        if os.path.exists(mean_emb_path):
            mean_emb = torch.load(mean_emb_path)
            sim_to_mean = torch.matmul(all_id_embs, mean_emb.t())
            print('sim_to_mean:', sim_to_mean)

    # all_id_embs: [BS, 512] -> [1, 512].
    id_embs = all_id_embs.mean(dim=0, keepdim=True)
    # Without normalization, id_embs.norm(dim=1) is ~0.9. So normalization doesn't have much effect.
    id_embs = F.normalize(id_embs, p=2, dim=-1)
    return faceless_img_count, id_embs, clip_fgbg_features
# Derived classes must implement this mapping.
# We don't plan to fine-tune the ID2ImgPrompt module. So disable the gradient computation.
def map_init_id_to_img_prompt_embs(self, init_id_embs,
                                   clip_features=None,
                                   called_for_neg_img_prompt=False):
    """Map initial face ID embeddings to image prompt embeddings (abstract here)."""
    raise NotImplementedError()
# If init_id_embs/pre_clip_features is provided, then use the provided face embeddings.
# Otherwise, if image_paths/image_objs are provided, extract face embeddings from the images.
# Otherwise, we generate random face embeddings [id_batch_size, 512].
def get_img_prompt_embs(self, init_id_embs, pre_clip_features, image_paths, image_objs,
                        id_batch_size,
                        skip_non_faces=True,
                        avg_at_stage=None,      # id_emb, img_prompt_emb, or None.
                        perturb_at_stage=None,  # id_emb, img_prompt_emb, or None.
                        perturb_std=0.0,
                        verbose=False):
    """Compute image prompt embeddings from ID embeddings, images, or random IDs.

    Returns:
        (face_image_count, faceid_embeds, pos_prompt_embs, neg_prompt_embs).
        face_image_count counts images with a detected face and is 0 when the
        embeddings are provided or random. neg_prompt_embs is None unless
        self.gen_neg_img_prompt is set. If embeddings were to be extracted
        from images but none was obtained, the last three items are None.
    """
    face_image_count = 0
    device = self.clip_image_encoder.device
    clip_neg_features = self.get_clip_neg_features(BS=id_batch_size)

    if init_id_embs is not None:
        faceid_embeds_from_images = False
        # Use the provided init_id_embs as faceid_embeds.
        faceid_embeds = init_id_embs
        clip_fgbg_features = pre_clip_features

        # A single provided embedding is broadcast to the whole batch.
        if faceid_embeds.shape[0] == 1:
            faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)
            if clip_fgbg_features is not None:
                clip_fgbg_features = clip_fgbg_features.repeat(id_batch_size, 1, 1)
    elif image_paths is None and image_objs is None:
        # Input images are not provided. Generate random face embeddings.
        faceid_embeds_from_images = False
        # Use random face embeddings as faceid_embeds. [BS, 512].
        faceid_embeds = torch.randn(id_batch_size, 512).to(device=device, dtype=torch.float16)
        # Only ConsistentID_ID2AdaPrompt will use clip_fgbg_features and clip_neg_features.
        # Experiments show that using random clip features yields much better images than using zeros.
        clip_fgbg_features = torch.randn(id_batch_size, 514, 1280).to(device=device, dtype=torch.float16) \
                                if self.use_clip_embs else None
    else:
        # Extract face ID embeddings and CLIP features from the images.
        faceid_embeds_from_images = True
        faceless_img_count, faceid_embeds, clip_fgbg_features \
            = self.extract_init_id_embeds_from_images(
                image_objs, image_paths=image_paths, size=(512, 512),
                calc_avg=(avg_at_stage == 'id_emb'),
                skip_non_faces=skip_non_faces,
                verbose=verbose)

        num_images = len(image_paths) if image_paths is not None else len(image_objs)
        face_image_count = num_images - faceless_img_count

    # faceid_embeds is None only when it was to be extracted from images and
    # no embedding was obtained (e.g. all images were faceless and skipped).
    # This check must precede any use of faceid_embeds.
    if faceid_embeds is None:
        return face_image_count, None, None, None

    if perturb_at_stage == 'id_emb' and perturb_std > 0:
        # If id_batch_size > 1, after adding noises, the id_batch_size embeddings will be different.
        faceid_embeds = perturb_tensor(faceid_embeds, perturb_std, perturb_std_is_relative=True, keep_norm=True)
        uses_clip_features = self.name == 'consistentID' or self.name == 'jointIDs'
        # clip_fgbg_features may be None (e.g. provided ID embeddings without
        # pre_clip_features); there is nothing to perturb then.
        if uses_clip_features and clip_fgbg_features is not None:
            clip_fgbg_features = perturb_tensor(clip_fgbg_features, perturb_std, perturb_std_is_relative=True, keep_norm=True)

    faceid_embeds = F.normalize(faceid_embeds, p=2, dim=-1)

    # pos_prompt_embs, neg_prompt_embs: [BS, 77, 768] or [BS, 22, 768].
    with torch.no_grad():
        pos_prompt_embs = \
            self.map_init_id_to_img_prompt_embs(faceid_embeds, clip_fgbg_features,
                                                called_for_neg_img_prompt=False)

    if avg_at_stage == 'img_prompt_emb':
        pos_prompt_embs = pos_prompt_embs.mean(dim=0, keepdim=True)
        faceid_embeds = faceid_embeds.mean(dim=0, keepdim=True)
        if clip_fgbg_features is not None:
            clip_fgbg_features = clip_fgbg_features.mean(dim=0, keepdim=True)

    if perturb_at_stage == 'img_prompt_emb' and perturb_std > 0:
        # NOTE: for simplicity, pos_prompt_embs and pos_core_prompt_emb are perturbed independently.
        # This could cause inconsistency between pos_prompt_embs and pos_core_prompt_emb.
        # But in practice, unless we use both pos_prompt_embs and pos_core_prompt_emb
        # this is not an issue. But we rarely use pos_prompt_embs and pos_core_prompt_emb together.
        pos_prompt_embs = perturb_tensor(pos_prompt_embs, perturb_std, perturb_std_is_relative=True, keep_norm=True)

    # If faceid_embeds_from_images, and the prompt embeddings are already averaged, then
    # we assume all images are from the same subject, and the batch dim of faceid_embeds is 1.
    # So we need to repeat faceid_embeds.
    if faceid_embeds_from_images and avg_at_stage is not None:
        faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)
        pos_prompt_embs = pos_prompt_embs.repeat(id_batch_size, 1, 1)
        if clip_fgbg_features is not None:
            clip_fgbg_features = clip_fgbg_features.repeat(id_batch_size, 1, 1)

    if not self.gen_neg_img_prompt:
        return face_image_count, faceid_embeds, pos_prompt_embs, None

    # Never perturb the negative prompt embeddings.
    with torch.no_grad():
        neg_prompt_embs = \
            self.map_init_id_to_img_prompt_embs(torch.zeros_like(faceid_embeds),
                                                clip_neg_features,
                                                called_for_neg_img_prompt=True)

    return face_image_count, faceid_embeds, pos_prompt_embs, neg_prompt_embs
# get_batched_img_prompt_embs() wraps get_img_prompt_embs() for batched training.
# NOTE: it should only be called during training.
# If init_id_embs is None, random face embeddings [BS, 512] are generated.
def get_batched_img_prompt_embs(self, batch_size, init_id_embs, pre_clip_features):
    """Call get_img_prompt_embs() with the settings used for batched training.

    Returns whatever get_img_prompt_embs() returns.
    """
    training_settings = dict(
        # No images are passed; embeddings are either provided or random.
        image_paths=None,
        image_objs=None,
        # During training, don't skip non-face images. Instead,
        # skip_non_faces=False replaces them by random face embeddings.
        skip_non_faces=False,
        # The instances are assumed to belong to different subjects,
        # so the embeddings are never averaged across instances.
        avg_at_stage=None,
        verbose=False,
    )
    # pos_prompt_embs, neg_prompt_embs are generated without gradient computation.
    # So we don't need to worry that the teacher model weights are updated.
    return self.get_img_prompt_embs(init_id_embs=init_id_embs,
                                    pre_clip_features=pre_clip_features,
                                    id_batch_size=batch_size,
                                    **training_settings)
# If img_prompt_embs is provided, we use it directly.
# Otherwise, if face_id_embs is provided, we use it to generate img_prompt_embs.
# Otherwise, if image_paths is provided, we extract face_id_embs from the images.
# image_paths: a list of image paths.
# avg_at_stage: 'id_emb', 'img_prompt_emb', or None.
# avg_at_stage == ada_prompt_emb usually produces the worst results.
# avg_at_stage == id_emb is slightly better than img_prompt_emb, but sometimes img_prompt_emb is better.
# p_dropout and return_zero_embs_for_dropped_encoders are only used by Joint_FaceID2AdaPrompt.
def generate_adaface_embeddings(self, image_paths, face_id_embs=None, img_prompt_embs=None,
                                p_dropout=0,
                                return_zero_embs_for_dropped_encoders=True,
                                avg_at_stage='id_emb',  # id_emb, img_prompt_emb, or None.
                                perturb_at_stage=None,  # id_emb, img_prompt_emb, or None.
                                perturb_std=0, enable_static_img_suffix_embs=False):
    """Generate AdaFace subject embeddings with the subject basis generator.

    Returns:
        The AdaFace subject embeddings; the leading batch dim is squeezed away
        when averaging is enabled. Returns None if image prompt embeddings had
        to be computed but none could be obtained (no usable face).
    """
    if avg_at_stage is None or avg_at_stage.lower() == 'none':
        img_prompt_avg_at_stage = None
    else:
        img_prompt_avg_at_stage = avg_at_stage

    if img_prompt_embs is None:
        if img_prompt_avg_at_stage is not None:
            # Do averaging. So id_batch_size becomes 1 after averaging.
            id_batch_size = 1
        elif face_id_embs is not None:
            id_batch_size = face_id_embs.shape[0]
        elif image_paths is not None:
            id_batch_size = len(image_paths)
        else:
            id_batch_size = 1

        # faceid_embeds: [BS, 512] is a batch of extracted face analysis embeddings. NOT used later.
        # NOTE: If face_id_embs, image_paths and image_objs are all None,
        # then get_img_prompt_embs() generates random faceid_embeds/img_prompt_embs,
        # and each instance is different.
        # Otherwise, if face_id_embs is provided, it's used.
        # If not, image_paths/image_objs are used to extract face embeddings.
        # img_prompt_embs is in the image prompt space.
        # img_prompt_embs: [BS, 16/4, 768].
        face_image_count, faceid_embeds, img_prompt_embs, neg_img_prompt_embs \
            = self.get_img_prompt_embs(
                init_id_embs=face_id_embs,
                pre_clip_features=None,
                image_paths=image_paths, image_objs=None,
                id_batch_size=id_batch_size,
                perturb_at_stage=perturb_at_stage,
                perturb_std=perturb_std,
                avg_at_stage=img_prompt_avg_at_stage,
                verbose=True)

        # face_image_count is also 0 when face_id_embs is provided or random IDs
        # are used, so test the embeddings themselves: they are None only when
        # no usable face was found in the images.
        if img_prompt_embs is None:
            return None

        # No matter whether avg_at_stage is id_emb or img_prompt_emb, we average img_prompt_embs.
        if img_prompt_avg_at_stage is not None:
            # img_prompt_embs: [BS, 16/4, 768] -> [1, 16/4, 768].
            img_prompt_embs = img_prompt_embs.mean(dim=0, keepdim=True)

    # adaface_subj_embs: [BS, 16/4, 768].
    adaface_subj_embs = \
        self.subj_basis_generator(img_prompt_embs, clip_features=None, raw_id_embs=None,
                                  out_id_embs_cfg_scale=self.out_id_embs_cfg_scale,
                                  is_face=True,
                                  enable_static_img_suffix_embs=enable_static_img_suffix_embs)
    # During training, img_prompt_avg_at_stage is None, and BS >= 1.
    # During inference, img_prompt_avg_at_stage is 'id_emb' or 'img_prompt_emb', and BS == 1.
    if img_prompt_avg_at_stage is not None:
        # adaface_subj_embs: [1, 16, 768] -> [16, 768]
        adaface_subj_embs = adaface_subj_embs.squeeze(0)

    return adaface_subj_embs
class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
def __init__(self, *args, **kwargs):
    """Build the Arc2Face-based encoder: CLIP image encoder, face analysis app,
    Arc2Face text encoder, tokenizer and the subject basis generator."""
    # name and num_id_vecs are read by the parent ctor, so set them first.
    self.name = 'arc2face'
    self.num_id_vecs = 16
    super().__init__(*args, **kwargs)

    clip_repo = 'openai/clip-vit-large-patch14'
    clip_image_encoder = CLIPVisionModelWithMask.from_pretrained(clip_repo)
    self.clip_image_encoder = clip_image_encoder
    self.clip_preprocessor = CLIPImageProcessor.from_pretrained(clip_repo)
    clip_image_encoder.eval()
    if self.dtype == torch.float16:
        clip_image_encoder.half()
    print('CLIP image encoder loaded.')

    # The antelopev2 FaceAnalysis app holds these models: landmark_3d_68,
    # landmark_2d_106, detection (RetinaFace), genderage, recognition (ArcFaceONNX).
    # Use the same model as ID2AdaPrompt does.
    # FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
    # Note there's a second "model" in the path.
    # Note DON'T use CUDAExecutionProvider, as it will hang DDP training.
    # Seems when loading insightface onto the GPU, it will only reside on the first GPU.
    # Then the process on the second GPU has issue to communicate with insightface
    # on the first GPU, causing hanging.
    face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
                            providers=['CPUExecutionProvider'])
    face_app.prepare(ctx_id=0, det_size=(512, 512))
    self.face_app = face_app
    print('Face encoder loaded on CPU.')

    self.text_to_image_prompt_encoder = CLIPTextModelWrapper.from_pretrained(
        'models/arc2face', subfolder="encoder", torch_dtype=self.dtype)
    self.tokenizer = CLIPTokenizer.from_pretrained(clip_repo)

    # -1 asks for this encoder type's default scale.
    if self.out_id_embs_cfg_scale == -1:
        self.out_id_embs_cfg_scale = 1

    #### Arc2Face pipeline specific configs ####
    # self.num_static_img_suffix_embs is initialized in the parent class.
    self.gen_neg_img_prompt = False
    # bg CLIP features are used by the bg subject basis generator.
    self.use_clip_embs = True
    self.do_contrast_clip_embs_on_bg_features = True
    self.id_img_prompt_max_length = 22
    self.clip_embedding_dim = 1024

    # Needs num_id_vecs, num_static_img_suffix_embs and clip_embedding_dim,
    # all of which are set by now.
    self.init_subj_basis_generator()
    ckpt_path = self.adaface_ckpt_path
    if ckpt_path is not None:
        self.load_adaface_ckpt(ckpt_path)

    print(f"{self.name} ada prompt encoder initialized, "
          f"ID vecs: {self.num_id_vecs}, static suffix: {self.num_static_img_suffix_embs}.")
# clip_features and called_for_neg_img_prompt are ignored by Arc2Face_ID2AdaPrompt.
def map_init_id_to_img_prompt_embs(self, init_id_embs,
                                   clip_features=None,
                                   called_for_neg_img_prompt=False):
    '''
    Map face ID embeddings to Arc2Face image prompt embeddings.

    self.text_to_image_prompt_encoder: arc2face_models.py:CLIPTextModelWrapper instance.
    init_id_embs: (N, 512) normalized Face ID embeddings.
    Returns prompt embeddings 4:20 of the encoded prompt: [N, 16, 768].
    '''
    text_encoder = self.text_to_image_prompt_encoder

    # arcface_token_id: 1014
    arcface_token_id = self.tokenizer.encode("id", add_special_tokens=False)[0]

    # Tokenizing is fast, so the input_ids are not cached.
    # In Arc2Face_ID2AdaPrompt, id_img_prompt_max_length is 22; Arc2Face's image
    # prompt is meaningless in tokens other than the ID tokens.
    tokenized = self.tokenizer(
        "photo of a id person",
        truncation=True,
        padding="max_length",
        max_length=self.id_img_prompt_max_length,
        return_tensors="pt",
    )
    # input_ids: [1, 22] -> [N, 22].
    input_ids = tokenized.input_ids.to(init_id_embs.device)
    input_ids = input_ids.repeat(len(init_id_embs), 1)

    init_id_embs = init_id_embs.to(self.dtype)
    # Zero-pad the ID embeddings to the encoder's hidden size: [N, 512] -> [N, 768].
    pad_width = text_encoder.config.hidden_size - init_id_embs.shape[-1]
    face_embs_padded = F.pad(init_id_embs, (0, pad_width), "constant", 0)

    # The encoder is called twice: first only to get the token embeddings
    # (the shallowest mapping), then for the ordinary CLIP text encoding pass
    # with the 'id' token embedding replaced by the padded face embedding.
    token_embs = text_encoder(input_ids=input_ids, return_token_embs=True)
    id_token_positions = input_ids == arcface_token_id
    token_embs[id_token_positions] = face_embs_padded

    encoder_outputs = text_encoder(
        input_ids=input_ids,
        input_token_embs=token_embs,
        return_token_embs=False
    )
    # Cast the encoder output to self.dtype.
    prompt_embeds = encoder_outputs[0].to(self.dtype)

    # token 4: 'id' in "photo of a id person".
    # 4:20 are the most important 16 embeddings that contain the subject's identity.
    # [N, 22, 768] -> [N, 16, 768]
    return prompt_embeds[:, 4:20]
def get_id2img_learnable_modules(self):
    """Return the learnable ID2ImgPrompt modules: the Arc2Face text encoder only."""
    learnable_modules = [self.text_to_image_prompt_encoder]
    return learnable_modules
# ConsistentID_ID2AdaPrompt is mostly a wrapper of ConsistentIDPipeline. Note that it
# still inherits from FaceID2AdaPrompt, which is an nn.Module.
class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
def __init__(self, pipe=None, base_model_path="models/sd15-dste8-vae.safetensors",
             *args, **kwargs):
    """Build the ConsistentID-based encoder around a ConsistentIDPipeline.

    pipe: an existing ConsistentIDPipeline to share components with, or None
          to create one from base_model_path.
    """
    # name and num_id_vecs are read by the parent ctor, so set them first.
    self.name = 'consistentID'
    self.num_id_vecs = 4
    super().__init__(*args, **kwargs)

    if pipe is None:
        # The base_model_path is kind of arbitrary, as the UNet and VAE in the model
        # are not used and are released right below.
        # Only the consistentID modules and bise_net are used.
        assert base_model_path is not None, "base_model_path should be provided."
        pipe = ConsistentIDPipeline.from_single_file(base_model_path)
        pipe.load_ConsistentID_model(
            consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
            bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth")
        pipe.to(dtype=self.dtype)
        # Since the passed-in pipe is None, this should be called during inference,
        # when the teacher ConsistentIDPipeline is not initialized.
        # Therefore, we release the UNet and VAE to save memory.
        pipe.release_components(["unet", "vae"])
    # Otherwise, the pipeline is shared with the teacher, so nothing is released.

    self.pipe = pipe
    self.face_app = pipe.face_app
    # ConsistentID uses 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K'.
    clip_image_encoder = patch_clip_image_encoder_with_mask(pipe.clip_encoder)
    self.clip_image_encoder = clip_image_encoder
    self.clip_preprocessor = pipe.clip_preprocessor
    self.text_to_image_prompt_encoder = pipe.text_encoder
    self.tokenizer = pipe.tokenizer
    image_proj_model = pipe.image_proj_model
    self.image_proj_model = image_proj_model

    clip_image_encoder.eval()
    image_proj_model.eval()
    if self.dtype == torch.float16:
        clip_image_encoder.half()
        image_proj_model.half()

    # -1 asks for this encoder type's default scale.
    if self.out_id_embs_cfg_scale == -1:
        self.out_id_embs_cfg_scale = 6

    #### ConsistentID pipeline specific configs ####
    # self.num_static_img_suffix_embs is initialized in the parent class.
    self.gen_neg_img_prompt = True
    self.use_clip_embs = True
    self.do_contrast_clip_embs_on_bg_features = True
    self.clip_embedding_dim = 1280
    self.s_scale = 1.0
    self.shortcut = False

    # Needs num_id_vecs, num_static_img_suffix_embs and clip_embedding_dim,
    # all of which are set by now.
    self.init_subj_basis_generator()
    ckpt_path = self.adaface_ckpt_path
    if ckpt_path is not None:
        self.load_adaface_ckpt(ckpt_path)

    print(f"{self.name} ada prompt encoder initialized, "
          f"ID vecs: {self.num_id_vecs}, static suffix: {self.num_static_img_suffix_embs}.")
def map_init_id_to_img_prompt_embs(self, init_id_embs,
                                   clip_features=None,
                                   called_for_neg_img_prompt=False):
    """Map face ID embeddings and CLIP features to image prompt embeddings.

    Args:
        init_id_embs: face ID embeddings. Must not be None. They are replaced
            by zeros when called_for_neg_img_prompt is True.
        clip_features: CLIP image features. Required, despite the None default
            kept for interface compatibility. For positive prompts this is
            documented as [BS, 514, 1280], i.e. foreground and background
            features concatenated at dim=1; only the foreground half is used.
            For negative prompts it is used as is.
        called_for_neg_img_prompt: whether clip_features are the negative
            image features.

    Returns:
        The output of self.image_proj_model(faceid_embeds, clip_image_embeds, ...).

    Raises:
        AssertionError: if init_id_embs is None.
        ValueError: if clip_features is None, or if the batch sizes of the
            ID embeddings and the CLIP embeddings differ.
    """
    assert init_id_embs is not None, "init_id_embs should be provided."
    if clip_features is None:
        raise ValueError("clip_features should be provided.")

    init_id_embs  = init_id_embs.to(self.dtype)
    clip_features = clip_features.to(self.dtype)

    if not called_for_neg_img_prompt:
        # clip_features: [BS, 514, 1280].
        # clip_features is provided when the function is called within
        # ConsistentID_ID2AdaPrompt:extract_init_id_embeds_from_images(), which is
        # image_fg_features and image_bg_features concatenated at dim=1.
        # image_bg_features is not used in ConsistentID_ID2AdaPrompt.
        # clip_image_embeds: [BS, 257, 1280].
        clip_image_embeds, _ = clip_features.chunk(2, dim=1)
    else:
        # clip_features is the negative image features. So we don't need to split it.
        clip_image_embeds = clip_features
        init_id_embs = torch.zeros_like(init_id_embs)

    faceid_embeds = init_id_embs
    if faceid_embeds.shape[0] != clip_image_embeds.shape[0]:
        raise ValueError(
            f"Batch size mismatch: faceid_embeds {tuple(faceid_embeds.shape)} vs. "
            f"clip_image_embeds {tuple(clip_image_embeds.shape)}.")

    # image_proj_model maps 1280-dim OpenCLIP embeddings to 768-dim face prompt embeddings.
    # clip_image_embeds are used as queries to transform faceid_embeds.
    # faceid_embeds -> kv, clip_image_embeds -> q
    # Any exception raised by the projection model propagates to the caller
    # with its original traceback, instead of being trapped in a debugger.
    global_id_embeds = self.image_proj_model(faceid_embeds, clip_image_embeds,
                                             shortcut=self.shortcut, scale=self.s_scale)
    return global_id_embeds
def get_id2img_learnable_modules(self):
    """Return the list of ID-to-image-prompt modules exposed as learnable.

    For this encoder, the list holds only self.image_proj_model.
    """
    learnable_modules = [self.image_proj_model]
    return learnable_modules
# A wrapper for combining multiple FaceID2AdaPrompt instances.
class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
    """Run several FaceID2AdaPrompt sub-encoders behind a single interface.

    Per-encoder tensors are concatenated into joint tensors and split back
    with the bookkeeping lists built in __init__():
      * face_id_dims:         split sizes of joint ID embeddings along dim=1.
      * clip_embedding_dims:  split sizes of joint CLIP features along dim=2.
      * encoders_num_id_vecs: split sizes of joint image prompt embeddings along dim=1.
    """

    def __init__(self, adaface_encoder_types, adaface_ckpt_paths,
                 out_id_embs_cfg_scales=None, enabled_encoders=None,
                 *args, **kwargs):
        """Build one sub-encoder per entry of adaface_encoder_types.

        Args:
            adaface_encoder_types: non-empty sequence of encoder type names.
                Only 'arc2face' and 'consistentID' are known here.
            adaface_ckpt_paths: None, or anything accepted by load_adaface_ckpt().
            out_id_embs_cfg_scales: None, or one scale per sub-encoder.
                -1 asks the sub-encoder to use its own default scale.
            enabled_encoders: None (all sub-encoders enabled), or a collection
                of encoder type names to enable.
            *args, **kwargs: forwarded to the base class ctor and to every
                sub-encoder ctor.

        Raises:
            AssertionError: if adaface_encoder_types is empty.
            KeyError: if an encoder type is unknown.
            ValueError: if enabled_encoders disables all sub-encoders.
        """
        # NOTE(review): name and num_id_vecs are assigned before super().__init__(),
        # presumably because the base class ctor reads them. Keep this order.
        self.name = 'jointIDs'
        assert len(adaface_encoder_types) > 0, "adaface_encoder_types should not be empty."
        adaface_encoder_types2num_id_vecs = { 'arc2face': 16, 'consistentID': 4 }
        self.encoders_num_id_vecs = [ adaface_encoder_types2num_id_vecs[encoder_type]
                                      for encoder_type in adaface_encoder_types ]
        self.num_id_vecs = sum(self.encoders_num_id_vecs)
        super().__init__(*args, **kwargs)

        self.num_sub_encoders = len(adaface_encoder_types)
        self.id2ada_prompt_encoders = nn.ModuleList()
        self.encoders_num_static_img_suffix_embs = []

        # TODO: apply adaface_encoder_cfg_scales to influence the final prompt embeddings.
        # Now they are just placeholders.
        if out_id_embs_cfg_scales is None:
            # -1: each sub-encoder falls back to the default scale of its own type.
            self.out_id_embs_cfg_scales = [-1] * self.num_sub_encoders
        else:
            # Do not normalize the weights, and just use them as is.
            self.out_id_embs_cfg_scales = out_id_embs_cfg_scales

        # Note we don't pass the adaface_ckpt_paths to the sub-encoders, but instead,
        # we load them once and for all in self.load_adaface_ckpt().
        for i, encoder_type in enumerate(adaface_encoder_types):
            encoder_kwargs = dict(kwargs, out_id_embs_cfg_scale=self.out_id_embs_cfg_scales[i])
            if encoder_type == 'arc2face':
                encoder = Arc2Face_ID2AdaPrompt(*args, **encoder_kwargs)
            elif encoder_type == 'consistentID':
                encoder = ConsistentID_ID2AdaPrompt(*args, **encoder_kwargs)
            else:
                # Defensive only: unknown types already fail the lookup above.
                raise ValueError(f"Unknown adaface encoder type: '{encoder_type}'.")

            self.id2ada_prompt_encoders.append(encoder)
            self.encoders_num_static_img_suffix_embs.append(encoder.num_static_img_suffix_embs)

        self.num_static_img_suffix_embs = sum(self.encoders_num_static_img_suffix_embs)
        # No need to set gen_neg_img_prompt, use_clip_embs or
        # do_contrast_clip_embs_on_bg_features, as they are not accessed in this class,
        # but only in the sub-encoders.
        self.face_id_dims = [encoder.face_id_dim for encoder in self.id2ada_prompt_encoders]
        self.face_id_dim = sum(self.face_id_dims)
        # Different adaface encoders may have different clip_embedding_dim.
        # clip_embedding_dim is only used for bg subject basis generator.
        # Here we use the joint clip embeddings of both OpenAI CLIP and laion CLIP.
        # Therefore, the clip_embedding_dim is the sum of the clip_embedding_dims of all adaface encoders.
        self.clip_embedding_dims = [encoder.clip_embedding_dim for encoder in self.id2ada_prompt_encoders]
        self.clip_embedding_dim = sum(self.clip_embedding_dims)
        # The ctors of the sub-encoders have already initialized encoder.subj_basis_generator.
        # If subj_basis_generator expansion params are specified, they are equally applied to all adaface encoders.
        # This self.subj_basis_generator is not meant to be called as self.subj_basis_generator(), but instead,
        # it's used as a unified interface to save/load the subj_basis_generator of all adaface encoders.
        self.subj_basis_generator = \
            nn.ModuleList([encoder.subj_basis_generator
                           for encoder in self.id2ada_prompt_encoders])

        if adaface_ckpt_paths is not None:
            self.load_adaface_ckpt(adaface_ckpt_paths)

        print(f"{self.name} ada prompt encoder initialized with {self.num_sub_encoders} sub-encoders. "
              f"ID vecs: {self.num_id_vecs}, static suffix embs: {self.num_static_img_suffix_embs}.")

        # self.are_encoders_enabled is the global (per-instance) mask of enabled sub-encoders.
        if enabled_encoders is not None:
            self.are_encoders_enabled = \
                torch.tensor([encoder_type in enabled_encoders
                              for encoder_type in adaface_encoder_types])
            if not self.are_encoders_enabled.any():
                raise ValueError(f"All encoders are disabled by enabled_encoders={enabled_encoders}, "
                                 f"but at least one of {list(adaface_encoder_types)} should be enabled.")
            disabled_encoders = [ encoder_type for encoder_type, is_enabled
                                  in zip(adaface_encoder_types, self.are_encoders_enabled.tolist())
                                  if not is_enabled ]
            if disabled_encoders:
                print(f"{len(disabled_encoders)} encoders are disabled: {disabled_encoders}.")
        else:
            self.are_encoders_enabled = \
                torch.tensor([True] * self.num_sub_encoders)

    def _get_clip_device(self):
        """Return the device of the first sub-encoder's clip_image_encoder."""
        # clip_image_encoder should be already put on GPU.
        # So its .device is the device of its parameters.
        return self.id2ada_prompt_encoders[0].clip_image_encoder.device

    def load_adaface_ckpt(self, adaface_ckpt_paths):
        """Load the subj_basis_generators of the sub-encoders.

        Args:
            adaface_ckpt_paths: either a single path (str) to a joint ckpt that
                stores the subj_basis_generators of all sub-encoders, or a
                list/tuple/ListConfig with one ckpt path per sub-encoder.
                A one-element list is treated as a joint ckpt path when there
                are multiple sub-encoders.

        Raises:
            KeyError: if self.subject_string is not found in the joint ckpt.
            ValueError: if prompt2token_proj_attention_multipliers of a
                generator and its ckpt counterpart are inconsistent.
            TypeError: if adaface_ckpt_paths is neither a str nor a list-like.
        """
        list_types = (list, tuple, ListConfig)
        # If only one adaface ckpt path is provided, then we assume it's the ckpt of the Joint_FaceID2AdaPrompt,
        # so we dereference the list to get the actual path and load the subj_basis_generators of all adaface encoders.
        if isinstance(adaface_ckpt_paths, list_types) \
          and len(adaface_ckpt_paths) == 1 and self.num_sub_encoders > 1:
            adaface_ckpt_paths = adaface_ckpt_paths[0]

        if isinstance(adaface_ckpt_paths, list_types):
            # One ckpt per sub-encoder: delegate the loading to each sub-encoder.
            for i, ckpt_path in enumerate(adaface_ckpt_paths):
                self.id2ada_prompt_encoders[i].load_adaface_ckpt(ckpt_path)
            return

        if not isinstance(adaface_ckpt_paths, str):
            raise TypeError("adaface_ckpt_paths should be a str or a list of str, "
                            f"but got {type(adaface_ckpt_paths).__name__}.")

        # This is only applicable to newest ckpts of Joint_FaceID2AdaPrompt, where
        # the ckpt_subj_basis_generator is an nn.ModuleList of multiple subj_basis_generators.
        # Therefore, no need to patch missing variables.
        # NOTE(security): the ckpt stores pickled module objects (their methods are called below),
        # so torch.load() unpickles arbitrary code. Only load ckpts from trusted sources.
        ckpt = torch.load(adaface_ckpt_paths, map_location='cpu')
        string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
        if self.subject_string not in string_to_subj_basis_generator_dict:
            raise KeyError(f"Subject '{self.subject_string}' not found in the embedding manager.")

        ckpt_subj_basis_generators = string_to_subj_basis_generator_dict[self.subject_string]

        for i, subj_basis_generator in enumerate(self.subj_basis_generator):
            ckpt_subj_basis_generator = ckpt_subj_basis_generators[i]
            # Handle differences in num_static_img_suffix_embs between the current model and the ckpt.
            ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.encoders_num_static_img_suffix_embs[i],
                                                                        img_prompt_dim=self.output_dim)

            curr_multipliers = subj_basis_generator.prompt2token_proj_attention_multipliers
            ckpt_multipliers = ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers
            if curr_multipliers == [1] * 12:
                # The current generator is unextended: extend it to match the ckpt, without perturbation.
                subj_basis_generator.extend_prompt2token_proj_attention(
                    ckpt_multipliers, -1, -1, 1, perturb_std=0)
            elif curr_multipliers != ckpt_multipliers:
                raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")

            # Re-read the attributes, as the extension above is expected to update them.
            assert subj_basis_generator.prompt2token_proj_attention_multipliers \
                == ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
                "Inconsistent prompt2token_proj_attention_multipliers."

            subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())

            # extend_prompt2token_proj_attention_multiplier is an integer >= 1.
            # TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
            # If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
            # extend subj_basis_generator again.
            if self.extend_prompt2token_proj_attention_multiplier > 1:
                # During this extension, the added noise does change the extra copies of attention weights, since they are not in the ckpt.
                # During training,  prompt2token_proj_ext_attention_perturb_ratio == 0.1.
                # During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
                subj_basis_generator.extend_prompt2token_proj_attention(
                    None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
                    perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)

            subj_basis_generator.freeze_prompt2token_proj()

        print(f"{adaface_ckpt_paths}: {len(self.subj_basis_generator)} subj_basis_generators loaded for {self.name}.")

    def extract_init_id_embeds_from_images(self, *args, **kwargs):
        """Extract and concatenate the ID embeddings and CLIP features of all sub-encoders.

        All arguments are forwarded to each sub-encoder's
        extract_init_id_embeds_from_images().

        Returns:
            (total_faceless_img_count, all_id_embs, all_clip_fgbg_features), where
            the ID embeddings are concatenated along dim=1 and the CLIP features
            along dim=2. Outputs of sub-encoders that return None are replaced by
            zeros. If no sub-encoder returns ID embeddings, (0, None, None) is returned.

        Raises:
            ValueError: if the sub-encoders return inconsistent shapes, or if ID
                embeddings are extracted but no CLIP features are.
        """
        total_faceless_img_count = 0
        all_id_embs = []
        all_clip_fgbg_features = []
        id_embs_shape = None
        clip_fgbg_features_shape = None
        device = self._get_clip_device()

        for id2ada_prompt_encoder in self.id2ada_prompt_encoders:
            faceless_img_count, id_embs, clip_fgbg_features = \
                id2ada_prompt_encoder.extract_init_id_embeds_from_images(*args, **kwargs)
            total_faceless_img_count += faceless_img_count
            # id_embs: [BS, 512] or [1, 512] (if calc_avg == True), or None.
            # id_embs has the same shape across all id2ada_prompt_encoders.
            all_id_embs.append(id_embs)
            # clip_fgbg_features: [BS, 514, 1280/1024] or [1, 514, 1280/1024] (if calc_avg == True), or None.
            # clip_fgbg_features has the same shape except for the last dimension across all id2ada_prompt_encoders.
            all_clip_fgbg_features.append(clip_fgbg_features)
            if id_embs is not None:
                id_embs_shape = id_embs.shape
            if clip_fgbg_features is not None:
                clip_fgbg_features_shape = clip_fgbg_features.shape

        # If at least one face encoder detects faces, then return the embeddings.
        # Otherwise return None embeddings, before any zero-filling is attempted
        # (there is no reference shape to fill with).
        # It's possible that some face encoders detect faces, while others don't,
        # since different face encoders use different face detection models.
        if id_embs_shape is None:
            return 0, None, None
        if clip_fgbg_features_shape is None:
            raise ValueError("ID embeddings are extracted, but no clip features are extracted.")

        for i, clip_embedding_dim in enumerate(self.clip_embedding_dims):
            # As calc_avg is the same for all id2ada_prompt_encoders,
            # each id_embs and clip_fgbg_features should have the same shape, if they are not None.
            if all_id_embs[i] is None:
                all_id_embs[i] = torch.zeros(id_embs_shape, dtype=torch.float16, device=device)
            elif all_id_embs[i].shape != id_embs_shape:
                raise ValueError(f"Inconsistent ID embedding shapes: "
                                 f"{tuple(all_id_embs[i].shape)} vs. {tuple(id_embs_shape)}.")

            # Only the last dim of the clip features differs across sub-encoders.
            clip_fgbg_features_shape2 = torch.Size(clip_fgbg_features_shape[:-1] + (clip_embedding_dim,))
            if all_clip_fgbg_features[i] is None:
                all_clip_fgbg_features[i] = torch.zeros(clip_fgbg_features_shape2,
                                                        dtype=torch.float16, device=device)
            elif all_clip_fgbg_features[i].shape != clip_fgbg_features_shape2:
                raise ValueError(f"Inconsistent clip features shapes: "
                                 f"{tuple(all_clip_fgbg_features[i].shape)} vs. {tuple(clip_fgbg_features_shape2)}.")

        all_id_embs = torch.cat(all_id_embs, dim=1)
        # clip_fgbg_features: [BS, 514, 1280] or [BS, 514, 1024]. So we concatenate them along dim=2.
        all_clip_fgbg_features = torch.cat(all_clip_fgbg_features, dim=2)
        return total_faceless_img_count, all_id_embs, all_clip_fgbg_features

    def map_init_id_to_img_prompt_embs(self, init_id_embs,
                                       clip_features=None,
                                       called_for_neg_img_prompt=False):
        """Map joint ID embeddings / CLIP features to joint image prompt embeddings.

        Args:
            init_id_embs: joint ID embeddings, split along dim=1 by self.face_id_dims.
                Must not be None.
            clip_features: joint CLIP features, split along dim=2 by
                self.clip_embedding_dims. Must not be None, despite the default.
            called_for_neg_img_prompt: forwarded to each sub-encoder.

        Returns:
            The image prompt embeddings of all sub-encoders, concatenated along dim=1.

        Raises:
            ValueError: if init_id_embs or clip_features is None.
        """
        if init_id_embs is None or clip_features is None:
            raise ValueError("Both init_id_embs and clip_features should be provided.")

        # each id_embs and clip_fgbg_features should have the same shape.
        # If some of them were None, they have been replaced by zero embeddings.
        all_init_id_embs  = init_id_embs.split(self.face_id_dims, dim=1)
        all_clip_features = clip_features.split(self.clip_embedding_dims, dim=2)
        all_img_prompt_embs = [
            id2ada_prompt_encoder.map_init_id_to_img_prompt_embs(
                all_init_id_embs[i], clip_features=all_clip_features[i],
                called_for_neg_img_prompt=called_for_neg_img_prompt,
            )
            for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders)
        ]
        return torch.cat(all_img_prompt_embs, dim=1)

    # If init_id_embs/pre_clip_features is provided, then use the provided face embeddings.
    # Otherwise, if image_paths/image_objs are provided, extract face embeddings from the images.
    # Otherwise, we generate random face embeddings [id_batch_size, 512].
    def get_img_prompt_embs(self, init_id_embs, pre_clip_features, *args, **kwargs):
        """Get the joint face ID embeddings and positive/negative image prompt embeddings.

        Args:
            init_id_embs: None, or joint ID embeddings split along dim=1 by self.face_id_dims.
            pre_clip_features: None, or joint CLIP features split along dim=2 by
                self.clip_embedding_dims.
            *args, **kwargs: forwarded to each sub-encoder's get_img_prompt_embs().

        Returns:
            (face_image_count, all_faceid_embeds, all_pos_prompt_embs, all_neg_prompt_embs),
            each tensor concatenated along dim=1 across sub-encoders, with None
            outputs replaced by zeros. face_image_count is the maximum count among
            the sub-encoders. If no sub-encoder returns face ID embeddings,
            (0, None, None, None) is returned.

        Raises:
            ValueError: if the sub-encoders return face embeddings of different shapes.
        """
        face_image_counts = []
        all_faceid_embeds = []
        all_pos_prompt_embs = []
        all_neg_prompt_embs = []
        faceid_embeds_shape = None
        device = self._get_clip_device()

        # init_id_embs, pre_clip_features could be None. If they are not None,
        # we split them into individual vectors for each id2ada_prompt_encoder.
        if init_id_embs is not None:
            all_init_id_embs = init_id_embs.split(self.face_id_dims, dim=1)
        else:
            all_init_id_embs = [None] * self.num_sub_encoders
        if pre_clip_features is not None:
            all_pre_clip_features = pre_clip_features.split(self.clip_embedding_dims, dim=2)
        else:
            all_pre_clip_features = [None] * self.num_sub_encoders

        for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
            face_image_count, faceid_embeds, pos_prompt_embs, neg_prompt_embs = \
                id2ada_prompt_encoder.get_img_prompt_embs(all_init_id_embs[i], all_pre_clip_features[i],
                                                          *args, **kwargs)
            face_image_counts.append(face_image_count)
            all_faceid_embeds.append(faceid_embeds)
            all_pos_prompt_embs.append(pos_prompt_embs)
            all_neg_prompt_embs.append(neg_prompt_embs)
            # all faceid_embeds have the same shape across all id2ada_prompt_encoders.
            # But pos_prompt_embs and neg_prompt_embs may have different number of ID embeddings.
            if faceid_embeds is not None:
                faceid_embeds_shape = faceid_embeds.shape

        if faceid_embeds_shape is None:
            return 0, None, None, None

        # We take the maximum face_image_count among all adaface encoders.
        face_image_count = max(face_image_counts)
        # Take the batch size from the recorded shape, not from the last
        # sub-encoder's output, which may be None.
        BS = faceid_embeds_shape[0]

        for i, N_ID in enumerate(self.encoders_num_id_vecs):
            if all_faceid_embeds[i] is None:
                all_faceid_embeds[i] = torch.zeros(faceid_embeds_shape, dtype=torch.float16, device=device)
            elif all_faceid_embeds[i].shape != faceid_embeds_shape:
                raise ValueError(f"Inconsistent face embedding shapes: "
                                 f"{tuple(all_faceid_embeds[i].shape)} vs. {tuple(faceid_embeds_shape)}.")

            # Both pos_prompt_embs and neg_prompt_embs have N_ID == num_id_vecs embeddings.
            if all_pos_prompt_embs[i] is None:
                all_pos_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
            if all_neg_prompt_embs[i] is None:
                all_neg_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)

        all_faceid_embeds   = torch.cat(all_faceid_embeds, dim=1)
        all_pos_prompt_embs = torch.cat(all_pos_prompt_embs, dim=1)
        all_neg_prompt_embs = torch.cat(all_neg_prompt_embs, dim=1)

        return face_image_count, all_faceid_embeds, all_pos_prompt_embs, all_neg_prompt_embs

    # We don't need to implement get_batched_img_prompt_embs() since the interface
    # is fully compatible with FaceID2AdaPrompt.get_batched_img_prompt_embs().

    def generate_adaface_embeddings(self, image_paths, face_id_embs=None,
                                    img_prompt_embs=None, p_dropout=0,
                                    return_zero_embs_for_dropped_encoders=True,
                                    *args, **kwargs):
        """Generate the adaface subject embeddings of all enabled sub-encoders.

        Args:
            image_paths: None, or a sequence of image paths passed to each sub-encoder.
            face_id_embs: None, or joint ID embeddings split along dim=1 by self.face_id_dims.
            img_prompt_embs: None, or joint image prompt embeddings with
                self.num_id_vecs vectors at dim=1, split by self.encoders_num_id_vecs.
            p_dropout: probability of dropping each sub-encoder for this call.
                At least one globally enabled sub-encoder is always kept.
            return_zero_embs_for_dropped_encoders: if True, sub-encoders that yield
                no embeddings (dropped, or returning None) contribute zero
                embeddings; otherwise they are skipped.
            *args, **kwargs: forwarded to each sub-encoder's generate_adaface_embeddings().
                If kwargs['avg_at_stage'] is not None, zero embeddings are created
                without the batch dim.

        Returns:
            The subject embeddings of all sub-encoders concatenated along dim=-2,
            or None if no sub-encoder yields embeddings.

        Raises:
            ValueError: if img_prompt_embs has a wrong number of ID vectors, or if
                image_paths, face_id_embs and img_prompt_embs are all None.
        """
        device = self._get_clip_device()
        is_emb_averaged = kwargs.get('avg_at_stage', None) is not None

        # BS is taken from the first available input, in the order of
        # face_id_embs, img_prompt_embs, image_paths.
        BS = -1
        if face_id_embs is not None:
            BS = face_id_embs.shape[0]
            all_face_id_embs = face_id_embs.split(self.face_id_dims, dim=1)
        else:
            all_face_id_embs = [None] * self.num_sub_encoders
        if img_prompt_embs is not None:
            BS = img_prompt_embs.shape[0] if BS == -1 else BS
            if img_prompt_embs.shape[1] != self.num_id_vecs:
                raise ValueError(f"img_prompt_embs should have {self.num_id_vecs} ID vectors, "
                                 f"but got shape {tuple(img_prompt_embs.shape)}.")
            all_img_prompt_embs = img_prompt_embs.split(self.encoders_num_id_vecs, dim=1)
        else:
            all_img_prompt_embs = [None] * self.num_sub_encoders
        if image_paths is not None:
            BS = len(image_paths) if BS == -1 else BS
        if BS == -1:
            raise ValueError("At least one of image_paths, face_id_embs and img_prompt_embs "
                             "should be provided.")

        # During training, p_dropout is 0.1. During inference, p_dropout is 0.
        # Each sub-encoder is dropped independently with prob p_dropout. So when there are
        # two sub-encoders, the prob of at least one of them being dropped is
        # p_dropout * 2 - p_dropout^2 = 0.19.
        if p_dropout > 0:
            # self.are_encoders_enabled is a global mask.
            # are_encoders_enabled is a local mask for each batch.
            # torch.rand() >= p_dropout holds with prob 1 - p_dropout, i.e., the encoder is kept.
            are_encoders_enabled = torch.rand(self.num_sub_encoders) >= p_dropout
            are_encoders_enabled = are_encoders_enabled & self.are_encoders_enabled
            # We should at least enable one encoder.
            if not are_encoders_enabled.any():
                # Randomly enable an encoder with self.are_encoders_enabled[i] == True.
                enabled_indices = torch.nonzero(self.are_encoders_enabled).squeeze(1)
                sel_idx = torch.randint(0, len(enabled_indices), (1,)).item()
                are_encoders_enabled[enabled_indices[sel_idx]] = True
        else:
            are_encoders_enabled = self.are_encoders_enabled

        all_adaface_subj_embs = []
        num_available_id_vecs = 0
        for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
            if not are_encoders_enabled[i]:
                adaface_subj_embs = None
                print(f"Encoder {id2ada_prompt_encoder.name} is dropped.")
            else:
                # ddpm.embedding_manager.train() -> id2ada_prompt_encoder.train() -> each sub-enconder's train().
                # -> each sub-enconder's subj_basis_generator.train().
                # Therefore grad for the following call is enabled.
                adaface_subj_embs = \
                    id2ada_prompt_encoder.generate_adaface_embeddings(image_paths,
                                                                      all_face_id_embs[i],
                                                                      all_img_prompt_embs[i],
                                                                      *args, **kwargs)

            # adaface_subj_embs: [16, 768] or [4, 768].
            N_ID = self.encoders_num_id_vecs[i]
            if adaface_subj_embs is not None:
                all_adaface_subj_embs.append(adaface_subj_embs)
                num_available_id_vecs += N_ID
            elif return_zero_embs_for_dropped_encoders:
                subj_emb_shape = (N_ID, 768) if is_emb_averaged else (BS, N_ID, 768)
                # adaface_subj_embs is zero-filled. So N_ID is not counted as available subject embeddings.
                all_adaface_subj_embs.append(
                    torch.zeros(subj_emb_shape, dtype=torch.float16, device=device))

        # No faces are found in the images, so return None embeddings.
        # We don't want to return an all-zero embedding, which is useless.
        if num_available_id_vecs == 0:
            return None

        # If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
        # during inference, we average across the batch dim:
        # the per-encoder embeddings are [16, 768] and [4, 768], and the joint ones are [20, 768].
        # During training, we don't average across the batch dim:
        # the per-encoder embeddings are [BS, 16, 768] and [BS, 4, 768], and the joint ones are [BS, 20, 768].
        all_adaface_subj_embs = torch.cat(all_adaface_subj_embs, dim=-2)
        return all_adaface_subj_embs
'''
# For ip-adapter distillation on objects. Strictly speaking, it's not face-to-image prompts, but
# CLIP/DINO visual features to image prompts.
class Objects_Vis2ImgPrompt(nn.Module):
def __init__(self):
self.dino_encoder = ViTModel.from_pretrained('facebook/dino-vits16')
self.dino_encoder.eval()
self.dino_encoder.half()
self.dino_preprocess = ViTFeatureExtractor.from_pretrained('facebook/dino-vits16')
print(f'DINO encoder loaded.')
'''