Spaces: Running on Zero

adaface-neurips committed · commit a29cf91 · Parent(s): 61fbdeb

Allow dynamically changing base model style type, support anime style, upgrade adaface model

Browse files:
- adaface/adaface_wrapper.py +16 -7
- adaface/face_id_to_ada_prompt.py +11 -2
- app.py +81 -40
- infer.py +6 -6
- models/adaface/{VGGface2_HQ_masks2024-10-05T09-28-53_zero3-ada-28000.pt → VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt} +2 -2
- models/{rv51/realisticVisionV51_v51VAE.safetensors → aingdiffusion/aingdiffusion_v170_ar.safetensors} +1 -1
- models/rv51/realisticVisionV51_v51VAE_dste8.safetensors +3 -0
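In short, the base checkpoint is now selected through a style-type key instead of a hard-coded path. Below is a minimal sketch of the new selection flow, assuming only the model_style_type2base_model_path mapping and the load_model() signature introduced in infer.py further down; the "anime" key and the "cuda" device are example values.

from infer import load_model, model_style_type2base_model_path

# Supported style keys after this commit: "realistic", "anime", "photorealistic".
model_style_type = "anime"
base_model_path = model_style_type2base_model_path[model_style_type]

# load_model() resolves the same mapping internally; app.py assigns its return value
# to id_animator once at startup and again whenever the style dropdown changes.
id_animator = load_model(model_style_type=model_style_type, device="cuda")
print(f"Loaded {model_style_type} base model from {base_model_path}")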
adaface/adaface_wrapper.py
CHANGED

@@ -247,7 +247,7 @@ class AdaFaceWrapper(nn.Module):
             token_embeds[token_id] = subj_embs[i]
         print(f"Updated {len(self.placeholder_token_ids)} tokens ({self.all_placeholder_tokens_str}) in the text encoder.")
 
-    def update_prompt(self, prompt):
+    def update_prompt(self, prompt, placeholder_tokens_pos='postpend'):
         if prompt is None:
             prompt = ""
 
@@ -259,7 +259,10 @@ class AdaFaceWrapper(nn.Module):
         # When we do joint training, seems both work better if they are appended to the prompt.
         # Therefore we simply appended all placeholder_tokens_str's to the prompt.
         # NOTE: Prepending them hurts compositional prompts.
-        prompt = prompt + " " + self.all_placeholder_tokens_str
+        if placeholder_tokens_pos == 'prepend':
+            prompt = self.all_placeholder_tokens_str + " " + prompt
+        elif placeholder_tokens_pos == 'postpend':
+            prompt = prompt + " " + self.all_placeholder_tokens_str
 
         return prompt
 
@@ -290,14 +293,16 @@ class AdaFaceWrapper(nn.Module):
         self.update_text_encoder_subj_embeddings(all_adaface_subj_embs)
         return all_adaface_subj_embs
 
-    def encode_prompt(self, prompt, negative_prompt=None, device=None, verbose=False):
+    def encode_prompt(self, prompt, negative_prompt=None,
+                      placeholder_tokens_pos='postpend',
+                      device=None, verbose=False):
         if negative_prompt is None:
             negative_prompt = self.negative_prompt
 
         if device is None:
             device = self.device
 
-        prompt = self.update_prompt(prompt)
+        prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos)
         if verbose:
             print(f"Subject prompt: {prompt}")
 
@@ -350,8 +355,10 @@ class AdaFaceWrapper(nn.Module):
         return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_
 
     # ref_img_strength is used only in the img2img pipeline.
-    def forward(self, noise, prompt, negative_prompt=None,
-                guidance_scale=6.0, out_image_count=4, ref_img_strength=0.8, generator=None, verbose=False):
+    def forward(self, noise, prompt, negative_prompt=None,
+                placeholder_tokens_pos='postpend',
+                guidance_scale=6.0, out_image_count=4,
+                ref_img_strength=0.8, generator=None, verbose=False):
         noise = noise.to(device=self.device, dtype=torch.float16)
 
         if negative_prompt is None:
@@ -359,7 +366,9 @@ class AdaFaceWrapper(nn.Module):
         # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
         prompt_embeds_, negative_prompt_embeds_, \
         pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
-            self.encode_prompt(prompt, negative_prompt, device=self.device, verbose=verbose)
+            self.encode_prompt(prompt, negative_prompt,
+                               placeholder_tokens_pos=placeholder_tokens_pos,
+                               device=self.device, verbose=verbose)
         # Repeat the prompt embeddings for all images in the batch.
         prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
         if negative_prompt_embeds_ is not None:
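A quick illustration of the new placeholder_tokens_pos argument (a sketch only; adaface is assumed to be an initialized AdaFaceWrapper and noise a noise tensor as in app.py):

# 'postpend' (the default) appends the subject placeholder tokens to the prompt,
# matching the previous behavior; 'prepend' puts them in front, which app.py now
# uses for the initial "face portrait" images.
prompt_embeds, neg_embeds, pooled_embeds, neg_pooled_embeds = \
    adaface.encode_prompt("face portrait, walking on the beach",
                          placeholder_tokens_pos='prepend', verbose=True)

samples = adaface(noise, "walking on the beach",
                  placeholder_tokens_pos='postpend', out_image_count=4)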
adaface/face_id_to_ada_prompt.py
CHANGED

@@ -863,8 +863,17 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
             ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.encoders_num_static_img_suffix_embs[i],
                                                                         img_prompt_dim=self.output_dim)
 
-            subj_basis_generator.
+            if subj_basis_generator.prompt2token_proj_attention_multipliers \
+              == [1] * 12:
+                subj_basis_generator.extend_prompt2token_proj_attention(\
+                    ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
+            elif subj_basis_generator.prompt2token_proj_attention_multipliers \
+              != ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers:
+                raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")
+
+            assert subj_basis_generator.prompt2token_proj_attention_multipliers \
+                   == ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
+                   "Inconsistent prompt2token_proj_attention_multipliers."
             subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())
 
             # extend_prompt2token_proj_attention_multiplier is an integer >= 1.
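The checkpoint-loading change above only enforces a compatibility rule before load_state_dict(). Restated as a small self-contained sketch (names follow the diff; [1] * 12 is treated as the untouched default, presumably one multiplier per attention layer):

def load_ckpt_into_generator(subj_basis_generator, ckpt_subj_basis_generator):
    default_multipliers = [1] * 12
    live_mults = subj_basis_generator.prompt2token_proj_attention_multipliers
    ckpt_mults = ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers

    if live_mults == default_multipliers:
        # Fresh generator: grow its attention layers to match the checkpoint first.
        subj_basis_generator.extend_prompt2token_proj_attention(ckpt_mults, -1, -1, 1, perturb_std=0)
    elif live_mults != ckpt_mults:
        # Already extended, but with different multipliers: the state dicts cannot match.
        raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")

    subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())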
app.py
CHANGED

@@ -7,7 +7,7 @@ from animatediff.utils.util import save_videos_grid
 from adaface.adaface_wrapper import AdaFaceWrapper
 
 import random
-from infer import load_model
+from infer import load_model, model_style_type2base_model_path
 MAX_SEED=10000
 import uuid
 from insightface.app import FaceAnalysis
@@ -24,20 +24,13 @@ parser = argparse.ArgumentParser()
 parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                     choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
 parser.add_argument('--adaface_ckpt_path', type=str,
-                    default='models/adaface/VGGface2_HQ_masks2024-10-05T09-28-53_zero3-ada-28000.pt')
-parser.add_argument('--base_model_type', type=str,
-                    choices=["sar", "rv51"])
+                    default='models/adaface/VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt')
+parser.add_argument('--model_style_type', type=str, default='realistic',
+                    choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
 parser.add_argument('--gpu', type=int, default=None)
 parser.add_argument('--ip', type=str, default="0.0.0.0")
 args = parser.parse_args()
 
-base_model_type_to_path = {
-    "sd15": "models/sd15-dste8-vae.safetensors",    # LDM format. Needs to be converted.
-    "sar":  "models/sar/sar.safetensors",           # LDM format. Needs to be converted.
-    "rv51": "models/rv51/realisticVisionV51_v51VAE.safetensors"
-}
-
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
@@ -50,16 +43,16 @@ app = FaceAnalysis(name="buffalo_l", root='models/insightface', providers=['CUDA
 app.prepare(ctx_id=0, det_size=(320, 320))
 device = "cuda" if args.gpu is None else f"cuda:{args.gpu}"
 
-base_model_path = base_model_type_to_path[args.base_model_type]
+global adaface, id_animator
 
+base_model_path = model_style_type2base_model_path[args.model_style_type]
+id_animator = load_model(model_style_type=args.model_style_type, device=device)
 adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
                          adaface_encoder_types=args.adaface_encoder_types,
                          adaface_ckpt_paths=[args.adaface_ckpt_path], device=device)
 
-basedir
-savedir
+basedir = os.getcwd()
+savedir = os.path.join(basedir,'samples')
 os.makedirs(savedir, exist_ok=True)
 
 #print(f"### Cleaning cached examples ...")
@@ -81,10 +74,23 @@ def get_clicked_image(data: gr.SelectData):
     return data.index
 
 @spaces.GPU
-def gen_init_images(uploaded_image_paths, prompt, out_image_count=3):
+def gen_init_images(uploaded_image_paths, model_style_type, prompt, out_image_count=3):
+    global adaface, id_animator
     if uploaded_image_paths is None:
         print("No image uploaded")
         return None, None, None
+
+    model_style_type = model_style_type.lower()
+    base_model_path = model_style_type2base_model_path[model_style_type]
+    # If the base model type is changed, reload the model.
+    if model_style_type != args.model_style_type:
+        id_animator = load_model(model_style_type=model_style_type, device=device)
+        adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
+                                 adaface_encoder_types=args.adaface_encoder_types,
+                                 adaface_ckpt_paths=[args.adaface_ckpt_path], device=device)
+        # Update base model type.
+        args.model_style_type = model_style_type
+
     # uploaded_image_paths is a list of tuples:
     # [('/tmp/gradio/249981e66a7c665aaaf1c7eaeb24949af4366c88/jensen huang.jpg', None)]
     # Extract the file paths.
@@ -98,9 +104,20 @@ def gen_init_images(uploaded_image_paths, prompt, out_image_count=3):
 
     # Generate two images each time for the user to select from.
     noise = torch.randn(out_image_count, 3, 512, 512)
+
+    enhance_face = True
+    if enhance_face and "face portrait" not in prompt:
+        if "portrait" in prompt:
+            # Enhance the face features by replacing "portrait" with "face portrait".
+            prompt = prompt.replace("portrait", "face portrait")
+        else:
+            prompt = "face portrait, " + prompt
+
     # samples: A list of PIL Image instances.
     with torch.no_grad():
-        samples = adaface(noise, prompt,
+        samples = adaface(noise, prompt,
+                          placeholder_tokens_pos='prepend',
+                          out_image_count=out_image_count, verbose=True)
 
     face_paths = []
     for sample in samples:
@@ -114,13 +131,25 @@ def gen_init_images(uploaded_image_paths, prompt, out_image_count=3):
     return gr.update(value=face_paths, visible=True), gr.update(value=face_paths, visible=False), gr.update(visible=True)
 
 @spaces.GPU(duration=90)
-def generate_image(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
-                   init_image_strength, init_image_final_weight,
+def generate_video(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
+                   init_image_strength, init_image_final_weight, model_style_type,
                    prompt, negative_prompt, num_steps, video_length, guidance_scale, seed,
                    attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
                    is_adaface_enabled, adaface_ckpt_path, adaface_power_scale,
                    id_animator_anneal_steps, progress=gr.Progress(track_tqdm=True)):
 
+    global adaface, id_animator
+    model_style_type = model_style_type.lower()
+    base_model_path = model_style_type2base_model_path[model_style_type]
+    # If the base model type is changed, reload the model.
+    if model_style_type != args.model_style_type:
+        id_animator = load_model(model_style_type=model_style_type, device=device)
+        adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
+                                 adaface_encoder_types=args.adaface_encoder_types,
+                                 adaface_ckpt_paths=[args.adaface_ckpt_path], device=device)
+        # Update base model type.
+        args.model_style_type = model_style_type
+
     if prompt is None:
         prompt = ""
 
@@ -145,7 +174,8 @@ def generate_image(image_container, uploaded_image_paths, init_img_file_paths, i
     else:
         if (adaface_ckpt_path is not None and adaface_ckpt_path.strip() != '') \
           and (adaface_ckpt_path != args.adaface_ckpt_path):
+            args.adaface_ckpt_path = adaface_ckpt_path
+            # Reload the adaface model weights.
            adaface.id2ada_prompt_encoder.load_adaface_ckpt(adaface_ckpt_path)
 
     with torch.no_grad():
@@ -154,7 +184,9 @@ def generate_image(image_container, uploaded_image_paths, init_img_file_paths, i
                                                    update_text_encoder=True)
 
        # adaface_prompt_embeds: [1, 77, 768].
-        adaface_prompt_embeds, _, _, _ = adaface.encode_prompt(prompt,
+        adaface_prompt_embeds, _, _, _ = adaface.encode_prompt(prompt,
+                                                               placeholder_tokens_pos='prepend',
+                                                               verbose=True)
 
     image_embed_cfg_scales = (image_embed_cfg_begin_scale, image_embed_cfg_end_scale)
 
@@ -252,20 +284,20 @@ with gr.Blocks(css=css) as demo:
 
        prompt = gr.Dropdown(label="Prompt",
                   info="Try something like 'man/woman walking on the beach'.",
-                   value="((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                   value="portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
                   allow_custom_value=True,
                   filterable=False,
                   choices=[
-                        "((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
-                        "walking on the beach, sunset, orange sky, eye level shot",
-                        "in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
-                        "dancing pose among folks in a park, waving hands",
-                        "in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
-                        "jedi wielding a lightsaber, star wars, full body view, eye level shot",
-                        "playing guitar on a boat, ocean waves",
-                        "with a passion for reading, curled up with a book in a cozy nook near a window",
-                        "running pose in a park, eye level shot",
-                        "in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
+                        "portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                        "portrait, walking on the beach, sunset, orange sky, eye level shot",
+                        "portrait, in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
+                        "portrait, dancing pose among folks in a park, waving hands",
+                        "portrait, in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
+                        "portrait, jedi wielding a lightsaber, star wars, full body view, eye level shot",
+                        "portrait, playing guitar on a boat, ocean waves",
+                        "portrait, with a passion for reading, curled up with a book in a cozy nook near a window",
+                        "portrait, running pose in a park, eye level shot",
+                        "portrait, in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
                        ])
 
        init_image_strength = gr.Slider(
@@ -285,6 +317,14 @@ with gr.Blocks(css=css) as demo:
            value=0.1,
        )
 
+        model_style_type = gr.Dropdown(
+            label="Base Model Style Type",
+            info="Switching the base model type will take 10~20 seconds to reload the model",
+            value=args.model_style_type,
+            choices=["Realistic", "Anime", "Photorealistic"],
+            allow_custom_value=False,
+            filterable=False,
+        )
        guidance_scale = gr.Slider(
            label="Guidance scale",
            minimum=1.0,
@@ -352,18 +392,18 @@ with gr.Blocks(css=css) as demo:
            image_embed_cfg_begin_scale = gr.Slider(
                label="ID-Animator Image Embedding Initial Scale",
                info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
-                minimum=0.
+                minimum=0.6,
                maximum=1.5,
                step=0.1,
-                value=1.
+                value=1.0,
            )
            image_embed_cfg_end_scale = gr.Slider(
                label="ID-Animator Image Embedding Final Scale",
                info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
-                minimum=0.
+                minimum=0.3,
                maximum=1.5,
                step=0.1,
-                value=0.
+                value=0.5,
            )
 
            id_animator_anneal_steps = gr.Slider(
@@ -393,7 +433,7 @@ with gr.Blocks(css=css) as demo:
    init_img_files.upload(fn=swap_to_gallery, inputs=init_img_files, outputs=[uploaded_init_img_gallery, init_clear_button_column, init_img_files])
    remove_init_and_reupload.click(fn=remove_back_to_files, outputs=[uploaded_init_img_gallery, init_clear_button_column,
                                                                     init_img_files, init_img_selected_idx])
-    gen_init.click(fn=gen_init_images, inputs=[uploaded_files_gallery, prompt],
+    gen_init.click(fn=gen_init_images, inputs=[uploaded_files_gallery, model_style_type, prompt],
                   outputs=[uploaded_init_img_gallery, init_img_files, init_clear_button_column])
    uploaded_init_img_gallery.select(fn=get_clicked_image, inputs=None, outputs=init_img_selected_idx)
 
@@ -405,8 +445,9 @@ with gr.Blocks(css=css) as demo:
        queue=False,
        api_name=False,
    ).then(
-        fn=generate_image,
-        inputs=[image_container, files, init_img_files, init_img_selected_idx, init_image_strength,
+        fn=generate_video,
+        inputs=[image_container, files, init_img_files, init_img_selected_idx, init_image_strength,
+                init_image_final_weight, model_style_type,
                prompt, negative_prompt, num_steps, video_length, guidance_scale,
                seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
                is_adaface_enabled, adaface_ckpt_path, adaface_power_scale, id_animator_anneal_steps],
infer.py
CHANGED

@@ -9,19 +9,19 @@ from safetensors import safe_open
 from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
 from faceadapter.face_adapter import FaceAdapterPlusForVideoLora
 
+model_style_type2base_model_path = {
+    "realistic":      "models/rv51/realisticVisionV51_v51VAE_dste8.safetensors",
+    "anime":          "models/aingdiffusion/aingdiffusion_v170_ar.safetensors",
+    "photorealistic": "models/sar/sar.safetensors"   # LDM format. Needs to be converted.
 }
 
-def load_model(
+def load_model(model_style_type="realistic", device="cuda"):
     inference_config = "inference-v2.yaml"
     sd_version = "animatediff/sd"
     id_ckpt = "models/animator.ckpt"
     image_encoder_path = "models/image_encoder"
 
-    base_model_path
+    base_model_path = model_style_type2base_model_path[model_style_type]
 
     motion_module_path="models/v3_sd15_mm.ckpt"
     motion_lora_path = "models/v3_sd15_adapter.ckpt"
models/adaface/{VGGface2_HQ_masks2024-10-05T09-28-53_zero3-ada-28000.pt → VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c66b1847072c66deaa38b9ec91c0d76ac5274dec8d02444fc9672f0defa4d156
+size 1814921594
models/{rv51/realisticVisionV51_v51VAE.safetensors → aingdiffusion/aingdiffusion_v170_ar.safetensors}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:883af0939ef9bbb7ca03e90e778512258be26be7bef9276768c1594f9b7d3590
 size 2132625894
models/rv51/realisticVisionV51_v51VAE_dste8.safetensors
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a96d832b0df00b72e762486cec30311f4c706871f50120fc5dab6f60cf044a33
+size 2132625894