adaface-neurips committed
Commit a29cf91 · 1 Parent(s): 61fbdeb

Allow dynamically changing base model style type, support anime style, upgrade adaface model

adaface/adaface_wrapper.py CHANGED
@@ -247,7 +247,7 @@ class AdaFaceWrapper(nn.Module):
             token_embeds[token_id] = subj_embs[i]
         print(f"Updated {len(self.placeholder_token_ids)} tokens ({self.all_placeholder_tokens_str}) in the text encoder.")
 
-    def update_prompt(self, prompt):
+    def update_prompt(self, prompt, placeholder_tokens_pos='postpend'):
         if prompt is None:
             prompt = ""
 
@@ -259,7 +259,10 @@ class AdaFaceWrapper(nn.Module):
         # When we do joint training, seems both work better if they are appended to the prompt.
         # Therefore we simply appended all placeholder_tokens_str's to the prompt.
         # NOTE: Prepending them hurts compositional prompts.
-        prompt = prompt + " " + self.all_placeholder_tokens_str
+        if placeholder_tokens_pos == 'prepend':
+            prompt = self.all_placeholder_tokens_str + " " + prompt
+        elif placeholder_tokens_pos == 'postpend':
+            prompt = prompt + " " + self.all_placeholder_tokens_str
 
         return prompt
 
@@ -290,14 +293,16 @@ class AdaFaceWrapper(nn.Module):
         self.update_text_encoder_subj_embeddings(all_adaface_subj_embs)
         return all_adaface_subj_embs
 
-    def encode_prompt(self, prompt, negative_prompt=None, device=None, verbose=False):
+    def encode_prompt(self, prompt, negative_prompt=None,
+                      placeholder_tokens_pos='postpend',
+                      device=None, verbose=False):
         if negative_prompt is None:
             negative_prompt = self.negative_prompt
 
         if device is None:
             device = self.device
 
-        prompt = self.update_prompt(prompt)
+        prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos)
         if verbose:
             print(f"Subject prompt: {prompt}")
 
@@ -350,8 +355,10 @@ class AdaFaceWrapper(nn.Module):
         return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_
 
     # ref_img_strength is used only in the img2img pipeline.
-    def forward(self, noise, prompt, negative_prompt=None, guidance_scale=6.0,
-                out_image_count=4, ref_img_strength=0.8, generator=None, verbose=False):
+    def forward(self, noise, prompt, negative_prompt=None,
+                placeholder_tokens_pos='postpend',
+                guidance_scale=6.0, out_image_count=4,
+                ref_img_strength=0.8, generator=None, verbose=False):
         noise = noise.to(device=self.device, dtype=torch.float16)
 
         if negative_prompt is None:
@@ -359,7 +366,9 @@ class AdaFaceWrapper(nn.Module):
         # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
         prompt_embeds_, negative_prompt_embeds_, \
             negative_pooled_prompt_embeds_ = \
-                self.encode_prompt(prompt, negative_prompt, device=self.device, verbose=verbose)
+                self.encode_prompt(prompt, negative_prompt,
+                                   placeholder_tokens_pos=placeholder_tokens_pos,
+                                   device=self.device, verbose=verbose)
         # Repeat the prompt embeddings for all images in the batch.
         prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
         if negative_prompt_embeds_ is not None:
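The practical effect of the new placeholder_tokens_pos argument is easiest to see in isolation. Below is a minimal, standalone sketch that mirrors the update_prompt logic above; place_placeholder_tokens and the token string "z0 z1" are illustrative stand-ins, not names from the repository.

# Standalone sketch of the prepend/postpend behavior added to AdaFaceWrapper.update_prompt.
def place_placeholder_tokens(prompt, all_placeholder_tokens_str, placeholder_tokens_pos='postpend'):
    if prompt is None:
        prompt = ""
    if placeholder_tokens_pos == 'prepend':
        # New option, used by app.py for "face portrait"-style prompts.
        return all_placeholder_tokens_str + " " + prompt
    elif placeholder_tokens_pos == 'postpend':
        # Previous (and still default) behavior: append the subject tokens.
        return prompt + " " + all_placeholder_tokens_str
    return prompt

print(place_placeholder_tokens("walking on the beach", "z0 z1", placeholder_tokens_pos='prepend'))
# z0 z1 walking on the beach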
adaface/face_id_to_ada_prompt.py CHANGED
@@ -863,8 +863,17 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
             ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.encoders_num_static_img_suffix_embs[i],
                                                                         img_prompt_dim=self.output_dim)
 
-            subj_basis_generator.extend_prompt2token_proj_attention(\
-                ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
+            if subj_basis_generator.prompt2token_proj_attention_multipliers \
+               == [1] * 12:
+                subj_basis_generator.extend_prompt2token_proj_attention(\
+                    ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
+            elif subj_basis_generator.prompt2token_proj_attention_multipliers \
+                != ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers:
+                raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")
+
+            assert subj_basis_generator.prompt2token_proj_attention_multipliers \
+                == ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
+                "Inconsistent prompt2token_proj_attention_multipliers."
             subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())
 
         # extend_prompt2token_proj_attention_multiplier is an integer >= 1.
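The guard added above can be restated as a small toy function: a generator whose multipliers are still the default all-ones list gets extended to match the checkpoint before its weights are loaded, while any other mismatch is rejected. check_and_extend and the example multiplier lists below are made up for illustration.

# Toy restatement of the consistency check on prompt2token_proj_attention_multipliers.
def check_and_extend(current_multipliers, ckpt_multipliers, extend_fn):
    if current_multipliers == [1] * 12:
        # Fresh generator: grow its attention layers to match the checkpoint first.
        extend_fn(ckpt_multipliers)
    elif current_multipliers != ckpt_multipliers:
        raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")

check_and_extend([1] * 12, [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
                 extend_fn=lambda m: print("extending to", m))
# extending to [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]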
app.py CHANGED
@@ -7,7 +7,7 @@ from animatediff.utils.util import save_videos_grid
 from adaface.adaface_wrapper import AdaFaceWrapper
 
 import random
-from infer import load_model
+from infer import load_model, model_style_type2base_model_path
 MAX_SEED=10000
 import uuid
 from insightface.app import FaceAnalysis
@@ -24,20 +24,13 @@ parser = argparse.ArgumentParser()
 parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                     choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
 parser.add_argument('--adaface_ckpt_path', type=str,
-                    default='models/adaface/VGGface2_HQ_masks2024-10-05T09-28-53_zero3-ada-28000.pt')
-# Don't use 'sd15' for base_model_type; it just generates messy videos.
-parser.add_argument('--base_model_type', type=str, default='rv51',
-                    choices=["sar", "rv51"])
+                    default='models/adaface/VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt')
+parser.add_argument('--model_style_type', type=str, default='realistic',
+                    choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
 parser.add_argument('--gpu', type=int, default=None)
 parser.add_argument('--ip', type=str, default="0.0.0.0")
 args = parser.parse_args()
 
-base_model_type_to_path = {
-    "sd15": "models/sd15-dste8-vae.safetensors", # LDM format. Needs to be converted.
-    "sar": "models/sar/sar.safetensors", # LDM format. Needs to be converted.
-    "rv51": "models/rv51/realisticVisionV51_v51VAE.safetensors"
-}
-
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
@@ -50,16 +43,16 @@ app = FaceAnalysis(name="buffalo_l", root='models/insightface', providers=['CUDA
 app.prepare(ctx_id=0, det_size=(320, 320))
 device = "cuda" if args.gpu is None else f"cuda:{args.gpu}"
 
-id_animator = load_model(base_model_type=args.base_model_type, device=device)
-
-base_model_path = base_model_type_to_path[args.base_model_type]
+global adaface, id_animator
 
+base_model_path = model_style_type2base_model_path[args.model_style_type]
+id_animator = load_model(model_style_type=args.model_style_type, device=device)
 adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
                          adaface_encoder_types=args.adaface_encoder_types,
                          adaface_ckpt_paths=[args.adaface_ckpt_path], device=device)
 
-basedir = os.getcwd()
-savedir = os.path.join(basedir,'samples')
+basedir = os.getcwd()
+savedir = os.path.join(basedir,'samples')
 os.makedirs(savedir, exist_ok=True)
 
 #print(f"### Cleaning cached examples ...")
@@ -81,10 +74,23 @@ def get_clicked_image(data: gr.SelectData):
     return data.index
 
 @spaces.GPU
-def gen_init_images(uploaded_image_paths, prompt, out_image_count=3):
+def gen_init_images(uploaded_image_paths, model_style_type, prompt, out_image_count=3):
+    global adaface, id_animator
     if uploaded_image_paths is None:
        print("No image uploaded")
        return None, None, None
+
+    model_style_type = model_style_type.lower()
+    base_model_path = model_style_type2base_model_path[model_style_type]
+    # If the base model type is changed, reload the model.
+    if model_style_type != args.model_style_type:
+        id_animator = load_model(model_style_type=model_style_type, device=device)
+        adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
+                                 adaface_encoder_types=args.adaface_encoder_types,
+                                 adaface_ckpt_paths=[args.adaface_ckpt_path], device=device)
+        # Update base model type.
+        args.model_style_type = model_style_type
+
     # uploaded_image_paths is a list of tuples:
     # [('/tmp/gradio/249981e66a7c665aaaf1c7eaeb24949af4366c88/jensen huang.jpg', None)]
     # Extract the file paths.
@@ -98,9 +104,20 @@ def gen_init_images(uploaded_image_paths, prompt, out_image_count=3):
 
     # Generate two images each time for the user to select from.
    noise = torch.randn(out_image_count, 3, 512, 512)
+
+    enhance_face = True
+    if enhance_face and "face portrait" not in prompt:
+        if "portrait" in prompt:
+            # Enhance the face features by replacing "portrait" with "face portrait".
+            prompt = prompt.replace("portrait", "face portrait")
+        else:
+            prompt = "face portrait, " + prompt
+
     # samples: A list of PIL Image instances.
     with torch.no_grad():
-        samples = adaface(noise, prompt, out_image_count=out_image_count, verbose=True)
+        samples = adaface(noise, prompt,
+                          placeholder_tokens_pos='prepend',
+                          out_image_count=out_image_count, verbose=True)
 
     face_paths = []
     for sample in samples:
@@ -114,13 +131,25 @@ def gen_init_images(uploaded_image_paths, prompt, out_image_count=3):
     return gr.update(value=face_paths, visible=True), gr.update(value=face_paths, visible=False), gr.update(visible=True)
 
 @spaces.GPU(duration=90)
-def generate_image(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
-                   init_image_strength, init_image_final_weight,
+def generate_video(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
+                   init_image_strength, init_image_final_weight, model_style_type,
                    prompt, negative_prompt, num_steps, video_length, guidance_scale, seed,
                    attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
                    is_adaface_enabled, adaface_ckpt_path, adaface_power_scale,
                    id_animator_anneal_steps, progress=gr.Progress(track_tqdm=True)):
 
+    global adaface, id_animator
+    model_style_type = model_style_type.lower()
+    base_model_path = model_style_type2base_model_path[model_style_type]
+    # If the base model type is changed, reload the model.
+    if model_style_type != args.model_style_type:
+        id_animator = load_model(model_style_type=model_style_type, device=device)
+        adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
+                                 adaface_encoder_types=args.adaface_encoder_types,
+                                 adaface_ckpt_paths=[args.adaface_ckpt_path], device=device)
+        # Update base model type.
+        args.model_style_type = model_style_type
+
     if prompt is None:
         prompt = ""
 
@@ -145,7 +174,8 @@ def generate_image(image_container, uploaded_image_paths, init_img_file_paths, i
     else:
         if (adaface_ckpt_path is not None and adaface_ckpt_path.strip() != '') \
           and (adaface_ckpt_path != args.adaface_ckpt_path):
-            # Reload the embedding manager
+            args.adaface_ckpt_path = adaface_ckpt_path
+            # Reload the adaface model weights.
            adaface.id2ada_prompt_encoder.load_adaface_ckpt(adaface_ckpt_path)
 
     with torch.no_grad():
@@ -154,7 +184,9 @@ def generate_image(image_container, uploaded_image_paths, init_img_file_paths, i
                                 update_text_encoder=True)
 
         # adaface_prompt_embeds: [1, 77, 768].
-        adaface_prompt_embeds, _, _, _ = adaface.encode_prompt(prompt, verbose=True)
+        adaface_prompt_embeds, _, _, _ = adaface.encode_prompt(prompt,
+                                                               placeholder_tokens_pos='prepend',
+                                                               verbose=True)
 
     image_embed_cfg_scales = (image_embed_cfg_begin_scale, image_embed_cfg_end_scale)
 
@@ -252,20 +284,20 @@ with gr.Blocks(css=css) as demo:
 
        prompt = gr.Dropdown(label="Prompt",
                    info="Try something like 'man/woman walking on the beach'.",
-                   value="((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                   value="portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
                    allow_custom_value=True,
                    filterable=False,
                    choices=[
-                        "((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
-                        "walking on the beach, sunset, orange sky, eye level shot",
-                        "in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
-                        "dancing pose among folks in a park, waving hands",
-                        "in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
-                        "jedi wielding a lightsaber, star wars, full body view, eye level shot",
-                        "playing guitar on a boat, ocean waves",
-                        "with a passion for reading, curled up with a book in a cozy nook near a window",
-                        "running pose in a park, eye level shot",
-                        "in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
+                        "portrait, ((best quality)), ((masterpiece)), ((realistic)), highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
+                        "portrait, walking on the beach, sunset, orange sky, eye level shot",
+                        "portrait, in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
+                        "portrait, dancing pose among folks in a park, waving hands",
+                        "portrait, in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
+                        "portrait, jedi wielding a lightsaber, star wars, full body view, eye level shot",
+                        "portrait, playing guitar on a boat, ocean waves",
+                        "portrait, with a passion for reading, curled up with a book in a cozy nook near a window",
+                        "portrait, running pose in a park, eye level shot",
+                        "portrait, in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
                    ])
 
        init_image_strength = gr.Slider(
@@ -285,6 +317,14 @@ with gr.Blocks(css=css) as demo:
            value=0.1,
        )
 
+       model_style_type = gr.Dropdown(
+           label="Base Model Style Type",
+           info="Switching the base model type will take 10~20 seconds to reload the model",
+           value=args.model_style_type,
+           choices=["Realistic", "Anime", "Photorealistic"],
+           allow_custom_value=False,
+           filterable=False,
+       )
        guidance_scale = gr.Slider(
            label="Guidance scale",
            minimum=1.0,
@@ -352,18 +392,18 @@ with gr.Blocks(css=css) as demo:
        image_embed_cfg_begin_scale = gr.Slider(
            label="ID-Animator Image Embedding Initial Scale",
            info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
-           minimum=0.3,
+           minimum=0.6,
            maximum=1.5,
            step=0.1,
-           value=1.2,
+           value=1.0,
        )
        image_embed_cfg_end_scale = gr.Slider(
            label="ID-Animator Image Embedding Final Scale",
            info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
-           minimum=0.0,
+           minimum=0.3,
            maximum=1.5,
            step=0.1,
-           value=0.8,
+           value=0.5,
        )
 
        id_animator_anneal_steps = gr.Slider(
@@ -393,7 +433,7 @@ with gr.Blocks(css=css) as demo:
    init_img_files.upload(fn=swap_to_gallery, inputs=init_img_files, outputs=[uploaded_init_img_gallery, init_clear_button_column, init_img_files])
    remove_init_and_reupload.click(fn=remove_back_to_files, outputs=[uploaded_init_img_gallery, init_clear_button_column,
                                                                     init_img_files, init_img_selected_idx])
-   gen_init.click(fn=gen_init_images, inputs=[uploaded_files_gallery, prompt],
+   gen_init.click(fn=gen_init_images, inputs=[uploaded_files_gallery, model_style_type, prompt],
                   outputs=[uploaded_init_img_gallery, init_img_files, init_clear_button_column])
    uploaded_init_img_gallery.select(fn=get_clicked_image, inputs=None, outputs=init_img_selected_idx)
 
@@ -405,8 +445,9 @@ with gr.Blocks(css=css) as demo:
        queue=False,
        api_name=False,
    ).then(
-       fn=generate_image,
-       inputs=[image_container, files, init_img_files, init_img_selected_idx, init_image_strength, init_image_final_weight,
+       fn=generate_video,
+       inputs=[image_container, files, init_img_files, init_img_selected_idx, init_image_strength,
+               init_image_final_weight, model_style_type,
               prompt, negative_prompt, num_steps, video_length, guidance_scale,
               seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
               is_adaface_enabled, adaface_ckpt_path, adaface_power_scale, id_animator_anneal_steps],
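Both gen_init_images and the renamed generate_video now share two ideas: reload id_animator and the AdaFaceWrapper only when the selected style differs from args.model_style_type, and strengthen the prompt with "face portrait" before generating init images. A self-contained toy restatement follows; the helper names are hypothetical and the mapping is copied from infer.py below.

# Toy restatement of the reload-on-change and prompt-enhancement logic added to app.py.
model_style_type2base_model_path = {
    "realistic": "models/rv51/realisticVisionV51_v51VAE_dste8.safetensors",
    "anime": "models/aingdiffusion/aingdiffusion_v170_ar.safetensors",
    "photorealistic": "models/sar/sar.safetensors",
}

def resolve_style_switch(selected, current):
    # The dropdown shows "Realistic"/"Anime"/"Photorealistic"; the mapping keys are lower-case.
    selected = selected.lower()
    return selected != current, model_style_type2base_model_path[selected]

def enhance_face_prompt(prompt):
    # Rewrite "portrait" to "face portrait" so the generated init images focus on the face.
    if "face portrait" in prompt:
        return prompt
    if "portrait" in prompt:
        return prompt.replace("portrait", "face portrait")
    return "face portrait, " + prompt

print(resolve_style_switch("Anime", "realistic"))
# (True, 'models/aingdiffusion/aingdiffusion_v170_ar.safetensors')
print(enhance_face_prompt("portrait, walking on the beach, sunset"))
# face portrait, walking on the beach, sunset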
infer.py CHANGED
@@ -9,19 +9,19 @@ from safetensors import safe_open
 from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
 from faceadapter.face_adapter import FaceAdapterPlusForVideoLora
 
-base_model_type_to_path = {
-    "sd15": "models/sd15-dste8-vae.safetensors", # LDM format. Needs to be converted.
-    "sar": "models/sar/sar.safetensors", # LDM format. Needs to be converted.
-    "rv51": "models/rv51/realisticVisionV51_v51VAE.safetensors"
+model_style_type2base_model_path = {
+    "realistic": "models/rv51/realisticVisionV51_v51VAE_dste8.safetensors",
+    "anime": "models/aingdiffusion/aingdiffusion_v170_ar.safetensors",
+    "photorealistic": "models/sar/sar.safetensors" # LDM format. Needs to be converted.
 }
 
-def load_model(base_model_type="rv51", device="cuda"):
+def load_model(model_style_type="realistic", device="cuda"):
     inference_config = "inference-v2.yaml"
     sd_version = "animatediff/sd"
     id_ckpt = "models/animator.ckpt"
     image_encoder_path = "models/image_encoder"
 
-    base_model_path = base_model_type_to_path[base_model_type]
+    base_model_path = model_style_type2base_model_path[model_style_type]
 
     motion_module_path="models/v3_sd15_mm.ckpt"
     motion_lora_path = "models/v3_sd15_adapter.ckpt"
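A brief usage sketch of the renamed entry points, assuming the checkpoints listed in the mapping above are present locally:

# Resolve the anime style to its base model and build the ID-Animator pipeline.
from infer import load_model, model_style_type2base_model_path

style = "anime"
print(model_style_type2base_model_path[style])
# models/aingdiffusion/aingdiffusion_v170_ar.safetensors

id_animator = load_model(model_style_type=style, device="cuda")
# app.py pairs this with an AdaFaceWrapper built on the same base_model_path.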
models/adaface/{VGGface2_HQ_masks2024-10-05T09-28-53_zero3-ada-28000.pt → VGGface2_HQ_masks2024-10-08T14-42-05_zero3-ada-24500.pt} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f6959ba41eb8cc8fcc738ba5ecc751de3acc0d1180e3af2272b7b52b04c6ae8
-size 1814922042
+oid sha256:c66b1847072c66deaa38b9ec91c0d76ac5274dec8d02444fc9672f0defa4d156
+size 1814921594
models/{rv51/realisticVisionV51_v51VAE.safetensors → aingdiffusion/aingdiffusion_v170_ar.safetensors} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15012c538f503ce2ebfc2c8547b268c75ccdaff7a281db55399940ff1d70e21d
+oid sha256:883af0939ef9bbb7ca03e90e778512258be26be7bef9276768c1594f9b7d3590
 size 2132625894
models/rv51/realisticVisionV51_v51VAE_dste8.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a96d832b0df00b72e762486cec30311f4c706871f50120fc5dab6f60cf044a33
+size 2132625894