Spaces: Running on Zero
optimize mvdream pipeline
Browse files
- mvdream/pipeline_mvdream.py +12 -5
mvdream/pipeline_mvdream.py
CHANGED
@@ -499,6 +499,13 @@ class MVDreamPipeline(DiffusionPipeline):
|
|
499 |
# Prepare extra step kwargs.
|
500 |
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
501 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
502 |
# Denoising loop
|
503 |
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
504 |
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
@@ -511,17 +518,17 @@ class MVDreamPipeline(DiffusionPipeline):
|
|
511 |
unet_inputs = {
|
512 |
'x': latent_model_input,
|
513 |
'timesteps': torch.tensor([t] * actual_num_frames * multiplier, dtype=latent_model_input.dtype, device=device),
|
514 |
-
'context': torch.cat([prompt_embeds_neg] * actual_num_frames + [prompt_embeds_pos] * actual_num_frames),
|
515 |
'num_frames': actual_num_frames,
|
516 |
-
'camera': torch.cat([camera] * multiplier),
|
517 |
}
|
518 |
|
519 |
if image is not None:
|
520 |
-
unet_inputs['ip'] = torch.cat([image_embeds_neg] * actual_num_frames + [image_embeds_pos] * actual_num_frames)
|
521 |
-
unet_inputs['ip_img'] = torch.cat([image_latents_neg] + [image_latents_pos]) # no repeat
|
522 |
|
523 |
# predict the noise residual
|
524 |
-
noise_pred = self.unet.forward(**unet_inputs)
|
525 |
|
526 |
# perform guidance
|
527 |
if do_classifier_free_guidance:
|
|
|
499 |
# Prepare extra step kwargs.
|
500 |
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
501 |
|
502 |
+
context = torch.cat([prompt_embeds_neg] * actual_num_frames + [prompt_embeds_pos] * actual_num_frames)
|
503 |
+
camera = torch.cat([camera] * multiplier)
|
504 |
+
|
505 |
+
if image is not None:
|
506 |
+
ip = torch.cat([image_embeds_neg] * actual_num_frames + [image_embeds_pos] * actual_num_frames)
|
507 |
+
ip_img = torch.cat([image_latents_neg] + [image_latents_pos]) # no repeat
|
508 |
+
|
509 |
# Denoising loop
|
510 |
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
511 |
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
|
|
518 |
unet_inputs = {
|
519 |
'x': latent_model_input,
|
520 |
'timesteps': torch.tensor([t] * actual_num_frames * multiplier, dtype=latent_model_input.dtype, device=device),
|
521 |
+
'context': context,
|
522 |
'num_frames': actual_num_frames,
|
523 |
+
'camera': camera,
|
524 |
}
|
525 |
|
526 |
if image is not None:
|
527 |
+
unet_inputs['ip'] = ip
|
528 |
+
unet_inputs['ip_img'] = ip_img
|
529 |
|
530 |
# predict the noise residual
|
531 |
+
noise_pred = self.unet(**unet_inputs)
|
532 |
|
533 |
# perform guidance
|
534 |
if do_classifier_free_guidance:
|