Spaces:

yzy0713
/

myMagicClothing

Configuration error

App Files Files Community

myMagicClothing / pipelines /OmsAnimateDiffusionPipeline.py

yzy0713

Add files

6a05036 9 months ago

raw

history blame contribute delete

15.1 kB

	from diffusers.pipelines.animatediff.pipeline_animatediff import *

	class OmsAnimateDiffusionPipeline(AnimateDiffPipeline):
	    """AnimateDiff pipeline whose denoising loop adds a separate cloth guidance term.

	    Each denoising step runs the UNet on three copies of the latents. The prediction is
	    split into unconditional, cloth and text parts, which are recombined with two
	    independent weights: `guidance_scale` and `cloth_guidance_scale`.
	    """

	    def _denoise_loop(
	        self,
	        timesteps,
	        num_inference_steps,
	        do_classifier_free_guidance,
	        guidance_scale,
	        cloth_guidance_scale,
	        num_warmup_steps,
	        prompt_embeds,
	        negative_prompt_embeds,
	        latents,
	        cross_attention_kwargs,
	        added_cond_kwargs,
	        extra_step_kwargs,
	        callback,
	        callback_steps,
	        callback_on_step_end,
	        callback_on_step_end_tensor_inputs,
	    ):
	        """Run the denoising loop and return the final latents.

	        Args:
	            timesteps: Iterable of scheduler timesteps to visit, in order.
	            num_inference_steps: Total used to size the progress bar.
	            do_classifier_free_guidance: When true, the latents are tripled before the UNet
	                call and the prediction is split into (uncond, cloth, text) chunks.
	            guidance_scale: Weight of `noise_pred_text - noise_pred_cloth`.
	            cloth_guidance_scale: Weight of `noise_pred_cloth - noise_pred_uncond`.
	            num_warmup_steps: Steps after which progress-bar updates follow `scheduler.order`.
	            prompt_embeds: Passed to the UNet as `encoder_hidden_states`.
	            negative_prompt_embeds: Not used by the UNet call here; exposed to and
	                replaceable by `callback_on_step_end`.
	            latents: Starting latents; updated in place of the local name at every step.
	            cross_attention_kwargs: Forwarded to the UNet.
	            added_cond_kwargs: Forwarded to the UNet.
	            extra_step_kwargs: Extra keyword arguments for `scheduler.step`.
	            callback: Deprecated per-step callback, called as `callback(i, t, latents)`.
	            callback_steps: Interval for `callback`; `None` is treated as every step.
	            callback_on_step_end: Optional callable invoked after each scheduler step as
	                `callback_on_step_end(self, i, t, callback_kwargs)`.
	            callback_on_step_end_tensor_inputs: Names of local tensors to hand to
	                `callback_on_step_end`.

	        Returns:
	            The latents produced by the last scheduler step (or by the last callback override).
	        """
	        # `callback_steps` is None when the caller supplied only the deprecated `callback`;
	        # `i % None` would raise TypeError, so fall back to invoking it on every step.
	        callback_interval = 1 if callback_steps is None else callback_steps

	        with self.progress_bar(total=num_inference_steps) as progress_bar:
	            for i, t in enumerate(timesteps):
	                # Three copies of the latents: one per (uncond, cloth, text) branch.
	                latent_model_input = torch.cat([latents] * 3) if do_classifier_free_guidance else latents
	                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

	                # Predict the noise residual.
	                noise_pred = self.unet(
	                    latent_model_input,
	                    t,
	                    encoder_hidden_states=prompt_embeds,
	                    cross_attention_kwargs=cross_attention_kwargs,
	                    added_cond_kwargs=added_cond_kwargs,
	                ).sample

	                # Combine the three branches with their separate guidance weights.
	                if do_classifier_free_guidance:
	                    noise_pred_uncond, noise_pred_cloth, noise_pred_text = noise_pred.chunk(3)
	                    noise_pred = (
	                        noise_pred_uncond
	                        + guidance_scale * (noise_pred_text - noise_pred_cloth)
	                        + cloth_guidance_scale * (noise_pred_cloth - noise_pred_uncond)
	                    )

	                # Compute the previous noisy sample x_t -> x_t-1.
	                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

	                if callback_on_step_end is not None:
	                    callback_kwargs = {}
	                    # Keep this an explicit loop: inside a comprehension `locals()` may refer
	                    # to the comprehension's own scope instead of this method's.
	                    for k in callback_on_step_end_tensor_inputs:
	                        callback_kwargs[k] = locals()[k]
	                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

	                    # The callback may replace any of these tensors for subsequent steps.
	                    latents = callback_outputs.pop("latents", latents)
	                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
	                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

	                # Advance the progress bar and call the deprecated callback, if provided.
	                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
	                    progress_bar.update()
	                    if callback is not None and i % callback_interval == 0:
	                        callback(i, t, latents)

	        return latents

	    @torch.no_grad()
	    def __call__(
	        self,
	        prompt: Union[str, List[str]] = None,
	        num_frames: Optional[int] = 16,
	        height: Optional[int] = None,
	        width: Optional[int] = None,
	        num_inference_steps: int = 50,
	        guidance_scale: float = 7.5,
	        cloth_guidance_scale: float = 7.5,
	        negative_prompt: Optional[Union[str, List[str]]] = None,
	        num_videos_per_prompt: Optional[int] = 1,
	        eta: float = 0.0,
	        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
	        latents: Optional[torch.FloatTensor] = None,
	        prompt_embeds: Optional[torch.FloatTensor] = None,
	        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	        ip_adapter_image: Optional[PipelineImageInput] = None,
	        output_type: Optional[str] = "pil",
	        return_dict: bool = True,
	        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	        clip_skip: Optional[int] = None,
	        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
	        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
	        **kwargs,
	    ):
	        r"""
	        The call function to the pipeline for generation.

	        Args:
	            prompt (`str` or `List[str]`, optional):
	                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
	            num_frames (`int`, optional, defaults to 16):
	                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per second
	                amounts to 2 seconds of video.
	            height (`int`, optional, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
	                The height in pixels of the generated video.
	            width (`int`, optional, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
	                The width in pixels of the generated video.
	            num_inference_steps (`int`, optional, defaults to 50):
	                The number of denoising steps. More denoising steps usually lead to higher quality videos at the
	                expense of slower inference.
	            guidance_scale (`float`, optional, defaults to 7.5):
	                A higher guidance scale value encourages the model to generate images closely linked to the text
	                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
	                In the denoising loop it weights the difference between the text and cloth noise predictions.
	            cloth_guidance_scale (`float`, optional, defaults to 7.5):
	                Weight applied in the denoising loop to the difference between the cloth and unconditional noise
	                predictions.
	            negative_prompt (`str` or `List[str]`, optional):
	                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
	                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
	            num_videos_per_prompt (`int`, optional, defaults to 1):
	                Currently ignored: the value is overwritten with 1 inside this method.
	            eta (`float`, optional, defaults to 0.0):
	                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
	                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
	            generator (`torch.Generator` or `List[torch.Generator]`, optional):
	                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
	                generation deterministic.
	            latents (`torch.FloatTensor`, optional):
	                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
	                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
	                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
	                `(batch_size, num_channel, num_frames, height, width)`.
	            prompt_embeds (`torch.FloatTensor`, optional):
	                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
	                provided, text embeddings are generated from the `prompt` input argument.
	            negative_prompt_embeds (`torch.FloatTensor`, optional):
	                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
	                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
	            ip_adapter_image: (`PipelineImageInput`, optional):
	                Optional image input to work with IP Adapters.
	            output_type (`str`, optional, defaults to `"pil"`):
	                The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
	                `np.array`.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
	                of a plain tuple.
	            cross_attention_kwargs (`dict`, optional):
	                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
	                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
	            clip_skip (`int`, optional):
	                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
	                the output of the pre-final layer will be used for computing the prompt embeddings.
	            callback_on_step_end (`Callable`, optional):
	                A function that is called at the end of each denoising step during inference. The function is called
	                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
	                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
	                `callback_on_step_end_tensor_inputs`.
	            callback_on_step_end_tensor_inputs (`List`, optional):
	                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
	                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
	                `._callback_tensor_inputs` attribute of your pipeline class.
	            **kwargs:
	                Only the deprecated `callback` and `callback_steps` entries are read.

	        Returns:
	            [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
	                If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
	                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
	        """

	        callback = kwargs.pop("callback", None)
	        callback_steps = kwargs.pop("callback_steps", None)

	        if callback is not None:
	            deprecate(
	                "callback",
	                "1.0.0",
	                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
	            )
	        if callback_steps is not None:
	            deprecate(
	                "callback_steps",
	                "1.0.0",
	                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
	            )

	        # 0. Default height and width to unet
	        height = height or self.unet.config.sample_size * self.vae_scale_factor
	        width = width or self.unet.config.sample_size * self.vae_scale_factor

	        # The caller's value is intentionally discarded; only one video per prompt is generated.
	        num_videos_per_prompt = 1

	        # 1. Check inputs. Raise error if not correct
	        self.check_inputs(
	            prompt,
	            height,
	            width,
	            callback_steps,
	            negative_prompt,
	            prompt_embeds,
	            negative_prompt_embeds,
	            callback_on_step_end_tensor_inputs,
	        )

	        self._guidance_scale = guidance_scale
	        self._clip_skip = clip_skip
	        self._cross_attention_kwargs = cross_attention_kwargs

	        # 2. Define call parameters
	        if prompt is not None and isinstance(prompt, str):
	            batch_size = 1
	        elif prompt is not None and isinstance(prompt, list):
	            batch_size = len(prompt)
	        else:
	            batch_size = prompt_embeds.shape[0]

	        device = self._execution_device

	        # 3. Encode input prompt
	        text_encoder_lora_scale = (
	            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
	        )
	        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
	            prompt,
	            device,
	            num_videos_per_prompt,
	            self.do_classifier_free_guidance,
	            negative_prompt,
	            prompt_embeds=prompt_embeds,
	            negative_prompt_embeds=negative_prompt_embeds,
	            lora_scale=text_encoder_lora_scale,
	            clip_skip=self.clip_skip,
	        )
	        # Build one batch of three embedding sets so a single UNet call covers every branch.
	        # The order must match the `chunk(3)` unpacking in `_denoise_loop` (uncond, cloth, text);
	        # the first two slots both use the negative prompt embeddings.
	        if self.do_classifier_free_guidance:
	            prompt_embeds = torch.cat([negative_prompt_embeds, negative_prompt_embeds, prompt_embeds])

	        if ip_adapter_image is not None:
	            image_embeds = self.prepare_ip_adapter_image_embeds(
	                ip_adapter_image, device, batch_size * num_videos_per_prompt
	            )

	        # 4. Prepare timesteps
	        self.scheduler.set_timesteps(num_inference_steps, device=device)
	        timesteps = self.scheduler.timesteps
	        self._num_timesteps = len(timesteps)

	        # 5. Prepare latent variables
	        num_channels_latents = self.unet.config.in_channels
	        latents = self.prepare_latents(
	            batch_size * num_videos_per_prompt,
	            num_channels_latents,
	            num_frames,
	            height,
	            width,
	            prompt_embeds.dtype,
	            device,
	            generator,
	            latents,
	        )

	        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
	        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

	        # 7. Add image embeds for IP-Adapter
	        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

	        # 8. Denoising loop
	        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
	        denoise_args = {
	            "timesteps": timesteps,
	            "num_inference_steps": num_inference_steps,
	            "do_classifier_free_guidance": self.do_classifier_free_guidance,
	            "guidance_scale": guidance_scale,
	            # Forward the caller's cloth weight; previously `guidance_scale` was passed here,
	            # which made the `cloth_guidance_scale` argument have no effect.
	            "cloth_guidance_scale": cloth_guidance_scale,
	            "num_warmup_steps": num_warmup_steps,
	            "prompt_embeds": prompt_embeds,
	            "negative_prompt_embeds": negative_prompt_embeds,
	            "latents": latents,
	            "cross_attention_kwargs": self.cross_attention_kwargs,
	            "added_cond_kwargs": added_cond_kwargs,
	            "extra_step_kwargs": extra_step_kwargs,
	            "callback": callback,
	            "callback_steps": callback_steps,
	            "callback_on_step_end": callback_on_step_end,
	            "callback_on_step_end_tensor_inputs": callback_on_step_end_tensor_inputs,
	        }

	        if self.free_init_enabled:
	            latents = self._free_init_loop(
	                height=height,
	                width=width,
	                num_frames=num_frames,
	                num_channels_latents=num_channels_latents,
	                batch_size=batch_size,
	                num_videos_per_prompt=num_videos_per_prompt,
	                denoise_args=denoise_args,
	                device=device,
	            )
	        else:
	            latents = self._denoise_loop(**denoise_args)

	        video = self._retrieve_video_frames(latents, output_type, return_dict)

	        # 9. Offload all models
	        self.maybe_free_model_hooks()

	        return video