# SVFR-demo / src/models/svfr_adapter/unet_3d_svd_condition_ip.py
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union, Any
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import UNet2DConditionLoadersMixin
from diffusers.utils import BaseOutput, logging
from diffusers.models.attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from diffusers.models.modeling_utils import ModelMixin
from src.models.svfr_adapter.unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block, get_up_block
from src.models.svfr_adapter.attention_processor import AttnProcessor2_0, AttnProcessor, IPAdapterAttnProcessor2_0, IPAdapterAttnProcessor
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@dataclass
class UNet3DConditionSVDOutput(BaseOutput):
"""
The output of [`UNet3DConditionSVDModel`].
Args:
sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            The hidden states output conditioned on the `encoder_hidden_states` input. Output of the last layer of the model.
"""
sample: torch.FloatTensor = None
class UNet3DConditionSVDModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
r"""
    A conditional spatio-temporal UNet model that takes noisy video frames, a conditional state, and a
    timestep, and returns a sample-shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods
    implemented for all models (such as downloading or saving).
Parameters:
sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
Height and width of input/output sample.
in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
The tuple of downsample blocks to use.
up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
The tuple of upsample blocks to use.
block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
The tuple of output channels for each block.
        addition_time_embed_dim (`int`, defaults to 256):
            Dimension used to encode the additional time ids.
projection_class_embeddings_input_dim (`int`, defaults to 768):
The dimension of the projection of encoded `added_time_ids`.
layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1024):
            The dimension of the cross attention features.
transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
[`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
[`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
        num_attention_heads (`int` or `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
            The number of attention heads.
        num_frames (`int`, *optional*, defaults to 25):
            The number of video frames the model is configured for.
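
    Example:
        A minimal forward-pass sketch using the defaults above (shapes are illustrative assumptions,
        not values taken from a released checkpoint):

        >>> unet = UNet3DConditionSVDModel(sample_size=64)
        >>> sample = torch.randn(1, 25, 8, 64, 64)    # (batch, frames, in_channels, height, width)
        >>> states = torch.randn(25, 1, 1024)         # one row per frame; pipeline-dependent assumption
        >>> added_time_ids = torch.randn(1, 3)        # 3 ids * 256 = 768 = projection_class_embeddings_input_dim
        >>> out = unet(sample, 10, states, added_time_ids=added_time_ids).sample  # (1, 25, 4, 64, 64)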
"""
_supports_gradient_checkpointing = True
@register_to_config
def __init__(
self,
sample_size: Optional[int] = None,
in_channels: int = 8,
out_channels: int = 4,
down_block_types: Tuple[str] = (
"CrossAttnDownBlockSpatioTemporal",
"CrossAttnDownBlockSpatioTemporal",
"CrossAttnDownBlockSpatioTemporal",
"DownBlockSpatioTemporal",
),
up_block_types: Tuple[str] = (
"UpBlockSpatioTemporal",
"CrossAttnUpBlockSpatioTemporal",
"CrossAttnUpBlockSpatioTemporal",
"CrossAttnUpBlockSpatioTemporal",
),
block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
addition_time_embed_dim: int = 256,
projection_class_embeddings_input_dim: int = 768,
layers_per_block: Union[int, Tuple[int]] = 2,
cross_attention_dim: Union[int, Tuple[int]] = 1024,
transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
num_frames: int = 25,
):
super().__init__()
self.sample_size = sample_size
# Check inputs
if len(down_block_types) != len(up_block_types):
raise ValueError(
f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
)
if len(block_out_channels) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
)
if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
)
        if not isinstance(cross_attention_dim, int) and len(cross_attention_dim) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
)
if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
raise ValueError(
f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
)
# input
self.conv_in = nn.Conv2d(
in_channels,
block_out_channels[0],
kernel_size=3,
padding=1,
)
# time
time_embed_dim = block_out_channels[0] * 4
self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
timestep_input_dim = block_out_channels[0]
self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
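        # Each additional time id is sinusoidally embedded to `addition_time_embed_dim`; the flattened
        # concatenation (num_ids * addition_time_embed_dim) must match `projection_class_embeddings_input_dim`.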
self.down_blocks = nn.ModuleList([])
self.up_blocks = nn.ModuleList([])
if isinstance(num_attention_heads, int):
num_attention_heads = (num_attention_heads,) * len(down_block_types)
if isinstance(cross_attention_dim, int):
cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
if isinstance(layers_per_block, int):
layers_per_block = [layers_per_block] * len(down_block_types)
if isinstance(transformer_layers_per_block, int):
transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
blocks_time_embed_dim = time_embed_dim
# down
output_channel = block_out_channels[0]
for i, down_block_type in enumerate(down_block_types):
input_channel = output_channel
output_channel = block_out_channels[i]
is_final_block = i == len(block_out_channels) - 1
down_block = get_down_block(
down_block_type,
num_layers=layers_per_block[i],
transformer_layers_per_block=transformer_layers_per_block[i],
in_channels=input_channel,
out_channels=output_channel,
temb_channels=blocks_time_embed_dim,
add_downsample=not is_final_block,
resnet_eps=1e-5,
cross_attention_dim=cross_attention_dim[i],
num_attention_heads=num_attention_heads[i],
resnet_act_fn="silu",
)
self.down_blocks.append(down_block)
# mid
self.mid_block = UNetMidBlockSpatioTemporal(
block_out_channels[-1],
temb_channels=blocks_time_embed_dim,
transformer_layers_per_block=transformer_layers_per_block[-1],
cross_attention_dim=cross_attention_dim[-1],
num_attention_heads=num_attention_heads[-1],
)
# count how many layers upsample the images
self.num_upsamplers = 0
# up
reversed_block_out_channels = list(reversed(block_out_channels))
reversed_num_attention_heads = list(reversed(num_attention_heads))
reversed_layers_per_block = list(reversed(layers_per_block))
reversed_cross_attention_dim = list(reversed(cross_attention_dim))
reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
output_channel = reversed_block_out_channels[0]
for i, up_block_type in enumerate(up_block_types):
is_final_block = i == len(block_out_channels) - 1
prev_output_channel = output_channel
output_channel = reversed_block_out_channels[i]
input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
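            # `input_channel` is the channel count fed in from the next-deeper level of the down path.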
# add upsample block for all BUT final layer
if not is_final_block:
add_upsample = True
self.num_upsamplers += 1
else:
add_upsample = False
up_block = get_up_block(
up_block_type,
num_layers=reversed_layers_per_block[i] + 1,
transformer_layers_per_block=reversed_transformer_layers_per_block[i],
in_channels=input_channel,
out_channels=output_channel,
prev_output_channel=prev_output_channel,
temb_channels=blocks_time_embed_dim,
add_upsample=add_upsample,
resnet_eps=1e-5,
resolution_idx=i,
cross_attention_dim=reversed_cross_attention_dim[i],
num_attention_heads=reversed_num_attention_heads[i],
resnet_act_fn="silu",
)
self.up_blocks.append(up_block)
prev_output_channel = output_channel
# out
self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
self.conv_act = nn.SiLU()
self.conv_out = nn.Conv2d(
block_out_channels[0],
out_channels,
kernel_size=3,
padding=1,
)
@property
def attn_processors(self) -> Dict[str, AttentionProcessor]:
r"""
Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the
            model, indexed by their weight names.
"""
# set recursively
processors = {}
def fn_recursive_add_processors(
name: str,
module: torch.nn.Module,
processors: Dict[str, AttentionProcessor],
):
if hasattr(module, "get_processor"):
processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
return processors
for name, module in self.named_children():
fn_recursive_add_processors(name, module, processors)
return processors
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Sets the attention processor to use to compute attention.
Parameters:
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
The instantiated processor class or a dictionary of processor classes that will be set as the processor
for **all** `Attention` layers.
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
processor. This is strongly recommended when setting trainable attention processors.
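
        Example (a minimal sketch; assigns the same processor instance to every attention layer):

        >>> unet.set_attn_processor(AttnProcessor2_0())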
"""
count = len(self.attn_processors.keys())
if isinstance(processor, dict) and len(processor) != count:
raise ValueError(
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
)
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
module.set_processor(processor)
else:
module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
processor = AttnProcessor()
else:
raise ValueError(
f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
)
self.set_attn_processor(processor)
def _set_gradient_checkpointing(self, module, value=False):
if hasattr(module, "gradient_checkpointing"):
module.gradient_checkpointing = value
# Copied from diffusers.models.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
"""
        Enables [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) in the feed-forward layers.
Parameters:
chunk_size (`int`, *optional*):
The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
over each tensor of dim=`dim`.
dim (`int`, *optional*, defaults to `0`):
The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
or dim=1 (sequence length).
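
        Example:

        >>> unet.enable_forward_chunking(chunk_size=1, dim=0)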
"""
if dim not in [0, 1]:
raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
# By default chunk size is 1
chunk_size = chunk_size or 1
def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
if hasattr(module, "set_chunk_feed_forward"):
module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
for child in module.children():
fn_recursive_feed_forward(child, chunk_size, dim)
for module in self.children():
fn_recursive_feed_forward(module, chunk_size, dim)
def forward(
self,
sample: torch.FloatTensor,
timestep: Union[torch.Tensor, float, int],
encoder_hidden_states: torch.Tensor,
down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
mid_block_additional_residual: Optional[torch.Tensor] = None,
return_dict: bool = True,
        added_time_ids: Optional[torch.Tensor] = None,
pose_cond_fea: Optional[torch.Tensor] = None,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[UNet3DConditionSVDOutput, Tuple]:
r"""
        The [`UNet3DConditionSVDModel`] forward method.
Args:
sample (`torch.FloatTensor`):
The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.Tensor` or `float` or `int`): The current denoising timestep.
encoder_hidden_states (`torch.FloatTensor`):
The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
            added_time_ids (`torch.FloatTensor`):
                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with
                sinusoidal embeddings and added to the time embeddings.
            pose_cond_fea (`torch.FloatTensor`, *optional*):
                Per-frame conditioning features of shape `(batch, num_frames, block_out_channels[0],
                height, width)`, added to the hidden states right after `conv_in`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`UNet3DConditionSVDOutput`] instead of a plain tuple.
        Returns:
            [`UNet3DConditionSVDOutput`] or `tuple`:
                If `return_dict` is True, a [`UNet3DConditionSVDOutput`] is returned; otherwise a `tuple`
                is returned where the first element is the sample tensor.
"""
# 1. time
timesteps = timestep
if not torch.is_tensor(timesteps):
# TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
# This would be a good case for the `match` statement (Python 3.10+)
is_mps = sample.device.type == "mps"
if isinstance(timestep, float):
dtype = torch.float32 if is_mps else torch.float64
else:
dtype = torch.int32 if is_mps else torch.int64
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
elif len(timesteps.shape) == 0:
timesteps = timesteps[None].to(sample.device)
batch_size, num_frames = sample.shape[:2]
timesteps = timesteps.expand(batch_size)
t_emb = self.time_proj(timesteps)
t_emb = t_emb.to(dtype=sample.dtype)
emb = self.time_embedding(t_emb)
time_embeds = self.add_time_proj(added_time_ids.flatten())
time_embeds = time_embeds.reshape((batch_size, -1))
time_embeds = time_embeds.to(emb.dtype)
aug_emb = self.add_embedding(time_embeds)
emb = emb + aug_emb
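        # Flatten batch and frames: (B, F, C, H, W) -> (B*F, C, H, W), and repeat the time
        # embedding once per frame to match.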
sample = sample.flatten(0, 1)
emb = emb.repeat_interleave(num_frames, dim=0)
# 2. pre-process
sample = self.conv_in(sample)
if pose_cond_fea is not None:
sample = sample + pose_cond_fea.flatten(0, 1)
image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
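        # 3. down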
down_block_res_samples = (sample,)
for downsample_block in self.down_blocks:
if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
sample, res_samples = downsample_block(
hidden_states=sample,
temb=emb,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
image_only_indicator=image_only_indicator,
)
else:
sample, res_samples = downsample_block(
hidden_states=sample,
temb=emb,
image_only_indicator=image_only_indicator,
)
down_block_res_samples += res_samples
# 4. mid
sample = self.mid_block(
hidden_states=sample,
temb=emb,
encoder_hidden_states=encoder_hidden_states,
image_only_indicator=image_only_indicator,
cross_attention_kwargs=cross_attention_kwargs,
)
# 5. up
for i, upsample_block in enumerate(self.up_blocks):
res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
sample = upsample_block(
hidden_states=sample,
temb=emb,
res_hidden_states_tuple=res_samples,
encoder_hidden_states=encoder_hidden_states,
cross_attention_kwargs=cross_attention_kwargs,
image_only_indicator=image_only_indicator,
)
else:
sample = upsample_block(
hidden_states=sample,
temb=emb,
res_hidden_states_tuple=res_samples,
image_only_indicator=image_only_indicator,
)
# 6. post-process
sample = self.conv_norm_out(sample)
sample = self.conv_act(sample)
sample = self.conv_out(sample)
# 7. Reshape back to original shape
sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
if not return_dict:
return (sample,)
return UNet3DConditionSVDOutput(sample=sample)
def init_ip_adapters(unet, num_adapter_embeds=(), scale=1.0):
    """Replace the UNet's cross-attention processors with IP-Adapter variants.

    Self-attention layers ("attn1") keep a plain processor; cross-attention layers get an IP-Adapter
    processor whose `to_k_ip`/`to_v_ip` projections are initialized from the corresponding `to_k`/`to_v`
    weights of the base UNet. Returns the new processors as a trainable `ModuleList`.
    """
    # init adapter modules
attn_procs = {}
unet_sd = unet.state_dict()
for name in unet.attn_processors.keys():
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
if name.startswith("mid_block"):
hidden_size = unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
# if cross_attention_dim is None or "temporal_transformer_blocks" in name:
if cross_attention_dim is None:
attn_processor_class = (
AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
)
attn_procs[name] = attn_processor_class()
else:
attn_processor_class = (
IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
)
attn_procs[name] = attn_processor_class(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
num_tokens=num_adapter_embeds,
scale=scale
)
layer_name = name.split(".processor")[0]
weights = {}
for i in range(len(num_adapter_embeds)):
weights.update({f"to_k_ip.{i}.weight": unet_sd[layer_name + ".to_k.weight"]})
weights.update({f"to_v_ip.{i}.weight": unet_sd[layer_name + ".to_v.weight"]})
attn_procs[name].load_state_dict(weights)
unet.set_attn_processor(attn_procs)
adapter_modules = torch.nn.ModuleList(unet.attn_processors.values())
return adapter_modules
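
# Example (hypothetical usage; `num_adapter_embeds` is assumed to hold the token count of each
# image-embedding stream routed through the IP-Adapter processors):
#
#   unet = UNet3DConditionSVDModel()
#   adapter_modules = init_ip_adapters(unet, num_adapter_embeds=[4], scale=1.0)
#   optimizer = torch.optim.AdamW(adapter_modules.parameters(), lr=1e-4)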