Spaces:

Fabrice-TIERCELIN
/

HunyuanVideo

Running

App Files Files Community

HunyuanVideo / hyvideo /modules /embed_layers.py

Fabrice-TIERCELIN

Upload 11 files

797fd08 verified 13 days ago

raw

history blame contribute delete

4.85 kB

	import math
	import torch
	import torch.nn as nn
	from einops import rearrange, repeat

	from ..utils.helpers import to_2tuple


	class PatchEmbed(nn.Module):
	"""2D Image to Patch Embedding

	Image to Patch Embedding using Conv2d

	A convolution based approach to patchifying a 2D image w/ embedding projection.

	Based on the impl in https://github.com/google-research/vision_transformer

	Hacked together by / Copyright 2020 Ross Wightman

	Remove the _assert function in forward function to be compatible with multi-resolution images.
	"""

	def __init__(
	self,
	patch_size=16,
	in_chans=3,
	embed_dim=768,
	norm_layer=None,
	flatten=True,
	bias=True,
	dtype=None,
	device=None,
	):
	factory_kwargs = {"dtype": dtype, "device": device}
	super().__init__()
	patch_size = to_2tuple(patch_size)
	self.patch_size = patch_size
	self.flatten = flatten

	self.proj = nn.Conv3d(
	in_chans,
	embed_dim,
	kernel_size=patch_size,
	stride=patch_size,
	bias=bias,
	**factory_kwargs
	)
	nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
	if bias:
	nn.init.zeros_(self.proj.bias)

	self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

	def forward(self, x):
	x = self.proj(x)
	if self.flatten:
	x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
	x = self.norm(x)
	return x


	class TextProjection(nn.Module):
	"""
	Projects text embeddings. Also handles dropout for classifier-free guidance.

	Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
	"""

	def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
	factory_kwargs = {"dtype": dtype, "device": device}
	super().__init__()
	self.linear_1 = nn.Linear(
	in_features=in_channels,
	out_features=hidden_size,
	bias=True,
	**factory_kwargs
	)
	self.act_1 = act_layer()
	self.linear_2 = nn.Linear(
	in_features=hidden_size,
	out_features=hidden_size,
	bias=True,
	**factory_kwargs
	)

	def forward(self, caption):
	hidden_states = self.linear_1(caption)
	hidden_states = self.act_1(hidden_states)
	hidden_states = self.linear_2(hidden_states)
	return hidden_states


	def timestep_embedding(t, dim, max_period=10000):
	"""
	Create sinusoidal timestep embeddings.

	Args:
	t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
	dim (int): the dimension of the output.
	max_period (int): controls the minimum frequency of the embeddings.

	Returns:
	embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.

	.. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
	"""
	half = dim // 2
	freqs = torch.exp(
	-math.log(max_period)
	* torch.arange(start=0, end=half, dtype=torch.float32)
	/ half
	).to(device=t.device)
	args = t[:, None].float() * freqs[None]
	embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
	if dim % 2:
	embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
	return embedding


	class TimestepEmbedder(nn.Module):
	"""
	Embeds scalar timesteps into vector representations.
	"""

	def __init__(
	self,
	hidden_size,
	act_layer,
	frequency_embedding_size=256,
	max_period=10000,
	out_size=None,
	dtype=None,
	device=None,
	):
	factory_kwargs = {"dtype": dtype, "device": device}
	super().__init__()
	self.frequency_embedding_size = frequency_embedding_size
	self.max_period = max_period
	if out_size is None:
	out_size = hidden_size

	self.mlp = nn.Sequential(
	nn.Linear(
	frequency_embedding_size, hidden_size, bias=True, **factory_kwargs
	),
	act_layer(),
	nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
	)
	nn.init.normal_(self.mlp[0].weight, std=0.02)
	nn.init.normal_(self.mlp[2].weight, std=0.02)

	def forward(self, t):
	t_freq = timestep_embedding(
	t, self.frequency_embedding_size, self.max_period
	).type(self.mlp[0].weight.dtype)
	t_emb = self.mlp(t_freq)
	return t_emb