Spaces:

Baraaqasem
/

Imag

Runtime error

App Files Files Community

Imag/src/videogen_hub/infermodels/opensora_12.py

Baraaqasem

Upload 49 files

413d4d0 verified 2 months ago

raw

history blame

6.39 kB

	import os

	from huggingface_hub import snapshot_download, hf_hub_download

	from videogen_hub import MODEL_PATH


	class OpenSora12:
	    """Text-to-video wrapper around the Open-Sora 1.2 inference script.

	    The constructor fetches the checkpoints from the Hugging Face Hub and
	    builds the inference configuration; ``infer_one_video`` updates that
	    configuration per request and runs the pipeline.
	    """

	    def __init__(self, device="gpu"):
	        """
	        1. Download the pretrained weights into sub-directories of MODEL_PATH
	           (OpenSora-STDiT-v3, STDiT3-XL_2, OpenSora-VAE-v1.2, t5-v1_1-xxl).
	        2. Store the inference entry point and its configuration.

	        Note: it seems that the model cannot run on cpu.

	        Args:
	            device: 'gpu' or 'cpu'. Accepted for interface compatibility;
	                this class does not read it, so device placement is left
	                to the inference script.
	        """
	        # Imported lazily so that importing this module does not require
	        # mmengine or the Open-Sora pipeline to be installed.
	        from mmengine import Config as mmengine_config
	        from videogen_hub.pipelines.opensora.scripts.inference import main

	        # Mirror the whole STDiT-v3 repository locally. The returned path is
	        # not needed: the config below points at MODEL_PATH/STDiT3-XL_2.
	        # NOTE(review): this looks redundant with the single-file download
	        # of model.safetensors further down - confirm before removing.
	        snapshot_download(
	            "hpcai-tech/OpenSora-STDiT-v3",
	            local_dir=os.path.join(MODEL_PATH, "OpenSora-STDiT-v3"),
	        )
	        self.pipeline = main

	        settings = {
	            # Basic video frame settings
	            "num_frames": 51,  # Total number of frames in a clip
	            "frame_interval": 1,  # Interval between frames
	            "fps": 24,  # Frames per second
	            "image_size": [480, 854],  # Resolution of each frame (height, width)
	            # Model configuration for multi-resolution and specific model parameters
	            "multi_resolution": "STDiT2",  # Multi-resolution model type
	            "model": {
	                "type": "STDiT3-XL/2",  # Model type and size
	                "from_pretrained": os.path.join(MODEL_PATH, "STDiT3-XL_2"),  # Path to pretrained checkpoint
	                "file_name": "model.safetensors",  # Name of the model file
	                "input_sq_size": 512,  # Input square size for the model
	                "qk_norm": True,  # Whether to normalize query-key in attention
	                "enable_flashattn": False,  # Requires the flash_attn package
	                "enable_layernorm_kernel": False,  # Requires the apex package
	            },
	            # Variational Autoencoder (VAE) specific settings
	            "vae": {
	                "type": "OpenSoraVAE_V1_2",  # Type of the autoencoder
	                "from_pretrained": "hpcai-tech/OpenSora-VAE-v1.2",  # Pretrained model from Hugging Face
	                # "cache_dir": os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),  # Local cache directory for model weights
	                "micro_frame_size": 17,
	                "micro_batch_size": 4,  # Batch size for processing
	            },
	            # Text encoder settings for embedding textual information
	            "text_encoder": {
	                "type": "t5",  # Text encoder model type
	                "from_pretrained": "DeepFloyd/t5-v1_1-xxl",  # Pretrained model
	                "cache_dir": os.path.join(MODEL_PATH, "t5-v1_1-xxl"),  # Cache directory
	                "model_max_length": 300,  # Max length of text inputs
	            },
	            # Scheduler settings for diffusion models
	            "scheduler": {
	                "type": "rflow",  # Type of scheduler for the diffusion process
	                "num_sampling_steps": 30,  # Number of sampling steps in diffusion
	                "cfg_scale": 7.0,  # Scale for classifier-free guidance
	                # "cfg_channel": 3,  # Number of channels for guidance
	            },
	            # Additional settings for processing and output
	            "dtype": "bf16",  # Data type for computation (bfloat16)
	            # "prompt_path": "./assets/texts/t2v_samples.txt",  # Path to text prompts
	            "prompt_path": None,  # Path to text prompts
	            "prompt": [
	                "A beautiful sunset over the city"
	            ],  # List of prompts for generation
	            "batch_size": 1,  # Batch size for generation
	            "seed": 42,  # Seed for random number generators
	            "save_dir": "./samples/samples/",  # Directory to save generated samples
	            "config": "sample.py",  # Path to this configuration file
	            "prompt_as_path": False,  # Treat the prompt as a file path (True/False)
	            "reference_path": None,  # Path to reference image/video for conditioning
	            "loop": 1,  # Number of times to loop the processing
	            "sample_name": None,  # Specific name for the generated sample
	            "num_sample": 1,  # Number of samples to generate
	            "aes": 6.5,
	            "flow": None,
	        }
	        self.config = mmengine_config(settings)

	        # Diffusion transformer weights, placed where model.from_pretrained points.
	        hf_hub_download(
	            repo_id="hpcai-tech/OpenSora-STDiT-v3",
	            filename="model.safetensors",
	            local_dir=self.config.model.from_pretrained,
	        )

	        # VAE weights.
	        # NOTE(review): the "vae" config refers to the hub id and has no
	        # cache_dir, so this local copy may not be the one that gets loaded
	        # - confirm against the pipeline.
	        hf_hub_download(
	            repo_id="hpcai-tech/OpenSora-VAE-v1.2",
	            filename="model.safetensors",
	            local_dir=os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),
	        )

	        # T5 text encoder weights.
	        # NOTE(review): only the first of two shards is fetched here;
	        # presumably the encoder loader downloads the rest - verify.
	        hf_hub_download(
	            repo_id="DeepFloyd/t5-v1_1-xxl",
	            filename="pytorch_model-00001-of-00002.bin",
	            local_dir=self.config.text_encoder.cache_dir,
	        )

	    def infer_one_video(
	        self,
	        prompt: str = None,
	        size: list = [320, 512],
	        seconds: int = 2,
	        fps: int = 8,
	        seed: int = 42,
	    ):
	        """
	        Generates a single video based on the provided prompt and parameters.

	        The request is written into ``self.config`` (num_frames, fps, seed,
	        prompt, image_size), so these values persist on the instance until
	        the next call overrides them.

	        Args:
	            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
	            size (list, optional): The resolution of the video, passed to the
	                pipeline as ``image_size``. Defaults to [320, 512].
	            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
	            fps (int, optional): The frames per second of the video. Defaults to 8.
	            seed (int, optional): The seed for random number generation. Defaults to 42.

	        Returns:
	            torch.Tensor: The generated video as a float32 tensor on the CPU,
	            laid out as [f, H, W, C] (assuming the pipeline yields
	            [1, C, f, H, W] samples).
	        """
	        self.config.num_frames = fps * seconds
	        self.config.fps = fps
	        self.config.seed = seed
	        self.config.prompt = [prompt]
	        # Store a copy: the default list object is shared between calls, so
	        # handing it out directly would let any in-place change to the
	        # config leak into every later call that relies on the default.
	        self.config.image_size = list(size)

	        all_batch_samples = self.pipeline(self.config)

	        # First sample of the first batch; expected torch.Size([1, C, f, H, W]).
	        sample = all_batch_samples[0][0]

	        # torch.Size([1, C, f, H, W]) -> torch.Size([f, H, W, C])
	        # BFloat16 -> Float
	        output = sample.squeeze(0).permute(1, 2, 3, 0).cpu().float()

	        return output