import os

from huggingface_hub import snapshot_download, hf_hub_download

from videogen_hub import MODEL_PATH


class OpenSora12:
    def __init__(self, device="gpu"):
        """
        1. Download the pretrained model weights into subdirectories of MODEL_PATH.
        2. Create the inference pipeline.
        Note: the model loaded from the downloaded checkpoints does not appear to support CPU inference.
        Args:
            device (str, optional): 'gpu' or 'cpu', the device to run the model on. Defaults to 'gpu'.
        """
        from mmengine import Config as mmengine_config
        from videogen_hub.pipelines.opensora.scripts.inference import main

        model_path = snapshot_download(
            "hpcai-tech/OpenSora-STDiT-v3",
            local_dir=os.path.join(MODEL_PATH, "OpenSora-STDiT-v3"),
        )

        self.pipeline = main
        self.config = {
            # Basic video frame settings
            "num_frames": 51,  # Total number of frames in a clip
            "frame_interval": 1,  # Interval between frames
            "fps": 24,  # Frames per second
            "image_size": [480, 854],  # Resolution of each frame (height, width)
            # Model configuration for multi-resolution and specific model parameters
            "multi_resolution": "STDiT2",  # Multi-resolution model type
            "model": {
                "type": "STDiT3-XL/2",  # Model type and size
                "from_pretrained": os.path.join(MODEL_PATH, "STDiT3-XL_2"),  # Path to pretrained checkpoint
                "file_name": "model.safetensors",  # Name of the model file
                "input_sq_size": 512,  # Input square size for the model
                "qk_norm": True,  # Whether to normalize query-key in attention
                "enable_flashattn": False,  # Enable flash attention; requires the flash_attn package
                "enable_layernorm_kernel": False,  # Enable fused layer-norm kernel; requires the apex package
            },
            # Variational Autoencoder (VAE) specific settings
            "vae": {
                "type": "OpenSoraVAE_V1_2",  # Type of the autoencoder
                "from_pretrained": "hpcai-tech/OpenSora-VAE-v1.2",  # Pretrained model from Hugging Face
                # "cache_dir": os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),  # Local cache directory for model weights
                "micro_frame_size": 17,  # Number of frames processed per VAE micro-chunk
                "micro_batch_size": 4,  # Batch size for processing
            },
            # Text encoder settings for embedding textual information
            "text_encoder": {
                "type": "t5",  # Text encoder model type
                "from_pretrained": "DeepFloyd/t5-v1_1-xxl",  # Pretrained model
                "cache_dir": os.path.join(MODEL_PATH, "t5-v1_1-xxl"),  # Cache directory
                "model_max_length": 300,  # Max length of text inputs
            },
            # Scheduler settings for diffusion models
            "scheduler": {
                "type": "rflow",  # Type of scheduler for the diffusion process
                "num_sampling_steps": 30,  # Number of sampling steps in diffusion
                "cfg_scale": 7.0,  # Scale for classifier-free guidance
                # "cfg_channel": 3,  # Number of channels for guidance
            },
            # Additional settings for processing and output
            "dtype": "bf16",  # Data type for computation (bfloat16)
            # "prompt_path": "./assets/texts/t2v_samples.txt",  # Path to text prompts
            "prompt_path": None,  # Path to text prompts
            "prompt": [
                "A beautiful sunset over the city"
            ],  # List of prompts for generation
            "batch_size": 1,  # Batch size for generation
            "seed": 42,  # Seed for random number generators
            "save_dir": "./samples/samples/",  # Directory to save generated samples
            "config": "sample.py",  # Path to this configuration file
            "prompt_as_path": False,  # Treat the prompt as a file path (True/False)
            "reference_path": None,  # Path to reference image/video for conditioning
            "loop": 1,  # Number of times to loop the processing
            "sample_name": None,  # Specific name for the generated sample
            "num_sample": 1,  # Number of samples to generate
            "aes": 6.5,  # Aesthetic score used to condition the prompt
            "flow": None,  # Motion (flow) score used to condition the prompt; None disables it
        }
        self.config = mmengine_config(self.config)

        hf_hub_download(
            repo_id="hpcai-tech/OpenSora-STDiT-v3",
            filename="model.safetensors",
            local_dir=self.config.model.from_pretrained,
        )
        hf_hub_download(
            repo_id="hpcai-tech/OpenSora-VAE-v1.2",
            filename="model.safetensors",
            local_dir=os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),
        )
        hf_hub_download(
            repo_id="DeepFloyd/t5-v1_1-xxl",
            filename="pytorch_model-00001-of-00002.bin",
            local_dir=self.config.text_encoder.cache_dir,
        )
    def infer_one_video(
        self,
        prompt: str = None,
        size: list = [320, 512],
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """
        Generates a single video based on the provided prompt and parameters.
        The resolution of the generated video is determined by `size` (height, width).
        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): The resolution of the video as [height, width]. Defaults to [320, 512].
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.
        Returns:
            torch.Tensor: The generated video as a tensor of shape (frames, height, width, channels).
        """
        self.config.num_frames = fps * seconds
        self.config.fps = fps
        self.config.seed = seed
        self.config.prompt = [prompt]
        self.config.image_size = size

        all_batch_samples = self.pipeline(self.config)

        # Take the first sample of the first batch.
        sample = all_batch_samples[0][0]
        # sample has shape torch.Size([1, C, f, H, W])
        output = sample.squeeze(0).permute(1, 2, 3, 0).cpu().float()
        # torch.Size([1, C, f, H, W]) -> torch.Size([f, H, W, C])
        # BFloat16 -> Float
        return output
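

if __name__ == "__main__":
    # Minimal usage sketch: assumes a CUDA-capable GPU and the Open-Sora pipeline
    # dependencies bundled with videogen_hub are installed. Parameter values here
    # are illustrative, not required defaults.
    model = OpenSora12()
    video = model.infer_one_video(
        prompt="A beautiful sunset over the city",
        size=[480, 854],  # [height, width]
        seconds=2,
        fps=24,
        seed=42,
    )
    # Roughly fps * seconds frames are requested; the returned tensor is
    # (frames, height, width, channels) on CPU in float32.
    print(video.shape)
    # Per the save_dir setting above, generated samples should also be written
    # under ./samples/samples/ by the underlying inference script.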