import os
from typing import Optional

import torch
from huggingface_hub import hf_hub_download, snapshot_download

from videogen_hub import MODEL_PATH


class T2VTurbo:
    """Wrapper that downloads T2V-Turbo weights and exposes single-video inference.

    Depending on the constructor arguments, ``self.pipeline`` is either a
    ``T2VTurboVC2Pipeline1`` or a ``T2VTurboMSPipeline1``.
    """

    def __init__(self, base_model="vc2", merged=True, device="cuda"):
        """
        1. Download the pretrained model and put it inside MODEL_PATH
        2. Create Pipeline

        Args:
            base_model: which base model to build on. ``"vc2"`` selects the
                VC2 pipeline; ANY other value falls through to the
                ModelScope (MS) pipeline.
            merged: only consulted when ``base_model == "vc2"``. If truthy,
                the single merged checkpoint ``t2v_turbo_vc2.pt`` is
                downloaded; otherwise the base checkpoint and the
                ``unet_lora.pt`` weights are downloaded separately.
            device: 'cuda' or 'cpu' the device to use the model
        """
        # Imported lazily so that importing this module does not pull in the
        # pipeline implementations until a model is actually constructed.
        from videogen_hub.pipelines.t2v_turbo.inference_vc2 import T2VTurboVC2Pipeline1
        from videogen_hub.pipelines.t2v_turbo.inference_ms import T2VTurboMSPipeline1

        # The config is stored on the instance for every base model, but only
        # the VC2 pipelines receive it.
        self.config = self._default_vc2_config()

        if base_model == "vc2" and merged:
            merged_model_path = hf_hub_download(
                repo_id="jiachenli-ucsb/T2V-Turbo-VC2-Merged",
                filename="t2v_turbo_vc2.pt",
                local_dir=os.path.join(MODEL_PATH, "T2V-Turbo-VC2"),
            )
            # No separate LoRA file in the merged case, hence the None.
            self.pipeline = T2VTurboVC2Pipeline1(
                self.config, merged, device, None, merged_model_path
            )
        elif base_model == "vc2":
            base_model_path = hf_hub_download(
                repo_id="VideoCrafter/VideoCrafter2",
                filename="model.ckpt",
                local_dir=os.path.join(MODEL_PATH, "videocrafter2"),
            )
            unet_lora_path = hf_hub_download(
                repo_id="jiachenli-ucsb/T2V-Turbo-VC2",
                filename="unet_lora.pt",
                local_dir=os.path.join(MODEL_PATH, "T2V-Turbo-VC2"),
            )
            # It uses the config provided above.
            self.pipeline = T2VTurboVC2Pipeline1(
                self.config, merged, device, unet_lora_path, base_model_path
            )
        else:
            base_model_path = snapshot_download(
                repo_id="ali-vilab/text-to-video-ms-1.7b",
                local_dir=os.path.join(MODEL_PATH, "modelscope_1.7b"),
            )
            unet_lora_path = hf_hub_download(
                repo_id="jiachenli-ucsb/T2V-Turbo-MS",
                filename="unet_lora.pt",
                local_dir=os.path.join(MODEL_PATH, "T2V-Turbo-MS"),
            )
            # It uses the config provided by base_model.
            self.pipeline = T2VTurboMSPipeline1(device, unet_lora_path, base_model_path)

    @staticmethod
    def _default_vc2_config():
        """Return a fresh copy of the model config handed to the VC2 pipeline."""
        return {
            "model": {
                "target": "lvdm.models.ddpm3d.LatentDiffusion",
                "params": {
                    "linear_start": 0.00085,
                    "linear_end": 0.012,
                    "num_timesteps_cond": 1,
                    "timesteps": 1000,
                    "first_stage_key": "video",
                    "cond_stage_key": "caption",
                    "cond_stage_trainable": False,
                    "conditioning_key": "crossattn",
                    "image_size": [320, 512],
                    "channels": 4,
                    "scale_by_std": False,
                    "scale_factor": 0.18215,
                    "use_ema": False,
                    "uncond_type": "empty_seq",
                    "use_scale": True,
                    "scale_b": 0.7,
                    "unet_config": {
                        "target": "lvdm.modules.networks.openaimodel3d.UNetModel",
                        "params": {
                            "in_channels": 4,
                            "out_channels": 4,
                            "model_channels": 320,
                            "attention_resolutions": [4, 2, 1],
                            "num_res_blocks": 2,
                            "channel_mult": [1, 2, 4, 4],
                            "num_head_channels": 64,
                            "transformer_depth": 1,
                            "context_dim": 1024,
                            "use_linear": True,
                            "use_checkpoint": True,
                            "temporal_conv": True,
                            "temporal_attention": True,
                            "temporal_selfatt_only": True,
                            "use_relative_position": False,
                            "use_causal_attention": False,
                            "temporal_length": 16,
                            "addition_attention": True,
                            "fps_cond": True,
                        },
                    },
                    "first_stage_config": {
                        "target": "lvdm.models.autoencoder.AutoencoderKL",
                        "params": {
                            "embed_dim": 4,
                            "monitor": "val / rec_loss",
                            "ddconfig": {
                                "double_z": True,
                                "z_channels": 4,
                                "resolution": 512,
                                "in_channels": 3,
                                "out_ch": 3,
                                "ch": 128,
                                "ch_mult": [1, 2, 4, 4],
                                "num_res_blocks": 2,
                                "attn_resolutions": [],
                                "dropout": 0.0,
                            },
                            "lossconfig": {
                                "target": "torch.nn.Identity",
                            },
                        },
                    },
                    "cond_stage_config": {
                        "target": "lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder",
                        "params": {
                            "freeze": True,
                            "layer": "penultimate",
                        },
                    },
                },
            },
        }

    def infer_one_video(
        self,
        prompt: Optional[str] = None,
        size: Optional[list] = None,
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """
        Generates a single video based on the provided prompt and parameters.
        The output is of shape [frames, channels, height, width].

        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): ``[height, width]`` of the video. Defaults to [320, 512].
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.

        Returns:
            torch.Tensor: The generated video as a tensor, moved to the CPU.
        """
        # Resolve the default here instead of in the signature so that no
        # mutable list object is shared between calls.
        if size is None:
            size = [320, 512]

        output = self.pipeline.inference(
            prompt=prompt,
            height=size[0],
            width=size[1],
            seed=seed,
            num_frames=seconds * fps,
            fps=fps,
            randomize_seed=False,  # always honour the caller-supplied seed
        )

        # NOTE(review): squeeze() drops every size-1 dimension, so a request
        # with seconds * fps == 1 would presumably lose the frame axis as well
        # and break the permute below — confirm the pipeline's output layout
        # before narrowing this to a specific dimension.
        # [channels, frames, height, width] -> [frames, channels, height, width]
        output = output.squeeze().permute(1, 0, 2, 3)
        return output.cpu()