import torch


class CogVideoX:
    def __init__(self, weight: str = "THUDM/CogVideoX-2b", device: str = "cuda"):
        """
        Initializes the CogVideoX pipeline from pretrained weights on a specific device.

        Args:
            weight (str, optional): Hugging Face model ID or local path of the
                pretrained weights. Defaults to "THUDM/CogVideoX-2b".
            device (str, optional): The device to run the model on. Defaults to "cuda".
        """
        from diffusers import CogVideoXPipeline

        # Honor the requested device instead of hardcoding "cuda".
        self.pipe = CogVideoXPipeline.from_pretrained(weight).to(device)

    def infer_one_video(
        self,
        prompt: str = None,
        size: list = [320, 512],
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """
        Generates a single video based on the provided prompt and parameters.

        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): The size of the video as [height, width]. Currently
                ignored: the pipeline generates at its native resolution. Defaults to [320, 512].
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.

        Returns:
            torch.Tensor: The generated video as a (T, C, H, W) tensor.
        """
        video = self.pipe(
            prompt=prompt,
            guidance_scale=6,
            num_frames=seconds * fps,
            # height/width are intentionally left unset; CogVideoX renders at
            # its native resolution, so the size argument is not forwarded.
            # height=size[0],
            # width=size[1],
            num_inference_steps=50,
            generator=torch.manual_seed(seed),
        ).frames[0]

        from videogen_hub.utils import images_to_tensor

        video = video[:-1]  # drop the extra last frame returned by the pipeline
        video = images_to_tensor(video)  # convert the PIL frames back to a (T, C, H, W) tensor

        return video
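

# A minimal usage sketch, assuming a CUDA-capable GPU and that diffusers and
# videogen_hub are installed. The prompt and the shape comment below are
# illustrative assumptions, not outputs guaranteed by this module.
if __name__ == "__main__":
    model = CogVideoX(weight="THUDM/CogVideoX-2b", device="cuda")
    video = model.infer_one_video(
        prompt="A panda playing guitar in a bamboo forest",
        seconds=2,
        fps=8,
        seed=42,
    )
    # Expect a (T, C, H, W) tensor at the pipeline's native resolution.
    print(video.shape)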