import os

from huggingface_hub import snapshot_download, hf_hub_download

from videogen_hub import MODEL_PATH


class OpenSora12:
    def __init__(self, device="gpu"):
        """
        1. Download the pretrained model and put it inside MODEL_PATH/modelscope
        2. Create Pipeline
        Note: it seems that the model needed from model_dir cannot support cpu
        Args:
            device: 'gpu' or 'cpu' the device to use the model
        """

        from mmengine import Config as mmengine_config
        from videogen_hub.pipelines.opensora.scripts.inference import main
        model_path = snapshot_download("hpcai-tech/OpenSora-STDiT-v3",
                                       local_dir=os.path.join(MODEL_PATH, 'OpenSora-STDiT-v3'))
        self.pipeline = main
        self.config = {
            # Basic video frame settings
            "num_frames": 51,  # Total number of frames in a clip
            "frame_interval": 1,  # Interval between frames
            "fps": 24,  # Frames per second
            "image_size": [480, 854],  # Resolution of each frame (height, width)
            # Model configuration for multi-resolution and specific model parameters
            "multi_resolution": "STDiT2",  # Multi-resolution model type
            "model": {
                "type": "STDiT3-XL/2",  # Model type and size
                "from_pretrained": os.path.join(MODEL_PATH, "STDiT3-XL_2"),  # Path to pretrained checkpoint
                "file_name": "model.safetensors",  # Name of the model file
                "input_sq_size": 512,  # Input square size for the model
                "qk_norm": True,  # Whether to normalize query-key in attention
                "enable_flashattn": False,  # Enable flash attention mechanism, require flash_attn package
                "enable_layernorm_kernel": False,  # Enable layer normalization in kernel, requires apex package
            },
            # Variational Autoencoder (VAE) specific settings
            "vae": {
                "type": "OpenSoraVAE_V1_2",  # Type of the autoencoder
                "from_pretrained": "hpcai-tech/OpenSora-VAE-v1.2",  # Pretrained model from Hugging Face
                #"cache_dir": os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),  # Local cache directory for model weights
                "micro_frame_size": 17,
                "micro_batch_size": 4,  # Batch size for processing
            },
            # Text encoder settings for embedding textual information
            "text_encoder": {
                "type": "t5",  # Text encoder model type
                "from_pretrained": "DeepFloyd/t5-v1_1-xxl",  # Pretrained model
                "cache_dir": os.path.join(MODEL_PATH, "t5-v1_1-xxl"),  # Cache directory
                "model_max_length": 300,  # Max length of text inputs
            },
            # Scheduler settings for diffusion models
            "scheduler": {
                "type": "rflow",  # Type of scheduler for the diffusion process
                "num_sampling_steps": 30,  # Number of sampling steps in diffusion
                "cfg_scale": 7.0,  # Scale for classifier-free guidance
                # "cfg_channel": 3,  # Number of channels for guidance
            },
            # Additional settings for processing and output
            "dtype": "bf16",  # Data type for computation (bfloat16)
            # "prompt_path": "./assets/texts/t2v_samples.txt",  # Path to text prompts
            "prompt_path": None,  # Path to text prompts
            "prompt": [
                "A beautiful sunset over the city"
            ],  # List of prompts for generation
            "batch_size": 1,  # Batch size for generation
            "seed": 42,  # Seed for random number generators
            "save_dir": "./samples/samples/",  # Directory to save generated samples
            "config": "sample.py",  # Path to this configuration file
            "prompt_as_path": False,  # Treat the prompt as a file path (True/False)
            "reference_path": None,  # Path to reference image/video for conditioning
            "loop": 1,  # Number of times to loop the processing
            "sample_name": None,  # Specific name for the generated sample
            "num_sample": 1,  # Number of samples to generate
            "aes": 6.5,
            "flow": None,
        }
        self.config = mmengine_config(self.config)

        hf_hub_download(
            repo_id="hpcai-tech/OpenSora-STDiT-v3",
            filename="model.safetensors",
            local_dir=self.config.model.from_pretrained,
        )

        hf_hub_download(
            repo_id="hpcai-tech/OpenSora-VAE-v1.2",
            filename="model.safetensors",
            local_dir=os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),
        )

        hf_hub_download(
            repo_id="DeepFloyd/t5-v1_1-xxl",
            filename="pytorch_model-00001-of-00002.bin",
            local_dir=self.config.text_encoder.cache_dir,
        )

    def infer_one_video(
            self,
            prompt: str = None,
            size: list = [320, 512],
            seconds: int = 2,
            fps: int = 8,
            seed: int = 42,
    ):
        """
        Generates a single video based on the provided prompt and parameters.
        The generated video always has resolution 256x256

        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): The resolution of the video. Defaults to [320, 512].
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.

        Returns:
            torch.Tensor: The generated video as a tensor.
        """

        self.config.num_frames = fps * seconds
        self.config.fps = fps
        self.config.seed = seed
        self.config.prompt = [prompt]
        self.config.image_size = size

        all_batch_samples = self.pipeline(self.config)

        sample = all_batch_samples[0][0]
        # sample is torch.Size([1, C, f, H, W])

        output = sample.squeeze(0).permute(1, 2, 3, 0).cpu().float()
        # torch.Size([1, C, f, H, W]) -> torch.Size([f, H, W, C])
        # BFloat16 -> Float

        return output
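

# --- Usage sketch (illustrative, not part of the wrapper above) ---
# A minimal example of how this class might be driven end-to-end, assuming
# videogen_hub and the OpenSora dependencies are installed, a CUDA GPU is
# available, and the checkpoint downloads in __init__ succeed. The value range
# of the returned tensor depends on the underlying pipeline, so any rescaling
# before writing frames to disk is left to the caller.
if __name__ == "__main__":
    model = OpenSora12(device="gpu")
    video = model.infer_one_video(
        prompt="A beautiful sunset over the city",
        size=[320, 512],
        seconds=2,
        fps=8,
        seed=42,
    )
    # video is a (frames, height, width, channels) float tensor on the CPU.
    print(video.shape, video.dtype)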