import os

from huggingface_hub import hf_hub_download

from videogen_hub import MODEL_PATH


class OpenSora:
    def __init__(self, device="gpu"):
        """
        1. Download the pretrained model weights into subdirectories of MODEL_PATH.
        2. Create the inference pipeline.
        Note: it seems that the model loaded from the checkpoint directory cannot run on CPU.

        Args:
            device: 'gpu' or 'cpu', the device on which to run the model.
        """
        from mmengine import Config as mmengine_config
        from videogen_hub.pipelines.opensora.scripts.inference import main

        self.pipeline = main
        self.config = {
            # Basic video frame settings
            "num_frames": 32,  # Total number of frames in a clip
            "frame_interval": 3,  # Interval between frames
            "fps": 24,  # Frames per second
            "image_size": [480, 854],  # Resolution of each frame (height, width)
            # Model configuration for multi-resolution and specific model parameters
            "multi_resolution": "STDiT2",  # Multi-resolution model type
            "model": {
                "type": "STDiT2-XL/2",  # Model type and size
                "from_pretrained": os.path.join(MODEL_PATH, "STDiT2-XL_2"),  # Path to pretrained checkpoint
                "file_name": "model.safetensors",  # Name of the model file
                "input_sq_size": 512,  # Input square size for the model
                "qk_norm": True,  # Whether to normalize query-key in attention
                "enable_flashattn": False,  # Enable flash attention; requires the flash_attn package
                "enable_layernorm_kernel": False,  # Enable fused layer-norm kernel; requires the apex package
            },
            # Variational Autoencoder (VAE) specific settings
            "vae": {
                "type": "VideoAutoencoderKL",  # Type of the autoencoder
                "from_pretrained": "stabilityai/sd-vae-ft-ema",  # Pretrained model from Hugging Face
                "cache_dir": os.path.join(MODEL_PATH, "sd-vae-ft-ema"),  # Local cache directory for model weights
                "micro_batch_size": 4,  # Batch size for processing
            },
            # Text encoder settings for embedding textual information
            "text_encoder": {
                "type": "t5",  # Text encoder model type
                "from_pretrained": "DeepFloyd/t5-v1_1-xxl",  # Pretrained model
                "cache_dir": os.path.join(MODEL_PATH, "t5-v1_1-xxl"),  # Cache directory
                "model_max_length": 200,  # Max length of text inputs
            },
            # Scheduler settings for diffusion models
            "scheduler": {
                "type": "iddpm",  # Type of scheduler for the diffusion process
                "num_sampling_steps": 50,  # Number of sampling steps in diffusion
                "cfg_scale": 7.0,  # Scale for classifier-free guidance
                "cfg_channel": 3,  # Number of channels for guidance
            },
            # Additional settings for processing and output
            "dtype": "bf16",  # Data type for computation (bfloat16)
            # "prompt_path": "./assets/texts/t2v_samples.txt",  # Path to text prompts
            "prompt_path": None,  # Path to a text-prompt file (unused when "prompt" is set)
            "prompt": [
                "A beautiful sunset over the city"
            ],  # List of prompts for generation
            "batch_size": 1,  # Batch size for generation
            "seed": 42,  # Seed for random number generators
            "save_dir": "./samples/samples/",  # Directory to save generated samples
            "config": "sample.py",  # Path to this configuration file
            "prompt_as_path": False,  # Treat the prompt as a file path (True/False)
            "reference_path": None,  # Path to reference image/video for conditioning
            "loop": 1,  # Number of times to loop the processing
            "sample_name": None,  # Specific name for the generated sample
            "num_sample": 1,  # Number of samples to generate
        }
        self.config = mmengine_config(self.config)

        # Download the checkpoints required by the pipeline.
        hf_hub_download(
            repo_id="hpcai-tech/OpenSora-STDiT-v2-stage2",
            filename="model.safetensors",
            local_dir=self.config.model.from_pretrained,
        )
        hf_hub_download(
            repo_id="stabilityai/sd-vae-ft-ema",
            filename="diffusion_pytorch_model.safetensors",
            local_dir=self.config.vae.cache_dir,
        )
        hf_hub_download(
            repo_id="DeepFloyd/t5-v1_1-xxl",
            filename="pytorch_model-00001-of-00002.bin",
            local_dir=self.config.text_encoder.cache_dir,
        )

    def infer_one_video(
        self,
        prompt: str = None,
        size: list = [320, 512],
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """
        Generates a single video based on the provided prompt and parameters.
        The output resolution is determined by `size` (height, width).

        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): The [height, width] of the generated video. Defaults to [320, 512].
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.

        Returns:
            torch.Tensor: The generated video as a tensor of shape [f, H, W, C].
        """
        self.config.num_frames = fps * seconds
        self.config.fps = fps
        self.config.seed = seed
        self.config.prompt = [prompt]
        self.config.image_size = size

        all_batch_samples = self.pipeline(self.config)
        # sample has shape [1, C, f, H, W]
        sample = all_batch_samples[0][0]
        # [1, C, f, H, W] -> [f, H, W, C]; BFloat16 -> Float
        output = sample.squeeze(0).permute(1, 2, 3, 0).cpu().float()
        return output
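

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Assumes a CUDA-capable environment with the OpenSora dependencies installed.
# The torchvision write_video call and the [0, 1] output value range are
# assumptions made for this example, not guarantees of the API above.
if __name__ == "__main__":
    import torch
    from torchvision.io import write_video

    model = OpenSora(device="gpu")
    video = model.infer_one_video(
        prompt="A beautiful sunset over the city",
        size=[320, 512],
        seconds=2,
        fps=8,
        seed=42,
    )  # torch.Tensor of shape [f, H, W, C]

    # Convert to uint8 frames before writing to disk (value range assumed to be [0, 1]).
    frames = (video.clamp(0, 1) * 255).to(torch.uint8)
    write_video("sample.mp4", frames, fps=8)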