Spaces:
Runtime error
Runtime error
import os | |
from PIL import Image | |
from huggingface_hub import snapshot_download | |
from videogen_hub import MODEL_PATH | |
class ConsistI2V:
    """Thin wrapper that prepares and invokes the ConsistI2V animate entry point.

    The constructor downloads the ``TIGER-Lab/ConsistI2V`` snapshot, builds the
    inference configuration, and stores the ``main`` function of the bundled
    animate script. ``infer_one_video`` then fills in the per-call arguments
    and calls that function.
    """

    def __init__(self, device="cuda"):
        """Download the model snapshot and build the inference configuration.

        Args:
            device: Kept for interface compatibility. This constructor does
                not read it.
        """

        class Args:
            """Attribute container passed to the pipeline as its first argument."""

            def __init__(self):
                self.inference_config = "configs/inference/inference.yaml"
                self.prompt = None
                self.n_prompt = ""
                self.seed = "random"
                self.path_to_first_frame = None
                self.prompt_config = "configs/prompts/default.yaml"
                self.format = "mp4"
                self.save_model = False
                self.optional_args = []

        self.args = Args()

        # One directory is used both as the download target and as
        # `pretrained_model_path`, so the config always points at the files
        # fetched below. Previously the two paths differed.
        model_path = os.path.join(MODEL_PATH, "ConsistI2V")

        # Imported lazily so that importing this module does not itself
        # require omegaconf.
        from omegaconf import OmegaConf

        # Building the config from a dict (rather than an f-string of YAML)
        # means the path needs no backslash escaping on Windows.
        self.config = OmegaConf.create(self._build_config(model_path))

        snapshot_download("TIGER-Lab/ConsistI2V", local_dir=model_path)

        # Imported lazily, after the download, as in the original code.
        from videogen_hub.pipelines.consisti2v.scripts.animate import main

        self.pipeline = main

    @staticmethod
    def _build_config(model_path):
        """Return a fresh dict holding the default inference configuration.

        Args:
            model_path: Value stored under ``pretrained_model_path``.

        Returns:
            dict: A newly built nested configuration; callers may mutate it.
        """
        # NOTE(review): the section nesting was reconstructed from key order;
        # confirm against the upstream ConsistI2V inference config.
        return {
            "output_dir": "samples/inference",
            "output_name": "i2v",
            "pretrained_model_path": model_path,
            "unet_path": None,
            "unet_ckpt_prefix": "module.",
            "pipeline_pretrained_path": None,
            "sampling_kwargs": {
                "height": 256,
                "width": 256,
                "n_frames": 16,
                "steps": 50,
                "ddim_eta": 0.0,
                "guidance_scale_txt": 7.5,
                "guidance_scale_img": 1.0,
                "guidance_rescale": 0.0,
                "num_videos_per_prompt": 1,
                "frame_stride": 3,
            },
            "unet_additional_kwargs": {
                "variant": None,
                "n_temp_heads": 8,
                "augment_temporal_attention": True,
                "temp_pos_embedding": "rotary",  # "rotary" or "sinusoidal"
                "first_frame_condition_mode": "concat",
                "use_frame_stride_condition": True,
                # "vanilla" or "pyoco_mixed" or "pyoco_progressive"
                "noise_sampling_method": "pyoco_mixed",
                "noise_alpha": 1.0,
            },
            "noise_scheduler_kwargs": {
                "beta_start": 0.00085,
                "beta_end": 0.012,
                "beta_schedule": "linear",
                "steps_offset": 1,
                "clip_sample": False,
                "rescale_betas_zero_snr": False,  # true if using zero terminal snr
                "timestep_spacing": "leading",  # "trailing" if using zero terminal snr
                "prediction_type": "epsilon",  # "v_prediction" if using zero terminal snr
            },
            "frameinit_kwargs": {
                "enable": True,
                "camera_motion": None,
                "noise_level": 850,
                "filter_params": {
                    "method": "gaussian",
                    "d_s": 0.25,
                    "d_t": 0.25,
                },
            },
        }

    def infer_one_video(
        self,
        input_image: "Image.Image",
        prompt: str = None,
        size: list = None,
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """Generate one video from a first-frame image and an optional prompt.

        The per-call values are written onto ``self.args`` and
        ``self.config`` (so they persist on the instance after the call) and
        the stored pipeline function is invoked with both.

        Args:
            input_image (PIL.Image.Image): First frame; stored on the args
                object as ``path_to_first_frame``.
            prompt (str, optional): Text prompt guiding generation. Defaults
                to None.
            size (list, optional): Output resolution as [height, width].
                Defaults to [320, 512].
            seconds (int, optional): Duration of the video in seconds.
                Defaults to 2.
            fps (int, optional): Frames per second; together with ``seconds``
                it determines the number of generated frames. Defaults to 8.
            seed (int, optional): Random seed; passed to the pipeline as a
                string. Defaults to 42.

        Returns:
            Whatever the pipeline entry point returns. The original
            documentation describes it as a torch.Tensor shaped
            (time, channel, height, width) -- TODO confirm against the
            animate script.
        """
        # Resolve the default here rather than in the signature so no mutable
        # list is shared between calls.
        if size is None:
            size = [320, 512]

        self.args.prompt = prompt
        self.args.path_to_first_frame = input_image
        self.args.seed = str(seed)

        sampling = self.config.sampling_kwargs
        sampling.height = size[0]
        sampling.width = size[1]
        # A fractional duration must still yield an integral frame count.
        sampling.n_frames = int(round(seconds * fps))

        return self.pipeline(self.args, self.config)