|
import os |
|
import time |
|
from pathlib import Path |
|
from loguru import logger |
|
from datetime import datetime |
|
import gradio as gr |
|
import random |
|
import spaces |
|
import torch |
|
|
|
from hyvideo.utils.file_utils import save_videos_grid |
|
from hyvideo.utils.preprocess_text_encoder_tokenizer_utils import preprocess_text_encoder_tokenizer |
|
from hyvideo.config import parse_args |
|
from hyvideo.inference import HunyuanVideoSampler |
|
from hyvideo.constants import NEGATIVE_PROMPT |
|
|
|
from huggingface_hub import snapshot_download |
|
|
|
if torch.cuda.device_count() > 0: |
|
snapshot_download(repo_id="tencent/HunyuanVideo", repo_type="model", local_dir="ckpts", force_download=True) |
|
snapshot_download(repo_id="xtuner/llava-llama-3-8b-v1_1-transformers", repo_type="model", local_dir="ckpts/llava-llama-3-8b-v1_1-transformers", force_download=True) |
|
|
|
class Args: |
|
def __init__(self, input_dir, output_dir): |
|
self.input_dir = input_dir |
|
self.output_dir = output_dir |
|
|
|
|
|
args = Args("ckpts/llava-llama-3-8b-v1_1-transformers", "ckpts/text_encoder") |
|
preprocess_text_encoder_tokenizer(args) |
|
snapshot_download(repo_id="openai/clip-vit-large-patch14", repo_type="model", local_dir="ckpts/text_encoder_2", force_download=True) |
|
|
|
def initialize_model(model_path): |
|
print("initialize_model: " + model_path) |
|
if torch.cuda.device_count() == 0: |
|
return None |
|
|
|
args = parse_args() |
|
models_root_path = Path(model_path) |
|
if not models_root_path.exists(): |
|
raise ValueError(f"`models_root` not exists: {models_root_path}") |
|
|
|
print(f"`models_root` exists: {models_root_path}") |
|
hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args) |
|
print("Model initialized: " + model_path) |
|
return hunyuan_video_sampler |
|
|
|
model = initialize_model("ckpts") |
|
|
|
def generate_video( |
|
prompt, |
|
resolution, |
|
video_length, |
|
seed, |
|
num_inference_steps, |
|
guidance_scale, |
|
flow_shift, |
|
embedded_guidance_scale |
|
): |
|
print("generate_video (prompt: " + prompt + ")") |
|
return generate_video_gpu( |
|
model, |
|
prompt, |
|
resolution, |
|
video_length, |
|
seed, |
|
num_inference_steps, |
|
guidance_scale, |
|
flow_shift, |
|
embedded_guidance_scale |
|
) |
|
|
|
@spaces.GPU(duration=120) |
|
def generate_video_gpu( |
|
model, |
|
prompt, |
|
resolution, |
|
video_length, |
|
seed, |
|
num_inference_steps, |
|
guidance_scale, |
|
flow_shift, |
|
embedded_guidance_scale |
|
): |
|
print("generate_video_gpu (prompt: " + prompt + ")") |
|
if torch.cuda.device_count() == 0: |
|
gr.Warning("Set this space to GPU config to make it work.") |
|
return None |
|
|
|
seed = None if seed == -1 else seed |
|
width, height = resolution.split("x") |
|
width, height = int(width), int(height) |
|
negative_prompt = "" |
|
print("Predicting video...") |
|
|
|
outputs = model.predict( |
|
prompt=prompt, |
|
height=height, |
|
width=width, |
|
video_length=video_length, |
|
seed=seed, |
|
negative_prompt=negative_prompt, |
|
infer_steps=num_inference_steps, |
|
guidance_scale=guidance_scale, |
|
num_videos_per_prompt=1, |
|
flow_shift=flow_shift, |
|
batch_size=1, |
|
embedded_guidance_scale=embedded_guidance_scale |
|
) |
|
|
|
print("Video predicted") |
|
samples = outputs["samples"] |
|
sample = samples[0].unsqueeze(0) |
|
|
|
save_path = "./gradio_outputs" |
|
os.makedirs(save_path, exist_ok=True) |
|
|
|
time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S") |
|
video_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][0]}_{outputs['prompts'][0][:100].replace('/','')}.mp4" |
|
save_videos_grid(sample, video_path, fps=24) |
|
logger.info(f"Sample saved to: {video_path}") |
|
|
|
print("Return the video") |
|
return video_path |
|
|
|
def create_demo(model_path): |
|
with gr.Blocks() as demo: |
|
if torch.cuda.device_count() == 0: |
|
with gr.Row(): |
|
gr.HTML(""" |
|
<p style="background-color: red;"><big><big><big><b>⚠️To use <i>Hunyuan Video</i>, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/HunyuanVideo?duplicate=true">duplicate this space</a> and set a GPU with 80 GB VRAM.</b> |
|
|
|
You can't use <i>Hunyuan Video</i> directly here because this space runs on a CPU, which is not enough for <i>Hunyuan Video</i>. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/HunyuanVideo/discussions/new">feedback</a> if you have issues. |
|
</big></big></big></p> |
|
""") |
|
gr.Markdown("# Hunyuan Video Generation") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
prompt = gr.Textbox(label="Prompt", value="A cat walks on the grass, realistic style.") |
|
with gr.Row(): |
|
resolution = gr.Dropdown( |
|
choices=[ |
|
|
|
("1280x720 (16:9, 720p)", "1280x720"), |
|
("720x1280 (9:16, 720p)", "720x1280"), |
|
("1104x832 (4:3, 720p)", "1104x832"), |
|
("832x1104 (3:4, 720p)", "832x1104"), |
|
("960x960 (1:1, 720p)", "960x960"), |
|
|
|
("960x544 (16:9, 540p)", "960x544"), |
|
("544x960 (9:16, 540p)", "544x960"), |
|
("832x624 (4:3, 540p)", "832x624"), |
|
("624x832 (3:4, 540p)", "624x832"), |
|
("720x720 (1:1, 540p)", "720x720"), |
|
], |
|
value="832x624", |
|
label="Resolution" |
|
) |
|
video_length = gr.Dropdown( |
|
label="Video Length", |
|
choices=[ |
|
("2s(65f)", 65), |
|
("5s(129f)", 129), |
|
], |
|
value=65, |
|
) |
|
num_inference_steps = gr.Slider(1, 100, value=5, step=1, label="Number of Inference Steps") |
|
|
|
with gr.Accordion("Advanced Options", open=False): |
|
with gr.Column(): |
|
seed = gr.Slider(label="Seed (-1 for random)", value=-1, minimum=-1, maximum=2**63 - 1, step=1) |
|
guidance_scale = gr.Slider(1.0, 20.0, value=1.0, step=0.5, label="Guidance Scale") |
|
flow_shift = gr.Slider(0.0, 10.0, value=7.0, step=0.1, label="Flow Shift") |
|
embedded_guidance_scale = gr.Slider(1.0, 20.0, value=6.0, step=0.5, label="Embedded Guidance Scale") |
|
|
|
generate_btn = gr.Button(value = "🚀 Generate Video", variant = "primary") |
|
|
|
with gr.Row(): |
|
output = gr.Video(label = "Generated Video", autoplay = True) |
|
|
|
gr.Markdown(""" |
|
## **Alternatives** |
|
If you can't use _Hunyuan Video_, you can use _[CogVideoX](https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space)_ or _[LTX Video Playground](https://huggingface.co/spaces/Lightricks/LTX-Video-Playground)_ instead. |
|
""") |
|
|
|
generate_btn.click( |
|
fn=generate_video, |
|
inputs=[ |
|
prompt, |
|
resolution, |
|
video_length, |
|
seed, |
|
num_inference_steps, |
|
guidance_scale, |
|
flow_shift, |
|
embedded_guidance_scale |
|
], |
|
outputs=output |
|
) |
|
|
|
return demo |
|
|
|
if __name__ == "__main__": |
|
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False" |
|
demo = create_demo("ckpts") |
|
demo.queue(10).launch() |