Spaces:

yhzhai
/

mcm

Running on Zero

App Files Files Community

mcm / app.py

yhzhai

format

e61b61b 5 months ago

raw

history blame contribute delete

17.3 kB

	import spaces
	import os
	import random
	from datetime import datetime
	from typing import Optional

	import gradio as gr
	import numpy as np
	import torch
	from diffusers import (
	AnimateDiffPipeline,
	DiffusionPipeline,
	LCMScheduler,
	MotionAdapter,
	)
	from diffusers.utils import export_to_video
	from peft import PeftModel

	device = "cuda"
	mcm_id = "yhzhai/mcm"
	basedir = os.getcwd()
	savedir = os.path.join(
	basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S")
	)

	MAX_SEED = np.iinfo(np.int32).max


	def get_modelscope_pipeline(
	mcm_variant: Optional[str] = "WebVid",
	):
	model_id = "ali-vilab/text-to-video-ms-1.7b"
	# if torch.cuda.is_available():
	# pipe = DiffusionPipeline.from_pretrained(
	# model_id, torch_dtype=torch.float16, variant="fp16"
	# )
	# else:
	pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16")
	scheduler = LCMScheduler.from_pretrained(
	model_id,
	subfolder="scheduler",
	timestep_scaling=4.0,
	)
	pipe.scheduler = scheduler
	pipe.enable_vae_slicing()

	if mcm_variant == "WebVid":
	subfolder = "modelscopet2v-webvid"
	elif mcm_variant == "LAION-aes":
	subfolder = "modelscopet2v-laion"
	elif mcm_variant == "Anime":
	subfolder = "modelscopet2v-anime"
	elif mcm_variant == "Realistic":
	subfolder = "modelscopet2v-real"
	elif mcm_variant == "3D Cartoon":
	subfolder = "modelscopet2v-3d-cartoon"
	else:
	subfolder = "modelscopet2v-laion"

	lora = PeftModel.from_pretrained(
	pipe.unet,
	model_id=mcm_id,
	subfolder=subfolder,
	adapter_name="lora",
	torch_device="cpu",
	)
	lora.merge_and_unload()
	pipe.unet = lora

	pipe = pipe.to(device)

	return pipe


	def get_animatediff_pipeline(
	real_variant: Optional[str] = "realvision",
	motion_module_path: str = "guoyww/animatediff-motion-adapter-v1-5-2",
	mcm_variant: Optional[str] = "WebVid",
	):
	if real_variant is None:
	model_id = "runwayml/stable-diffusion-v1-5"
	elif real_variant == "epicrealism":
	model_id = "emilianJR/epiCRealism"
	elif real_variant == "realvision":
	model_id = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
	else:
	raise ValueError(f"Unknown real_variant {real_variant}")

	# if torch.cuda.is_available():
	# adapter = MotionAdapter.from_pretrained(
	# motion_module_path, torch_dtype=torch.float16
	# )
	# pipe = AnimateDiffPipeline.from_pretrained(
	# model_id,
	# motion_adapter=adapter,
	# torch_dtype=torch.float16,
	# )
	# else:
	adapter = MotionAdapter.from_pretrained(motion_module_path)
	pipe = AnimateDiffPipeline.from_pretrained(
	model_id,
	motion_adapter=adapter, torch_dtype=torch.float16
	)
	scheduler = LCMScheduler.from_pretrained(
	model_id,
	subfolder="scheduler",
	timestep_scaling=4.0,
	clip_sample=False,
	timestep_spacing="linspace",
	beta_schedule="linear",
	beta_start=0.00085,
	beta_end=0.012,
	steps_offset=1,
	)
	pipe.scheduler = scheduler
	pipe.enable_vae_slicing()

	if mcm_variant == "WebVid":
	subfolder = "animatediff-webvid"
	elif mcm_variant == "LAION-aes":
	subfolder = "animatediff-laion"
	else:
	subfolder = "animatediff-laion"

	lora = PeftModel.from_pretrained(
	pipe.unet,
	model_id=mcm_id,
	subfolder=subfolder,
	adapter_name="lora",
	torch_device="cpu",
	)
	lora.merge_and_unload()
	pipe.unet = lora

	pipe = pipe.to(device)
	return pipe


	pipe_dict = {
	"ModelScope T2V": {
	"WebVid": None,
	"LAION-aes": None,
	"Anime": None,
	"Realistic": None,
	"3D Cartoon": None,
	},
	"AnimateDiff (SD1.5)": {"WebVid": None, "LAION-aes": None},
	"AnimateDiff (RealisticVision)": {"WebVid": None, "LAION-aes": None},
	"AnimateDiff (epiCRealism)": {"WebVid": None, "LAION-aes": None},
	}
	cache_pipeline = {
	"base_model": None,
	"variant": None,
	"pipeline": None,
	}

	# def init_pipelines():
	# for base_model in variants.keys():
	# for variant in variants[base_model]:
	# if pipe_dict[base_model][variant] is None:
	# if base_model == "ModelScope T2V":
	# pipe_dict[base_model][variant] = get_modelscope_pipeline(mcm_variant=variant)
	# elif base_model == "AnimateDiff (SD1.5)":
	# pipe_dict[base_model][variant] = get_animatediff_pipeline(
	# real_variant=None,
	# motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	# mcm_variant=variant,
	# )
	# elif base_model == "AnimateDiff (RealisticVision)":
	# pipe_dict[base_model][variant] = get_animatediff_pipeline(
	# real_variant="realvision",
	# motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	# mcm_variant=variant,
	# )
	# elif base_model == "AnimateDiff (epiCRealism)":
	# pipe_dict[base_model][variant] = get_animatediff_pipeline(
	# real_variant="epicrealism",
	# motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	# mcm_variant=variant,
	# )
	# else:
	# raise ValueError(f"Unknown base_model {base_model}")


	@spaces.GPU(duration=60)
	def infer(
	base_model,
	variant,
	prompt,
	num_inference_steps=4,
	height=256,
	width=256,
	seed=0,
	randomize_seed=True,
	progress = gr.Progress(track_tqdm=True),
	):
	# if pipe_dict[base_model][variant] is None:
	# if base_model == "ModelScope T2V":
	# pipe_dict[base_model][variant] = get_modelscope_pipeline(mcm_variant=variant)
	# elif base_model == "AnimateDiff (SD1.5)":
	# pipe_dict[base_model][variant] = get_animatediff_pipeline(
	# real_variant=None,
	# motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	# mcm_variant=variant,
	# )
	# elif base_model == "AnimateDiff (RealisticVision)":
	# pipe_dict[base_model][variant] = get_animatediff_pipeline(
	# real_variant="realvision",
	# motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	# mcm_variant=variant,
	# )
	# elif base_model == "AnimateDiff (epiCRealism)":
	# pipe_dict[base_model][variant] = get_animatediff_pipeline(
	# real_variant="epicrealism",
	# motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	# mcm_variant=variant,
	# )
	# else:
	# raise ValueError(f"Unknown base_model {base_model}")
	if (
	cache_pipeline["base_model"] == base_model
	and cache_pipeline["variant"] == variant
	):
	pass
	else:
	if base_model == "ModelScope T2V":
	pipeline = get_modelscope_pipeline(mcm_variant=variant)
	elif base_model == "AnimateDiff (SD1.5)":
	pipeline = get_animatediff_pipeline(
	real_variant=None,
	motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	mcm_variant=variant,
	)
	elif base_model == "AnimateDiff (RealisticVision)":
	pipeline = get_animatediff_pipeline(
	real_variant="realvision",
	motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	mcm_variant=variant,
	)
	elif base_model == "AnimateDiff (epiCRealism)":
	pipeline = get_animatediff_pipeline(
	real_variant="epicrealism",
	motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
	mcm_variant=variant,
	)
	else:
	raise ValueError(f"Unknown base_model {base_model}")

	cache_pipeline["base_model"] = base_model
	cache_pipeline["variant"] = variant
	cache_pipeline["pipeline"] = pipeline

	# pipe_dict[base_model][variant] = pipe_dict[base_model][variant].to(device)
	if randomize_seed:
	seed = random.randint(0, MAX_SEED)

	generator = torch.Generator("cpu").manual_seed(seed)

	output = cache_pipeline["pipeline"](
	prompt=prompt,
	num_frames=16,
	guidance_scale=1.0,
	num_inference_steps=num_inference_steps,
	height=height,
	width=width,
	generator=generator,
	).frames
	if not isinstance(output, list):
	output = [output[i] for i in range(output.shape[0])]

	os.makedirs(savedir, exist_ok=True)
	save_path = os.path.join(
	savedir, f"sample_{base_model}_{variant}_{seed}.mp4".replace(" ", "_")
	)
	export_to_video(
	output[0],
	save_path,
	fps=7,
	)
	print(f"Saved to {save_path}")
	# pipe_dict[base_model][variant] = pipe_dict[base_model][variant].to("cpu")
	return save_path, seed


	examples = [
	[
	"ModelScope T2V",
	"LAION-aes",
	"Aerial uhd 4k view. mid-air flight over fresh and clean mountain river at sunny summer morning. Green trees and sun rays on horizon. Direct on sun.",
	4,
	256,
	256,
	],
	[
	"ModelScope T2V",
	"Anime",
	"Timelapse misty mountain landscape",
	4,
	256,
	256,
	],
	[
	"ModelScope T2V",
	"WebVid",
	"Back of woman in shorts going near pure creek in beautiful mountains.",
	4,
	256,
	256,
	],
	[
	"ModelScope T2V",
	"3D Cartoon",
	"A rotating pandoro (a traditional italian sweet yeast bread, most popular around christmas and new year) being eaten in time-lapse.",
	4,
	256,
	256,
	],
	[
	"ModelScope T2V",
	"Realistic",
	"Slow motion avocado with a stone falls and breaks into 2 parts with splashes",
	4,
	256,
	256,
	],
	[
	"AnimateDiff (epiCRealism)",
	"LAION-aes",
	"Slow motion of delicious salmon sachimi set with green vegetables leaves served on wood plate. make homemade japanese food at home.-dan",
	8,
	512,
	512,
	],
	[
	"AnimateDiff (epiCRealism)",
	"WebVid",
	"Blooming meadow panorama zoom-out shot heavenly clouds and upcoming thunderstorm in mountain range harz, germany.",
	8,
	512,
	512,
	],
	[
	"AnimateDiff (epiCRealism)",
	"LAION-aes",
	"A young woman in a yellow sweater uses vr glasses, sitting on the shore of a pond on a background of dark waves. a strong wind develops her hair, the sun's rays are reflected from the water.",
	8,
	512,
	512,
	],
	[
	"AnimateDiff (epiCRealism)",
	"LAION-aes",
	"Female running at sunset. healthy fitness concept",
	8,
	512,
	512,
	],
	]

	css = """
	#col-container {
	margin: 0 auto;
	}
	"""

	variants = {
	"ModelScope T2V": ["WebVid", "LAION-aes", "Anime", "Realistic", "3D Cartoon"],
	"AnimateDiff (SD1.5)": ["WebVid", "LAION-aes"],
	"AnimateDiff (RealisticVision)": ["WebVid", "LAION-aes"],
	"AnimateDiff (epiCRealism)": ["WebVid", "LAION-aes"],
	}


	def update_variant(rs):
	return gr.update(choices=variants[rs], value=None)


	# init_pipelines()

	with gr.Blocks(css=css) as demo:

	with gr.Column(elem_id="col-container"):
	gr.HTML(
	"""
	<div style="text-align: center; margin-bottom: 20px;">
	<h1 align="center">
	<a href="https://yhzhai.github.io/mcm/"><b>Motion Consistency Model: Accelerating Video Diffusion with Disentangled Motion-Appearance Distillation</b></a>
	</h1>
	<h4>Our motion consistency model not only accelerates text2video diffusion model sampling process, but also can benefit from an additional high-quality image dataset to improve the frame quality of generated videos.</h4>
	<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
	<a href='https://yhzhai.github.io/mcm/'><img src='https://img.shields.io/badge/Project-Page-Green'></a>
	<a href='https://arxiv.org/abs/2406.06890'><img src='https://img.shields.io/badge/Paper-arXiv-red'></a>
	<a href='https://huggingface.co/yhzhai/mcm'><img src='https://img.shields.io/badge/HF-checkpoint-yellow'></a>
	</div>
	</div>
	"""
	)

	gr.Markdown(
	f"""
	<p align="center">Currently running on {device}.</p>
	<p align="center">Model loading takes extra time.</p>
	"""
	)

	# <p align="center">ModelScope T2V works the best for resolution 256x256, and AnimateDiff works the best for 512x512.</p>
	with gr.Row():
	base_model = gr.Dropdown(
	label="Base model",
	choices=[
	"ModelScope T2V",
	"AnimateDiff (SD1.5)",
	"AnimateDiff (RealisticVision)",
	"AnimateDiff (epiCRealism)",
	],
	value="ModelScope T2V",
	interactive=True,
	)
	variant_dropdown = gr.Dropdown(
	variants["ModelScope T2V"],
	label="MCM Variant",
	interactive=True,
	value=None,
	)
	base_model.change(
	update_variant, inputs=[base_model], outputs=[variant_dropdown]
	)

	with gr.Row():
	prompt = gr.Text(
	label="Prompt",
	show_label=False,
	max_lines=1,
	placeholder="Enter your prompt",
	container=False,
	)

	run_button = gr.Button("Run", scale=0)

	with gr.Row():
	with gr.Column():
	with gr.Accordion("Advanced Settings", open=True):

	seed = gr.Slider(
	label="Seed",
	minimum=0,
	maximum=MAX_SEED,
	step=1,
	value=0,
	)

	randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

	with gr.Row():
	num_inference_steps = gr.Slider(
	label="Number of inference steps",
	minimum=1,
	maximum=12,
	step=1,
	value=4,
	)

	with gr.Group():
	with gr.Row():
	text_hint = gr.Textbox(
	"Hint: ModelScope T2V works the best for resolution 256x256, and AnimateDiff works the best for resolution 512x512.",
	interactive=False,
	label="Hint",
	container=False,

	)
	with gr.Row():
	height = gr.Slider(
	label="Height",
	minimum=256,
	maximum=1024,
	step=64,
	value=512,
	interactive=True,
	)
	width = gr.Slider(
	label="Width",
	minimum=256,
	maximum=1024,
	step=64,
	value=512,
	interactive=True,
	)



	with gr.Column(show_progress=True):
	# result = gr.Video(label="Result", show_label=False, interactive=False, height=512, width=512, autoplay=True)
	result = gr.Video(
	label="Result",
	show_label=False,
	interactive=False,
	autoplay=True,
	# height=512,
	# width=512,
	)

	gr.Examples(
	examples=examples,
	inputs=[base_model, variant_dropdown, prompt, num_inference_steps, height, width],
	fn=infer,
	outputs=[result, seed],
	)

	run_button.click(
	fn=infer,
	inputs=[
	base_model,
	variant_dropdown,
	prompt,
	num_inference_steps,
	height,
	width,
	seed,
	randomize_seed,
	],
	outputs=[result, seed],
	)

	demo.queue().launch()