Spaces:
Sleeping
Sleeping
import torch | |
import imageio | |
import os | |
import argparse | |
from diffusers.schedulers import EulerAncestralDiscreteScheduler | |
from transformers import T5EncoderModel, T5Tokenizer | |
from allegro.pipelines.pipeline_allegro import AllegroPipeline | |
from allegro.models.vae.vae_allegro import AllegroAutoencoderKL3D | |
from allegro.models.transformers.transformer_3d_allegro import AllegroTransformer3DModel | |
def single_inference(args):
    """Generate one video from a text prompt with the Allegro pipeline and save it.

    The user prompt is lower-cased, stripped and inserted into a fixed
    positive-prompt template; a fixed negative prompt is always supplied.
    The first video of the pipeline output is written to ``args.save_path``.

    Args:
        args: Parsed command-line namespace. This function reads ``vae``,
            ``text_encoder``, ``tokenizer`` and ``dit`` (each forwarded to the
            matching ``from_pretrained`` call), ``user_prompt``,
            ``enable_cpu_offload``, ``num_sampling_steps``, ``guidance_scale``,
            ``seed`` and ``save_path``.
    """
    dtype = torch.bfloat16
    use_cpu_offload = args.enable_cpu_offload

    # The VAE performs better in float32 (original author's note), so it is
    # deliberately not loaded in `dtype`.
    vae = AllegroAutoencoderKL3D.from_pretrained(args.vae, torch_dtype=torch.float32)
    text_encoder = T5EncoderModel.from_pretrained(
        args.text_encoder,
        torch_dtype=dtype,
    )
    tokenizer = T5Tokenizer.from_pretrained(
        args.tokenizer,
    )
    scheduler = EulerAncestralDiscreteScheduler()
    transformer = AllegroTransformer3DModel.from_pretrained(
        args.dit,
        torch_dtype=dtype,
    )

    # When sequential CPU offload is requested the weights must stay on the
    # CPU: moving everything to the GPU first makes peak GPU memory equal to
    # the full model footprint, which defeats the purpose of offloading.
    if not use_cpu_offload:
        vae = vae.cuda()
        transformer = transformer.cuda()

    vae.eval()
    text_encoder.eval()
    transformer.eval()

    allegro_pipeline = AllegroPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        scheduler=scheduler,
        transformer=transformer,
    )
    if not use_cpu_offload:
        allegro_pipeline = allegro_pipeline.to("cuda:0")

    positive_prompt = """
(masterpiece), (best quality), (ultra-detailed), (unwatermarked),
{}
emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
sharp focus, high budget, cinemascope, moody, epic, gorgeous
"""

    negative_prompt = """
nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
"""

    user_prompt = positive_prompt.format(args.user_prompt.lower().strip())

    if use_cpu_offload:
        allegro_pipeline.enable_sequential_cpu_offload()
        print("cpu offload enabled")

    out_video = allegro_pipeline(
        user_prompt,
        negative_prompt=negative_prompt,
        num_frames=88,
        height=720,
        width=1280,
        num_inference_steps=args.num_sampling_steps,
        guidance_scale=args.guidance_scale,
        max_sequence_length=512,
        # Seeded generator so the same seed is used for every run with these args.
        generator=torch.Generator(device="cuda:0").manual_seed(args.seed),
    ).video[0]

    # imageio quality ranges from 0 (lowest) to 10 (highest).
    imageio.mimwrite(args.save_path, out_video, fps=15, quality=8)
if __name__ == "__main__":
    # Command-line entry point: parse options, make sure the output directory
    # exists, then run a single text-to-video inference.
    parser = argparse.ArgumentParser()
    parser.add_argument("--user_prompt", type=str, default='',
                        help="text prompt inserted into the positive prompt template")
    parser.add_argument("--vae", type=str, default='',
                        help="path or identifier passed to AllegroAutoencoderKL3D.from_pretrained")
    parser.add_argument("--dit", type=str, default='',
                        help="path or identifier passed to AllegroTransformer3DModel.from_pretrained")
    parser.add_argument("--text_encoder", type=str, default='',
                        help="path or identifier passed to T5EncoderModel.from_pretrained")
    parser.add_argument("--tokenizer", type=str, default='',
                        help="path or identifier passed to T5Tokenizer.from_pretrained")
    parser.add_argument("--save_path", type=str, default="./output_videos/test_video.mp4",
                        help="output video file path")
    parser.add_argument("--guidance_scale", type=float, default=7.5,
                        help="guidance_scale value forwarded to the pipeline")
    parser.add_argument("--num_sampling_steps", type=int, default=100,
                        help="number of inference steps forwarded to the pipeline")
    parser.add_argument("--seed", type=int, default=42,
                        help="seed for the torch random generator")
    parser.add_argument("--enable_cpu_offload", action='store_true',
                        help="enable sequential CPU offload on the pipeline")
    args = parser.parse_args()

    # A bare filename has an empty dirname, which os.makedirs would reject.
    # exist_ok=True replaces the racy "check exists, then create" pattern.
    save_dir = os.path.dirname(args.save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    single_inference(args)