Upload 14 files
- LICENSE +21 -0
- README.md +20 -12
- bucketing.py +32 -0
- cog.yaml +18 -0
- dataset.py +581 -0
- download-weights +48 -0
- inference.py +238 -0
- lama.py +350 -0
- lora.py +1312 -0
- predict.py +101 -0
- samples.py +57 -0
- train.py +998 -0
- unet_3d_blocks.py +836 -0
- unet_3d_condition.py +499 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 ExponentialML

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,20 @@
# cog-text2video

A Cog implementation with txt2vid and vid2vid of:

- https://huggingface.co/cerspense/zeroscope_v2_XL
- https://huggingface.co/cerspense/zeroscope_v2_576w
- https://huggingface.co/camenduru/potat1

Deployed at https://replicate.com/anotherjesse/zeroscope-v2-xl

## Shoutouts

- [Text-To-Video-Finetuning](https://github.com/camenduru/Text-To-Video-Finetuning) - Finetune ModelScope's Text To Video model using Diffusers
- [Showlab](https://github.com/showlab/Tune-A-Video) and [bryandlee](https://github.com/bryandlee/Tune-A-Video) for their Tune-A-Video contribution that made this much easier.
- [lucidrains](https://github.com/lucidrains) for their implementations around video diffusion.
- [cloneofsimo](https://github.com/cloneofsimo) for their diffusers implementation of LoRA.
- [kabachuha](https://github.com/kabachuha) for their conversion scripts, training ideas, and webui works.
- [JCBrouwer](https://github.com/JCBrouwer) for inference implementations.
- [sergiobr](https://github.com/sergiobr) for helpful ideas and bug fixes.
- [cjwbw/damo-text-to-video](https://replicate.com/cjwbw/damo-text-to-video) for the original [cog](https://github.com/replicate/cog) implementation
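For readers who want to try the deployed model, a minimal sketch (not part of this repo) of calling it through the Replicate Python client is shown below. The input field name "prompt" is an assumption -- the real input schema is defined by predict.py, which is not reproduced here -- and depending on the client version an explicit "owner/name:version" reference may be required.

import replicate

# Model named in the README above; a version hash may need to be appended
# (e.g. "anotherjesse/zeroscope-v2-xl:<version>") for older client versions.
output = replicate.run(
    "anotherjesse/zeroscope-v2-xl",
    input={"prompt": "a corgi surfing a wave"},  # field name assumed; see predict.py for the real schema
)
print(output)  # typically a URL (or list of URLs) pointing at the generated video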
bucketing.py
ADDED
@@ -0,0 +1,32 @@
from PIL import Image

def min_res(size, min_size): return 192 if size < 192 else size

def up_down_bucket(m_size, in_size, direction):
    if direction == 'down': return abs(int(m_size - in_size))
    if direction == 'up': return abs(int(m_size + in_size))

def get_bucket_sizes(size, direction: 'down', min_size):
    multipliers = [64, 128]
    for i, m in enumerate(multipliers):
        res = up_down_bucket(m, size, direction)
        multipliers[i] = min_res(res, min_size=min_size)
    return multipliers

def closest_bucket(m_size, size, direction, min_size):
    lst = get_bucket_sizes(m_size, direction, min_size)
    return lst[min(range(len(lst)), key=lambda i: abs(lst[i]-size))]

def resolve_bucket(i,h,w): return (i / (h / w))

def sensible_buckets(m_width, m_height, w, h, min_size=192):
    if h > w:
        w = resolve_bucket(m_width, h, w)
        w = closest_bucket(m_width, w, 'down', min_size=min_size)
        return w, m_height
    if h < w:
        h = resolve_bucket(m_height, w, h)
        h = closest_bucket(m_height, h, 'down', min_size=min_size)
        return m_width, h

    return m_width, m_height
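A quick sketch (not part of the repo) of what sensible_buckets computes: for a non-square source clip it rescales the short side to preserve aspect ratio, then snaps it to whichever of the two "down" buckets (target - 64 or target - 128, floored at 192) is closest, leaving the long side at the target size. The import path below is an assumption; inside the repo the module is imported relatively (from .bucketing import sensible_buckets).

from bucketing import sensible_buckets  # path assumed; adjust to wherever bucketing.py lives

# 1280x720 landscape source, 384x384 training target:
# the height is first scaled to 216 to keep the aspect ratio, then snapped to the
# nearer of the buckets [384 - 64, 384 - 128] = [320, 256], giving (384, 256).
w, h = sensible_buckets(m_width=384, m_height=384, w=1280, h=720)
print(w, h)  # 384 256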
cog.yaml
ADDED
@@ -0,0 +1,18 @@
build:
  gpu: true
  python_version: "3.10"
  cuda: "11.7"
  python_packages:
    - "accelerate==0.20.3"
    - "diffusers==0.17.1"
    - "gradio==3.35.2"
    - "imageio[ffmpeg]==2.31.1"
    - "torch==2.0.1"
    - "torchvision==0.15.2"
    - "transformers==4.30.2"
    - "einops==0.6.1"
    - "omegaconf==2.3.0"
    - "opencv-python-headless==4.7.0.72"
    - "decord==0.6.0"

predict: "predict.py:Predictor"
dataset.py
ADDED
@@ -0,0 +1,581 @@
import os
import decord
import numpy as np
import random
import json
import torchvision
import torchvision.transforms as T
import torch

from glob import glob
from PIL import Image
from itertools import islice
from pathlib import Path
from .bucketing import sensible_buckets

decord.bridge.set_bridge('torch')

from torch.utils.data import Dataset
from einops import rearrange, repeat

def get_prompt_ids(prompt, tokenizer):
    prompt_ids = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    ).input_ids

    return prompt_ids

def read_caption_file(caption_file):
    with open(caption_file, 'r', encoding="utf8") as t:
        return t.read()

def get_text_prompt(
        text_prompt: str = '',
        fallback_prompt: str = '',
        file_path: str = '',
        ext_types=['.mp4'],
        use_caption=False
    ):
    try:
        if use_caption:
            if len(text_prompt) > 1: return text_prompt
            caption_file = ''
            # Use caption on per-video basis (One caption PER video)
            for ext in ext_types:
                maybe_file = file_path.replace(ext, '.txt')
                if maybe_file.endswith(ext_types): continue
                if os.path.exists(maybe_file):
                    caption_file = maybe_file
                    break

            if os.path.exists(caption_file):
                return read_caption_file(caption_file)

            # Return fallback prompt if no conditions are met.
            return fallback_prompt

        return text_prompt
    except:
        print(f"Couldn't read prompt caption for {file_path}. Using fallback.")
        return fallback_prompt


def get_video_frames(vr, start_idx, sample_rate=1, max_frames=24):
    max_range = len(vr)
    frame_number = sorted((0, start_idx, max_range))[1]

    frame_range = range(frame_number, max_range, sample_rate)
    frame_range_indices = list(frame_range)[:max_frames]

    return frame_range_indices

def process_video(vid_path, use_bucketing, w, h, get_frame_buckets, get_frame_batch):
    if use_bucketing:
        vr = decord.VideoReader(vid_path)
        resize = get_frame_buckets(vr)
        video = get_frame_batch(vr, resize=resize)

    else:
        vr = decord.VideoReader(vid_path, width=w, height=h)
        video = get_frame_batch(vr)

    return video, vr

# https://github.com/ExponentialML/Video-BLIP2-Preprocessor
class VideoJsonDataset(Dataset):
    def __init__(
            self,
            tokenizer = None,
            width: int = 256,
            height: int = 256,
            n_sample_frames: int = 4,
            sample_start_idx: int = 1,
            frame_step: int = 1,
            json_path: str = "",
            json_data = None,
            vid_data_key: str = "video_path",
            preprocessed: bool = False,
            use_bucketing: bool = False,
            **kwargs
    ):
        self.vid_types = (".mp4", ".avi", ".mov", ".webm", ".flv", ".mjpeg")
        self.use_bucketing = use_bucketing
        self.tokenizer = tokenizer
        self.preprocessed = preprocessed

        self.vid_data_key = vid_data_key
        self.train_data = self.load_from_json(json_path, json_data)

        self.width = width
        self.height = height

        self.n_sample_frames = n_sample_frames
        self.sample_start_idx = sample_start_idx
        self.frame_step = frame_step

    def build_json(self, json_data):
        extended_data = []
        for data in json_data['data']:
            for nested_data in data['data']:
                self.build_json_dict(
                    data,
                    nested_data,
                    extended_data
                )
        json_data = extended_data
        return json_data

    def build_json_dict(self, data, nested_data, extended_data):
        clip_path = nested_data['clip_path'] if 'clip_path' in nested_data else None

        extended_data.append({
            self.vid_data_key: data[self.vid_data_key],
            'frame_index': nested_data['frame_index'],
            'prompt': nested_data['prompt'],
            'clip_path': clip_path
        })

    def load_from_json(self, path, json_data):
        try:
            with open(path) as jpath:
                print(f"Loading JSON from {path}")
                json_data = json.load(jpath)

                return self.build_json(json_data)

        except:
            self.train_data = []
            print("Non-existant JSON path. Skipping.")

    def validate_json(self, base_path, path):
        return os.path.exists(f"{base_path}/{path}")

    def get_frame_range(self, vr):
        return get_video_frames(
            vr,
            self.sample_start_idx,
            self.frame_step,
            self.n_sample_frames
        )

    def get_vid_idx(self, vr, vid_data=None):
        frames = self.n_sample_frames

        if vid_data is not None:
            idx = vid_data['frame_index']
        else:
            idx = self.sample_start_idx

        return idx

    def get_frame_buckets(self, vr):
        _, h, w = vr[0].shape
        width, height = sensible_buckets(self.width, self.height, h, w)
        resize = T.transforms.Resize((height, width), antialias=True)

        return resize

    def get_frame_batch(self, vr, resize=None):
        frame_range = self.get_frame_range(vr)
        frames = vr.get_batch(frame_range)
        video = rearrange(frames, "f h w c -> f c h w")

        if resize is not None: video = resize(video)
        return video

    def process_video_wrapper(self, vid_path):
        video, vr = process_video(
            vid_path,
            self.use_bucketing,
            self.width,
            self.height,
            self.get_frame_buckets,
            self.get_frame_batch
        )

        return video, vr

    def train_data_batch(self, index):

        # If we are training on individual clips.
        if 'clip_path' in self.train_data[index] and \
            self.train_data[index]['clip_path'] is not None:

            vid_data = self.train_data[index]

            clip_path = vid_data['clip_path']

            # Get video prompt
            prompt = vid_data['prompt']

            video, _ = self.process_video_wrapper(clip_path)

            prompt_ids = get_prompt_ids(prompt, self.tokenizer)

            return video, prompt, prompt_ids

        # Assign train data
        train_data = self.train_data[index]

        # Get the frame of the current index.
        self.sample_start_idx = train_data['frame_index']

        # Initialize resize
        resize = None

        video, vr = self.process_video_wrapper(train_data[self.vid_data_key])

        # Get video prompt
        prompt = train_data['prompt']
        vr.seek(0)

        prompt_ids = get_prompt_ids(prompt, self.tokenizer)

        return video, prompt, prompt_ids

    @staticmethod
    def __getname__(): return 'json'

    def __len__(self):
        if self.train_data is not None:
            return len(self.train_data)
        else:
            return 0

    def __getitem__(self, index):

        # Initialize variables
        video = None
        prompt = None
        prompt_ids = None

        # Use default JSON training
        if self.train_data is not None:
            video, prompt, prompt_ids = self.train_data_batch(index)

        example = {
            "pixel_values": (video / 127.5 - 1.0),
            "prompt_ids": prompt_ids[0],
            "text_prompt": prompt,
            'dataset': self.__getname__()
        }

        return example


class SingleVideoDataset(Dataset):
    def __init__(
            self,
            tokenizer = None,
            width: int = 256,
            height: int = 256,
            n_sample_frames: int = 4,
            frame_step: int = 1,
            single_video_path: str = "",
            single_video_prompt: str = "",
            use_caption: bool = False,
            use_bucketing: bool = False,
            **kwargs
    ):
        self.tokenizer = tokenizer
        self.use_bucketing = use_bucketing
        self.frames = []
        self.index = 1

        self.vid_types = (".mp4", ".avi", ".mov", ".webm", ".flv", ".mjpeg")
        self.n_sample_frames = n_sample_frames
        self.frame_step = frame_step

        self.single_video_path = single_video_path
        self.single_video_prompt = single_video_prompt

        self.width = width
        self.height = height

    def create_video_chunks(self):
        # Create a list of frames separated by sample frames
        # [(1,2,3), (4,5,6), ...]
        vr = decord.VideoReader(self.single_video_path)
        vr_range = range(1, len(vr), self.frame_step)

        self.frames = list(self.chunk(vr_range, self.n_sample_frames))

        # Delete any list that contains an out of range index.
        for i, inner_frame_nums in enumerate(self.frames):
            for frame_num in inner_frame_nums:
                if frame_num > len(vr):
                    print(f"Removing out of range index list at position: {i}...")
                    del self.frames[i]

        return self.frames

    def chunk(self, it, size):
        it = iter(it)
        return iter(lambda: tuple(islice(it, size)), ())

    def get_frame_batch(self, vr, resize=None):
        index = self.index
        frames = vr.get_batch(self.frames[self.index])
        video = rearrange(frames, "f h w c -> f c h w")

        if resize is not None: video = resize(video)
        return video

    def get_frame_buckets(self, vr):
        _, h, w = vr[0].shape
        width, height = sensible_buckets(self.width, self.height, h, w)
        resize = T.transforms.Resize((height, width), antialias=True)

        return resize

    def process_video_wrapper(self, vid_path):
        video, vr = process_video(
            vid_path,
            self.use_bucketing,
            self.width,
            self.height,
            self.get_frame_buckets,
            self.get_frame_batch
        )

        return video, vr

    def single_video_batch(self, index):
        train_data = self.single_video_path
        self.index = index

        if train_data.endswith(self.vid_types):
            video, _ = self.process_video_wrapper(train_data)

            prompt = self.single_video_prompt
            prompt_ids = get_prompt_ids(prompt, self.tokenizer)

            return video, prompt, prompt_ids
        else:
            raise ValueError(f"Single video is not a video type. Types: {self.vid_types}")

    @staticmethod
    def __getname__(): return 'single_video'

    def __len__(self):

        return len(self.create_video_chunks())

    def __getitem__(self, index):

        video, prompt, prompt_ids = self.single_video_batch(index)

        example = {
            "pixel_values": (video / 127.5 - 1.0),
            "prompt_ids": prompt_ids[0],
            "text_prompt": prompt,
            'dataset': self.__getname__()
        }

        return example

class ImageDataset(Dataset):

    def __init__(
        self,
        tokenizer = None,
        width: int = 256,
        height: int = 256,
        base_width: int = 256,
        base_height: int = 256,
        use_caption: bool = False,
        image_dir: str = '',
        single_img_prompt: str = '',
        use_bucketing: bool = False,
        fallback_prompt: str = '',
        **kwargs
    ):
        self.tokenizer = tokenizer
        self.img_types = (".png", ".jpg", ".jpeg", '.bmp')
        self.use_bucketing = use_bucketing

        self.image_dir = self.get_images_list(image_dir)
        self.fallback_prompt = fallback_prompt

        self.use_caption = use_caption
        self.single_img_prompt = single_img_prompt

        self.width = width
        self.height = height

    def get_images_list(self, image_dir):
        if os.path.exists(image_dir):
            imgs = [x for x in os.listdir(image_dir) if x.endswith(self.img_types)]
            full_img_dir = []

            for img in imgs:
                full_img_dir.append(f"{image_dir}/{img}")

            return sorted(full_img_dir)

        return ['']

    def image_batch(self, index):
        train_data = self.image_dir[index]
        img = train_data

        try:
            img = torchvision.io.read_image(img, mode=torchvision.io.ImageReadMode.RGB)
        except:
            img = T.transforms.PILToTensor()(Image.open(img).convert("RGB"))

        width = self.width
        height = self.height

        if self.use_bucketing:
            _, h, w = img.shape
            width, height = sensible_buckets(width, height, w, h)

        resize = T.transforms.Resize((height, width), antialias=True)

        img = resize(img)
        img = repeat(img, 'c h w -> f c h w', f=1)

        prompt = get_text_prompt(
            file_path=train_data,
            text_prompt=self.single_img_prompt,
            fallback_prompt=self.fallback_prompt,
            ext_types=self.img_types,
            use_caption=True
        )
        prompt_ids = get_prompt_ids(prompt, self.tokenizer)

        return img, prompt, prompt_ids

    @staticmethod
    def __getname__(): return 'image'

    def __len__(self):
        # Image directory
        if os.path.exists(self.image_dir[0]):
            return len(self.image_dir)
        else:
            return 0

    def __getitem__(self, index):
        img, prompt, prompt_ids = self.image_batch(index)
        example = {
            "pixel_values": (img / 127.5 - 1.0),
            "prompt_ids": prompt_ids[0],
            "text_prompt": prompt,
            'dataset': self.__getname__()
        }

        return example

class VideoFolderDataset(Dataset):
    def __init__(
        self,
        tokenizer=None,
        width: int = 256,
        height: int = 256,
        n_sample_frames: int = 16,
        fps: int = 8,
        path: str = "./data",
        fallback_prompt: str = "",
        use_bucketing: bool = False,
        **kwargs
    ):
        self.tokenizer = tokenizer
        self.use_bucketing = use_bucketing

        self.fallback_prompt = fallback_prompt

        self.video_files = glob(f"{path}/*.mp4")

        self.width = width
        self.height = height

        self.n_sample_frames = n_sample_frames
        self.fps = fps

    def get_frame_buckets(self, vr):
        _, h, w = vr[0].shape
        width, height = sensible_buckets(self.width, self.height, h, w)
        resize = T.transforms.Resize((height, width), antialias=True)

        return resize

    def get_frame_batch(self, vr, resize=None):
        n_sample_frames = self.n_sample_frames
        native_fps = vr.get_avg_fps()

        every_nth_frame = max(1, round(native_fps / self.fps))
        every_nth_frame = min(len(vr), every_nth_frame)

        effective_length = len(vr) // every_nth_frame
        if effective_length < n_sample_frames:
            n_sample_frames = effective_length

        effective_idx = random.randint(0, (effective_length - n_sample_frames))
        idxs = every_nth_frame * np.arange(effective_idx, effective_idx + n_sample_frames)

        video = vr.get_batch(idxs)
        video = rearrange(video, "f h w c -> f c h w")

        if resize is not None: video = resize(video)
        return video, vr

    def process_video_wrapper(self, vid_path):
        video, vr = process_video(
            vid_path,
            self.use_bucketing,
            self.width,
            self.height,
            self.get_frame_buckets,
            self.get_frame_batch
        )
        return video, vr

    def get_prompt_ids(self, prompt):
        return self.tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

    @staticmethod
    def __getname__(): return 'folder'

    def __len__(self):
        return len(self.video_files)

    def __getitem__(self, index):

        video, _ = self.process_video_wrapper(self.video_files[index])

        if os.path.exists(self.video_files[index].replace(".mp4", ".txt")):
            with open(self.video_files[index].replace(".mp4", ".txt"), "r") as f:
                prompt = f.read()
        else:
            prompt = self.fallback_prompt

        prompt_ids = self.get_prompt_ids(prompt)

        return {"pixel_values": (video[0] / 127.5 - 1.0), "prompt_ids": prompt_ids[0], "text_prompt": prompt, 'dataset': self.__getname__()}

class CachedDataset(Dataset):
    def __init__(self, cache_dir: str = ''):
        self.cache_dir = cache_dir
        self.cached_data_list = self.get_files_list()

    def get_files_list(self):
        tensors_list = [f"{self.cache_dir}/{x}" for x in os.listdir(self.cache_dir) if x.endswith('.pt')]
        return sorted(tensors_list)

    def __len__(self):
        return len(self.cached_data_list)

    def __getitem__(self, index):
        cached_latent = torch.load(self.cached_data_list[index], map_location='cuda:0')
        return cached_latent
download-weights
ADDED
@@ -0,0 +1,48 @@
#!/usr/bin/env python

import os
import shutil
import torch
from diffusers import DiffusionPipeline

MODEL_CACHE = "model-cache"
TMP_CACHE = "tmp-cache"

if os.path.exists(MODEL_CACHE):
    shutil.rmtree(MODEL_CACHE)
os.makedirs(MODEL_CACHE, exist_ok=True)

pipe = DiffusionPipeline.from_pretrained(
    "cerspense/zeroscope_v2_XL",
    torch_dtype=torch.float16,
    cache_dir=TMP_CACHE,
)

pipe.save_pretrained(MODEL_CACHE + "/xl")

pipe = DiffusionPipeline.from_pretrained(
    "cerspense/zeroscope_v2_576w",
    torch_dtype=torch.float16,
    cache_dir=TMP_CACHE,
)

pipe.save_pretrained(MODEL_CACHE + "/576w")

pipe = DiffusionPipeline.from_pretrained(
    "camenduru/potat1",
    torch_dtype=torch.float16,
    cache_dir=TMP_CACHE,
)

pipe.save_pretrained(MODEL_CACHE + "/potat1")

pipe = DiffusionPipeline.from_pretrained(
    "strangeman3107/animov-512x",
    torch_dtype=torch.float16,
    cache_dir=TMP_CACHE,
)

pipe.save_pretrained(MODEL_CACHE + "/animov-512x")

shutil.rmtree(TMP_CACHE)
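As a sanity check after running the script above, one of the cached pipelines can be loaded straight back from disk. The short sketch below is only an illustration: the fp16 dtype and CUDA device are assumptions, and the path matches MODEL_CACHE + "/576w" from the script.

import torch
from diffusers import DiffusionPipeline

# Load the zeroscope_v2_576w weights saved by the script above.
pipe = DiffusionPipeline.from_pretrained("model-cache/576w", torch_dtype=torch.float16)
pipe = pipe.to("cuda")
print(type(pipe).__name__)  # expected: TextToVideoSDPipeline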
inference.py
ADDED
@@ -0,0 +1,238 @@
import argparse
import os
import warnings
from pathlib import Path
from uuid import uuid4
from utils.lora import inject_inferable_lora
import torch
from diffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
from models.unet_3d_condition import UNet3DConditionModel
from einops import rearrange
from torch.nn.functional import interpolate
import imageio
import decord

from train import handle_memory_attention, load_primary_models
from utils.lama import inpaint_watermark


def initialize_pipeline(model, device="cuda", xformers=False, sdp=False):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        scheduler, tokenizer, text_encoder, vae, _unet = load_primary_models(model)
        del _unet  # This is a no op
        unet = UNet3DConditionModel.from_pretrained(model, subfolder='unet')
        # unet.disable_gradient_checkpointing()

        pipeline = TextToVideoSDPipeline.from_pretrained(
            pretrained_model_name_or_path=model,
            scheduler=scheduler,
            tokenizer=tokenizer,
            text_encoder=text_encoder.to(device=device, dtype=torch.half),
            vae=vae.to(device=device, dtype=torch.half),
            unet=unet.to(device=device, dtype=torch.half),
        )
        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
        unet._set_gradient_checkpointing(value=False)
        handle_memory_attention(xformers, sdp, unet)
        vae.enable_slicing()
        return pipeline


def vid2vid(
    pipeline, init_video, init_weight, prompt, negative_prompt, height, width, num_inference_steps, generator, guidance_scale
):
    num_frames = init_video.shape[2]
    init_video = rearrange(init_video, "b c f h w -> (b f) c h w")
    pipeline.generator = generator
    latents = pipeline.vae.encode(init_video).latent_dist.sample()
    latents = rearrange(latents, "(b f) c h w -> b c f h w", f=num_frames)
    latents = pipeline.scheduler.add_noise(
        original_samples=latents * 0.18215,
        noise=torch.randn_like(latents),
        timesteps=(torch.ones(latents.shape[0]) * pipeline.scheduler.num_train_timesteps * (1 - init_weight)).long(),
    )
    if latents.shape[0] != len(prompt):
        latents = latents.repeat(len(prompt), 1, 1, 1, 1)

    do_classifier_free_guidance = guidance_scale > 1.0

    prompt_embeds = pipeline._encode_prompt(
        prompt=prompt,
        negative_prompt=negative_prompt,
        device=latents.device,
        num_images_per_prompt=1,
        do_classifier_free_guidance=do_classifier_free_guidance,
    )

    pipeline.scheduler.set_timesteps(num_inference_steps, device=latents.device)
    timesteps = pipeline.scheduler.timesteps
    timesteps = timesteps[round(init_weight * len(timesteps)) :]

    with pipeline.progress_bar(total=len(timesteps)) as progress_bar:
        for t in timesteps:
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = pipeline.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = pipeline.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # reshape latents
            bsz, channel, frames, width, height = latents.shape
            latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)
            noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)

            # compute the previous noisy sample x_t -> x_t-1
            latents = pipeline.scheduler.step(noise_pred, t, latents).prev_sample

            # reshape latents back
            latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4)

            progress_bar.update()

    video_tensor = pipeline.decode_latents(latents)

    return video_tensor


@torch.inference_mode()
def inference(
    model,
    prompt,
    negative_prompt=None,
    batch_size=1,
    num_frames=16,
    width=256,
    height=256,
    num_steps=50,
    guidance_scale=9,
    init_video=None,
    init_weight=0.5,
    device="cuda",
    xformers=False,
    sdp=False,
    lora_path='',
    lora_rank=64,
    seed=0,
):
    with torch.autocast(device, dtype=torch.half):
        pipeline = initialize_pipeline(model, device, xformers, sdp)
        inject_inferable_lora(pipeline, lora_path, r=lora_rank)
        prompt = [prompt] * batch_size
        negative_prompt = ([negative_prompt] * batch_size) if negative_prompt is not None else None

        if init_video is not None:
            g_cuda = torch.Generator(device='cuda')
            g_cuda.manual_seed(seed)
            g_cpu = torch.Generator()
            g_cpu.manual_seed(seed)
            videos = vid2vid(
                pipeline=pipeline,
                init_video=init_video.to(device=device, dtype=torch.half),
                init_weight=init_weight,
                prompt=prompt,
                negative_prompt=negative_prompt,
                height=height,
                width=width,
                num_inference_steps=num_steps,
                generator=g_cuda,
                guidance_scale=guidance_scale,
            )

        else:
            g_cuda = torch.Generator(device='cuda')
            g_cuda.manual_seed(seed)
            g_cpu = torch.Generator()
            g_cpu.manual_seed(seed)

            videos = pipeline(
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_frames=num_frames,
                height=height,
                width=width,
                num_inference_steps=num_steps,
                generator=g_cuda,
                guidance_scale=guidance_scale,
                output_type="pt",
            ).frames

        return videos

def export_to_video(video_frames, output_video_path, fps):
    writer = imageio.get_writer(output_video_path, format="FFMPEG", fps=fps)
    for frame in video_frames:
        writer.append_data(frame)
    writer.close()


def run(**args):
    decord.bridge.set_bridge("torch")

    output_dir = args.pop("output_dir")
    fps = args.pop("fps")
    remove_watermark = args.pop("remove_watermark")

    init_video = args.get("init_video", None)
    if init_video is not None:
        vr = decord.VideoReader(init_video)
        init = rearrange(vr[:], "f h w c -> c f h w").div(127.5).sub(1).unsqueeze(0)
        init = interpolate(init, size=(args['num_frames'], args['height'], args['width']), mode="trilinear")
        args["init_video"] = init

    videos = inference(**args)

    os.makedirs(output_dir, exist_ok=True)

    for idx, video in enumerate(videos):
        if remove_watermark:
            video = rearrange(video, "c f h w -> f c h w").add(1).div(2)
            video = inpaint_watermark(video)
            video = rearrange(video, "f c h w -> f h w c").clamp(0, 1).mul(255)
        else:
            video = rearrange(video, "c f h w -> f h w c").clamp(-1, 1).add(1).mul(127.5)

        video = video.byte().cpu().numpy()

        filename = os.path.join(output_dir, f"output-{idx}.mp4")
        export_to_video(video, filename, fps)
        yield filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", type=str, required=True)
    parser.add_argument("-p", "--prompt", type=str, required=True)
    parser.add_argument("-n", "--negative_prompt", type=str, default=None)
    parser.add_argument("-o", "--output_dir", type=str, default="./output")
    parser.add_argument("-B", "--batch_size", type=int, default=1)
    parser.add_argument("-T", "--num_frames", type=int, default=16)
    parser.add_argument("-W", "--width", type=int, default=256)
    parser.add_argument("-H", "--height", type=int, default=256)
    parser.add_argument("-s", "--num_steps", type=int, default=50)
    parser.add_argument("-g", "--guidance-scale", type=float, default=9)
    parser.add_argument("-i", "--init-video", type=str, default=None)
    parser.add_argument("-iw", "--init-weight", type=float, default=0.5)
    parser.add_argument("-f", "--fps", type=int, default=8)
    parser.add_argument("-d", "--device", type=str, default="cuda")
    parser.add_argument("-x", "--xformers", action="store_true")
    parser.add_argument("-S", "--sdp", action="store_true")
    parser.add_argument("-lP", "--lora_path", type=str, default="")
    parser.add_argument("-lR", "--lora_rank", type=int, default=64)
    parser.add_argument("-rw", "--remove-watermark", action="store_true")
    parser.add_argument("-seed", "--seed", type=int, default=0)
    args = vars(parser.parse_args())

    for filename in run(**args):
        print(filename)
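A minimal sketch of invoking the generator function above programmatically rather than through the CLI. All keyword names match the run()/inference() signatures shown; the model directory and prompt are placeholders, and the path assumes ./download-weights has populated model-cache/.

from inference import run

for filename in run(
    model="model-cache/576w",         # local weights dir, placeholder
    prompt="a corgi surfing a wave",  # placeholder prompt
    negative_prompt=None,
    width=576,
    height=320,
    num_frames=24,
    num_steps=30,
    sdp=True,                         # use PyTorch scaled-dot-product attention
    output_dir="./output",
    fps=8,
    remove_watermark=False,
):
    print(filename)                   # ./output/output-0.mp4, ...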
lama.py
ADDED
@@ -0,0 +1,350 @@
"""
Based on the implementation from:
https://huggingface.co/spaces/fffiloni/lama-video-watermark-remover/tree/main

Modules were adapted by Hans Brouwer to only support the final configuration of the model uploaded here:
https://huggingface.co/akhaliq/lama

Apache License 2.0: https://github.com/advimman/lama/blob/main/LICENSE

@article{suvorov2021resolution,
    title={Resolution-robust Large Mask Inpainting with Fourier Convolutions},
    author={Suvorov, Roman and Logacheva, Elizaveta and Mashikhin, Anton and Remizova, Anastasia and Ashukha, Arsenii and Silvestrov, Aleksei and Kong, Naejin and Goka, Harshith and Park, Kiwoong and Lempitsky, Victor},
    journal={arXiv preprint arXiv:2109.07161},
    year={2021}
}
"""

import os
import sys
from urllib.request import urlretrieve

import torch
from einops import rearrange
from PIL import Image
from torch import nn
from torch.nn import functional as F
from torchvision.transforms.functional import to_tensor
from tqdm import tqdm

from train import export_to_video


LAMA_URL = "https://huggingface.co/akhaliq/lama/resolve/main/best.ckpt"
LAMA_PATH = "models/lama.ckpt"


def download_progress(t):
    last_b = [0]

    def update_to(b=1, bsize=1, tsize=None):
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b

    return update_to


def download(url, path):
    with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=path) as t:
        urlretrieve(url, filename=path, reporthook=download_progress(t), data=None)


class FourierUnit(nn.Module):
    def __init__(self, in_channels, out_channels, groups=1):
        super(FourierUnit, self).__init__()
        self.groups = groups
        self.conv_layer = torch.nn.Conv2d(
            in_channels=in_channels * 2,
            out_channels=out_channels * 2,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=self.groups,
            bias=False,
        )
        self.bn = torch.nn.BatchNorm2d(out_channels * 2)
        self.relu = torch.nn.ReLU(inplace=True)

    def forward(self, x):
        batch = x.shape[0]

        # (batch, c, h, w/2+1, 2)
        fft_dim = (-2, -1)
        ffted = torch.fft.rfftn(x, dim=fft_dim, norm="ortho")
        ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
        ffted = ffted.permute(0, 1, 4, 2, 3).contiguous()  # (batch, c, 2, h, w/2+1)
        ffted = ffted.view((batch, -1) + ffted.size()[3:])

        ffted = self.conv_layer(ffted)  # (batch, c*2, h, w/2+1)
        ffted = self.relu(self.bn(ffted))

        # (batch, c, t, h, w/2+1, 2)
        ffted = ffted.view((batch, -1, 2) + ffted.size()[2:]).permute(0, 1, 3, 4, 2).contiguous()
        ffted = torch.complex(ffted[..., 0], ffted[..., 1])

        ifft_shape_slice = x.shape[-2:]
        output = torch.fft.irfftn(ffted, s=ifft_shape_slice, dim=fft_dim, norm="ortho")

        return output


class SpectralTransform(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, groups=1):
        super(SpectralTransform, self).__init__()
        self.stride = stride
        if stride == 2:
            self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
        else:
            self.downsample = nn.Identity()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels // 2, kernel_size=1, groups=groups, bias=False),
            nn.BatchNorm2d(out_channels // 2),
            nn.ReLU(inplace=True),
        )
        self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups)
        self.conv2 = torch.nn.Conv2d(out_channels // 2, out_channels, kernel_size=1, groups=groups, bias=False)

    def forward(self, x):
        x = self.downsample(x)
        x = self.conv1(x)
        output = self.fu(x)
        output = self.conv2(x + output)
        return output


class FFC(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        ratio_gin,
        ratio_gout,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        bias=False,
        padding_type="reflect",
        gated=False,
    ):
        super(FFC, self).__init__()

        assert stride == 1 or stride == 2, "Stride should be 1 or 2."
        self.stride = stride

        in_cg = int(in_channels * ratio_gin)
        in_cl = in_channels - in_cg
        out_cg = int(out_channels * ratio_gout)
        out_cl = out_channels - out_cg

        self.ratio_gin = ratio_gin
        self.ratio_gout = ratio_gout
        self.global_in_num = in_cg

        module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d
        self.convl2l = module(
            in_cl, out_cl, kernel_size, stride, padding, dilation, groups, bias, padding_mode=padding_type
        )
        module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d
        self.convl2g = module(
            in_cl, out_cg, kernel_size, stride, padding, dilation, groups, bias, padding_mode=padding_type
        )
        module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d
        self.convg2l = module(
            in_cg, out_cl, kernel_size, stride, padding, dilation, groups, bias, padding_mode=padding_type
        )
        module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform
        self.convg2g = module(in_cg, out_cg, stride, 1 if groups == 1 else groups // 2)

        self.gated = gated
        module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d
        self.gate = module(in_channels, 2, 1)

    def forward(self, x):
        x_l, x_g = x if type(x) is tuple else (x, 0)
        out_xl, out_xg = 0, 0

        if self.gated:
            total_input_parts = [x_l]
            if torch.is_tensor(x_g):
                total_input_parts.append(x_g)
            total_input = torch.cat(total_input_parts, dim=1)

            gates = torch.sigmoid(self.gate(total_input))
            g2l_gate, l2g_gate = gates.chunk(2, dim=1)
        else:
            g2l_gate, l2g_gate = 1, 1

        if self.ratio_gout != 1:
            out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate
        if self.ratio_gout != 0:
            out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g)

        return out_xl, out_xg


class FFC_BN_ACT(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        ratio_gin=0,
        ratio_gout=0,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        bias=False,
        norm_layer=nn.BatchNorm2d,
        activation_layer=nn.ReLU,
    ):
        super(FFC_BN_ACT, self).__init__()
        self.ffc = FFC(
            in_channels, out_channels, kernel_size, ratio_gin, ratio_gout, stride, padding, dilation, groups, bias
        )
        lnorm = nn.Identity if ratio_gout == 1 else norm_layer
        gnorm = nn.Identity if ratio_gout == 0 else norm_layer
        global_channels = int(out_channels * ratio_gout)
        self.bn_l = lnorm(out_channels - global_channels)
        self.bn_g = gnorm(global_channels)

        lact = nn.Identity if ratio_gout == 1 else activation_layer
        gact = nn.Identity if ratio_gout == 0 else activation_layer
        self.act_l = lact(inplace=True)
        self.act_g = gact(inplace=True)

    def forward(self, x):
        x_l, x_g = self.ffc(x)
        x_l = self.act_l(self.bn_l(x_l))
        x_g = self.act_g(self.bn_g(x_g))
        return x_l, x_g


class FFCResnetBlock(nn.Module):
    def __init__(self, dim, ratio_gin, ratio_gout):
        super().__init__()
        self.conv1 = FFC_BN_ACT(
            dim, dim, kernel_size=3, padding=1, dilation=1, ratio_gin=ratio_gin, ratio_gout=ratio_gout
        )
        self.conv2 = FFC_BN_ACT(
            dim, dim, kernel_size=3, padding=1, dilation=1, ratio_gin=ratio_gin, ratio_gout=ratio_gout
        )

    def forward(self, x):
        x_l, x_g = x if type(x) is tuple else (x, 0)
        id_l, id_g = x_l, x_g
        x_l, x_g = self.conv1((x_l, x_g))
        x_l, x_g = self.conv2((x_l, x_g))
        x_l, x_g = id_l + x_l, id_g + x_g
        out = x_l, x_g
        return out


class ConcatTupleLayer(nn.Module):
    def forward(self, x):
        assert isinstance(x, tuple)
        x_l, x_g = x
        assert torch.is_tensor(x_l) or torch.is_tensor(x_g)
        if not torch.is_tensor(x_g):
            return x_l
        return torch.cat(x, dim=1)


class LargeMaskInpainting(nn.Module):
    def __init__(self, input_nc=4, output_nc=3, ngf=64, n_downsampling=3, n_blocks=18, max_features=1024):
        super().__init__()

        model = [nn.ReflectionPad2d(3), FFC_BN_ACT(input_nc, ngf, kernel_size=7)]

        ### downsample
        for i in range(n_downsampling):
            mult = 2**i
            model += [
                FFC_BN_ACT(
                    min(max_features, ngf * mult),
                    min(max_features, ngf * mult * 2),
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    ratio_gout=0.75 if i == n_downsampling - 1 else 0,
                )
            ]

        ### resnet blocks
        for i in range(n_blocks):
            cur_resblock = FFCResnetBlock(min(max_features, ngf * 2**n_downsampling), ratio_gin=0.75, ratio_gout=0.75)
            model += [cur_resblock]

        model += [ConcatTupleLayer()]

        ### upsample
        for i in range(n_downsampling):
            mult = 2 ** (n_downsampling - i)
            model += [
                nn.ConvTranspose2d(
                    min(max_features, ngf * mult),
                    min(max_features, int(ngf * mult / 2)),
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    output_padding=1,
                ),
                nn.BatchNorm2d(min(max_features, int(ngf * mult / 2))),
                nn.ReLU(True),
            ]

        model += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7), nn.Sigmoid()]
        self.model = nn.Sequential(*model)

    def forward(self, img, mask):
        masked_img = img * (1 - mask)
        masked_img = torch.cat([masked_img, mask], dim=1)
        pred = self.model(masked_img)
        inpainted = mask * pred + (1 - mask) * img
        return inpainted


@torch.inference_mode()
def inpaint_watermark(imgs):
    if not os.path.exists(LAMA_PATH):
        download(LAMA_URL, LAMA_PATH)

    mask = to_tensor(Image.open("./utils/mask.png").convert("L")).unsqueeze(0).to(imgs.device)
    if mask.shape[-1] != imgs.shape[-1]:
        mask = F.interpolate(mask, size=(imgs.shape[2], imgs.shape[3]), mode="nearest")
    mask = mask.expand(imgs.shape[0], 1, mask.shape[2], mask.shape[3])

    model = LargeMaskInpainting().to(imgs.device)
    state_dict = torch.load(LAMA_PATH, map_location=imgs.device)["state_dict"]
    g_dict = {k.replace("generator.", ""): v for k, v in state_dict.items() if k.startswith("generator")}
    model.load_state_dict(g_dict)

    inpainted = model.forward(imgs, mask)

    return inpainted


if __name__ == "__main__":
    import decord

    decord.bridge.set_bridge("torch")

    if len(sys.argv) < 2:
        print("Usage: python -m utils.lama <path/to/video>")
        sys.exit(1)

    video_path = sys.argv[1]
    out_path = video_path.replace(".mp4", " inpainted.mp4")

    vr = decord.VideoReader(video_path)
    fps = vr.get_avg_fps()
    video = rearrange(vr[:], "f h w c -> f c h w").div(255)

    inpainted = inpaint_watermark(video)
    inpainted = rearrange(inpainted, "f c h w -> f h w c").clamp(0, 1).mul(255).byte().cpu().numpy()
    export_to_video(inpainted, out_path, fps)
lora.py
ADDED
@@ -0,0 +1,1312 @@
1 |
+
import json
|
2 |
+
import math
|
3 |
+
from itertools import groupby
|
4 |
+
import os
|
5 |
+
from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import PIL
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
|
13 |
+
try:
|
14 |
+
from safetensors.torch import safe_open
|
15 |
+
from safetensors.torch import save_file as safe_save
|
16 |
+
|
17 |
+
safetensors_available = True
|
18 |
+
except ImportError:
|
19 |
+
from .safe_open import safe_open
|
20 |
+
|
21 |
+
def safe_save(
|
22 |
+
tensors: Dict[str, torch.Tensor],
|
23 |
+
filename: str,
|
24 |
+
metadata: Optional[Dict[str, str]] = None,
|
25 |
+
) -> None:
|
26 |
+
raise EnvironmentError(
|
27 |
+
"Saving safetensors requires the safetensors library. Please install with pip or similar."
|
28 |
+
)
|
29 |
+
|
30 |
+
safetensors_available = False
|
31 |
+
|
32 |
+
|
33 |
+
class LoraInjectedLinear(nn.Module):
|
34 |
+
def __init__(
|
35 |
+
self, in_features, out_features, bias=False, r=4, dropout_p=0.1, scale=1.0
|
36 |
+
):
|
37 |
+
super().__init__()
|
38 |
+
|
39 |
+
if r > min(in_features, out_features):
|
40 |
+
#raise ValueError(
|
41 |
+
# f"LoRA rank {r} must be less or equal than {min(in_features, out_features)}"
|
42 |
+
#)
|
43 |
+
print(f"LoRA rank {r} is too large. setting to: {min(in_features, out_features)}")
|
44 |
+
r = min(in_features, out_features)
|
45 |
+
|
46 |
+
self.r = r
|
47 |
+
self.linear = nn.Linear(in_features, out_features, bias)
|
48 |
+
self.lora_down = nn.Linear(in_features, r, bias=False)
|
49 |
+
self.dropout = nn.Dropout(dropout_p)
|
50 |
+
self.lora_up = nn.Linear(r, out_features, bias=False)
|
51 |
+
self.scale = scale
|
52 |
+
self.selector = nn.Identity()
|
53 |
+
|
54 |
+
nn.init.normal_(self.lora_down.weight, std=1 / r)
|
55 |
+
nn.init.zeros_(self.lora_up.weight)
|
56 |
+
|
57 |
+
def forward(self, input):
|
58 |
+
return (
|
59 |
+
self.linear(input)
|
60 |
+
+ self.dropout(self.lora_up(self.selector(self.lora_down(input))))
|
61 |
+
* self.scale
|
62 |
+
)
|
63 |
+
|
64 |
+
def realize_as_lora(self):
|
65 |
+
return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
|
66 |
+
|
67 |
+
def set_selector_from_diag(self, diag: torch.Tensor):
|
68 |
+
# diag is a 1D tensor of size (r,)
|
69 |
+
assert diag.shape == (self.r,)
|
70 |
+
self.selector = nn.Linear(self.r, self.r, bias=False)
|
71 |
+
self.selector.weight.data = torch.diag(diag)
|
72 |
+
self.selector.weight.data = self.selector.weight.data.to(
|
73 |
+
self.lora_up.weight.device
|
74 |
+
).to(self.lora_up.weight.dtype)
|
75 |
+
|
76 |
+
|
77 |
+
class LoraInjectedConv2d(nn.Module):
|
78 |
+
def __init__(
|
79 |
+
self,
|
80 |
+
in_channels: int,
|
81 |
+
out_channels: int,
|
82 |
+
kernel_size,
|
83 |
+
stride=1,
|
84 |
+
padding=0,
|
85 |
+
dilation=1,
|
86 |
+
groups: int = 1,
|
87 |
+
bias: bool = True,
|
88 |
+
r: int = 4,
|
89 |
+
dropout_p: float = 0.1,
|
90 |
+
scale: float = 1.0,
|
91 |
+
):
|
92 |
+
super().__init__()
|
93 |
+
if r > min(in_channels, out_channels):
|
94 |
+
print(f"LoRA rank {r} is too large. setting to: {min(in_channels, out_channels)}")
|
95 |
+
r = min(in_channels, out_channels)
|
96 |
+
|
97 |
+
self.r = r
|
98 |
+
self.conv = nn.Conv2d(
|
99 |
+
in_channels=in_channels,
|
100 |
+
out_channels=out_channels,
|
101 |
+
kernel_size=kernel_size,
|
102 |
+
stride=stride,
|
103 |
+
padding=padding,
|
104 |
+
dilation=dilation,
|
105 |
+
groups=groups,
|
106 |
+
bias=bias,
|
107 |
+
)
|
108 |
+
|
109 |
+
self.lora_down = nn.Conv2d(
|
110 |
+
in_channels=in_channels,
|
111 |
+
out_channels=r,
|
112 |
+
kernel_size=kernel_size,
|
113 |
+
stride=stride,
|
114 |
+
padding=padding,
|
115 |
+
dilation=dilation,
|
116 |
+
groups=groups,
|
117 |
+
bias=False,
|
118 |
+
)
|
119 |
+
self.dropout = nn.Dropout(dropout_p)
|
120 |
+
self.lora_up = nn.Conv2d(
|
121 |
+
in_channels=r,
|
122 |
+
out_channels=out_channels,
|
123 |
+
kernel_size=1,
|
124 |
+
stride=1,
|
125 |
+
padding=0,
|
126 |
+
bias=False,
|
127 |
+
)
|
128 |
+
self.selector = nn.Identity()
|
129 |
+
self.scale = scale
|
130 |
+
|
131 |
+
nn.init.normal_(self.lora_down.weight, std=1 / r)
|
132 |
+
nn.init.zeros_(self.lora_up.weight)
|
133 |
+
|
134 |
+
def forward(self, input):
|
135 |
+
return (
|
136 |
+
self.conv(input)
|
137 |
+
+ self.dropout(self.lora_up(self.selector(self.lora_down(input))))
|
138 |
+
* self.scale
|
139 |
+
)
|
140 |
+
|
141 |
+
def realize_as_lora(self):
|
142 |
+
return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
|
143 |
+
|
144 |
+
def set_selector_from_diag(self, diag: torch.Tensor):
|
145 |
+
# diag is a 1D tensor of size (r,)
|
146 |
+
assert diag.shape == (self.r,)
|
147 |
+
self.selector = nn.Conv2d(
|
148 |
+
in_channels=self.r,
|
149 |
+
out_channels=self.r,
|
150 |
+
kernel_size=1,
|
151 |
+
stride=1,
|
152 |
+
padding=0,
|
153 |
+
bias=False,
|
154 |
+
)
|
155 |
+
self.selector.weight.data = torch.diag(diag)
|
156 |
+
|
157 |
+
# same device + dtype as lora_up
|
158 |
+
self.selector.weight.data = self.selector.weight.data.to(
|
159 |
+
self.lora_up.weight.device
|
160 |
+
).to(self.lora_up.weight.dtype)
|
161 |
+
|
162 |
+
class LoraInjectedConv3d(nn.Module):
|
163 |
+
def __init__(
|
164 |
+
self,
|
165 |
+
in_channels: int,
|
166 |
+
out_channels: int,
|
167 |
+
kernel_size: (3, 1, 1),
|
168 |
+
padding: (1, 0, 0),
|
169 |
+
bias: bool = False,
|
170 |
+
r: int = 4,
|
171 |
+
dropout_p: float = 0,
|
172 |
+
scale: float = 1.0,
|
173 |
+
):
|
174 |
+
super().__init__()
|
175 |
+
if r > min(in_channels, out_channels):
|
176 |
+
print(f"LoRA rank {r} is too large. setting to: {min(in_channels, out_channels)}")
|
177 |
+
r = min(in_channels, out_channels)
|
178 |
+
|
179 |
+
self.r = r
|
180 |
+
self.kernel_size = kernel_size
|
181 |
+
self.padding = padding
|
182 |
+
self.conv = nn.Conv3d(
|
183 |
+
in_channels=in_channels,
|
184 |
+
out_channels=out_channels,
|
185 |
+
kernel_size=kernel_size,
|
186 |
+
padding=padding,
|
187 |
+
)
|
188 |
+
|
189 |
+
self.lora_down = nn.Conv3d(
|
190 |
+
in_channels=in_channels,
|
191 |
+
out_channels=r,
|
192 |
+
kernel_size=kernel_size,
|
193 |
+
bias=False,
|
194 |
+
padding=padding
|
195 |
+
)
|
196 |
+
self.dropout = nn.Dropout(dropout_p)
|
197 |
+
self.lora_up = nn.Conv3d(
|
198 |
+
in_channels=r,
|
199 |
+
out_channels=out_channels,
|
200 |
+
kernel_size=1,
|
201 |
+
stride=1,
|
202 |
+
padding=0,
|
203 |
+
bias=False,
|
204 |
+
)
|
205 |
+
self.selector = nn.Identity()
|
206 |
+
self.scale = scale
|
207 |
+
|
208 |
+
nn.init.normal_(self.lora_down.weight, std=1 / r)
|
209 |
+
nn.init.zeros_(self.lora_up.weight)
|
210 |
+
|
211 |
+
def forward(self, input):
|
212 |
+
return (
|
213 |
+
self.conv(input)
|
214 |
+
+ self.dropout(self.lora_up(self.selector(self.lora_down(input))))
|
215 |
+
* self.scale
|
216 |
+
)
|
217 |
+
|
218 |
+
def realize_as_lora(self):
|
219 |
+
return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
|
220 |
+
|
221 |
+
def set_selector_from_diag(self, diag: torch.Tensor):
|
222 |
+
# diag is a 1D tensor of size (r,)
|
223 |
+
assert diag.shape == (self.r,)
|
224 |
+
self.selector = nn.Conv3d(
|
225 |
+
in_channels=self.r,
|
226 |
+
out_channels=self.r,
|
227 |
+
kernel_size=1,
|
228 |
+
stride=1,
|
229 |
+
padding=0,
|
230 |
+
bias=False,
|
231 |
+
)
|
232 |
+
self.selector.weight.data = torch.diag(diag)
|
233 |
+
|
234 |
+
# same device + dtype as lora_up
|
235 |
+
self.selector.weight.data = self.selector.weight.data.to(
|
236 |
+
self.lora_up.weight.device
|
237 |
+
).to(self.lora_up.weight.dtype)
|
238 |
+
|
239 |
+
UNET_DEFAULT_TARGET_REPLACE = {"CrossAttention", "Attention", "GEGLU"}
|
240 |
+
|
241 |
+
UNET_EXTENDED_TARGET_REPLACE = {"ResnetBlock2D", "CrossAttention", "Attention", "GEGLU"}
|
242 |
+
|
243 |
+
TEXT_ENCODER_DEFAULT_TARGET_REPLACE = {"CLIPAttention"}
|
244 |
+
|
245 |
+
TEXT_ENCODER_EXTENDED_TARGET_REPLACE = {"CLIPAttention"}
|
246 |
+
|
247 |
+
DEFAULT_TARGET_REPLACE = UNET_DEFAULT_TARGET_REPLACE
|
248 |
+
|
249 |
+
EMBED_FLAG = "<embed>"
|
250 |
+
|
251 |
+
|
252 |
+
def _find_children(
|
253 |
+
model,
|
254 |
+
search_class: List[Type[nn.Module]] = [nn.Linear],
|
255 |
+
):
|
256 |
+
"""
|
257 |
+
Find all modules of a certain class (or union of classes).
|
258 |
+
|
259 |
+
Returns all matching modules, along with the parent of those modules and the
|
260 |
+
names they are referenced by.
|
261 |
+
"""
|
262 |
+
# For each target find every linear_class module that isn't a child of a LoraInjectedLinear
|
263 |
+
for parent in model.modules():
|
264 |
+
for name, module in parent.named_children():
|
265 |
+
if any([isinstance(module, _class) for _class in search_class]):
|
266 |
+
yield parent, name, module
|
267 |
+
|
268 |
+
|
269 |
+
def _find_modules_v2(
|
270 |
+
model,
|
271 |
+
ancestor_class: Optional[Set[str]] = None,
|
272 |
+
search_class: List[Type[nn.Module]] = [nn.Linear],
|
273 |
+
exclude_children_of: Optional[List[Type[nn.Module]]] = [
|
274 |
+
LoraInjectedLinear,
|
275 |
+
LoraInjectedConv2d,
|
276 |
+
LoraInjectedConv3d
|
277 |
+
],
|
278 |
+
):
|
279 |
+
"""
|
280 |
+
Find all modules of a certain class (or union of classes) that are direct or
|
281 |
+
indirect descendants of other modules of a certain class (or union of classes).
|
282 |
+
|
283 |
+
Returns all matching modules, along with the parent of those modules and the
|
284 |
+
names they are referenced by.
|
285 |
+
"""
|
286 |
+
|
287 |
+
# Get the targets we should replace all linears under
|
288 |
+
if ancestor_class is not None:
|
289 |
+
ancestors = (
|
290 |
+
module
|
291 |
+
for module in model.modules()
|
292 |
+
if module.__class__.__name__ in ancestor_class
|
293 |
+
)
|
294 |
+
else:
|
295 |
+
# this, in case you want to naively iterate over all modules.
|
296 |
+
ancestors = [module for module in model.modules()]
|
297 |
+
|
298 |
+
# For each target find every linear_class module that isn't a child of a LoraInjectedLinear
|
299 |
+
for ancestor in ancestors:
|
300 |
+
for fullname, module in ancestor.named_modules():
|
301 |
+
if any([isinstance(module, _class) for _class in search_class]):
|
302 |
+
# Find the direct parent if this is a descendant, not a child, of target
|
303 |
+
*path, name = fullname.split(".")
|
304 |
+
parent = ancestor
|
305 |
+
while path:
|
306 |
+
parent = parent.get_submodule(path.pop(0))
|
307 |
+
# Skip this linear if it's a child of a LoraInjectedLinear
|
308 |
+
if exclude_children_of and any(
|
309 |
+
[isinstance(parent, _class) for _class in exclude_children_of]
|
310 |
+
):
|
311 |
+
continue
|
312 |
+
# Otherwise, yield it
|
313 |
+
yield parent, name, module
|
314 |
+
|
315 |
+
|
316 |
+
def _find_modules_old(
|
317 |
+
model,
|
318 |
+
ancestor_class: Set[str] = DEFAULT_TARGET_REPLACE,
|
319 |
+
search_class: List[Type[nn.Module]] = [nn.Linear],
|
320 |
+
exclude_children_of: Optional[List[Type[nn.Module]]] = [LoraInjectedLinear],
|
321 |
+
):
|
322 |
+
ret = []
|
323 |
+
for _module in model.modules():
|
324 |
+
if _module.__class__.__name__ in ancestor_class:
|
325 |
+
|
326 |
+
for name, _child_module in _module.named_modules():
|
327 |
+
if _child_module.__class__ in search_class:
|
328 |
+
ret.append((_module, name, _child_module))
|
329 |
+
print(ret)
|
330 |
+
return ret
|
331 |
+
|
332 |
+
|
333 |
+
_find_modules = _find_modules_v2
|
334 |
+
|
335 |
+
|
336 |
+
def inject_trainable_lora(
|
337 |
+
model: nn.Module,
|
338 |
+
target_replace_module: Set[str] = DEFAULT_TARGET_REPLACE,
|
339 |
+
r: int = 4,
|
340 |
+
loras=None, # path to lora .pt
|
341 |
+
verbose: bool = False,
|
342 |
+
dropout_p: float = 0.0,
|
343 |
+
scale: float = 1.0,
|
344 |
+
):
|
345 |
+
"""
|
346 |
+
Inject LoRA into the model and return the LoRA parameter groups.
|
347 |
+
"""
|
348 |
+
|
349 |
+
require_grad_params = []
|
350 |
+
names = []
|
351 |
+
|
352 |
+
if loras is not None:
|
353 |
+
loras = torch.load(loras)
|
354 |
+
|
355 |
+
for _module, name, _child_module in _find_modules(
|
356 |
+
model, target_replace_module, search_class=[nn.Linear]
|
357 |
+
):
|
358 |
+
weight = _child_module.weight
|
359 |
+
bias = _child_module.bias
|
360 |
+
if verbose:
|
361 |
+
print("LoRA Injection : injecting lora into ", name)
|
362 |
+
print("LoRA Injection : weight shape", weight.shape)
|
363 |
+
_tmp = LoraInjectedLinear(
|
364 |
+
_child_module.in_features,
|
365 |
+
_child_module.out_features,
|
366 |
+
_child_module.bias is not None,
|
367 |
+
r=r,
|
368 |
+
dropout_p=dropout_p,
|
369 |
+
scale=scale,
|
370 |
+
)
|
371 |
+
_tmp.linear.weight = weight
|
372 |
+
if bias is not None:
|
373 |
+
_tmp.linear.bias = bias
|
374 |
+
|
375 |
+
# switch the module
|
376 |
+
_tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
|
377 |
+
_module._modules[name] = _tmp
|
378 |
+
|
379 |
+
require_grad_params.append(_module._modules[name].lora_up.parameters())
|
380 |
+
require_grad_params.append(_module._modules[name].lora_down.parameters())
|
381 |
+
|
382 |
+
if loras is not None:
|
383 |
+
_module._modules[name].lora_up.weight = loras.pop(0)
|
384 |
+
_module._modules[name].lora_down.weight = loras.pop(0)
|
385 |
+
|
386 |
+
_module._modules[name].lora_up.weight.requires_grad = True
|
387 |
+
_module._modules[name].lora_down.weight.requires_grad = True
|
388 |
+
names.append(name)
|
389 |
+
|
390 |
+
return require_grad_params, names
|
391 |
+
|
392 |
+
|
393 |
+
def inject_trainable_lora_extended(
|
394 |
+
model: nn.Module,
|
395 |
+
target_replace_module: Set[str] = UNET_EXTENDED_TARGET_REPLACE,
|
396 |
+
r: int = 4,
|
397 |
+
loras=None, # path to lora .pt
|
398 |
+
):
|
399 |
+
"""
|
400 |
+
Inject LoRA into the model and return the LoRA parameter groups.
|
401 |
+
"""
|
402 |
+
|
403 |
+
require_grad_params = []
|
404 |
+
names = []
|
405 |
+
|
406 |
+
if loras is not None:
|
407 |
+
loras = torch.load(loras)
|
408 |
+
|
409 |
+
for _module, name, _child_module in _find_modules(
|
410 |
+
model, target_replace_module, search_class=[nn.Linear, nn.Conv2d, nn.Conv3d]
|
411 |
+
):
|
412 |
+
if _child_module.__class__ == nn.Linear:
|
413 |
+
weight = _child_module.weight
|
414 |
+
bias = _child_module.bias
|
415 |
+
_tmp = LoraInjectedLinear(
|
416 |
+
_child_module.in_features,
|
417 |
+
_child_module.out_features,
|
418 |
+
_child_module.bias is not None,
|
419 |
+
r=r,
|
420 |
+
)
|
421 |
+
_tmp.linear.weight = weight
|
422 |
+
if bias is not None:
|
423 |
+
_tmp.linear.bias = bias
|
424 |
+
elif _child_module.__class__ == nn.Conv2d:
|
425 |
+
weight = _child_module.weight
|
426 |
+
bias = _child_module.bias
|
427 |
+
_tmp = LoraInjectedConv2d(
|
428 |
+
_child_module.in_channels,
|
429 |
+
_child_module.out_channels,
|
430 |
+
_child_module.kernel_size,
|
431 |
+
_child_module.stride,
|
432 |
+
_child_module.padding,
|
433 |
+
_child_module.dilation,
|
434 |
+
_child_module.groups,
|
435 |
+
_child_module.bias is not None,
|
436 |
+
r=r,
|
437 |
+
)
|
438 |
+
|
439 |
+
_tmp.conv.weight = weight
|
440 |
+
if bias is not None:
|
441 |
+
_tmp.conv.bias = bias
|
442 |
+
|
443 |
+
elif _child_module.__class__ == nn.Conv3d:
|
444 |
+
weight = _child_module.weight
|
445 |
+
bias = _child_module.bias
|
446 |
+
_tmp = LoraInjectedConv3d(
|
447 |
+
_child_module.in_channels,
|
448 |
+
_child_module.out_channels,
|
449 |
+
bias=_child_module.bias is not None,
|
450 |
+
kernel_size=_child_module.kernel_size,
|
451 |
+
padding=_child_module.padding,
|
452 |
+
r=r,
|
453 |
+
)
|
454 |
+
|
455 |
+
_tmp.conv.weight = weight
|
456 |
+
if bias is not None:
|
457 |
+
_tmp.conv.bias = bias
|
458 |
+
# switch the module
|
459 |
+
_tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
|
460 |
+
if bias is not None:
|
461 |
+
_tmp.to(_child_module.bias.device).to(_child_module.bias.dtype)
|
462 |
+
|
463 |
+
_module._modules[name] = _tmp
|
464 |
+
require_grad_params.append(_module._modules[name].lora_up.parameters())
|
465 |
+
require_grad_params.append(_module._modules[name].lora_down.parameters())
|
466 |
+
|
467 |
+
if loras is not None:
|
468 |
+
_module._modules[name].lora_up.weight = loras.pop(0)
|
469 |
+
_module._modules[name].lora_down.weight = loras.pop(0)
|
470 |
+
|
471 |
+
_module._modules[name].lora_up.weight.requires_grad = True
|
472 |
+
_module._modules[name].lora_down.weight.requires_grad = True
|
473 |
+
names.append(name)
|
474 |
+
|
475 |
+
return require_grad_params, names
|
476 |
+
|
477 |
+
|
478 |
+
def inject_inferable_lora(
|
479 |
+
model,
|
480 |
+
lora_path='',
|
481 |
+
unet_replace_modules=["UNet3DConditionModel"],
|
482 |
+
text_encoder_replace_modules=["CLIPEncoderLayer"],
|
483 |
+
is_extended=False,
|
484 |
+
r=16
|
485 |
+
):
|
486 |
+
from transformers.models.clip import CLIPTextModel
|
487 |
+
from diffusers import UNet3DConditionModel
|
488 |
+
|
489 |
+
def is_text_model(f): return 'text_encoder' in f and isinstance(model.text_encoder, CLIPTextModel)
|
490 |
+
def is_unet(f): return 'unet' in f and model.unet.__class__.__name__ == "UNet3DConditionModel"
|
491 |
+
|
492 |
+
if os.path.exists(lora_path):
|
493 |
+
try:
|
494 |
+
for f in os.listdir(lora_path):
|
495 |
+
if f.endswith('.pt'):
|
496 |
+
lora_file = os.path.join(lora_path, f)
|
497 |
+
|
498 |
+
if is_text_model(f):
|
499 |
+
monkeypatch_or_replace_lora(
|
500 |
+
model.text_encoder,
|
501 |
+
torch.load(lora_file),
|
502 |
+
target_replace_module=text_encoder_replace_modules,
|
503 |
+
r=r
|
504 |
+
)
|
505 |
+
print("Successfully loaded Text Encoder LoRa.")
|
506 |
+
continue
|
507 |
+
|
508 |
+
if is_unet(f):
|
509 |
+
monkeypatch_or_replace_lora_extended(
|
510 |
+
model.unet,
|
511 |
+
torch.load(lora_file),
|
512 |
+
target_replace_module=unet_replace_modules,
|
513 |
+
r=r
|
514 |
+
)
|
515 |
+
print("Successfully loaded UNET LoRa.")
|
516 |
+
continue
|
517 |
+
|
518 |
+
print("Found a .pt file, but doesn't have the correct name format. (unet.pt, text_encoder.pt)")
|
519 |
+
|
520 |
+
except Exception as e:
|
521 |
+
print(e)
|
522 |
+
print("Couldn't inject LoRA's due to an error.")
|
523 |
+
|
524 |
+
def extract_lora_ups_down(model, target_replace_module=DEFAULT_TARGET_REPLACE):
|
525 |
+
|
526 |
+
loras = []
|
527 |
+
|
528 |
+
for _m, _n, _child_module in _find_modules(
|
529 |
+
model,
|
530 |
+
target_replace_module,
|
531 |
+
search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d],
|
532 |
+
):
|
533 |
+
loras.append((_child_module.lora_up, _child_module.lora_down))
|
534 |
+
|
535 |
+
if len(loras) == 0:
|
536 |
+
raise ValueError("No lora injected.")
|
537 |
+
|
538 |
+
return loras
|
539 |
+
|
540 |
+
|
541 |
+
def extract_lora_as_tensor(
|
542 |
+
model, target_replace_module=DEFAULT_TARGET_REPLACE, as_fp16=True
|
543 |
+
):
|
544 |
+
|
545 |
+
loras = []
|
546 |
+
|
547 |
+
for _m, _n, _child_module in _find_modules(
|
548 |
+
model,
|
549 |
+
target_replace_module,
|
550 |
+
search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d],
|
551 |
+
):
|
552 |
+
up, down = _child_module.realize_as_lora()
|
553 |
+
if as_fp16:
|
554 |
+
up = up.to(torch.float16)
|
555 |
+
down = down.to(torch.float16)
|
556 |
+
|
557 |
+
loras.append((up, down))
|
558 |
+
|
559 |
+
if len(loras) == 0:
|
560 |
+
raise ValueError("No lora injected.")
|
561 |
+
|
562 |
+
return loras
|
563 |
+
|
564 |
+
|
565 |
+
def save_lora_weight(
|
566 |
+
model,
|
567 |
+
path="./lora.pt",
|
568 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
569 |
+
):
|
570 |
+
weights = []
|
571 |
+
for _up, _down in extract_lora_ups_down(
|
572 |
+
model, target_replace_module=target_replace_module
|
573 |
+
):
|
574 |
+
weights.append(_up.weight.to("cpu").to(torch.float32))
|
575 |
+
weights.append(_down.weight.to("cpu").to(torch.float32))
|
576 |
+
|
577 |
+
torch.save(weights, path)
|
578 |
+
|
579 |
+
|
580 |
+
def save_lora_as_json(model, path="./lora.json"):
|
581 |
+
weights = []
|
582 |
+
for _up, _down in extract_lora_ups_down(model):
|
583 |
+
weights.append(_up.weight.detach().cpu().numpy().tolist())
|
584 |
+
weights.append(_down.weight.detach().cpu().numpy().tolist())
|
585 |
+
|
586 |
+
import json
|
587 |
+
|
588 |
+
with open(path, "w") as f:
|
589 |
+
json.dump(weights, f)
|
590 |
+
|
591 |
+
|
592 |
+
def save_safeloras_with_embeds(
|
593 |
+
modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
|
594 |
+
embeds: Dict[str, torch.Tensor] = {},
|
595 |
+
outpath="./lora.safetensors",
|
596 |
+
):
|
597 |
+
"""
|
598 |
+
Saves the Lora from multiple modules in a single safetensor file.
|
599 |
+
|
600 |
+
modelmap is a dictionary of {
|
601 |
+
"module name": (module, target_replace_module)
|
602 |
+
}
|
603 |
+
"""
|
604 |
+
weights = {}
|
605 |
+
metadata = {}
|
606 |
+
|
607 |
+
for name, (model, target_replace_module) in modelmap.items():
|
608 |
+
metadata[name] = json.dumps(list(target_replace_module))
|
609 |
+
|
610 |
+
for i, (_up, _down) in enumerate(
|
611 |
+
extract_lora_as_tensor(model, target_replace_module)
|
612 |
+
):
|
613 |
+
rank = _down.shape[0]
|
614 |
+
|
615 |
+
metadata[f"{name}:{i}:rank"] = str(rank)
|
616 |
+
weights[f"{name}:{i}:up"] = _up
|
617 |
+
weights[f"{name}:{i}:down"] = _down
|
618 |
+
|
619 |
+
for token, tensor in embeds.items():
|
620 |
+
metadata[token] = EMBED_FLAG
|
621 |
+
weights[token] = tensor
|
622 |
+
|
623 |
+
print(f"Saving weights to {outpath}")
|
624 |
+
safe_save(weights, outpath, metadata)
|
625 |
+
|
626 |
+
|
627 |
+
def save_safeloras(
|
628 |
+
modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
|
629 |
+
outpath="./lora.safetensors",
|
630 |
+
):
|
631 |
+
return save_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
|
632 |
+
|
633 |
+
|
634 |
+
def convert_loras_to_safeloras_with_embeds(
|
635 |
+
modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
|
636 |
+
embeds: Dict[str, torch.Tensor] = {},
|
637 |
+
outpath="./lora.safetensors",
|
638 |
+
):
|
639 |
+
"""
|
640 |
+
Converts the Lora from multiple pytorch .pt files into a single safetensor file.
|
641 |
+
|
642 |
+
modelmap is a dictionary of {
|
643 |
+
"module name": (pytorch_model_path, target_replace_module, rank)
|
644 |
+
}
|
645 |
+
"""
|
646 |
+
|
647 |
+
weights = {}
|
648 |
+
metadata = {}
|
649 |
+
|
650 |
+
for name, (path, target_replace_module, r) in modelmap.items():
|
651 |
+
metadata[name] = json.dumps(list(target_replace_module))
|
652 |
+
|
653 |
+
lora = torch.load(path)
|
654 |
+
for i, weight in enumerate(lora):
|
655 |
+
is_up = i % 2 == 0
|
656 |
+
i = i // 2
|
657 |
+
|
658 |
+
if is_up:
|
659 |
+
metadata[f"{name}:{i}:rank"] = str(r)
|
660 |
+
weights[f"{name}:{i}:up"] = weight
|
661 |
+
else:
|
662 |
+
weights[f"{name}:{i}:down"] = weight
|
663 |
+
|
664 |
+
for token, tensor in embeds.items():
|
665 |
+
metadata[token] = EMBED_FLAG
|
666 |
+
weights[token] = tensor
|
667 |
+
|
668 |
+
print(f"Saving weights to {outpath}")
|
669 |
+
safe_save(weights, outpath, metadata)
|
670 |
+
|
671 |
+
|
672 |
+
def convert_loras_to_safeloras(
|
673 |
+
modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
|
674 |
+
outpath="./lora.safetensors",
|
675 |
+
):
|
676 |
+
convert_loras_to_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
|
677 |
+
|
678 |
+
|
679 |
+
def parse_safeloras(
|
680 |
+
safeloras,
|
681 |
+
) -> Dict[str, Tuple[List[nn.parameter.Parameter], List[int], List[str]]]:
|
682 |
+
"""
|
683 |
+
Converts a loaded safetensor file that contains a set of module Loras
|
684 |
+
into Parameters and other information
|
685 |
+
|
686 |
+
Output is a dictionary of {
|
687 |
+
"module name": (
|
688 |
+
[list of weights],
|
689 |
+
[list of ranks],
|
690 |
+
target_replacement_modules
|
691 |
+
)
|
692 |
+
}
|
693 |
+
"""
|
694 |
+
loras = {}
|
695 |
+
metadata = safeloras.metadata()
|
696 |
+
|
697 |
+
get_name = lambda k: k.split(":")[0]
|
698 |
+
|
699 |
+
keys = list(safeloras.keys())
|
700 |
+
keys.sort(key=get_name)
|
701 |
+
|
702 |
+
for name, module_keys in groupby(keys, get_name):
|
703 |
+
info = metadata.get(name)
|
704 |
+
|
705 |
+
if not info:
|
706 |
+
raise ValueError(
|
707 |
+
f"Tensor {name} has no metadata - is this a Lora safetensor?"
|
708 |
+
)
|
709 |
+
|
710 |
+
# Skip Textual Inversion embeds
|
711 |
+
if info == EMBED_FLAG:
|
712 |
+
continue
|
713 |
+
|
714 |
+
# Handle Loras
|
715 |
+
# Extract the targets
|
716 |
+
target = json.loads(info)
|
717 |
+
|
718 |
+
# Build the result lists - Python needs us to preallocate lists to insert into them
|
719 |
+
module_keys = list(module_keys)
|
720 |
+
ranks = [4] * (len(module_keys) // 2)
|
721 |
+
weights = [None] * len(module_keys)
|
722 |
+
|
723 |
+
for key in module_keys:
|
724 |
+
# Split the model name and index out of the key
|
725 |
+
_, idx, direction = key.split(":")
|
726 |
+
idx = int(idx)
|
727 |
+
|
728 |
+
# Add the rank
|
729 |
+
ranks[idx] = int(metadata[f"{name}:{idx}:rank"])
|
730 |
+
|
731 |
+
# Insert the weight into the list
|
732 |
+
idx = idx * 2 + (1 if direction == "down" else 0)
|
733 |
+
weights[idx] = nn.parameter.Parameter(safeloras.get_tensor(key))
|
734 |
+
|
735 |
+
loras[name] = (weights, ranks, target)
|
736 |
+
|
737 |
+
return loras
|
738 |
+
|
739 |
+
|
740 |
+
def parse_safeloras_embeds(
|
741 |
+
safeloras,
|
742 |
+
) -> Dict[str, torch.Tensor]:
|
743 |
+
"""
|
744 |
+
Converts a loaded safetensor file that contains Textual Inversion embeds into
|
745 |
+
a dictionary of embed_token: Tensor
|
746 |
+
"""
|
747 |
+
embeds = {}
|
748 |
+
metadata = safeloras.metadata()
|
749 |
+
|
750 |
+
for key in safeloras.keys():
|
751 |
+
# Only handle Textual Inversion embeds
|
752 |
+
meta = metadata.get(key)
|
753 |
+
if not meta or meta != EMBED_FLAG:
|
754 |
+
continue
|
755 |
+
|
756 |
+
embeds[key] = safeloras.get_tensor(key)
|
757 |
+
|
758 |
+
return embeds
|
759 |
+
|
760 |
+
|
761 |
+
def load_safeloras(path, device="cpu"):
|
762 |
+
safeloras = safe_open(path, framework="pt", device=device)
|
763 |
+
return parse_safeloras(safeloras)
|
764 |
+
|
765 |
+
|
766 |
+
def load_safeloras_embeds(path, device="cpu"):
|
767 |
+
safeloras = safe_open(path, framework="pt", device=device)
|
768 |
+
return parse_safeloras_embeds(safeloras)
|
769 |
+
|
770 |
+
|
771 |
+
def load_safeloras_both(path, device="cpu"):
|
772 |
+
safeloras = safe_open(path, framework="pt", device=device)
|
773 |
+
return parse_safeloras(safeloras), parse_safeloras_embeds(safeloras)
|
774 |
+
|
775 |
+
|
776 |
+
def collapse_lora(model, alpha=1.0):
|
777 |
+
|
778 |
+
for _module, name, _child_module in _find_modules(
|
779 |
+
model,
|
780 |
+
UNET_EXTENDED_TARGET_REPLACE | TEXT_ENCODER_EXTENDED_TARGET_REPLACE,
|
781 |
+
search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d],
|
782 |
+
):
|
783 |
+
|
784 |
+
if isinstance(_child_module, LoraInjectedLinear):
|
785 |
+
print("Collapsing Lin Lora in", name)
|
786 |
+
|
787 |
+
_child_module.linear.weight = nn.Parameter(
|
788 |
+
_child_module.linear.weight.data
|
789 |
+
+ alpha
|
790 |
+
* (
|
791 |
+
_child_module.lora_up.weight.data
|
792 |
+
@ _child_module.lora_down.weight.data
|
793 |
+
)
|
794 |
+
.type(_child_module.linear.weight.dtype)
|
795 |
+
.to(_child_module.linear.weight.device)
|
796 |
+
)
|
797 |
+
|
798 |
+
else:
|
799 |
+
print("Collapsing Conv Lora in", name)
|
800 |
+
_child_module.conv.weight = nn.Parameter(
|
801 |
+
_child_module.conv.weight.data
|
802 |
+
+ alpha
|
803 |
+
* (
|
804 |
+
_child_module.lora_up.weight.data.flatten(start_dim=1)
|
805 |
+
@ _child_module.lora_down.weight.data.flatten(start_dim=1)
|
806 |
+
)
|
807 |
+
.reshape(_child_module.conv.weight.data.shape)
|
808 |
+
.type(_child_module.conv.weight.dtype)
|
809 |
+
.to(_child_module.conv.weight.device)
|
810 |
+
)
|
811 |
+
|
812 |
+
|
813 |
+
def monkeypatch_or_replace_lora(
|
814 |
+
model,
|
815 |
+
loras,
|
816 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
817 |
+
r: Union[int, List[int]] = 4,
|
818 |
+
):
|
819 |
+
for _module, name, _child_module in _find_modules(
|
820 |
+
model, target_replace_module, search_class=[nn.Linear, LoraInjectedLinear]
|
821 |
+
):
|
822 |
+
_source = (
|
823 |
+
_child_module.linear
|
824 |
+
if isinstance(_child_module, LoraInjectedLinear)
|
825 |
+
else _child_module
|
826 |
+
)
|
827 |
+
|
828 |
+
weight = _source.weight
|
829 |
+
bias = _source.bias
|
830 |
+
_tmp = LoraInjectedLinear(
|
831 |
+
_source.in_features,
|
832 |
+
_source.out_features,
|
833 |
+
_source.bias is not None,
|
834 |
+
r=r.pop(0) if isinstance(r, list) else r,
|
835 |
+
)
|
836 |
+
_tmp.linear.weight = weight
|
837 |
+
|
838 |
+
if bias is not None:
|
839 |
+
_tmp.linear.bias = bias
|
840 |
+
|
841 |
+
# switch the module
|
842 |
+
_module._modules[name] = _tmp
|
843 |
+
|
844 |
+
up_weight = loras.pop(0)
|
845 |
+
down_weight = loras.pop(0)
|
846 |
+
|
847 |
+
_module._modules[name].lora_up.weight = nn.Parameter(
|
848 |
+
up_weight.type(weight.dtype)
|
849 |
+
)
|
850 |
+
_module._modules[name].lora_down.weight = nn.Parameter(
|
851 |
+
down_weight.type(weight.dtype)
|
852 |
+
)
|
853 |
+
|
854 |
+
_module._modules[name].to(weight.device)
|
855 |
+
|
856 |
+
|
857 |
+
def monkeypatch_or_replace_lora_extended(
|
858 |
+
model,
|
859 |
+
loras,
|
860 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
861 |
+
r: Union[int, List[int]] = 4,
|
862 |
+
):
|
863 |
+
for _module, name, _child_module in _find_modules(
|
864 |
+
model,
|
865 |
+
target_replace_module,
|
866 |
+
search_class=[
|
867 |
+
nn.Linear,
|
868 |
+
nn.Conv2d,
|
869 |
+
nn.Conv3d,
|
870 |
+
LoraInjectedLinear,
|
871 |
+
LoraInjectedConv2d,
|
872 |
+
LoraInjectedConv3d,
|
873 |
+
],
|
874 |
+
):
|
875 |
+
|
876 |
+
if (_child_module.__class__ == nn.Linear) or (
|
877 |
+
_child_module.__class__ == LoraInjectedLinear
|
878 |
+
):
|
879 |
+
if len(loras[0].shape) != 2:
|
880 |
+
continue
|
881 |
+
|
882 |
+
_source = (
|
883 |
+
_child_module.linear
|
884 |
+
if isinstance(_child_module, LoraInjectedLinear)
|
885 |
+
else _child_module
|
886 |
+
)
|
887 |
+
|
888 |
+
weight = _source.weight
|
889 |
+
bias = _source.bias
|
890 |
+
_tmp = LoraInjectedLinear(
|
891 |
+
_source.in_features,
|
892 |
+
_source.out_features,
|
893 |
+
_source.bias is not None,
|
894 |
+
r=r.pop(0) if isinstance(r, list) else r,
|
895 |
+
)
|
896 |
+
_tmp.linear.weight = weight
|
897 |
+
|
898 |
+
if bias is not None:
|
899 |
+
_tmp.linear.bias = bias
|
900 |
+
|
901 |
+
elif (_child_module.__class__ == nn.Conv2d) or (
|
902 |
+
_child_module.__class__ == LoraInjectedConv2d
|
903 |
+
):
|
904 |
+
if len(loras[0].shape) != 4:
|
905 |
+
continue
|
906 |
+
_source = (
|
907 |
+
_child_module.conv
|
908 |
+
if isinstance(_child_module, LoraInjectedConv2d)
|
909 |
+
else _child_module
|
910 |
+
)
|
911 |
+
|
912 |
+
weight = _source.weight
|
913 |
+
bias = _source.bias
|
914 |
+
_tmp = LoraInjectedConv2d(
|
915 |
+
_source.in_channels,
|
916 |
+
_source.out_channels,
|
917 |
+
_source.kernel_size,
|
918 |
+
_source.stride,
|
919 |
+
_source.padding,
|
920 |
+
_source.dilation,
|
921 |
+
_source.groups,
|
922 |
+
_source.bias is not None,
|
923 |
+
r=r.pop(0) if isinstance(r, list) else r,
|
924 |
+
)
|
925 |
+
|
926 |
+
_tmp.conv.weight = weight
|
927 |
+
|
928 |
+
if bias is not None:
|
929 |
+
_tmp.conv.bias = bias
|
930 |
+
|
931 |
+
elif _child_module.__class__ == nn.Conv3d or(
|
932 |
+
_child_module.__class__ == LoraInjectedConv3d
|
933 |
+
):
|
934 |
+
|
935 |
+
if len(loras[0].shape) != 5:
|
936 |
+
continue
|
937 |
+
|
938 |
+
_source = (
|
939 |
+
_child_module.conv
|
940 |
+
if isinstance(_child_module, LoraInjectedConv3d)
|
941 |
+
else _child_module
|
942 |
+
)
|
943 |
+
|
944 |
+
weight = _source.weight
|
945 |
+
bias = _source.bias
|
946 |
+
_tmp = LoraInjectedConv3d(
|
947 |
+
_source.in_channels,
|
948 |
+
_source.out_channels,
|
949 |
+
bias=_source.bias is not None,
|
950 |
+
kernel_size=_source.kernel_size,
|
951 |
+
padding=_source.padding,
|
952 |
+
r=r.pop(0) if isinstance(r, list) else r,
|
953 |
+
)
|
954 |
+
|
955 |
+
_tmp.conv.weight = weight
|
956 |
+
|
957 |
+
if bias is not None:
|
958 |
+
_tmp.conv.bias = bias
|
959 |
+
|
960 |
+
# switch the module
|
961 |
+
_module._modules[name] = _tmp
|
962 |
+
|
963 |
+
up_weight = loras.pop(0)
|
964 |
+
down_weight = loras.pop(0)
|
965 |
+
|
966 |
+
_module._modules[name].lora_up.weight = nn.Parameter(
|
967 |
+
up_weight.type(weight.dtype)
|
968 |
+
)
|
969 |
+
_module._modules[name].lora_down.weight = nn.Parameter(
|
970 |
+
down_weight.type(weight.dtype)
|
971 |
+
)
|
972 |
+
|
973 |
+
_module._modules[name].to(weight.device)
|
974 |
+
|
975 |
+
|
976 |
+
def monkeypatch_or_replace_safeloras(models, safeloras):
|
977 |
+
loras = parse_safeloras(safeloras)
|
978 |
+
|
979 |
+
for name, (lora, ranks, target) in loras.items():
|
980 |
+
model = getattr(models, name, None)
|
981 |
+
|
982 |
+
if not model:
|
983 |
+
print(f"No model provided for {name}, contained in Lora")
|
984 |
+
continue
|
985 |
+
|
986 |
+
monkeypatch_or_replace_lora_extended(model, lora, target, ranks)
|
987 |
+
|
988 |
+
|
989 |
+
def monkeypatch_remove_lora(model):
|
990 |
+
for _module, name, _child_module in _find_modules(
|
991 |
+
model, search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d]
|
992 |
+
):
|
993 |
+
if isinstance(_child_module, LoraInjectedLinear):
|
994 |
+
_source = _child_module.linear
|
995 |
+
weight, bias = _source.weight, _source.bias
|
996 |
+
|
997 |
+
_tmp = nn.Linear(
|
998 |
+
_source.in_features, _source.out_features, bias is not None
|
999 |
+
)
|
1000 |
+
|
1001 |
+
_tmp.weight = weight
|
1002 |
+
if bias is not None:
|
1003 |
+
_tmp.bias = bias
|
1004 |
+
|
1005 |
+
else:
|
1006 |
+
_source = _child_module.conv
|
1007 |
+
weight, bias = _source.weight, _source.bias
|
1008 |
+
|
1009 |
+
if isinstance(_source, nn.Conv2d):
|
1010 |
+
_tmp = nn.Conv2d(
|
1011 |
+
in_channels=_source.in_channels,
|
1012 |
+
out_channels=_source.out_channels,
|
1013 |
+
kernel_size=_source.kernel_size,
|
1014 |
+
stride=_source.stride,
|
1015 |
+
padding=_source.padding,
|
1016 |
+
dilation=_source.dilation,
|
1017 |
+
groups=_source.groups,
|
1018 |
+
bias=bias is not None,
|
1019 |
+
)
|
1020 |
+
|
1021 |
+
_tmp.weight = weight
|
1022 |
+
if bias is not None:
|
1023 |
+
_tmp.bias = bias
|
1024 |
+
|
1025 |
+
if isinstance(_source, nn.Conv3d):
|
1026 |
+
_tmp = nn.Conv3d(
|
1027 |
+
_source.in_channels,
|
1028 |
+
_source.out_channels,
|
1029 |
+
bias=_source.bias is not None,
|
1030 |
+
kernel_size=_source.kernel_size,
|
1031 |
+
padding=_source.padding,
|
1032 |
+
)
|
1033 |
+
|
1034 |
+
_tmp.weight = weight
|
1035 |
+
if bias is not None:
|
1036 |
+
_tmp.bias = bias
|
1037 |
+
|
1038 |
+
_module._modules[name] = _tmp
|
1039 |
+
|
1040 |
+
|
1041 |
+
def monkeypatch_add_lora(
|
1042 |
+
model,
|
1043 |
+
loras,
|
1044 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
1045 |
+
alpha: float = 1.0,
|
1046 |
+
beta: float = 1.0,
|
1047 |
+
):
|
1048 |
+
for _module, name, _child_module in _find_modules(
|
1049 |
+
model, target_replace_module, search_class=[LoraInjectedLinear]
|
1050 |
+
):
|
1051 |
+
weight = _child_module.linear.weight
|
1052 |
+
|
1053 |
+
up_weight = loras.pop(0)
|
1054 |
+
down_weight = loras.pop(0)
|
1055 |
+
|
1056 |
+
_module._modules[name].lora_up.weight = nn.Parameter(
|
1057 |
+
up_weight.type(weight.dtype).to(weight.device) * alpha
|
1058 |
+
+ _module._modules[name].lora_up.weight.to(weight.device) * beta
|
1059 |
+
)
|
1060 |
+
_module._modules[name].lora_down.weight = nn.Parameter(
|
1061 |
+
down_weight.type(weight.dtype).to(weight.device) * alpha
|
1062 |
+
+ _module._modules[name].lora_down.weight.to(weight.device) * beta
|
1063 |
+
)
|
1064 |
+
|
1065 |
+
_module._modules[name].to(weight.device)
|
1066 |
+
|
1067 |
+
|
1068 |
+
def tune_lora_scale(model, alpha: float = 1.0):
|
1069 |
+
for _module in model.modules():
|
1070 |
+
if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d", "LoraInjectedConv3d"]:
|
1071 |
+
_module.scale = alpha
|
1072 |
+
|
1073 |
+
|
1074 |
+
def set_lora_diag(model, diag: torch.Tensor):
|
1075 |
+
for _module in model.modules():
|
1076 |
+
if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d", "LoraInjectedConv3d"]:
|
1077 |
+
_module.set_selector_from_diag(diag)
|
1078 |
+
|
1079 |
+
|
1080 |
+
def _text_lora_path(path: str) -> str:
|
1081 |
+
assert path.endswith(".pt"), "Only .pt files are supported"
|
1082 |
+
return ".".join(path.split(".")[:-1] + ["text_encoder", "pt"])
|
1083 |
+
|
1084 |
+
|
1085 |
+
def _ti_lora_path(path: str) -> str:
|
1086 |
+
assert path.endswith(".pt"), "Only .pt files are supported"
|
1087 |
+
return ".".join(path.split(".")[:-1] + ["ti", "pt"])
|
1088 |
+
|
1089 |
+
|
1090 |
+
def apply_learned_embed_in_clip(
|
1091 |
+
learned_embeds,
|
1092 |
+
text_encoder,
|
1093 |
+
tokenizer,
|
1094 |
+
token: Optional[Union[str, List[str]]] = None,
|
1095 |
+
idempotent=False,
|
1096 |
+
):
|
1097 |
+
if isinstance(token, str):
|
1098 |
+
trained_tokens = [token]
|
1099 |
+
elif isinstance(token, list):
|
1100 |
+
assert len(learned_embeds.keys()) == len(
|
1101 |
+
token
|
1102 |
+
), "The number of tokens and the number of embeds should be the same"
|
1103 |
+
trained_tokens = token
|
1104 |
+
else:
|
1105 |
+
trained_tokens = list(learned_embeds.keys())
|
1106 |
+
|
1107 |
+
for token in trained_tokens:
|
1108 |
+
print(token)
|
1109 |
+
embeds = learned_embeds[token]
|
1110 |
+
|
1111 |
+
# cast to dtype of text_encoder
|
1112 |
+
dtype = text_encoder.get_input_embeddings().weight.dtype
|
1113 |
+
num_added_tokens = tokenizer.add_tokens(token)
|
1114 |
+
|
1115 |
+
i = 1
|
1116 |
+
if not idempotent:
|
1117 |
+
while num_added_tokens == 0:
|
1118 |
+
print(f"The tokenizer already contains the token {token}.")
|
1119 |
+
token = f"{token[:-1]}-{i}>"
|
1120 |
+
print(f"Attempting to add the token {token}.")
|
1121 |
+
num_added_tokens = tokenizer.add_tokens(token)
|
1122 |
+
i += 1
|
1123 |
+
elif num_added_tokens == 0 and idempotent:
|
1124 |
+
print(f"The tokenizer already contains the token {token}.")
|
1125 |
+
print(f"Replacing {token} embedding.")
|
1126 |
+
|
1127 |
+
# resize the token embeddings
|
1128 |
+
text_encoder.resize_token_embeddings(len(tokenizer))
|
1129 |
+
|
1130 |
+
# get the id for the token and assign the embeds
|
1131 |
+
token_id = tokenizer.convert_tokens_to_ids(token)
|
1132 |
+
text_encoder.get_input_embeddings().weight.data[token_id] = embeds
|
1133 |
+
return token
|
1134 |
+
|
1135 |
+
|
1136 |
+
def load_learned_embed_in_clip(
|
1137 |
+
learned_embeds_path,
|
1138 |
+
text_encoder,
|
1139 |
+
tokenizer,
|
1140 |
+
token: Optional[Union[str, List[str]]] = None,
|
1141 |
+
idempotent=False,
|
1142 |
+
):
|
1143 |
+
learned_embeds = torch.load(learned_embeds_path)
|
1144 |
+
apply_learned_embed_in_clip(
|
1145 |
+
learned_embeds, text_encoder, tokenizer, token, idempotent
|
1146 |
+
)
|
1147 |
+
|
1148 |
+
|
1149 |
+
def patch_pipe(
|
1150 |
+
pipe,
|
1151 |
+
maybe_unet_path,
|
1152 |
+
token: Optional[str] = None,
|
1153 |
+
r: int = 4,
|
1154 |
+
patch_unet=True,
|
1155 |
+
patch_text=True,
|
1156 |
+
patch_ti=True,
|
1157 |
+
idempotent_token=True,
|
1158 |
+
unet_target_replace_module=DEFAULT_TARGET_REPLACE,
|
1159 |
+
text_target_replace_module=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
|
1160 |
+
):
|
1161 |
+
if maybe_unet_path.endswith(".pt"):
|
1162 |
+
# torch format
|
1163 |
+
|
1164 |
+
if maybe_unet_path.endswith(".ti.pt"):
|
1165 |
+
unet_path = maybe_unet_path[:-6] + ".pt"
|
1166 |
+
elif maybe_unet_path.endswith(".text_encoder.pt"):
|
1167 |
+
unet_path = maybe_unet_path[:-16] + ".pt"
|
1168 |
+
else:
|
1169 |
+
unet_path = maybe_unet_path
|
1170 |
+
|
1171 |
+
ti_path = _ti_lora_path(unet_path)
|
1172 |
+
text_path = _text_lora_path(unet_path)
|
1173 |
+
|
1174 |
+
if patch_unet:
|
1175 |
+
print("LoRA : Patching Unet")
|
1176 |
+
monkeypatch_or_replace_lora(
|
1177 |
+
pipe.unet,
|
1178 |
+
torch.load(unet_path),
|
1179 |
+
r=r,
|
1180 |
+
target_replace_module=unet_target_replace_module,
|
1181 |
+
)
|
1182 |
+
|
1183 |
+
if patch_text:
|
1184 |
+
print("LoRA : Patching text encoder")
|
1185 |
+
monkeypatch_or_replace_lora(
|
1186 |
+
pipe.text_encoder,
|
1187 |
+
torch.load(text_path),
|
1188 |
+
target_replace_module=text_target_replace_module,
|
1189 |
+
r=r,
|
1190 |
+
)
|
1191 |
+
if patch_ti:
|
1192 |
+
print("LoRA : Patching token input")
|
1193 |
+
token = load_learned_embed_in_clip(
|
1194 |
+
ti_path,
|
1195 |
+
pipe.text_encoder,
|
1196 |
+
pipe.tokenizer,
|
1197 |
+
token=token,
|
1198 |
+
idempotent=idempotent_token,
|
1199 |
+
)
|
1200 |
+
|
1201 |
+
elif maybe_unet_path.endswith(".safetensors"):
|
1202 |
+
safeloras = safe_open(maybe_unet_path, framework="pt", device="cpu")
|
1203 |
+
monkeypatch_or_replace_safeloras(pipe, safeloras)
|
1204 |
+
tok_dict = parse_safeloras_embeds(safeloras)
|
1205 |
+
if patch_ti:
|
1206 |
+
apply_learned_embed_in_clip(
|
1207 |
+
tok_dict,
|
1208 |
+
pipe.text_encoder,
|
1209 |
+
pipe.tokenizer,
|
1210 |
+
token=token,
|
1211 |
+
idempotent=idempotent_token,
|
1212 |
+
)
|
1213 |
+
return tok_dict
|
1214 |
+
|
1215 |
+
|
1216 |
+
def train_patch_pipe(pipe, patch_unet, patch_text):
|
1217 |
+
if patch_unet:
|
1218 |
+
print("LoRA : Patching Unet")
|
1219 |
+
collapse_lora(pipe.unet)
|
1220 |
+
monkeypatch_remove_lora(pipe.unet)
|
1221 |
+
|
1222 |
+
if patch_text:
|
1223 |
+
print("LoRA : Patching text encoder")
|
1224 |
+
|
1225 |
+
collapse_lora(pipe.text_encoder)
|
1226 |
+
monkeypatch_remove_lora(pipe.text_encoder)
|
1227 |
+
|
1228 |
+
@torch.no_grad()
|
1229 |
+
def inspect_lora(model):
|
1230 |
+
moved = {}
|
1231 |
+
|
1232 |
+
for name, _module in model.named_modules():
|
1233 |
+
if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d", "LoraInjectedConv3d"]:
|
1234 |
+
ups = _module.lora_up.weight.data.clone()
|
1235 |
+
downs = _module.lora_down.weight.data.clone()
|
1236 |
+
|
1237 |
+
wght: torch.Tensor = ups.flatten(1) @ downs.flatten(1)
|
1238 |
+
|
1239 |
+
dist = wght.flatten().abs().mean().item()
|
1240 |
+
if name in moved:
|
1241 |
+
moved[name].append(dist)
|
1242 |
+
else:
|
1243 |
+
moved[name] = [dist]
|
1244 |
+
|
1245 |
+
return moved
|
1246 |
+
|
1247 |
+
|
1248 |
+
def save_all(
|
1249 |
+
unet,
|
1250 |
+
text_encoder,
|
1251 |
+
save_path,
|
1252 |
+
placeholder_token_ids=None,
|
1253 |
+
placeholder_tokens=None,
|
1254 |
+
save_lora=True,
|
1255 |
+
save_ti=True,
|
1256 |
+
target_replace_module_text=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
|
1257 |
+
target_replace_module_unet=DEFAULT_TARGET_REPLACE,
|
1258 |
+
safe_form=True,
|
1259 |
+
):
|
1260 |
+
if not safe_form:
|
1261 |
+
# save ti
|
1262 |
+
if save_ti:
|
1263 |
+
ti_path = _ti_lora_path(save_path)
|
1264 |
+
learned_embeds_dict = {}
|
1265 |
+
for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
|
1266 |
+
learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
|
1267 |
+
print(
|
1268 |
+
f"Current Learned Embeddings for {tok}:, id {tok_id} ",
|
1269 |
+
learned_embeds[:4],
|
1270 |
+
)
|
1271 |
+
learned_embeds_dict[tok] = learned_embeds.detach().cpu()
|
1272 |
+
|
1273 |
+
torch.save(learned_embeds_dict, ti_path)
|
1274 |
+
print("Ti saved to ", ti_path)
|
1275 |
+
|
1276 |
+
# save text encoder
|
1277 |
+
if save_lora:
|
1278 |
+
save_lora_weight(
|
1279 |
+
unet, save_path, target_replace_module=target_replace_module_unet
|
1280 |
+
)
|
1281 |
+
print("Unet saved to ", save_path)
|
1282 |
+
|
1283 |
+
save_lora_weight(
|
1284 |
+
text_encoder,
|
1285 |
+
_text_lora_path(save_path),
|
1286 |
+
target_replace_module=target_replace_module_text,
|
1287 |
+
)
|
1288 |
+
print("Text Encoder saved to ", _text_lora_path(save_path))
|
1289 |
+
|
1290 |
+
else:
|
1291 |
+
assert save_path.endswith(
|
1292 |
+
".safetensors"
|
1293 |
+
), f"Save path : {save_path} should end with .safetensors"
|
1294 |
+
|
1295 |
+
loras = {}
|
1296 |
+
embeds = {}
|
1297 |
+
|
1298 |
+
if save_lora:
|
1299 |
+
|
1300 |
+
loras["unet"] = (unet, target_replace_module_unet)
|
1301 |
+
loras["text_encoder"] = (text_encoder, target_replace_module_text)
|
1302 |
+
|
1303 |
+
if save_ti:
|
1304 |
+
for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
|
1305 |
+
learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
|
1306 |
+
print(
|
1307 |
+
f"Current Learned Embeddings for {tok}:, id {tok_id} ",
|
1308 |
+
learned_embeds[:4],
|
1309 |
+
)
|
1310 |
+
embeds[tok] = learned_embeds.detach().cpu()
|
1311 |
+
|
1312 |
+
save_safeloras_with_embeds(loras, embeds, save_path)
|
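To make the API above concrete, here is a minimal sketch of how these helpers fit together on a toy module. It is illustrative only and not part of this upload: the toy CrossAttention class, the dimensions, the optimizer settings, and the output path are assumptions, and the import path may differ depending on where the file lands in the final layout (train.py below imports this module as utils.lora).

```python
# Minimal, self-contained sketch (assumptions noted in the lead-in): inject
# trainable LoRA layers into a toy module whose class name matches an entry in
# DEFAULT_TARGET_REPLACE, train only those layers, then persist them.
import itertools

import torch
import torch.nn as nn

from lora import DEFAULT_TARGET_REPLACE, inject_trainable_lora, save_lora_weight


class CrossAttention(nn.Module):
    # The class name matters: _find_modules looks for ancestors whose class name
    # is listed in target_replace_module, then swaps the nn.Linear children inside.
    def __init__(self, dim=64):
        super().__init__()
        self.to_q = nn.Linear(dim, dim)
        self.to_k = nn.Linear(dim, dim)

    def forward(self, x):
        return self.to_q(x) + self.to_k(x)


model = nn.Sequential(CrossAttention())
model.requires_grad_(False)  # freeze the base weights; only LoRA weights will train

lora_params, lora_names = inject_trainable_lora(model, DEFAULT_TARGET_REPLACE, r=4)
optimizer = torch.optim.AdamW(itertools.chain(*lora_params), lr=1e-4)

x = torch.randn(8, 64)
loss = model(x).pow(2).mean()  # stand-in objective for one toy training step
loss.backward()
optimizer.step()

save_lora_weight(model, "./lora.pt", target_replace_module=DEFAULT_TARGET_REPLACE)
```

The saved .pt file can then be loaded back with monkeypatch_or_replace_lora (or injected into a pipeline with inject_inferable_lora), as the functions above describe.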
predict.py
ADDED
@@ -0,0 +1,101 @@
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
from cog import BasePredictor, Input, Path
|
4 |
+
import subprocess
|
5 |
+
import shutil
|
6 |
+
|
7 |
+
MODEL_CACHE = "model-cache"
|
8 |
+
|
9 |
+
class Predictor(BasePredictor):
|
10 |
+
def setup(self):
|
11 |
+
pass
|
12 |
+
|
13 |
+
def predict(
|
14 |
+
self,
|
15 |
+
prompt: str = Input(
|
16 |
+
description="Input prompt", default="An astronaut riding a horse"
|
17 |
+
),
|
18 |
+
negative_prompt: str = Input(
|
19 |
+
description="Negative prompt", default=None
|
20 |
+
),
|
21 |
+
init_video: Path = Input(
|
22 |
+
description="URL of the initial video (optional)", default=None
|
23 |
+
),
|
24 |
+
init_weight: float = Input(
|
25 |
+
description="Strength of init_video", default=0.5
|
26 |
+
),
|
27 |
+
num_frames: int = Input(
|
28 |
+
description="Number of frames for the output video", default=24
|
29 |
+
),
|
30 |
+
num_inference_steps: int = Input(
|
31 |
+
description="Number of denoising steps", ge=1, le=500, default=50
|
32 |
+
),
|
33 |
+
width: int = Input(
|
34 |
+
description="Width of the output video", ge=256, default=576
|
35 |
+
),
|
36 |
+
height: int = Input(
|
37 |
+
description="Height of the output video", ge=256, default=320
|
38 |
+
),
|
39 |
+
guidance_scale: float = Input(
|
40 |
+
description="Guidance scale", ge=1.0, le=100.0, default=7.5
|
41 |
+
),
|
42 |
+
fps: int = Input(description="fps for the output video", default=8),
|
43 |
+
model: str = Input(
|
44 |
+
description="Model to use", default="xl", choices=["xl", "576w", "potat1", "animov-512x"]
|
45 |
+
),
|
46 |
+
batch_size: int = Input(description="Batch size", default=1, ge=1),
|
47 |
+
remove_watermark: bool = Input(
|
48 |
+
description="Remove watermark", default=False
|
49 |
+
),
|
50 |
+
seed: int = Input(
|
51 |
+
description="Random seed. Leave blank to randomize the seed", default=None
|
52 |
+
),
|
53 |
+
) -> List[Path]:
|
54 |
+
if seed is None:
|
55 |
+
seed = int.from_bytes(os.urandom(2), "big")
|
56 |
+
print(f"Using seed: {seed}")
|
57 |
+
|
58 |
+
shutil.rmtree("output", ignore_errors=True)
|
59 |
+
os.makedirs("output", exist_ok=True)
|
60 |
+
|
61 |
+
args = {
|
62 |
+
"prompt": prompt,
|
63 |
+
"negative_prompt": negative_prompt,
|
64 |
+
"batch_size": batch_size,
|
65 |
+
"num_frames": num_frames,
|
66 |
+
"num_steps": num_inference_steps,
|
67 |
+
"seed": seed,
|
68 |
+
"guidance-scale": guidance_scale,
|
69 |
+
"width": width,
|
70 |
+
"height": height,
|
71 |
+
"fps": fps,
|
72 |
+
"device": "cuda",
|
73 |
+
"output_dir": "output",
|
74 |
+
"remove-watermark": remove_watermark,
|
75 |
+
}
|
76 |
+
|
77 |
+
args['model'] = MODEL_CACHE + "/" + model
|
78 |
+
|
79 |
+
if init_video is not None:
|
80 |
+
# for some reason I need to copy the file to make it work
|
81 |
+
if os.path.exists("input.mp4"):
|
82 |
+
os.unlink("input.mp4")
|
83 |
+
shutil.copy(init_video, "input.mp4")
|
84 |
+
|
85 |
+
args["init-video"] = "input.mp4"
|
86 |
+
args["init-weight"] = init_weight
|
87 |
+
print("init video", os.stat("input.mp4").st_size)
|
88 |
+
|
89 |
+
cmd = ["python", "inference.py"]
|
90 |
+
for k, v in args.items():
|
91 |
+
if v is not None:
|
92 |
+
cmd.append(f"--{k}")
|
93 |
+
cmd.append(str(v))
|
94 |
+
subprocess.check_call(cmd)
|
95 |
+
# outputs = inference.run(**args)
|
96 |
+
|
97 |
+
outputs = []
|
98 |
+
for f in os.listdir("output"):
|
99 |
+
if f.endswith(".mp4"):
|
100 |
+
outputs.append(Path(os.path.join("output", f)))
|
101 |
+
return outputs
|
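For reference, predict() does not call the pipeline in-process: it translates the Cog inputs into command-line flags for inference.py and shells out. A small sketch of that translation using the default inputs shown above (illustrative only; the real seed is randomized when left blank, and negative_prompt is omitted when None):

```python
# Reproduce the flag translation predict() performs for the default inputs.
# Note the rename (num_inference_steps -> num_steps) and the hyphenated keys
# (guidance-scale, remove-watermark) that inference.py is expected to accept.
args = {
    "prompt": "An astronaut riding a horse",
    "batch_size": 1,
    "num_frames": 24,
    "num_steps": 50,
    "seed": 12345,            # placeholder; predict() draws a random seed
    "guidance-scale": 7.5,
    "width": 576,
    "height": 320,
    "fps": 8,
    "device": "cuda",
    "output_dir": "output",
    "remove-watermark": False,
    "model": "model-cache/xl",
}

cmd = ["python", "inference.py"]
for k, v in args.items():
    cmd += [f"--{k}", str(v)]

print(" ".join(cmd))
```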
samples.py
ADDED
@@ -0,0 +1,57 @@
1 |
+
import base64
|
2 |
+
import requests
|
3 |
+
import sys
|
4 |
+
import os
|
5 |
+
|
6 |
+
|
7 |
+
def gen(output_fn, **kwargs):
|
8 |
+
if os.path.exists(output_fn):
|
9 |
+
print("Skipping", output_fn)
|
10 |
+
return
|
11 |
+
|
12 |
+
print("Generating", output_fn)
|
13 |
+
url = "http://localhost:5000/predictions"
|
14 |
+
response = requests.post(url, json={"input": kwargs})
|
15 |
+
data = response.json()
|
16 |
+
|
17 |
+
try:
|
18 |
+
datauri = data["output"][0]
|
19 |
+
base64_encoded_data = datauri.split(",")[1]
|
20 |
+
data = base64.b64decode(base64_encoded_data)
|
21 |
+
except:
|
22 |
+
print("Error!")
|
23 |
+
print("input:", kwargs)
|
24 |
+
print(data["logs"])
|
25 |
+
# sys.exit(1)
|
26 |
+
|
27 |
+
with open(output_fn, "wb") as f:
|
28 |
+
f.write(data)
|
29 |
+
|
30 |
+
|
31 |
+
def main():
|
32 |
+
gen(
|
33 |
+
"sample.mp4",
|
34 |
+
prompt="A deep sea video of a bioluminescent siphonophore, 8k, beautiful, award winning, close up",
|
35 |
+
seed=42,
|
36 |
+
num_frames=24,
|
37 |
+
model="potat1",
|
38 |
+
num_inference_steps=30,
|
39 |
+
guidance_scale=17.5,
|
40 |
+
fps=12,
|
41 |
+
)
|
42 |
+
gen(
|
43 |
+
"vid-sample.mp4",
|
44 |
+
prompt="A deep sea video of a bioluminescent siphonophore, 8k, beautiful, award winning, close up",
|
45 |
+
seed=42,
|
46 |
+
num_frames=24,
|
47 |
+
model="zeroscope_v2_XL",
|
48 |
+
num_inference_steps=30,
|
49 |
+
guidance_scale=17.5,
|
50 |
+
init_video="https://replicate.delivery/pbxt/qxacIWhXu0rFAZu6GMElrXrTL5Wx6ZqnjPqIoS7DgIftowkIA/out.mp4",
|
51 |
+
fps=12,
|
52 |
+
)
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
if __name__ == "__main__":
|
57 |
+
main()
|
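samples.py assumes a prediction server is already listening on http://localhost:5000/predictions and decodes the base64 data URI in the response. An additional call in the same style, purely illustrative (the file name, prompt, seed, and model choice are assumptions; the "576w" weights must be present in the model cache):

```python
# Hypothetical extra sample alongside main(); requires the local prediction
# server to be running and the chosen weights to be available.
gen(
    "576w-sample.mp4",
    prompt="Aerial drone footage of a mountain range at sunrise, cinematic",
    seed=7,
    num_frames=24,
    model="576w",
    num_inference_steps=30,
    guidance_scale=12.5,
    fps=12,
)
```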
train.py
ADDED
@@ -0,0 +1,998 @@
1 |
+
import argparse
|
2 |
+
import datetime
|
3 |
+
import logging
|
4 |
+
import inspect
|
5 |
+
import math
|
6 |
+
import os
|
7 |
+
import random
|
8 |
+
import gc
|
9 |
+
import copy
|
10 |
+
|
11 |
+
from typing import Dict, Optional, Tuple
|
12 |
+
from omegaconf import OmegaConf
|
13 |
+
|
14 |
+
import cv2
|
15 |
+
import torch
|
16 |
+
import torch.nn.functional as F
|
17 |
+
import torch.utils.checkpoint
|
18 |
+
import torchvision.transforms as T
|
19 |
+
import diffusers
|
20 |
+
import transformers
|
21 |
+
|
22 |
+
from torchvision import transforms
|
23 |
+
from tqdm.auto import tqdm
|
24 |
+
|
25 |
+
from accelerate import Accelerator
|
26 |
+
from accelerate.logging import get_logger
|
27 |
+
from accelerate.utils import set_seed
|
28 |
+
|
29 |
+
from models.unet_3d_condition import UNet3DConditionModel
|
30 |
+
from diffusers.models import AutoencoderKL
|
31 |
+
from diffusers import DPMSolverMultistepScheduler, DDPMScheduler, TextToVideoSDPipeline
|
32 |
+
from diffusers.optimization import get_scheduler
|
33 |
+
from diffusers.utils import check_min_version, export_to_video
|
34 |
+
from diffusers.utils.import_utils import is_xformers_available
|
35 |
+
from diffusers.models.attention_processor import AttnProcessor2_0, Attention
|
36 |
+
from diffusers.models.attention import BasicTransformerBlock
|
37 |
+
|
38 |
+
from transformers import CLIPTextModel, CLIPTokenizer
|
39 |
+
from transformers.models.clip.modeling_clip import CLIPEncoder
|
40 |
+
from utils.dataset import VideoJsonDataset, SingleVideoDataset, \
|
41 |
+
ImageDataset, VideoFolderDataset, CachedDataset
|
42 |
+
from einops import rearrange, repeat
|
43 |
+
|
44 |
+
from utils.lora import (
|
45 |
+
extract_lora_ups_down,
|
46 |
+
inject_trainable_lora,
|
47 |
+
inject_trainable_lora_extended,
|
48 |
+
save_lora_weight,
|
49 |
+
train_patch_pipe,
|
50 |
+
monkeypatch_or_replace_lora,
|
51 |
+
monkeypatch_or_replace_lora_extended
|
52 |
+
)
|
53 |
+
|
54 |
+
|
55 |
+
already_printed_trainables = False
|
56 |
+
|
57 |
+
# Will error if the minimum version of diffusers is not installed. Remove at your own risk.
|
58 |
+
check_min_version("0.10.0.dev0")
|
59 |
+
|
60 |
+
logger = get_logger(__name__, log_level="INFO")
|
61 |
+
|
62 |
+
def create_logging(logging, logger, accelerator):
|
63 |
+
logging.basicConfig(
|
64 |
+
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
65 |
+
datefmt="%m/%d/%Y %H:%M:%S",
|
66 |
+
level=logging.INFO,
|
67 |
+
)
|
68 |
+
logger.info(accelerator.state, main_process_only=False)
|
69 |
+
|
70 |
+
def accelerate_set_verbose(accelerator):
|
71 |
+
if accelerator.is_local_main_process:
|
72 |
+
transformers.utils.logging.set_verbosity_warning()
|
73 |
+
diffusers.utils.logging.set_verbosity_info()
|
74 |
+
else:
|
75 |
+
transformers.utils.logging.set_verbosity_error()
|
76 |
+
diffusers.utils.logging.set_verbosity_error()
|
77 |
+
|
78 |
+
def get_train_dataset(dataset_types, train_data, tokenizer):
|
79 |
+
train_datasets = []
|
80 |
+
|
81 |
+
# Loop through all available datasets, get the name, then add to list of data to process.
|
82 |
+
for DataSet in [VideoJsonDataset, SingleVideoDataset, ImageDataset, VideoFolderDataset]:
|
83 |
+
for dataset in dataset_types:
|
84 |
+
if dataset == DataSet.__getname__():
|
85 |
+
train_datasets.append(DataSet(**train_data, tokenizer=tokenizer))
|
86 |
+
|
87 |
+
if len(train_datasets) > 0:
|
88 |
+
return train_datasets
|
89 |
+
else:
|
90 |
+
raise ValueError("Dataset type not found: 'json', 'single_video', 'folder', 'image'")
|
91 |
+
|
92 |
+
def extend_datasets(datasets, dataset_items, extend=False):
|
93 |
+
biggest_data_len = max(x.__len__() for x in datasets)
|
94 |
+
extended = []
|
95 |
+
for dataset in datasets:
|
96 |
+
if dataset.__len__() == 0:
|
97 |
+
del dataset
|
98 |
+
continue
|
99 |
+
if dataset.__len__() < biggest_data_len:
|
100 |
+
for item in dataset_items:
|
101 |
+
if extend and item not in extended and hasattr(dataset, item):
|
102 |
+
print(f"Extending {item}")
|
103 |
+
|
104 |
+
value = getattr(dataset, item)
|
105 |
+
value *= biggest_data_len
|
106 |
+
value = value[:biggest_data_len]
|
107 |
+
|
108 |
+
setattr(dataset, item, value)
|
109 |
+
|
110 |
+
print(f"New {item} dataset length: {dataset.__len__()}")
|
111 |
+
extended.append(item)
|
112 |
+
|
113 |
+
# Local OpenCV-based override of the export_to_video imported from diffusers.utils above.
def export_to_video(video_frames, output_video_path, fps):
|
114 |
+
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
|
115 |
+
h, w, _ = video_frames[0].shape
|
116 |
+
video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=fps, frameSize=(w, h))
|
117 |
+
for i in range(len(video_frames)):
|
118 |
+
img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
|
119 |
+
video_writer.write(img)
video_writer.release()  # close the writer once all frames are written so the file is finalized
|
120 |
+
|
121 |
+
def create_output_folders(output_dir, config):
|
122 |
+
now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
|
123 |
+
out_dir = os.path.join(output_dir, f"train_{now}")
|
124 |
+
|
125 |
+
os.makedirs(out_dir, exist_ok=True)
|
126 |
+
os.makedirs(f"{out_dir}/samples", exist_ok=True)
|
127 |
+
OmegaConf.save(config, os.path.join(out_dir, 'config.yaml'))
|
128 |
+
|
129 |
+
return out_dir
|
130 |
+
|
131 |
+
def load_primary_models(pretrained_model_path):
|
132 |
+
noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
|
133 |
+
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
|
134 |
+
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
|
135 |
+
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
|
136 |
+
unet = UNet3DConditionModel.from_pretrained(pretrained_model_path, subfolder="unet")
|
137 |
+
|
138 |
+
return noise_scheduler, tokenizer, text_encoder, vae, unet
|
139 |
+
|
140 |
+
def unet_and_text_g_c(unet, text_encoder, unet_enable, text_enable):
|
141 |
+
unet._set_gradient_checkpointing(value=unet_enable)
|
142 |
+
text_encoder._set_gradient_checkpointing(CLIPEncoder, value=text_enable)
|
143 |
+
|
144 |
+
def freeze_models(models_to_freeze):
|
145 |
+
for model in models_to_freeze:
|
146 |
+
if model is not None: model.requires_grad_(False)
|
147 |
+
|
148 |
+
def is_attn(name):
|
149 |
+
# Note: ('attn1' or ...) always evaluates truthy here; the isinstance checks in set_torch_2_attn below do the real filtering.
return ('attn1' or 'attn2' == name.split('.')[-1])
|
150 |
+
|
151 |
+
def set_processors(attentions):
|
152 |
+
for attn in attentions: attn.set_processor(AttnProcessor2_0())
|
153 |
+
|
154 |
+
def set_torch_2_attn(unet):
|
155 |
+
optim_count = 0
|
156 |
+
|
157 |
+
for name, module in unet.named_modules():
|
158 |
+
if is_attn(name):
|
159 |
+
if isinstance(module, torch.nn.ModuleList):
|
160 |
+
for m in module:
|
161 |
+
if isinstance(m, BasicTransformerBlock):
|
162 |
+
set_processors([m.attn1, m.attn2])
|
163 |
+
optim_count += 1
|
164 |
+
if optim_count > 0:
|
165 |
+
print(f"{optim_count} Attention layers using Scaled Dot Product Attention.")
|
166 |
+
|
167 |
+
def handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet):
|
168 |
+
try:
|
169 |
+
is_torch_2 = hasattr(F, 'scaled_dot_product_attention')
|
170 |
+
|
171 |
+
if enable_xformers_memory_efficient_attention and not is_torch_2:
|
172 |
+
if is_xformers_available():
|
173 |
+
from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
|
174 |
+
unet.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
|
175 |
+
else:
|
176 |
+
raise ValueError("xformers is not available. Make sure it is installed correctly")
|
177 |
+
|
178 |
+
if enable_torch_2_attn and is_torch_2:
|
179 |
+
set_torch_2_attn(unet)
|
180 |
+
except Exception as e:
|
181 |
+
print(f"Could not enable memory efficient attention for xformers or Torch 2.0: {e}")
|
182 |
+
|
183 |
+
def inject_lora(use_lora, model, replace_modules, is_extended=False, dropout=0.0, lora_path='', r=16):
|
184 |
+
injector = (
|
185 |
+
inject_trainable_lora if not is_extended
|
186 |
+
else
|
187 |
+
inject_trainable_lora_extended
|
188 |
+
)
|
189 |
+
|
190 |
+
params = None
|
191 |
+
negation = None
|
192 |
+
|
193 |
+
if os.path.exists(lora_path):
|
194 |
+
try:
|
195 |
+
for f in os.listdir(lora_path):
|
196 |
+
if f.endswith('.pt'):
|
197 |
+
lora_file = os.path.join(lora_path, f)
|
198 |
+
|
199 |
+
if 'text_encoder' in f and isinstance(model, CLIPTextModel):
|
200 |
+
monkeypatch_or_replace_lora(
|
201 |
+
model,
|
202 |
+
torch.load(lora_file),
|
203 |
+
target_replace_module=replace_modules,
|
204 |
+
r=r
|
205 |
+
)
|
206 |
+
print("Successfully loaded Text Encoder LoRa.")
|
207 |
+
|
208 |
+
if 'unet' in f and isinstance(model, UNet3DConditionModel):
|
209 |
+
monkeypatch_or_replace_lora_extended(
|
210 |
+
model,
|
211 |
+
torch.load(lora_file),
|
212 |
+
target_replace_module=replace_modules,
|
213 |
+
r=r
|
214 |
+
)
|
215 |
+
print("Successfully loaded UNET LoRa.")
|
216 |
+
|
217 |
+
except Exception as e:
|
218 |
+
print(e)
|
219 |
+
print("Could not load LoRAs. Injecting new ones instead...")
|
220 |
+
|
221 |
+
if use_lora:
|
222 |
+
REPLACE_MODULES = replace_modules
|
223 |
+
injector_args = {
|
224 |
+
"model": model,
|
225 |
+
"target_replace_module": REPLACE_MODULES,
|
226 |
+
"r": r
|
227 |
+
}
|
228 |
+
if not is_extended: injector_args['dropout_p'] = dropout
|
229 |
+
|
230 |
+
params, negation = injector(**injector_args)
|
231 |
+
for _up, _down in extract_lora_ups_down(
|
232 |
+
model,
|
233 |
+
target_replace_module=REPLACE_MODULES):
|
234 |
+
|
235 |
+
if all(x is not None for x in [_up, _down]):
|
236 |
+
print(f"Lora successfully injected into {model.__class__.__name__}.")
|
237 |
+
|
238 |
+
break
|
239 |
+
|
240 |
+
return params, negation
|
241 |
+
|
242 |
+
def save_lora(model, name, condition, replace_modules, step, save_path):
|
243 |
+
if condition and replace_modules is not None:
|
244 |
+
save_path = f"{save_path}/{step}_{name}.pt"
|
245 |
+
save_lora_weight(model, save_path, replace_modules)
|
246 |
+
|
247 |
+
def handle_lora_save(
|
248 |
+
use_unet_lora,
|
249 |
+
use_text_lora,
|
250 |
+
model,
|
251 |
+
save_path,
|
252 |
+
checkpoint_step,
|
253 |
+
unet_target_modules,
|
254 |
+
text_encoder_target_modules
|
255 |
+
):
|
256 |
+
|
257 |
+
save_path = f"{save_path}/lora"
|
258 |
+
os.makedirs(save_path, exist_ok=True)
|
259 |
+
|
260 |
+
save_lora(
|
261 |
+
model.unet,
|
262 |
+
'unet',
|
263 |
+
use_unet_lora,
|
264 |
+
unet_target_modules,
|
265 |
+
checkpoint_step,
|
266 |
+
save_path,
|
267 |
+
)
|
268 |
+
save_lora(
|
269 |
+
model.text_encoder,
|
270 |
+
'text_encoder',
|
271 |
+
use_text_lora,
|
272 |
+
text_encoder_target_modules,
|
273 |
+
checkpoint_step,
|
274 |
+
save_path
|
275 |
+
)
|
276 |
+
|
277 |
+
train_patch_pipe(model, use_unet_lora, use_text_lora)
|
278 |
+
|
279 |
+
def param_optim(model, condition, extra_params=None, is_lora=False, negation=None):
|
280 |
+
return {
|
281 |
+
"model": model,
|
282 |
+
"condition": condition,
|
283 |
+
'extra_params': extra_params,
|
284 |
+
'is_lora': is_lora,
|
285 |
+
"negation": negation
|
286 |
+
}
|
287 |
+
|
288 |
+
|
289 |
+
def create_optim_params(name='param', params=None, lr=5e-6, extra_params=None):
|
290 |
+
params = {
|
291 |
+
"name": name,
|
292 |
+
"params": params,
|
293 |
+
"lr": lr
|
294 |
+
}
|
295 |
+
|
296 |
+
if extra_params is not None:
|
297 |
+
for k, v in extra_params.items():
|
298 |
+
params[k] = v
|
299 |
+
|
300 |
+
return params
|
301 |
+
|
302 |
+
def negate_params(name, negation):
|
303 |
+
# We have to do this if we are co-training with LoRA.
|
304 |
+
# This ensures that parameter groups aren't duplicated.
|
305 |
+
if negation is None: return False
|
306 |
+
for n in negation:
|
307 |
+
if n in name and 'temp' not in name:
|
308 |
+
return True
|
309 |
+
return False
|
310 |
+
|
311 |
+
|
312 |
+
def create_optimizer_params(model_list, lr):
|
313 |
+
import itertools
|
314 |
+
optimizer_params = []
|
315 |
+
|
316 |
+
for optim in model_list:
|
317 |
+
model, condition, extra_params, is_lora, negation = optim.values()
|
318 |
+
# Check if we are doing LoRA training.
|
319 |
+
if is_lora and condition:
|
320 |
+
params = create_optim_params(
|
321 |
+
params=itertools.chain(*model),
|
322 |
+
extra_params=extra_params
|
323 |
+
)
|
324 |
+
optimizer_params.append(params)
|
325 |
+
continue
|
326 |
+
|
327 |
+
# If this is true, we can train it.
|
328 |
+
if condition:
|
329 |
+
for n, p in model.named_parameters():
|
330 |
+
should_negate = 'lora' in n
|
331 |
+
if should_negate: continue
|
332 |
+
|
333 |
+
params = create_optim_params(n, p, lr, extra_params)
|
334 |
+
optimizer_params.append(params)
|
335 |
+
|
336 |
+
return optimizer_params
|
337 |
+
|
338 |
+
def get_optimizer(use_8bit_adam):
|
339 |
+
if use_8bit_adam:
|
340 |
+
try:
|
341 |
+
import bitsandbytes as bnb
|
342 |
+
except ImportError:
|
343 |
+
raise ImportError(
|
344 |
+
"Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
|
345 |
+
)
|
346 |
+
|
347 |
+
return bnb.optim.AdamW8bit
|
348 |
+
else:
|
349 |
+
return torch.optim.AdamW
|
350 |
+
|
351 |
+
def is_mixed_precision(accelerator):
|
352 |
+
weight_dtype = torch.float32
|
353 |
+
|
354 |
+
if accelerator.mixed_precision == "fp16":
|
355 |
+
weight_dtype = torch.float16
|
356 |
+
|
357 |
+
elif accelerator.mixed_precision == "bf16":
|
358 |
+
weight_dtype = torch.bfloat16
|
359 |
+
|
360 |
+
return weight_dtype
|
361 |
+
|
362 |
+
def cast_to_gpu_and_type(model_list, accelerator, weight_dtype):
|
363 |
+
for model in model_list:
|
364 |
+
if model is not None: model.to(accelerator.device, dtype=weight_dtype)
|
365 |
+
|
366 |
+
def handle_cache_latents(
|
367 |
+
should_cache,
|
368 |
+
output_dir,
|
369 |
+
train_dataloader,
|
370 |
+
train_batch_size,
|
371 |
+
vae,
|
372 |
+
cached_latent_dir=None
|
373 |
+
):
|
374 |
+
|
375 |
+
# Cache latents by storing them in VRAM.
|
376 |
+
# Speeds up training and saves memory by not encoding during the train loop.
|
377 |
+
if not should_cache: return None
|
378 |
+
vae.to('cuda', dtype=torch.float16)
|
379 |
+
vae.enable_slicing()
|
380 |
+
|
381 |
+
cached_latent_dir = (
|
382 |
+
os.path.abspath(cached_latent_dir) if cached_latent_dir is not None else None
|
383 |
+
)
|
384 |
+
|
385 |
+
if cached_latent_dir is None:
|
386 |
+
cache_save_dir = f"{output_dir}/cached_latents"
|
387 |
+
os.makedirs(cache_save_dir, exist_ok=True)
|
388 |
+
|
389 |
+
for i, batch in enumerate(tqdm(train_dataloader, desc="Caching Latents.")):
|
390 |
+
|
391 |
+
save_name = f"cached_{i}"
|
392 |
+
full_out_path = f"{cache_save_dir}/{save_name}.pt"
|
393 |
+
|
394 |
+
pixel_values = batch['pixel_values'].to('cuda', dtype=torch.float16)
|
395 |
+
batch['pixel_values'] = tensor_to_vae_latent(pixel_values, vae)
|
396 |
+
for k, v in batch.items(): batch[k] = v[0]
|
397 |
+
|
398 |
+
torch.save(batch, full_out_path)
|
399 |
+
del pixel_values
|
400 |
+
del batch
|
401 |
+
|
402 |
+
# We do this to avoid fragmentation from casting latents between devices.
|
403 |
+
torch.cuda.empty_cache()
|
404 |
+
else:
|
405 |
+
cache_save_dir = cached_latent_dir
|
406 |
+
|
407 |
+
|
408 |
+
return torch.utils.data.DataLoader(
|
409 |
+
CachedDataset(cache_dir=cache_save_dir),
|
410 |
+
batch_size=train_batch_size,
|
411 |
+
shuffle=True,
|
412 |
+
num_workers=0
|
413 |
+
)
|
414 |
+
|
415 |
+
def handle_trainable_modules(model, trainable_modules=None, is_enabled=True, negation=None):
|
416 |
+
global already_printed_trainables
|
417 |
+
|
418 |
+
# This can most definitely be refactored :-)
|
419 |
+
unfrozen_params = 0
|
420 |
+
if trainable_modules is not None:
|
421 |
+
for name, module in model.named_modules():
|
422 |
+
for tm in tuple(trainable_modules):
|
423 |
+
if tm == 'all':
|
424 |
+
model.requires_grad_(is_enabled)
|
425 |
+
unfrozen_params = len(list(model.parameters()))
|
426 |
+
break
|
427 |
+
|
428 |
+
if tm in name and 'lora' not in name:
|
429 |
+
for m in module.parameters():
|
430 |
+
m.requires_grad_(is_enabled)
|
431 |
+
if is_enabled: unfrozen_params += 1
|
432 |
+
|
433 |
+
if unfrozen_params > 0 and not already_printed_trainables:
|
434 |
+
already_printed_trainables = True
|
435 |
+
print(f"{unfrozen_params} params have been unfrozen for training.")
|
436 |
+
|
437 |
+
def tensor_to_vae_latent(t, vae):
|
438 |
+
video_length = t.shape[1]
|
439 |
+
|
440 |
+
t = rearrange(t, "b f c h w -> (b f) c h w")
|
441 |
+
latents = vae.encode(t).latent_dist.sample()
|
442 |
+
latents = rearrange(latents, "(b f) c h w -> b c f h w", f=video_length)
|
443 |
+
latents = latents * 0.18215
|
444 |
+
|
445 |
+
return latents
|
446 |
+
|
447 |
+
def sample_noise(latents, noise_strength, use_offset_noise):
|
448 |
+
b, c, f, *_ = latents.shape
|
449 |
+
noise_latents = torch.randn_like(latents, device=latents.device)
|
450 |
+
offset_noise = None
|
451 |
+
|
452 |
+
if use_offset_noise:
|
453 |
+
offset_noise = torch.randn(b, c, f, 1, 1, device=latents.device)
|
454 |
+
noise_latents = noise_latents + noise_strength * offset_noise
|
455 |
+
|
456 |
+
return noise_latents
|
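A quick illustration of the broadcasting that sample_noise relies on when use_offset_noise is enabled: the offset tensor has shape (b, c, f, 1, 1), so a single random value is shared across every spatial position of a given batch/channel/frame slice. This is only a sketch with made-up tensor sizes, not part of the training code:

import torch

b, c, f, h, w = 1, 4, 8, 32, 32        # hypothetical latent dimensions
latents = torch.zeros(b, c, f, h, w)
noise = torch.randn_like(latents)      # independent noise per latent element
offset = torch.randn(b, c, f, 1, 1)    # one value per (batch, channel, frame)
noisy = noise + 0.1 * offset           # broadcast across the h and w axes
assert noisy.shape == latents.shape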
457 |
+
|
458 |
+
def should_sample(global_step, validation_steps, validation_data):
|
459 |
+
return (global_step % validation_steps == 0 or global_step == 1) \
|
460 |
+
and validation_data.sample_preview
|
461 |
+
|
462 |
+
def save_pipe(
|
463 |
+
path,
|
464 |
+
global_step,
|
465 |
+
accelerator,
|
466 |
+
unet,
|
467 |
+
text_encoder,
|
468 |
+
vae,
|
469 |
+
output_dir,
|
470 |
+
use_unet_lora,
|
471 |
+
use_text_lora,
|
472 |
+
unet_target_replace_module=None,
|
473 |
+
text_target_replace_module=None,
|
474 |
+
is_checkpoint=False,
|
475 |
+
):
|
476 |
+
|
477 |
+
if is_checkpoint:
|
478 |
+
save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
|
479 |
+
os.makedirs(save_path, exist_ok=True)
|
480 |
+
else:
|
481 |
+
save_path = output_dir
|
482 |
+
|
483 |
+
# Save the dtypes so we can continue training at the same precision.
|
484 |
+
u_dtype, t_dtype, v_dtype = unet.dtype, text_encoder.dtype, vae.dtype
|
485 |
+
|
486 |
+
# Copy the model without creating a reference to it. This allows keeping the state of our lora training if enabled.
|
487 |
+
unet_out = copy.deepcopy(accelerator.unwrap_model(unet, keep_fp32_wrapper=False))
|
488 |
+
text_encoder_out = copy.deepcopy(accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=False))
|
489 |
+
|
490 |
+
pipeline = TextToVideoSDPipeline.from_pretrained(
|
491 |
+
path,
|
492 |
+
unet=unet_out,
|
493 |
+
text_encoder=text_encoder_out,
|
494 |
+
vae=vae,
|
495 |
+
).to(torch_dtype=torch.float16)
|
496 |
+
|
497 |
+
handle_lora_save(
|
498 |
+
use_unet_lora,
|
499 |
+
use_text_lora,
|
500 |
+
pipeline,
|
501 |
+
output_dir,
|
502 |
+
global_step,
|
503 |
+
unet_target_replace_module,
|
504 |
+
text_target_replace_module
|
505 |
+
)
|
506 |
+
|
507 |
+
pipeline.save_pretrained(save_path)
|
508 |
+
|
509 |
+
if is_checkpoint:
|
510 |
+
unet, text_encoder = accelerator.prepare(unet, text_encoder)
|
511 |
+
models_to_cast_back = [(unet, u_dtype), (text_encoder, t_dtype), (vae, v_dtype)]
|
512 |
+
[x[0].to(accelerator.device, dtype=x[1]) for x in models_to_cast_back]
|
513 |
+
|
514 |
+
logger.info(f"Saved model at {save_path} on step {global_step}")
|
515 |
+
|
516 |
+
del pipeline
|
517 |
+
del unet_out
|
518 |
+
del text_encoder_out
|
519 |
+
torch.cuda.empty_cache()
|
520 |
+
gc.collect()
|
521 |
+
|
522 |
+
|
523 |
+
def replace_prompt(prompt, token, wlist):
|
524 |
+
for w in wlist:
|
525 |
+
if w in prompt: return prompt.replace(w, token)
|
526 |
+
return prompt
|
527 |
+
|
528 |
+
def main(
|
529 |
+
pretrained_model_path: str,
|
530 |
+
output_dir: str,
|
531 |
+
train_data: Dict,
|
532 |
+
validation_data: Dict,
|
533 |
+
dataset_types: Tuple[str] = ('json',),
|
534 |
+
validation_steps: int = 100,
|
535 |
+
trainable_modules: Tuple[str] = ("attn1", "attn2"),
|
536 |
+
trainable_text_modules: Tuple[str] = ("all",),
|
537 |
+
extra_unet_params = None,
|
538 |
+
extra_text_encoder_params = None,
|
539 |
+
train_batch_size: int = 1,
|
540 |
+
max_train_steps: int = 500,
|
541 |
+
learning_rate: float = 5e-5,
|
542 |
+
scale_lr: bool = False,
|
543 |
+
lr_scheduler: str = "constant",
|
544 |
+
lr_warmup_steps: int = 0,
|
545 |
+
adam_beta1: float = 0.9,
|
546 |
+
adam_beta2: float = 0.999,
|
547 |
+
adam_weight_decay: float = 1e-2,
|
548 |
+
adam_epsilon: float = 1e-08,
|
549 |
+
max_grad_norm: float = 1.0,
|
550 |
+
gradient_accumulation_steps: int = 1,
|
551 |
+
gradient_checkpointing: bool = False,
|
552 |
+
text_encoder_gradient_checkpointing: bool = False,
|
553 |
+
checkpointing_steps: int = 500,
|
554 |
+
resume_from_checkpoint: Optional[str] = None,
|
555 |
+
mixed_precision: Optional[str] = "fp16",
|
556 |
+
use_8bit_adam: bool = False,
|
557 |
+
enable_xformers_memory_efficient_attention: bool = True,
|
558 |
+
enable_torch_2_attn: bool = False,
|
559 |
+
seed: Optional[int] = None,
|
560 |
+
train_text_encoder: bool = False,
|
561 |
+
use_offset_noise: bool = False,
|
562 |
+
offset_noise_strength: float = 0.1,
|
563 |
+
extend_dataset: bool = False,
|
564 |
+
cache_latents: bool = False,
|
565 |
+
cached_latent_dir = None,
|
566 |
+
use_unet_lora: bool = False,
|
567 |
+
use_text_lora: bool = False,
|
568 |
+
unet_lora_modules: Tuple[str] = ["ResnetBlock2D"],
|
569 |
+
text_encoder_lora_modules: Tuple[str] = ["CLIPEncoderLayer"],
|
570 |
+
lora_rank: int = 16,
|
571 |
+
lora_path: str = '',
|
572 |
+
**kwargs
|
573 |
+
):
|
574 |
+
|
575 |
+
*_, config = inspect.getargvalues(inspect.currentframe())
|
576 |
+
|
577 |
+
accelerator = Accelerator(
|
578 |
+
gradient_accumulation_steps=gradient_accumulation_steps,
|
579 |
+
mixed_precision=mixed_precision,
|
580 |
+
log_with="tensorboard",
|
581 |
+
logging_dir=output_dir
|
582 |
+
)
|
583 |
+
|
584 |
+
# Make one log on every process with the configuration for debugging.
|
585 |
+
create_logging(logging, logger, accelerator)
|
586 |
+
|
587 |
+
# Initialize accelerate, transformers, and diffusers warnings
|
588 |
+
accelerate_set_verbose(accelerator)
|
589 |
+
|
590 |
+
# If passed along, set the training seed now.
|
591 |
+
if seed is not None:
|
592 |
+
set_seed(seed)
|
593 |
+
|
594 |
+
# Handle the output folder creation
|
595 |
+
if accelerator.is_main_process:
|
596 |
+
output_dir = create_output_folders(output_dir, config)
|
597 |
+
|
598 |
+
# Load scheduler, tokenizer and models.
|
599 |
+
noise_scheduler, tokenizer, text_encoder, vae, unet = load_primary_models(pretrained_model_path)
|
600 |
+
|
601 |
+
# Freeze any necessary models
|
602 |
+
freeze_models([vae, text_encoder, unet])
|
603 |
+
|
604 |
+
# Enable xformers if available
|
605 |
+
handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet)
|
606 |
+
|
607 |
+
if scale_lr:
|
608 |
+
learning_rate = (
|
609 |
+
learning_rate * gradient_accumulation_steps * train_batch_size * accelerator.num_processes
|
610 |
+
)
|
611 |
+
|
612 |
+
# Initialize the optimizer
|
613 |
+
optimizer_cls = get_optimizer(use_8bit_adam)
|
614 |
+
|
615 |
+
# Use LoRA if enabled.
|
616 |
+
unet_lora_params, unet_negation = inject_lora(
|
617 |
+
use_unet_lora, unet, unet_lora_modules, is_extended=True,
|
618 |
+
r=lora_rank, lora_path=lora_path
|
619 |
+
)
|
620 |
+
|
621 |
+
text_encoder_lora_params, text_encoder_negation = inject_lora(
|
622 |
+
use_text_lora, text_encoder, text_encoder_lora_modules,
|
623 |
+
r=lora_rank, lora_path=lora_path
|
624 |
+
)
|
625 |
+
|
626 |
+
# Create parameters to optimize over with a condition (if "condition" is true, optimize it)
|
627 |
+
optim_params = [
|
628 |
+
param_optim(unet, trainable_modules is not None, extra_params=extra_unet_params, negation=unet_negation),
|
629 |
+
param_optim(text_encoder, train_text_encoder and not use_text_lora, extra_params=extra_text_encoder_params,
|
630 |
+
negation=text_encoder_negation
|
631 |
+
),
|
632 |
+
param_optim(text_encoder_lora_params, use_text_lora, is_lora=True, extra_params={"lr": 1e-5}),
|
633 |
+
param_optim(unet_lora_params, use_unet_lora, is_lora=True, extra_params={"lr": 1e-5})
|
634 |
+
]
|
635 |
+
|
636 |
+
params = create_optimizer_params(optim_params, learning_rate)
|
637 |
+
|
638 |
+
# Create Optimizer
|
639 |
+
optimizer = optimizer_cls(
|
640 |
+
params,
|
641 |
+
lr=learning_rate,
|
642 |
+
betas=(adam_beta1, adam_beta2),
|
643 |
+
weight_decay=adam_weight_decay,
|
644 |
+
eps=adam_epsilon,
|
645 |
+
)
|
646 |
+
|
647 |
+
# Scheduler
|
648 |
+
lr_scheduler = get_scheduler(
|
649 |
+
lr_scheduler,
|
650 |
+
optimizer=optimizer,
|
651 |
+
num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
|
652 |
+
num_training_steps=max_train_steps * gradient_accumulation_steps,
|
653 |
+
)
|
654 |
+
|
655 |
+
# Get the training dataset based on types (json, single_video, image)
|
656 |
+
train_datasets = get_train_dataset(dataset_types, train_data, tokenizer)
|
657 |
+
|
658 |
+
# Extend datasets that are less than the greatest one. This allows for more balanced training.
|
659 |
+
attrs = ['train_data', 'frames', 'image_dir', 'video_files']
|
660 |
+
extend_datasets(train_datasets, attrs, extend=extend_dataset)
|
661 |
+
|
662 |
+
# Process one dataset
|
663 |
+
if len(train_datasets) == 1:
|
664 |
+
train_dataset = train_datasets[0]
|
665 |
+
|
666 |
+
# Process many datasets
|
667 |
+
else:
|
668 |
+
train_dataset = torch.utils.data.ConcatDataset(train_datasets)
|
669 |
+
|
670 |
+
# DataLoaders creation:
|
671 |
+
train_dataloader = torch.utils.data.DataLoader(
|
672 |
+
train_dataset,
|
673 |
+
batch_size=train_batch_size,
|
674 |
+
shuffle=True
|
675 |
+
)
|
676 |
+
|
677 |
+
# Latents caching
|
678 |
+
cached_data_loader = handle_cache_latents(
|
679 |
+
cache_latents,
|
680 |
+
output_dir,
|
681 |
+
train_dataloader,
|
682 |
+
train_batch_size,
|
683 |
+
vae,
|
684 |
+
cached_latent_dir
|
685 |
+
)
|
686 |
+
|
687 |
+
if cached_data_loader is not None:
|
688 |
+
train_dataloader = cached_data_loader
|
689 |
+
|
690 |
+
# Prepare everything with our `accelerator`.
|
691 |
+
unet, optimizer, train_dataloader, lr_scheduler, text_encoder = accelerator.prepare(
|
692 |
+
unet,
|
693 |
+
optimizer,
|
694 |
+
train_dataloader,
|
695 |
+
lr_scheduler,
|
696 |
+
text_encoder
|
697 |
+
)
|
698 |
+
|
699 |
+
# Use Gradient Checkpointing if enabled.
|
700 |
+
unet_and_text_g_c(
|
701 |
+
unet,
|
702 |
+
text_encoder,
|
703 |
+
gradient_checkpointing,
|
704 |
+
text_encoder_gradient_checkpointing
|
705 |
+
)
|
706 |
+
|
707 |
+
# Enable VAE slicing to save memory.
|
708 |
+
vae.enable_slicing()
|
709 |
+
|
710 |
+
# For mixed precision training we cast the text_encoder and vae weights to half-precision
|
711 |
+
# as these models are only used for inference, keeping weights in full precision is not required.
|
712 |
+
weight_dtype = is_mixed_precision(accelerator)
|
713 |
+
|
714 |
+
# Move text encoders, and VAE to GPU
|
715 |
+
models_to_cast = [text_encoder, vae]
|
716 |
+
cast_to_gpu_and_type(models_to_cast, accelerator, weight_dtype)
|
717 |
+
|
718 |
+
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
719 |
+
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
|
720 |
+
|
721 |
+
# Afterwards we recalculate our number of training epochs
|
722 |
+
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
|
723 |
+
|
724 |
+
# We need to initialize the trackers we use, and also store our configuration.
|
725 |
+
# The trackers initialize automatically on the main process.
|
726 |
+
if accelerator.is_main_process:
|
727 |
+
accelerator.init_trackers("text2video-fine-tune")
|
728 |
+
|
729 |
+
# Train!
|
730 |
+
total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps
|
731 |
+
|
732 |
+
logger.info("***** Running training *****")
|
733 |
+
logger.info(f" Num examples = {len(train_dataset)}")
|
734 |
+
logger.info(f" Num Epochs = {num_train_epochs}")
|
735 |
+
logger.info(f" Instantaneous batch size per device = {train_batch_size}")
|
736 |
+
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
|
737 |
+
logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}")
|
738 |
+
logger.info(f" Total optimization steps = {max_train_steps}")
|
739 |
+
global_step = 0
|
740 |
+
first_epoch = 0
|
741 |
+
|
742 |
+
# Only show the progress bar once on each machine.
|
743 |
+
progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)
|
744 |
+
progress_bar.set_description("Steps")
|
745 |
+
|
746 |
+
def finetune_unet(batch, train_encoder=False):
|
747 |
+
|
748 |
+
# Check if we are training the text encoder
|
749 |
+
text_trainable = (train_text_encoder or use_text_lora)
|
750 |
+
|
751 |
+
# Unfreeze UNET Layers
|
752 |
+
if global_step == 0:
|
753 |
+
global already_printed_trainables
already_printed_trainables = False
|
754 |
+
unet.train()
|
755 |
+
handle_trainable_modules(
|
756 |
+
unet,
|
757 |
+
trainable_modules,
|
758 |
+
is_enabled=True,
|
759 |
+
negation=unet_negation
|
760 |
+
)
|
761 |
+
|
762 |
+
# Convert videos to latent space
|
763 |
+
pixel_values = batch["pixel_values"]
|
764 |
+
|
765 |
+
if not cache_latents:
|
766 |
+
latents = tensor_to_vae_latent(pixel_values, vae)
|
767 |
+
else:
|
768 |
+
latents = pixel_values
|
769 |
+
|
770 |
+
# Get video length
|
771 |
+
video_length = latents.shape[2]
|
772 |
+
|
773 |
+
# Sample noise that we'll add to the latents
|
774 |
+
noise = sample_noise(latents, offset_noise_strength, use_offset_noise)
|
775 |
+
bsz = latents.shape[0]
|
776 |
+
|
777 |
+
# Sample a random timestep for each video
|
778 |
+
timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
|
779 |
+
timesteps = timesteps.long()
|
780 |
+
|
781 |
+
# Add noise to the latents according to the noise magnitude at each timestep
|
782 |
+
# (this is the forward diffusion process)
|
783 |
+
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
784 |
+
|
785 |
+
# Enable text encoder training
|
786 |
+
if text_trainable:
|
787 |
+
text_encoder.train()
|
788 |
+
|
789 |
+
if use_text_lora:
|
790 |
+
text_encoder.text_model.embeddings.requires_grad_(True)
|
791 |
+
|
792 |
+
if global_step == 0 and train_text_encoder:
|
793 |
+
handle_trainable_modules(
|
794 |
+
text_encoder,
|
795 |
+
trainable_modules=trainable_text_modules,
|
796 |
+
negation=text_encoder_negation
|
797 |
+
)
|
798 |
+
cast_to_gpu_and_type([text_encoder], accelerator, torch.float32)
|
799 |
+
|
800 |
+
# Fixes gradient checkpointing training.
|
801 |
+
# See: https://github.com/prigoyal/pytorch_memonger/blob/master/tutorial/Checkpointing_for_PyTorch_models.ipynb
|
802 |
+
if gradient_checkpointing or text_encoder_gradient_checkpointing:
|
803 |
+
unet.eval()
|
804 |
+
text_encoder.eval()
|
805 |
+
|
806 |
+
# Encode text embeddings
|
807 |
+
token_ids = batch['prompt_ids']
|
808 |
+
encoder_hidden_states = text_encoder(token_ids)[0]
|
809 |
+
|
810 |
+
# Get the target for loss depending on the prediction type
|
811 |
+
if noise_scheduler.prediction_type == "epsilon":
|
812 |
+
target = noise
|
813 |
+
|
814 |
+
elif noise_scheduler.prediction_type == "v_prediction":
|
815 |
+
target = noise_scheduler.get_velocity(latents, noise, timesteps)
|
816 |
+
|
817 |
+
else:
|
818 |
+
raise ValueError(f"Unknown prediction type {noise_scheduler.prediction_type}")
|
819 |
+
|
820 |
+
|
821 |
+
# Here we do two passes for video and text training.
|
822 |
+
# If we are on the second iteration of the loop, get one frame.
|
823 |
+
# This allows us to train text information only on the spatial layers.
|
824 |
+
losses = []
|
825 |
+
should_truncate_video = (video_length > 1 and text_trainable)
|
826 |
+
|
827 |
+
# We detach the encoder hidden states for the first pass (video frames > 1)
|
828 |
+
# Then we make a clone of the initial state to ensure we can train it in the loop.
|
829 |
+
detached_encoder_state = encoder_hidden_states.clone().detach()
|
830 |
+
trainable_encoder_state = encoder_hidden_states.clone()
|
831 |
+
|
832 |
+
for i in range(2):
|
833 |
+
|
834 |
+
should_detach = noisy_latents.shape[2] > 1 and i == 0
|
835 |
+
|
836 |
+
if should_truncate_video and i == 1:
|
837 |
+
noisy_latents = noisy_latents[:,:,1,:,:].unsqueeze(2)
|
838 |
+
target = target[:,:,1,:,:].unsqueeze(2)
|
839 |
+
|
840 |
+
encoder_hidden_states = (
|
841 |
+
detached_encoder_state if should_detach else trainable_encoder_state
|
842 |
+
)
|
843 |
+
|
844 |
+
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states=encoder_hidden_states).sample
|
845 |
+
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
|
846 |
+
|
847 |
+
losses.append(loss)
|
848 |
+
|
849 |
+
# This was most likely single frame training or a single image.
|
850 |
+
if video_length == 1 and i == 0: break
|
851 |
+
|
852 |
+
loss = losses[0] if len(losses) == 1 else losses[0] + losses[1]
|
853 |
+
|
854 |
+
return loss, latents
|
855 |
+
|
856 |
+
for epoch in range(first_epoch, num_train_epochs):
|
857 |
+
train_loss = 0.0
|
858 |
+
|
859 |
+
for step, batch in enumerate(train_dataloader):
|
860 |
+
# Skip steps until we reach the resumed step
|
861 |
+
if resume_from_checkpoint and epoch == first_epoch and step < resume_step:
|
862 |
+
if step % gradient_accumulation_steps == 0:
|
863 |
+
progress_bar.update(1)
|
864 |
+
continue
|
865 |
+
|
866 |
+
with accelerator.accumulate(unet), accelerator.accumulate(text_encoder):
|
867 |
+
|
868 |
+
text_prompt = batch['text_prompt'][0]
|
869 |
+
|
870 |
+
with accelerator.autocast():
|
871 |
+
loss, latents = finetune_unet(batch, train_encoder=train_text_encoder)
|
872 |
+
|
873 |
+
# Gather the losses across all processes for logging (if we use distributed training).
|
874 |
+
avg_loss = accelerator.gather(loss.repeat(train_batch_size)).mean()
|
875 |
+
train_loss += avg_loss.item() / gradient_accumulation_steps
|
876 |
+
|
877 |
+
# Backpropagate
|
878 |
+
try:
|
879 |
+
accelerator.backward(loss)
|
880 |
+
params_to_clip = (
|
881 |
+
unet.parameters() if not train_text_encoder
|
882 |
+
else
|
883 |
+
list(unet.parameters()) + list(text_encoder.parameters())
|
884 |
+
)
|
885 |
+
accelerator.clip_grad_norm_(params_to_clip, max_grad_norm)
|
886 |
+
|
887 |
+
optimizer.step()
|
888 |
+
lr_scheduler.step()
|
889 |
+
optimizer.zero_grad(set_to_none=True)
|
890 |
+
|
891 |
+
except Exception as e:
|
892 |
+
print(f"An error has occured during backpropogation! {e}")
|
893 |
+
continue
|
894 |
+
|
895 |
+
# Checks if the accelerator has performed an optimization step behind the scenes
|
896 |
+
if accelerator.sync_gradients:
|
897 |
+
progress_bar.update(1)
|
898 |
+
global_step += 1
|
899 |
+
accelerator.log({"train_loss": train_loss}, step=global_step)
|
900 |
+
train_loss = 0.0
|
901 |
+
|
902 |
+
if global_step % checkpointing_steps == 0:
|
903 |
+
save_pipe(
|
904 |
+
pretrained_model_path,
|
905 |
+
global_step,
|
906 |
+
accelerator,
|
907 |
+
unet,
|
908 |
+
text_encoder,
|
909 |
+
vae,
|
910 |
+
output_dir,
|
911 |
+
use_unet_lora,
|
912 |
+
use_text_lora,
|
913 |
+
unet_lora_modules,
|
914 |
+
text_encoder_lora_modules,
|
915 |
+
is_checkpoint=True
|
916 |
+
)
|
917 |
+
|
918 |
+
if should_sample(global_step, validation_steps, validation_data):
|
919 |
+
if global_step == 1: print("Performing validation prompt.")
|
920 |
+
if accelerator.is_main_process:
|
921 |
+
|
922 |
+
with accelerator.autocast():
|
923 |
+
unet.eval()
|
924 |
+
text_encoder.eval()
|
925 |
+
unet_and_text_g_c(unet, text_encoder, False, False)
|
926 |
+
|
927 |
+
pipeline = TextToVideoSDPipeline.from_pretrained(
|
928 |
+
pretrained_model_path,
|
929 |
+
text_encoder=text_encoder,
|
930 |
+
vae=vae,
|
931 |
+
unet=unet
|
932 |
+
)
|
933 |
+
|
934 |
+
diffusion_scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
|
935 |
+
pipeline.scheduler = diffusion_scheduler
|
936 |
+
|
937 |
+
prompt = text_prompt if len(validation_data.prompt) <= 0 else validation_data.prompt
|
938 |
+
|
939 |
+
curr_dataset_name = batch['dataset']
|
940 |
+
save_filename = f"{global_step}_dataset-{curr_dataset_name}_{prompt}"
|
941 |
+
|
942 |
+
out_file = f"{output_dir}/samples/{save_filename}.mp4"
|
943 |
+
|
944 |
+
with torch.no_grad():
|
945 |
+
video_frames = pipeline(
|
946 |
+
prompt,
|
947 |
+
width=validation_data.width,
|
948 |
+
height=validation_data.height,
|
949 |
+
num_frames=validation_data.num_frames,
|
950 |
+
num_inference_steps=validation_data.num_inference_steps,
|
951 |
+
guidance_scale=validation_data.guidance_scale
|
952 |
+
).frames
|
953 |
+
export_to_video(video_frames, out_file, train_data.get('fps', 8))
|
954 |
+
|
955 |
+
del pipeline
|
956 |
+
torch.cuda.empty_cache()
|
957 |
+
|
958 |
+
logger.info(f"Saved a new sample to {out_file}")
|
959 |
+
|
960 |
+
unet_and_text_g_c(
|
961 |
+
unet,
|
962 |
+
text_encoder,
|
963 |
+
gradient_checkpointing,
|
964 |
+
text_encoder_gradient_checkpointing
|
965 |
+
)
|
966 |
+
|
967 |
+
logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
|
968 |
+
accelerator.log({"training_loss": loss.detach().item()}, step=step)
|
969 |
+
progress_bar.set_postfix(**logs)
|
970 |
+
|
971 |
+
if global_step >= max_train_steps:
|
972 |
+
break
|
973 |
+
|
974 |
+
# Create the pipeline using the trained modules and save it.
|
975 |
+
accelerator.wait_for_everyone()
|
976 |
+
if accelerator.is_main_process:
|
977 |
+
save_pipe(
|
978 |
+
pretrained_model_path,
|
979 |
+
global_step,
|
980 |
+
accelerator,
|
981 |
+
unet,
|
982 |
+
text_encoder,
|
983 |
+
vae,
|
984 |
+
output_dir,
|
985 |
+
use_unet_lora,
|
986 |
+
use_text_lora,
|
987 |
+
unet_lora_modules,
|
988 |
+
text_encoder_lora_modules,
|
989 |
+
is_checkpoint=False
|
990 |
+
)
|
991 |
+
accelerator.end_training()
|
992 |
+
|
993 |
+
if __name__ == "__main__":
|
994 |
+
parser = argparse.ArgumentParser()
|
995 |
+
parser.add_argument("--config", type=str, default="./configs/my_config.yaml")
|
996 |
+
args = parser.parse_args()
|
997 |
+
|
998 |
+
main(**OmegaConf.load(args.config))
|
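For reference, a minimal sketch of the kind of config that the --config argument above expects. The top-level keys mirror parameters of main(), and the validation_data fields are the ones read during sampling (prompt, sample_preview, width, height, num_frames, num_inference_steps, guidance_scale). The paths, prompt, and dataset-specific train_data keys are placeholders, not values shipped with this repo:

from omegaconf import OmegaConf

config = OmegaConf.create({
    "pretrained_model_path": "./models/text-to-video-ms",  # placeholder path to a diffusers checkpoint
    "output_dir": "./outputs",
    "dataset_types": ["single_video"],
    "train_data": {"fps": 8},  # dataset-specific keys depend on the classes in utils/dataset.py
    "validation_data": {
        "prompt": "",              # empty string falls back to the training prompt
        "sample_preview": True,
        "width": 256,
        "height": 256,
        "num_frames": 16,
        "num_inference_steps": 25,
        "guidance_scale": 9.0,
    },
    "max_train_steps": 500,
    "train_batch_size": 1,
    "learning_rate": 5e-5,
})
OmegaConf.save(config, "./configs/my_config.yaml")  # then: python train.py --config ./configs/my_config.yaml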
unet_3d_blocks.py
ADDED
@@ -0,0 +1,836 @@
1 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import torch
|
16 |
+
import torch.utils.checkpoint as checkpoint
|
17 |
+
from torch import nn
|
18 |
+
from diffusers.models.resnet import Downsample2D, ResnetBlock2D, TemporalConvLayer, Upsample2D
|
19 |
+
from diffusers.models.transformer_2d import Transformer2DModel
|
20 |
+
from diffusers.models.transformer_temporal import TransformerTemporalModel
|
21 |
+
|
22 |
+
# Assign the gradient checkpoint function to a short alias for readability.
|
23 |
+
g_c = checkpoint.checkpoint
|
24 |
+
|
25 |
+
def use_temporal(module, num_frames, x):
|
26 |
+
if num_frames == 1:
|
27 |
+
if isinstance(module, TransformerTemporalModel):
|
28 |
+
return {"sample": x}
|
29 |
+
else:
|
30 |
+
return x
|
31 |
+
|
32 |
+
def custom_checkpoint(module, mode=None):
|
33 |
+
if mode is None: raise ValueError('Mode for gradient checkpointing cannot be none.')
|
34 |
+
custom_forward = None
|
35 |
+
|
36 |
+
if mode == 'resnet':
|
37 |
+
def custom_forward(hidden_states, temb):
|
38 |
+
inputs = module(hidden_states, temb)
|
39 |
+
return inputs
|
40 |
+
|
41 |
+
if mode == 'attn':
|
42 |
+
def custom_forward(
|
43 |
+
hidden_states,
|
44 |
+
encoder_hidden_states=None,
|
45 |
+
cross_attention_kwargs=None
|
46 |
+
):
|
47 |
+
inputs = module(
|
48 |
+
hidden_states,
|
49 |
+
encoder_hidden_states,
|
50 |
+
cross_attention_kwargs
|
51 |
+
)
|
52 |
+
return inputs
|
53 |
+
|
54 |
+
if mode == 'temp':
|
55 |
+
def custom_forward(hidden_states, num_frames=None):
|
56 |
+
inputs = use_temporal(module, num_frames, hidden_states)
|
57 |
+
if inputs is None: inputs = module(
|
58 |
+
hidden_states,
|
59 |
+
num_frames=num_frames
|
60 |
+
)
|
61 |
+
return inputs
|
62 |
+
|
63 |
+
return custom_forward
|
64 |
+
|
65 |
+
def transformer_g_c(transformer, sample, num_frames):
|
66 |
+
sample = g_c(custom_checkpoint(transformer, mode='temp'),
|
67 |
+
sample, num_frames, use_reentrant=False
|
68 |
+
)['sample']
|
69 |
+
|
70 |
+
return sample
|
71 |
+
|
72 |
+
def cross_attn_g_c(
|
73 |
+
attn,
|
74 |
+
temp_attn,
|
75 |
+
resnet,
|
76 |
+
temp_conv,
|
77 |
+
hidden_states,
|
78 |
+
encoder_hidden_states,
|
79 |
+
cross_attention_kwargs,
|
80 |
+
temb,
|
81 |
+
num_frames,
|
82 |
+
inverse_temp=False
|
83 |
+
):
|
84 |
+
|
85 |
+
def ordered_g_c(idx):
|
86 |
+
|
87 |
+
# Self and CrossAttention
|
88 |
+
if idx == 0: return g_c(custom_checkpoint(attn, mode='attn'),
|
89 |
+
hidden_states, encoder_hidden_states, cross_attention_kwargs, use_reentrant=False
|
90 |
+
)['sample']
|
91 |
+
|
92 |
+
# Temporal Self and CrossAttention
|
93 |
+
if idx == 1: return g_c(custom_checkpoint(temp_attn, mode='temp'),
|
94 |
+
hidden_states, num_frames, use_reentrant=False)['sample']
|
95 |
+
|
96 |
+
# Resnets
|
97 |
+
if idx == 2: return g_c(custom_checkpoint(resnet, mode='resnet'),
|
98 |
+
hidden_states, temb, use_reentrant=False)
|
99 |
+
|
100 |
+
# Temporal Convolutions
|
101 |
+
if idx == 3: return g_c(custom_checkpoint(temp_conv, mode='temp'),
|
102 |
+
hidden_states, num_frames, use_reentrant=False
|
103 |
+
)
|
104 |
+
|
105 |
+
# Here we call the function depending on the order in which they are called.
|
106 |
+
# For some layers, the orders are different, so we access the appropriate one by index.
|
107 |
+
|
108 |
+
if not inverse_temp:
|
109 |
+
for idx in [0,1,2,3]: hidden_states = ordered_g_c(idx)
|
110 |
+
else:
|
111 |
+
for idx in [2,3,0,1]: hidden_states = ordered_g_c(idx)
|
112 |
+
|
113 |
+
return hidden_states
|
114 |
+
|
115 |
+
def up_down_g_c(resnet, temp_conv, hidden_states, temb, num_frames):
|
116 |
+
hidden_states = g_c(custom_checkpoint(resnet, mode='resnet'), hidden_states, temb, use_reentrant=False)
|
117 |
+
hidden_states = g_c(custom_checkpoint(temp_conv, mode='temp'),
|
118 |
+
hidden_states, num_frames, use_reentrant=False
|
119 |
+
)
|
120 |
+
return hidden_states
|
121 |
+
|
122 |
+
def get_down_block(
|
123 |
+
down_block_type,
|
124 |
+
num_layers,
|
125 |
+
in_channels,
|
126 |
+
out_channels,
|
127 |
+
temb_channels,
|
128 |
+
add_downsample,
|
129 |
+
resnet_eps,
|
130 |
+
resnet_act_fn,
|
131 |
+
attn_num_head_channels,
|
132 |
+
resnet_groups=None,
|
133 |
+
cross_attention_dim=None,
|
134 |
+
downsample_padding=None,
|
135 |
+
dual_cross_attention=False,
|
136 |
+
use_linear_projection=True,
|
137 |
+
only_cross_attention=False,
|
138 |
+
upcast_attention=False,
|
139 |
+
resnet_time_scale_shift="default",
|
140 |
+
):
|
141 |
+
if down_block_type == "DownBlock3D":
|
142 |
+
return DownBlock3D(
|
143 |
+
num_layers=num_layers,
|
144 |
+
in_channels=in_channels,
|
145 |
+
out_channels=out_channels,
|
146 |
+
temb_channels=temb_channels,
|
147 |
+
add_downsample=add_downsample,
|
148 |
+
resnet_eps=resnet_eps,
|
149 |
+
resnet_act_fn=resnet_act_fn,
|
150 |
+
resnet_groups=resnet_groups,
|
151 |
+
downsample_padding=downsample_padding,
|
152 |
+
resnet_time_scale_shift=resnet_time_scale_shift,
|
153 |
+
)
|
154 |
+
elif down_block_type == "CrossAttnDownBlock3D":
|
155 |
+
if cross_attention_dim is None:
|
156 |
+
raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
|
157 |
+
return CrossAttnDownBlock3D(
|
158 |
+
num_layers=num_layers,
|
159 |
+
in_channels=in_channels,
|
160 |
+
out_channels=out_channels,
|
161 |
+
temb_channels=temb_channels,
|
162 |
+
add_downsample=add_downsample,
|
163 |
+
resnet_eps=resnet_eps,
|
164 |
+
resnet_act_fn=resnet_act_fn,
|
165 |
+
resnet_groups=resnet_groups,
|
166 |
+
downsample_padding=downsample_padding,
|
167 |
+
cross_attention_dim=cross_attention_dim,
|
168 |
+
attn_num_head_channels=attn_num_head_channels,
|
169 |
+
dual_cross_attention=dual_cross_attention,
|
170 |
+
use_linear_projection=use_linear_projection,
|
171 |
+
only_cross_attention=only_cross_attention,
|
172 |
+
upcast_attention=upcast_attention,
|
173 |
+
resnet_time_scale_shift=resnet_time_scale_shift,
|
174 |
+
)
|
175 |
+
raise ValueError(f"{down_block_type} does not exist.")
|
176 |
+
|
177 |
+
|
178 |
+
def get_up_block(
|
179 |
+
up_block_type,
|
180 |
+
num_layers,
|
181 |
+
in_channels,
|
182 |
+
out_channels,
|
183 |
+
prev_output_channel,
|
184 |
+
temb_channels,
|
185 |
+
add_upsample,
|
186 |
+
resnet_eps,
|
187 |
+
resnet_act_fn,
|
188 |
+
attn_num_head_channels,
|
189 |
+
resnet_groups=None,
|
190 |
+
cross_attention_dim=None,
|
191 |
+
dual_cross_attention=False,
|
192 |
+
use_linear_projection=True,
|
193 |
+
only_cross_attention=False,
|
194 |
+
upcast_attention=False,
|
195 |
+
resnet_time_scale_shift="default",
|
196 |
+
):
|
197 |
+
if up_block_type == "UpBlock3D":
|
198 |
+
return UpBlock3D(
|
199 |
+
num_layers=num_layers,
|
200 |
+
in_channels=in_channels,
|
201 |
+
out_channels=out_channels,
|
202 |
+
prev_output_channel=prev_output_channel,
|
203 |
+
temb_channels=temb_channels,
|
204 |
+
add_upsample=add_upsample,
|
205 |
+
resnet_eps=resnet_eps,
|
206 |
+
resnet_act_fn=resnet_act_fn,
|
207 |
+
resnet_groups=resnet_groups,
|
208 |
+
resnet_time_scale_shift=resnet_time_scale_shift,
|
209 |
+
)
|
210 |
+
elif up_block_type == "CrossAttnUpBlock3D":
|
211 |
+
if cross_attention_dim is None:
|
212 |
+
raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
|
213 |
+
return CrossAttnUpBlock3D(
|
214 |
+
num_layers=num_layers,
|
215 |
+
in_channels=in_channels,
|
216 |
+
out_channels=out_channels,
|
217 |
+
prev_output_channel=prev_output_channel,
|
218 |
+
temb_channels=temb_channels,
|
219 |
+
add_upsample=add_upsample,
|
220 |
+
resnet_eps=resnet_eps,
|
221 |
+
resnet_act_fn=resnet_act_fn,
|
222 |
+
resnet_groups=resnet_groups,
|
223 |
+
cross_attention_dim=cross_attention_dim,
|
224 |
+
attn_num_head_channels=attn_num_head_channels,
|
225 |
+
dual_cross_attention=dual_cross_attention,
|
226 |
+
use_linear_projection=use_linear_projection,
|
227 |
+
only_cross_attention=only_cross_attention,
|
228 |
+
upcast_attention=upcast_attention,
|
229 |
+
resnet_time_scale_shift=resnet_time_scale_shift,
|
230 |
+
)
|
231 |
+
raise ValueError(f"{up_block_type} does not exist.")
|
232 |
+
|
233 |
+
|
234 |
+
class UNetMidBlock3DCrossAttn(nn.Module):
|
235 |
+
def __init__(
|
236 |
+
self,
|
237 |
+
in_channels: int,
|
238 |
+
temb_channels: int,
|
239 |
+
dropout: float = 0.0,
|
240 |
+
num_layers: int = 1,
|
241 |
+
resnet_eps: float = 1e-6,
|
242 |
+
resnet_time_scale_shift: str = "default",
|
243 |
+
resnet_act_fn: str = "swish",
|
244 |
+
resnet_groups: int = 32,
|
245 |
+
resnet_pre_norm: bool = True,
|
246 |
+
attn_num_head_channels=1,
|
247 |
+
output_scale_factor=1.0,
|
248 |
+
cross_attention_dim=1280,
|
249 |
+
dual_cross_attention=False,
|
250 |
+
use_linear_projection=True,
|
251 |
+
upcast_attention=False,
|
252 |
+
):
|
253 |
+
super().__init__()
|
254 |
+
|
255 |
+
self.gradient_checkpointing = False
|
256 |
+
self.has_cross_attention = True
|
257 |
+
self.attn_num_head_channels = attn_num_head_channels
|
258 |
+
resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
|
259 |
+
|
260 |
+
# there is always at least one resnet
|
261 |
+
resnets = [
|
262 |
+
ResnetBlock2D(
|
263 |
+
in_channels=in_channels,
|
264 |
+
out_channels=in_channels,
|
265 |
+
temb_channels=temb_channels,
|
266 |
+
eps=resnet_eps,
|
267 |
+
groups=resnet_groups,
|
268 |
+
dropout=dropout,
|
269 |
+
time_embedding_norm=resnet_time_scale_shift,
|
270 |
+
non_linearity=resnet_act_fn,
|
271 |
+
output_scale_factor=output_scale_factor,
|
272 |
+
pre_norm=resnet_pre_norm,
|
273 |
+
)
|
274 |
+
]
|
275 |
+
temp_convs = [
|
276 |
+
TemporalConvLayer(
|
277 |
+
in_channels,
|
278 |
+
in_channels,
|
279 |
+
)
|
280 |
+
]
|
281 |
+
attentions = []
|
282 |
+
temp_attentions = []
|
283 |
+
|
284 |
+
for _ in range(num_layers):
|
285 |
+
attentions.append(
|
286 |
+
Transformer2DModel(
|
287 |
+
in_channels // attn_num_head_channels,
|
288 |
+
attn_num_head_channels,
|
289 |
+
in_channels=in_channels,
|
290 |
+
num_layers=1,
|
291 |
+
cross_attention_dim=cross_attention_dim,
|
292 |
+
norm_num_groups=resnet_groups,
|
293 |
+
use_linear_projection=use_linear_projection,
|
294 |
+
upcast_attention=upcast_attention,
|
295 |
+
)
|
296 |
+
)
|
297 |
+
temp_attentions.append(
|
298 |
+
TransformerTemporalModel(
|
299 |
+
in_channels // attn_num_head_channels,
|
300 |
+
attn_num_head_channels,
|
301 |
+
in_channels=in_channels,
|
302 |
+
num_layers=1,
|
303 |
+
cross_attention_dim=cross_attention_dim,
|
304 |
+
norm_num_groups=resnet_groups,
|
305 |
+
)
|
306 |
+
)
|
307 |
+
resnets.append(
|
308 |
+
ResnetBlock2D(
|
309 |
+
in_channels=in_channels,
|
310 |
+
out_channels=in_channels,
|
311 |
+
temb_channels=temb_channels,
|
312 |
+
eps=resnet_eps,
|
313 |
+
groups=resnet_groups,
|
314 |
+
dropout=dropout,
|
315 |
+
time_embedding_norm=resnet_time_scale_shift,
|
316 |
+
non_linearity=resnet_act_fn,
|
317 |
+
output_scale_factor=output_scale_factor,
|
318 |
+
pre_norm=resnet_pre_norm,
|
319 |
+
)
|
320 |
+
)
|
321 |
+
temp_convs.append(
|
322 |
+
TemporalConvLayer(
|
323 |
+
in_channels,
|
324 |
+
in_channels,
|
325 |
+
)
|
326 |
+
)
|
327 |
+
|
328 |
+
self.resnets = nn.ModuleList(resnets)
|
329 |
+
self.temp_convs = nn.ModuleList(temp_convs)
|
330 |
+
self.attentions = nn.ModuleList(attentions)
|
331 |
+
self.temp_attentions = nn.ModuleList(temp_attentions)
|
332 |
+
|
333 |
+
def forward(
|
334 |
+
self,
|
335 |
+
hidden_states,
|
336 |
+
temb=None,
|
337 |
+
encoder_hidden_states=None,
|
338 |
+
attention_mask=None,
|
339 |
+
num_frames=1,
|
340 |
+
cross_attention_kwargs=None,
|
341 |
+
):
|
342 |
+
if self.gradient_checkpointing:
|
343 |
+
hidden_states = up_down_g_c(
|
344 |
+
self.resnets[0],
|
345 |
+
self.temp_convs[0],
|
346 |
+
hidden_states,
|
347 |
+
temb,
|
348 |
+
num_frames
|
349 |
+
)
|
350 |
+
else:
|
351 |
+
hidden_states = self.resnets[0](hidden_states, temb)
|
352 |
+
hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames)
|
353 |
+
|
354 |
+
for attn, temp_attn, resnet, temp_conv in zip(
|
355 |
+
self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:]
|
356 |
+
):
|
357 |
+
if self.gradient_checkpointing:
|
358 |
+
hidden_states = cross_attn_g_c(
|
359 |
+
attn,
|
360 |
+
temp_attn,
|
361 |
+
resnet,
|
362 |
+
temp_conv,
|
363 |
+
hidden_states,
|
364 |
+
encoder_hidden_states,
|
365 |
+
cross_attention_kwargs,
|
366 |
+
temb,
|
367 |
+
num_frames
|
368 |
+
)
|
369 |
+
else:
|
370 |
+
hidden_states = attn(
|
371 |
+
hidden_states,
|
372 |
+
encoder_hidden_states=encoder_hidden_states,
|
373 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
374 |
+
).sample
|
375 |
+
|
376 |
+
if num_frames > 1:
|
377 |
+
hidden_states = temp_attn(hidden_states, num_frames=num_frames).sample
|
378 |
+
|
379 |
+
hidden_states = resnet(hidden_states, temb)
|
380 |
+
|
381 |
+
if num_frames > 1:
|
382 |
+
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
|
383 |
+
|
384 |
+
return hidden_states
|
385 |
+
|
386 |
+
|
387 |
+
class CrossAttnDownBlock3D(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        cross_attention_dim=1280,
        output_scale_factor=1.0,
        downsample_padding=1,
        add_downsample=True,
        dual_cross_attention=False,
        use_linear_projection=False,
        only_cross_attention=False,
        upcast_attention=False,
    ):
        super().__init__()
        resnets = []
        attentions = []
        temp_attentions = []
        temp_convs = []

        self.gradient_checkpointing = False
        self.has_cross_attention = True
        self.attn_num_head_channels = attn_num_head_channels

        for i in range(num_layers):
            in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            temp_convs.append(
                TemporalConvLayer(
                    out_channels,
                    out_channels,
                )
            )
            attentions.append(
                Transformer2DModel(
                    out_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
                )
            )
            temp_attentions.append(
                TransformerTemporalModel(
                    out_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                )
            )
        self.resnets = nn.ModuleList(resnets)
        self.temp_convs = nn.ModuleList(temp_convs)
        self.attentions = nn.ModuleList(attentions)
        self.temp_attentions = nn.ModuleList(temp_attentions)

        if add_downsample:
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                    )
                ]
            )
        else:
            self.downsamplers = None

    def forward(
        self,
        hidden_states,
        temb=None,
        encoder_hidden_states=None,
        attention_mask=None,
        num_frames=1,
        cross_attention_kwargs=None,
    ):
        # TODO(Patrick, William) - attention mask is not used
        output_states = ()

        for resnet, temp_conv, attn, temp_attn in zip(
            self.resnets, self.temp_convs, self.attentions, self.temp_attentions
        ):

            if self.gradient_checkpointing:
                hidden_states = cross_attn_g_c(
                    attn,
                    temp_attn,
                    resnet,
                    temp_conv,
                    hidden_states,
                    encoder_hidden_states,
                    cross_attention_kwargs,
                    temb,
                    num_frames,
                    inverse_temp=True
                )
            else:
                hidden_states = resnet(hidden_states, temb)

                if num_frames > 1:
                    hidden_states = temp_conv(hidden_states, num_frames=num_frames)

                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                if num_frames > 1:
                    hidden_states = temp_attn(hidden_states, num_frames=num_frames).sample

            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return hidden_states, output_states


class DownBlock3D(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_downsample=True,
        downsample_padding=1,
    ):
        super().__init__()
        resnets = []
        temp_convs = []

        self.gradient_checkpointing = False
        for i in range(num_layers):
            in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            temp_convs.append(
                TemporalConvLayer(
                    out_channels,
                    out_channels,
                )
            )

        self.resnets = nn.ModuleList(resnets)
        self.temp_convs = nn.ModuleList(temp_convs)

        if add_downsample:
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                    )
                ]
            )
        else:
            self.downsamplers = None

    def forward(self, hidden_states, temb=None, num_frames=1):
        output_states = ()

        for resnet, temp_conv in zip(self.resnets, self.temp_convs):
            if self.gradient_checkpointing:
                hidden_states = up_down_g_c(resnet, temp_conv, hidden_states, temb, num_frames)
            else:
                hidden_states = resnet(hidden_states, temb)

                if num_frames > 1:
                    hidden_states = temp_conv(hidden_states, num_frames=num_frames)

            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return hidden_states, output_states


class CrossAttnUpBlock3D(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        prev_output_channel: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        cross_attention_dim=1280,
        output_scale_factor=1.0,
        add_upsample=True,
        dual_cross_attention=False,
        use_linear_projection=False,
        only_cross_attention=False,
        upcast_attention=False,
    ):
        super().__init__()
        resnets = []
        temp_convs = []
        attentions = []
        temp_attentions = []

        self.gradient_checkpointing = False
        self.has_cross_attention = True
        self.attn_num_head_channels = attn_num_head_channels

        for i in range(num_layers):
            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                ResnetBlock2D(
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            temp_convs.append(
                TemporalConvLayer(
                    out_channels,
                    out_channels,
                )
            )
            attentions.append(
                Transformer2DModel(
                    out_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
                )
            )
            temp_attentions.append(
                TransformerTemporalModel(
                    out_channels // attn_num_head_channels,
                    attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                )
            )
        self.resnets = nn.ModuleList(resnets)
        self.temp_convs = nn.ModuleList(temp_convs)
        self.attentions = nn.ModuleList(attentions)
        self.temp_attentions = nn.ModuleList(temp_attentions)

        if add_upsample:
            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
        else:
            self.upsamplers = None

    def forward(
        self,
        hidden_states,
        res_hidden_states_tuple,
        temb=None,
        encoder_hidden_states=None,
        upsample_size=None,
        attention_mask=None,
        num_frames=1,
        cross_attention_kwargs=None,
    ):
        # TODO(Patrick, William) - attention mask is not used
        for resnet, temp_conv, attn, temp_attn in zip(
            self.resnets, self.temp_convs, self.attentions, self.temp_attentions
        ):
            # pop res hidden states
            res_hidden_states = res_hidden_states_tuple[-1]
            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            if self.gradient_checkpointing:
                hidden_states = cross_attn_g_c(
                    attn,
                    temp_attn,
                    resnet,
                    temp_conv,
                    hidden_states,
                    encoder_hidden_states,
                    cross_attention_kwargs,
                    temb,
                    num_frames,
                    inverse_temp=True
                )
            else:
                hidden_states = resnet(hidden_states, temb)

                if num_frames > 1:
                    hidden_states = temp_conv(hidden_states, num_frames=num_frames)

                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                if num_frames > 1:
                    hidden_states = temp_attn(hidden_states, num_frames=num_frames).sample

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size)

        return hidden_states


class UpBlock3D(nn.Module):
    def __init__(
        self,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_upsample=True,
    ):
        super().__init__()
        resnets = []
        temp_convs = []
        self.gradient_checkpointing = False
        for i in range(num_layers):
            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                ResnetBlock2D(
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            temp_convs.append(
                TemporalConvLayer(
                    out_channels,
                    out_channels,
                )
            )

        self.resnets = nn.ModuleList(resnets)
        self.temp_convs = nn.ModuleList(temp_convs)

        if add_upsample:
            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
        else:
            self.upsamplers = None

    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, num_frames=1):
        for resnet, temp_conv in zip(self.resnets, self.temp_convs):
            # pop res hidden states
            res_hidden_states = res_hidden_states_tuple[-1]
            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            if self.gradient_checkpointing:
                hidden_states = up_down_g_c(resnet, temp_conv, hidden_states, temb, num_frames)
            else:
                hidden_states = resnet(hidden_states, temb)

                if num_frames > 1:
                    hidden_states = temp_conv(hidden_states, num_frames=num_frames)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size)

        return hidden_states
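The 3D blocks above all operate on hidden states whose frame axis has been folded into the batch axis, and they only apply their temporal layers when `num_frames > 1`. Below is a minimal sketch of exercising one of these blocks in isolation; it assumes the script sits next to `unet_3d_blocks.py` with a compatible `diffusers` version installed, and the channel sizes are illustrative only, not values used by the models deployed here.

```python
# Minimal sketch, not part of the upload: assumes unet_3d_blocks.py is importable
# and a compatible diffusers version is installed. Sizes are illustrative only.
import torch
from unet_3d_blocks import DownBlock3D

block = DownBlock3D(in_channels=32, out_channels=32, temb_channels=128)

batch, num_frames = 2, 4
# frames are folded into the batch dimension: (batch * num_frames, channels, height, width)
hidden_states = torch.randn(batch * num_frames, 32, 16, 16)
temb = torch.randn(batch * num_frames, 128)  # one time-embedding row per (sample, frame)

out, skip_states = block(hidden_states, temb=temb, num_frames=num_frames)
print(out.shape)         # torch.Size([8, 32, 8, 8]) after the trailing Downsample2D
print(len(skip_states))  # 2: one entry per resnet layer plus the downsampled output
```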
unet_3d_condition.py
ADDED
@@ -0,0 +1,499 @@
# Copyright 2023 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
# Copyright 2023 The ModelScope Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.utils import BaseOutput, logging
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.transformer_temporal import TransformerTemporalModel
from .unet_3d_blocks import (
    CrossAttnDownBlock3D,
    CrossAttnUpBlock3D,
    DownBlock3D,
    UNetMidBlock3DCrossAttn,
    UpBlock3D,
    get_down_block,
    get_up_block,
    transformer_g_c
)


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@dataclass
class UNet3DConditionOutput(BaseOutput):
    """
    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
            Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
    """

    sample: torch.FloatTensor


class UNet3DConditionModel(ModelMixin, ConfigMixin):
    r"""
    UNet3DConditionModel is a conditional 3D UNet model that takes in a noisy sample, conditional state, and a timestep
    and returns a sample-shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
    implements for all the models (such as downloading or saving, etc.)

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, it will skip the normalization and activation layers in post-processing
        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: Optional[int] = None,
        in_channels: int = 4,
        out_channels: int = 4,
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "DownBlock3D",
        ),
        up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        layers_per_block: int = 2,
        downsample_padding: int = 1,
        mid_block_scale_factor: float = 1,
        act_fn: str = "silu",
        norm_num_groups: Optional[int] = 32,
        norm_eps: float = 1e-5,
        cross_attention_dim: int = 1024,
        attention_head_dim: Union[int, Tuple[int]] = 64,
    ):
        super().__init__()

        self.sample_size = sample_size
        self.gradient_checkpointing = False
        # Check inputs
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
            )

        if len(block_out_channels) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
            )

        # input
        conv_in_kernel = 3
        conv_out_kernel = 3
        conv_in_padding = (conv_in_kernel - 1) // 2
        self.conv_in = nn.Conv2d(
            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
        )

        # time
        time_embed_dim = block_out_channels[0] * 4
        self.time_proj = Timesteps(block_out_channels[0], True, 0)
        timestep_input_dim = block_out_channels[0]

        self.time_embedding = TimestepEmbedding(
            timestep_input_dim,
            time_embed_dim,
            act_fn=act_fn,
        )

        self.transformer_in = TransformerTemporalModel(
            num_attention_heads=8,
            attention_head_dim=attention_head_dim,
            in_channels=block_out_channels[0],
            num_layers=1,
        )

        # class embedding
        self.down_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        if isinstance(attention_head_dim, int):
            attention_head_dim = (attention_head_dim,) * len(down_block_types)

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block,
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=attention_head_dim[i],
                downsample_padding=downsample_padding,
                dual_cross_attention=False,
            )
            self.down_blocks.append(down_block)

        # mid
        self.mid_block = UNetMidBlock3DCrossAttn(
            in_channels=block_out_channels[-1],
            temb_channels=time_embed_dim,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            cross_attention_dim=cross_attention_dim,
            attn_num_head_channels=attention_head_dim[-1],
            resnet_groups=norm_num_groups,
            dual_cross_attention=False,
        )

        # count how many layers upsample the images
        self.num_upsamplers = 0

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_attention_head_dim = list(reversed(attention_head_dim))

        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            is_final_block = i == len(block_out_channels) - 1

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            # add upsample block for all BUT final layer
            if not is_final_block:
                add_upsample = True
                self.num_upsamplers += 1
            else:
                add_upsample = False

            up_block = get_up_block(
                up_block_type,
                num_layers=layers_per_block + 1,
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=time_embed_dim,
                add_upsample=add_upsample,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=reversed_attention_head_dim[i],
                dual_cross_attention=False,
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
        if norm_num_groups is not None:
            self.conv_norm_out = nn.GroupNorm(
                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
            )
            self.conv_act = nn.SiLU()
        else:
            self.conv_norm_out = None
            self.conv_act = None

        conv_out_padding = (conv_out_kernel - 1) // 2
        self.conv_out = nn.Conv2d(
            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
        )

    def set_attention_slice(self, slice_size):
        r"""
        Enable sliced attention computation.

        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
        in several steps. This is useful to save some memory in exchange for a small speed decrease.

        Args:
            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                must be a multiple of `slice_size`.
        """
        sliceable_head_dims = []

        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
            if hasattr(module, "set_attention_slice"):
                sliceable_head_dims.append(module.sliceable_head_dim)

            for child in module.children():
                fn_recursive_retrieve_slicable_dims(child)

        # retrieve number of attention layers
        for module in self.children():
            fn_recursive_retrieve_slicable_dims(module)

        num_slicable_layers = len(sliceable_head_dims)

        if slice_size == "auto":
            # half the attention head size is usually a good trade-off between
            # speed and memory
            slice_size = [dim // 2 for dim in sliceable_head_dims]
        elif slice_size == "max":
            # make smallest slice possible
            slice_size = num_slicable_layers * [1]

        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size

        if len(slice_size) != len(sliceable_head_dims):
            raise ValueError(
                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
            )

        for i in range(len(slice_size)):
            size = slice_size[i]
            dim = sliceable_head_dims[i]
            if size is not None and size > dim:
                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")

        # Recursively walk through all the children.
        # Any children which exposes the set_attention_slice method
        # gets the message
        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
            if hasattr(module, "set_attention_slice"):
                module.set_attention_slice(slice_size.pop())

            for child in module.children():
                fn_recursive_set_attention_slice(child, slice_size)

        reversed_slice_size = list(reversed(slice_size))
        for module in self.children():
            fn_recursive_set_attention_slice(module, reversed_slice_size)

    def _set_gradient_checkpointing(self, value=False):
        self.gradient_checkpointing = value
        self.mid_block.gradient_checkpointing = value
        for module in self.down_blocks + self.up_blocks:
            if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
                module.gradient_checkpointing = value

    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
        mid_block_additional_residual: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[UNet3DConditionOutput, Tuple]:
        r"""
        Args:
            sample (`torch.FloatTensor`): (batch, channel, num_frames, height, width) noisy inputs tensor
            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unet_2d_condition.UNet3DConditionOutput`] instead of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            [`~models.unet_2d_condition.UNet3DConditionOutput`] or `tuple`:
            [`~models.unet_2d_condition.UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
            returning a tuple, the first element is the sample tensor.
        """
        # By default samples have to be at least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # prepare attention_mask
        if attention_mask is not None:
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        num_frames = sample.shape[2]
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=self.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)
        emb = emb.repeat_interleave(repeats=num_frames, dim=0)
        encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)

        # 2. pre-process
        sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:])
        sample = self.conv_in(sample)

        if self.gradient_checkpointing:
            sample = transformer_g_c(self.transformer_in, sample, num_frames)
        else:
            sample = self.transformer_in(sample, num_frames=num_frames).sample

        # 3. down
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    num_frames=num_frames,
                    cross_attention_kwargs=cross_attention_kwargs,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)

            down_block_res_samples += res_samples

        if down_block_additional_residuals is not None:
            new_down_block_res_samples = ()

            for down_block_res_sample, down_block_additional_residual in zip(
                down_block_res_samples, down_block_additional_residuals
            ):
                down_block_res_sample = down_block_res_sample + down_block_additional_residual
                new_down_block_res_samples += (down_block_res_sample,)

            down_block_res_samples = new_down_block_res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                num_frames=num_frames,
                cross_attention_kwargs=cross_attention_kwargs,
            )

        if mid_block_additional_residual is not None:
            sample = sample + mid_block_additional_residual

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            is_final_block = i == len(self.up_blocks) - 1

            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                    num_frames=num_frames,
                    cross_attention_kwargs=cross_attention_kwargs,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    upsample_size=upsample_size,
                    num_frames=num_frames,
                )

        # 6. post-process
        if self.conv_norm_out:
            sample = self.conv_norm_out(sample)
            sample = self.conv_act(sample)

        sample = self.conv_out(sample)

        # reshape to (batch, channel, num_frames, height, width)
        sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4)

        if not return_dict:
            return (sample,)

        return UNet3DConditionOutput(sample=sample)
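For orientation, here is a quick smoke-test sketch of the model defined above. The tiny configuration, the import path, and the tensor sizes are assumptions made for illustration only, not values used by the deployed zeroscope/potat1 checkpoints; because `unet_3d_condition.py` imports `unet_3d_blocks` relatively, both modules need to be importable from the same package (adjust the import accordingly).

```python
# Smoke-test sketch, not part of the upload: assumes unet_3d_condition.py and
# unet_3d_blocks.py are importable together and a compatible diffusers version
# is installed. The small config below is hypothetical, not a released checkpoint.
import torch
from unet_3d_condition import UNet3DConditionModel  # adjust to your package layout

unet = UNet3DConditionModel(
    sample_size=32,
    in_channels=4,
    out_channels=4,
    block_out_channels=(32, 64, 64, 64),  # kept divisible by 32 for the temporal conv norms
    layers_per_block=1,
    cross_attention_dim=64,
    attention_head_dim=8,
    norm_num_groups=8,
)

sample = torch.randn(1, 4, 8, 32, 32)           # (batch, channels, num_frames, height, width)
timesteps = torch.tensor([10])                  # one timestep per batch element
encoder_hidden_states = torch.randn(1, 77, 64)  # (batch, seq_len, cross_attention_dim)

with torch.no_grad():
    out = unet(sample, timesteps, encoder_hidden_states).sample

print(out.shape)  # torch.Size([1, 4, 8, 32, 32])
```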