fffiloni committed
Commit bdd549c · verified · 1 Parent(s): ea32ad6

Migrated from GitHub

ORIGINAL_README.md ADDED
@@ -0,0 +1,215 @@
1
+ <!-- # SVFR: A Unified Framework for Generalized Video Face Restoration -->
2
+
3
+ <div>
4
+ <h1>SVFR: A Unified Framework for Generalized Video Face Restoration</h1>
5
+ </div>
6
+
7
+ [![arXiv](https://img.shields.io/badge/arXiv-2501.01235-b31b1b.svg)](https://arxiv.org/pdf/2501.01235)
8
+ [![Project Page](https://img.shields.io/badge/Project-Website-green)](https://wangzhiyaoo.github.io/SVFR/)
9
+
10
+ ## 🔥 Overview
11
+
12
+ SVFR is a unified framework for generalized video face restoration that supports **blind face restoration (BFR), colorization, inpainting**, and **their combinations** within a single cohesive system.
13
+
14
+ <img src="assert/method.png">
15
+
16
+ ## 🎬 Demo
17
+
18
+ ### BFR
19
+
38
+ | Case1 | Case2 |
39
+ |--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
40
+ |<video src="https://github.com/user-attachments/assets/49f985f3-a2db-4b9f-aed0-e9943bae9c17" /> | <video src="https://github.com/user-attachments/assets/8fcd1dd9-79d3-4e57-b98e-a80ae2badfb5" /> |
41
+
55
+ ### BFR+Colorization
56
+
73
+ | Case3 | Case4 |
74
+ |--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
75
+ |<video src="https://github.com/user-attachments/assets/795f4cb1-a7c9-41c5-9486-26e64a96bcf0" /> | <video src="https://github.com/user-attachments/assets/6ccf2267-30be-4553-9ecc-f3e7e0ca1d6f" /> |
76
+
77
+
78
+ ### BFR+Colorization+Inpainting
79
+
99
+ | Case5 | Case6 |
100
+ |--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
101
+ |<video src="https://github.com/user-attachments/assets/6113819f-142b-4faa-b1c3-a2b669fd0786" /> | <video src="https://github.com/user-attachments/assets/efdac23c-0ba5-4dad-ab8c-48904af5dd89" /> |
102
+
103
+
104
+ ## 🎙️ News
105
+
106
+ - **[2025.01.02]**: We released the initial version of the [inference code](#inference) and [models](#download-checkpoints). Stay tuned for continuous updates!
107
+ - **[2024.12.17]**: This repo is created!
108
+
109
+ ## 🚀 Getting Started
110
+
111
+ ### Setup
112
+
113
+ Use the following commands to create a conda environment for SVFR from scratch:
114
+
115
+ ```bash
116
+ conda create -n svfr python=3.9 -y
117
+ conda activate svfr
118
+ ```
119
+
120
+ Install PyTorch, making sure to select the build that matches your CUDA version, for example:
121
+
122
+ ```bash
123
+ pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2
124
+ ```
125
+
126
+ Install Dependencies:
127
+
128
+ ```bash
129
+ pip install -r requirements.txt
130
+ ```
131
+
132
+ ### Download checkpoints
133
+
134
+ - Download Stable Video Diffusion:
135
+
136
+ ```
137
+ conda install git-lfs
138
+ git lfs install
139
+ git clone https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt models/stable-video-diffusion-img2vid-xt
140
+ ```
141
+
142
+ - Download the SVFR checkpoints:
143
+
144
+ You can download the checkpoints manually via the link on [Google Drive](https://drive.google.com/drive/folders/1nzy9Vk-yA_DwXm1Pm4dyE2o0r7V6_5mn?usp=share_link).
145
+
146
+ Place the checkpoints in the following layout:
147
+
148
+ ```
149
+ └── models
150
+ ├── face_align
151
+ │ ├── yoloface_v5m.pt
152
+ ├── face_restoration
153
+ │ ├── unet.pth
154
+ │ ├── id_linear.pth
155
+ │ ├── insightface_glint360k.pth
156
+ └── stable-video-diffusion-img2vid-xt
157
+ ├── vae
158
+ ├── scheduler
159
+ └── ...
160
+ ```
161
+
162
+ ### Inference
163
+
164
+ #### Inference with a single task or multiple tasks
165
+
166
+ ```
167
+ python3 infer.py \
168
+ --config config/infer.yaml \
169
+ --task_ids 0 \
170
+ --input_path ./assert/lq/lq1.mp4 \
171
+ --output_dir ./results/
172
+ ```
173
+
174
+ - `--task_ids`: comma-separated task indices (see the sketch below for how they map to the internal task prompt):
+
+ > - `0` -- BFR
+ > - `1` -- colorization
+ > - `2` -- inpainting
+ > - `0,1` -- BFR and colorization
+ > - `0,1,2` -- BFR, colorization, and inpainting
+ > - ...
182
+
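+ For reference, here is a minimal sketch that mirrors the parsing logic in `infer.py` (same `parse_list` helper and `task_prompt` loop), showing how `--task_ids` becomes the three-element task prompt fed to the model:
+
+ ```python
+ # Sketch of the --task_ids handling in infer.py
+ def parse_list(value):
+     return [int(x) for x in value.split(",")]
+
+ task_ids = parse_list("0,1")       # e.g. BFR + colorization
+ task_prompt = [0, 0, 0]            # [bfr, colorization, inpainting]
+ for i in range(3):
+     if i in task_ids:
+         task_prompt[i] = 1
+ print(task_prompt)                 # -> [1, 1, 0]
+ ```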
183
+ #### Inference with an additional inpainting mask
184
+
185
+ ```
186
+ # For Inference with Inpainting
187
+ # Add '--mask_path' if you need to specify the mask file.
188
+
189
+ python3 infer.py \
190
+ --config config/infer.yaml \
191
+ --task_ids 0,1,2 \
192
+ --input_path ./assert/lq/lq3.mp4 \
193
+ --output_dir ./results/ \
194
+ --mask_path ./assert/mask/lq3.png
195
+ ```
196
+
197
+ ## License
198
+
199
+ The SVFR code is released under the MIT License; there are no restrictions on either academic or commercial use.
200
+
201
+ **The pretrained models provided with this library (whether downloaded automatically or manually) are available for non-commercial research purposes only.**
202
+
203
+
204
+ ## BibTeX
205
+ ```
206
+ @misc{wang2025svfrunifiedframeworkgeneralized,
207
+ title={SVFR: A Unified Framework for Generalized Video Face Restoration},
208
+ author={Zhiyao Wang and Xu Chen and Chengming Xu and Junwei Zhu and Xiaobin Hu and Jiangning Zhang and Chengjie Wang and Yuqi Liu and Yiyi Zhou and Rongrong Ji},
209
+ year={2025},
210
+ eprint={2501.01235},
211
+ archivePrefix={arXiv},
212
+ primaryClass={cs.CV},
213
+ url={https://arxiv.org/abs/2501.01235},
214
+ }
215
+ ```
assert/lq/lq1.mp4 ADDED
Binary file (98.2 kB).
 
assert/lq/lq2.mp4 ADDED
Binary file (314 kB).
 
assert/lq/lq3.mp4 ADDED
Binary file (687 kB).
 
assert/mask/lq3.png ADDED
assert/method.png ADDED
config/infer.yaml ADDED
@@ -0,0 +1,21 @@
1
+ data:
2
+ n_sample_frames: 16
3
+ width: 512
4
+ height: 512
5
+
6
+ pretrained_model_name_or_path: "models/stable-video-diffusion-img2vid-xt"
7
+ unet_checkpoint_path: "models/face_restoration/unet.pth"
8
+ id_linear_checkpoint_path: "models/face_restoration/id_linear.pth"
9
+ net_arcface_checkpoint_path: "models/face_restoration/insightface_glint360k.pth"
10
+ # output_dir: 'result'
11
+
12
+
13
+ # test config
14
+ weight_dtype: 'fp16'
15
+ num_inference_steps: 30
16
+ decode_chunk_size: 16
17
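+ # `overlap`: number of frames shared between consecutive `n_sample_frames`-long clips when
+ # the pipeline stitches a long video (passed as the pipeline's `overlap` argument in infer.py)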
+ overlap: 3
18
+ noise_aug_strength: 0.00
19
+ min_appearance_guidance_scale: 2.0
20
+ max_appearance_guidance_scale: 2.0
21
+ i2i_noise_strength: 1.0
infer.py ADDED
@@ -0,0 +1,305 @@
1
+ import argparse
2
+ import warnings
3
+ import os
4
+ import numpy as np
5
+ import torch
6
+ import torch.utils.checkpoint
7
+ from PIL import Image
8
+ import random
9
+
10
+ from omegaconf import OmegaConf
11
+ from diffusers import AutoencoderKLTemporalDecoder
12
+ from diffusers.schedulers import EulerDiscreteScheduler
13
+ from transformers import CLIPVisionModelWithProjection
14
+ import torchvision.transforms as transforms
15
+ import torch.nn.functional as F
16
+ from src.models.svfr_adapter.unet_3d_svd_condition_ip import UNet3DConditionSVDModel
17
+
18
+ # pipeline
19
+ from src.pipelines.pipeline import LQ2VideoLongSVDPipeline
20
+
21
+ from src.utils.util import (
22
+ save_videos_grid,
23
+ seed_everything,
24
+ )
25
+ from torchvision.utils import save_image
26
+
27
+ from src.models.id_proj import IDProjConvModel
28
+ from src.models import model_insightface_360k
29
+
30
+ from src.dataset.face_align.align import AlignImage
31
+
32
+ warnings.filterwarnings("ignore")
33
+
34
+ import decord
35
+ import cv2
36
+ from src.dataset.dataset import get_affine_transform, mean_face_lm5p_256
37
+
38
+ BASE_DIR = '.'
39
+
40
+ def main(config,args):
41
+ if 'CUDA_VISIBLE_DEVICES' in os.environ:
42
+ cuda_visible_devices = os.environ['CUDA_VISIBLE_DEVICES']
43
+ print(f"CUDA_VISIBLE_DEVICES is set to: {cuda_visible_devices}")
44
+ else:
45
+ print("CUDA_VISIBLE_DEVICES is not set.")
46
+
47
+ save_dir = f"{BASE_DIR}/{args.output_dir}"
48
+ os.makedirs(save_dir,exist_ok=True)
49
+
50
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
51
+ f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
52
+ subfolder="vae",
53
+ variant="fp16")
54
+
55
+ val_noise_scheduler = EulerDiscreteScheduler.from_pretrained(
56
+ f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
57
+ subfolder="scheduler")
58
+
59
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
60
+ f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
61
+ subfolder="image_encoder",
62
+ variant="fp16")
63
+ unet = UNet3DConditionSVDModel.from_pretrained(
64
+ f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
65
+ subfolder="unet",
66
+ variant="fp16")
67
+
68
+ weight_dir = 'models/face_align'
69
+ det_path = os.path.join(BASE_DIR, weight_dir, 'yoloface_v5m.pt')
70
+ align_instance = AlignImage("cuda", det_path=det_path)
71
+
72
+ to_tensor = transforms.Compose([
73
+ transforms.ToTensor(),
74
+ transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
75
+ ])
76
+
77
+ import torch.nn as nn
78
+ class InflatedConv3d(nn.Conv2d):
79
+ def forward(self, x):
80
+ x = super().forward(x)
81
+ return x
82
+ # Add ref channel
83
+ old_weights = unet.conv_in.weight
84
+ old_bias = unet.conv_in.bias
85
+ new_conv1 = InflatedConv3d(
86
+ 12,
87
+ old_weights.shape[0],
88
+ kernel_size=unet.conv_in.kernel_size,
89
+ stride=unet.conv_in.stride,
90
+ padding=unet.conv_in.padding,
91
+ bias=True if old_bias is not None else False,
92
+ )
93
+ param = torch.zeros((320, 4, 3, 3), requires_grad=True)
94
+ new_conv1.weight = torch.nn.Parameter(torch.cat((old_weights, param), dim=1))
95
+ if old_bias is not None:
96
+ new_conv1.bias = old_bias
97
+ unet.conv_in = new_conv1
98
+ unet.config["in_channels"] = 12
99
+ unet.config.in_channels = 12
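+ # The zero-initialized block above only extends conv_in from SVD's original 8 input channels
+ # to 12 so that the finetuned weights (unet.pth) loaded below fit; the extra 4 channels
+ # presumably carry the additional conditioning latent concatenated by the SVFR pipeline.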
100
+
101
+
102
+ id_linear = IDProjConvModel(in_channels=512, out_channels=1024).to(device='cuda')
103
+
104
+ # load pretrained weights
105
+ unet_checkpoint_path = os.path.join(BASE_DIR, config.unet_checkpoint_path)
106
+ unet.load_state_dict(
107
+ torch.load(unet_checkpoint_path, map_location="cpu"),
108
+ strict=True,
109
+ )
110
+
111
+ id_linear_checkpoint_path = os.path.join(BASE_DIR, config.id_linear_checkpoint_path)
112
+ id_linear.load_state_dict(
113
+ torch.load(id_linear_checkpoint_path, map_location="cpu"),
114
+ strict=True,
115
+ )
116
+
117
+ net_arcface = model_insightface_360k.getarcface(f'{BASE_DIR}/{config.net_arcface_checkpoint_path}').eval().to(device="cuda")
118
+
119
+ if config.weight_dtype == "fp16":
120
+ weight_dtype = torch.float16
121
+ elif config.weight_dtype == "fp32":
122
+ weight_dtype = torch.float32
123
+ elif config.weight_dtype == "bf16":
124
+ weight_dtype = torch.bfloat16
125
+ else:
126
+ raise ValueError(
127
+ f"Do not support weight dtype: {config.weight_dtype} during training"
128
+ )
129
+
130
+ image_encoder.to(weight_dtype)
131
+ vae.to(weight_dtype)
132
+ unet.to(weight_dtype)
133
+ id_linear.to(weight_dtype)
134
+ net_arcface.requires_grad_(False).to(weight_dtype)
135
+
136
+ pipe = LQ2VideoLongSVDPipeline(
137
+ unet=unet,
138
+ image_encoder=image_encoder,
139
+ vae=vae,
140
+ scheduler=val_noise_scheduler,
141
+ feature_extractor=None
142
+
143
+ )
144
+ pipe = pipe.to("cuda", dtype=unet.dtype)
145
+
146
+ seed_input = args.seed
147
+ seed_everything(seed_input)
148
+
149
+ video_path = args.input_path
150
+ task_ids = args.task_ids
151
+
152
+ if 2 in task_ids and args.mask_path is not None:
153
+ mask_path = args.mask_path
154
+ mask = Image.open(mask_path).convert("L")
155
+ mask_array = np.array(mask)
156
+
157
+ white_positions = mask_array == 255
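+ # pure-white (255) pixels in the mask define the inpainting region; those pixels are
+ # overwritten with white in every input frame in the loop below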
158
+
159
+ print('task_ids:',task_ids)
160
+ task_prompt = [0,0,0]
161
+ for i in range(3):
162
+ if i in task_ids:
163
+ task_prompt[i] = 1
164
+ print("task_prompt:",task_prompt)
165
+
166
+ video_name = video_path.split('/')[-1]
167
+ # print(video_name)
168
+
169
+ if os.path.exists(os.path.join(save_dir, "result_frames", video_name[:-4])):
170
+ print(os.path.join(save_dir, "result_frames", video_name[:-4]))
171
+ # continue
172
+
173
+ cap = decord.VideoReader(video_path, fault_tol=1)
174
+ total_frames = len(cap)
175
+ T = total_frames #
176
+ print("total_frames:",total_frames)
177
+ step=1
178
+ drive_idx_start = 0
179
+ drive_idx_list = list(range(drive_idx_start, drive_idx_start + T * step, step))
180
+ assert len(drive_idx_list) == T
181
+
182
+ imSameIDs = []
183
+ vid_gt = []
184
+ for i, drive_idx in enumerate(drive_idx_list):
185
+ frame = cap[drive_idx].asnumpy()
186
+ imSameID = Image.fromarray(frame)
187
+
188
+ imSameID = imSameID.resize((512,512))
189
+ image_array = np.array(imSameID)
190
+ if 2 in task_ids and args.mask_path is not None:
191
+ image_array[white_positions] = [255, 255, 255] # mask for inpainting task
192
+ vid_gt.append(np.float32(image_array/255.))
193
+ imSameIDs.append(imSameID)
194
+
195
+ vid_lq = [(torch.from_numpy(frame).permute(2,0,1) - 0.5) / 0.5 for frame in vid_gt]
196
+
197
+ val_data = dict(
198
+ pixel_values_vid_lq = torch.stack(vid_lq,dim=0),
199
+ # pixel_values_ref_img=self.to_tensor(target_image),
200
+ # pixel_values_ref_concat_img=self.to_tensor(imSrc2),
201
+ task_ids=task_ids,
202
+ task_id_input=torch.tensor(task_prompt),
203
+ total_frames=total_frames,
204
+ )
205
+
206
+ window_overlap=0
207
+ inter_frame_list = get_overlap_slide_window_indices(val_data["total_frames"],config.data.n_sample_frames,window_overlap)
208
+
209
+ lq_frames = val_data["pixel_values_vid_lq"]
210
+ task_ids = val_data["task_ids"]
211
+ task_id_input = val_data["task_id_input"]
212
+ height, width = val_data["pixel_values_vid_lq"].shape[-2:]
213
+
214
+ print("Generating the first clip...")
215
+ output = pipe(
216
+ lq_frames[inter_frame_list[0]].to("cuda").to(weight_dtype), # lq
217
+ None, # ref concat
218
+ torch.zeros((1, len(inter_frame_list[0]), 49, 1024)).to("cuda").to(weight_dtype),# encoder_hidden_states
219
+ task_id_input.to("cuda").to(weight_dtype),
220
+ height=height,
221
+ width=width,
222
+ num_frames=len(inter_frame_list[0]),
223
+ decode_chunk_size=config.decode_chunk_size,
224
+ noise_aug_strength=config.noise_aug_strength,
225
+ min_guidance_scale=config.min_appearance_guidance_scale,
226
+ max_guidance_scale=config.max_appearance_guidance_scale,
227
+ overlap=config.overlap,
228
+ frames_per_batch=len(inter_frame_list[0]),
229
+ num_inference_steps=50,
230
+ i2i_noise_strength=config.i2i_noise_strength,
231
+ )
232
+ video = output.frames
233
+
234
+ ref_img_tensor = video[0][:,-1]
235
+ ref_img = (video[0][:,-1] *0.5+0.5).clamp(0,1) * 255.
236
+ ref_img = ref_img.permute(1,2,0).cpu().numpy().astype(np.uint8)
237
+
238
+ pts5 = align_instance(ref_img[:,:,[2,1,0]], maxface=True)[0][0]
239
+
240
+ warp_mat = get_affine_transform(pts5, mean_face_lm5p_256 * height/256)
241
+ ref_img = cv2.warpAffine(np.array(Image.fromarray(ref_img)), warp_mat, (height, width), flags=cv2.INTER_CUBIC)
242
+ ref_img = to_tensor(ref_img).to("cuda").to(weight_dtype)
243
+
244
+ save_image(ref_img*0.5 + 0.5,f"{save_dir}/ref_img_align.png")
245
+
246
+ ref_img = F.interpolate(ref_img.unsqueeze(0)[:, :, 0:224, 16:240], size=[112, 112], mode='bilinear')
247
+ _, id_feature_conv = net_arcface(ref_img)
248
+ id_embedding = id_linear(id_feature_conv)
249
+
250
+ print('Generating all video clips...')
251
+ video = pipe(
252
+ lq_frames.to("cuda").to(weight_dtype), # lq
253
+ ref_img_tensor.to("cuda").to(weight_dtype),
254
+ id_embedding.unsqueeze(1).repeat(1, len(lq_frames), 1, 1).to("cuda").to(weight_dtype), # encoder_hidden_states
255
+ task_id_input.to("cuda").to(weight_dtype),
256
+ height=height,
257
+ width=width,
258
+ num_frames=val_data["total_frames"],#frame_num,
259
+ decode_chunk_size=config.decode_chunk_size,
260
+ noise_aug_strength=config.noise_aug_strength,
261
+ min_guidance_scale=config.min_appearance_guidance_scale,
262
+ max_guidance_scale=config.max_appearance_guidance_scale,
263
+ overlap=config.overlap,
264
+ frames_per_batch=config.data.n_sample_frames,
265
+ num_inference_steps=config.num_inference_steps,
266
+ i2i_noise_strength=config.i2i_noise_strength,
267
+ ).frames
268
+
269
+
270
+ video = (video*0.5 + 0.5).clamp(0, 1)
271
+ video = torch.cat([video.to(device="cuda")], dim=0).cpu()
272
+
273
+ save_videos_grid(video, f"{save_dir}/{video_name[:-4]}_{seed_input}.mp4", n_rows=1, fps=25)
274
+
275
+ if args.restore_frames:
276
+ video = video.squeeze(0)
277
+ os.makedirs(os.path.join(save_dir, "result_frames", f"{video_name[:-4]}_{seed_input}"),exist_ok=True)
278
+ print(os.path.join(save_dir, "result_frames", video_name[:-4]))
279
+ for i in range(video.shape[1]):
280
+ save_frames_path = os.path.join(f"{save_dir}/result_frames", f"{video_name[:-4]}_{seed_input}", f'{i:08d}.png')
281
+ save_image(video[:,i], save_frames_path)
282
+
283
+
284
+ def get_overlap_slide_window_indices(video_length, window_size, window_overlap):
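+ # Split frame indices [0, video_length) into windows of up to `window_size` frames,
+ # with consecutive windows starting `window_size - window_overlap` frames apart;
+ # the final window is truncated at the end of the video.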
285
+ inter_frame_list = []
286
+ for j in range(0, video_length, window_size-window_overlap):
287
+ inter_frame_list.append( [e % video_length for e in range(j, min(j + window_size, video_length))] )
288
+
289
+ return inter_frame_list
290
+
291
+ if __name__ == "__main__":
292
+ def parse_list(value):
293
+ return [int(x) for x in value.split(",")]
294
+ parser = argparse.ArgumentParser()
295
+ parser.add_argument("--config", type=str, default="./configs/infer.yaml")
296
+ parser.add_argument("--output_dir", type=str, default="output")
297
+ parser.add_argument("--seed", type=int, default=77)
298
+ parser.add_argument("--task_ids", type=parse_list, default=[0])
299
+ parser.add_argument("--input_path", type=str, default='./assert/lq/lq3.mp4')
300
+ parser.add_argument("--mask_path", type=str, default=None)
301
+ parser.add_argument("--restore_frames", action='store_true')
302
+
303
+ args = parser.parse_args()
304
+ config = OmegaConf.load(args.config)
305
+ main(config, args)
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ accelerate
2
+ decord
3
+ diffusers
4
+ einops
5
+ moviepy==1.0.3
6
+ numpy<2.0
7
+ omegaconf
8
+ opencv-python
9
+ scikit-video
10
+ transformers
src/dataset/dataset.py ADDED
@@ -0,0 +1,50 @@
1
+ import os
2
+ import numpy as np
3
+ import random
4
+ from PIL import Image
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+ import torchvision.transforms as transforms
8
+ from transformers import CLIPImageProcessor
9
+ # import librosa
10
+
11
+ import os
12
+ import cv2
13
+
14
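+ # Canonical 5-point face landmark template (both eyes, nose tip, mouth corners) expressed in
+ # 256x256 image coordinates; detected landmarks are warped onto this template for alignment.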
+ mean_face_lm5p_256 = np.array([
15
+ [(30.2946+8)*2+16, 51.6963*2],
16
+ [(65.5318+8)*2+16, 51.5014*2],
17
+ [(48.0252+8)*2+16, 71.7366*2],
18
+ [(33.5493+8)*2+16, 92.3655*2],
19
+ [(62.7299+8)*2+16, 92.2041*2],
20
+ ], dtype=np.float32)
21
+
22
+ def get_affine_transform(target_face_lm5p, mean_lm5p):
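+ # Least-squares fit of a 2x3 similarity transform (uniform scale, rotation, translation)
+ # that maps the detected 5-point landmarks onto the mean-face template; the 4x4 normal
+ # equations are solved with cv2.solve using SVD.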
23
+ mat_warp = np.zeros((2,3))
24
+ A = np.zeros((4,4))
25
+ B = np.zeros((4))
26
+ for i in range(5):
27
+ A[0][0] += target_face_lm5p[i][0] * target_face_lm5p[i][0] + target_face_lm5p[i][1] * target_face_lm5p[i][1]
28
+ A[0][2] += target_face_lm5p[i][0]
29
+ A[0][3] += target_face_lm5p[i][1]
30
+
31
+ B[0] += target_face_lm5p[i][0] * mean_lm5p[i][0] + target_face_lm5p[i][1] * mean_lm5p[i][1] #sb[1] += a[i].x*b[i].y - a[i].y*b[i].x;
32
+ B[1] += target_face_lm5p[i][0] * mean_lm5p[i][1] - target_face_lm5p[i][1] * mean_lm5p[i][0]
33
+ B[2] += mean_lm5p[i][0]
34
+ B[3] += mean_lm5p[i][1]
35
+
36
+ A[1][1] = A[0][0]
37
+ A[2][1] = A[1][2] = -A[0][3]
38
+ A[3][1] = A[1][3] = A[2][0] = A[0][2]
39
+ A[2][2] = A[3][3] = 5
40
+ A[3][0] = A[0][3]
41
+
42
+ _, mat23 = cv2.solve(A, B, flags=cv2.DECOMP_SVD)
43
+ mat_warp[0][0] = mat23[0]
44
+ mat_warp[1][1] = mat23[0]
45
+ mat_warp[0][1] = -mat23[1]
46
+ mat_warp[1][0] = mat23[1]
47
+ mat_warp[0][2] = mat23[2]
48
+ mat_warp[1][2] = mat23[3]
49
+
50
+ return mat_warp
src/dataset/face_align/align.py ADDED
@@ -0,0 +1,36 @@
1
+ import os
2
+ import sys
3
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
4
+ sys.path.append(BASE_DIR)
5
+ import torch
6
+ from src.dataset.face_align.yoloface import YoloFace
7
+
8
+ class AlignImage(object):
9
+ def __init__(self, device='cuda', det_path='checkpoints/yoloface_v5m.pt'):
10
+ self.facedet = YoloFace(pt_path=det_path, confThreshold=0.5, nmsThreshold=0.45, device=device)
11
+
12
+ @torch.no_grad()
13
+ def __call__(self, im, maxface=False):
14
+ bboxes, kpss, scores = self.facedet.detect(im)
15
+ face_num = bboxes.shape[0]
16
+
17
+ five_pts_list = []
18
+ scores_list = []
19
+ bboxes_list = []
20
+ for i in range(face_num):
21
+ five_pts_list.append(kpss[i].reshape(5,2))
22
+ scores_list.append(scores[i])
23
+ bboxes_list.append(bboxes[i])
24
+
25
+ if maxface and face_num>1:
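+ # when maxface is set, keep only the detection with the largest bounding-box area
+ # (bboxes are (x, y, w, h), so indices 2 and 3 are width and height)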
26
+ max_idx = 0
27
+ max_area = (bboxes[0, 2])*(bboxes[0, 3])
28
+ for i in range(1, face_num):
29
+ area = (bboxes[i,2])*(bboxes[i,3])
30
+ if area>max_area:
31
+ max_idx = i
32
+ five_pts_list = [five_pts_list[max_idx]]
33
+ scores_list = [scores_list[max_idx]]
34
+ bboxes_list = [bboxes_list[max_idx]]
35
+
36
+ return five_pts_list, scores_list, bboxes_list
src/dataset/face_align/yoloface.py ADDED
@@ -0,0 +1,310 @@
1
+ # -*- coding: UTF-8 -*-
2
+ import os
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+ import torchvision
7
+
8
+
9
+ def xyxy2xywh(x):
10
+ # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
11
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
12
+ y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center
13
+ y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center
14
+ y[:, 2] = x[:, 2] - x[:, 0] # width
15
+ y[:, 3] = x[:, 3] - x[:, 1] # height
16
+ return y
17
+
18
+
19
+ def xywh2xyxy(x):
20
+ # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
21
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
22
+ y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
23
+ y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
24
+ y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
25
+ y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
26
+ return y
27
+
28
+
29
+ def box_iou(box1, box2):
30
+ # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
31
+ """
32
+ Return intersection-over-union (Jaccard index) of boxes.
33
+ Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
34
+ Arguments:
35
+ box1 (Tensor[N, 4])
36
+ box2 (Tensor[M, 4])
37
+ Returns:
38
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
39
+ IoU values for every element in boxes1 and boxes2
40
+ """
41
+
42
+ def box_area(box):
43
+ # box = 4xn
44
+ return (box[2] - box[0]) * (box[3] - box[1])
45
+
46
+ area1 = box_area(box1.T)
47
+ area2 = box_area(box2.T)
48
+
49
+ # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
50
+ inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) -
51
+ torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
52
+ # iou = inter / (area1 + area2 - inter)
53
+ return inter / (area1[:, None] + area2 - inter)
54
+
55
+
56
+ def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
57
+ # Rescale coords (xyxy) from img1_shape to img0_shape
58
+ if ratio_pad is None: # calculate from img0_shape
59
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
60
+ pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
61
+ else:
62
+ gain = ratio_pad[0][0]
63
+ pad = ratio_pad[1]
64
+
65
+ coords[:, [0, 2]] -= pad[0] # x padding
66
+ coords[:, [1, 3]] -= pad[1] # y padding
67
+ coords[:, :4] /= gain
68
+ clip_coords(coords, img0_shape)
69
+ return coords
70
+
71
+
72
+ def clip_coords(boxes, img_shape):
73
+ # Clip bounding xyxy bounding boxes to image shape (height, width)
74
+ boxes[:, 0].clamp_(0, img_shape[1]) # x1
75
+ boxes[:, 1].clamp_(0, img_shape[0]) # y1
76
+ boxes[:, 2].clamp_(0, img_shape[1]) # x2
77
+ boxes[:, 3].clamp_(0, img_shape[0]) # y2
78
+
79
+
80
+ def scale_coords_landmarks(img1_shape, coords, img0_shape, ratio_pad=None):
81
+ # Rescale coords (xyxy) from img1_shape to img0_shape
82
+ if ratio_pad is None: # calculate from img0_shape
83
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
84
+ pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
85
+ else:
86
+ gain = ratio_pad[0][0]
87
+ pad = ratio_pad[1]
88
+
89
+ coords[:, [0, 2, 4, 6, 8]] -= pad[0] # x padding
90
+ coords[:, [1, 3, 5, 7, 9]] -= pad[1] # y padding
91
+ coords[:, :10] /= gain
92
+ #clip_coords(coords, img0_shape)
93
+ coords[:, 0].clamp_(0, img0_shape[1]) # x1
94
+ coords[:, 1].clamp_(0, img0_shape[0]) # y1
95
+ coords[:, 2].clamp_(0, img0_shape[1]) # x2
96
+ coords[:, 3].clamp_(0, img0_shape[0]) # y2
97
+ coords[:, 4].clamp_(0, img0_shape[1]) # x3
98
+ coords[:, 5].clamp_(0, img0_shape[0]) # y3
99
+ coords[:, 6].clamp_(0, img0_shape[1]) # x4
100
+ coords[:, 7].clamp_(0, img0_shape[0]) # y4
101
+ coords[:, 8].clamp_(0, img0_shape[1]) # x5
102
+ coords[:, 9].clamp_(0, img0_shape[0]) # y5
103
+ return coords
104
+
105
+
106
+ def show_results(img, xywh, conf, landmarks, class_num):
107
+ h,w,c = img.shape
108
+ tl = 1 or round(0.002 * (h + w) / 2) + 1 # line/font thickness
109
+ x1 = int(xywh[0] * w - 0.5 * xywh[2] * w)
110
+ y1 = int(xywh[1] * h - 0.5 * xywh[3] * h)
111
+ x2 = int(xywh[0] * w + 0.5 * xywh[2] * w)
112
+ y2 = int(xywh[1] * h + 0.5 * xywh[3] * h)
113
+ cv2.rectangle(img, (x1,y1), (x2, y2), (0,255,0), thickness=tl, lineType=cv2.LINE_AA)
114
+
115
+ clors = [(255,0,0),(0,255,0),(0,0,255),(255,255,0),(0,255,255)]
116
+
117
+ for i in range(5):
118
+ point_x = int(landmarks[2 * i] * w)
119
+ point_y = int(landmarks[2 * i + 1] * h)
120
+ cv2.circle(img, (point_x, point_y), tl+1, clors[i], -1)
121
+
122
+ tf = max(tl - 1, 1) # font thickness
123
+ label = str(conf)[:5]
124
+ cv2.putText(img, label, (x1, y1 - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
125
+ return img
126
+
127
+
128
+ def make_divisible(x, divisor):
129
+ # Returns x evenly divisible by divisor
130
+ return (x // divisor) * divisor
131
+
132
+
133
+ def non_max_suppression_face(prediction, conf_thres=0.5, iou_thres=0.45, classes=None, agnostic=False, labels=()):
134
+ """Performs Non-Maximum Suppression (NMS) on inference results
135
+ Returns:
136
+ detections with shape: nx16 (x1, y1, x2, y2, conf, 10 landmark coordinates, cls)
137
+ """
138
+
139
+ nc = prediction.shape[2] - 15 # number of classes
140
+ xc = prediction[..., 4] > conf_thres # candidates
141
+
142
+ # Settings
143
+ min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
144
+ # time_limit = 10.0 # seconds to quit after
145
+ redundant = True # require redundant detections
146
+ multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img)
147
+ merge = False # use merge-NMS
148
+
149
+ # t = time.time()
150
+ output = [torch.zeros((0, 16), device=prediction.device)] * prediction.shape[0]
151
+ for xi, x in enumerate(prediction): # image index, image inference
152
+ # Apply constraints
153
+ # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
154
+ x = x[xc[xi]] # confidence
155
+
156
+ # Cat apriori labels if autolabelling
157
+ if labels and len(labels[xi]):
158
+ l = labels[xi]
159
+ v = torch.zeros((len(l), nc + 15), device=x.device)
160
+ v[:, :4] = l[:, 1:5] # box
161
+ v[:, 4] = 1.0 # conf
162
+ v[range(len(l)), l[:, 0].long() + 15] = 1.0 # cls
163
+ x = torch.cat((x, v), 0)
164
+
165
+ # If none remain process next image
166
+ if not x.shape[0]:
167
+ continue
168
+
169
+ # Compute conf
170
+ x[:, 15:] *= x[:, 4:5] # conf = obj_conf * cls_conf
171
+
172
+ # Box (center x, center y, width, height) to (x1, y1, x2, y2)
173
+ box = xywh2xyxy(x[:, :4])
174
+
175
+ # Detections matrix nx6 (xyxy, conf, landmarks, cls)
176
+ if multi_label:
177
+ i, j = (x[:, 15:] > conf_thres).nonzero(as_tuple=False).T
178
+ x = torch.cat((box[i], x[i, j + 15, None], x[i, 5:15] ,j[:, None].float()), 1)
179
+ else: # best class only
180
+ conf, j = x[:, 15:].max(1, keepdim=True)
181
+ x = torch.cat((box, conf, x[:, 5:15], j.float()), 1)[conf.view(-1) > conf_thres]
182
+
183
+ # Filter by class
184
+ if classes is not None:
185
+ x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
186
+
187
+ # If none remain process next image
188
+ n = x.shape[0] # number of boxes
189
+ if not n:
190
+ continue
191
+
192
+ # Batched NMS
193
+ c = x[:, 15:16] * (0 if agnostic else max_wh) # classes
194
+ boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
195
+ i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
196
+ #if i.shape[0] > max_det: # limit detections
197
+ # i = i[:max_det]
198
+ if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
199
+ # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
200
+ iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
201
+ weights = iou * scores[None] # box weights
202
+ x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
203
+ if redundant:
204
+ i = i[iou.sum(1) > 1] # require redundancy
205
+
206
+ output[xi] = x[i]
207
+ # if (time.time() - t) > time_limit:
208
+ # break # time limit exceeded
209
+
210
+ return output
211
+
212
+
213
+ class YoloFace():
214
+ def __init__(self, pt_path='checkpoints/yolov5m-face.pt', confThreshold=0.5, nmsThreshold=0.45, device='cuda'):
215
+ assert os.path.exists(pt_path)
216
+
217
+ self.inpSize = 416
218
+ self.conf_thres = confThreshold
219
+ self.iou_thres = nmsThreshold
220
+ self.test_device = torch.device(device if torch.cuda.is_available() else "cpu")
221
+ self.model = torch.jit.load(pt_path).to(self.test_device)
222
+ self.last_w = 416
223
+ self.last_h = 416
224
+ self.grids = None
225
+
226
+ @torch.no_grad()
227
+ def detect(self, srcimg):
228
+ # t0=time.time()
229
+
230
+ h0, w0 = srcimg.shape[:2] # orig hw
231
+ r = self.inpSize / min(h0, w0) # resize image to img_size
232
+ h1 = int(h0*r+31)//32*32
233
+ w1 = int(w0*r+31)//32*32
234
+
235
+ img = cv2.resize(srcimg, (w1,h1), interpolation=cv2.INTER_LINEAR)
236
+
237
+ # Convert
238
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # BGR to RGB
239
+
240
+ # Run inference
241
+ img = torch.from_numpy(img).to(self.test_device).permute(2,0,1)
242
+ img = img.float()/255 # uint8 to fp16/32 0-1
243
+ if img.ndimension() == 3:
244
+ img = img.unsqueeze(0)
245
+
246
+ # Inference
247
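+ # rebuild the per-stride (8/16/32) coordinate grids only when the padded input size changes;
+ # they are cached in self.grids and passed to the TorchScript model on every call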
+ if h1 != self.last_h or w1 != self.last_w or self.grids is None:
248
+ grids = []
249
+ for scale in [8,16,32]:
250
+ ny = h1//scale
251
+ nx = w1//scale
252
+ yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
253
+ grid = torch.stack((xv, yv), 2).view((1,1,ny, nx, 2)).float()
254
+ grids.append(grid.to(self.test_device))
255
+ self.grids = grids
256
+ self.last_w = w1
257
+ self.last_h = h1
258
+
259
+ pred = self.model(img, self.grids).cpu()
260
+
261
+ # Apply NMS
262
+ det = non_max_suppression_face(pred, self.conf_thres, self.iou_thres)[0]
263
+ # Process detections
264
+ # det = pred[0]
265
+ bboxes = np.zeros((det.shape[0], 4))
266
+ kpss = np.zeros((det.shape[0], 5, 2))
267
+ scores = np.zeros((det.shape[0]))
268
+ # gn = torch.tensor([w0, h0, w0, h0]).to(pred) # normalization gain whwh
269
+ # gn_lks = torch.tensor([w0, h0, w0, h0, w0, h0, w0, h0, w0, h0]).to(pred) # normalization gain landmarks
270
+ det = det.cpu().numpy()
271
+
272
+ for j in range(det.shape[0]):
273
+ # xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(4).cpu().numpy()
274
+ bboxes[j, 0] = det[j, 0] * w0/w1
275
+ bboxes[j, 1] = det[j, 1] * h0/h1
276
+ bboxes[j, 2] = det[j, 2] * w0/w1 - bboxes[j, 0]
277
+ bboxes[j, 3] = det[j, 3] * h0/h1 - bboxes[j, 1]
278
+ scores[j] = det[j, 4]
279
+ # landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(5,2).cpu().numpy()
280
+ kpss[j, :, :] = det[j, 5:15].reshape(5, 2) * np.array([[w0/w1,h0/h1]])
281
+ # class_num = det[j, 15].cpu().numpy()
282
+ # orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
283
+ return bboxes, kpss, scores
284
+
285
+
286
+
287
+ if __name__ == '__main__':
288
+ import time
289
+
290
+ imgpath = 'test.png'
291
+
292
+ yoloface = YoloFace(pt_path='../checkpoints/yoloface_v5m.pt')
293
+ srcimg = cv2.imread(imgpath)
294
+
295
+ # warm-up runs
296
+ bboxes, kpss, scores = yoloface.detect(srcimg)
297
+ bboxes, kpss, scores = yoloface.detect(srcimg)
298
+ bboxes, kpss, scores = yoloface.detect(srcimg)
299
+
300
+ t1 = time.time()
301
+ for _ in range(10):
302
+ bboxes, kpss, scores = yoloface.detect(srcimg)
303
+ t2 = time.time()
304
+ print('total time: {} ms'.format((t2 - t1) * 1000))
305
+ for i in range(bboxes.shape[0]):
306
+ xmin, ymin, xmax, ymax = int(bboxes[i, 0]), int(bboxes[i, 1]), int(bboxes[i, 0] + bboxes[i, 2]), int(bboxes[i, 1] + bboxes[i, 3])
+ cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), (0, 0, 255), thickness=2)
308
+ for j in range(5):
309
+ cv2.circle(srcimg, (int(kpss[i, j, 0]), int(kpss[i, j, 1])), 1, (0, 255, 0), thickness=5)
310
+ cv2.imwrite('test_yoloface.jpg', srcimg)
src/models/id_proj.py ADDED
@@ -0,0 +1,20 @@
1
+
2
+ import torch
3
+ from diffusers import ModelMixin
4
+ from einops import rearrange
5
+ from torch import nn
6
+
7
+ class IDProjConvModel(ModelMixin):
8
+ def __init__(self, in_channels=2048, out_channels=1024):
9
+ super().__init__()
10
+
11
+ self.project1024 = torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False)
12
+ self.final_norm = torch.nn.LayerNorm(out_channels)
13
+
14
+ def forward(self, src_id_features_7_7_1024):
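+ # Project a spatial identity feature map (in infer.py, the 7x7x512 ArcFace layer4 output) to a
+ # sequence of H*W tokens of width `out_channels` via 1x1 conv, flatten, transpose and LayerNorm;
+ # the result is passed to the pipeline as `encoder_hidden_states`.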
15
+ c = self.project1024(src_id_features_7_7_1024)
16
+ c = torch.flatten(c, 2)
17
+ c = torch.transpose(c, 2, 1)
18
+ c = self.final_norm(c)
19
+
20
+ return c
src/models/model_insightface_360k.py ADDED
@@ -0,0 +1,203 @@
1
+ import os
2
+ import torch
3
+ from torch import nn
4
+
5
+ __all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200', 'getarcface']
6
+
7
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
8
+ """3x3 convolution with padding"""
9
+ return nn.Conv2d(in_planes,
10
+ out_planes,
11
+ kernel_size=3,
12
+ stride=stride,
13
+ padding=dilation,
14
+ groups=groups,
15
+ bias=False,
16
+ dilation=dilation)
17
+
18
+
19
+ def conv1x1(in_planes, out_planes, stride=1):
20
+ """1x1 convolution"""
21
+ return nn.Conv2d(in_planes,
22
+ out_planes,
23
+ kernel_size=1,
24
+ stride=stride,
25
+ bias=False)
26
+
27
+
28
+ class IBasicBlock(nn.Module):
29
+ expansion = 1
30
+ def __init__(self, inplanes, planes, stride=1, downsample=None,
31
+ groups=1, base_width=64, dilation=1):
32
+ super(IBasicBlock, self).__init__()
33
+ if groups != 1 or base_width != 64:
34
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
35
+ if dilation > 1:
36
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
37
+ self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,)
38
+ self.conv1 = conv3x3(inplanes, planes)
39
+ self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,)
40
+ self.prelu = nn.PReLU(planes)
41
+ self.conv2 = conv3x3(planes, planes, stride)
42
+ self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,)
43
+ self.downsample = downsample
44
+ self.stride = stride
45
+
46
+ def forward(self, x):
47
+ identity = x
48
+ out = self.bn1(x)
49
+ out = self.conv1(out)
50
+ out = self.bn2(out)
51
+ out = self.prelu(out)
52
+ out = self.conv2(out)
53
+ out = self.bn3(out)
54
+ if self.downsample is not None:
55
+ identity = self.downsample(x)
56
+ out += identity
57
+ return out
58
+
59
+
60
+ class IResNet(nn.Module):
61
+ fc_scale = 7 * 7
62
+ def __init__(self,
63
+ block, layers, dropout=0, num_features=512, zero_init_residual=False,
64
+ groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
65
+ super(IResNet, self).__init__()
66
+ self.fp16 = fp16
67
+ self.inplanes = 64
68
+ self.dilation = 1
69
+ if replace_stride_with_dilation is None:
70
+ replace_stride_with_dilation = [False, False, False]
71
+ if len(replace_stride_with_dilation) != 3:
72
+ raise ValueError("replace_stride_with_dilation should be None "
73
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
74
+ self.groups = groups
75
+ self.base_width = width_per_group
76
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
77
+ self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
78
+ self.prelu = nn.PReLU(self.inplanes)
79
+ self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
80
+ self.layer2 = self._make_layer(block,
81
+ 128,
82
+ layers[1],
83
+ stride=2,
84
+ dilate=replace_stride_with_dilation[0])
85
+ self.layer3 = self._make_layer(block,
86
+ 256,
87
+ layers[2],
88
+ stride=2,
89
+ dilate=replace_stride_with_dilation[1])
90
+ self.layer4 = self._make_layer(block,
91
+ 512,
92
+ layers[3],
93
+ stride=2,
94
+ dilate=replace_stride_with_dilation[2])
95
+ self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,)
96
+ self.dropout = nn.Dropout(p=dropout, inplace=True)
97
+ self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
98
+ self.features = nn.BatchNorm1d(num_features, eps=1e-05)
99
+ nn.init.constant_(self.features.weight, 1.0)
100
+ self.features.weight.requires_grad = False
101
+
102
+ for m in self.modules():
103
+ if isinstance(m, nn.Conv2d):
104
+ nn.init.normal_(m.weight, 0, 0.1)
105
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
106
+ nn.init.constant_(m.weight, 1)
107
+ nn.init.constant_(m.bias, 0)
108
+
109
+ if zero_init_residual:
110
+ for m in self.modules():
111
+ if isinstance(m, IBasicBlock):
112
+ nn.init.constant_(m.bn2.weight, 0)
113
+
114
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
115
+ downsample = None
116
+ previous_dilation = self.dilation
117
+ if dilate:
118
+ self.dilation *= stride
119
+ stride = 1
120
+ if stride != 1 or self.inplanes != planes * block.expansion:
121
+ downsample = nn.Sequential(
122
+ conv1x1(self.inplanes, planes * block.expansion, stride),
123
+ nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
124
+ )
125
+ layers = []
126
+ layers.append(
127
+ block(self.inplanes, planes, stride, downsample, self.groups,
128
+ self.base_width, previous_dilation))
129
+ self.inplanes = planes * block.expansion
130
+ for _ in range(1, blocks):
131
+ layers.append(
132
+ block(self.inplanes,
133
+ planes,
134
+ groups=self.groups,
135
+ base_width=self.base_width,
136
+ dilation=self.dilation))
137
+
138
+ return nn.Sequential(*layers)
139
+
140
+ def forward(self, x):
141
+ # with torch.cuda.amp.autocast(self.fp16):
142
+ x = self.conv1(x)
143
+ x = self.bn1(x)
144
+ x = self.prelu(x)
145
+ x = self.layer1(x)
146
+ x = self.layer2(x)
147
+ x = self.layer3(x)
148
+ x = self.layer4(x)
149
+ layer4_res = x
150
+ x = self.bn2(x)
151
+ x = torch.flatten(x, 1)
152
+ x = self.dropout(x)
153
+ x = self.fc(x.float() if self.fp16 else x)
154
+ y = self.features(x)
155
+ return y,layer4_res
156
+
157
+
158
+ def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
159
+ model = IResNet(block, layers, **kwargs)
160
+ if pretrained:
161
+ raise ValueError()
162
+ return model
163
+
164
+
165
+ def iresnet18(pretrained=False, progress=True, **kwargs):
166
+ return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained,
167
+ progress, **kwargs)
168
+
169
+
170
+ def iresnet34(pretrained=False, progress=True, **kwargs):
171
+ return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained,
172
+ progress, **kwargs)
173
+
174
+
175
+ def iresnet50(pretrained=False, progress=True, **kwargs):
176
+ return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained,
177
+ progress, **kwargs)
178
+
179
+
180
+ def iresnet100(pretrained=False, progress=True, **kwargs):
181
+ return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained,
182
+ progress, **kwargs)
183
+
184
+
185
+ def iresnet200(pretrained=False, progress=True, **kwargs):
186
+ return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained,
187
+ progress, **kwargs)
188
+
189
+
190
+ def getarcface(pretrained=None):
191
+ model = iresnet100()
192
+ for param in model.parameters():
193
+ param.requires_grad=False
194
+
195
+ if pretrained is not None and os.path.exists(pretrained):
196
+ info = model.load_state_dict(torch.load(pretrained, map_location=lambda storage, loc: storage))
197
+ # print('insightface_glint360k', info)
198
+ return model.eval()
199
+
200
+
201
+ if __name__=='__main__':
202
+ ckpt = 'pretrained/insightface_glint360k.pth'
203
+ arcface = getarcface(ckpt)
src/models/svfr_adapter/attention_processor.py ADDED
@@ -0,0 +1,616 @@
1
+ import inspect
2
+ import math
3
+ from typing import Callable, List, Optional, Union
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+ from diffusers.image_processor import IPAdapterMaskProcessor
10
+ from diffusers.utils import deprecate, logging
11
+ from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
12
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
13
+ from diffusers.models.attention_processor import Attention
14
+
15
+
16
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
17
+
18
+ if is_torch_npu_available():
19
+ import torch_npu
20
+
21
+ if is_xformers_available():
22
+ import xformers
23
+ import xformers.ops
24
+ else:
25
+ xformers = None
26
+
27
+ class AttnProcessor:
28
+ r"""
29
+ Default processor for performing attention-related computations.
30
+ """
31
+
32
+ def __call__(
33
+ self,
34
+ attn: Attention,
35
+ hidden_states: torch.Tensor,
36
+ encoder_hidden_states: Optional[torch.Tensor] = None,
37
+ attention_mask: Optional[torch.Tensor] = None,
38
+ temb: Optional[torch.Tensor] = None,
39
+ *args,
40
+ **kwargs,
41
+ ) -> torch.Tensor:
42
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
43
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
44
+ deprecate("scale", "1.0.0", deprecation_message)
45
+
46
+ residual = hidden_states
47
+
48
+ if attn.spatial_norm is not None:
49
+ hidden_states = attn.spatial_norm(hidden_states, temb)
50
+
51
+ input_ndim = hidden_states.ndim
52
+
53
+ if input_ndim == 4:
54
+ batch_size, channel, height, width = hidden_states.shape
55
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
56
+
57
+ batch_size, sequence_length, _ = (
58
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
59
+ )
60
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
61
+
62
+ if attn.group_norm is not None:
63
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
64
+
65
+ query = attn.to_q(hidden_states)
66
+
67
+ if encoder_hidden_states is None:
68
+ encoder_hidden_states = hidden_states
69
+ elif attn.norm_cross:
70
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
71
+
72
+ key = attn.to_k(encoder_hidden_states)
73
+ value = attn.to_v(encoder_hidden_states)
74
+
75
+ query = attn.head_to_batch_dim(query)
76
+ key = attn.head_to_batch_dim(key)
77
+ value = attn.head_to_batch_dim(value)
78
+
79
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
80
+ hidden_states = torch.bmm(attention_probs, value)
81
+ hidden_states = attn.batch_to_head_dim(hidden_states)
82
+
83
+ # linear proj
84
+ hidden_states = attn.to_out[0](hidden_states)
85
+ # dropout
86
+ hidden_states = attn.to_out[1](hidden_states)
87
+
88
+ if input_ndim == 4:
89
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
90
+
91
+ if attn.residual_connection:
92
+ hidden_states = hidden_states + residual
93
+
94
+ hidden_states = hidden_states / attn.rescale_output_factor
95
+
96
+ return hidden_states
97
+
98
+ class AttnProcessor2_0(nn.Module):
99
+ r"""
100
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
101
+ """
102
+
103
+ def __init__(self):
104
+ super().__init__()
105
+ if not hasattr(F, "scaled_dot_product_attention"):
106
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
107
+
108
+ def __call__(
109
+ self,
110
+ attn: Attention,
111
+ hidden_states: torch.Tensor,
112
+ encoder_hidden_states: Optional[torch.Tensor] = None,
113
+ attention_mask: Optional[torch.Tensor] = None,
114
+ temb: Optional[torch.Tensor] = None,
115
+ ip_adapter_masks: Optional[torch.Tensor] = None,
116
+ *args,
117
+ **kwargs,
118
+ ) -> torch.Tensor:
119
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
120
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
121
+ deprecate("scale", "1.0.0", deprecation_message)
122
+
123
+ residual = hidden_states
124
+ if attn.spatial_norm is not None:
125
+ hidden_states = attn.spatial_norm(hidden_states, temb)
126
+
127
+ input_ndim = hidden_states.ndim
128
+
129
+ if input_ndim == 4:
130
+ batch_size, channel, height, width = hidden_states.shape
131
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
132
+
133
+ batch_size, sequence_length, _ = (
134
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
135
+ )
136
+
137
+ if attention_mask is not None:
138
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
139
+ # scaled_dot_product_attention expects attention_mask shape to be
140
+ # (batch, heads, source_length, target_length)
141
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
142
+
143
+ if attn.group_norm is not None:
144
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
145
+
146
+ query = attn.to_q(hidden_states)
147
+
148
+ if encoder_hidden_states is None:
149
+ encoder_hidden_states = hidden_states
150
+ elif attn.norm_cross:
151
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
152
+
153
+ key = attn.to_k(encoder_hidden_states)
154
+ value = attn.to_v(encoder_hidden_states)
155
+
156
+ inner_dim = key.shape[-1]
157
+ head_dim = inner_dim // attn.heads
158
+
159
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
160
+
161
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
162
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
163
+
164
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
165
+ # TODO: add support for attn.scale when we move to Torch 2.1
166
+ hidden_states = F.scaled_dot_product_attention(
167
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
168
+ )
169
+
170
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
171
+ hidden_states = hidden_states.to(query.dtype)
172
+
173
+ # linear proj
174
+ hidden_states = attn.to_out[0](hidden_states)
175
+ # dropout
176
+ hidden_states = attn.to_out[1](hidden_states)
177
+
178
+ if input_ndim == 4:
179
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
180
+
181
+ if attn.residual_connection:
182
+ hidden_states = hidden_states + residual
183
+
184
+ hidden_states = hidden_states / attn.rescale_output_factor
185
+
186
+ return hidden_states
187
+
188
+ class IPAdapterAttnProcessor(nn.Module):
189
+ r"""
190
+ Attention processor for Multiple IP-Adapters.
191
+
192
+ Args:
193
+ hidden_size (`int`):
194
+ The hidden size of the attention layer.
195
+ cross_attention_dim (`int`):
196
+ The number of channels in the `encoder_hidden_states`.
197
+ num_tokens (`int`, `Tuple[int]` or `List[int]`, defaults to `(4,)`):
198
+ The context length of the image features.
199
+ scale (`float` or List[`float`], defaults to 1.0):
200
+ the weight scale of image prompt.
201
+ """
202
+
203
+ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0):
204
+ super().__init__()
205
+
206
+ self.hidden_size = hidden_size
207
+ self.cross_attention_dim = cross_attention_dim
208
+
209
+ if not isinstance(num_tokens, (tuple, list)):
210
+ num_tokens = [num_tokens]
211
+ self.num_tokens = num_tokens
212
+
213
+ if not isinstance(scale, list):
214
+ scale = [scale] * len(num_tokens)
215
+ if len(scale) != len(num_tokens):
216
+ raise ValueError("`scale` should be a list of integers with the same length as `num_tokens`.")
217
+ self.scale = scale
218
+
219
+ self.to_k_ip = nn.ModuleList(
220
+ [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
221
+ )
222
+ self.to_v_ip = nn.ModuleList(
223
+ [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
224
+ )
225
+
226
+ def __call__(
227
+ self,
228
+ attn: Attention,
229
+ hidden_states: torch.Tensor,
230
+ encoder_hidden_states: Optional[torch.Tensor] = None,
231
+ attention_mask: Optional[torch.Tensor] = None,
232
+ temb: Optional[torch.Tensor] = None,
233
+ scale: float = 1.0,
234
+ ip_adapter_masks: Optional[torch.Tensor] = None,
235
+ ):
236
+ residual = hidden_states
237
+
238
+ # separate ip_hidden_states from encoder_hidden_states
239
+ if encoder_hidden_states is not None:
240
+ if isinstance(encoder_hidden_states, tuple):
241
+ encoder_hidden_states, ip_hidden_states = encoder_hidden_states
242
+ else:
243
+ deprecation_message = (
244
+ "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
245
+ " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
246
+ )
247
+ deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
248
+ end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
249
+ encoder_hidden_states, ip_hidden_states = (
250
+ encoder_hidden_states[:, :end_pos, :],
251
+ [encoder_hidden_states[:, end_pos:, :]],
252
+ )
253
+
254
+ if attn.spatial_norm is not None:
255
+ hidden_states = attn.spatial_norm(hidden_states, temb)
256
+
257
+ input_ndim = hidden_states.ndim
258
+
259
+ if input_ndim == 4:
260
+ batch_size, channel, height, width = hidden_states.shape
261
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
262
+
263
+ batch_size, sequence_length, _ = (
264
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
265
+ )
266
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
267
+
268
+ if attn.group_norm is not None:
269
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
270
+
271
+ query = attn.to_q(hidden_states)
272
+
273
+ if encoder_hidden_states is None:
274
+ encoder_hidden_states = hidden_states
275
+ elif attn.norm_cross:
276
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
277
+
278
+ key = attn.to_k(encoder_hidden_states)
279
+ value = attn.to_v(encoder_hidden_states)
280
+
281
+ query = attn.head_to_batch_dim(query)
282
+ key = attn.head_to_batch_dim(key)
283
+ value = attn.head_to_batch_dim(value)
284
+
285
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
286
+ hidden_states = torch.bmm(attention_probs, value)
287
+ hidden_states = attn.batch_to_head_dim(hidden_states)
288
+
289
+ if ip_adapter_masks is not None:
290
+ if not isinstance(ip_adapter_masks, List):
291
+ # for backward compatibility, we accept `ip_adapter_mask` as a tensor of shape [num_ip_adapter, 1, height, width]
292
+ ip_adapter_masks = list(ip_adapter_masks.unsqueeze(1))
293
+ if not (len(ip_adapter_masks) == len(self.scale) == len(ip_hidden_states)):
294
+ raise ValueError(
295
+ f"Length of ip_adapter_masks array ({len(ip_adapter_masks)}) must match "
296
+ f"length of self.scale array ({len(self.scale)}) and number of ip_hidden_states "
297
+ f"({len(ip_hidden_states)})"
298
+ )
299
+ else:
300
+ for index, (mask, scale, ip_state) in enumerate(zip(ip_adapter_masks, self.scale, ip_hidden_states)):
301
+ if not isinstance(mask, torch.Tensor) or mask.ndim != 4:
302
+ raise ValueError(
303
+ "Each element of the ip_adapter_masks array should be a tensor with shape "
304
+ "[1, num_images_for_ip_adapter, height, width]."
305
+ " Please use `IPAdapterMaskProcessor` to preprocess your mask"
306
+ )
307
+ if mask.shape[1] != ip_state.shape[1]:
308
+ raise ValueError(
309
+ f"Number of masks ({mask.shape[1]}) does not match "
310
+ f"number of ip images ({ip_state.shape[1]}) at index {index}"
311
+ )
312
+ if isinstance(scale, list) and not len(scale) == mask.shape[1]:
313
+ raise ValueError(
314
+ f"Number of masks ({mask.shape[1]}) does not match "
315
+ f"number of scales ({len(scale)}) at index {index}"
316
+ )
317
+ else:
318
+ ip_adapter_masks = [None] * len(self.scale)
319
+
320
+ # for ip-adapter
321
+ for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
322
+ ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
323
+ ):
324
+ skip = False
325
+ if isinstance(scale, list):
326
+ if all(s == 0 for s in scale):
327
+ skip = True
328
+ elif scale == 0:
329
+ skip = True
330
+ if not skip:
331
+ if mask is not None:
332
+ if not isinstance(scale, list):
333
+ scale = [scale] * mask.shape[1]
334
+
335
+ current_num_images = mask.shape[1]
336
+ for i in range(current_num_images):
337
+ ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
338
+ ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
339
+
340
+ ip_key = attn.head_to_batch_dim(ip_key)
341
+ ip_value = attn.head_to_batch_dim(ip_value)
342
+
343
+ ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
344
+ _current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
345
+ _current_ip_hidden_states = attn.batch_to_head_dim(_current_ip_hidden_states)
346
+
347
+ mask_downsample = IPAdapterMaskProcessor.downsample(
348
+ mask[:, i, :, :],
349
+ batch_size,
350
+ _current_ip_hidden_states.shape[1],
351
+ _current_ip_hidden_states.shape[2],
352
+ )
353
+
354
+ mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
355
+
356
+ hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
357
+ else:
358
+ ip_key = to_k_ip(current_ip_hidden_states)
359
+ ip_value = to_v_ip(current_ip_hidden_states)
360
+
361
+ ip_key = attn.head_to_batch_dim(ip_key)
362
+ ip_value = attn.head_to_batch_dim(ip_value)
363
+
364
+ ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
365
+ current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
366
+ current_ip_hidden_states = attn.batch_to_head_dim(current_ip_hidden_states)
367
+
368
+ hidden_states = hidden_states + scale * current_ip_hidden_states
369
+
370
+ # linear proj
371
+ hidden_states = attn.to_out[0](hidden_states)
372
+ # dropout
373
+ hidden_states = attn.to_out[1](hidden_states)
374
+
375
+ if input_ndim == 4:
376
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
377
+
378
+ if attn.residual_connection:
379
+ hidden_states = hidden_states + residual
380
+
381
+ hidden_states = hidden_states / attn.rescale_output_factor
382
+
383
+ return hidden_states
384
+
385
+
386
+ class IPAdapterAttnProcessor2_0(torch.nn.Module):
387
+ r"""
388
+ Attention processor for IP-Adapter for PyTorch 2.0.
389
+
390
+ Args:
391
+ hidden_size (`int`):
392
+ The hidden size of the attention layer.
393
+ cross_attention_dim (`int`):
394
+ The number of channels in the `encoder_hidden_states`.
395
+ num_tokens (`int`, `Tuple[int]` or `List[int]`, defaults to `(4,)`):
396
+ The context length of the image features.
397
+ scale (`float` or `List[float]`, defaults to 1.0):
398
+ the weight scale of image prompt.
399
+ """
400
+
401
+ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0):
402
+ super().__init__()
403
+
404
+ if not hasattr(F, "scaled_dot_product_attention"):
405
+ raise ImportError(
406
+ f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
407
+ )
408
+
409
+ self.hidden_size = hidden_size
410
+ self.cross_attention_dim = cross_attention_dim
411
+
412
+ if not isinstance(num_tokens, (tuple, list)):
413
+ num_tokens = [num_tokens]
414
+ self.num_tokens = num_tokens
415
+
416
+ if not isinstance(scale, list):
417
+ scale = [scale] * len(num_tokens)
418
+ if len(scale) != len(num_tokens):
419
+ raise ValueError("`scale` should be a list of floats with the same length as `num_tokens`.")
420
+ self.scale = scale
421
+
422
+ self.to_k_ip = nn.ModuleList(
423
+ [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
424
+ )
425
+ self.to_v_ip = nn.ModuleList(
426
+ [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))]
427
+ )
428
+
429
+ def __call__(
430
+ self,
431
+ attn: Attention,
432
+ hidden_states: torch.Tensor,
433
+ encoder_hidden_states: Optional[torch.Tensor] = None,
434
+ attention_mask: Optional[torch.Tensor] = None,
435
+ temb: Optional[torch.Tensor] = None,
436
+ scale: float = 1.0,
437
+ ip_adapter_masks: Optional[torch.Tensor] = None,
438
+ ):
439
+ residual = hidden_states
440
+
441
+ # separate ip_hidden_states from encoder_hidden_states
442
+ if encoder_hidden_states is not None:
443
+ if isinstance(encoder_hidden_states, tuple):
444
+ encoder_hidden_states, ip_hidden_states = encoder_hidden_states
445
+
446
+ else:
447
+ deprecation_message = (
448
+ "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
449
+ " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
450
+ )
451
+ deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
452
+ end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
453
+ encoder_hidden_states, ip_hidden_states = (
454
+ encoder_hidden_states[:, :end_pos, :],
455
+ [encoder_hidden_states[:, end_pos:, :]],
456
+ )
457
+
458
+ if attn.spatial_norm is not None:
459
+ hidden_states = attn.spatial_norm(hidden_states, temb)
460
+
461
+ input_ndim = hidden_states.ndim
462
+
463
+ if input_ndim == 4:
464
+ batch_size, channel, height, width = hidden_states.shape
465
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
466
+
467
+ batch_size, sequence_length, _ = (
468
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
469
+ )
470
+
471
+ if attention_mask is not None:
472
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
473
+ # scaled_dot_product_attention expects attention_mask shape to be
474
+ # (batch, heads, source_length, target_length)
475
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
476
+
477
+ if attn.group_norm is not None:
478
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
479
+
480
+ query = attn.to_q(hidden_states)
481
+
482
+ if encoder_hidden_states is None:
483
+ encoder_hidden_states = hidden_states
484
+ elif attn.norm_cross:
485
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
486
+
487
+ key = attn.to_k(encoder_hidden_states)
488
+ value = attn.to_v(encoder_hidden_states)
489
+
490
+ inner_dim = key.shape[-1]
491
+ head_dim = inner_dim // attn.heads
492
+
493
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
494
+
495
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
496
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
497
+
498
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
499
+ # TODO: add support for attn.scale when we move to Torch 2.1
500
+ hidden_states = F.scaled_dot_product_attention(
501
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
502
+ )
503
+
504
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
505
+ hidden_states = hidden_states.to(query.dtype)
506
+
507
+ if ip_adapter_masks is not None:
508
+ if not isinstance(ip_adapter_masks, List):
509
+ # for backward compatibility, we accept `ip_adapter_mask` as a tensor of shape [num_ip_adapter, 1, height, width]
510
+ ip_adapter_masks = list(ip_adapter_masks.unsqueeze(1))
511
+ if not (len(ip_adapter_masks) == len(self.scale) == len(ip_hidden_states)):
512
+ raise ValueError(
513
+ f"Length of ip_adapter_masks array ({len(ip_adapter_masks)}) must match "
514
+ f"length of self.scale array ({len(self.scale)}) and number of ip_hidden_states "
515
+ f"({len(ip_hidden_states)})"
516
+ )
517
+ else:
518
+ for index, (mask, scale, ip_state) in enumerate(zip(ip_adapter_masks, self.scale, ip_hidden_states)):
519
+ ip_hidden_states[index] = ip_state = ip_state.unsqueeze(1)
520
+ if not isinstance(mask, torch.Tensor) or mask.ndim != 4:
521
+ raise ValueError(
522
+ "Each element of the ip_adapter_masks array should be a tensor with shape "
523
+ "[1, num_images_for_ip_adapter, height, width]."
524
+ " Please use `IPAdapterMaskProcessor` to preprocess your mask"
525
+ )
526
+ if mask.shape[1] != ip_state.shape[1]:
527
+ raise ValueError(
528
+ f"Number of masks ({mask.shape[1]}) does not match "
529
+ f"number of ip images ({ip_state.shape[1]}) at index {index}"
530
+ )
531
+ if isinstance(scale, list) and not len(scale) == mask.shape[1]:
532
+ raise ValueError(
533
+ f"Number of masks ({mask.shape[1]}) does not match "
534
+ f"number of scales ({len(scale)}) at index {index}"
535
+ )
536
+ else:
537
+ ip_adapter_masks = [None] * len(self.scale)
538
+
539
+ # for ip-adapter
540
+ for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
541
+ ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
542
+ ):
543
+ skip = False
544
+ if isinstance(scale, list):
545
+ if all(s == 0 for s in scale):
546
+ skip = True
547
+ elif scale == 0:
548
+ skip = True
549
+ if not skip:
550
+ if mask is not None:
551
+ if not isinstance(scale, list):
552
+ scale = [scale] * mask.shape[1]
553
+
554
+ current_num_images = mask.shape[1]
555
+ for i in range(current_num_images):
556
+ ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
557
+ ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
558
+
559
+ ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
560
+ ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
561
+
562
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
563
+ # TODO: add support for attn.scale when we move to Torch 2.1
564
+ _current_ip_hidden_states = F.scaled_dot_product_attention(
565
+ query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
566
+ )
567
+
568
+ _current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape(
569
+ batch_size, -1, attn.heads * head_dim
570
+ )
571
+ _current_ip_hidden_states = _current_ip_hidden_states.to(query.dtype)
572
+
573
+ mask_downsample = IPAdapterMaskProcessor.downsample(
574
+ mask[:, i, :, :],
575
+ batch_size,
576
+ _current_ip_hidden_states.shape[1],
577
+ _current_ip_hidden_states.shape[2],
578
+ )
579
+ mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
580
+ hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
581
+
582
+ else:
583
+ ip_key = to_k_ip(current_ip_hidden_states)
584
+ ip_value = to_v_ip(current_ip_hidden_states)
585
+
586
+ ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
587
+ ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
588
+
589
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
590
+ # TODO: add support for attn.scale when we move to Torch 2.1
591
+ current_ip_hidden_states = F.scaled_dot_product_attention(
592
+ query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
593
+ )
594
+
595
+ current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
596
+ batch_size, -1, attn.heads * head_dim
597
+ )
598
+ current_ip_hidden_states = current_ip_hidden_states.to(query.dtype)
599
+
600
+ hidden_states = hidden_states + scale * current_ip_hidden_states
601
+
602
+
603
+ # linear proj
604
+ hidden_states = attn.to_out[0](hidden_states)
605
+ # dropout
606
+ hidden_states = attn.to_out[1](hidden_states)
607
+
608
+ if input_ndim == 4:
609
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
610
+
611
+ if attn.residual_connection:
612
+ hidden_states = hidden_states + residual
613
+
614
+ hidden_states = hidden_states / attn.rescale_output_factor
615
+
616
+ return hidden_states
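
The processors above expect `encoder_hidden_states` as a tuple `(context_embeddings, [image_prompt_embeddings, ...])`, with one image-prompt stream per adapter. Below is a minimal sketch of that calling convention; it is not part of the commit, pairs the processor with the stock diffusers `Attention` module, and uses illustrative shapes (the `2_0` variant requires PyTorch 2.0).

```py
import torch
from diffusers.models.attention_processor import Attention

from src.models.svfr_adapter.attention_processor import IPAdapterAttnProcessor2_0

hidden_size, cross_attention_dim, num_tokens = 320, 1024, 4

# Cross-attention layer whose default processor is swapped for the IP-Adapter variant.
attn = Attention(
    query_dim=hidden_size,
    cross_attention_dim=cross_attention_dim,
    heads=5,
    dim_head=64,
    processor=IPAdapterAttnProcessor2_0(
        hidden_size=hidden_size,
        cross_attention_dim=cross_attention_dim,
        num_tokens=(num_tokens,),
        scale=1.0,
    ),
)

latent_tokens = torch.randn(1, 64 * 64, hidden_size)           # spatial tokens of one frame
context = torch.randn(1, 77, cross_attention_dim)               # text/identity embeddings (illustrative)
image_prompt = torch.randn(1, num_tokens, cross_attention_dim)  # one image-prompt stream

# encoder_hidden_states is the tuple (context, [per-adapter image-prompt embeddings]).
out = attn(latent_tokens, encoder_hidden_states=(context, [image_prompt]))
print(out.shape)  # torch.Size([1, 4096, 320])
```

Setting `scale=0.0` makes the processor skip the image-prompt branch entirely, so the layer falls back to plain cross-attention on `context`.
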
src/models/svfr_adapter/unet_3d_blocks.py ADDED
The diff for this file is too large to render. See raw diff
 
src/models/svfr_adapter/unet_3d_svd_condition_ip.py ADDED
@@ -0,0 +1,536 @@
1
+ from dataclasses import dataclass
2
+ from typing import Dict, Optional, Tuple, Union, Any
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.loaders import UNet2DConditionLoadersMixin
10
+ from diffusers.utils import BaseOutput, logging
11
+ from diffusers.models.attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor
12
+
13
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
14
+ from diffusers.models.modeling_utils import ModelMixin
15
+ from src.models.svfr_adapter.unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block, get_up_block
16
+ from src.models.svfr_adapter.attention_processor import AttnProcessor2_0, AttnProcessor, IPAdapterAttnProcessor2_0, IPAdapterAttnProcessor
17
+
18
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
19
+
20
+ @dataclass
21
+ class UNet3DConditionSVDOutput(BaseOutput):
22
+ """
23
+ The output of [`UNet3DConditionSVDModel`].
24
+
25
+ Args:
26
+ sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
27
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
28
+ """
29
+
30
+ sample: torch.FloatTensor = None
31
+
32
+
33
+ class UNet3DConditionSVDModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
34
+ r"""
35
+ A conditional Spatio-Temporal UNet model that takes noisy video frames, a conditional state, and a timestep, and returns a
36
+ sample-shaped output.
37
+
38
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
39
+ for all models (such as downloading or saving).
40
+
41
+ Parameters:
42
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
43
+ Height and width of input/output sample.
44
+ in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
45
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
46
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
47
+ The tuple of downsample blocks to use.
48
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
49
+ The tuple of upsample blocks to use.
50
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
51
+ The tuple of output channels for each block.
52
+ addition_time_embed_dim: (`int`, defaults to 256):
53
+ Dimension used to encode the additional time ids.
54
+ projection_class_embeddings_input_dim (`int`, defaults to 768):
55
+ The dimension of the projection of encoded `added_time_ids`.
56
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
57
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
58
+ The dimension of the cross attention features.
59
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
60
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
61
+ [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
62
+ [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
63
+ num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
64
+ The number of attention heads.
65
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
66
+ """
67
+
68
+ _supports_gradient_checkpointing = True
69
+
70
+ @register_to_config
71
+ def __init__(
72
+ self,
73
+ sample_size: Optional[int] = None,
74
+ in_channels: int = 8,
75
+ out_channels: int = 4,
76
+ down_block_types: Tuple[str] = (
77
+ "CrossAttnDownBlockSpatioTemporal",
78
+ "CrossAttnDownBlockSpatioTemporal",
79
+ "CrossAttnDownBlockSpatioTemporal",
80
+ "DownBlockSpatioTemporal",
81
+ ),
82
+ up_block_types: Tuple[str] = (
83
+ "UpBlockSpatioTemporal",
84
+ "CrossAttnUpBlockSpatioTemporal",
85
+ "CrossAttnUpBlockSpatioTemporal",
86
+ "CrossAttnUpBlockSpatioTemporal",
87
+ ),
88
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
89
+ addition_time_embed_dim: int = 256,
90
+ projection_class_embeddings_input_dim: int = 768,
91
+ layers_per_block: Union[int, Tuple[int]] = 2,
92
+ cross_attention_dim: Union[int, Tuple[int]] = 1024,
93
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
94
+ num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
95
+ num_frames: int = 25,
96
+ ):
97
+ super().__init__()
98
+
99
+ self.sample_size = sample_size
100
+
101
+ # Check inputs
102
+ if len(down_block_types) != len(up_block_types):
103
+ raise ValueError(
104
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
105
+ )
106
+
107
+ if len(block_out_channels) != len(down_block_types):
108
+ raise ValueError(
109
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
110
+ )
111
+
112
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
113
+ raise ValueError(
114
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
115
+ )
116
+
117
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
118
+ raise ValueError(
119
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
120
+ )
121
+
122
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
123
+ raise ValueError(
124
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
125
+ )
126
+
127
+ # input
128
+ self.conv_in = nn.Conv2d(
129
+ in_channels,
130
+ block_out_channels[0],
131
+ kernel_size=3,
132
+ padding=1,
133
+ )
134
+
135
+ # time
136
+ time_embed_dim = block_out_channels[0] * 4
137
+
138
+ self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
139
+ timestep_input_dim = block_out_channels[0]
140
+
141
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
142
+
143
+ self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
144
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
145
+
146
+ self.down_blocks = nn.ModuleList([])
147
+ self.up_blocks = nn.ModuleList([])
148
+
149
+ if isinstance(num_attention_heads, int):
150
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
151
+
152
+ if isinstance(cross_attention_dim, int):
153
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
154
+
155
+ if isinstance(layers_per_block, int):
156
+ layers_per_block = [layers_per_block] * len(down_block_types)
157
+
158
+ if isinstance(transformer_layers_per_block, int):
159
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
160
+
161
+ blocks_time_embed_dim = time_embed_dim
162
+
163
+ # down
164
+ output_channel = block_out_channels[0]
165
+ for i, down_block_type in enumerate(down_block_types):
166
+ input_channel = output_channel
167
+ output_channel = block_out_channels[i]
168
+ is_final_block = i == len(block_out_channels) - 1
169
+
170
+ down_block = get_down_block(
171
+ down_block_type,
172
+ num_layers=layers_per_block[i],
173
+ transformer_layers_per_block=transformer_layers_per_block[i],
174
+ in_channels=input_channel,
175
+ out_channels=output_channel,
176
+ temb_channels=blocks_time_embed_dim,
177
+ add_downsample=not is_final_block,
178
+ resnet_eps=1e-5,
179
+ cross_attention_dim=cross_attention_dim[i],
180
+ num_attention_heads=num_attention_heads[i],
181
+ resnet_act_fn="silu",
182
+ )
183
+ self.down_blocks.append(down_block)
184
+
185
+ # mid
186
+ self.mid_block = UNetMidBlockSpatioTemporal(
187
+ block_out_channels[-1],
188
+ temb_channels=blocks_time_embed_dim,
189
+ transformer_layers_per_block=transformer_layers_per_block[-1],
190
+ cross_attention_dim=cross_attention_dim[-1],
191
+ num_attention_heads=num_attention_heads[-1],
192
+ )
193
+
194
+ # count how many layers upsample the images
195
+ self.num_upsamplers = 0
196
+
197
+ # up
198
+ reversed_block_out_channels = list(reversed(block_out_channels))
199
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
200
+ reversed_layers_per_block = list(reversed(layers_per_block))
201
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
202
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
203
+
204
+ output_channel = reversed_block_out_channels[0]
205
+ for i, up_block_type in enumerate(up_block_types):
206
+ is_final_block = i == len(block_out_channels) - 1
207
+
208
+ prev_output_channel = output_channel
209
+ output_channel = reversed_block_out_channels[i]
210
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
211
+
212
+ # add upsample block for all BUT final layer
213
+ if not is_final_block:
214
+ add_upsample = True
215
+ self.num_upsamplers += 1
216
+ else:
217
+ add_upsample = False
218
+
219
+ up_block = get_up_block(
220
+ up_block_type,
221
+ num_layers=reversed_layers_per_block[i] + 1,
222
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
223
+ in_channels=input_channel,
224
+ out_channels=output_channel,
225
+ prev_output_channel=prev_output_channel,
226
+ temb_channels=blocks_time_embed_dim,
227
+ add_upsample=add_upsample,
228
+ resnet_eps=1e-5,
229
+ resolution_idx=i,
230
+ cross_attention_dim=reversed_cross_attention_dim[i],
231
+ num_attention_heads=reversed_num_attention_heads[i],
232
+ resnet_act_fn="silu",
233
+ )
234
+ self.up_blocks.append(up_block)
235
+ prev_output_channel = output_channel
236
+
237
+ # out
238
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
239
+ self.conv_act = nn.SiLU()
240
+
241
+ self.conv_out = nn.Conv2d(
242
+ block_out_channels[0],
243
+ out_channels,
244
+ kernel_size=3,
245
+ padding=1,
246
+ )
247
+
248
+ @property
249
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
250
+ r"""
251
+ Returns:
252
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
253
+ indexed by their weight names.
254
+ """
255
+ # set recursively
256
+ processors = {}
257
+
258
+ def fn_recursive_add_processors(
259
+ name: str,
260
+ module: torch.nn.Module,
261
+ processors: Dict[str, AttentionProcessor],
262
+ ):
263
+ if hasattr(module, "get_processor"):
264
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
265
+
266
+ for sub_name, child in module.named_children():
267
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
268
+
269
+ return processors
270
+
271
+ for name, module in self.named_children():
272
+ fn_recursive_add_processors(name, module, processors)
273
+
274
+ return processors
275
+
276
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
277
+ r"""
278
+ Sets the attention processor to use to compute attention.
279
+
280
+ Parameters:
281
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
282
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
283
+ for **all** `Attention` layers.
284
+
285
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
286
+ processor. This is strongly recommended when setting trainable attention processors.
287
+
288
+ """
289
+ count = len(self.attn_processors.keys())
290
+
291
+ if isinstance(processor, dict) and len(processor) != count:
292
+ raise ValueError(
293
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
294
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
295
+ )
296
+
297
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
298
+ if hasattr(module, "set_processor"):
299
+ if not isinstance(processor, dict):
300
+ module.set_processor(processor)
301
+ else:
302
+ module.set_processor(processor.pop(f"{name}.processor"))
303
+
304
+ for sub_name, child in module.named_children():
305
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
306
+
307
+ for name, module in self.named_children():
308
+ fn_recursive_attn_processor(name, module, processor)
309
+
310
+ def set_default_attn_processor(self):
311
+ """
312
+ Disables custom attention processors and sets the default attention implementation.
313
+ """
314
+ if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
315
+ processor = AttnProcessor()
316
+ else:
317
+ raise ValueError(
318
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
319
+ )
320
+
321
+ self.set_attn_processor(processor)
322
+
323
+ def _set_gradient_checkpointing(self, module, value=False):
324
+ if hasattr(module, "gradient_checkpointing"):
325
+ module.gradient_checkpointing = value
326
+
327
+ # Copied from diffusers.models.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
328
+ def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
329
+ """
330
+ Enables [feed forward
331
+ chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
332
+
333
+ Parameters:
334
+ chunk_size (`int`, *optional*):
335
+ The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
336
+ over each tensor of dim=`dim`.
337
+ dim (`int`, *optional*, defaults to `0`):
338
+ The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
339
+ or dim=1 (sequence length).
340
+ """
341
+ if dim not in [0, 1]:
342
+ raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
343
+
344
+ # By default chunk size is 1
345
+ chunk_size = chunk_size or 1
346
+
347
+ def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
348
+ if hasattr(module, "set_chunk_feed_forward"):
349
+ module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
350
+
351
+ for child in module.children():
352
+ fn_recursive_feed_forward(child, chunk_size, dim)
353
+
354
+ for module in self.children():
355
+ fn_recursive_feed_forward(module, chunk_size, dim)
356
+
357
+ def forward(
358
+ self,
359
+ sample: torch.FloatTensor,
360
+ timestep: Union[torch.Tensor, float, int],
361
+ encoder_hidden_states: torch.Tensor,
362
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
363
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
364
+ return_dict: bool = True,
365
+ added_time_ids: torch.Tensor=None,
366
+ pose_cond_fea: Optional[torch.Tensor] = None,
367
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
368
+ ) -> Union[UNet3DConditionSVDOutput, Tuple]:
369
+ r"""
370
+ The [`UNet3DConditionSVDModel`] forward method.
371
+
372
+ Args:
373
+ sample (`torch.FloatTensor`):
374
+ The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
375
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
376
+ encoder_hidden_states (`torch.FloatTensor`):
377
+ The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
378
+ added_time_ids: (`torch.FloatTensor`):
379
+ The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
380
+ embeddings and added to the time embeddings.
381
+ return_dict (`bool`, *optional*, defaults to `True`):
382
+ Whether or not to return a [`UNet3DConditionSVDOutput`] instead of a plain
383
+ tuple.
384
+ Returns:
385
+ [`UNet3DConditionSVDOutput`] or `tuple`:
387
+ If `return_dict` is True, a [`UNet3DConditionSVDOutput`] is returned, otherwise
387
+ a `tuple` is returned where the first element is the sample tensor.
388
+ """
389
+ # 1. time
390
+ timesteps = timestep
391
+ if not torch.is_tensor(timesteps):
392
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
393
+ # This would be a good case for the `match` statement (Python 3.10+)
394
+ is_mps = sample.device.type == "mps"
395
+ if isinstance(timestep, float):
396
+ dtype = torch.float32 if is_mps else torch.float64
397
+ else:
398
+ dtype = torch.int32 if is_mps else torch.int64
399
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
400
+ elif len(timesteps.shape) == 0:
401
+ timesteps = timesteps[None].to(sample.device)
402
+
403
+ batch_size, num_frames = sample.shape[:2]
404
+ timesteps = timesteps.expand(batch_size)
405
+
406
+ t_emb = self.time_proj(timesteps)
407
+ t_emb = t_emb.to(dtype=sample.dtype)
408
+ emb = self.time_embedding(t_emb)
409
+
410
+ time_embeds = self.add_time_proj(added_time_ids.flatten())
411
+ time_embeds = time_embeds.reshape((batch_size, -1))
412
+ time_embeds = time_embeds.to(emb.dtype)
413
+ aug_emb = self.add_embedding(time_embeds)
414
+ emb = emb + aug_emb
415
+
416
+ sample = sample.flatten(0, 1)
417
+ emb = emb.repeat_interleave(num_frames, dim=0)
418
+
419
+ # 2. pre-process
420
+ sample = self.conv_in(sample)
421
+
422
+ if pose_cond_fea is not None:
423
+ sample = sample + pose_cond_fea.flatten(0, 1)
424
+
425
+ image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
426
+
427
+ down_block_res_samples = (sample,)
428
+ for downsample_block in self.down_blocks:
429
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
430
+ sample, res_samples = downsample_block(
431
+ hidden_states=sample,
432
+ temb=emb,
433
+ encoder_hidden_states=encoder_hidden_states,
434
+ cross_attention_kwargs=cross_attention_kwargs,
435
+ image_only_indicator=image_only_indicator,
436
+ )
437
+ else:
438
+ sample, res_samples = downsample_block(
439
+ hidden_states=sample,
440
+ temb=emb,
441
+ image_only_indicator=image_only_indicator,
442
+ )
443
+
444
+ down_block_res_samples += res_samples
445
+
446
+
447
+ # 4. mid
448
+ sample = self.mid_block(
449
+ hidden_states=sample,
450
+ temb=emb,
451
+ encoder_hidden_states=encoder_hidden_states,
452
+ image_only_indicator=image_only_indicator,
453
+ cross_attention_kwargs=cross_attention_kwargs,
454
+
455
+ )
456
+
457
+
458
+ # 5. up
459
+ for i, upsample_block in enumerate(self.up_blocks):
460
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
461
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
462
+
463
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
464
+ sample = upsample_block(
465
+ hidden_states=sample,
466
+ temb=emb,
467
+ res_hidden_states_tuple=res_samples,
468
+ encoder_hidden_states=encoder_hidden_states,
469
+ cross_attention_kwargs=cross_attention_kwargs,
470
+ image_only_indicator=image_only_indicator,
471
+ )
472
+ else:
473
+ sample = upsample_block(
474
+ hidden_states=sample,
475
+ temb=emb,
476
+ res_hidden_states_tuple=res_samples,
477
+ image_only_indicator=image_only_indicator,
478
+ )
479
+
480
+ # 6. post-process
481
+ sample = self.conv_norm_out(sample)
482
+ sample = self.conv_act(sample)
483
+ sample = self.conv_out(sample)
484
+
485
+ # 7. Reshape back to original shape
486
+ sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
487
+
488
+ if not return_dict:
489
+ return (sample,)
490
+
491
+ return UNet3DConditionSVDOutput(sample=sample)
492
+
493
+
494
+
495
+ def init_ip_adapters(unet, num_adapter_embeds=[], scale=1.0):
496
+ # init adapter modules
497
+ attn_procs = {}
498
+ unet_sd = unet.state_dict()
499
+ for name in unet.attn_processors.keys():
500
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
501
+ if name.startswith("mid_block"):
502
+ hidden_size = unet.config.block_out_channels[-1]
503
+ elif name.startswith("up_blocks"):
504
+ block_id = int(name[len("up_blocks.")])
505
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
506
+ elif name.startswith("down_blocks"):
507
+ block_id = int(name[len("down_blocks.")])
508
+ hidden_size = unet.config.block_out_channels[block_id]
509
+ # if cross_attention_dim is None or "temporal_transformer_blocks" in name:
510
+ if cross_attention_dim is None:
511
+ attn_processor_class = (
512
+ AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
513
+ )
514
+ attn_procs[name] = attn_processor_class()
515
+ else:
516
+ attn_processor_class = (
517
+ IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
518
+ )
519
+
520
+ attn_procs[name] = attn_processor_class(
521
+ hidden_size=hidden_size,
522
+ cross_attention_dim=cross_attention_dim,
523
+ num_tokens=num_adapter_embeds,
524
+ scale=scale
525
+ )
526
+
527
+ layer_name = name.split(".processor")[0]
528
+ weights = {}
529
+ for i in range(len(num_adapter_embeds)):
530
+ weights.update({f"to_k_ip.{i}.weight": unet_sd[layer_name + ".to_k.weight"]})
531
+ weights.update({f"to_v_ip.{i}.weight": unet_sd[layer_name + ".to_v.weight"]})
532
+
533
+ attn_procs[name].load_state_dict(weights)
534
+ unet.set_attn_processor(attn_procs)
535
+ adapter_modules = torch.nn.ModuleList(unet.attn_processors.values())
536
+ return adapter_modules
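
For reference, a minimal usage sketch of `init_ip_adapters` (not part of the commit, and assuming the repository's modules import and the default UNet config builds as-is): it installs an IP-Adapter processor on every cross-attention layer, leaves self-attention on the plain processors, and seeds the new `to_k_ip`/`to_v_ip` projections from the UNet's own `to_k`/`to_v` weights. The token count below is illustrative.

```py
import torch

from src.models.svfr_adapter.unet_3d_svd_condition_ip import (
    UNet3DConditionSVDModel,
    init_ip_adapters,
)

# Default config: in_channels=8, cross_attention_dim=1024, block_out_channels=(320, 640, 1280, 1280).
unet = UNet3DConditionSVDModel()

# One extra image-prompt stream of 4 tokens per cross-attention layer (illustrative count).
adapter_modules = init_ip_adapters(unet, num_adapter_embeds=[4], scale=1.0)

# adapter_modules gathers the swapped-in processors; the to_k_ip / to_v_ip projections
# are the new trainable parameters introduced by the adapter.
print(sum(p.numel() for p in adapter_modules.parameters()))

# At run time the UNet then receives encoder_hidden_states as a
# (context_embeddings, [image_prompt_embeddings]) tuple, handled by the processors above.
```
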
src/pipelines/pipeline.py ADDED
@@ -0,0 +1,812 @@
1
+ import inspect
2
+ from dataclasses import dataclass
3
+ from typing import Callable, Dict, List, Optional, Union
4
+
5
+ from einops import rearrange
6
+ import numpy as np
7
+ import PIL.Image
8
+ import torch
9
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
10
+
11
+ from diffusers.image_processor import VaeImageProcessor
12
+ # from diffusers.models import UNetSpatioTemporalConditionModel
13
+ from diffusers.utils import BaseOutput, logging
14
+ from diffusers.utils.torch_utils import randn_tensor, is_compiled_module
15
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
16
+ from diffusers import (
17
+ AutoencoderKLTemporalDecoder,
18
+ EulerDiscreteScheduler,
19
+ )
20
+
21
+ # from src.models.base.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
22
+ from src.models.svfr_adapter.unet_3d_svd_condition_ip import UNet3DConditionSVDModel
23
+
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
+
30
+ def _append_dims(x, target_dims):
31
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
32
+ dims_to_append = target_dims - x.ndim
33
+ if dims_to_append < 0:
34
+ raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
35
+ return x[(...,) + (None,) * dims_to_append]
36
+
37
+
38
+ def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: str = "np"):
39
+ batch_size, channels, num_frames, height, width = video.shape
40
+ outputs = []
41
+ for batch_idx in range(batch_size):
42
+ batch_vid = video[batch_idx].permute(1, 0, 2, 3)
43
+ batch_output = processor.postprocess(batch_vid, output_type)
44
+
45
+ outputs.append(batch_output)
46
+
47
+ if output_type == "np":
48
+ outputs = np.stack(outputs)
49
+
50
+ elif output_type == "pt":
51
+ outputs = torch.stack(outputs)
52
+
53
+ elif not output_type == "pil":
54
+ raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
55
+
56
+ return outputs
57
+
58
+
59
+ @dataclass
60
+ class LQ2VideoSVDPipelineOutput(BaseOutput):
61
+ r"""
62
+ Output class for zero-shot text-to-video pipeline.
63
+
64
+ Args:
65
+ frames (`[List[PIL.Image.Image]`, `np.ndarray`]):
66
+ List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
67
+ num_channels)`.
68
+ """
69
+
70
+ frames: Union[List[PIL.Image.Image], np.ndarray]
71
+ latents: Union[torch.Tensor, np.ndarray]
72
+
73
+
74
+ class LQ2VideoLongSVDPipeline(DiffusionPipeline):
75
+ r"""
76
+ Pipeline to generate video from an input image using Stable Video Diffusion.
77
+
78
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
79
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
80
+
81
+ Args:
82
+ vae ([`AutoencoderKL`]):
83
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
84
+ image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
85
+ Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
86
+ unet ([`UNetSpatioTemporalConditionModel`]):
87
+ A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
88
+ scheduler ([`EulerDiscreteScheduler`]):
89
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
90
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
91
+ A `CLIPImageProcessor` to extract features from generated images.
92
+ """
93
+
94
+ model_cpu_offload_seq = "image_encoder->unet->vae"
95
+ _callback_tensor_inputs = ["latents"]
96
+
97
+ def __init__(
98
+ self,
99
+ vae: AutoencoderKLTemporalDecoder,
100
+ image_encoder: CLIPVisionModelWithProjection,
101
+ unet: UNet3DConditionSVDModel,
102
+ scheduler: EulerDiscreteScheduler,
103
+ feature_extractor: CLIPImageProcessor,
104
+ ):
105
+ super().__init__()
106
+ self.register_modules(
107
+ vae=vae,
108
+ image_encoder=image_encoder,
109
+ unet=unet,
110
+ scheduler=scheduler,
111
+ feature_extractor=feature_extractor,
112
+ )
113
+
114
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
115
+
116
+ # print("vae:", self.vae_scale_factor)
117
+
118
+ self.image_processor = VaeImageProcessor(
119
+ vae_scale_factor=self.vae_scale_factor,
120
+ do_convert_rgb=True)
121
+
122
+
123
+ def _clip_encode_image(self, image, num_frames, device, num_videos_per_prompt, do_classifier_free_guidance):
124
+ dtype = next(self.image_encoder.parameters()).dtype
125
+
126
+ if not isinstance(image, torch.Tensor):
127
+ image = self.image_processor.pil_to_numpy(image)
128
+ image = self.image_processor.numpy_to_pt(image)
129
+
130
+ image = image * 2.0 - 1.0
131
+ image = _resize_with_antialiasing(image, (224, 224))
132
+ image = (image + 1.0) / 2.0
133
+
134
+ # Normalize the image for CLIP input
135
+ image = self.feature_extractor(
136
+ images=image,
137
+ do_normalize=True,
138
+ do_center_crop=False,
139
+ do_resize=False,
140
+ do_rescale=False,
141
+ return_tensors="pt",
142
+ ).pixel_values
143
+
144
+ image = image.to(device=device, dtype=dtype, non_blocking=True,).unsqueeze(0) # 3,224,224
145
+ image_embeddings = self.image_encoder(image).image_embeds
146
+ image_embeddings = image_embeddings.unsqueeze(1)
147
+
148
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
149
+ bs_embed, seq_len, _ = image_embeddings.shape
150
+ image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
151
+ image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
152
+
153
+ if do_classifier_free_guidance:
154
+ negative_image_embeddings = torch.zeros_like(image_embeddings)
155
+ image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])
156
+ # image_embeddings = torch.cat([image_embeddings, image_embeddings])
157
+
158
+ return image_embeddings
159
+
160
+ def _encode_vae_image(
161
+ self,
162
+ image: torch.Tensor,
163
+ device,
164
+ num_videos_per_prompt,
165
+ do_classifier_free_guidance,
166
+ ):
167
+ image = image.to(device=device)
168
+ image_latents = self.vae.encode(image).latent_dist.mode()
169
+ # image_latents = image_latents * 0.18215
170
+ image_latents = image_latents.unsqueeze(0)
171
+
172
+ if do_classifier_free_guidance:
173
+ negative_image_latents = torch.zeros_like(image_latents)
174
+
175
+ # For classifier free guidance, we need to do two forward passes.
176
+ # Here we concatenate the unconditional and text embeddings into a single batch
177
+ # to avoid doing two forward passes
178
+ # image_latents = torch.cat([negative_image_latents, image_latents])
179
+ image_latents = torch.cat([image_latents, image_latents])
180
+
181
+ # duplicate image_latents for each generation per prompt, using mps friendly method
182
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1, 1)
183
+
184
+ return image_latents
185
+
186
+ def _get_add_time_ids(
187
+ self,
188
+ task_id_input,
189
+ dtype,
190
+ batch_size,
191
+ num_videos_per_prompt,
192
+ do_classifier_free_guidance,
193
+ ):
194
+
195
+ passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(task_id_input)
196
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
197
+
198
+ if expected_add_embed_dim != passed_add_embed_dim:
199
+ raise ValueError(
200
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
201
+ )
202
+
203
+ # add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
204
+ # add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
205
+ add_time_ids = task_id_input.to(dtype)
206
+ add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
207
+
208
+ if do_classifier_free_guidance:
209
+ add_time_ids = torch.cat([add_time_ids, add_time_ids])
210
+
211
+ return add_time_ids
212
+
213
+ def decode_latents(self, latents, num_frames, decode_chunk_size=14):
214
+ # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
215
+ latents = latents.flatten(0, 1)
216
+
217
+ latents = 1 / self.vae.config.scaling_factor * latents
218
+
219
+ forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
220
+ accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
221
+
222
+ # decode decode_chunk_size frames at a time to avoid OOM
223
+ frames = []
224
+ for i in range(0, latents.shape[0], decode_chunk_size):
225
+ num_frames_in = latents[i : i + decode_chunk_size].shape[0]
226
+ decode_kwargs = {}
227
+ if accepts_num_frames:
228
+ # we only pass num_frames_in if it's expected
229
+ decode_kwargs["num_frames"] = num_frames_in
230
+
231
+ frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
232
+ frames.append(frame)
233
+ frames = torch.cat(frames, dim=0)
234
+
235
+ # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
236
+ frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
237
+
238
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
239
+ frames = frames.float()
240
+ return frames
241
+
242
+ def check_inputs(self, image, height, width):
243
+ if (
244
+ not isinstance(image, torch.Tensor)
245
+ and not isinstance(image, PIL.Image.Image)
246
+ and not isinstance(image, list)
247
+ ):
248
+ raise ValueError(
249
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
250
+ f" {type(image)}"
251
+ )
252
+
253
+ if height % 8 != 0 or width % 8 != 0:
254
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
255
+
256
+ def prepare_latents(
257
+ self,
258
+ batch_size,
259
+ num_frames,
260
+ num_channels_latents,
261
+ height,
262
+ width,
263
+ dtype,
264
+ device,
265
+ generator,
266
+ latents=None,
267
+ ref_image_latents=None,
268
+ timestep=None
269
+ ):
270
+ from src.utils.noise_util import random_noise
271
+ shape = (
272
+ batch_size,
273
+ num_frames,
274
+ num_channels_latents // 3,
275
+ height // self.vae_scale_factor,
276
+ width // self.vae_scale_factor,
277
+ )
278
+ if isinstance(generator, list) and len(generator) != batch_size:
279
+ raise ValueError(
280
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
281
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
282
+ )
283
+
284
+ if latents is None:
285
+ # noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
286
+ # noise = video_fusion_noise(shape=shape, generator=generator, device=device, dtype=dtype)
287
+ # noise = video_fusion_noise_repeat(shape=shape, generator=generator, device=device, dtype=dtype)
288
+ noise = random_noise(shape=shape, generator=generator, device=device, dtype=dtype)
289
+ # noise = video_fusion_noise_repeat_0830(shape=shape, generator=generator, device=device, dtype=dtype)
290
+ else:
291
+ noise = latents.to(device)
292
+
293
+ # scale the initial noise by the standard deviation required by the scheduler
294
+ if timestep is not None:
295
+ init_latents = ref_image_latents.unsqueeze(0)
296
+ # init_latents = ref_image_latents.unsqueeze(1)
297
+ latents = self.scheduler.add_noise(init_latents, noise, timestep)
298
+ else:
299
+ latents = noise * self.scheduler.init_noise_sigma
300
+
301
+ return latents
302
+
303
+ def get_timesteps(self, num_inference_steps, strength, device):
304
+ # get the original timestep using init_timestep
305
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
306
+
307
+ t_start = max(num_inference_steps - init_timestep, 0)
308
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
309
+
310
+ return timesteps, num_inference_steps - t_start
311
+
312
+ @property
313
+ def guidance_scale1(self):
314
+ return self._guidance_scale1
315
+
316
+ @property
317
+ def guidance_scale2(self):
318
+ return self._guidance_scale2
319
+
320
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
321
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
322
+ # corresponds to doing no classifier free guidance.
323
+ # @property
324
+ # def do_classifier_free_guidance(self):
325
+ # return True
326
+
327
+ @property
328
+ def num_timesteps(self):
329
+ return self._num_timesteps
330
+
331
+ @torch.no_grad()
332
+ def __call__(
333
+ self,
334
+ ref_image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], # lq
335
+ ref_concat_image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], # last concat ref img
336
+ id_prompts: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], # id encode_hidden_state
337
+ # task_id: int = 0,
338
+ task_id_input: torch.Tensor = None,
339
+ height: int = 512,
340
+ width: int = 512,
341
+ num_frames: Optional[int] = None,
342
+ num_inference_steps: int = 25,
343
+ min_guidance_scale=1.0, # 1.0,
344
+ max_guidance_scale=3.0,
345
+ noise_aug_strength: float = 0.02,
346
+ decode_chunk_size: Optional[int] = None,
347
+ num_videos_per_prompt: Optional[int] = 1,
348
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
349
+ latents: Optional[torch.FloatTensor] = None,
350
+ output_type: Optional[str] = "pil",
351
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
352
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
353
+ return_dict: bool = True,
354
+ do_classifier_free_guidance: bool = True,
355
+ overlap=7,
356
+ frames_per_batch=14,
357
+ i2i_noise_strength=1.0,
358
+ ):
359
+ r"""
360
+ The call function to the pipeline for generation.
361
+
362
+ Args:
363
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
364
+ Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
365
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
366
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
367
+ The height in pixels of the generated image.
368
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
369
+ The width in pixels of the generated image.
370
+ num_frames (`int`, *optional*):
371
+ The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`
372
+ num_inference_steps (`int`, *optional*, defaults to 25):
373
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
374
+ expense of slower inference. This parameter is modulated by `strength`.
375
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
376
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
377
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
378
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
379
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
380
+ The amount of noise added to the init image; the higher it is, the less the video will look like the init image. Increase it for more motion.
381
+ decode_chunk_size (`int`, *optional*):
382
+ The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
383
+ between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
384
+ for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
385
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
386
+ The number of images to generate per prompt.
387
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
388
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
389
+ generation deterministic.
390
+ latents (`torch.FloatTensor`, *optional*):
391
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
392
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
393
+ tensor is generated by sampling using the supplied random `generator`.
394
+ output_type (`str`, *optional*, defaults to `"pil"`):
395
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
396
+ callback_on_step_end (`Callable`, *optional*):
397
+ A function that calls at the end of each denoising steps during the inference. The function is called
398
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
399
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
400
+ `callback_on_step_end_tensor_inputs`.
401
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
402
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
403
+ will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the
404
+ `._callback_tensor_inputs` attribute of your pipeline class.
405
+ return_dict (`bool`, *optional*, defaults to `True`):
406
+ Whether or not to return a [`LQ2VideoSVDPipelineOutput`] instead of returning the
407
+ generated frames directly.
408
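+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+ Whether to apply classifier-free guidance; the unconditional branch uses zeroed identity embeddings and reference latents.
+ overlap (`int`, *optional*, defaults to 7):
+ Number of frames by which consecutive denoising windows overlap during the first denoising steps; the window origin is also shifted by this amount at every step.
+ frames_per_batch (`int`, *optional*, defaults to 14):
+ Number of frames denoised per UNet forward pass in the sliding-window loop.
+ i2i_noise_strength (`float`, *optional*, defaults to 1.0):
+ Image-to-image strength passed to `get_timesteps`; 1.0 keeps the full schedule (pure-noise start), smaller values truncate it.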
+
409
+ Returns:
410
+ [`LQ2VideoSVDPipelineOutput`] or the generated frames:
411
+ If `return_dict` is `True`, a [`LQ2VideoSVDPipelineOutput`] containing the decoded frames and the final latents is returned,
412
+ otherwise only the generated frames are returned.
413
+
414
+ Examples:
415
+
416
+ ```py
417
+ import torch
+ from diffusers import StableVideoDiffusionPipeline
418
+ from diffusers.utils import load_image, export_to_video
419
+
420
+ pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
421
+ pipe.to("cuda")
422
+
423
+ image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200")
424
+ image = image.resize((1024, 576))
425
+
426
+ frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
427
+ export_to_video(frames, "generated.mp4", fps=7)
428
+ ```
429
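+
+ Note: the snippet above illustrates the stock Stable Video Diffusion API. This
+ pipeline is additionally driven by the restoration inputs used in the body below
+ (`ref_image`, `id_prompts`, `task_id_input` and, optionally, `ref_concat_image`).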
+ """
430
+ # 0. Default height and width to unet
431
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
432
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
433
+
434
+ # print(min_guidance_scale, max_guidance_scale)
435
+
436
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
437
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
438
+
439
+ # 1. Check inputs. Raise error if not correct
440
+ self.check_inputs(ref_image, height, width)
441
+
442
+ # 2. Define call parameters
443
+ if isinstance(ref_image, PIL.Image.Image):
444
+ batch_size = 1
445
+ elif isinstance(ref_image, list):
446
+ batch_size = len(ref_image)
447
+ else:
448
+ if len(ref_image.shape)==4:
449
+ batch_size = 1
450
+ else:
451
+ batch_size = ref_image.shape[0]
452
+
453
+ device = self._execution_device
454
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
455
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
456
+ # corresponds to doing no classifier free guidance.
457
+ # do_classifier_free_guidance = True #True
458
+
459
+ # 3. Prepare clip image embeds
460
+ # image_embeddings = torch.zeros([2,1,1024],dtype=self.vae.dtype).to(device)
461
+ # image_embeddings = self._clip_encode_image(
462
+ # clip_image,
463
+ # num_frames,
464
+ # device,
465
+ # num_videos_per_prompt,
466
+ # do_classifier_free_guidance,)
467
+ # print(image_embeddings)
468
+ image_embeddings = torch.cat([torch.zeros_like(id_prompts),id_prompts], dim=0) if do_classifier_free_guidance else id_prompts
469
+ # image_embeddings = torch.cat([torch.zeros_like(id_prompts),id_prompts,id_prompts], dim=0)
470
+ # image_embeddings = torch.cat([id_prompts,id_prompts,id_prompts], dim=0)
471
+ # image_embeddings = torch.cat([torch.zeros_like(id_prompts),torch.zeros_like(id_prompts),torch.zeros_like(id_prompts)], dim=0)
472
+ # image_embeddings = torch.cat([id_prompts_neg, id_prompts, id_prompts], dim=0)
473
+
474
+
475
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which
476
+ # is why it is reduced here.
477
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
478
+ # fps = fps - 1
479
+
480
+ # 4. Encode input image using VAE
481
+ needs_upcasting = (self.vae.dtype == torch.float16 or self.vae.dtype == torch.bfloat16) and self.vae.config.force_upcast
482
+ vae_dtype = self.vae.dtype
483
+ if needs_upcasting:
484
+ self.vae.to(dtype=torch.float32)
485
+
486
+ # Prepare ref image latents
487
+ ref_image_tensor = ref_image.to(
488
+ dtype=self.vae.dtype, device=self.vae.device
489
+ )
490
+
491
+ # bsz = ref_image_tensor.shape[0]
492
+ # ref_image_tensor = rearrange(ref_image_tensor,'b f c h w-> (b f) c h w')
493
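+ # VAE-encode the conditioning frames in chunks of `chunk_size` frames to bound memory;
+ # the latent mean is used and scaled by the SD VAE factor 0.18215 below.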
+ chunk_size = 20
494
+ ref_image_latents = []
495
+ for chunk_idx in range((ref_image_tensor.shape[0]//chunk_size)+1):
496
+ if chunk_idx*chunk_size>=num_frames: break
497
+ ref_image_latent = self.vae.encode(ref_image_tensor[chunk_idx*chunk_size:(chunk_idx+1)*chunk_size]).latent_dist.mean #TODO
498
+ ref_image_latents.append(ref_image_latent)
499
+ ref_image_latents = torch.cat(ref_image_latents,dim=0)
500
+ # print(ref_image_tensor.shape,ref_image_latents.shape)
501
+ ref_image_latents = ref_image_latents * 0.18215 # (f, 4, h, w)
502
+ # ref_image_latents = rearrange(ref_image_latents, '(b f) c h w-> b f c h w', b=bsz)
503
+
504
+ noise = randn_tensor(
505
+ ref_image_tensor.shape,
506
+ generator=generator,
507
+ device=self.vae.device,
508
+ dtype=self.vae.dtype)
509
+
510
+ ref_image_tensor = ref_image_tensor + noise_aug_strength * noise
511
+
512
+ image_latents = []
513
+ for chunk_idx in range((ref_image_tensor.shape[0]//chunk_size)+1):
514
+ if chunk_idx*chunk_size>=num_frames: break
515
+ image_latent = self._encode_vae_image(
516
+ ref_image_tensor[chunk_idx*chunk_size:(chunk_idx+1)*chunk_size],
517
+ device=device,
518
+ num_videos_per_prompt=num_videos_per_prompt,
519
+ do_classifier_free_guidance=do_classifier_free_guidance,
520
+ )
521
+ image_latents.append(image_latent)
522
+ image_latents = torch.cat(image_latents, dim=1)
523
+ # print(ref_image_tensor.shape,image_latents.shape)
524
+ # print(image_latents.shape)
525
+ image_latents = image_latents.to(image_embeddings.dtype)
526
+ ref_image_latents = ref_image_latents.to(image_embeddings.dtype)
527
+
528
+ # cast back to fp16 if needed
529
+ if needs_upcasting:
530
+ self.vae.to(dtype=vae_dtype)
531
+
532
+ # Repeat the image latents for each frame so we can concatenate them with the noise
533
+ # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
534
+ # image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
535
+
536
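+ # Optional reference image: encoded once with the VAE, repeated across all frames, and later
+ # concatenated channel-wise with the latents (zeros are used for the unconditional CFG branch
+ # and when no reference image is provided).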
+ if ref_concat_image is not None:
537
+ ref_concat_tensor = ref_concat_image.to(
538
+ dtype=self.vae.dtype, device=self.vae.device
539
+ )
540
+ ref_concat_tensor = self.vae.encode(ref_concat_tensor.unsqueeze(0)).latent_dist.mode()
541
+ ref_concat_tensor = ref_concat_tensor.unsqueeze(0).repeat(1,num_frames,1,1,1)
542
+ ref_concat_tensor = torch.cat([torch.zeros_like(ref_concat_tensor), ref_concat_tensor]) if do_classifier_free_guidance else ref_concat_tensor
543
+ ref_concat_tensor = ref_concat_tensor.to(image_embeddings)
544
+ else:
545
+ ref_concat_tensor = torch.zeros_like(image_latents)
546
+
547
+
548
+ # 5. Get Added Time IDs
549
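+ # The task identifier is injected through the UNet's added time-ID embedding,
+ # the slot the stock SVD pipeline uses for fps / motion-bucket / noise-aug conditioning.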
+ added_time_ids = self._get_add_time_ids(
550
+ task_id_input,
551
+ image_embeddings.dtype,
552
+ batch_size,
553
+ num_videos_per_prompt,
554
+ do_classifier_free_guidance,
555
+ )
556
+ added_time_ids = added_time_ids.to(device, dtype=self.unet.dtype)
557
+
558
+ # 6. Prepare timesteps
559
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
560
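+ # i2i_noise_strength follows the usual img2img "strength" convention: 1.0 keeps the full schedule
+ # (denoising starts from pure noise), smaller values skip the earliest timesteps so denoising
+ # starts from partially noised latents.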
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, i2i_noise_strength, device)
561
+ latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
562
+
563
+
564
+ # 7. Prepare latent variables
565
+ num_channels_latents = self.unet.config.in_channels
566
+ latents = self.prepare_latents(
567
+ batch_size * num_videos_per_prompt,
568
+ num_frames,
569
+ num_channels_latents,
570
+ height,
571
+ width,
572
+ image_embeddings.dtype,
573
+ device,
574
+ generator,
575
+ latents,
576
+ ref_image_latents,
577
+ timestep=latent_timestep
578
+ )
579
+
580
+ # 8. Prepare guidance scale
581
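+ # The guidance scale is ramped linearly from min_guidance_scale to max_guidance_scale
+ # across the denoising steps (indexed by `i` in the loop below).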
+ guidance_scale = torch.linspace(
582
+ min_guidance_scale,
583
+ max_guidance_scale,
584
+ num_inference_steps)
585
+ guidance_scale1 = guidance_scale.to(device, latents.dtype)
586
+ guidance_scale2 = guidance_scale.to(device, latents.dtype)
587
+
588
+
589
+ self._guidance_scale1 = guidance_scale1
590
+ self._guidance_scale2 = guidance_scale2
591
+
592
+ # 9. Denoising loop
593
+ latents_all = latents # for any-frame generation
594
+
595
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
596
+ self._num_timesteps = len(timesteps)
597
+ shift = 0
598
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
599
+ for i, t in enumerate(timesteps):
600
+
601
+ # init
602
+ pred_latents = torch.zeros_like(
603
+ latents_all,
604
+ dtype=self.unet.dtype,
605
+ )
606
+ counter = torch.zeros(
607
+ (latents_all.shape[0], num_frames, 1, 1, 1),
608
+ dtype=self.unet.dtype,
609
+ ).to(device=latents_all.device)
610
+
611
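+ # Sliding-window denoising: the clip is processed in windows of `frames_per_batch` frames.
+ # During the first three steps the stride is reduced by `overlap` so consecutive windows overlap,
+ # and the window origin is shifted by `overlap` after every step (see `shift` below) so that
+ # window seams do not stay at fixed frame positions.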
+ for batch, index_start in enumerate(range(0, num_frames, frames_per_batch - overlap*(i<3))):
612
+ self.scheduler._step_index = None
613
+ index_start -= shift
614
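+ # Slice a window of frames along dim 1; indices wrap modulo the number of frames,
+ # so windows that run past the end wrap around to the start of the clip.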
+ def indice_slice(tensor, idx_list):
615
+ tensor_list = []
616
+ for idx in idx_list:
617
+ idx = idx % tensor.shape[1]
618
+ tensor_list.append(tensor[:,idx])
619
+ return torch.stack(tensor_list, 1)
620
+ idx_list = list(range(index_start, index_start+frames_per_batch))
621
+ latents = indice_slice(latents_all, idx_list)
622
+ image_latents_input = indice_slice(image_latents, idx_list)
623
+ image_embeddings_input = indice_slice(image_embeddings, idx_list)
624
+ ref_concat_tensor_input = indice_slice(ref_concat_tensor, idx_list)
625
+
626
+
627
+ # if index_start + frames_per_batch >= num_frames:
628
+ # index_start = num_frames - frames_per_batch
629
+
630
+ # latents = latents_all[:, index_start:index_start + frames_per_batch]
631
+ # image_latents_input = image_latents[:, index_start:index_start + frames_per_batch]
632
+
633
+ # expand the latents if we are doing classifier free guidance
634
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
635
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
636
+
637
+ # = torch.cat([torch.zeros_like(image_latents_input),image_latents_input]) if do_classifier_free_guidance else image_latents_input
638
+ # image_latents_input = torch.zeros_like(image_latents_input)
639
+ # image_latents_input = torch.cat([image_latents_input] * 2) if do_classifier_free_guidance else image_latents_input
640
+
641
+
642
+ # Concatenate image_latents over the channels dimension
643
+ # print(latent_model_input.shape, image_latents_input.shape)
644
+ latent_model_input = torch.cat([
645
+ latent_model_input,
646
+ image_latents_input,
647
+ ref_concat_tensor_input], dim=2)
648
+ # predict the noise residual
649
+ noise_pred = self.unet(
650
+ latent_model_input,
651
+ t,
652
+ encoder_hidden_states=image_embeddings_input.flatten(0,1),
653
+ added_time_ids=added_time_ids,
654
+ return_dict=False,
655
+ )[0]
656
+ # perform guidance
657
+ if do_classifier_free_guidance:
658
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)  # two chunks, matching the 2-way CFG concat above
659
+ noise_pred = noise_pred_uncond + self.guidance_scale1[i] * (noise_pred_cond - noise_pred_uncond) #+ self.guidance_scale2[i] * (noise_pred_cond - noise_pred_drop_id)
660
+
661
+ # compute the previous noisy sample x_t -> x_t-1
662
+ latents = self.scheduler.step(noise_pred, t.to(self.unet.dtype), latents).prev_sample
663
+
664
+ if callback_on_step_end is not None:
665
+ callback_kwargs = {}
666
+ for k in callback_on_step_end_tensor_inputs:
667
+ callback_kwargs[k] = locals()[k]
668
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
669
+
670
+ latents = callback_outputs.pop("latents", latents)
671
+
672
+ # if batch == 0:
673
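+ # Blend this window back into the full sequence with a triangular weight,
+ # min(iii + 1, frames_per_batch - iii): frames near the window edges contribute less,
+ # so overlapping windows average smoothly (counter is normalized out after the loop).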
+ for iii in range(frames_per_batch):
674
+ # pred_latents[:, index_start + iii:index_start + iii + 1] += latents[:, iii:iii+1] * min(iii + 1, frames_per_batch-iii)
675
+ # counter[:, index_start + iii:index_start + iii + 1] += min(iii + 1, frames_per_batch-iii)
676
+ p = (index_start + iii) % pred_latents.shape[1]
677
+ pred_latents[:, p] += latents[:, iii] * min(iii + 1, frames_per_batch-iii)
678
+ counter[:, p] += 1 * min(iii + 1, frames_per_batch-iii)
679
+
680
+
681
+ shift += overlap
682
+ shift = shift % frames_per_batch
683
+
684
+ pred_latents = pred_latents / counter
685
+ latents_all = pred_latents
686
+
687
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
688
+ progress_bar.update()
689
+
690
+ latents = latents_all
691
+ if not output_type == "latent":
692
+ # cast back to fp16 if needed
693
+ if needs_upcasting:
694
+ self.vae.to(dtype=vae_dtype)
695
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
696
+ else:
697
+ frames = latents
698
+
699
+ self.maybe_free_model_hooks()
700
+
701
+ if not return_dict:
702
+ return frames
703
+ return LQ2VideoSVDPipelineOutput(frames=frames,latents=latents)
704
+
705
+
706
+ # resizing utils
707
+ # TODO: clean up later
708
+ def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
709
+ h, w = input.shape[-2:]
710
+ factors = (h / size[0], w / size[1])
711
+
712
+ # First, we have to determine sigma
713
+ # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
714
+ sigmas = (
715
+ max((factors[0] - 1.0) / 2.0, 0.001),
716
+ max((factors[1] - 1.0) / 2.0, 0.001),
717
+ )
718
+
719
+ # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
720
+ # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
721
+ # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
722
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
723
+
724
+ # Make sure it is odd
725
+ if (ks[0] % 2) == 0:
726
+ ks = ks[0] + 1, ks[1]
727
+
728
+ if (ks[1] % 2) == 0:
729
+ ks = ks[0], ks[1] + 1
730
+
731
+ input = _gaussian_blur2d(input, ks, sigmas)
732
+
733
+ output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
734
+ return output
735
+
736
+
737
+ def _compute_padding(kernel_size):
738
+ """Compute padding tuple."""
739
+ # 4 or 6 ints: (padding_left, padding_right, padding_top, padding_bottom)
740
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
741
+ if len(kernel_size) < 2:
742
+ raise AssertionError(kernel_size)
743
+ computed = [k - 1 for k in kernel_size]
744
+
745
+ # for even kernels we need to do asymmetric padding :(
746
+ out_padding = 2 * len(kernel_size) * [0]
747
+
748
+ for i in range(len(kernel_size)):
749
+ computed_tmp = computed[-(i + 1)]
750
+
751
+ pad_front = computed_tmp // 2
752
+ pad_rear = computed_tmp - pad_front
753
+
754
+ out_padding[2 * i + 0] = pad_front
755
+ out_padding[2 * i + 1] = pad_rear
756
+
757
+ return out_padding
758
+
759
+
760
+ def _filter2d(input, kernel):
761
+ # prepare kernel
762
+ b, c, h, w = input.shape
763
+ tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
764
+
765
+ tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
766
+
767
+ height, width = tmp_kernel.shape[-2:]
768
+
769
+ padding_shape: list[int] = _compute_padding([height, width])
770
+ input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
771
+
772
+ # kernel and input tensor reshape to align element-wise or batch-wise params
773
+ tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
774
+ input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
775
+
776
+ # convolve the tensor with the kernel.
777
+ output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
778
+
779
+ out = output.view(b, c, h, w)
780
+ return out
781
+
782
+
783
+ def _gaussian(window_size: int, sigma):
784
+ if isinstance(sigma, float):
785
+ sigma = torch.tensor([[sigma]])
786
+
787
+ batch_size = sigma.shape[0]
788
+
789
+ x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
790
+
791
+ if window_size % 2 == 0:
792
+ x = x + 0.5
793
+
794
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
795
+
796
+ return gauss / gauss.sum(-1, keepdim=True)
797
+
798
+
799
+ def _gaussian_blur2d(input, kernel_size, sigma):
800
+ if isinstance(sigma, tuple):
801
+ sigma = torch.tensor([sigma], dtype=input.dtype)
802
+ else:
803
+ sigma = sigma.to(dtype=input.dtype)
804
+
805
+ ky, kx = int(kernel_size[0]), int(kernel_size[1])
806
+ bs = sigma.shape[0]
807
+ kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
808
+ kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
809
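+ # Separable Gaussian blur: filter along the width with kernel_x, then along the height with kernel_y.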
+ out_x = _filter2d(input, kernel_x[..., None, :])
810
+ out = _filter2d(out_x, kernel_y[..., None])
811
+
812
+ return out
src/utils/noise_util.py ADDED
@@ -0,0 +1,25 @@
1
+ from typing import List, Optional, Tuple, Union
2
+ import torch
3
+
4
+ from diffusers.utils.torch_utils import randn_tensor
5
+
6
+ def random_noise(
7
+ tensor: torch.Tensor = None,
8
+ shape: Tuple[int] = None,
9
+ dtype: torch.dtype = None,
10
+ device: torch.device = None,
11
+ generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
12
+ noise_offset: Optional[float] = None, # typical value is 0.1
13
+ ) -> torch.Tensor:
14
+ if tensor is not None:
15
+ shape = tensor.shape
16
+ device = tensor.device
17
+ dtype = tensor.dtype
18
+ if isinstance(device, str):
19
+ device = torch.device(device)
20
+ noise = randn_tensor(shape, dtype=dtype, device=device, generator=generator)
21
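+ # Optional "noise offset" trick: add a small constant offset per (batch, frame) so the
+ # model can reach very dark or very bright outputs more easily.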
+ if noise_offset is not None:
22
+ noise += noise_offset * torch.randn(
23
+ (shape[0], shape[1], 1, 1, 1), device=device
24
+ )
25
+ return noise
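+
+ # Usage sketch (hypothetical latent shape), assuming a (batch, frames, channels, height, width) layout:
+ # noise = random_noise(shape=(1, 14, 4, 64, 64), dtype=torch.float16, device=torch.device("cuda"), noise_offset=0.1)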
src/utils/util.py ADDED
@@ -0,0 +1,64 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torchvision
7
+ from einops import rearrange
8
+ from PIL import Image
9
+
10
+ import imageio
11
+
12
+ def seed_everything(seed):
13
+ import random
14
+
15
+ import numpy as np
16
+
17
+ torch.manual_seed(seed)
18
+ torch.cuda.manual_seed_all(seed)
19
+ np.random.seed(seed % (2**32))
20
+ random.seed(seed)
21
+
22
+
23
+ def save_videos_from_pil(pil_images, path, fps=8):
24
+ save_fmt = Path(path).suffix
25
+ os.makedirs(os.path.dirname(path), exist_ok=True)
26
+
27
+ if save_fmt == ".mp4":
28
+ with imageio.get_writer(path, fps=fps) as writer:
29
+ for img in pil_images:
30
+ img_array = np.array(img) # Convert PIL Image to numpy array
31
+ writer.append_data(img_array)
32
+
33
+ elif save_fmt == ".gif":
34
+ pil_images[0].save(
35
+ fp=path,
36
+ format="GIF",
37
+ append_images=pil_images[1:],
38
+ save_all=True,
39
+ duration=(1 / fps * 1000),
40
+ loop=0,
41
+ optimize=False,
42
+ lossless=True
43
+ )
44
+ else:
45
+ raise ValueError("Unsupported file type. Use .mp4 or .gif.")
46
+
47
+
48
+ def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
49
+ videos = rearrange(videos, "b c t h w -> t b c h w")
50
+ height, width = videos.shape[-2:]
51
+ outputs = []
52
+
53
+ for i, x in enumerate(videos):
54
+ x = torchvision.utils.make_grid(x, nrow=n_rows) # (c h w)
55
+ x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) # (h w c)
56
+ if rescale:
57
+ x = (x + 1.0) / 2.0 # -1,1 -> 0,1
58
+ x = (x * 255).numpy().astype(np.uint8)
59
+ x = Image.fromarray(x)
60
+ outputs.append(x)
61
+
62
+ os.makedirs(os.path.dirname(path), exist_ok=True)
63
+
64
+ save_videos_from_pil(outputs, path, fps)
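+
+ # Usage sketch (hypothetical shapes): a batch of videos in [-1, 1] saved as an MP4 grid.
+ # videos = torch.rand(2, 3, 16, 256, 256) * 2 - 1 # (batch, channels, frames, height, width)
+ # save_videos_grid(videos, "./output/sample.mp4", rescale=True, n_rows=2, fps=8)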