Baraaqasem commited on
Commit
5d32408
·
verified ·
1 Parent(s): 413d4d0

Upload 585 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. src/videogen_hub/pipelines/__init__.py +0 -0
  2. src/videogen_hub/pipelines/cogvideo/__init__.py +4 -0
  3. src/videogen_hub/pipelines/cogvideo/cogvideo_pipeline.py +612 -0
  4. src/videogen_hub/pipelines/cogvideo/cogvideo_src/LICENSE +201 -0
  5. src/videogen_hub/pipelines/cogvideo/cogvideo_src/Model_License +79 -0
  6. src/videogen_hub/pipelines/cogvideo/cogvideo_src/__init__.py +0 -0
  7. src/videogen_hub/pipelines/cogvideo/cogvideo_src/cluster_label2.npy +3 -0
  8. src/videogen_hub/pipelines/cogvideo/cogvideo_src/coglm_strategy.py +101 -0
  9. src/videogen_hub/pipelines/cogvideo/cogvideo_src/cogvideo_pipeline.py +1341 -0
  10. src/videogen_hub/pipelines/cogvideo/cogvideo_src/models/__init__.py +0 -0
  11. src/videogen_hub/pipelines/cogvideo/cogvideo_src/models/cogvideo_cache_model.py +695 -0
  12. src/videogen_hub/pipelines/cogvideo/cogvideo_src/models/cogvideo_model.py +543 -0
  13. src/videogen_hub/pipelines/cogvideo/cogvideo_src/pretrain_cogvideo.py +184 -0
  14. src/videogen_hub/pipelines/cogvideo/cogvideo_src/requirements.txt +4 -0
  15. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/__init__.py +17 -0
  16. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/cluster_label2.npy +3 -0
  17. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/direct_sr.py +117 -0
  18. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/dsr_model.py +225 -0
  19. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/dsr_sampling.py +204 -0
  20. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/iterative_sr.py +118 -0
  21. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/itersr_model.py +232 -0
  22. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/itersr_sampling.py +168 -0
  23. src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/sr_group.py +49 -0
  24. src/videogen_hub/pipelines/consisti2v/LICENSE +21 -0
  25. src/videogen_hub/pipelines/consisti2v/__init__.py +0 -0
  26. src/videogen_hub/pipelines/consisti2v/configs/__init__.py +0 -0
  27. src/videogen_hub/pipelines/consisti2v/configs/inference/__init__.py +0 -0
  28. src/videogen_hub/pipelines/consisti2v/configs/inference/inference.yaml +48 -0
  29. src/videogen_hub/pipelines/consisti2v/configs/inference/inference_autoregress.yaml +49 -0
  30. src/videogen_hub/pipelines/consisti2v/configs/prompts/__init__.py +0 -0
  31. src/videogen_hub/pipelines/consisti2v/configs/prompts/default.yaml +16 -0
  32. src/videogen_hub/pipelines/consisti2v/configs/training/__init__.py +0 -0
  33. src/videogen_hub/pipelines/consisti2v/configs/training/training.yaml +92 -0
  34. src/videogen_hub/pipelines/consisti2v/consisti2v/__init__.py +0 -0
  35. src/videogen_hub/pipelines/consisti2v/consisti2v/data/__init__.py +0 -0
  36. src/videogen_hub/pipelines/consisti2v/consisti2v/data/dataset.py +315 -0
  37. src/videogen_hub/pipelines/consisti2v/consisti2v/models/__init__.py +0 -0
  38. src/videogen_hub/pipelines/consisti2v/consisti2v/models/rotary_embedding.py +280 -0
  39. src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_attention.py +809 -0
  40. src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_transformer_blocks.py +564 -0
  41. src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_unet.py +1371 -0
  42. src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_unet_blocks.py +1159 -0
  43. src/videogen_hub/pipelines/consisti2v/consisti2v/pipelines/__init__.py +0 -0
  44. src/videogen_hub/pipelines/consisti2v/consisti2v/pipelines/pipeline_autoregress_animation.py +615 -0
  45. src/videogen_hub/pipelines/consisti2v/consisti2v/pipelines/pipeline_conditional_animation.py +695 -0
  46. src/videogen_hub/pipelines/consisti2v/consisti2v/utils/__init__.py +0 -0
  47. src/videogen_hub/pipelines/consisti2v/consisti2v/utils/frameinit_utils.py +142 -0
  48. src/videogen_hub/pipelines/consisti2v/consisti2v/utils/util.py +165 -0
  49. src/videogen_hub/pipelines/consisti2v/scripts/__init__.py +0 -0
  50. src/videogen_hub/pipelines/consisti2v/scripts/animate.py +247 -0
src/videogen_hub/pipelines/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/cogvideo/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import sys
2
+
3
+ sys.path.insert(0, "./src/videogen_hub/pipelines/cogvideo/")
4
+ sys.path.insert(0, "./src/videogen_hub/pipelines/cogvideo/cogvideo_src")
src/videogen_hub/pipelines/cogvideo/cogvideo_pipeline.py ADDED
@@ -0,0 +1,612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from videogen_hub.pipelines.cogvideo.cogvideo_src.cogvideo_pipeline import (
2
+ InferenceModel_Interpolate,
3
+ InferenceModel_Sequential,
4
+ my_filling_sequence,
5
+ get_masks_and_position_ids_stage1,
6
+ get_masks_and_position_ids_stage2,
7
+ my_save_multiple_images,
8
+ )
9
+ from videogen_hub.depend.icetk import icetk as tokenizer
10
+ from videogen_hub.pipelines.cogvideo.cogvideo_src.coglm_strategy import (
11
+ CoglmStrategy,
12
+ )
13
+ from videogen_hub.pipelines.cogvideo.cogvideo_src.sr_pipeline import (
14
+ DirectSuperResolution,
15
+ )
16
+ from SwissArmyTransformer.resources import auto_create
17
+ import time, logging, sys, os, torch
18
+ import torch.distributed as dist
19
+
20
+ # path = os.path.join(args.output_path, f"{now_qi}_{raw_text}")
21
+
22
+
23
+ def pipeline(args, raw_text, height, width, duration):
24
+ # model_stage1, args = InferenceModel_Sequential.from_pretrained(args, 'cogvideo-stage1')
25
+ # model_stage1.eval()
26
+ # parent_givan_tokens = process_stage1(model_stage1, raw_text, duration=4.0, video_raw_text=raw_text, video_guidance_text="视频",
27
+ # image_text_suffix=" 高清摄影",
28
+ # outputdir=None, batch_size=args.batch_size)
29
+
30
+ # process_stage2(model_stage2, raw_text, duration=2.0, video_raw_text=raw_text+" 视频",
31
+ # video_guidance_text="视频", parent_given_tokens=parent_given_tokens,
32
+ # outputdir=path,
33
+ # gpu_rank=0, gpu_parallel_size=1) # TODO: 修改
34
+
35
+ assert int(args.stage_1) + int(args.stage_2) + int(args.both_stages) == 1
36
+ rank_id = args.device % args.parallel_size
37
+ generate_frame_num = args.generate_frame_num
38
+
39
+ if args.stage_1 or args.both_stages:
40
+ model_stage1, args = InferenceModel_Sequential.from_pretrained(
41
+ args, "cogvideo-stage1"
42
+ )
43
+ model_stage1.eval()
44
+ if args.both_stages:
45
+ model_stage1 = model_stage1.cpu()
46
+
47
+ if args.stage_2 or args.both_stages:
48
+ model_stage2, args = InferenceModel_Interpolate.from_pretrained(
49
+ args, "cogvideo-stage2"
50
+ )
51
+ model_stage2.eval()
52
+ if args.both_stages:
53
+ model_stage2 = model_stage2.cpu()
54
+
55
+ invalid_slices = [slice(tokenizer.num_image_tokens, None)]
56
+ strategy_cogview2 = CoglmStrategy(invalid_slices, temperature=1.0, top_k=16)
57
+ strategy_cogvideo = CoglmStrategy(
58
+ invalid_slices,
59
+ temperature=args.temperature,
60
+ top_k=args.top_k,
61
+ temperature2=args.coglm_temperature2,
62
+ )
63
+ if not args.stage_1:
64
+ # from sr_pipeline import DirectSuperResolution
65
+ dsr_path = auto_create(
66
+ "cogview2-dsr", path=None
67
+ ) # path=os.getenv('SAT_HOME', '~/.sat_models')
68
+ dsr = DirectSuperResolution(args, dsr_path, max_bz=12, onCUDA=False)
69
+
70
+ def process_stage2(
71
+ model,
72
+ seq_text,
73
+ duration,
74
+ video_raw_text=None,
75
+ video_guidance_text="视频",
76
+ parent_given_tokens=None,
77
+ conddir=None,
78
+ outputdir=None,
79
+ gpu_rank=0,
80
+ gpu_parallel_size=1,
81
+ ):
82
+ stage2_starttime = time.time()
83
+ use_guidance = args.use_guidance_stage2
84
+ if args.both_stages:
85
+ move_start_time = time.time()
86
+ logging.debug("moving stage-2 model to cuda")
87
+ model = model.cuda()
88
+ logging.debug(
89
+ "moving in stage-2 model takes time: {:.2f}".format(
90
+ time.time() - move_start_time
91
+ )
92
+ )
93
+
94
+ try:
95
+ if parent_given_tokens is None:
96
+ assert conddir is not None
97
+ parent_given_tokens = torch.load(
98
+ os.path.join(conddir, "frame_tokens.pt"), map_location="cpu"
99
+ )
100
+ sample_num_allgpu = parent_given_tokens.shape[0]
101
+ sample_num = sample_num_allgpu // gpu_parallel_size
102
+ assert sample_num * gpu_parallel_size == sample_num_allgpu
103
+ parent_given_tokens = parent_given_tokens[
104
+ gpu_rank * sample_num : (gpu_rank + 1) * sample_num
105
+ ]
106
+ except:
107
+ logging.critical("No frame_tokens found in interpolation, skip")
108
+ return False
109
+
110
+ # CogVideo Stage2 Generation
111
+ while (
112
+ duration >= 0.5
113
+ ): # TODO: You can change the boundary to change the frame rate
114
+ parent_given_tokens_num = parent_given_tokens.shape[1]
115
+ generate_batchsize_persample = (parent_given_tokens_num - 1) // 2
116
+ generate_batchsize_total = generate_batchsize_persample * sample_num
117
+ total_frames = generate_frame_num
118
+ frame_len = 400
119
+ enc_text = tokenizer.encode(seq_text)
120
+ enc_duration = tokenizer.encode(str(float(duration)) + "秒")
121
+ seq = (
122
+ enc_duration
123
+ + [tokenizer["<n>"]]
124
+ + enc_text
125
+ + [tokenizer["<start_of_image>"]]
126
+ + [-1] * 400 * generate_frame_num
127
+ )
128
+ text_len = len(seq) - frame_len * generate_frame_num - 1
129
+
130
+ logging.info(
131
+ "[Stage2: Generating Frames, Frame Rate {:d}]\nraw text: {:s}".format(
132
+ int(4 / duration), tokenizer.decode(enc_text)
133
+ )
134
+ )
135
+
136
+ # generation
137
+ seq = (
138
+ torch.cuda.LongTensor(seq, device=args.device)
139
+ .unsqueeze(0)
140
+ .repeat(generate_batchsize_total, 1)
141
+ )
142
+ for sample_i in range(sample_num):
143
+ for i in range(generate_batchsize_persample):
144
+ seq[sample_i * generate_batchsize_persample + i][
145
+ text_len + 1 : text_len + 1 + 400
146
+ ] = parent_given_tokens[sample_i][2 * i]
147
+ seq[sample_i * generate_batchsize_persample + i][
148
+ text_len + 1 + 400 : text_len + 1 + 800
149
+ ] = parent_given_tokens[sample_i][2 * i + 1]
150
+ seq[sample_i * generate_batchsize_persample + i][
151
+ text_len + 1 + 800 : text_len + 1 + 1200
152
+ ] = parent_given_tokens[sample_i][2 * i + 2]
153
+
154
+ if use_guidance:
155
+ guider_seq = (
156
+ enc_duration
157
+ + [tokenizer["<n>"]]
158
+ + tokenizer.encode(video_guidance_text)
159
+ + [tokenizer["<start_of_image>"]]
160
+ + [-1] * 400 * generate_frame_num
161
+ )
162
+ guider_text_len = len(guider_seq) - frame_len * generate_frame_num - 1
163
+ guider_seq = (
164
+ torch.cuda.LongTensor(guider_seq, device=args.device)
165
+ .unsqueeze(0)
166
+ .repeat(generate_batchsize_total, 1)
167
+ )
168
+ for sample_i in range(sample_num):
169
+ for i in range(generate_batchsize_persample):
170
+ guider_seq[sample_i * generate_batchsize_persample + i][
171
+ text_len + 1 : text_len + 1 + 400
172
+ ] = parent_given_tokens[sample_i][2 * i]
173
+ guider_seq[sample_i * generate_batchsize_persample + i][
174
+ text_len + 1 + 400 : text_len + 1 + 800
175
+ ] = parent_given_tokens[sample_i][2 * i + 1]
176
+ guider_seq[sample_i * generate_batchsize_persample + i][
177
+ text_len + 1 + 800 : text_len + 1 + 1200
178
+ ] = parent_given_tokens[sample_i][2 * i + 2]
179
+ video_log_text_attention_weights = 0
180
+ else:
181
+ guider_seq = None
182
+ guider_text_len = 0
183
+ video_log_text_attention_weights = 1.4
184
+
185
+ mbz = args.max_inference_batch_size
186
+
187
+ assert generate_batchsize_total < mbz or generate_batchsize_total % mbz == 0
188
+ output_list = []
189
+ start_time = time.time()
190
+ for tim in range(max(generate_batchsize_total // mbz, 1)):
191
+ input_seq = (
192
+ seq[: min(generate_batchsize_total, mbz)].clone()
193
+ if tim == 0
194
+ else seq[mbz * tim : mbz * (tim + 1)].clone()
195
+ )
196
+ guider_seq2 = (
197
+ (
198
+ guider_seq[: min(generate_batchsize_total, mbz)].clone()
199
+ if tim == 0
200
+ else guider_seq[mbz * tim : mbz * (tim + 1)].clone()
201
+ )
202
+ if guider_seq is not None
203
+ else None
204
+ )
205
+ output_list.append(
206
+ my_filling_sequence(
207
+ model,
208
+ args,
209
+ input_seq,
210
+ batch_size=min(generate_batchsize_total, mbz),
211
+ get_masks_and_position_ids=get_masks_and_position_ids_stage2,
212
+ text_len=text_len,
213
+ frame_len=frame_len,
214
+ strategy=strategy_cogview2,
215
+ strategy2=strategy_cogvideo,
216
+ log_text_attention_weights=video_log_text_attention_weights,
217
+ mode_stage1=False,
218
+ guider_seq=guider_seq2,
219
+ guider_text_len=guider_text_len,
220
+ guidance_alpha=args.guidance_alpha,
221
+ limited_spatial_channel_mem=True,
222
+ )[0]
223
+ )
224
+ logging.info(
225
+ "Duration {:.2f}, Taken time {:.2f}\n".format(
226
+ duration, time.time() - start_time
227
+ )
228
+ )
229
+
230
+ output_tokens = torch.cat(output_list, dim=0)
231
+ output_tokens = output_tokens[
232
+ :, text_len + 1 : text_len + 1 + (total_frames) * 400
233
+ ].reshape(sample_num, -1, 400 * total_frames)
234
+ output_tokens_merge = torch.cat(
235
+ (
236
+ output_tokens[:, :, : 1 * 400],
237
+ output_tokens[:, :, 400 * 3 : 4 * 400],
238
+ output_tokens[:, :, 400 * 1 : 2 * 400],
239
+ output_tokens[:, :, 400 * 4 : (total_frames) * 400],
240
+ ),
241
+ dim=2,
242
+ ).reshape(sample_num, -1, 400)
243
+
244
+ output_tokens_merge = torch.cat(
245
+ (output_tokens_merge, output_tokens[:, -1:, 400 * 2 : 3 * 400]), dim=1
246
+ )
247
+ duration /= 2
248
+ parent_given_tokens = output_tokens_merge
249
+
250
+ if args.both_stages:
251
+ move_start_time = time.time()
252
+ logging.debug("moving stage 2 model to cpu")
253
+ model = model.cpu()
254
+ torch.cuda.empty_cache()
255
+ logging.debug(
256
+ "moving out model2 takes time: {:.2f}".format(
257
+ time.time() - move_start_time
258
+ )
259
+ )
260
+
261
+ logging.info(
262
+ "CogVideo Stage2 completed. Taken time {:.2f}\n".format(
263
+ time.time() - stage2_starttime
264
+ )
265
+ )
266
+
267
+ # decoding
268
+ # imgs = [torch.nn.functional.interpolate(tokenizer.decode(image_ids=seq.tolist()), size=(480, 480)) for seq in output_tokens_merge]
269
+ # os.makedirs(output_dir_full_path, exist_ok=True)
270
+ # my_save_multiple_images(imgs, output_dir_full_path,subdir="frames", debug=False)
271
+ # torch.save(output_tokens_merge.cpu(), os.path.join(output_dir_full_path, 'frame_token.pt'))
272
+ # os.system(f"gifmaker -i '{output_dir_full_path}'/frames/0*.jpg -o '{output_dir_full_path}/{str(float(duration))}_concat.gif' -d 0.2")
273
+
274
+ # direct super-resolution by CogView2
275
+ logging.info("[Direct super-resolution]")
276
+ dsr_starttime = time.time()
277
+ enc_text = tokenizer.encode(seq_text)
278
+ frame_num_per_sample = parent_given_tokens.shape[1]
279
+ parent_given_tokens_2d = parent_given_tokens.reshape(-1, 400)
280
+ text_seq = (
281
+ torch.cuda.LongTensor(enc_text, device=args.device)
282
+ .unsqueeze(0)
283
+ .repeat(parent_given_tokens_2d.shape[0], 1)
284
+ )
285
+ sred_tokens = dsr(text_seq, parent_given_tokens_2d)
286
+ decoded_sr_videos = []
287
+
288
+ for sample_i in range(sample_num):
289
+ decoded_sr_imgs = []
290
+ for frame_i in range(frame_num_per_sample):
291
+ decoded_sr_img = tokenizer.decode(
292
+ image_ids=sred_tokens[frame_i + sample_i * frame_num_per_sample][
293
+ -3600:
294
+ ]
295
+ )
296
+ decoded_sr_imgs.append(
297
+ torch.nn.functional.interpolate(
298
+ decoded_sr_img, size=(height, width)
299
+ )
300
+ )
301
+ decoded_sr_videos.append(decoded_sr_imgs)
302
+
303
+ return decoded_sr_videos
304
+ # for sample_i in range(sample_num):
305
+ # my_save_multiple_images(decoded_sr_videos[sample_i], outputdir,subdir=f"frames/{sample_i+sample_num*gpu_rank}", debug=False)
306
+ # os.system(f"gifmaker -i '{outputdir}'/frames/'{sample_i+sample_num*gpu_rank}'/0*.jpg -o '{outputdir}/{sample_i+sample_num*gpu_rank}.gif' -d 0.125")
307
+
308
+ # logging.info("Direct super-resolution completed. Taken time {:.2f}\n".format(time.time() - dsr_starttime))
309
+
310
+ # return True
311
+
312
+ def process_stage1(
313
+ model,
314
+ seq_text,
315
+ duration,
316
+ video_raw_text=None,
317
+ video_guidance_text="视频",
318
+ image_text_suffix="",
319
+ outputdir=None,
320
+ batch_size=1,
321
+ ):
322
+ process_start_time = time.time()
323
+ use_guide = args.use_guidance_stage1
324
+ if args.both_stages:
325
+ move_start_time = time.time()
326
+ logging.debug("moving stage 1 model to cuda")
327
+ model = model.cuda()
328
+ logging.debug(
329
+ "moving in model1 takes time: {:.2f}".format(
330
+ time.time() - move_start_time
331
+ )
332
+ )
333
+
334
+ if video_raw_text is None:
335
+ video_raw_text = seq_text
336
+ mbz = (
337
+ args.stage1_max_inference_batch_size
338
+ if args.stage1_max_inference_batch_size > 0
339
+ else args.max_inference_batch_size
340
+ )
341
+ assert batch_size < mbz or batch_size % mbz == 0
342
+ frame_len = 400
343
+
344
+ # generate the first frame:
345
+ enc_text = tokenizer.encode(seq_text + image_text_suffix)
346
+ seq_1st = (
347
+ enc_text + [tokenizer["<start_of_image>"]] + [-1] * 400
348
+ ) # IV!! # test local!!! # test randboi!!!
349
+ logging.info(
350
+ "[Generating First Frame with CogView2]Raw text: {:s}".format(
351
+ tokenizer.decode(enc_text)
352
+ )
353
+ )
354
+ text_len_1st = len(seq_1st) - frame_len * 1 - 1
355
+
356
+ seq_1st = torch.cuda.LongTensor(seq_1st, device=args.device).unsqueeze(0)
357
+ output_list_1st = []
358
+ for tim in range(max(batch_size // mbz, 1)):
359
+ start_time = time.time()
360
+ output_list_1st.append(
361
+ my_filling_sequence(
362
+ model,
363
+ args,
364
+ seq_1st.clone(),
365
+ batch_size=min(batch_size, mbz),
366
+ get_masks_and_position_ids=get_masks_and_position_ids_stage1,
367
+ text_len=text_len_1st,
368
+ frame_len=frame_len,
369
+ strategy=strategy_cogview2,
370
+ strategy2=strategy_cogvideo,
371
+ log_text_attention_weights=1.4,
372
+ enforce_no_swin=True,
373
+ mode_stage1=True,
374
+ )[0]
375
+ )
376
+ logging.info(
377
+ "[First Frame]Taken time {:.2f}\n".format(time.time() - start_time)
378
+ )
379
+ output_tokens_1st = torch.cat(output_list_1st, dim=0)
380
+ given_tokens = output_tokens_1st[
381
+ :, text_len_1st + 1 : text_len_1st + 401
382
+ ].unsqueeze(
383
+ 1
384
+ ) # given_tokens.shape: [bs, frame_num, 400]
385
+
386
+ # generate subsequent frames:
387
+ total_frames = generate_frame_num
388
+ enc_duration = tokenizer.encode(str(float(duration)) + "秒")
389
+ if use_guide:
390
+ video_raw_text = video_raw_text + " 视频"
391
+ enc_text_video = tokenizer.encode(video_raw_text)
392
+ seq = (
393
+ enc_duration
394
+ + [tokenizer["<n>"]]
395
+ + enc_text_video
396
+ + [tokenizer["<start_of_image>"]]
397
+ + [-1] * 400 * generate_frame_num
398
+ )
399
+ guider_seq = (
400
+ enc_duration
401
+ + [tokenizer["<n>"]]
402
+ + tokenizer.encode(video_guidance_text)
403
+ + [tokenizer["<start_of_image>"]]
404
+ + [-1] * 400 * generate_frame_num
405
+ )
406
+ logging.info(
407
+ "[Stage1: Generating Subsequent Frames, Frame Rate {:.1f}]\nraw text: {:s}".format(
408
+ 4 / duration, tokenizer.decode(enc_text_video)
409
+ )
410
+ )
411
+
412
+ text_len = len(seq) - frame_len * generate_frame_num - 1
413
+ guider_text_len = len(guider_seq) - frame_len * generate_frame_num - 1
414
+ seq = (
415
+ torch.cuda.LongTensor(seq, device=args.device)
416
+ .unsqueeze(0)
417
+ .repeat(batch_size, 1)
418
+ )
419
+ guider_seq = (
420
+ torch.cuda.LongTensor(guider_seq, device=args.device)
421
+ .unsqueeze(0)
422
+ .repeat(batch_size, 1)
423
+ )
424
+
425
+ for given_frame_id in range(given_tokens.shape[1]):
426
+ seq[
427
+ :,
428
+ text_len
429
+ + 1
430
+ + given_frame_id * 400 : text_len
431
+ + 1
432
+ + (given_frame_id + 1) * 400,
433
+ ] = given_tokens[:, given_frame_id]
434
+ guider_seq[
435
+ :,
436
+ guider_text_len
437
+ + 1
438
+ + given_frame_id * 400 : guider_text_len
439
+ + 1
440
+ + (given_frame_id + 1) * 400,
441
+ ] = given_tokens[:, given_frame_id]
442
+ output_list = []
443
+
444
+ if use_guide:
445
+ video_log_text_attention_weights = 0
446
+ else:
447
+ guider_seq = None
448
+ video_log_text_attention_weights = 1.4
449
+
450
+ for tim in range(max(batch_size // mbz, 1)):
451
+ start_time = time.time()
452
+ input_seq = (
453
+ seq[: min(batch_size, mbz)].clone()
454
+ if tim == 0
455
+ else seq[mbz * tim : mbz * (tim + 1)].clone()
456
+ )
457
+ guider_seq2 = (
458
+ (
459
+ guider_seq[: min(batch_size, mbz)].clone()
460
+ if tim == 0
461
+ else guider_seq[mbz * tim : mbz * (tim + 1)].clone()
462
+ )
463
+ if guider_seq is not None
464
+ else None
465
+ )
466
+ output_list.append(
467
+ my_filling_sequence(
468
+ model,
469
+ args,
470
+ input_seq,
471
+ batch_size=min(batch_size, mbz),
472
+ get_masks_and_position_ids=get_masks_and_position_ids_stage1,
473
+ text_len=text_len,
474
+ frame_len=frame_len,
475
+ strategy=strategy_cogview2,
476
+ strategy2=strategy_cogvideo,
477
+ log_text_attention_weights=video_log_text_attention_weights,
478
+ guider_seq=guider_seq2,
479
+ guider_text_len=guider_text_len,
480
+ guidance_alpha=args.guidance_alpha,
481
+ limited_spatial_channel_mem=True,
482
+ mode_stage1=True,
483
+ )[0]
484
+ )
485
+
486
+ output_tokens = torch.cat(output_list, dim=0)[:, 1 + text_len :]
487
+
488
+ if args.both_stages:
489
+ move_start_time = time.time()
490
+ logging.debug("moving stage 1 model to cpu")
491
+ model = model.cpu()
492
+ torch.cuda.empty_cache()
493
+ logging.debug(
494
+ "moving in model1 takes time: {:.2f}".format(
495
+ time.time() - move_start_time
496
+ )
497
+ )
498
+
499
+ # decoding
500
+ imgs, sred_imgs, txts = [], [], []
501
+ for seq in output_tokens:
502
+ decoded_imgs = [
503
+ torch.nn.functional.interpolate(
504
+ tokenizer.decode(image_ids=seq.tolist()[i * 400 : (i + 1) * 400]),
505
+ size=(height, width),
506
+ )
507
+ for i in range(total_frames)
508
+ ]
509
+ imgs.append(decoded_imgs) # only the last image (target)
510
+
511
+ assert len(imgs) == batch_size
512
+ return imgs
513
+ # save_tokens = output_tokens[:, :+total_frames*400].reshape(-1, total_frames, 400).cpu()
514
+ # if outputdir is not None:
515
+ # for clip_i in range(len(imgs)):
516
+ # # os.makedirs(output_dir_full_paths[clip_i], exist_ok=True)
517
+ # my_save_multiple_images(imgs[clip_i], outputdir, subdir=f"frames/{clip_i}", debug=False)
518
+ # os.system(f"gifmaker -i '{outputdir}'/frames/'{clip_i}'/0*.jpg -o '{outputdir}/{clip_i}.gif' -d 0.25")
519
+ # torch.save(save_tokens, os.path.join(outputdir, 'frame_tokens.pt'))
520
+
521
+ # logging.info("CogVideo Stage1 completed. Taken time {:.2f}\n".format(time.time() - process_start_time))
522
+
523
+ # return save_tokens
524
+
525
+ # ======================================================================================================
526
+
527
+ if args.stage_1 or args.both_stages:
528
+ if args.input_source != "interactive":
529
+ with open(args.input_source, "r") as fin:
530
+ promptlist = fin.readlines()
531
+ promptlist = [p.strip() for p in promptlist]
532
+ else:
533
+ promptlist = None
534
+
535
+ now_qi = -1
536
+ while True:
537
+ now_qi += 1
538
+
539
+ if promptlist is not None: # with input-source
540
+ if args.multi_gpu:
541
+ if now_qi % dist.get_world_size() != dist.get_rank():
542
+ continue
543
+ rk = dist.get_rank()
544
+ else:
545
+ rk = 0
546
+ raw_text = promptlist[now_qi]
547
+ raw_text = raw_text.strip()
548
+ print(f"Working on Line No. {now_qi} on {rk}... [{raw_text}]")
549
+ else: # interactive
550
+ raw_text = input("\nPlease Input Query (stop to exit) >>> ")
551
+ raw_text = raw_text.strip()
552
+ if not raw_text:
553
+ print("Query should not be empty!")
554
+ continue
555
+ if raw_text == "stop":
556
+ return
557
+
558
+ try:
559
+ path = os.path.join(args.output_path, f"{now_qi}_{raw_text}")
560
+ parent_given_tokens, imgs = process_stage1(
561
+ model_stage1,
562
+ raw_text,
563
+ duration=4.0,
564
+ video_raw_text=raw_text,
565
+ video_guidance_text="视频",
566
+ image_text_suffix=" 高清摄影",
567
+ outputdir=path if args.stage_1 else None,
568
+ batch_size=args.batch_size,
569
+ )
570
+ if args.stage_1 and not args.both_stages:
571
+ print("only stage 1")
572
+ return imgs
573
+
574
+ if args.both_stages:
575
+ videos = process_stage2(
576
+ model_stage2,
577
+ raw_text,
578
+ duration=duration,
579
+ video_raw_text=raw_text + " 视频",
580
+ video_guidance_text="视频",
581
+ parent_given_tokens=parent_given_tokens,
582
+ outputdir=path,
583
+ gpu_rank=0,
584
+ gpu_parallel_size=1,
585
+ ) # TODO: 修改
586
+ return videos
587
+ except (ValueError, FileNotFoundError) as e:
588
+ print(e)
589
+ continue
590
+
591
+ elif args.stage_2:
592
+ sample_dirs = os.listdir(args.output_path)
593
+ for sample in sample_dirs:
594
+ raw_text = sample.split("_")[-1]
595
+ path = os.path.join(args.output_path, sample, "Interp")
596
+ parent_given_tokens = torch.load(
597
+ os.path.join(args.output_path, sample, "frame_tokens.pt")
598
+ )
599
+
600
+ process_stage2(
601
+ raw_text,
602
+ duration=2.0,
603
+ video_raw_text=raw_text + " 视频",
604
+ video_guidance_text="视频",
605
+ parent_given_tokens=parent_given_tokens,
606
+ outputdir=path,
607
+ gpu_rank=0,
608
+ gpu_parallel_size=1,
609
+ ) # TODO: 修改
610
+
611
+ else:
612
+ assert False
src/videogen_hub/pipelines/cogvideo/cogvideo_src/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
src/videogen_hub/pipelines/cogvideo/cogvideo_src/Model_License ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The CogVideo License
2
+
3
+ Section I: PREAMBLE
4
+
5
+ Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.
6
+
7
+ Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
8
+
9
+ In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.
10
+
11
+ Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.
12
+
13
+ This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
14
+
15
+ NOW THEREFORE, You and Licensor agree as follows:
16
+
17
+ 1. Definitions
18
+
19
+ - "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
20
+ - "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
21
+ - "Output" means the results of operating a Model as embodied in informational content resulting therefrom.
22
+ - "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
23
+ - "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
24
+ - "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
25
+ - "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
26
+ - "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
27
+ - "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
28
+ - "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.
29
+ - "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
30
+ - "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
31
+
32
+ Section II: INTELLECTUAL PROPERTY RIGHTS
33
+
34
+ Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
35
+
36
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
37
+ 3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.
38
+
39
+ Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
40
+
41
+ 4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
42
+ Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
43
+ You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
44
+ You must cause any modified files to carry prominent notices stating that You changed the files;
45
+ You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
46
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
47
+ 5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
48
+ 6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
49
+
50
+ Section IV: OTHER PROVISIONS
51
+
52
+ 7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.
53
+ 8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
54
+ 9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
55
+ 10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
56
+ 11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
57
+ 12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
58
+
59
+ END OF TERMS AND CONDITIONS
60
+
61
+
62
+
63
+
64
+ Attachment A
65
+
66
+ Use Restrictions
67
+
68
+ You agree not to use the Model or Derivatives of the Model:
69
+ - In any way that violates any applicable national, federal, state, local or international law or regulation;
70
+ - For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
71
+ - To generate or disseminate verifiably false information and/or content with the purpose of harming others;
72
+ - To generate or disseminate personal identifiable information that can be used to harm an individual;
73
+ - To defame, disparage or otherwise harass others;
74
+ - For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
75
+ - For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
76
+ - To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
77
+ - For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;
78
+ - To provide medical advice and medical results interpretation;
79
+ - To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).
src/videogen_hub/pipelines/cogvideo/cogvideo_src/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/cogvideo/cogvideo_src/cluster_label2.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b87880fdbe89670f12844377b9cf97a9733b1f54e3a9b73cbb9835084c4e02ec
3
+ size 160128
src/videogen_hub/pipelines/cogvideo/cogvideo_src/coglm_strategy.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : coglm_strategy.py
4
+ @Time : 2021/10/08 22:22:42
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+ import torch
15
+ import numpy as np
16
+ import torch.nn.functional as F
17
+
18
+
19
+ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-65504):
20
+ # This function has been mostly taken from huggingface conversational ai code at
21
+ # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313
22
+
23
+ if top_k > 0:
24
+ # Remove all tokens with a probability less than the last token of the top-k
25
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
26
+ logits[indices_to_remove] = filter_value
27
+
28
+ if top_p > 0.0:
29
+ # convert to 1D
30
+ logits = logits.view(logits.size()[1]).contiguous()
31
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
32
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
33
+
34
+ # Remove tokens with cumulative probability above the threshold
35
+ sorted_indices_to_remove = cumulative_probs > top_p
36
+ # Shift the indices to the right to keep also the first token above the threshold
37
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
38
+ sorted_indices_to_remove[..., 0] = 0
39
+ indices_to_remove = sorted_indices[sorted_indices_to_remove]
40
+ logits[indices_to_remove] = filter_value
41
+ # going back to 2D
42
+ logits = logits.view(1, -1).contiguous()
43
+
44
+ return logits
45
+
46
+
47
+ class CoglmStrategy:
48
+ def __init__(self, invalid_slices=[], temperature=1., top_k=200, eps=1e-4, top_p=0.0, end_tokens=None, temperature2=0.89):
49
+ self.invalid_slices = invalid_slices
50
+ self.temperature = temperature
51
+ self.temperature2 = temperature2
52
+ self.topk = top_k
53
+ self.top_p = top_p
54
+ self.eps = eps
55
+ if end_tokens is None:
56
+ end_tokens = []
57
+ self.end_tokens = end_tokens
58
+ self._is_done = False
59
+ self.outlier_count_down = torch.zeros(16)
60
+ self.vis_list = [[]for i in range(16)]
61
+ self.cluster_labels = torch.tensor(np.load('cluster_label2.npy'), device='cuda', dtype=torch.long)
62
+ self.start_pos = -1
63
+ self.white_cluster = []
64
+ # self.fout = open('tmp.txt', 'w')
65
+
66
+ @property
67
+ def is_done(self) -> bool:
68
+ return self._is_done
69
+
70
+ def forward(self, logits, tokens, mems, temperature=None, temperature2=None):
71
+ if temperature is None:
72
+ temperature = self.temperature
73
+ if temperature2 is None:
74
+ temperature2 = self.temperature2
75
+ logits = logits / temperature
76
+ for invalid_slice in self.invalid_slices:
77
+ logits[..., invalid_slice] = -65504
78
+
79
+ rprobs = F.softmax(logits.float(), dim=-1)
80
+ c = self.cluster_labels.expand(*rprobs.shape)
81
+ cprobs = torch.zeros(logits.shape[0], 500, device=logits.device).scatter_add_(1, c, rprobs)
82
+ # self.fout.write(str(tokens.shape[-1])+ ' ' + str(cprobs.topk(10)) + '\n')
83
+ # self.fout.flush()
84
+ best_scores, best_clusters = cprobs.topk(self.topk)
85
+ bz = logits.shape[0]
86
+ for i in range(bz):
87
+ selected_cluster = best_clusters[i][torch.multinomial(best_scores[i] / best_scores[i].sum(), num_samples=1)]
88
+ logits[i, self.cluster_labels != selected_cluster] = -65504
89
+
90
+ # logits = top_k_logits(logits, self.topk, self.top_p)
91
+ probs = F.softmax(logits.float()/temperature2, dim=-1) # float is essential, due to a bug in PyTorch
92
+ pred = torch.multinomial(probs, num_samples=1)
93
+
94
+ if pred.numel() == 1 and pred.item() in self.end_tokens:
95
+ self._is_done = True
96
+ tokens = torch.cat((tokens, pred.view(tokens.shape[0], 1)), dim=1)
97
+ return tokens, mems
98
+
99
+ def finalize(self, tokens, mems):
100
+ self._is_done = False
101
+ return tokens, mems
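The top_k_logits helper above is the standard top-k / nucleus filter applied before multinomial sampling. Below is a minimal sketch of driving it on a toy batch; the tensor values are purely illustrative, the import path is an assumption based on this repository layout, and the snippet is not part of the pipeline itself.

import torch
import torch.nn.functional as F
# Assumed import path for this repository layout.
from videogen_hub.pipelines.cogvideo.cogvideo_src.coglm_strategy import top_k_logits

# Hypothetical 1 x 5 vocabulary of logits; values chosen only for illustration.
logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0]])

# Keep only the two highest-scoring tokens; the rest are filled with the
# fp16-safe value (-65504) used throughout this file.
filtered = top_k_logits(logits.clone(), top_k=2)

probs = F.softmax(filtered, dim=-1)                   # masked entries end up with ~0 probability
next_token = torch.multinomial(probs, num_samples=1)  # sample a single token id

CoglmStrategy.forward applies the same idea hierarchically: it first samples one of the 500 token clusters defined by cluster_label2.npy from the cluster-summed probabilities, then samples a token inside that cluster at temperature2.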
src/videogen_hub/pipelines/cogvideo/cogvideo_src/cogvideo_pipeline.py ADDED
@@ -0,0 +1,1341 @@
1
+ # -*- encoding: utf-8 -*-
2
+ """
3
+ @File : cogvideo_pipeline.py
4
+ @Time : 2022/07/15 11:24:56
5
+ @Author : Wenyi Hong
6
+ @Version : 1.0
7
+ @Contact : [email protected]
8
+ """
9
+
10
+ # here put the import lib
11
+
12
+ import os
13
+ import sys
14
+ import torch
15
+ import argparse
16
+ import time
17
+ from torchvision.utils import save_image
18
+ import stat
19
+ from videogen_hub.depend.icetk import icetk as tokenizer
20
+ import logging, sys
21
+
22
+ import torch.distributed as dist
23
+
24
+ tokenizer.add_special_tokens(
25
+ ["<start_of_image>", "<start_of_english>", "<start_of_chinese>"]
26
+ )
27
+
28
+
29
+ from SwissArmyTransformer import get_args
30
+ from SwissArmyTransformer.data_utils import BinaryDataset, make_loaders
31
+ from SwissArmyTransformer.generation.sampling_strategies import BaseStrategy
32
+ from SwissArmyTransformer.generation.utils import (
33
+ timed_name,
34
+ save_multiple_images,
35
+ generate_continually,
36
+ )
37
+ from SwissArmyTransformer.resources import auto_create
38
+
39
+ from .models.cogvideo_cache_model import CogVideoCacheModel
40
+ from .coglm_strategy import CoglmStrategy
41
+
42
+
43
+ def get_masks_and_position_ids_stage1(data, textlen, framelen):
44
+ # Extract batch size and sequence length.
45
+ tokens = data
46
+ seq_length = len(data[0])
47
+ # Attention mask (lower triangular).
48
+ attention_mask = torch.ones(
49
+ (1, textlen + framelen, textlen + framelen), device=data.device
50
+ )
51
+ attention_mask[:, :textlen, textlen:] = 0
52
+ attention_mask[:, textlen:, textlen:].tril_()
53
+ attention_mask.unsqueeze_(1)
54
+ # Unaligned version
55
+ position_ids = torch.zeros(seq_length, dtype=torch.long, device=data.device)
56
+ torch.arange(
57
+ textlen, out=position_ids[:textlen], dtype=torch.long, device=data.device
58
+ )
59
+ torch.arange(
60
+ 512,
61
+ 512 + seq_length - textlen,
62
+ out=position_ids[textlen:],
63
+ dtype=torch.long,
64
+ device=data.device,
65
+ )
66
+ position_ids = position_ids.unsqueeze(0)
67
+
68
+ return tokens, attention_mask, position_ids
69
+
70
+
71
+ def get_masks_and_position_ids_stage2(data, textlen, framelen):
72
+ # Extract batch size and sequence length.
73
+ tokens = data
74
+ seq_length = len(data[0])
75
+
76
+ # Attention mask (lower triangular).
77
+ attention_mask = torch.ones(
78
+ (1, textlen + framelen, textlen + framelen), device=data.device
79
+ )
80
+ attention_mask[:, :textlen, textlen:] = 0
81
+ attention_mask[:, textlen:, textlen:].tril_()
82
+ attention_mask.unsqueeze_(1)
83
+
84
+ # Unaligned version
85
+ position_ids = torch.zeros(seq_length, dtype=torch.long, device=data.device)
86
+ torch.arange(
87
+ textlen, out=position_ids[:textlen], dtype=torch.long, device=data.device
88
+ )
89
+ frame_num = (seq_length - textlen) // framelen
90
+ assert frame_num == 5
91
+ torch.arange(
92
+ 512,
93
+ 512 + framelen,
94
+ out=position_ids[textlen : textlen + framelen],
95
+ dtype=torch.long,
96
+ device=data.device,
97
+ )
98
+ torch.arange(
99
+ 512 + framelen * 2,
100
+ 512 + framelen * 3,
101
+ out=position_ids[textlen + framelen : textlen + framelen * 2],
102
+ dtype=torch.long,
103
+ device=data.device,
104
+ )
105
+ torch.arange(
106
+ 512 + framelen * (frame_num - 1),
107
+ 512 + framelen * frame_num,
108
+ out=position_ids[textlen + framelen * 2 : textlen + framelen * 3],
109
+ dtype=torch.long,
110
+ device=data.device,
111
+ )
112
+ torch.arange(
113
+ 512 + framelen * 1,
114
+ 512 + framelen * 2,
115
+ out=position_ids[textlen + framelen * 3 : textlen + framelen * 4],
116
+ dtype=torch.long,
117
+ device=data.device,
118
+ )
119
+ torch.arange(
120
+ 512 + framelen * 3,
121
+ 512 + framelen * 4,
122
+ out=position_ids[textlen + framelen * 4 : textlen + framelen * 5],
123
+ dtype=torch.long,
124
+ device=data.device,
125
+ )
126
+
127
+ position_ids = position_ids.unsqueeze(0)
128
+
129
+ return tokens, attention_mask, position_ids
130
+
131
+
132
+ def my_update_mems(
133
+ hiddens, mems_buffers, mems_indexs, limited_spatial_channel_mem, text_len, frame_len
134
+ ):
135
+ if hiddens is None:
136
+ return None, mems_indexs
137
+ mem_num = len(hiddens)
138
+ ret_mem = []
139
+ with torch.no_grad():
140
+ for id in range(mem_num):
141
+ if hiddens[id][0] is None:
142
+ ret_mem.append(None)
143
+ else:
144
+ if (
145
+ id == 0
146
+ and limited_spatial_channel_mem
147
+ and mems_indexs[id] + hiddens[0][0].shape[1] >= text_len + frame_len
148
+ ):
149
+ if mems_indexs[id] == 0:
150
+ for layer, hidden in enumerate(hiddens[id]):
151
+ mems_buffers[id][layer, :, :text_len] = hidden.expand(
152
+ mems_buffers[id].shape[1], -1, -1
153
+ )[:, :text_len]
154
+ new_mem_len_part2 = (
155
+ mems_indexs[id] + hiddens[0][0].shape[1] - text_len
156
+ ) % frame_len
157
+ if new_mem_len_part2 > 0:
158
+ for layer, hidden in enumerate(hiddens[id]):
159
+ mems_buffers[id][
160
+ layer, :, text_len : text_len + new_mem_len_part2
161
+ ] = hidden.expand(mems_buffers[id].shape[1], -1, -1)[
162
+ :, -new_mem_len_part2:
163
+ ]
164
+ mems_indexs[id] = text_len + new_mem_len_part2
165
+ else:
166
+ for layer, hidden in enumerate(hiddens[id]):
167
+ mems_buffers[id][
168
+ layer,
169
+ :,
170
+ mems_indexs[id] : mems_indexs[id] + hidden.shape[1],
171
+ ] = hidden.expand(mems_buffers[id].shape[1], -1, -1)
172
+ mems_indexs[id] += hidden.shape[1]
173
+ ret_mem.append(mems_buffers[id][:, :, : mems_indexs[id]])
174
+ return ret_mem, mems_indexs
175
+
176
+
177
+ def my_save_multiple_images(imgs, path, subdir, debug=True):
178
+ # imgs: list of tensor images
179
+ if debug:
180
+ imgs = torch.cat(imgs, dim=0)
181
+ print("\nSave to: ", path, flush=True)
182
+ save_image(imgs, path, normalize=True)
183
+ else:
184
+ print("\nSave to: ", path, flush=True)
185
+ single_frame_path = os.path.join(path, subdir)
186
+ os.makedirs(single_frame_path, exist_ok=True)
187
+ for i in range(len(imgs)):
188
+ save_image(
189
+ imgs[i],
190
+ os.path.join(single_frame_path, f'{str(i).rjust(4,"0")}.jpg'),
191
+ normalize=True,
192
+ )
193
+ os.chmod(
194
+ os.path.join(single_frame_path, f'{str(i).rjust(4,"0")}.jpg'),
195
+ stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU,
196
+ )
197
+ save_image(
198
+ torch.cat(imgs, dim=0),
199
+ os.path.join(single_frame_path, f"frame_concat.jpg"),
200
+ normalize=True,
201
+ )
202
+ os.chmod(
203
+ os.path.join(single_frame_path, f"frame_concat.jpg"),
204
+ stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU,
205
+ )
206
+
207
+
208
+ def calc_next_tokens_frame_begin_id(text_len, frame_len, total_len):
209
+ # The first token's position id of the frame that the next token belongs to;
210
+ if total_len < text_len:
211
+ return None
212
+ return (total_len - text_len) // frame_len * frame_len + text_len
213
+
214
+
215
+ def my_filling_sequence(
216
+ model,
217
+ args,
218
+ seq,
219
+ batch_size,
220
+ get_masks_and_position_ids,
221
+ text_len,
222
+ frame_len,
223
+ strategy=BaseStrategy(),
224
+ strategy2=BaseStrategy(),
225
+ mems=None,
226
+ log_text_attention_weights=0, # default to 0: no artificial change
227
+ mode_stage1=True,
228
+ enforce_no_swin=False,
229
+ guider_seq=None,
230
+ guider_text_len=0,
231
+ guidance_alpha=1,
232
+ limited_spatial_channel_mem=False, # restrict the spatial-channel memory to the current frame
233
+ **kw_args,
234
+ ):
235
+ """
236
+ seq: [2, 3, 5, ..., -1(to be generated), -1, ...]
237
+ mems: [num_layers, batch_size, len_mems(index), mem_hidden_size]
238
+ cache, should be first mems.shape[1] parts of context_tokens.
239
+ mems are first-class citizens here, but we don't assume what is memorized.
240
+ input mems are used for multi-phase generation.
241
+ """
242
+ if guider_seq is not None:
243
+ logging.debug("Using Guidance In Inference")
244
+ if limited_spatial_channel_mem:
245
+ logging.debug("Limit spatial-channel's mem to current frame")
246
+ assert len(seq.shape) == 2
247
+
248
+ # building the initial tokens, attention_mask, and position_ids
249
+ actual_context_length = 0
250
+
251
+ while seq[-1][actual_context_length] >= 0: # the last seq has the fewest given tokens
252
+ actual_context_length += 1 # [0, context_length-1] are given
253
+ assert actual_context_length > 0
254
+ current_frame_num = (actual_context_length - text_len) // frame_len
255
+ assert current_frame_num >= 0
256
+ context_length = text_len + current_frame_num * frame_len
257
+
258
+ tokens, attention_mask, position_ids = get_masks_and_position_ids(
259
+ seq, text_len, frame_len
260
+ )
261
+ tokens = tokens[..., :context_length]
262
+ input_tokens = tokens.clone()
263
+
264
+ if guider_seq is not None:
265
+ guider_index_delta = text_len - guider_text_len
266
+ guider_tokens, guider_attention_mask, guider_position_ids = (
267
+ get_masks_and_position_ids(guider_seq, guider_text_len, frame_len)
268
+ )
269
+ guider_tokens = guider_tokens[..., : context_length - guider_index_delta]
270
+ guider_input_tokens = guider_tokens.clone()
271
+
272
+ for fid in range(current_frame_num):
273
+ input_tokens[:, text_len + 400 * fid] = tokenizer["<start_of_image>"]
274
+ if guider_seq is not None:
275
+ guider_input_tokens[:, guider_text_len + 400 * fid] = tokenizer[
276
+ "<start_of_image>"
277
+ ]
278
+
279
+ attention_mask = attention_mask.type_as(next(model.parameters())) # if fp16
280
+ # initialize generation
281
+ counter = context_length - 1 # Last fixed index is ``counter''
282
+ index = 0 # Next forward starting index, also the length of cache.
283
+ mems_buffers_on_GPU = False
284
+ mems_indexs = [0, 0]
285
+ mems_len = [
286
+ (400 + 74) if limited_spatial_channel_mem else 5 * 400 + 74,
287
+ 5 * 400 + 74,
288
+ ]
289
+ mems_buffers = [
290
+ torch.zeros(
291
+ args.num_layers,
292
+ batch_size,
293
+ mem_len,
294
+ args.hidden_size * 2,
295
+ dtype=next(model.parameters()).dtype,
296
+ )
297
+ for mem_len in mems_len
298
+ ]
299
+
300
+ if guider_seq is not None:
301
+ guider_attention_mask = guider_attention_mask.type_as(
302
+ next(model.parameters())
303
+ ) # if fp16
304
+ guider_mems_buffers = [
305
+ torch.zeros(
306
+ args.num_layers,
307
+ batch_size,
308
+ mem_len,
309
+ args.hidden_size * 2,
310
+ dtype=next(model.parameters()).dtype,
311
+ )
312
+ for mem_len in mems_len
313
+ ]
314
+ guider_mems_indexs = [0, 0]
315
+ guider_mems = None
316
+
317
+ torch.cuda.empty_cache()
318
+ # step-by-step generation
319
+ while counter < len(seq[0]) - 1:
320
+ # we have generated counter+1 tokens
321
+ # Now, we want to generate seq[counter + 1],
322
+ # token[:, index: counter+1] needs forwarding.
323
+ if index == 0:
324
+ group_size = (
325
+ 2
326
+ if (input_tokens.shape[0] == batch_size and not mode_stage1)
327
+ else batch_size
328
+ )
329
+
330
+ logits_all = None
331
+ for batch_idx in range(0, input_tokens.shape[0], group_size):
332
+ logits, *output_per_layers = model(
333
+ input_tokens[batch_idx : batch_idx + group_size, index:],
334
+ position_ids[..., index : counter + 1],
335
+ attention_mask, # TODO memlen
336
+ mems=mems,
337
+ text_len=text_len,
338
+ frame_len=frame_len,
339
+ counter=counter,
340
+ log_text_attention_weights=log_text_attention_weights,
341
+ enforce_no_swin=enforce_no_swin,
342
+ **kw_args,
343
+ )
344
+ logits_all = (
345
+ torch.cat((logits_all, logits), dim=0)
346
+ if logits_all is not None
347
+ else logits
348
+ )
349
+ mem_kv01 = [
350
+ [o["mem_kv"][0] for o in output_per_layers],
351
+ [o["mem_kv"][1] for o in output_per_layers],
352
+ ]
353
+ next_tokens_frame_begin_id = calc_next_tokens_frame_begin_id(
354
+ text_len, frame_len, mem_kv01[0][0].shape[1]
355
+ )
356
+ for id, mem_kv in enumerate(mem_kv01):
357
+ for layer, mem_kv_perlayer in enumerate(mem_kv):
358
+ if limited_spatial_channel_mem and id == 0:
359
+ mems_buffers[id][
360
+ layer, batch_idx : batch_idx + group_size, :text_len
361
+ ] = mem_kv_perlayer.expand(
362
+ min(group_size, input_tokens.shape[0] - batch_idx),
363
+ -1,
364
+ -1,
365
+ )[
366
+ :, :text_len
367
+ ]
368
+ mems_buffers[id][
369
+ layer,
370
+ batch_idx : batch_idx + group_size,
371
+ text_len : text_len
372
+ + mem_kv_perlayer.shape[1]
373
+ - next_tokens_frame_begin_id,
374
+ ] = mem_kv_perlayer.expand(
375
+ min(group_size, input_tokens.shape[0] - batch_idx),
376
+ -1,
377
+ -1,
378
+ )[
379
+ :, next_tokens_frame_begin_id:
380
+ ]
381
+ else:
382
+ mems_buffers[id][
383
+ layer,
384
+ batch_idx : batch_idx + group_size,
385
+ : mem_kv_perlayer.shape[1],
386
+ ] = mem_kv_perlayer.expand(
387
+ min(group_size, input_tokens.shape[0] - batch_idx),
388
+ -1,
389
+ -1,
390
+ )
391
+ mems_indexs[0], mems_indexs[1] = (
392
+ mem_kv01[0][0].shape[1],
393
+ mem_kv01[1][0].shape[1],
394
+ )
395
+ if limited_spatial_channel_mem:
396
+ mems_indexs[0] -= next_tokens_frame_begin_id - text_len
397
+
398
+ mems = [mems_buffers[id][:, :, : mems_indexs[id]] for id in range(2)]
399
+ logits = logits_all
400
+
401
+ # Guider
402
+ if guider_seq is not None:
403
+ guider_logits_all = None
404
+ for batch_idx in range(0, guider_input_tokens.shape[0], group_size):
405
+ guider_logits, *guider_output_per_layers = model(
406
+ guider_input_tokens[
407
+ batch_idx : batch_idx + group_size,
408
+ max(index - guider_index_delta, 0) :,
409
+ ],
410
+ guider_position_ids[
411
+ ...,
412
+ max(index - guider_index_delta, 0) : counter
413
+ + 1
414
+ - guider_index_delta,
415
+ ],
416
+ guider_attention_mask,
417
+ mems=guider_mems,
418
+ text_len=guider_text_len,
419
+ frame_len=frame_len,
420
+ counter=counter - guider_index_delta,
421
+ log_text_attention_weights=log_text_attention_weights,
422
+ enforce_no_swin=enforce_no_swin,
423
+ **kw_args,
424
+ )
425
+ guider_logits_all = (
426
+ torch.cat((guider_logits_all, guider_logits), dim=0)
427
+ if guider_logits_all is not None
428
+ else guider_logits
429
+ )
430
+ guider_mem_kv01 = [
431
+ [o["mem_kv"][0] for o in guider_output_per_layers],
432
+ [o["mem_kv"][1] for o in guider_output_per_layers],
433
+ ]
434
+ for id, guider_mem_kv in enumerate(guider_mem_kv01):
435
+ for layer, guider_mem_kv_perlayer in enumerate(guider_mem_kv):
436
+ if limited_spatial_channel_mem and id == 0:
437
+ guider_mems_buffers[id][
438
+ layer,
439
+ batch_idx : batch_idx + group_size,
440
+ :guider_text_len,
441
+ ] = guider_mem_kv_perlayer.expand(
442
+ min(group_size, input_tokens.shape[0] - batch_idx),
443
+ -1,
444
+ -1,
445
+ )[
446
+ :, :guider_text_len
447
+ ]
448
+ guider_next_tokens_frame_begin_id = (
449
+ calc_next_tokens_frame_begin_id(
450
+ guider_text_len,
451
+ frame_len,
452
+ guider_mem_kv_perlayer.shape[1],
453
+ )
454
+ )
455
+ guider_mems_buffers[id][
456
+ layer,
457
+ batch_idx : batch_idx + group_size,
458
+ guider_text_len : guider_text_len
459
+ + guider_mem_kv_perlayer.shape[1]
460
+ - guider_next_tokens_frame_begin_id,
461
+ ] = guider_mem_kv_perlayer.expand(
462
+ min(group_size, input_tokens.shape[0] - batch_idx),
463
+ -1,
464
+ -1,
465
+ )[
466
+ :, guider_next_tokens_frame_begin_id:
467
+ ]
468
+ else:
469
+ guider_mems_buffers[id][
470
+ layer,
471
+ batch_idx : batch_idx + group_size,
472
+ : guider_mem_kv_perlayer.shape[1],
473
+ ] = guider_mem_kv_perlayer.expand(
474
+ min(group_size, input_tokens.shape[0] - batch_idx),
475
+ -1,
476
+ -1,
477
+ )
478
+ guider_mems_indexs[0], guider_mems_indexs[1] = (
479
+ guider_mem_kv01[0][0].shape[1],
480
+ guider_mem_kv01[1][0].shape[1],
481
+ )
482
+ if limited_spatial_channel_mem:
483
+ guider_mems_indexs[0] -= (
484
+ guider_next_tokens_frame_begin_id - guider_text_len
485
+ )
486
+ guider_mems = [
487
+ guider_mems_buffers[id][:, :, : guider_mems_indexs[id]]
488
+ for id in range(2)
489
+ ]
490
+ guider_logits = guider_logits_all
491
+ else:
492
+ if not mems_buffers_on_GPU:
493
+ if not mode_stage1:
494
+ torch.cuda.empty_cache()
495
+ for idx, mem in enumerate(mems):
496
+ mems[idx] = mem.to(next(model.parameters()).device)
497
+ if guider_seq is not None:
498
+ for idx, mem in enumerate(guider_mems):
499
+ guider_mems[idx] = mem.to(next(model.parameters()).device)
500
+ else:
501
+ torch.cuda.empty_cache()
502
+ for idx, mem_buffer in enumerate(mems_buffers):
503
+ mems_buffers[idx] = mem_buffer.to(
504
+ next(model.parameters()).device
505
+ )
506
+ mems = [
507
+ mems_buffers[id][:, :, : mems_indexs[id]] for id in range(2)
508
+ ]
509
+ if guider_seq is not None:
510
+ for idx, guider_mem_buffer in enumerate(guider_mems_buffers):
511
+ guider_mems_buffers[idx] = guider_mem_buffer.to(
512
+ next(model.parameters()).device
513
+ )
514
+ guider_mems = [
515
+ guider_mems_buffers[id][:, :, : guider_mems_indexs[id]]
516
+ for id in range(2)
517
+ ]
518
+ mems_buffers_on_GPU = True
519
+
520
+ logits, *output_per_layers = model(
521
+ input_tokens[:, index:],
522
+ position_ids[..., index : counter + 1],
523
+ attention_mask, # TODO memlen
524
+ mems=mems,
525
+ text_len=text_len,
526
+ frame_len=frame_len,
527
+ counter=counter,
528
+ log_text_attention_weights=log_text_attention_weights,
529
+ enforce_no_swin=enforce_no_swin,
530
+ limited_spatial_channel_mem=limited_spatial_channel_mem,
531
+ **kw_args,
532
+ )
533
+ mem_kv0, mem_kv1 = [o["mem_kv"][0] for o in output_per_layers], [
534
+ o["mem_kv"][1] for o in output_per_layers
535
+ ]
536
+
537
+ if guider_seq is not None:
538
+ guider_logits, *guider_output_per_layers = model(
539
+ guider_input_tokens[:, max(index - guider_index_delta, 0) :],
540
+ guider_position_ids[
541
+ ...,
542
+ max(index - guider_index_delta, 0) : counter
543
+ + 1
544
+ - guider_index_delta,
545
+ ],
546
+ guider_attention_mask,
547
+ mems=guider_mems,
548
+ text_len=guider_text_len,
549
+ frame_len=frame_len,
550
+ counter=counter - guider_index_delta,
551
+ log_text_attention_weights=0,
552
+ enforce_no_swin=enforce_no_swin,
553
+ limited_spatial_channel_mem=limited_spatial_channel_mem,
554
+ **kw_args,
555
+ )
556
+ guider_mem_kv0, guider_mem_kv1 = [
557
+ o["mem_kv"][0] for o in guider_output_per_layers
558
+ ], [o["mem_kv"][1] for o in guider_output_per_layers]
559
+
560
+ if not mems_buffers_on_GPU:
561
+ torch.cuda.empty_cache()
562
+ for idx, mem_buffer in enumerate(mems_buffers):
563
+ mems_buffers[idx] = mem_buffer.to(next(model.parameters()).device)
564
+ if guider_seq is not None:
565
+ for idx, guider_mem_buffer in enumerate(guider_mems_buffers):
566
+ guider_mems_buffers[idx] = guider_mem_buffer.to(
567
+ next(model.parameters()).device
568
+ )
569
+ mems_buffers_on_GPU = True
570
+
571
+ mems, mems_indexs = my_update_mems(
572
+ [mem_kv0, mem_kv1],
573
+ mems_buffers,
574
+ mems_indexs,
575
+ limited_spatial_channel_mem,
576
+ text_len,
577
+ frame_len,
578
+ )
579
+ if guider_seq is not None:
580
+ guider_mems, guider_mems_indexs = my_update_mems(
581
+ [guider_mem_kv0, guider_mem_kv1],
582
+ guider_mems_buffers,
583
+ guider_mems_indexs,
584
+ limited_spatial_channel_mem,
585
+ guider_text_len,
586
+ frame_len,
587
+ )
588
+
589
+ counter += 1
590
+ index = counter
591
+
592
+ logits = logits[:, -1].expand(batch_size, -1) # [batch size, vocab size]
593
+ tokens = tokens.expand(batch_size, -1)
594
+ if guider_seq is not None:
595
+ guider_logits = guider_logits[:, -1].expand(batch_size, -1)
596
+ guider_tokens = guider_tokens.expand(batch_size, -1)
597
+
598
+ if seq[-1][counter].item() < 0:
599
+ # sampling
600
+ guided_logits = (
601
+ guider_logits + (logits - guider_logits) * guidance_alpha
602
+ if guider_seq is not None
603
+ else logits
604
+ )
605
+ if mode_stage1 and counter < text_len + 400:
606
+ tokens, mems = strategy.forward(guided_logits, tokens, mems)
607
+ else:
608
+ tokens, mems = strategy2.forward(guided_logits, tokens, mems)
609
+ if guider_seq is not None:
610
+ guider_tokens = torch.cat((guider_tokens, tokens[:, -1:]), dim=1)
611
+
612
+ if seq[0][counter].item() >= 0:
613
+ for si in range(seq.shape[0]):
614
+ if seq[si][counter].item() >= 0:
615
+ tokens[si, -1] = seq[si, counter]
616
+ if guider_seq is not None:
617
+ guider_tokens[si, -1] = guider_seq[
618
+ si, counter - guider_index_delta
619
+ ]
620
+
621
+ else:
622
+ tokens = torch.cat(
623
+ (
624
+ tokens,
625
+ seq[:, counter : counter + 1]
626
+ .clone()
627
+ .expand(tokens.shape[0], 1)
628
+ .to(device=tokens.device, dtype=tokens.dtype),
629
+ ),
630
+ dim=1,
631
+ )
632
+ if guider_seq is not None:
633
+ guider_tokens = torch.cat(
634
+ (
635
+ guider_tokens,
636
+ guider_seq[
637
+ :,
638
+ counter
639
+ - guider_index_delta : counter
640
+ + 1
641
+ - guider_index_delta,
642
+ ]
643
+ .clone()
644
+ .expand(guider_tokens.shape[0], 1)
645
+ .to(device=guider_tokens.device, dtype=guider_tokens.dtype),
646
+ ),
647
+ dim=1,
648
+ )
649
+
650
+ input_tokens = tokens.clone()
651
+ if guider_seq is not None:
652
+ guider_input_tokens = guider_tokens.clone()
653
+ if (index - text_len - 1) // 400 < (
654
+ input_tokens.shape[-1] - text_len - 1
655
+ ) // 400:
656
+ boi_idx = ((index - text_len - 1) // 400 + 1) * 400 + text_len
657
+ while boi_idx < input_tokens.shape[-1]:
658
+ input_tokens[:, boi_idx] = tokenizer["<start_of_image>"]
659
+ if guider_seq is not None:
660
+ guider_input_tokens[:, boi_idx - guider_index_delta] = tokenizer[
661
+ "<start_of_image>"
662
+ ]
663
+ boi_idx += 400
664
+
665
+ if strategy.is_done:
666
+ break
667
+ return strategy.finalize(tokens, mems)
668
+
669
+
670
+ class InferenceModel_Sequential(CogVideoCacheModel):
671
+ def __init__(self, args, transformer=None, parallel_output=True):
672
+ super().__init__(
673
+ args,
674
+ transformer=transformer,
675
+ parallel_output=parallel_output,
676
+ window_size=-1,
677
+ cogvideo_stage=1,
678
+ )
679
+
680
+ # TODO: check it
681
+
682
+ def final_forward(self, logits, **kwargs):
683
+ logits_parallel = logits
684
+ logits_parallel = torch.nn.functional.linear(
685
+ logits_parallel.float(),
686
+ self.transformer.word_embeddings.weight[:20000].float(),
687
+ )
688
+ return logits_parallel
689
+
690
+
691
+ class InferenceModel_Interpolate(CogVideoCacheModel):
692
+ def __init__(self, args, transformer=None, parallel_output=True):
693
+ super().__init__(
694
+ args,
695
+ transformer=transformer,
696
+ parallel_output=parallel_output,
697
+ window_size=10,
698
+ cogvideo_stage=2,
699
+ )
700
+
701
+ # TODO: check it
702
+
703
+ def final_forward(self, logits, **kwargs):
704
+ logits_parallel = logits
705
+ logits_parallel = torch.nn.functional.linear(
706
+ logits_parallel.float(),
707
+ self.transformer.word_embeddings.weight[:20000].float(),
708
+ )
709
+ return logits_parallel
710
+
711
+
712
+ def main(args):
713
+ assert int(args.stage_1) + int(args.stage_2) + int(args.both_stages) == 1
714
+ rank_id = args.device % args.parallel_size
715
+ generate_frame_num = args.generate_frame_num
716
+
717
+ if args.stage_1 or args.both_stages:
718
+ model_stage1, args = InferenceModel_Sequential.from_pretrained(
719
+ args, "cogvideo-stage1"
720
+ )
721
+ model_stage1.eval()
722
+ if args.both_stages:
723
+ model_stage1 = model_stage1.cpu()
724
+
725
+ if args.stage_2 or args.both_stages:
726
+ model_stage2, args = InferenceModel_Interpolate.from_pretrained(
727
+ args, "cogvideo-stage2"
728
+ )
729
+ model_stage2.eval()
730
+ if args.both_stages:
731
+ model_stage2 = model_stage2.cpu()
732
+
733
+ invalid_slices = [slice(tokenizer.num_image_tokens, None)]
734
+ strategy_cogview2 = CoglmStrategy(invalid_slices, temperature=1.0, top_k=16)
735
+ strategy_cogvideo = CoglmStrategy(
736
+ invalid_slices,
737
+ temperature=args.temperature,
738
+ top_k=args.top_k,
739
+ temperature2=args.coglm_temperature2,
740
+ )
741
+ if not args.stage_1:
742
+ from sr_pipeline import DirectSuperResolution
743
+
744
+ dsr_path = auto_create(
745
+ "cogview2-dsr", path=None
746
+ ) # path=os.getenv('SAT_HOME', '~/.sat_models')
747
+ dsr = DirectSuperResolution(args, dsr_path, max_bz=12, onCUDA=False)
748
+
749
+ def process_stage2(
750
+ model,
751
+ seq_text,
752
+ duration,
753
+ video_raw_text=None,
754
+ video_guidance_text="视频",
755
+ parent_given_tokens=None,
756
+ conddir=None,
757
+ outputdir=None,
758
+ gpu_rank=0,
759
+ gpu_parallel_size=1,
760
+ ):
761
+ stage2_starttime = time.time()
762
+ use_guidance = args.use_guidance_stage2
763
+ if args.both_stages:
764
+ move_start_time = time.time()
765
+ logging.debug("moving stage-2 model to cuda")
766
+ model = model.cuda()
767
+ logging.debug(
768
+ "moving in stage-2 model takes time: {:.2f}".format(
769
+ time.time() - move_start_time
770
+ )
771
+ )
772
+
773
+ try:
774
+ if parent_given_tokens is None:
775
+ assert conddir is not None
776
+ parent_given_tokens = torch.load(
777
+ os.path.join(conddir, "frame_tokens.pt"), map_location="cpu"
778
+ )
779
+ sample_num_allgpu = parent_given_tokens.shape[0]
780
+ sample_num = sample_num_allgpu // gpu_parallel_size
781
+ assert sample_num * gpu_parallel_size == sample_num_allgpu
782
+ parent_given_tokens = parent_given_tokens[
783
+ gpu_rank * sample_num : (gpu_rank + 1) * sample_num
784
+ ]
785
+ except:
786
+ logging.critical("No frame_tokens found in interpolation, skip")
787
+ return False
788
+
789
+ # CogVideo Stage2 Generation
790
+ while (
791
+ duration >= 0.5
792
+ ): # TODO: You can change the boundary to change the frame rate
793
+ parent_given_tokens_num = parent_given_tokens.shape[1]
794
+ generate_batchsize_persample = (parent_given_tokens_num - 1) // 2
795
+ generate_batchsize_total = generate_batchsize_persample * sample_num
796
+ total_frames = generate_frame_num
797
+ frame_len = 400
798
+ enc_text = tokenizer.encode(seq_text)
799
+ enc_duration = tokenizer.encode(str(float(duration)) + "秒")
800
+ seq = (
801
+ enc_duration
802
+ + [tokenizer["<n>"]]
803
+ + enc_text
804
+ + [tokenizer["<start_of_image>"]]
805
+ + [-1] * 400 * generate_frame_num
806
+ )
807
+ text_len = len(seq) - frame_len * generate_frame_num - 1
808
+
809
+ logging.info(
810
+ "[Stage2: Generating Frames, Frame Rate {:d}]\nraw text: {:s}".format(
811
+ int(4 / duration), tokenizer.decode(enc_text)
812
+ )
813
+ )
814
+
815
+ # generation
816
+ seq = (
817
+ torch.cuda.LongTensor(seq, device=args.device)
818
+ .unsqueeze(0)
819
+ .repeat(generate_batchsize_total, 1)
820
+ )
821
+ for sample_i in range(sample_num):
822
+ for i in range(generate_batchsize_persample):
823
+ seq[sample_i * generate_batchsize_persample + i][
824
+ text_len + 1 : text_len + 1 + 400
825
+ ] = parent_given_tokens[sample_i][2 * i]
826
+ seq[sample_i * generate_batchsize_persample + i][
827
+ text_len + 1 + 400 : text_len + 1 + 800
828
+ ] = parent_given_tokens[sample_i][2 * i + 1]
829
+ seq[sample_i * generate_batchsize_persample + i][
830
+ text_len + 1 + 800 : text_len + 1 + 1200
831
+ ] = parent_given_tokens[sample_i][2 * i + 2]
832
+
833
+ if use_guidance:
834
+ guider_seq = (
835
+ enc_duration
836
+ + [tokenizer["<n>"]]
837
+ + tokenizer.encode(video_guidance_text)
838
+ + [tokenizer["<start_of_image>"]]
839
+ + [-1] * 400 * generate_frame_num
840
+ )
841
+ guider_text_len = len(guider_seq) - frame_len * generate_frame_num - 1
842
+ guider_seq = (
843
+ torch.cuda.LongTensor(guider_seq, device=args.device)
844
+ .unsqueeze(0)
845
+ .repeat(generate_batchsize_total, 1)
846
+ )
847
+ for sample_i in range(sample_num):
848
+ for i in range(generate_batchsize_persample):
849
+ guider_seq[sample_i * generate_batchsize_persample + i][
850
+ text_len + 1 : text_len + 1 + 400
851
+ ] = parent_given_tokens[sample_i][2 * i]
852
+ guider_seq[sample_i * generate_batchsize_persample + i][
853
+ text_len + 1 + 400 : text_len + 1 + 800
854
+ ] = parent_given_tokens[sample_i][2 * i + 1]
855
+ guider_seq[sample_i * generate_batchsize_persample + i][
856
+ text_len + 1 + 800 : text_len + 1 + 1200
857
+ ] = parent_given_tokens[sample_i][2 * i + 2]
858
+ video_log_text_attention_weights = 0
859
+ else:
860
+ guider_seq = None
861
+ guider_text_len = 0
862
+ video_log_text_attention_weights = 1.4
863
+
864
+ mbz = args.max_inference_batch_size
865
+
866
+ assert generate_batchsize_total < mbz or generate_batchsize_total % mbz == 0
867
+ output_list = []
868
+ start_time = time.time()
869
+ for tim in range(max(generate_batchsize_total // mbz, 1)):
870
+ input_seq = (
871
+ seq[: min(generate_batchsize_total, mbz)].clone()
872
+ if tim == 0
873
+ else seq[mbz * tim : mbz * (tim + 1)].clone()
874
+ )
875
+ guider_seq2 = (
876
+ (
877
+ guider_seq[: min(generate_batchsize_total, mbz)].clone()
878
+ if tim == 0
879
+ else guider_seq[mbz * tim : mbz * (tim + 1)].clone()
880
+ )
881
+ if guider_seq is not None
882
+ else None
883
+ )
884
+ output_list.append(
885
+ my_filling_sequence(
886
+ model,
887
+ args,
888
+ input_seq,
889
+ batch_size=min(generate_batchsize_total, mbz),
890
+ get_masks_and_position_ids=get_masks_and_position_ids_stage2,
891
+ text_len=text_len,
892
+ frame_len=frame_len,
893
+ strategy=strategy_cogview2,
894
+ strategy2=strategy_cogvideo,
895
+ log_text_attention_weights=video_log_text_attention_weights,
896
+ mode_stage1=False,
897
+ guider_seq=guider_seq2,
898
+ guider_text_len=guider_text_len,
899
+ guidance_alpha=args.guidance_alpha,
900
+ limited_spatial_channel_mem=True,
901
+ )[0]
902
+ )
903
+ logging.info(
904
+ "Duration {:.2f}, Taken time {:.2f}\n".format(
905
+ duration, time.time() - start_time
906
+ )
907
+ )
908
+
909
+ output_tokens = torch.cat(output_list, dim=0)
910
+ output_tokens = output_tokens[
911
+ :, text_len + 1 : text_len + 1 + (total_frames) * 400
912
+ ].reshape(sample_num, -1, 400 * total_frames)
913
+ output_tokens_merge = torch.cat(
914
+ (
915
+ output_tokens[:, :, : 1 * 400],
916
+ output_tokens[:, :, 400 * 3 : 4 * 400],
917
+ output_tokens[:, :, 400 * 1 : 2 * 400],
918
+ output_tokens[:, :, 400 * 4 : (total_frames) * 400],
919
+ ),
920
+ dim=2,
921
+ ).reshape(sample_num, -1, 400)
922
+
923
+ output_tokens_merge = torch.cat(
924
+ (output_tokens_merge, output_tokens[:, -1:, 400 * 2 : 3 * 400]), dim=1
925
+ )
926
+ duration /= 2
927
+ parent_given_tokens = output_tokens_merge
928
+
929
+ if args.both_stages:
930
+ move_start_time = time.time()
931
+ logging.debug("moving stage 2 model to cpu")
932
+ model = model.cpu()
933
+ torch.cuda.empty_cache()
934
+ logging.debug(
935
+ "moving out model2 takes time: {:.2f}".format(
936
+ time.time() - move_start_time
937
+ )
938
+ )
939
+
940
+ logging.info(
941
+ "CogVideo Stage2 completed. Taken time {:.2f}\n".format(
942
+ time.time() - stage2_starttime
943
+ )
944
+ )
945
+
946
+ # decoding
947
+ # imgs = [torch.nn.functional.interpolate(tokenizer.decode(image_ids=seq.tolist()), size=(480, 480)) for seq in output_tokens_merge]
948
+ # os.makedirs(output_dir_full_path, exist_ok=True)
949
+ # my_save_multiple_images(imgs, output_dir_full_path,subdir="frames", debug=False)
950
+ # torch.save(output_tokens_merge.cpu(), os.path.join(output_dir_full_path, 'frame_token.pt'))
951
+ # os.system(f"gifmaker -i '{output_dir_full_path}'/frames/0*.jpg -o '{output_dir_full_path}/{str(float(duration))}_concat.gif' -d 0.2")
952
+
953
+ # direct super-resolution by CogView2
954
+ logging.info("[Direct super-resolution]")
955
+ dsr_starttime = time.time()
956
+ enc_text = tokenizer.encode(seq_text)
957
+ frame_num_per_sample = parent_given_tokens.shape[1]
958
+ parent_given_tokens_2d = parent_given_tokens.reshape(-1, 400)
959
+ text_seq = (
960
+ torch.cuda.LongTensor(enc_text, device=args.device)
961
+ .unsqueeze(0)
962
+ .repeat(parent_given_tokens_2d.shape[0], 1)
963
+ )
964
+ sred_tokens = dsr(text_seq, parent_given_tokens_2d)
965
+ decoded_sr_videos = []
966
+
967
+ for sample_i in range(sample_num):
968
+ decoded_sr_imgs = []
969
+ for frame_i in range(frame_num_per_sample):
970
+ decoded_sr_img = tokenizer.decode(
971
+ image_ids=sred_tokens[frame_i + sample_i * frame_num_per_sample][
972
+ -3600:
973
+ ]
974
+ )
975
+ decoded_sr_imgs.append(
976
+ torch.nn.functional.interpolate(decoded_sr_img, size=(480, 480))
977
+ )
978
+ decoded_sr_videos.append(decoded_sr_imgs)
979
+
980
+ for sample_i in range(sample_num):
981
+ my_save_multiple_images(
982
+ decoded_sr_videos[sample_i],
983
+ outputdir,
984
+ subdir=f"frames/{sample_i+sample_num*gpu_rank}",
985
+ debug=False,
986
+ )
987
+ os.system(
988
+ f"gifmaker -i '{outputdir}'/frames/'{sample_i+sample_num*gpu_rank}'/0*.jpg -o '{outputdir}/{sample_i+sample_num*gpu_rank}.gif' -d 0.125"
989
+ )
990
+
991
+ logging.info(
992
+ "Direct super-resolution completed. Taken time {:.2f}\n".format(
993
+ time.time() - dsr_starttime
994
+ )
995
+ )
996
+
997
+ return True
998
+
999
+ def process_stage1(
1000
+ model,
1001
+ seq_text,
1002
+ duration,
1003
+ video_raw_text=None,
1004
+ video_guidance_text="视频",
1005
+ image_text_suffix="",
1006
+ outputdir=None,
1007
+ batch_size=1,
1008
+ ):
1009
+ process_start_time = time.time()
1010
+ use_guide = args.use_guidance_stage1
1011
+ if args.both_stages:
1012
+ move_start_time = time.time()
1013
+ logging.debug("moving stage 1 model to cuda")
1014
+ model = model.cuda()
1015
+ logging.debug(
1016
+ "moving in model1 takes time: {:.2f}".format(
1017
+ time.time() - move_start_time
1018
+ )
1019
+ )
1020
+
1021
+ if video_raw_text is None:
1022
+ video_raw_text = seq_text
1023
+ mbz = (
1024
+ args.stage1_max_inference_batch_size
1025
+ if args.stage1_max_inference_batch_size > 0
1026
+ else args.max_inference_batch_size
1027
+ )
1028
+ assert batch_size < mbz or batch_size % mbz == 0
1029
+ frame_len = 400
1030
+
1031
+ # generate the first frame:
1032
+ enc_text = tokenizer.encode(seq_text + image_text_suffix)
1033
+ seq_1st = (
1034
+ enc_text + [tokenizer["<start_of_image>"]] + [-1] * 400
1035
+ ) # IV!! # test local!!! # test randboi!!!
1036
+ logging.info(
1037
+ "[Generating First Frame with CogView2]Raw text: {:s}".format(
1038
+ tokenizer.decode(enc_text)
1039
+ )
1040
+ )
1041
+ text_len_1st = len(seq_1st) - frame_len * 1 - 1
1042
+
1043
+ seq_1st = torch.cuda.LongTensor(seq_1st, device=args.device).unsqueeze(0)
1044
+ output_list_1st = []
1045
+ for tim in range(max(batch_size // mbz, 1)):
1046
+ start_time = time.time()
1047
+ output_list_1st.append(
1048
+ my_filling_sequence(
1049
+ model,
1050
+ args,
1051
+ seq_1st.clone(),
1052
+ batch_size=min(batch_size, mbz),
1053
+ get_masks_and_position_ids=get_masks_and_position_ids_stage1,
1054
+ text_len=text_len_1st,
1055
+ frame_len=frame_len,
1056
+ strategy=strategy_cogview2,
1057
+ strategy2=strategy_cogvideo,
1058
+ log_text_attention_weights=1.4,
1059
+ enforce_no_swin=True,
1060
+ mode_stage1=True,
1061
+ )[0]
1062
+ )
1063
+ logging.info(
1064
+ "[First Frame]Taken time {:.2f}\n".format(time.time() - start_time)
1065
+ )
1066
+ output_tokens_1st = torch.cat(output_list_1st, dim=0)
1067
+ given_tokens = output_tokens_1st[
1068
+ :, text_len_1st + 1 : text_len_1st + 401
1069
+ ].unsqueeze(
1070
+ 1
1071
+ ) # given_tokens.shape: [bs, frame_num, 400]
1072
+
1073
+ # generate subsequent frames:
1074
+ total_frames = generate_frame_num
1075
+ enc_duration = tokenizer.encode(str(float(duration)) + "秒")
1076
+ if use_guide:
1077
+ video_raw_text = video_raw_text + " 视频"
1078
+ enc_text_video = tokenizer.encode(video_raw_text)
1079
+ seq = (
1080
+ enc_duration
1081
+ + [tokenizer["<n>"]]
1082
+ + enc_text_video
1083
+ + [tokenizer["<start_of_image>"]]
1084
+ + [-1] * 400 * generate_frame_num
1085
+ )
1086
+ guider_seq = (
1087
+ enc_duration
1088
+ + [tokenizer["<n>"]]
1089
+ + tokenizer.encode(video_guidance_text)
1090
+ + [tokenizer["<start_of_image>"]]
1091
+ + [-1] * 400 * generate_frame_num
1092
+ )
1093
+ logging.info(
1094
+ "[Stage1: Generating Subsequent Frames, Frame Rate {:.1f}]\nraw text: {:s}".format(
1095
+ 4 / duration, tokenizer.decode(enc_text_video)
1096
+ )
1097
+ )
1098
+
1099
+ text_len = len(seq) - frame_len * generate_frame_num - 1
1100
+ guider_text_len = len(guider_seq) - frame_len * generate_frame_num - 1
1101
+ seq = (
1102
+ torch.cuda.LongTensor(seq, device=args.device)
1103
+ .unsqueeze(0)
1104
+ .repeat(batch_size, 1)
1105
+ )
1106
+ guider_seq = (
1107
+ torch.cuda.LongTensor(guider_seq, device=args.device)
1108
+ .unsqueeze(0)
1109
+ .repeat(batch_size, 1)
1110
+ )
1111
+
1112
+ for given_frame_id in range(given_tokens.shape[1]):
1113
+ seq[
1114
+ :,
1115
+ text_len
1116
+ + 1
1117
+ + given_frame_id * 400 : text_len
1118
+ + 1
1119
+ + (given_frame_id + 1) * 400,
1120
+ ] = given_tokens[:, given_frame_id]
1121
+ guider_seq[
1122
+ :,
1123
+ guider_text_len
1124
+ + 1
1125
+ + given_frame_id * 400 : guider_text_len
1126
+ + 1
1127
+ + (given_frame_id + 1) * 400,
1128
+ ] = given_tokens[:, given_frame_id]
1129
+ output_list = []
1130
+
1131
+ if use_guide:
1132
+ video_log_text_attention_weights = 0
1133
+ else:
1134
+ guider_seq = None
1135
+ video_log_text_attention_weights = 1.4
1136
+
1137
+ for tim in range(max(batch_size // mbz, 1)):
1138
+ start_time = time.time()
1139
+ input_seq = (
1140
+ seq[: min(batch_size, mbz)].clone()
1141
+ if tim == 0
1142
+ else seq[mbz * tim : mbz * (tim + 1)].clone()
1143
+ )
1144
+ guider_seq2 = (
1145
+ (
1146
+ guider_seq[: min(batch_size, mbz)].clone()
1147
+ if tim == 0
1148
+ else guider_seq[mbz * tim : mbz * (tim + 1)].clone()
1149
+ )
1150
+ if guider_seq is not None
1151
+ else None
1152
+ )
1153
+ output_list.append(
1154
+ my_filling_sequence(
1155
+ model,
1156
+ args,
1157
+ input_seq,
1158
+ batch_size=min(batch_size, mbz),
1159
+ get_masks_and_position_ids=get_masks_and_position_ids_stage1,
1160
+ text_len=text_len,
1161
+ frame_len=frame_len,
1162
+ strategy=strategy_cogview2,
1163
+ strategy2=strategy_cogvideo,
1164
+ log_text_attention_weights=video_log_text_attention_weights,
1165
+ guider_seq=guider_seq2,
1166
+ guider_text_len=guider_text_len,
1167
+ guidance_alpha=args.guidance_alpha,
1168
+ limited_spatial_channel_mem=True,
1169
+ mode_stage1=True,
1170
+ )[0]
1171
+ )
1172
+
1173
+ output_tokens = torch.cat(output_list, dim=0)[:, 1 + text_len :]
1174
+
1175
+ if args.both_stages:
1176
+ move_start_time = time.time()
1177
+ logging.debug("moving stage 1 model to cpu")
1178
+ model = model.cpu()
1179
+ torch.cuda.empty_cache()
1180
+ logging.debug(
1181
+ "moving in model1 takes time: {:.2f}".format(
1182
+ time.time() - move_start_time
1183
+ )
1184
+ )
1185
+
1186
+ # decoding
1187
+ imgs, sred_imgs, txts = [], [], []
1188
+ for seq in output_tokens:
1189
+ decoded_imgs = [
1190
+ torch.nn.functional.interpolate(
1191
+ tokenizer.decode(image_ids=seq.tolist()[i * 400 : (i + 1) * 400]),
1192
+ size=(480, 480),
1193
+ )
1194
+ for i in range(total_frames)
1195
+ ]
1196
+ imgs.append(decoded_imgs) # all decoded frames for this sample
1197
+
1198
+ assert len(imgs) == batch_size
1199
+ save_tokens = (
1200
+ output_tokens[:, : +total_frames * 400].reshape(-1, total_frames, 400).cpu()
1201
+ )
1202
+ if outputdir is not None:
1203
+ for clip_i in range(len(imgs)):
1204
+ # os.makedirs(output_dir_full_paths[clip_i], exist_ok=True)
1205
+ my_save_multiple_images(
1206
+ imgs[clip_i], outputdir, subdir=f"frames/{clip_i}", debug=False
1207
+ )
1208
+ os.system(
1209
+ f"gifmaker -i '{outputdir}'/frames/'{clip_i}'/0*.jpg -o '{outputdir}/{clip_i}.gif' -d 0.25"
1210
+ )
1211
+ torch.save(save_tokens, os.path.join(outputdir, "frame_tokens.pt"))
1212
+
1213
+ logging.info(
1214
+ "CogVideo Stage1 completed. Taken time {:.2f}\n".format(
1215
+ time.time() - process_start_time
1216
+ )
1217
+ )
1218
+
1219
+ return save_tokens
1220
+
1221
+ # ======================================================================================================
1222
+
1223
+ if args.stage_1 or args.both_stages:
1224
+ if args.input_source != "interactive":
1225
+ with open(args.input_source, "r") as fin:
1226
+ promptlist = fin.readlines()
1227
+ promptlist = [p.strip() for p in promptlist]
1228
+ else:
1229
+ promptlist = None
1230
+
1231
+ now_qi = -1
1232
+ while True:
1233
+ now_qi += 1
1234
+
1235
+ if promptlist is not None: # with input-source
1236
+ if args.multi_gpu:
1237
+ if now_qi % dist.get_world_size() != dist.get_rank():
1238
+ continue
1239
+ rk = dist.get_rank()
1240
+ else:
1241
+ rk = 0
1242
+ raw_text = promptlist[now_qi]
1243
+ raw_text = raw_text.strip()
1244
+ print(f"Working on Line No. {now_qi} on {rk}... [{raw_text}]")
1245
+ else: # interactive
1246
+ raw_text = input("\nPlease Input Query (stop to exit) >>> ")
1247
+ raw_text = raw_text.strip()
1248
+ if not raw_text:
1249
+ print("Query should not be empty!")
1250
+ continue
1251
+ if raw_text == "stop":
1252
+ return
1253
+
1254
+ try:
1255
+ path = os.path.join(args.output_path, f"{now_qi}_{raw_text}")
1256
+ parent_given_tokens = process_stage1(
1257
+ model_stage1,
1258
+ raw_text,
1259
+ duration=4.0,
1260
+ video_raw_text=raw_text,
1261
+ video_guidance_text="视频",
1262
+ image_text_suffix=" 高清摄影",
1263
+ outputdir=path if args.stage_1 else None,
1264
+ batch_size=args.batch_size,
1265
+ )
1266
+ if args.both_stages:
1267
+ process_stage2(
1268
+ model_stage2,
1269
+ raw_text,
1270
+ duration=2.0,
1271
+ video_raw_text=raw_text + " 视频",
1272
+ video_guidance_text="视频",
1273
+ parent_given_tokens=parent_given_tokens,
1274
+ outputdir=path,
1275
+ gpu_rank=0,
1276
+ gpu_parallel_size=1,
1277
+ ) # TODO: modify
1278
+ except (ValueError, FileNotFoundError) as e:
1279
+ print(e)
1280
+ continue
1281
+
1282
+ elif args.stage_2:
1283
+ sample_dirs = os.listdir(args.output_path)
1284
+ for sample in sample_dirs:
1285
+ raw_text = sample.split("_")[-1]
1286
+ path = os.path.join(args.output_path, sample, "Interp")
1287
+ parent_given_tokens = torch.load(
1288
+ os.path.join(args.output_path, sample, "frame_tokens.pt")
1289
+ )
1290
+
1291
+ process_stage2(
1292
+ raw_text,
1293
+ duration=2.0,
1294
+ video_raw_text=raw_text + " 视频",
1295
+ video_guidance_text="视频",
1296
+ parent_given_tokens=parent_given_tokens,
1297
+ outputdir=path,
1298
+ gpu_rank=0,
1299
+ gpu_parallel_size=1,
1300
+ ) # TODO: modify
1301
+
1302
+ else:
1303
+ assert False
1304
+
1305
+
1306
+ if __name__ == "__main__":
1307
+ logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
1308
+
1309
+ py_parser = argparse.ArgumentParser(add_help=False)
1310
+ py_parser.add_argument("--generate-frame-num", type=int, default=5)
1311
+ py_parser.add_argument("--coglm-temperature2", type=float, default=0.89)
1312
+ # py_parser.add_argument("--interp-duration", type=float, default=-1) # -1是顺序生成,0是超分,0.5/1/2是插帧
1313
+ # py_parser.add_argument("--total-duration", type=float, default=4.0) # 整个的时间
1314
+ py_parser.add_argument("--use-guidance-stage1", action="store_true")
1315
+ py_parser.add_argument("--use-guidance-stage2", action="store_true")
1316
+ py_parser.add_argument("--guidance-alpha", type=float, default=3.0)
1317
+ py_parser.add_argument(
1318
+ "--stage-1", action="store_true"
1319
+ ) # stage 1: sequential generation
1320
+ py_parser.add_argument("--stage-2", action="store_true") # stage 2: interp + dsr
1321
+ py_parser.add_argument(
1322
+ "--both-stages", action="store_true"
1323
+ ) # stage 1&2: sequential generation; interp + dsr
1324
+ py_parser.add_argument("--parallel-size", type=int, default=1)
1325
+ py_parser.add_argument(
1326
+ "--stage1-max-inference-batch-size", type=int, default=-1
1327
+ ) # -1: use max-inference-batch-size
1328
+ py_parser.add_argument("--multi-gpu", action="store_true")
1329
+
1330
+ CogVideoCacheModel.add_model_specific_args(py_parser)
1331
+
1332
+ known, args_list = py_parser.parse_known_args()
1333
+ args = get_args(args_list)
1334
+ args = argparse.Namespace(**vars(args), **vars(known))
1335
+ args.layout = [int(x) for x in args.layout.split(",")]
1336
+ args.do_train = False
1337
+
1338
+ torch.cuda.set_device(args.device)
1339
+
1340
+ with torch.no_grad():
1341
+ main(args)
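As a quick sanity check of the stage-1 layout defined earlier in this file, the sketch below calls get_masks_and_position_ids_stage1 on a dummy sequence: text tokens receive positions 0..text_len-1, frame tokens receive positions starting at 512, and the mask blocks text-to-frame attention while keeping the frame block causal. The tiny text_len / frame_len values are illustrative only (the real pipeline uses frame_len = 400) and the import path is an assumption based on this repository layout.

import torch
# Assumed import path for this repository layout.
from videogen_hub.pipelines.cogvideo.cogvideo_src.cogvideo_pipeline import (
    get_masks_and_position_ids_stage1,
)

text_len, frame_len = 4, 6
seq = torch.zeros(1, text_len + frame_len, dtype=torch.long)

tokens, attn, pos = get_masks_and_position_ids_stage1(seq, text_len, frame_len)

assert pos[0, :text_len].tolist() == [0, 1, 2, 3]                       # text positions
assert pos[0, text_len:].tolist() == list(range(512, 512 + frame_len))  # frame positions start at 512
assert attn.shape == (1, 1, text_len + frame_len, text_len + frame_len)
assert attn[0, 0, 0, text_len] == 0         # text tokens never attend to frame tokens
assert attn[0, 0, text_len, text_len] == 1  # frame block is lower-triangular (causal)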
src/videogen_hub/pipelines/cogvideo/cogvideo_src/models/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/cogvideo/cogvideo_src/models/cogvideo_cache_model.py ADDED
@@ -0,0 +1,695 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : cogvideo_cache_model.py
4
+ @Time : 2022/07/15 11:22:19
5
+ @Author : Wenyi Hong
6
+ @Version : 1.0
7
+ @Contact : [email protected]
8
+ '''
9
+
10
+ # here put the import lib
11
+
12
+ from multiprocessing import context
13
+ from tkinter import E
14
+ import torch
15
+ from SwissArmyTransformer.model.base_model import BaseModel, BaseMixin
16
+
17
+ from SwissArmyTransformer.mpu.utils import split_tensor_along_last_dim
18
+ from SwissArmyTransformer.model.transformer import unscaled_init_method
19
+ from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear
20
+ import torch.nn.functional as F
21
+ from deepspeed.runtime.activation_checkpointing.checkpointing import get_cuda_rng_tracker
22
+ import math
23
+
24
+
25
+ class PositionEmbeddingMixin(BaseMixin):
26
+ def __init__(self, additional_sequence_length, hidden_size,
27
+ init_method_std=0.02, reinit_slice=slice(512, 912),
28
+ ):
29
+ super(PositionEmbeddingMixin, self).__init__()
30
+ self.reinit_slice = reinit_slice
31
+ self.position_embeddings = torch.nn.Embedding(additional_sequence_length, hidden_size)
32
+ torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)
33
+
34
+ def reinit(self, parent_model=None):
35
+ old_weights = self.transformer.position_embeddings.weight.data[self.reinit_slice]
36
+ old_len, hidden_size = old_weights.shape
37
+ assert hidden_size == self.position_embeddings.weight.shape[-1]
38
+ self.position_embeddings.weight.data.view(-1, old_len, hidden_size).copy_(old_weights)
39
+
40
+
41
+ def window_partition(x, window_size):
42
+ """
43
+ Args:
44
+ x: (B, framenum, H, W, C)
45
+ window_size (int): window size
46
+ Returns:
47
+ windows: (num_windows*B, frame_num, window_size, window_size, C)
48
+ """
49
+ B, framenum, H, W, C = x.shape
50
+ x = x.view(B, framenum, H // window_size, window_size, W // window_size, window_size, C)
51
+ windows = x.permute(0, 2, 4, 1, 3, 5, 6).contiguous().view(-1, framenum, window_size, window_size, C)
52
+ return windows
53
+
54
+ def window_reverse(windows, window_size, H, W):
55
+ """
56
+ Args:
57
+ windows: (num_windows*B, frame_num, window_size, window_size, C)
58
+ window_size (int): Window size
59
+ H (int): Height of image
60
+ W (int): Width of image
61
+ Returns:
62
+ x: (B, frame_num, H, W, C)
63
+ """
64
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
65
+ framenum = windows.shape[1]
66
+ x = windows.view(B, H // window_size, W // window_size, framenum, window_size, window_size, -1)
67
+ x = x.permute(0, 3, 1, 4, 2, 5, 6).contiguous().view(B, framenum, H, W, -1)
68
+ return x
69
+
70
+ class WindowAttentionMixin(BaseMixin):
71
+ def __init__(self, num_layers,
72
+ hidden_size,
73
+ frame_resolution,
74
+ window_size,
75
+ shift_size,
76
+ n_head,
77
+ frame_num,
78
+ init_method=unscaled_init_method(0.02),
79
+ output_layer_init_method=unscaled_init_method(0.02),
80
+ time_dim_attend_length=0
81
+ ):
82
+ super(WindowAttentionMixin, self).__init__()
83
+ self.num_layers = num_layers # replace attention in the LAST n layers
84
+ self.query_key_value = torch.nn.ModuleList(
85
+ [ColumnParallelLinear(hidden_size, 3*hidden_size,stride=3,
86
+ gather_output=False,init_method=init_method)
87
+ for layer_id in range(num_layers)
88
+ ])
89
+ self.dense = torch.nn.ModuleList(
90
+ [RowParallelLinear(
91
+ hidden_size,
92
+ hidden_size,
93
+ input_is_parallel=True,
94
+ init_method=output_layer_init_method,
95
+ bias=True,
96
+ module=self,
97
+ name="dense")
98
+ for layer_id in range(num_layers)
99
+ ])
100
+
101
+ self.n_head = n_head
102
+ self.window_size = window_size
103
+ self.frame_resolution = frame_resolution
104
+ self.frame_len = frame_resolution * frame_resolution
105
+ self.time_dim_attend_length = time_dim_attend_length
106
+ assert frame_resolution % window_size == 0
107
+ assert 0 < shift_size < window_size
108
+ nW = (self.frame_resolution // self.window_size) ** 2
109
+ ws_squre = self.window_size * self.window_size
110
+
111
+ # odd non-shift, even shift
112
+ img_mask = torch.zeros((1, 1, frame_resolution, frame_resolution, 1))
113
+ h_slices = (slice(0, -shift_size),
114
+ slice(-shift_size, None))
115
+ w_slices = (slice(0, -shift_size),
116
+ slice(-shift_size, None))
117
+ cnt = 0
118
+ for h in h_slices:
119
+ for w in w_slices:
120
+ img_mask[:, :, h, w, :] = cnt
121
+ cnt += 1
122
+ mask_windows = window_partition(img_mask, self.window_size) # nW, 1, window_size, window_size, 1
123
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
124
+ sub_attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) #[nW, self.window_size * self.window_size, self.window_size * self.window_size]
125
+ sub_attn_mask = sub_attn_mask.masked_fill(sub_attn_mask != 0, float(0.0)).masked_fill(sub_attn_mask == 0, float(1.00))
126
+ attn_mask = sub_attn_mask.repeat(1, frame_num, frame_num)
127
+ attn_mask = attn_mask.tril()
128
+
129
+ causal_mask = torch.ones(ws_squre*frame_num, ws_squre*frame_num)
130
+ causal_mask = causal_mask.tril()
131
+
132
+ self.shift_sizes = [0, shift_size]
133
+ self.attn_mask = attn_mask
134
+ self.causal_mask = causal_mask
135
+ self.mask_initialized = False
136
+
137
+ self.attn_distribution = torch.nn.ParameterList([
138
+ torch.nn.Parameter(torch.zeros(hidden_size))
139
+ for _ in range(num_layers)
140
+ ])
141
+
142
+ def reinit(self, *pre_mixins):
143
+ start_layer = len(self.transformer.layers) - self.num_layers
144
+ assert start_layer >= 0
145
+ for layer_id in range(self.num_layers):
146
+ old_attention = self.transformer.layers[start_layer + layer_id].attention
147
+ self.query_key_value[layer_id].weight.data.copy_(old_attention.query_key_value.weight.data)
148
+ self.query_key_value[layer_id].bias.data.copy_(old_attention.query_key_value.bias.data)
149
+
150
+ def attention_extra_NAR_inference(self, frame_hidden_state, layer_id, attn_dropout=None, memkv_text=None, stage=1):
151
+ # frame_hidden_state [batchsize, frame_num*frame_size, n_head*hiddensize_perhead]
152
+ if not self.mask_initialized:
153
+ self.attn_mask = self.attn_mask.to(device=frame_hidden_state.device, dtype=frame_hidden_state.dtype)
154
+ self.causal_mask = self.causal_mask.to(device=frame_hidden_state.device, dtype=frame_hidden_state.dtype)
155
+ self.mask_initialized = True
156
+ b0, s1, h0 = frame_hidden_state.shape
157
+ h = h0 // self.n_head
158
+ frame_len = self.frame_resolution * self.frame_resolution
159
+ frame_num = s1 // frame_len
160
+ if stage == 2:
161
+ assert frame_num == 3
162
+ assert frame_num*frame_len == s1
163
+ wind_square = self.window_size * self.window_size
164
+ nW = frame_len // wind_square
165
+ bswin = b0 * nW
166
+
167
+ if memkv_text is not None:
168
+ s0 = memkv_text.shape[-2]
169
+ k_text = memkv_text[..., :h0].expand(b0, -1, -1).reshape(b0, s0, self.n_head, h).permute(0, 2, 1, 3)
170
+ v_text = memkv_text[..., h0:].expand(b0, -1, -1).reshape(b0, s0, self.n_head, h).permute(0, 2, 1, 3)
171
+
172
+ # shift
173
+ frame_hidden_state = frame_hidden_state.reshape(b0, frame_num, self.frame_resolution, self.frame_resolution, h0)
174
+ if self.shift_sizes[layer_id%2] > 0:
175
+ frame_hidden_state = torch.roll(frame_hidden_state, shifts=(-self.shift_sizes[layer_id%2], -self.shift_sizes[layer_id%2]), dims=(2,3))
176
+ # window partition
177
+ frame_hidden_state = window_partition(frame_hidden_state, self.window_size).reshape(bswin, frame_num*wind_square, h0)
178
+ qkv = self.query_key_value[layer_id](frame_hidden_state).reshape(bswin, frame_num*wind_square, 3, self.n_head, h)\
179
+ .permute(2, 0, 3, 1, 4) #[3, bswin, n_head, frame_num*wind_size*wind_size, h]
180
+ q, k, v = qkv[0], qkv[1], qkv[2]
181
+ attn = torch.matmul(q / math.sqrt(h), k.transpose(-1, -2))
182
+
183
+ if stage == 1:
184
+ if self.shift_sizes[layer_id%2] > 0:
185
+ attn = torch.mul(attn.view(bswin // nW, nW, self.n_head, frame_num*wind_square, frame_num*wind_square),
186
+ self.attn_mask[:,:frame_num*wind_square, :frame_num*wind_square].unsqueeze(1).unsqueeze(0))\
187
+ - 10000.0 * (1.0 - self.attn_mask[:,:frame_num*wind_square, :frame_num*wind_square].unsqueeze(1).unsqueeze(0))
188
+ attn = attn.view(bswin, self.n_head, frame_num*wind_square, frame_num*wind_square)
189
+ else:
190
+ attn = torch.mul(attn, self.causal_mask[:frame_num*wind_square, :frame_num*wind_square].unsqueeze(0).unsqueeze(0))\
191
+ - 10000.0 * (1.0 - self.causal_mask[:frame_num*wind_square, :frame_num*wind_square].unsqueeze(0).unsqueeze(0))
192
+
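+ # Masking is applied additively: the scores are multiplied by the 0/1 mask and
+ # 10000 is subtracted wherever the mask is 0, pushing those logits far negative
+ # so they contribute (almost) nothing after the softmax below.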
193
+ if memkv_text is None:
194
+ attn = F.softmax(attn, dim=-1)
195
+ if attn_dropout is not None:
196
+ with get_cuda_rng_tracker().fork():
197
+ attn = attn_dropout(attn)
198
+ context_swin = torch.matmul(attn, v).permute(0, 2, 1, 3).reshape(bswin, frame_num, self.window_size, self.window_size, h0)
199
+ else:
200
+ attn_frame2text = torch.matmul(q.reshape(b0, -1, self.n_head, frame_num*wind_square, h) / math.sqrt(h), k_text.unsqueeze(1).transpose(-1, -2))
201
+ attn_frame2text = attn_frame2text.reshape(bswin, self.n_head, frame_num*wind_square, s0)
202
+ attn = torch.cat((attn, attn_frame2text), dim=-1)
203
+ attn = F.softmax(attn, dim=-1)
204
+
205
+ if attn_dropout is not None:
206
+ with get_cuda_rng_tracker().fork():
207
+ attn = attn_dropout(attn)
208
+
209
+ context_swin = (torch.matmul(attn[..., :-s0], v) +
210
+ torch.matmul(attn[..., -s0:].reshape(b0, -1, self.n_head,frame_num*wind_square, s0), v_text.unsqueeze(1))\
211
+ .reshape(bswin, self.n_head, frame_num*wind_square, h))\
212
+ .permute(0, 2, 1, 3).reshape(bswin, frame_num, self.window_size, self.window_size, h0)
213
+
214
+ context_swin = window_reverse(context_swin, self.window_size, self.frame_resolution, self.frame_resolution)
215
+
216
+ # reverse cycle shift
217
+ if self.shift_sizes[layer_id%2] > 0:
218
+ context_swin = torch.roll(context_swin, shifts=(self.shift_sizes[layer_id%2], self.shift_sizes[layer_id%2]), dims=(2,3))
219
+ ret_context = context_swin.reshape(b0, s1, h0)
220
+
221
+ # for mem
222
+ memk = k.permute(0, 2, 1, 3).reshape(bswin, frame_num, self.window_size, self.window_size, h0)
223
+ memv = v.permute(0, 2, 1, 3).reshape(bswin, frame_num, self.window_size, self.window_size, h0)
224
+ memk = window_reverse(memk, self.window_size, self.frame_resolution, self.frame_resolution)
225
+ memv = window_reverse(memv, self.window_size, self.frame_resolution, self.frame_resolution)
226
+ if self.shift_sizes[layer_id%2] > 0:
227
+ memk = torch.roll(memk, shifts=(self.shift_sizes[layer_id%2], self.shift_sizes[layer_id%2]), dims=(2,3))
228
+ memv = torch.roll(memv, shifts=(self.shift_sizes[layer_id%2], self.shift_sizes[layer_id%2]), dims=(2,3))
229
+ memk, memv = memk.reshape(b0, s1, h0), memv.reshape(b0, s1, h0)
230
+
231
+ ret_mem = torch.cat((memk, memv), dim=-1)
232
+ return ret_context, ret_mem
233
+
234
+ def attention_extra_AR_inference(self, frame_hidden_state, memkv, pos, layer_id, log_text_attention_weights=0, attn_dropout=None, memkv_text=None, stage=1):
235
+ # frame_hidden_state [batchsize, 1, n_head*hiddensize_perhead]
236
+ # memkv [batchsize, pos, hidden_size*2] (include frames only)
237
+ # if memkv_text is not None: will attend to text
238
+ # pos: token's pos
239
+ b0, sin, h0 = frame_hidden_state.shape
240
+ h = h0 // self.n_head
241
+ assert sin == 1
242
+ this_qkv = self.query_key_value[layer_id](frame_hidden_state)
243
+ thisq, thisk, thisv = this_qkv[..., :h0], this_qkv[..., h0:2*h0], this_qkv[..., 2*h0:]
244
+ s1 = memkv.shape[1] if memkv is not None else 0
245
+ frame_len = self.frame_resolution * self.frame_resolution
246
+ frame_num_before = s1 // frame_len
247
+
248
+
249
+ if memkv is not None:
250
+ pos_inframe = pos - frame_num_before * frame_len
251
+
252
+ xpos = pos_inframe // self.frame_resolution # pos = xpos*self.frame_resolution + ypos
253
+ ypos = pos_inframe % self.frame_resolution
254
+ # [start, end)
255
+ if self.shift_sizes[layer_id%2] > 0:
256
+ xstart = ((xpos+self.shift_sizes[layer_id%2]) // self.window_size) * self.window_size - self.shift_sizes[layer_id%2]
257
+ ystart = ((ypos+self.shift_sizes[layer_id%2]) // self.window_size) * self.window_size - self.shift_sizes[layer_id%2]
258
+ xend = xstart + self.window_size
259
+ yend = ystart + self.window_size
260
+ xstart, ystart = max(0, xstart), max(0, ystart)
261
+ xend, yend = min(xend, self.frame_resolution), min(yend, self.frame_resolution)
262
+ else:
263
+ xstart = (xpos // self.window_size) * self.window_size
264
+ ystart = (ypos // self.window_size) * self.window_size
265
+ xend, yend = xstart + self.window_size, ystart+self.window_size
266
+
267
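+ # During single-token decoding the code gathers, via index_select, only the cached
+ # key/value rows this token may attend to: the same (possibly shifted) window in a
+ # limited number of recent frames (controlled by time_dim_attend_length) plus the
+ # already-generated positions of that window in the current frame.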
+ # select index
268
+ selected_index = list()
269
+ if frame_num_before > 0:
270
+ # frames before
271
+ frame_attended_start = max(0, frame_num_before-self.time_dim_attend_length+1) if self.time_dim_attend_length > 0 else 0
272
+ for x in range(xstart, xend):
273
+ for y in range(ystart, yend):
274
+ selected_index.append(x*self.frame_resolution+y+frame_len*frame_attended_start)
275
+ cnt_per_frame = len(selected_index)
276
+ for _ in range((frame_num_before-frame_attended_start-1)*cnt_per_frame):
277
+ selected_index.append(selected_index[-cnt_per_frame]+frame_len)
278
+
279
+ # the last frame
280
+ for x in range(xstart, xend):
281
+ for y in range(ystart, yend):
282
+ tmppos = x*self.frame_resolution+y + frame_num_before * frame_len
283
+ if tmppos < pos:
284
+ selected_index.append(tmppos)
285
+ else:
286
+ break
287
+ cnt_all = len(selected_index)+1
288
+ selected_index = torch.tensor(selected_index, device=memkv.device)
289
+ used_memkv = torch.index_select(memkv, 1, selected_index)
290
+ used_k, used_v = used_memkv[..., :h0], used_memkv[..., h0:]
291
+ used_k = torch.cat((used_k.expand(thisk.shape[0], -1, -1), thisk), dim=-2)
292
+ used_v = torch.cat((used_v.expand(thisv.shape[0], -1, -1), thisv), dim=-2)
293
+ if memkv_text is not None:
294
+ cnt_all += memkv_text.shape[-2]
295
+ used_k = torch.cat((memkv_text[..., :h0].expand(thisk.shape[0], -1, -1), used_k), dim=-2)
296
+ used_v = torch.cat((memkv_text[..., h0:].expand(thisv.shape[0], -1, -1), used_v), dim=-2)
297
+ used_k = used_k.reshape(b0, cnt_all, self.n_head, h).permute(0, 2, 1, 3)
298
+ used_v = used_v.reshape(b0, cnt_all, self.n_head, h).permute(0, 2, 1, 3)
299
+ else:
300
+ used_k = thisk
301
+ used_v = thisv
302
+
303
+ if memkv_text is not None:
304
+ used_k = torch.cat((memkv_text[..., :h0].expand(thisk.shape[0], -1, -1), used_k), dim=-2)
305
+ used_v = torch.cat((memkv_text[..., h0:].expand(thisv.shape[0], -1, -1), used_v), dim=-2)
306
+ used_k = used_k.reshape(b0, 1+memkv_text.shape[-2], self.n_head, h).permute(0, 2, 1, 3)
307
+ used_v = used_v.reshape(b0, 1+memkv_text.shape[-2], self.n_head, h).permute(0, 2, 1, 3)
308
+ else:
309
+ used_k = used_k.reshape(b0, 1, self.n_head, h).permute(0, 2, 1, 3)
310
+ used_v = used_v.reshape(b0, 1, self.n_head, h).permute(0, 2, 1, 3)
311
+
312
+ thisq = thisq.reshape(b0, 1, self.n_head, h).permute(0, 2, 1, 3) # [b0, n_head, 1, h]
313
+ attn = torch.matmul(thisq / math.sqrt(h), used_k.transpose(-1, -2))
314
+ if memkv_text is not None:
315
+ attn[..., :memkv_text.shape[-2]] += log_text_attention_weights
316
+ attn = F.softmax(attn, dim=-1)
317
+ context_swin = torch.matmul(attn, used_v).permute(0, 2, 1, 3).reshape(b0, 1, h0)
318
+
319
+ return context_swin, this_qkv[..., h0:]
320
+
321
+ class FullAttentionMixin(BaseMixin):
322
+ def __init__(self, num_layers,
323
+ hidden_size,
324
+ frame_resolution,
325
+ n_head,
326
+ frame_num,
327
+ init_method=unscaled_init_method(0.02),
328
+ output_layer_init_method=unscaled_init_method(0.02),
329
+ **kwargs,
330
+ ):
331
+ super(FullAttentionMixin, self).__init__()
332
+ self.num_layers = num_layers # replace attention in the LAST n layers
333
+ self.query_key_value = torch.nn.ModuleList(
334
+ [ColumnParallelLinear(hidden_size, 3*hidden_size,stride=3,
335
+ gather_output=False,init_method=init_method)
336
+ for layer_id in range(num_layers)
337
+ ])
338
+ self.dense = torch.nn.ModuleList(
339
+ [RowParallelLinear(
340
+ hidden_size,
341
+ hidden_size,
342
+ input_is_parallel=True,
343
+ init_method=output_layer_init_method,
344
+ bias=True,
345
+ module=self,
346
+ name="dense")
347
+ for layer_id in range(num_layers)
348
+ ])
349
+
350
+ self.n_head = n_head
351
+ self.frame_resolution = frame_resolution
352
+ self.frame_len = frame_resolution * frame_resolution
353
+
354
+ self.attn_distribution = torch.nn.ParameterList([
355
+ torch.nn.Parameter(torch.zeros(hidden_size))
356
+ for _ in range(num_layers)
357
+ ])
358
+
359
+ def reinit(self, *pre_mixins):
360
+ start_layer = len(self.transformer.layers) - self.num_layers
361
+ assert start_layer >= 0
362
+ for layer_id in range(self.num_layers):
363
+ old_attention = self.transformer.layers[start_layer + layer_id].attention
364
+ self.query_key_value[layer_id].weight.data.copy_(old_attention.query_key_value.weight.data)
365
+ self.query_key_value[layer_id].bias.data.copy_(old_attention.query_key_value.bias.data)
366
+
367
+
368
+ def attention_extra_NAR_inference(self, frame_hidden_state, layer_id, attn_dropout=None, memkv_text=None, stage=1):
369
+ # frame_hidden_state [batchsize, frame_num*frame_size, n_head*hiddensize_perhead]
370
+ assert stage == 1
371
+
372
+ b0, s1, h0 = frame_hidden_state.shape
373
+ h = h0 // self.n_head
374
+ frame_len = self.frame_resolution * self.frame_resolution
375
+ frame_num = s1 // frame_len
376
+ assert frame_num*frame_len == s1
377
+
378
+ if memkv_text is not None:
379
+ s0 = memkv_text.shape[-2]
380
+ k_text = memkv_text[..., :h0].expand(b0, -1, -1).reshape(b0, s0, self.n_head, h).permute(0, 2, 1, 3)
381
+ v_text = memkv_text[..., h0:].expand(b0, -1, -1).reshape(b0, s0, self.n_head, h).permute(0, 2, 1, 3)
382
+ qkv = self.query_key_value[layer_id](frame_hidden_state).reshape(b0, s1, 3, self.n_head, h)\
383
+ .permute(2, 0, 3, 1, 4) #[3, b0, n_head, s1, h]
384
+ q, k, v = qkv[0], qkv[1], qkv[2]
385
+ attn = torch.matmul(q / math.sqrt(h), k.transpose(-1, -2))
386
+ attn = attn - 10000.0 * (1.0-torch.ones(b0, self.n_head, s1, s1, device=attn.device, dtype=attn.dtype).tril())
387
+
388
+ if memkv_text is None:
389
+ attn = F.softmax(attn, dim=-1)
390
+ if attn_dropout is not None:
391
+ with get_cuda_rng_tracker().fork():
392
+ attn = attn_dropout(attn)
393
+ context_swin = torch.matmul(attn, v).permute(0, 2, 1, 3).reshape(b0, s1, h0)
394
+ else:
395
+ attn_frame2text = torch.matmul(q / math.sqrt(h), k_text.transpose(-1, -2)) #[b0, s1, s0]
396
+ attn = torch.cat((attn, attn_frame2text), dim=-1)
397
+ attn = F.softmax(attn, dim=-1)
398
+ if attn_dropout is not None:
399
+ with get_cuda_rng_tracker().fork():
400
+ attn = attn_dropout(attn)
401
+ context_swin = (torch.matmul(attn[..., :-s0], v) + torch.matmul(attn[..., -s0:], v_text))\
402
+ .permute(0, 2, 1, 3).reshape(b0, s1, h0)
403
+
404
+ # for mem
405
+ memk = k.permute(0, 2, 1, 3).reshape(b0, s1, h0)
406
+ memv = v.permute(0, 2, 1, 3).reshape(b0, s1, h0)
407
+ ret_mem = torch.cat((memk, memv), dim=-1)
408
+
409
+ return context_swin, ret_mem
410
+
411
+ def attention_extra_AR_inference(self, frame_hidden_state, memkv, pos, layer_id, log_text_attention_weights=0, attn_dropout=None, memkv_text=None, stage=1):
412
+ # pos: current token's pos
413
+ b0, sin, h0 = frame_hidden_state.shape
414
+ h = h0 // self.n_head
415
+ assert sin == 1
416
+ assert stage == 1
417
+
418
+ this_qkv = self.query_key_value[layer_id](frame_hidden_state)
419
+ thisq, thisk, thisv = this_qkv[..., :h0], this_qkv[..., h0:2*h0], this_qkv[..., 2*h0:]
420
+
421
+ if memkv is not None:
422
+ used_k, used_v = memkv[..., :h0], memkv[..., h0:]
423
+ used_k = torch.cat((used_k.expand(thisk.shape[0], -1, -1), thisk), dim=-2)
424
+ used_v = torch.cat((used_v.expand(thisv.shape[0], -1, -1), thisv), dim=-2)
425
+ else:
426
+ used_k, used_v = thisk, thisv
427
+
428
+ if memkv_text is not None:
429
+ used_k = torch.cat((memkv_text[..., :h0].expand(thisk.shape[0], -1, -1), used_k), dim=-2)
430
+ used_v = torch.cat((memkv_text[..., h0:].expand(thisv.shape[0], -1, -1), used_v), dim=-2)
431
+
432
+ used_k = used_k.reshape(b0, -1, self.n_head, h).permute(0, 2, 1, 3)
433
+ used_v = used_v.reshape(b0, -1, self.n_head, h).permute(0, 2, 1, 3)
434
+ thisq = thisq.reshape(b0, 1, self.n_head, h).permute(0, 2, 1, 3) # [b0, n_head, 1, h]
435
+ attn = torch.matmul(thisq / math.sqrt(h), used_k.transpose(-1, -2))
436
+ if memkv_text is not None:
437
+ attn[..., :memkv_text.shape[-2]] += log_text_attention_weights
438
+ attn = F.softmax(attn, dim=-1)
439
+
440
+ context_swin = torch.matmul(attn, used_v).permute(0, 2, 1, 3).reshape(b0, 1, h0)
441
+
442
+ return context_swin, this_qkv[..., h0:]
443
+
444
+
445
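+ # The helpers below factor full attention into cheaper pieces: text tokens attend
+ # only to the text prefix, while frame tokens attend to the text prefix plus the
+ # tokens of their own frame ("local frame" attention). The *_NAR variant scores a
+ # whole sequence at once; the *_AR variant scores one new token against cached
+ # keys/values.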
+ def attention_localframe_and_text_NAR(q0, k0, v0, attention_mask,
446
+ n_head, text_len, frame_len, frame_num,
447
+ attention_dropout=None, log_text_attention_weights=0, stage=1, **kwargs):
448
+ b, s0, h0 = q0.shape
449
+ s1 = s0 - text_len
450
+ h = h0 // n_head
451
+ assert q0.shape[1] == v0.shape[1] == k0.shape[1] == text_len+frame_len*frame_num
452
+ # attention_mask.shape [4, b or 1, 1, text_len+frame_len, text_len+frame_len]
453
+ if stage == 2:
454
+ assert frame_num == 3
455
+
456
+ q0 = q0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
457
+ v0 = v0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
458
+ k0 = k0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
459
+ k0T = k0.transpose(-1, -2)
460
+
461
+ score_any2text = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T[..., :text_len])
462
+ score_any2text += log_text_attention_weights
463
+ score_any2text_part1 = torch.mul(score_any2text[..., :text_len, :], attention_mask[..., :text_len, :text_len]) \
464
+ - 10000.0 * (1.0 - attention_mask[..., :text_len, :text_len])
465
+ # context for text
466
+ attention_probs_text = F.softmax(score_any2text_part1, dim=-1)
467
+ if attention_dropout is not None:
468
+ with get_cuda_rng_tracker().fork():
469
+ attention_probs_text = attention_dropout(attention_probs_text)
470
+ context_text2text = torch.matmul(attention_probs_text, v0[..., :text_len, :])
471
+ context_text2text = context_text2text.transpose(1, 2).reshape(b, text_len, h0)
472
+
473
+ if frame_num > 0:
474
+ score_any2text_part2 = score_any2text[..., text_len:, :]
475
+
476
+ # score: frame local
477
+ q0_frame = q0[:, :, text_len:].reshape(b, n_head, frame_num, frame_len, h)
478
+ v0_frame = v0[:, :, text_len:].reshape(b, n_head, frame_num, frame_len, h)
479
+ k0T_frame = k0[:, :, text_len:].reshape(b, n_head, frame_num, frame_len, h).transpose(-1, -2)
480
+ score_frame_local0 = torch.matmul(q0_frame / math.sqrt(q0_frame.shape[-1]), k0T_frame)
481
+ if stage == 1:
482
+ score_frame_local0 = torch.mul(score_frame_local0, attention_mask[..., text_len:, text_len:].unsqueeze(1)) \
483
+ - 10000.0 * (1.0 - attention_mask[..., text_len:, text_len:].unsqueeze(1))
484
+
485
+ # context for frame
486
+ score_frame_all = torch.cat((score_any2text_part2,
487
+ score_frame_local0.view(b, n_head, s1, frame_len)), dim=-1)
488
+ attention_probs_frame = F.softmax(score_frame_all, dim=-1)
489
+ if attention_dropout is not None:
490
+ with get_cuda_rng_tracker().fork():
491
+ attention_probs_frame = attention_dropout(attention_probs_frame)
492
+ context_frame2text = torch.matmul(attention_probs_frame[..., :text_len], v0[..., :text_len, :]) # [b, n_head, s1, h]
493
+ context_frame_local0 = torch.matmul(attention_probs_frame[..., text_len:text_len+frame_len].\
494
+ view(b, n_head, frame_num, frame_len, frame_len), v0_frame).view(b, n_head, s1, h)
495
+
496
+ context_frame = (context_frame2text + context_frame_local0).transpose(1, 2).reshape(b, s1, h0)
497
+ else:
498
+ context_frame = None
499
+
500
+ return context_text2text, context_frame
501
+
502
+ def attention_localframe_and_text_AR(q0, k0, v0, n_head, text_len, frame_len, frame_num,
503
+ attention_dropout=None, log_text_attention_weights=0, layer_id=None, limited_spatial_channel_mem=False, stage=1, **kwargs):
504
+ # limited_spatial_channel_mem=True means: the mems in the spatial channel consist of {mem_text, mem_current_frame}
505
+ b, s0, h0 = k0.shape
506
+ frame_num_before = (s0-text_len-1) // frame_len # frame_num == frame_num_before or frame_num == frame_num_before+1
507
+ h = h0 // n_head
508
+ assert q0.shape[1] == 1
509
+ assert v0.shape[1] == k0.shape[1]
510
+
511
+ q0 = q0.reshape(b, 1, n_head, h).permute(0, 2, 1, 3)
512
+ v0 = v0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
513
+ k0T = k0.reshape(b, s0, n_head, h).permute(0, 2, 3, 1)
514
+
515
+ if limited_spatial_channel_mem:
516
+ assert frame_num_before == 0
517
+ assert stage == 1 # not implemented for stage-2 yet
518
+ score = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T)
519
+ score[..., :text_len] += log_text_attention_weights
520
+ attention_probs_frame = F.softmax(score, dim=-1)
521
+ context_frame = torch.matmul(attention_probs_frame, v0).transpose(1, 2).reshape(b, 1, h0)
522
+
523
+ else:
524
+ score_token2text = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T[..., :text_len])
525
+ score_token2text += log_text_attention_weights
526
+ score_frame_local0 = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T[..., text_len+frame_num_before*frame_len:])
527
+ score_frame_all = torch.cat((score_token2text,
528
+ score_frame_local0), dim=-1)
529
+ attention_probs_frame = F.softmax(score_frame_all, dim=-1)
530
+
531
+ context_token2text = torch.matmul(attention_probs_frame[..., :text_len], v0[..., :text_len, :]) # [b, n_head, s1, h]
532
+ context_frame_local0 = torch.matmul(attention_probs_frame[..., text_len:], \
533
+ v0[:, :, text_len+frame_num_before*frame_len:, :])
534
+ context_frame = (context_token2text + context_frame_local0).transpose(1, 2).reshape(b, 1, h0)
535
+
536
+ return context_frame
537
+
538
+
539
+ class CogVideoCacheModel(BaseModel):
540
+ def __init__(self, args, transformer=None, parallel_output=True, window_size=None, cogvideo_stage=None):
541
+ super().__init__(args, transformer=transformer, parallel_output=parallel_output)
542
+ self.layout = args.layout # [64, 64+1024, 64+6*1024]
543
+ self.stage = cogvideo_stage if cogvideo_stage is not None else args.cogvideo_stage # 1 or 2
544
+ self.n_head = args.num_attention_heads
545
+ self.window_size = window_size if window_size is not None else args.window_size
546
+
547
+ frame_resolution = int(math.sqrt(self.layout[1]-self.layout[0]))
548
+ self.add_mixin('extra_position_embedding', PositionEmbeddingMixin(
549
+ args.additional_seqlen, args.hidden_size
550
+ ))
551
+
552
+ if self.stage == 1:
553
+ self.add_mixin('attention_plus', FullAttentionMixin(
554
+ num_layers=args.num_layers,
555
+ hidden_size=args.hidden_size,
556
+ frame_resolution=frame_resolution,
557
+ n_head=args.num_attention_heads,
558
+ frame_num=(args.layout[2]-args.layout[0])//(args.layout[1]-args.layout[0]),
559
+ ))
560
+ else:
561
+ self.add_mixin('attention_plus', WindowAttentionMixin(
562
+ num_layers=args.num_layers,
563
+ hidden_size=args.hidden_size,
564
+ frame_resolution=frame_resolution,
565
+ window_size=self.window_size,
566
+ shift_size=self.window_size//2,
567
+ n_head=args.num_attention_heads,
568
+ frame_num=(args.layout[2]-args.layout[0])//(args.layout[1]-args.layout[0]),
569
+ ))
570
+
571
+
572
+ @classmethod
573
+ def add_model_specific_args(cls, parser):
574
+ group = parser.add_argument_group('VideoSwinLocalModel', 'video swin local model configurations')
575
+ group.add_argument("--layout", type=str, default='64, 464, 2064')
576
+ group.add_argument("--window-size", type=int, default=10) # 优先级在直接参数赋值之后
577
+ group.add_argument("--additional-seqlen", type=int, default=2000)
578
+ group.add_argument("--cogvideo-stage", type=int, default=1, choices=[1,2]) # 优先级在直接参数赋值之后
579
+ return parser
580
+
581
+ def disable_untrainable_params(self):
582
+ pass
583
+
584
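+ # Positions below 512+400 (the text prefix plus the first frame) use the base
+ # transformer's position table; later positions are shifted back by 512+400 and
+ # looked up in the 'extra_position_embedding' mixin added above.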
+ def position_embedding_forward(self, position_ids, **kw_args):
585
+ if position_ids.shape[-1] > 1:
586
+ if self.stage == 1:
587
+ if position_ids[0,-1] >= (512+400):
588
+ frame_num = position_ids.shape[-1] // 400
589
+ position_embeddings = torch.cat(
590
+ (
591
+ self.transformer.position_embeddings(position_ids[..., :-400*(frame_num-1)]),
592
+ self.get_mixin('extra_position_embedding').position_embeddings(position_ids[..., -400*(frame_num-1):]-(512+400))
593
+ ),
594
+ dim=-2
595
+ )
596
+ else:
597
+ position_embeddings = self.transformer.position_embeddings(position_ids)
598
+ else:
599
+ # given 3 frames, interpolate the 2 frames in between
600
+ position_embeddings = torch.cat(
601
+ (
602
+ self.transformer.position_embeddings(position_ids[..., :-800]),
603
+ self.get_mixin('extra_position_embedding').position_embeddings(position_ids[..., -800:]-(512+400))
604
+ ),
605
+ dim=-2
606
+ )
607
+ else:
608
+ if position_ids[0, 0] >= (512+400):
609
+ position_embeddings = self.get_mixin('extra_position_embedding').position_embeddings(position_ids-(512+400))
610
+ else:
611
+ position_embeddings = self.transformer.position_embeddings(position_ids)
612
+ return position_embeddings
613
+
614
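+ # When mems is None this is the non-autoregressive pass over the full sequence,
+ # which also returns (memkv0, memkv1) caches for the base attention and for the
+ # 'attention_plus' mixin; otherwise a single token is decoded against those caches.
+ # In both branches a per-layer sigmoid gate (attn_distribution) blends the base
+ # local-frame/text attention with the mixin's swin/full attention.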
+ def attention_forward(self, hidden_states, mask, layer_id, mems=None, log_text_attention_weights=0, text_len=0, frame_len=0, counter=0, enforce_no_swin=False, limited_spatial_channel_mem=False, **kw_args):
615
+ attn_module = self.transformer.layers[layer_id].attention
616
+ hidden_size = hidden_states.shape[-1]
617
+
618
+ # base model qkv
619
+ if mems is None:
620
+ mixed_raw_layer = attn_module.query_key_value(hidden_states)
621
+ q0, k0, v0 = split_tensor_along_last_dim(mixed_raw_layer, 3)
622
+ assert (q0.shape[1]-text_len) % frame_len == 0
623
+ memkv0 = torch.cat((k0, v0), dim=-1)
624
+ context_text, context_frame_local_text = attention_localframe_and_text_NAR(
625
+ q0, k0, v0,
626
+ mask,
627
+ n_head=attn_module.num_attention_heads_per_partition,
628
+ text_len=text_len,
629
+ frame_len=frame_len,
630
+ frame_num=(q0.shape[1]-text_len)//frame_len,
631
+ log_text_attention_weights=log_text_attention_weights,
632
+ stage=self.stage
633
+ )
634
+
635
+ # change: self.swin_attend_to_text defaults to True:
636
+ memkv1_text = self.get_mixin('attention_plus').query_key_value[layer_id](hidden_states[..., :text_len, :])[..., hidden_size:]
637
+ output_text = attn_module.dense(context_text)
638
+
639
+ if (q0.shape[1]-text_len)//frame_len > 0:
640
+ assert (q0.shape[1]-text_len) % frame_len == 0
641
+ context_frame_swin, memkv1_frame = self.get_mixin('attention_plus').attention_extra_NAR_inference(
642
+ hidden_states[:,text_len:], layer_id, memkv_text=memkv1_text, stage=self.stage)
643
+ if not enforce_no_swin:
644
+ attn_distrib = torch.sigmoid(self.get_mixin('attention_plus').attn_distribution[layer_id])
645
+ attn_distrib = attn_distrib.unsqueeze(0).unsqueeze(0)
646
+ output_frame = torch.mul(attn_module.dense(context_frame_local_text), attn_distrib)\
647
+ +torch.mul(self.get_mixin('attention_plus').dense[layer_id](context_frame_swin), 1-attn_distrib)
648
+ else:
649
+ output_frame = attn_module.dense(context_frame_local_text[..., :frame_len, :])
650
+ output = torch.cat((output_text, output_frame), dim=-2)
651
+ memkv1 = torch.cat((memkv1_text, memkv1_frame), dim=-2) if memkv1_text is not None else memkv1_frame
652
+ else:
653
+ output = output_text
654
+ memkv1 = memkv1_text
655
+ kw_args['output_this_layer']['mem_kv'] = (memkv0, memkv1)
656
+
657
+
658
+ else:
659
+ mixed_raw_layer = attn_module.query_key_value(hidden_states)
660
+ q0, k0, v0 = split_tensor_along_last_dim(mixed_raw_layer, 3)
661
+ new_memkv0 = torch.cat((k0, v0), dim=-1)
662
+ old_k0, old_v0 = mems[0][layer_id][..., :hidden_size], mems[0][layer_id][..., hidden_size:]
663
+
664
+ context_frame_local_text = attention_localframe_and_text_AR(
665
+ q0,
666
+ torch.cat((old_k0.expand(k0.shape[0], -1, -1), k0), dim=-2),
667
+ torch.cat((old_v0.expand(v0.shape[0], -1, -1), v0), dim=-2),
668
+ n_head=attn_module.num_attention_heads_per_partition,
669
+ text_len=text_len,
670
+ frame_len=frame_len,
671
+ frame_num=None,
672
+ log_text_attention_weights=log_text_attention_weights,
673
+ layer_id=layer_id,
674
+ limited_spatial_channel_mem=limited_spatial_channel_mem,
675
+ )
676
+
677
+ old_memkv1 = mems[1][layer_id] if mems[1] is not None else None
678
+
679
+ context_frame_swin, new_memkv1 = self.get_mixin('attention_plus').attention_extra_AR_inference(hidden_states,
680
+ old_memkv1[..., text_len:, :] if old_memkv1.shape[-2]>text_len else None,
681
+ counter-text_len,
682
+ layer_id,
683
+ memkv_text=old_memkv1[..., :text_len, :],
684
+ log_text_attention_weights=log_text_attention_weights)
685
+ if not enforce_no_swin:
686
+ attn_distrib = torch.sigmoid(self.get_mixin('attention_plus').attn_distribution[layer_id])
687
+ attn_distrib = attn_distrib.unsqueeze(0).unsqueeze(0)
688
+ output = torch.mul(attn_module.dense(context_frame_local_text), attn_distrib)\
689
+ +torch.mul(self.get_mixin('attention_plus').dense[layer_id](context_frame_swin), 1-attn_distrib)
690
+ else:
691
+ output = attn_module.dense(context_frame_local_text)
692
+
693
+ kw_args['output_this_layer']['mem_kv'] = (new_memkv0, new_memkv1)
694
+
695
+ return output
src/videogen_hub/pipelines/cogvideo/cogvideo_src/models/cogvideo_model.py ADDED
@@ -0,0 +1,543 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : cogvideo_model.py
4
+ @Time : 2022/07/11 16:12:05
5
+ @Author : Wenyi Hong
6
+ @Version : 1.0
7
+ @Contact : [email protected]
8
+ '''
9
+
10
+ # here put the import lib
11
+
12
+ import torch
13
+ from SwissArmyTransformer.model.base_model import BaseModel, BaseMixin
14
+
15
+ from SwissArmyTransformer.mpu.utils import split_tensor_along_last_dim
16
+ from SwissArmyTransformer.model.transformer import unscaled_init_method
17
+ from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear
18
+ import torch.nn.functional as F
19
+ from deepspeed.runtime.activation_checkpointing.checkpointing import get_cuda_rng_tracker
20
+ import math
21
+
22
+ class PositionEmbeddingMixin(BaseMixin):
23
+ def __init__(self, additional_sequence_length, hidden_size,
24
+ init_method_std=0.02, reinit_slice=slice(512, 912),
25
+ ):
26
+ super(PositionEmbeddingMixin, self).__init__()
27
+ self.reinit_slice = reinit_slice
28
+ self.position_embeddings = torch.nn.Embedding(additional_sequence_length, hidden_size)
29
+ torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)
30
+
31
+ def reinit(self, parent_model=None):
32
+ old_weights = self.transformer.position_embeddings.weight.data[self.reinit_slice]
33
+ old_len, hidden_size = old_weights.shape
34
+ assert hidden_size == self.position_embeddings.weight.shape[-1]
35
+ self.position_embeddings.weight.data.view(-1, old_len, hidden_size).copy_(old_weights)
36
+
37
+ def window_partition(x, window_size):
38
+ """
39
+ Args:
40
+ x: (B, framenum, H, W, C)
41
+ window_size (int): window size
42
+ Returns:
43
+ windows: (num_windows*B, frame_num, window_size, window_size, C)
44
+ """
45
+ B, framenum, H, W, C = x.shape
46
+ x = x.view(B, framenum, H // window_size, window_size, W // window_size, window_size, C)
47
+ windows = x.permute(0, 2, 4, 1, 3, 5, 6).contiguous().view(-1, framenum, window_size, window_size, C)
48
+ return windows
49
+
50
+ def window_reverse(windows, window_size, H, W):
51
+ """
52
+ Args:
53
+ windows: (num_windows*B, frame_num, window_size, window_size, C)
54
+ window_size (int): Window size
55
+ H (int): Height of image
56
+ W (int): Width of image
57
+ Returns:
58
+ x: (B, frame_num, H, W, C)
59
+ """
60
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
61
+ framenum = windows.shape[1]
62
+ x = windows.view(B, H // window_size, W // window_size, framenum, window_size, window_size, -1)
63
+ x = x.permute(0, 3, 1, 4, 2, 5, 6).contiguous().view(B, framenum, H, W, -1)
64
+ return x
65
+
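+ # Shape example (hypothetical values): with B=2, framenum=5, H=W=32 and
+ # window_size=8 there are 4*4=16 windows per sample, so window_partition returns
+ # a (32, 5, 8, 8, C) tensor and window_reverse(windows, 8, 32, 32) restores the
+ # original (2, 5, 32, 32, C) layout.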
66
+ class WindowAttentionMixin(BaseMixin):
67
+ def __init__(self, num_layers,
68
+ hidden_size,
69
+ frame_resolution,
70
+ window_size,
71
+ shift_size,
72
+ n_head,
73
+ frame_num,
74
+ init_method=unscaled_init_method(0.02),
75
+ output_layer_init_method=unscaled_init_method(0.02),
76
+ ):
77
+ super(WindowAttentionMixin, self).__init__()
78
+ self.num_layers = num_layers # replace attention in the LAST n layers
79
+ self.query_key_value = torch.nn.ModuleList(
80
+ [ColumnParallelLinear(hidden_size, 3*hidden_size,stride=3,
81
+ gather_output=False,init_method=init_method)
82
+ for layer_id in range(num_layers)
83
+ ])
84
+ self.dense = torch.nn.ModuleList(
85
+ [RowParallelLinear(
86
+ hidden_size,
87
+ hidden_size,
88
+ input_is_parallel=True,
89
+ init_method=output_layer_init_method,
90
+ bias=True,
91
+ module=self,
92
+ name="dense",
93
+ )
94
+ for layer_id in range(num_layers)
95
+ ])
96
+
97
+ self.n_head = n_head
98
+ self.window_size = window_size
99
+ self.frame_resolution = frame_resolution
100
+ self.frame_len = frame_resolution * frame_resolution
101
+ assert frame_resolution % window_size == 0
102
+ assert 0 < shift_size < window_size
103
+ nW = (self.frame_resolution // self.window_size) ** 2
104
+ ws_squre = self.window_size * self.window_size
105
+
106
+ # odd non-shift, even shift
107
+ img_mask = torch.zeros((1, 1, frame_resolution, frame_resolution, 1))
108
+ h_slices = (slice(0, -shift_size),
109
+ slice(-shift_size, None))
110
+ w_slices = (slice(0, -shift_size),
111
+ slice(-shift_size, None))
112
+ cnt = 0
113
+ for h in h_slices:
114
+ for w in w_slices:
115
+ img_mask[:, :, h, w, :] = cnt
116
+ cnt += 1
117
+ mask_windows = window_partition(img_mask, self.window_size) # nW, 1, window_size, window_size, 1
118
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
119
+ sub_attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) #[nW, self.window_size * self.window_size, self.window_size * self.window_size]
120
+ sub_attn_mask = sub_attn_mask.masked_fill(sub_attn_mask != 0, float(0.0)).masked_fill(sub_attn_mask == 0, float(1.00))
121
+ attn_mask = sub_attn_mask.repeat(1, frame_num, frame_num)
122
+
123
+ self.attn_mask_sequential = attn_mask.clone().tril()
124
+ self.causal_mask_sequential = torch.ones(1, ws_squre*frame_num, ws_squre*frame_num).tril()
125
+
126
+ self.causal_mask_interp = torch.ones(1, ws_squre*frame_num, ws_squre*frame_num)
127
+ self.attn_mask_interp = attn_mask.clone()
128
+
129
+ # bi-dir
130
+ for bi_idx in range(0, frame_num, 2):
131
+ for uni_idx in range(1, frame_num, 2):
132
+ self.attn_mask_interp[:, bi_idx*ws_squre:(bi_idx+1)*ws_squre, uni_idx*ws_squre:(uni_idx+1)*ws_squre] = 0
133
+ self.causal_mask_interp[:, bi_idx*ws_squre:(bi_idx+1)*ws_squre, uni_idx*ws_squre:(uni_idx+1)*ws_squre] = 0
134
+ # uni-dir
135
+ for uni_idx in range(1, frame_num, 2):
136
+ self.attn_mask_interp[:, ws_squre*uni_idx:ws_squre*(uni_idx+1), ws_squre*uni_idx:ws_squre*(uni_idx+1)].tril_()
137
+ self.causal_mask_interp[:, ws_squre*uni_idx:ws_squre*(uni_idx+1), ws_squre*uni_idx:ws_squre*(uni_idx+1)].tril_()
138
+ for uni_idx2 in range(uni_idx+2, frame_num, 2):
139
+ self.attn_mask_interp[:, ws_squre*uni_idx:ws_squre*(uni_idx+1), ws_squre*uni_idx2:ws_squre*(uni_idx2+1)] = 0
140
+ self.causal_mask_interp[:, ws_squre*uni_idx:ws_squre*(uni_idx+1), ws_squre*uni_idx2:ws_squre*(uni_idx2+1)] = 0
141
+
142
+ # expand dim
143
+ self.attn_mask_sequential = self.attn_mask_sequential[None, None, :, None]
144
+ self.attn_mask_interp = self.attn_mask_interp[None, None, :, None]
145
+ self.causal_mask_sequential = self.causal_mask_sequential[None, None, :, None]
146
+ self.causal_mask_interp = self.causal_mask_interp[None, None, :, None]
147
+
148
+ self.shift_sizes = [0, shift_size]
149
+ # self.register_buffer("attn_mask", attn_mask)
150
+ # self.register_buffer("causal_mask", causal_mask)
151
+ self.mask_initialized = False
152
+
153
+ self.attn_distribution = torch.nn.ParameterList([
154
+ torch.nn.Parameter(torch.zeros(hidden_size))
155
+ for _ in range(num_layers)
156
+ ])
157
+
158
+ def reinit(self, *pre_mixins):
159
+ start_layer = len(self.transformer.layers) - self.num_layers
160
+ assert start_layer >= 0
161
+ for layer_id in range(self.num_layers):
162
+ old_attention = self.transformer.layers[start_layer + layer_id].attention
163
+ self.query_key_value[layer_id].weight.data.copy_(old_attention.query_key_value.weight.data)
164
+ self.query_key_value[layer_id].bias.data.copy_(old_attention.query_key_value.bias.data)
165
+
166
+ def attention_extra(self, frame_hidden_state, layer_id, attn_dropout, text_hidden_state=None,
167
+ text_attn_mask=None, mode_sequential=True):
168
+ # pb relax
169
+ swin_pb_relax = True
170
+ alpha = 16
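+ # This appears to be the PB-relax stabilization (as used in CogView): queries are
+ # additionally scaled by 1/alpha before the matmul, and later the per-row maximum
+ # is subtracted before scaling the scores back by alpha, keeping attention logits
+ # within fp16 range.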
171
+
172
+ # frame_hidden_state [batchsize, frame_num*frame_size, n_head*hiddensize_perhead]
173
+ if not self.mask_initialized:
174
+ self.attn_mask_sequential = self.attn_mask_sequential.to(device=frame_hidden_state.device, dtype=frame_hidden_state.dtype)
175
+ self.causal_mask_sequential = self.causal_mask_sequential.to(device=frame_hidden_state.device, dtype=frame_hidden_state.dtype)
176
+ self.attn_mask_interp = self.attn_mask_interp.to(device=frame_hidden_state.device, dtype=frame_hidden_state.dtype)
177
+ self.causal_mask_interp = self.causal_mask_interp.to(device=frame_hidden_state.device, dtype=frame_hidden_state.dtype)
178
+ self.mask_initialized = True
179
+ b0, s1, h0 = frame_hidden_state.shape
180
+ h = h0 // self.n_head
181
+ frame_len = self.frame_resolution * self.frame_resolution
182
+ frame_num = s1 // frame_len
183
+ assert frame_num*frame_len == s1
184
+ wind_square = self.window_size * self.window_size
185
+ nW = frame_len // wind_square
186
+ bswin = b0 * nW
187
+
188
+ causal_mask = self.causal_mask_sequential if mode_sequential else self.causal_mask_interp
189
+ attn_mask = self.attn_mask_sequential if mode_sequential else self.attn_mask_interp
190
+ if text_hidden_state is not None:
191
+ s0 = text_hidden_state.shape[1]
192
+ qkv_text = self.query_key_value[layer_id](text_hidden_state).reshape(b0, s0, 3, self.n_head, h).permute(2, 0, 3, 1, 4) #[3, b0, n_head, s0, h]
193
+ q_text, k_text, v_text = qkv_text[0], qkv_text[1], qkv_text[2]
194
+
195
+ # shift
196
+ frame_hidden_state = frame_hidden_state.reshape(b0, frame_num, self.frame_resolution, self.frame_resolution, h0)
197
+ if self.shift_sizes[layer_id%2] > 0:
198
+ frame_hidden_state = torch.roll(frame_hidden_state, shifts=(-self.shift_sizes[layer_id%2], -self.shift_sizes[layer_id%2]), dims=(2,3))
199
+ # window partition
200
+ frame_hidden_state = window_partition(frame_hidden_state, self.window_size).reshape(bswin, frame_num*wind_square, h0)
201
+ qkv = self.query_key_value[layer_id](frame_hidden_state).reshape(bswin, frame_num*wind_square, 3, self.n_head, h)\
202
+ .permute(2, 0, 3, 1, 4) #[3, bswin, n_head, frame_num*wind_size*wind_size, h]
203
+ q, k, v = qkv[0], qkv[1], qkv[2]
204
+
205
+ # pb-relax
206
+ if swin_pb_relax:
207
+ attn = torch.matmul(q / (math.sqrt(h)*alpha), k.transpose(-1, -2))
208
+ else:
209
+ attn = torch.matmul(q / math.sqrt(h), k.transpose(-1, -2))
210
+
211
+ if self.shift_sizes[layer_id%2] > 0:
212
+ # attn = attn.view(bswin // nW, nW, self.n_head, frame_num*wind_square, frame_num*wind_square) + self.attn_mask.unsqueeze(1).unsqueeze(0)
213
+ attn = torch.mul(attn.view(bswin // nW, nW, self.n_head, frame_num*wind_square, frame_num*wind_square), attn_mask)\
214
+ - 10000.0 * (1.0 - attn_mask)
215
+ attn = attn.view(bswin, self.n_head, frame_num*wind_square, frame_num*wind_square)
216
+ else:
217
+ attn = torch.mul(attn.view(bswin // nW, nW, self.n_head, frame_num*wind_square, frame_num*wind_square), causal_mask)\
218
+ - 10000.0 * (1.0 - causal_mask)
219
+ attn = attn.view(bswin, self.n_head, frame_num*wind_square, frame_num*wind_square)
220
+ if swin_pb_relax:
221
+ swin_pb_relax_const = torch.max(attn.reshape(bswin, self.n_head, -1), dim=-1, keepdim=True)[0].detach().unsqueeze(-1)
222
+ attn = (attn - swin_pb_relax_const)*alpha
223
+
224
+ if text_hidden_state is None:
225
+ attn = F.softmax(attn, dim=-1)
226
+ if attn_dropout is not None:
227
+ with get_cuda_rng_tracker().fork():
228
+ attn = attn_dropout(attn)
229
+ context_swin = torch.matmul(attn, v).permute(0, 2, 1, 3).reshape(bswin, frame_num, self.window_size, self.window_size, h0)
230
+ else:
231
+ assert text_attn_mask is not None
232
+ text_attn_mask = text_attn_mask.unsqueeze(2).unsqueeze(2)
233
+ # pb-relax
234
+ if swin_pb_relax:
235
+ attn_frame2text = torch.matmul(q.reshape(b0, -1, self.n_head, frame_num*wind_square, h) / (math.sqrt(h)*alpha), k_text.unsqueeze(1).transpose(-1, -2))
236
+ attn_frame2text = (attn_frame2text-swin_pb_relax_const.reshape(b0, -1, self.n_head, 1, 1))*alpha
237
+ else:
238
+ attn_frame2text = torch.matmul(q.reshape(b0, -1, self.n_head, frame_num*wind_square, h) / math.sqrt(h), k_text.unsqueeze(1).transpose(-1, -2))
239
+
240
+ attn_frame2text = torch.mul(text_attn_mask, attn_frame2text) - 10000.0 * (1.0 - text_attn_mask)
241
+ attn_frame2text = attn_frame2text.reshape(bswin, self.n_head, frame_num*wind_square, s0)
242
+ attn = torch.cat((attn, attn_frame2text), dim=-1)
243
+ attn = F.softmax(attn, dim=-1)
244
+
245
+ if attn_dropout is not None:
246
+ with get_cuda_rng_tracker().fork():
247
+ attn = attn_dropout(attn)
248
+
249
+ context_swin = (torch.matmul(attn[..., :-s0], v) +
250
+ torch.matmul(attn[..., -s0:].reshape(b0, -1, self.n_head,frame_num*wind_square, s0), v_text.unsqueeze(1))\
251
+ .reshape(bswin, self.n_head, frame_num*wind_square, h))\
252
+ .permute(0, 2, 1, 3).reshape(bswin, frame_num, self.window_size, self.window_size, h0)
253
+
254
+ context_swin = window_reverse(context_swin, self.window_size, self.frame_resolution, self.frame_resolution)
255
+ # reverse cycle shift
256
+ if self.shift_sizes[layer_id%2] > 0:
257
+ context_swin = torch.roll(context_swin, shifts=(self.shift_sizes[layer_id%2], self.shift_sizes[layer_id%2]), dims=(2,3))
258
+ context_swin = context_swin.reshape(b0, s1, h0)
259
+
260
+ return context_swin
261
+
262
+
263
+ class FullAttentionMixin(BaseMixin):
264
+ def __init__(self, num_layers,
265
+ hidden_size,
266
+ frame_resolution,
267
+ n_head,
268
+ frame_num,
269
+ init_method=unscaled_init_method(0.02),
270
+ output_layer_init_method=unscaled_init_method(0.02),
271
+ ):
272
+ super(FullAttentionMixin, self).__init__()
273
+ self.num_layers = num_layers # replace attention in the LAST n layers
274
+ self.query_key_value = torch.nn.ModuleList(
275
+ [ColumnParallelLinear(hidden_size, 3*hidden_size,stride=3,
276
+ gather_output=False,init_method=init_method)
277
+ for layer_id in range(num_layers)
278
+ ])
279
+ self.dense = torch.nn.ModuleList(
280
+ [RowParallelLinear(
281
+ hidden_size,
282
+ hidden_size,
283
+ input_is_parallel=True,
284
+ init_method=output_layer_init_method,
285
+ bias=True,
286
+ module=self,
287
+ name="dense",)
288
+ for layer_id in range(num_layers)
289
+ ])
290
+
291
+ self.n_head = n_head
292
+ self.frame_resolution = frame_resolution
293
+ self.frame_len = frame_resolution * frame_resolution
294
+ self.causal_mask = torch.ones(1, 1, self.frame_len*frame_num, self.frame_len*frame_num).tril()
295
+
296
+ self.mask_initialized = False
297
+
298
+ self.attn_distribution = torch.nn.ParameterList([
299
+ torch.nn.Parameter(torch.zeros(hidden_size))
300
+ for _ in range(num_layers)
301
+ ])
302
+
303
+ def reinit(self, *pre_mixins):
304
+ start_layer = len(self.transformer.layers) - self.num_layers
305
+ assert start_layer >= 0
306
+ for layer_id in range(self.num_layers):
307
+ base_attention = self.transformer.layers[start_layer + layer_id].attention
308
+ self.query_key_value[layer_id].weight.data.copy_(base_attention.query_key_value.weight.data)
309
+ self.query_key_value[layer_id].bias.data.copy_(base_attention.query_key_value.bias.data)
310
+
311
+ def attention_extra(self, frame_hidden_state, layer_id, attn_dropout, text_hidden_state=None,
312
+ text_attn_mask=None, mode_sequential=False):
313
+ # pb relax
314
+ # frame_hidden_state [batchsize, frame_num*frame_size, n_head*hiddensize_perhead]
315
+ assert mode_sequential == True # full attention is only used in the sequential (stage-1) mode
316
+ swin_pb_relax = True
317
+ alpha = 16
318
+
319
+ if not self.mask_initialized:
320
+ self.causal_mask = self.causal_mask.to(device=frame_hidden_state.device, dtype=frame_hidden_state.dtype)
321
+ self.mask_initialized = True
322
+ b0, s1, h0 = frame_hidden_state.shape
323
+ h = h0 // self.n_head
324
+ frame_len = self.frame_resolution * self.frame_resolution
325
+ frame_num = s1 // frame_len
326
+ assert frame_num*frame_len == s1
327
+
328
+ qkv = self.query_key_value[layer_id](frame_hidden_state).reshape(b0, s1, 3, self.n_head, h)\
329
+ .permute(2, 0, 3, 1, 4) #[3, b0, n_head, s1, h]
330
+ q, k, v = qkv[0], qkv[1], qkv[2]
331
+
332
+ # frames-to-frames
333
+ if swin_pb_relax:
334
+ attn = torch.matmul(q / (math.sqrt(h)*alpha), k.transpose(-1, -2))
335
+ else:
336
+ attn = torch.matmul(q / math.sqrt(h), k.transpose(-1, -2))
337
+ attn = torch.mul(attn, self.causal_mask) - 10000.0 * (1.0 - self.causal_mask)
338
+ if swin_pb_relax:
339
+ swin_pb_relax_const = torch.max(attn.reshape(b0, self.n_head, -1), dim=-1, keepdim=True)[0].detach().unsqueeze(-1)
340
+ attn = (attn - swin_pb_relax_const)*alpha
341
+
342
+ if text_hidden_state is None:
343
+ attn = F.softmax(attn, dim=-1)
344
+ if attn_dropout is not None:
345
+ with get_cuda_rng_tracker().fork():
346
+ attn = attn_dropout(attn)
347
+ context_swin = torch.matmul(attn, v).permute(0, 2, 1, 3).reshape(b0, s1, h0)
348
+ else:
349
+ # frame-to-text
350
+ assert text_attn_mask is not None
351
+ s0 = text_hidden_state.shape[1]
352
+ qkv_text = self.query_key_value[layer_id](text_hidden_state).reshape(b0, s0, 3, self.n_head, h).permute(2, 0, 3, 1, 4) #[3, b0, n_head, s0, h]
353
+ q_text, k_text, v_text = qkv_text[0], qkv_text[1], qkv_text[2]
354
+ text_attn_mask = text_attn_mask.unsqueeze(2)
355
+ if swin_pb_relax:
356
+ attn_frame2text = torch.matmul(q.reshape(b0, self.n_head, s1, h) / (math.sqrt(h)*alpha), k_text.transpose(-1, -2))
357
+ attn_frame2text = (attn_frame2text-swin_pb_relax_const.reshape(b0, self.n_head, 1, 1))*alpha
358
+ else:
359
+ attn_frame2text = torch.matmul(q.reshape(b0, self.n_head, s1, h) / math.sqrt(h), k_text.transpose(-1, -2))
360
+ attn_frame2text = torch.mul(text_attn_mask, attn_frame2text) - 10000.0 * (1.0 - text_attn_mask)
361
+ attn_frame2text = attn_frame2text.reshape(b0, self.n_head, s1, s0)
362
+
363
+ attn = torch.cat((attn, attn_frame2text), dim=-1)
364
+ attn = F.softmax(attn, dim=-1)
365
+
366
+ if attn_dropout is not None:
367
+ with get_cuda_rng_tracker().fork():
368
+ attn = attn_dropout(attn)
369
+
370
+ context_frame = (torch.matmul(attn[..., :-s0], v) +
371
+ torch.matmul(attn[..., -s0:].reshape(b0, self.n_head,s1, s0), v_text))\
372
+ .permute(0, 2, 1, 3).reshape(b0, s1, h0)
373
+
374
+ return context_frame
375
+
376
+
377
+ def attention_localframe_and_text(q0, k0, v0, attention_mask_totxt, attention_mask_local,
378
+ n_head, text_len, frame_len, frame_num, attention_dropout=None, layer_id=0, **kwargs):
379
+ b, s0, h0 = q0.shape
380
+ s1 = s0 - text_len
381
+ h = h0 // n_head
382
+ assert q0.shape[1] == v0.shape[1] == k0.shape[1] == text_len+frame_len*frame_num
383
+ # attention_mask_totxt [b, 1, 1, text_len]
384
+ # attention_mask_local [1, 1, frame_num, frame_len, frame_len]
385
+ # attention_mask: [1, 1, text_len+frame_len, text_len+frame_len]
386
+
387
+ q0 = q0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
388
+ v0 = v0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
389
+ k0 = k0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
390
+ k0T = k0.transpose(-1, -2)
391
+
392
+ # score: any2text
393
+ score_any2text = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T[..., :text_len])
394
+ score_any2text_part1 = torch.mul(score_any2text[..., :text_len, :], attention_mask_totxt) \
395
+ - 10000.0 * (1.0 - attention_mask_totxt)
396
+ score_any2text_part2 = torch.mul(score_any2text[..., text_len:, :], attention_mask_totxt) - \
397
+ 10000.0 * (1.0 - attention_mask_totxt)
398
+
399
+ # score: frame local
400
+ q0_frame = q0[:, :, text_len:].reshape(b, n_head, frame_num, frame_len, h)
401
+ v0_frame = v0[:, :, text_len:].reshape(b, n_head, frame_num, frame_len, h)
402
+ k0T_frame = k0[:, :, text_len:].reshape(b, n_head, frame_num, frame_len, h).transpose(-1, -2)
403
+ score_frame_local0 = torch.matmul(q0_frame / math.sqrt(q0_frame.shape[-1]), k0T_frame)
404
+ score_frame_local0 = torch.mul(score_frame_local0, attention_mask_local) \
405
+ - 10000.0 * (1.0 - attention_mask_local)
406
+
407
+ # context for frame
408
+ score_frame_all = torch.cat((score_any2text_part2,
409
+ score_frame_local0.view(b, n_head, s1, frame_len)), dim=-1)
410
+ attention_probs_frame = F.softmax(score_frame_all, dim=-1)
411
+
412
+ if attention_dropout is not None:
413
+ with get_cuda_rng_tracker().fork():
414
+ attention_probs_frame = attention_dropout(attention_probs_frame)
415
+
416
+ context_frame2text = torch.matmul(attention_probs_frame[..., :text_len], v0[..., :text_len, :]) # [b, n_head, s1, h]
417
+ context_frame_local0 = torch.matmul(attention_probs_frame[..., text_len:text_len+frame_len].\
418
+ view(b, n_head, frame_num, frame_len, frame_len), v0_frame).view(b, n_head, s1, h)
419
+ context_frame = (context_frame2text + context_frame_local0).transpose(1, 2).reshape(b, s1, h0)
420
+
421
+ # context for text
422
+ attention_probs_text = F.softmax(score_any2text_part1, dim=-1)
423
+ if attention_dropout is not None:
424
+ with get_cuda_rng_tracker().fork():
425
+ attention_probs_text = attention_dropout(attention_probs_text)
426
+ context_text2text = torch.matmul(attention_probs_text, v0[..., :text_len, :])
427
+ context_text2text = context_text2text.transpose(1, 2).reshape(b, text_len, h0)
428
+
429
+ return context_text2text, context_frame
430
+
431
+
432
+ class CogVideoModel(BaseModel):
433
+ def __init__(self, args, transformer=None, parallel_output=True):
434
+ super().__init__(args, transformer=transformer, parallel_output=parallel_output)
435
+ self.stage = args.cogvideo_stage # 1 or 2
436
+ self.mode_sequential = True if self.stage==1 else False
437
+ self.layout = args.layout # [64, 64+400, 64+5*400]
438
+ self.n_head = args.num_attention_heads
439
+ frame_resolution = int(math.sqrt(self.layout[1]-self.layout[0]))
440
+ frame_num = (args.layout[2]-args.layout[0])//(args.layout[1]-args.layout[0])
441
+ frame_len = self.layout[1]-self.layout[0]
442
+
443
+ self.add_mixin('extra_position_embedding', PositionEmbeddingMixin(
444
+ args.additional_seqlen, args.hidden_size
445
+ ))
446
+
447
+ if args.window_size == -1:
448
+ # full attention
449
+ assert self.stage == 1
450
+ self.add_mixin('attention_plus', FullAttentionMixin(
451
+ num_layers=args.num_layers,
452
+ hidden_size=args.hidden_size,
453
+ frame_resolution=frame_resolution,
454
+ n_head=args.num_attention_heads,
455
+ frame_num=frame_num,
456
+ ))
457
+ else:
458
+ self.add_mixin('attention_plus', WindowAttentionMixin(
459
+ num_layers=args.num_layers,
460
+ hidden_size=args.hidden_size,
461
+ frame_resolution=frame_resolution,
462
+ window_size=args.window_size,
463
+ shift_size=args.window_size//2,
464
+ n_head=args.num_attention_heads,
465
+ frame_num=frame_num,
466
+ ))
467
+ # attention_mask_local
468
+ self.attention_mask_local_sequential = torch.ones(1, 1, frame_num, frame_len, frame_len).tril().unsqueeze(0)
469
+ self.attention_mask_local_interp = torch.ones(1, 1, frame_num, frame_len, frame_len)
470
+
471
+ for idx in range(1, frame_num, 2):
472
+ self.attention_mask_local_interp[:, :, idx:idx+1].tril_()
473
+ self.attention_mask_local_interp = self.attention_mask_local_interp.unsqueeze(0)
474
+ self.mask_initialized = False
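+ # The sequential (stage-1) mask above is causal within every frame, while the
+ # interpolation (stage-2) mask appears to keep even-indexed frames fully visible
+ # and only makes the odd, to-be-interpolated frames causal.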
475
+
476
+ @classmethod
477
+ def add_model_specific_args(cls, parser):
478
+ group = parser.add_argument_group('CogVideoModel', 'CogVideo model configurations')
479
+ group.add_argument("--layout", type=str, default='64, 464, 2064', help='text_len, textlen+frame_len, textlen+frame_len*frame_num')
480
+ group.add_argument("--window-size", type=int, default=10, help="swin attention's window size in temperal channel, -1 represents full attention")
481
+ group.add_argument("--additional-seqlen", type=int, default=2000)
482
+ group.add_argument("--cogvideo-stage", type=int, default=1, choices=[1,2])
483
+ return parser
484
+
485
+ def disable_untrainable_params(self):
486
+ self.transformer.requires_grad_(False)
487
+
488
+ def position_embedding_forward(self, position_ids, **kw_args):
489
+ position = position_ids[..., :(64+400)]
490
+ position_plus = position_ids[..., (64+400):]
491
+ position_embeddings = torch.cat(
492
+ (
493
+ self.transformer.position_embeddings(position),
494
+ self.get_mixin('extra_position_embedding').position_embeddings(position_plus-(512+400))
495
+ ),
496
+ dim=-2
497
+ )
498
+ return position_embeddings
499
+
500
+ def attention_forward(self, hidden_states, mask, layer_id, **kw_args):
501
+ # mask.shape=[bs, 1, 1, 64]
502
+ if not self.mask_initialized:
503
+ self.attention_mask_local_sequential = self.attention_mask_local_sequential.to(device=hidden_states.device, dtype=hidden_states.dtype)
504
+ self.attention_mask_local_interp = self.attention_mask_local_interp.to(device=hidden_states.device, dtype=hidden_states.dtype)
505
+ self.mask_initialized = True
506
+
507
+ attn_module = self.transformer.layers[layer_id].attention
508
+ hidden_size = hidden_states.shape[-1]
509
+ bs = hidden_states.shape[0]
510
+
511
+ # base model qkv
512
+ mixed_raw_layer = attn_module.query_key_value(hidden_states)
513
+ q0, k0, v0 = split_tensor_along_last_dim(mixed_raw_layer, 3)
514
+ dropout_fn = self.transformer.layers[layer_id].attention.attention_dropout if self.training else None
515
+
516
+ attention_mask_local = self.attention_mask_local_sequential if self.mode_sequential else self.attention_mask_local_interp
517
+ context_text, context_frame_local_text = attention_localframe_and_text(
518
+ q0, k0, v0,
519
+ attention_mask_totxt=mask,
520
+ attention_mask_local=attention_mask_local,
521
+ n_head=attn_module.num_attention_heads_per_partition,
522
+ text_len=self.layout[0],
523
+ frame_len=self.layout[1]-self.layout[0],
524
+ frame_num=(self.layout[2]-self.layout[0])//(self.layout[1]-self.layout[0]),
525
+ attention_dropout=dropout_fn,
526
+ layer_id=layer_id,
527
+ )
528
+
529
+ context_frame_swin = self.get_mixin('attention_plus').attention_extra(
530
+ hidden_states[:, self.layout[0]:], layer_id, dropout_fn,
531
+ text_hidden_state=hidden_states[:, :self.layout[0]],
532
+ text_attn_mask=mask[..., 0, :],
533
+ mode_sequential=self.mode_sequential)
534
+
535
+ attn_distrib = torch.sigmoid(self.get_mixin('attention_plus').attn_distribution[layer_id])
536
+ attn_distrib = attn_distrib.unsqueeze(0).unsqueeze(0)
537
+
538
+ output_text = attn_module.dense(context_text)
539
+ output_frame = torch.mul(attn_module.dense(context_frame_local_text), attn_distrib)\
540
+ +torch.mul(self.get_mixin('attention_plus').dense[layer_id](context_frame_swin), 1-attn_distrib)
541
+ output = torch.cat((output_text, output_frame), dim=-2)
542
+
543
+ return output
src/videogen_hub/pipelines/cogvideo/cogvideo_src/pretrain_cogvideo.py ADDED
@@ -0,0 +1,184 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : pretrain_cogvideo.py
4
+ @Time : 2021/10/06 00:58:32
5
+ @Author : Wenyi Hong
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+ import torch
15
+ import argparse
16
+ import numpy as np
17
+ from videogen_hub.depend.icetk import icetk as tokenizer
18
+ tokenizer.add_special_tokens(['<start_of_image>', '<start_of_english>', '<start_of_chinese>'])
19
+
20
+ from models.cogvideo_model import CogVideoModel
21
+ from SwissArmyTransformer import mpu, get_args
22
+ from SwissArmyTransformer.training.deepspeed_training import training_main
23
+ from SwissArmyTransformer.data_utils import BinaryDataset
24
+
25
+ def get_masks_and_position_ids_video(data, attention_mask_totxt=None, args=None):
26
+ # Extract batch size and sequence length.
27
+ batch_size, seq_length = data.size()
28
+ assert attention_mask_totxt is not None
29
+ layout = args.layout
30
+ assert seq_length == layout[-1]
31
+ n_pads = layout[0] - attention_mask_totxt.sum(dim=-1).long()
32
+ frame_len = layout[1]-layout[0]
33
+ position_ids = torch.zeros(batch_size, layout[2], dtype=torch.long,
34
+ device=data.device)
35
+ for i in range(batch_size):
36
+ torch.arange(layout[0] - n_pads[i], out=position_ids[i, n_pads[i]:layout[0]],
37
+ dtype=torch.long, device=data.device)
38
+ torch.arange(512, 512+layout[2]-layout[0],
39
+ out=position_ids[i, layout[0]:], dtype=torch.long, device=data.device)
40
+ return position_ids
41
+
42
+
43
+ def get_batch(data_iterator, args, timers):
44
+ # Items and their type.
45
+ keys = ['text', 'loss_mask', 'attention_mask_totxt']
46
+ datatype = torch.int64
47
+
48
+ # Broadcast data.
49
+ timers('data loader').start()
50
+ if data_iterator is not None:
51
+ data = next(data_iterator)
52
+ else:
53
+ data = None
54
+ timers('data loader').stop()
55
+
56
+ data_b = mpu.broadcast_data(keys, data, datatype)
57
+ # Unpack.
58
+ tokens_ = data_b['text'].long()
59
+ loss_mask = data_b['loss_mask'].float()
60
+ attention_mask_totxt = data_b['attention_mask_totxt'].float()
61
+
62
+ labels = tokens_[:, 1:].clone().contiguous()
63
+ loss_mask = loss_mask[:, 1:].contiguous()
64
+ tokens = tokens_[:, :-1].clone().contiguous()
65
+
66
+ for idx in range(args.layout[0], args.layout[2], 400):
67
+ tokens[:, idx] = tokenizer['<start_of_image>']
68
+ # Get the masks and position ids.
69
+ position_ids = get_masks_and_position_ids_video(
70
+ tokens,
71
+ attention_mask_totxt=attention_mask_totxt,
72
+ args=args
73
+ )
74
+ attention_mask_totxt = attention_mask_totxt.unsqueeze(1).unsqueeze(1)
75
+ # Convert
76
+ if args.fp16:
77
+ attention_mask_totxt = attention_mask_totxt.half()
78
+ return tokens, labels, loss_mask, attention_mask_totxt, position_ids
79
+
80
+
81
+ def forward_step(data_iterator, model, args, timers):
82
+ """Forward step."""
83
+
84
+ # Get the batch.
85
+ timers('batch generator').start()
86
+ tokens, labels, loss_mask, attention_mask_totxt, position_ids = get_batch(
87
+ data_iterator, args, timers)
88
+ timers('batch generator').stop()
89
+
90
+ # Forward model.
91
+ logits, *mems = model(tokens, position_ids, attention_mask_totxt)
92
+ # ======= hyper params =======#
93
+ perframe_len = 400
94
+ text_len=64
95
+ frame_num = 5
96
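+ # Only the image-token slice of the vocabulary and only the frame positions (after
+ # the 64 text tokens) enter the cross-entropy; loss_mask then selects which frame
+ # tokens are actually predicted in the current stage.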
+ logits_img_tokens = logits[:, text_len:, :tokenizer.num_image_tokens].float().contiguous()
97
+ losses = mpu.vocab_parallel_cross_entropy(logits_img_tokens, labels[:, text_len:])
98
+ # scaling loss mask
99
+ loss_mask = loss_mask[:, text_len:].reshape(-1)
100
+
101
+ losses_1d = losses.reshape(-1) * loss_mask
102
+ loss = torch.sum(losses_1d) / loss_mask.sum()
103
+ # ===================== Log partial losses ======================== #
104
+ log_loss_dict = {}
105
+ bs = losses.shape[0]
106
+
107
+ if args.cogvideo_stage == 1:
108
+ for i in range(frame_num):
109
+ log_loss_dict[f'AR_f{i}_loss'] = losses[:, i*perframe_len:(i+1)*perframe_len].contiguous().reshape(-1).detach().sum() / max((perframe_len*bs), 1)
110
+ else:
111
+ for i in range(1, frame_num-1):
112
+ log_loss_dict[f'ITP_f{i}_loss'] = losses[:, i*perframe_len:(i+1)*perframe_len].contiguous().reshape(-1).detach().sum() / max((perframe_len*bs), 1)
113
+
114
+ # ===================== END OF BLOCK ======================= #
115
+ return loss, log_loss_dict
116
+
117
+
118
+ def create_dataset_function(path, args):
119
+ dataset_layout = [64, 464, 2064]
120
+ input_layout = [64, 464, 2064]
121
+ # frame_num = 6
122
+ # frame_interval = 2 # DEBUG!!!
123
+ def process_fn(row):
124
+ row = row.astype(np.int64)
125
+ text = row[:dataset_layout[0]]
126
+ frames = row[dataset_layout[0]:]
127
+
128
+ if text[0] == tokenizer['<pad>']:
129
+ text = text[1:] # due to our way of data processing
130
+ if args.cogvideo_stage == 1:
131
+ text, loss_mask, frames = make_text_video_generation(text, frames)
132
+ else:
133
+ text, loss_mask, frames = mask_video_frame_interpolation(text, frames)
134
+
135
+ n_pad = input_layout[0] - len(text)
136
+ parts = [
137
+ np.array([tokenizer['<pad>']] * n_pad, dtype=np.int64),
138
+ text,
139
+ np.array([tokenizer['<start_of_image>']], dtype=np.int64),
140
+ frames,
141
+ ]
142
+ ret = np.concatenate(parts, axis=0)
143
+
144
+ attention_mask_totxt = np.array([0] * n_pad + [1] * (input_layout[0]-n_pad))
145
+ return {'text': ret,
146
+ 'loss_mask': loss_mask,
147
+ 'attention_mask_totxt': attention_mask_totxt,
148
+ }
149
+ return BinaryDataset(path, process_fn, length_per_sample=dataset_layout[-1])
150
+
151
+ def make_text_video_generation(text, frames):
152
+ input_layout = [64, 464, 2064]
153
+ text = text[text!= tokenizer['<pad>']][:input_layout[0]] # dataset format: 1.0秒<n>{text}<pad><pad> ...
154
+ loss_mask = np.array([0] * (input_layout[1]+1) + [1] * (input_layout[2] - input_layout[1])) # aligned with the input; the loss_mask is shifted left by one position later
155
+ return text, loss_mask, frames
156
+
157
+ def mask_video_frame_interpolation(text, frames):
158
+ input_layout = [64, 464, 2064]
159
+ frame_len = input_layout[1]-input_layout[0]
160
+ # text format: <pad> 1.0秒 <n> {text} <pad> <pad>
161
+ text = text[text!= tokenizer['<pad>']][:input_layout[0]]
162
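+ # interpolation-stage loss mask: the 2nd and 4th frames are predicted (1s), while the 1st, 3rd and 5th frames are given as conditioning (0s)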
+ loss_mask = np.array([0] * (input_layout[1]+1)
163
+ + [1] * (input_layout[1]-input_layout[0])
164
+ + [0] * (input_layout[1]-input_layout[0])
165
+ + [1] * (input_layout[1]-input_layout[0])
166
+ + [0] * (input_layout[1]-input_layout[0]) ) # aligned with the input; the loss_mask is shifted left by one position later
167
+
168
+ return text, loss_mask, frames
169
+
170
+
171
+
172
+ if __name__ == '__main__':
173
+ py_parser = argparse.ArgumentParser(add_help=False)
174
+ py_parser.add_argument('--txt-loss-scale', type=float, default=1)
175
+ CogVideoModel.add_model_specific_args(py_parser)
176
+
177
+ known, args_list = py_parser.parse_known_args()
178
+
179
+ args = get_args(args_list)
180
+ args = argparse.Namespace(**vars(args), **vars(known))
181
+
182
+ args.layout = [int(x) for x in args.layout.split(',')]
183
+
184
+ training_main(args, model_cls=CogVideoModel, forward_step_function=forward_step, create_dataset_function=create_dataset_function)
src/videogen_hub/pipelines/cogvideo/cogvideo_src/requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ SwissArmyTransformer==0.2.9
2
+ icetk
3
+ gifmaker
4
+ torchvision
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : __init__.py
4
+ @Time : 2022/03/02 13:57:09
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+
15
+ from .direct_sr import DirectSuperResolution
16
+ from .iterative_sr import IterativeSuperResolution
17
+ from .sr_group import SRGroup
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/cluster_label2.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b87880fdbe89670f12844377b9cf97a9733b1f54e3a9b73cbb9835084c4e02ec
3
+ size 160128
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/direct_sr.py ADDED
@@ -0,0 +1,117 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : direct_sr.py
4
+ @Time : 2022/03/02 13:58:11
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+ import torch
15
+
16
+ from PIL import ImageEnhance, Image
+
32
+ import argparse
33
+ from torchvision import transforms
34
+
35
+ from SwissArmyTransformer import get_args
36
+ from SwissArmyTransformer.training.model_io import load_checkpoint
37
+ from .dsr_sampling import filling_sequence_dsr, IterativeEntfilterStrategy
38
+ from SwissArmyTransformer.generation.utils import timed_name, save_multiple_images, generate_continually
39
+
40
+ from .dsr_model import DsrModel
41
+
42
+ from videogen_hub.depend.icetk import icetk as tokenizer
43
+
44
+ class DirectSuperResolution:
45
+ def __init__(self, args, path, max_bz=4, topk=6, onCUDA=False):
46
+ args.load = path
47
+ args.kernel_size = 5
48
+ args.kernel_size2 = 5
49
+ args.new_sequence_length = 4624
50
+ args.layout = [96,496,4096]
51
+
52
+ model = DsrModel(args)
53
+ if args.fp16:
54
+ model = model.half()
55
+
56
+ load_checkpoint(model, args) # on cpu
57
+ model.eval()
58
+ self.model = model
59
+ self.onCUDA = onCUDA
60
+ if onCUDA:
61
+ self.model = self.model.cuda()
62
+
63
+ invalid_slices = [slice(tokenizer.num_image_tokens, None)]
64
+
65
+ self.strategy = IterativeEntfilterStrategy(invalid_slices,
66
+ temperature=1.0, topk=topk) # temperature not used; it is frozen here
67
+ self.max_bz = max_bz
68
+
69
+ def __call__(self, text_tokens, image_tokens, enhance=False):
70
+ if len(text_tokens.shape) == 1:
71
+ text_tokens.unsqueeze_(0)
72
+ if len(image_tokens.shape) == 1:
73
+ image_tokens.unsqueeze_(0)
74
+ # ===================== Debug ======================== #
75
+ # new_image_tokens = []
76
+ # for small_img in image_tokens:
77
+ # decoded = tokenizer.decode(image_ids=small_img)
78
+ # decoded = torch.nn.functional.interpolate(decoded, size=(480, 480)).squeeze(0)
79
+ # ndarr = decoded.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to('cpu', torch.uint8).numpy()
80
+ # image_pil_raw = ImageEnhance.Sharpness(Image.fromarray(ndarr))
81
+ # small_img2 = tokenizer.encode(image_pil=image_pil_raw.enhance(1.5), image_size=480).view(-1)
82
+ # new_image_tokens.append(small_img2)
83
+ # image_tokens = torch.stack(new_image_tokens)
84
+ # return image_tokens
85
+ # ===================== END OF BLOCK ======================= #
86
+ if enhance:
87
+ new_image_tokens = []
88
+ for small_img in image_tokens:
89
+ decoded = tokenizer.decode(image_ids=small_img).squeeze(0)
90
+ ndarr = decoded.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to('cpu', torch.uint8).numpy()
91
+ image_pil_raw = ImageEnhance.Sharpness(Image.fromarray(ndarr))
92
+ small_img2 = tokenizer.encode(image_pil=image_pil_raw.enhance(1.), image_size=160).view(-1)
93
+ new_image_tokens.append(small_img2)
94
+ image_tokens = torch.stack(new_image_tokens)
95
+
96
+ seq = torch.cat((text_tokens,image_tokens), dim=1)
97
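+ # seq1 is the placeholder canvas for the high-resolution output: 60*60 = 3600 tokens plus one extra, all initialized to <start_of_image>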
+ seq1 = torch.tensor([tokenizer['<start_of_image>']]*3601, device=image_tokens.device).unsqueeze(0).expand(text_tokens.shape[0], -1)
98
+ if not self.onCUDA:
99
+ print('Converting Dsr model...')
100
+ model = self.model.cuda()
101
+ else:
102
+ model = self.model
103
+ print('Direct super-resolution...')
104
+ output_list = []
105
+ for tim in range(max((text_tokens.shape[0]+self.max_bz-1) // self.max_bz, 1)):
106
+ output1 = filling_sequence_dsr(model,
107
+ seq[tim*self.max_bz:(tim+1)*self.max_bz],
108
+ seq1[tim*self.max_bz:(tim+1)*self.max_bz],
109
+ warmup_steps=1, block_hw=(1, 0),
110
+ strategy=self.strategy
111
+ )
112
+ output_list.extend(output1[1:])
113
+ if not self.onCUDA:
114
+ print('Moving back Dsr to cpu...')
115
+ model = model.cpu()
116
+ torch.cuda.empty_cache()
117
+ return torch.cat(output_list, dim=0)
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/dsr_model.py ADDED
@@ -0,0 +1,225 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : dsr_model.py
4
+ @Time : 2021/10/02 01:36:32
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+ import torch
15
+ import torch.nn.functional as F
16
+
17
+
18
+ from SwissArmyTransformer.model.base_model import BaseModel, BaseMixin
19
+
20
+ from SwissArmyTransformer.model.transformer import split_tensor_along_last_dim, unscaled_init_method
21
+ from SwissArmyTransformer.mpu.utils import sqrt
22
+ from deepspeed.runtime.activation_checkpointing.checkpointing import get_cuda_rng_tracker
23
+ from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear
24
+
25
+ class PositionEmbeddingMixin(BaseMixin):
26
+ def __init__(self, additional_sequence_length, hidden_size,
27
+ init_method_std=0.02, reinit_slice=slice(512, 512+400)
28
+ ):
29
+ super(PositionEmbeddingMixin, self).__init__()
30
+ self.reinit_slice = reinit_slice
31
+ self.position_embeddings = torch.nn.Embedding(additional_sequence_length, hidden_size)
32
+ torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)
33
+
34
+ def reinit(self, parent_model=None):
35
+ old_weights = self.transformer.position_embeddings.weight.data[self.reinit_slice]
36
+ old_len, hidden_size = old_weights.shape
37
+ assert hidden_size == self.position_embeddings.weight.shape[-1]
38
+ old_edge, new_edge = sqrt(old_len), sqrt(self.position_embeddings.weight.shape[-2])
39
+ assert new_edge % old_edge == 0
40
+ self.position_embeddings.weight.data.view(new_edge // old_edge, old_edge, new_edge // old_edge, old_edge, hidden_size).copy_(old_weights.view(1, old_edge, 1, old_edge, hidden_size))
41
+ # self.position_embeddings.weight.data.view(-1, old_len, hidden_size).copy_(old_weights)
42
+
43
+
44
+ class AttentionMixin(BaseMixin):
45
+ def __init__(self, num_layers,
46
+ hidden_size,
47
+ init_method=unscaled_init_method(0.02),
48
+ output_layer_init_method=unscaled_init_method(0.02)
49
+ ):
50
+ super(AttentionMixin, self).__init__()
51
+ self.num_layers = num_layers # replace attention in the LAST n layers
52
+ self.query_key_value = torch.nn.ModuleList(
53
+ [ColumnParallelLinear(hidden_size, 3 * hidden_size, stride=3,
54
+ gather_output=False, init_method=init_method)
55
+ for layer_id in range(num_layers)
56
+ ])
57
+ self.dense = torch.nn.ModuleList(
58
+ [RowParallelLinear(hidden_size,
59
+ hidden_size,
60
+ input_is_parallel=True,
61
+ init_method=output_layer_init_method)
62
+ for layer_id in range(num_layers)
63
+ ])
64
+
65
+ def reinit(self, parent_model=None):
66
+ start_layer = len(self.transformer.layers) - self.num_layers
67
+ assert start_layer >= 0
68
+ for layer_id in range(self.num_layers):
69
+ old_attention = self.transformer.layers[start_layer + layer_id].attention
70
+ self.query_key_value[layer_id].weight.data.copy_(old_attention.query_key_value.weight.data)
71
+ self.query_key_value[layer_id].bias.data.copy_(old_attention.query_key_value.bias.data)
72
+ self.dense[layer_id].weight.data.copy_(old_attention.dense.weight.data)
73
+ self.dense[layer_id].bias.data.copy_(old_attention.dense.bias.data)
74
+
75
+ class DsrModel(BaseModel):
76
+ def __init__(self, args, transformer=None):
77
+ super().__init__(args, transformer=transformer)
78
+ self.original_sequence_length = args.max_sequence_length
79
+ additional_seqlen = args.new_sequence_length - args.max_sequence_length
80
+ self.add_mixin('extra_position_embedding', PositionEmbeddingMixin(
81
+ additional_seqlen, args.hidden_size
82
+ ))
83
+ self.add_mixin('attention_plus', AttentionMixin(
84
+ num_layers=args.num_layers,
85
+ hidden_size=args.hidden_size
86
+ ))
87
+ self.layout = args.layout
88
+ # [PAD]... [ROI1] text ... [BOI1] {layout[0]} 1024 {layout[1]} [EOI1] 4095 {layout[2]}
89
+ self.kernel_size = args.kernel_size
90
+ self.kernel_size2 = args.kernel_size2
91
+ self.log_attention_weights = None
92
+
93
+ def position_embedding_forward(self, position_ids, **kw_args):
94
+ position = position_ids[..., :self.layout[1]]
95
+ position_plus = position_ids[..., self.layout[1]:] - self.original_sequence_length
96
+ position_embeddings = torch.cat(
97
+ (
98
+ self.transformer.position_embeddings(position),
99
+ self.get_mixin('extra_position_embedding').position_embeddings(position_plus)
100
+ ),
101
+ dim=-2
102
+ )
103
+ return position_embeddings
104
+
105
+ def attention_forward(self, hidden_states, mask,
106
+ layer_id=None, log_attention_weights=None, **kw_args):
107
+ attn_module = self.transformer.layers[layer_id].attention
108
+ # attention_plus on all layers
109
+ query_key_value_plus = self.get_mixin('attention_plus').query_key_value[layer_id]
110
+ dense_plus = self.get_mixin('attention_plus').dense[layer_id]
111
+ # split two parts
112
+ hidden_states_plus = hidden_states[:, self.layout[1]:]
113
+ hidden_states = hidden_states[:, :self.layout[1]]
114
+ # base model qkv
115
+ mixed_raw_layer = attn_module.query_key_value(hidden_states)
116
+ q0, k0, v0 = split_tensor_along_last_dim(mixed_raw_layer, 3)
117
+ # cuda2d model qkv
118
+ mixed_raw_layer = query_key_value_plus(hidden_states_plus)
119
+ q1, k1, v1 = split_tensor_along_last_dim(mixed_raw_layer, 3)
120
+
121
+ dropout_fn = attn_module.attention_dropout if self.training else None
122
+
123
+ # cuda2d attention
124
+ context_layer0, context_layer1 = sparse_attention_2d_light(
125
+ q0, k0, v0,
126
+ q1, k1, v1,
127
+ mask,
128
+ n_head=attn_module.num_attention_heads_per_partition,
129
+ text_len=self.layout[0],
130
+ kernel_size=self.kernel_size,
131
+ kernel_size2=self.kernel_size2,
132
+ attention_dropout=dropout_fn,
133
+ log_attention_weights=log_attention_weights,
134
+ add_scalar=(kw_args['add_scalar'] if 'add_scalar' in kw_args else 0)
135
+ )
136
+
137
+ output_0 = attn_module.dense(context_layer0)
138
+ output_1 = dense_plus(context_layer1)
139
+ output = torch.cat((output_0, output_1), dim=1)
140
+
141
+ return output
142
+
143
+ def final_forward(self, logits, **kwargs):
144
+ logits_parallel = logits
145
+ logits_parallel = torch.nn.functional.linear(logits_parallel.float(), self.transformer.word_embeddings.weight[:20000].float())
146
+ # logits_parallel = torch.nn.functional.linear(logits_parallel, self.transformer.word_embeddings.weight[:20000])
147
+ return logits_parallel
148
+
149
+ def disable_untrainable_params(self):
150
+ self.transformer.requires_grad_(False)
151
+
152
+ @classmethod
153
+ def add_model_specific_args(cls, parser):
154
+ group = parser.add_argument_group('Cuda2dModel', 'cuda2d model configurations')
155
+ group.add_argument("--kernel-size", type=int, default=5)
156
+ group.add_argument("--kernel-size2", type=int, default=5)
157
+ group.add_argument("--layout", type=str, default='96,496,4096')
158
+ group.add_argument("--new-sequence-length", type=int, default=4096)
159
+ return parser
160
+
161
+ def sparse_attention_2d_light(q0, k0, v0, q1, k1, v1, attention_mask, n_head, text_len, kernel_size=9, kernel_size2=7, attention_dropout=None, log_attention_weights = None, add_scalar=0, **kwargs):
162
+ '''
163
+ q0, k0, v0: [batch_size, 1088, hidden_size]
164
+ q1, k1, v1: [batch_size, 4096, h2]
165
+ n_head: int
166
+ attention_mask: [batch_size, 1088, 1088]
167
+ '''
168
+ from SwissArmyTransformer.ops.local_attention_function import f_similar, f_weighting
169
+
170
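+ # level 0 (text + low-res image tokens) uses full attention; level 1 (high-res tokens) uses local window attention via f_similar/f_weighting plus cross attention onto the low-res image grid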
+ b, s0, h0 = q0.shape
171
+ b, s1, h1 = q1.shape
172
+ h, l0, l1 = h0 // n_head, sqrt(s0-text_len), sqrt(s1)
173
+
174
+ q0 = q0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
175
+ v0 = v0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
176
+ k0T = k0.reshape(b, s0, n_head, h).permute(0, 2, 3, 1)
177
+
178
+ # standard attention for level 0
179
+ attention_scores = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T)
180
+
181
+ if log_attention_weights is not None:
182
+ attention_scores += log_attention_weights
183
+ attention_scores = torch.mul(attention_scores, attention_mask) - \
184
+ 10000.0 * (1.0 - attention_mask)
185
+
186
+ attention_probs0 = F.softmax(attention_scores, dim=-1)
187
+
188
+ # local attention for level 1
189
+ q1 = (q1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1) / math.sqrt(h1//n_head)).contiguous().view(b*n_head, h1//n_head, l1, l1)
190
+ k1 = k1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b*n_head, h1//n_head, l1, l1)
191
+ v1 = v1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b*n_head, h1//n_head, l1, l1)
192
+ # scores_1_to_1 = f_similar(q1, k1, kernel_size*2-1, kernel_size, True)
193
+ scores_1_to_1 = f_similar(q1, k1, kernel_size*2-1, kernel_size, False)
194
+
195
+ # cross attention
196
+ k0T = k0T[..., -l0**2:].reshape(b*n_head, h, l0, l0).contiguous()
197
+ scores_1_to_0 = f_similar(q1, k0T, kernel_size2, kernel_size2, False) # [b*n_head, l1, l1, field]
198
+ scores_1 = torch.cat(
199
+ (
200
+ scores_1_to_0.view(b*n_head, -1, scores_1_to_0.shape[3]) + add_scalar,
201
+ scores_1_to_1.view(b*n_head, -1, scores_1_to_1.shape[3])
202
+ ),
203
+ dim=-1)
204
+ attention_probs1 = F.softmax(scores_1, dim=-1)
205
+
206
+ if attention_dropout is not None:
207
+ # with get_cuda_rng_tracker().fork():
208
+ attention_probs0 = attention_dropout(attention_probs0)
209
+ attention_probs1 = attention_dropout(attention_probs1)
210
+
211
+ # weighting for level 0
212
+ context0 = torch.matmul(attention_probs0, v0) # [b, n_head, s0, h]
213
+ # weighting for level 1
214
+ probs_1_to_1 = attention_probs1[:, :, -scores_1_to_1.shape[3]:].view_as(scores_1_to_1)
215
+ # context1_to_1 = f_weighting(v1, probs_1_to_1.contiguous(), kernel_size*2-1, kernel_size, True)
216
+ context1_to_1 = f_weighting(v1, probs_1_to_1.contiguous(), kernel_size*2-1, kernel_size, False)
217
+
218
+ context1 = context1_to_1.view(b, n_head * h, l1**2)
219
+ # weighting for cross attention
220
+ probs_1_to_0 = attention_probs1[:, :, :scores_1_to_0.shape[3]].view_as(scores_1_to_0)
221
+ v0_part = v0[:, :, -l0**2:].transpose(-1, -2).contiguous().view(b*n_head, h, l0, l0)
222
+ context1_to_0 = f_weighting(v0_part, probs_1_to_0.contiguous(), kernel_size2, kernel_size2, False)
223
+ context1_to_0 = context1_to_0.view(b, n_head * h, l1**2)
224
+ context1 = context1 + context1_to_0
225
+ return context0.transpose(1, 2).reshape(b, s0, h0), context1.transpose(-1, -2)
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/dsr_sampling.py ADDED
@@ -0,0 +1,204 @@
1
+ # -*- encoding: utf-8 -*-
2
+ """
3
+ @File : dsr_sampling.py
4
+ @Time : 2021/10/09 00:46:04
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ """
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+ import torch
+
18
+ import torch.nn.functional as F
19
+ import numpy as np
20
+
21
+
22
+ def top_k_logits_(logits, top_k=0, filter_value=-float("Inf")):
23
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
24
+ logits[indices_to_remove] = filter_value
25
+ return logits
26
+
27
+
28
+ class IterativeEntfilterStrategy:
29
+ def __init__(self, invalid_slices=[], temperature=1.0, topk=6):
30
+ self.invalid_slices = invalid_slices
31
+ self.temperature = temperature
32
+ self.topk = topk
33
+ device = "cpu"
34
+ if torch.cuda.is_available():
35
+ device = "cuda"
36
+ self.cluster_labels = torch.tensor(
37
+ np.load("cluster_label2.npy"), device=device, dtype=torch.long
38
+ )
39
+
40
+ def forward(
41
+ self,
42
+ logits_,
43
+ tokens,
44
+ temperature=None,
45
+ entfilter=None,
46
+ filter_topk=5,
47
+ temperature2=None,
48
+ ):
49
+ # In the iterative strategy, logits are of shape [batch_size, seq_length, vocab_size]
50
+ if temperature is None:
51
+ temperature = self.temperature
52
+
53
+ logits = logits_.float() / temperature
54
+ for invalid_slice in self.invalid_slices:
55
+ logits[..., invalid_slice] = -float("Inf")
56
+ logits = logits.view(-1, logits.shape[-1])
57
+
58
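+ # two-stage sampling: first pick one of the 500 token clusters (cluster_label2.npy maps the 20000 image tokens to clusters), then sample a token within the chosen cluster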
+ rprobs = F.softmax(logits.float(), dim=-1)
59
+ c = self.cluster_labels.expand(*rprobs.shape)
60
+ cprobs = torch.zeros(logits.shape[0], 500, device=logits.device).scatter_add_(
61
+ 1, c, rprobs
62
+ )
63
+
64
+ best_scores, best_clusters = cprobs.topk(self.topk)
65
+ bz = logits.shape[0]
66
+ best_scores = best_scores / best_scores.sum(dim=-1, keepdim=True)
67
+ sampled_ids = torch.multinomial(best_scores, num_samples=1)
68
+ selected_clusters = torch.gather(best_clusters, dim=1, index=sampled_ids)
69
+ selected_mask = (
70
+ self.cluster_labels.unsqueeze(0).expand(bz, -1) != selected_clusters
71
+ ) # cluster_labels [1, 20000] \in [0,500)
72
+ logits[selected_mask] = -65504
73
+ # for i in range(bz):
74
+ # selected_cluster = best_clusters[i][torch.multinomial(best_scores[i] / best_scores[i].sum(), num_samples=1)]
75
+ # logits[i, self.cluster_labels != selected_cluster] = -65504
76
+
77
+ # logits = top_k_logits(logits, self.topk, self.top_p)
78
+ probs = F.softmax(
79
+ logits.float() / 0.6, dim=-1
80
+ ) # float is essential, due to a bug in PyTorch
81
+ pred = torch.multinomial(probs, num_samples=1).view(*logits_.shape[:2])
82
+
83
+ assert tokens.shape[1] == pred.shape[1] + 1
84
+ tokens = torch.cat((tokens[:, :1], pred), dim=1)
85
+ return tokens
86
+
87
+
88
+ def filling_sequence_dsr(
89
+ model,
90
+ seq0,
91
+ seq1,
92
+ warmup_steps=3,
93
+ block_hw=(4, 4),
94
+ strategy=IterativeEntfilterStrategy(topk=10),
95
+ ):
96
+ """
97
+ seq: [PAD]... [ROI1] text ... [BOI1] {layout[0]} 1024 {layout[1]} [EOI1]
98
+ 4095 {layout[2]} final_token.
99
+ Attention:
100
+ The sampling temperatures change across steps; temporarily we hard-code them here.
101
+ The temperature in the strategy is not used.
102
+ """
103
+ assert hasattr(model, "layout")
104
+ layout = model.layout
105
+ assert (
106
+ len(seq0.shape) == 2 and len(seq1.shape) == 2 and seq0.shape[0] == seq1.shape[0]
107
+ )
108
+ assert len(layout) == 3
109
+ assert seq1.shape[1] == layout[-1] - layout[-2] + 1
110
+ assert (seq1 >= 0).all() and (seq0 >= 0).all()
111
+ device = seq0.device
112
+ # concat and pad sequences
113
+ batch_size = seq0.shape[0]
114
+ n_pad = layout[1] - seq0.shape[1]
115
+ assert n_pad > 0, "You should truncate long input before filling."
116
+ seq = torch.cat(
117
+ (
118
+ torch.tensor([0] * n_pad, device=device, dtype=seq0.dtype)
119
+ .unsqueeze(0)
120
+ .expand(batch_size, n_pad),
121
+ seq0,
122
+ seq1,
123
+ ),
124
+ dim=1,
125
+ ) # [b, layout[-1]+1]
126
+ assert seq.shape[1] == layout[-1] + 1
127
+
128
+ # build initial tokens, attention_mask, and position_ids
129
+ tokens = seq.clone()
130
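+ # attention mask over the low-res part: text positions do not attend to image positions, and the left-padding positions are masked out as keys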
+ attention_mask = torch.ones(layout[1], layout[1]).to(device)
131
+ attention_mask[: layout[0], layout[0] :] = 0
132
+ attention_mask[n_pad:, :n_pad] = 0
133
+ attention_mask = attention_mask.type_as(next(model.parameters())) # if fp16
134
+ position_ids = torch.cat(
135
+ (
136
+ torch.zeros(n_pad, dtype=torch.long),
137
+ torch.arange(0, layout[0] - n_pad),
138
+ torch.arange(513, 513 + layout[1] - layout[0]),
139
+ torch.arange(1024, 1024 + layout[2] - layout[1]),
140
+ )
141
+ ).to(device)
142
+ log_attention_weights = torch.zeros(layout[1], layout[1], device=device).type_as(
143
+ next(model.parameters())
144
+ )
145
+ log_attention_weights[layout[0] :, n_pad : layout[0]] = 0.0
146
+
147
+ # prepare for iteration
148
+ unfixed = tokens < 0 # just init an all-False tensor
149
+ unfixed[:, -layout[-1] + layout[-2] :] = True
150
+
151
+ ll, rr = block_hw
152
+ edge_len = int(math.sqrt(layout[-1] - layout[-2]) + 1e-4)
153
+ num_steps = warmup_steps + ll - 1 + rr
154
+ # iterative refining
155
+
156
+ # unfixed[..., -(layout[-1] - layout[-2]):].view(
157
+ # batch_size, edge_len//ll, ll, edge_len//rr, rr)[:, :, :, :, -1] = False
158
+
159
+ ret = []
160
+ ret.append(tokens[:, layout[-2] + 1 :].clone())
161
+ for step_cnt in range(1, num_steps + 1):
162
+ if step_cnt <= warmup_steps:
163
+ logits, *_dump = model(
164
+ tokens[:, :-1],
165
+ position_ids,
166
+ attention_mask,
167
+ log_attention_weights=log_attention_weights,
168
+ )
169
+ real_temp = 1.0
170
+ new_tokens = strategy.forward(logits, tokens, real_temp)
171
+ tokens[unfixed] = new_tokens[unfixed]
172
+ else:
173
+ logits, *_dump = model(
174
+ tokens[:, :-1],
175
+ position_ids,
176
+ attention_mask,
177
+ log_attention_weights=log_attention_weights,
178
+ )
179
+ real_temp = 1.0
180
+ new_tokens = strategy.forward(
181
+ logits,
182
+ tokens,
183
+ real_temp,
184
+ entfilter=1.3,
185
+ filter_topk=5,
186
+ temperature2=0.6,
187
+ )
188
+ # tokens[unfixed] = new_tokens[unfixed]
189
+ # fixed tokens (update unfixed)
190
+ unfixed2 = tokens > 10000000
191
+ for x in range(min(ll, step_cnt - warmup_steps)):
192
+ y = step_cnt - warmup_steps - x - 1
193
+ if y < rr:
194
+ unfixed[..., -(layout[-1] - layout[-2]) :].view(
195
+ batch_size, edge_len // ll, ll, edge_len // rr, rr
196
+ )[:, :, x, :, y] = False
197
+ unfixed2[..., -(layout[-1] - layout[-2]) :].view(
198
+ batch_size, edge_len // ll, ll, edge_len // rr, rr
199
+ )[:, :, x, :, y] = True
200
+ tokens[unfixed2] = new_tokens[unfixed2]
201
+
202
+ ret.append(tokens[:, layout[-2] + 1 :].clone())
203
+
204
+ return ret
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/iterative_sr.py ADDED
@@ -0,0 +1,118 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : iterative_sr.py
4
+ @Time : 2022/03/02 15:57:45
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+
20
+ from PIL import ImageEnhance, Image
21
+
22
+ import torch
23
+ import argparse
24
+ from torchvision import transforms
25
+
26
+ from SwissArmyTransformer.training.model_io import load_checkpoint
27
+ from SwissArmyTransformer import get_args
28
+ from .itersr_sampling import filling_sequence_itersr, IterativeEntfilterStrategy
29
+ from SwissArmyTransformer.generation.utils import timed_name, save_multiple_images, generate_continually
30
+
31
+ from .itersr_model import ItersrModel
32
+
33
+ from videogen_hub.depend.icetk import icetk as tokenizer
34
+
35
+ class IterativeSuperResolution:
36
+ def __init__(self, args, path, max_bz=4, shared_transformer=None):
37
+ args.load = path
38
+ args.kernel_size = 5
39
+ args.kernel_size2 = 5
40
+ args.new_sequence_length = 4624
41
+ args.layout = [16,3616]
42
+
43
+ model = ItersrModel(args, transformer=shared_transformer)
44
+ if args.fp16:
45
+ model = model.half()
46
+
47
+ load_checkpoint(model, args) # on cpu
48
+ model.eval()
49
+ self.model = model.cuda()
50
+
51
+ # save cpu weights
52
+ self.saved_weights = dict((k,v.cpu())
53
+ for k, v in model.named_parameters()
54
+ if 'transformer' in k
55
+ )
56
+
57
+ invalid_slices = [slice(tokenizer.num_image_tokens, None)]
58
+
59
+ self.strategy = IterativeEntfilterStrategy(invalid_slices,
60
+ temperature=args.temp_all_itersr, topk=args.topk_itersr)
61
+ self.max_bz = max_bz
62
+
63
+ def _restore_transformer_from_cpu(self, non_blocking=False):
64
+ for k, v in self.model.named_parameters():
65
+ if k in self.saved_weights:
66
+ v.copy_(self.saved_weights[k])
67
+
68
+ def __call__(self, text_tokens, image_tokens, enhance=False, input_mask=None):
69
+ if len(text_tokens.shape) == 1:
70
+ text_tokens.unsqueeze_(0)
71
+ text_tokens = text_tokens.clone()[..., :16]
72
+ if len(image_tokens.shape) == 1:
73
+ image_tokens.unsqueeze_(0)
74
+ if enhance:
75
+ new_image_tokens = []
76
+ for big_img in image_tokens:
77
+ decoded = tokenizer.decode(image_ids=big_img).squeeze(0)
78
+ ndarr = decoded.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to('cpu', torch.uint8).numpy()
79
+ image_pil_raw = ImageEnhance.Sharpness(Image.fromarray(ndarr))
80
+ big_img2 = tokenizer.encode(image_pil=image_pil_raw.enhance(1.5), image_size=480).view(-1)
81
+ new_image_tokens.append(big_img2)
82
+ image_tokens = torch.stack(new_image_tokens)
83
+ print('Converting Itersr model...')
84
+ self._restore_transformer_from_cpu()
85
+ model = self.model
86
+ print('iterative super-resolution...')
87
+ output_list = []
88
+ for tim in range(max(text_tokens.shape[0] // self.max_bz, 1)):
89
+ big_img = image_tokens[tim*self.max_bz:(tim+1)*self.max_bz]
90
+ text_seq = text_tokens[tim*self.max_bz:(tim+1)*self.max_bz]
91
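+ # 6x6 re-masking schedule tiled over the 60x60 token grid: in round mask_ratio, tokens whose schedule value >= mask_ratio are reset to <start_of_image> and re-sampled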
+ mask_raw = torch.tensor(
92
+ [
93
+ -1, 0, 1, 2, 3, 4,
94
+ 0, -1, 2, -1, -2, 5,
95
+ 1, -2, 3, 4, 5, 6,
96
+ 2, 3, 4, 5, -1, 1,
97
+ 3, -1, -2, 0, -1, 2,
98
+ 4, 5, 6, 1, 3, -2
99
+ ]
100
+ ).view(1, 6, 1, 6).expand(10, 6, 10, 6).reshape(-1).contiguous()
101
+
102
+ topks = [60, 40, 40, 40, 20, 20, 10]
103
+
104
+ for mask_ratio in range(1, 7):
105
+ self.strategy.topk = topks[mask_ratio]
106
+ mask = (mask_raw.to(big_img.device) >= mask_ratio)
107
+ if input_mask is not None:
108
+ mask = mask & input_mask
109
+ big_img.masked_fill_(mask, tokenizer['<start_of_image>'])
110
+ seq1 = big_img
111
+ output1 = filling_sequence_itersr(model, text_seq, seq1,
112
+ warmup_steps=1, block_hw=(1, 0),
113
+ strategy=self.strategy
114
+ )
115
+ big_img = output1
116
+ print(f'Finished iteration {mask_ratio}.')
117
+ output_list.append(output1.clone())
118
+ return torch.cat(output_list, dim=0)
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/itersr_model.py ADDED
@@ -0,0 +1,232 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : itersr_model.py
4
+ @Time : 2021/10/02 01:36:32
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+ import torch
15
+ import torch.nn.functional as F
16
+
17
+
18
+ from SwissArmyTransformer.model.base_model import BaseModel, BaseMixin
19
+
20
+ from SwissArmyTransformer.mpu.utils import sqrt
21
+ from deepspeed.runtime.activation_checkpointing.checkpointing import get_cuda_rng_tracker
22
+ from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear
23
+ from SwissArmyTransformer.model.transformer import unscaled_init_method, split_tensor_along_last_dim
24
+
25
+ class PositionEmbeddingMixin(BaseMixin):
26
+ def __init__(self, additional_sequence_length, hidden_size,
27
+ init_method_std=0.02, reinit_slice=slice(512, 512+400)
28
+ ):
29
+ super(PositionEmbeddingMixin, self).__init__()
30
+ self.reinit_slice = reinit_slice
31
+ self.position_embeddings = torch.nn.Embedding(additional_sequence_length, hidden_size)
32
+ torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)
33
+
34
+ def reinit(self, parent_model=None):
35
+ old_weights = self.transformer.position_embeddings.weight.data[self.reinit_slice]
36
+ old_len, hidden_size = old_weights.shape
37
+ assert hidden_size == self.position_embeddings.weight.shape[-1]
38
+ old_edge, new_edge = sqrt(old_len), sqrt(self.position_embeddings.weight.shape[-2])
39
+ assert new_edge % old_edge == 0
40
+ self.position_embeddings.weight.data.view(new_edge // old_edge, old_edge, new_edge // old_edge, old_edge, hidden_size).copy_(old_weights.view(1, old_edge, 1, old_edge, hidden_size))
41
+
42
+ class ItersrModel(BaseModel):
43
+ def __init__(self, args, transformer=None):
44
+ super().__init__(args, transformer=transformer)
45
+ self.original_sequence_length = args.max_sequence_length
46
+ additional_seqlen = args.new_sequence_length - args.max_sequence_length
47
+ self.add_mixin('extra_position_embedding', PositionEmbeddingMixin(
48
+ additional_seqlen, args.hidden_size
49
+ ))
50
+ # self.add_mixin('attention_plus', AttentionMixin(
51
+ # num_layers=args.num_layers,
52
+ # hidden_size=args.hidden_size
53
+ # ))
54
+ self.layout = args.layout
55
+ # [PAD]... [ROI1] text ... [BOI1] {layout[0]} 1024 {layout[1]} [EOI1] 4095 {layout[2]}
56
+ self.kernel_size = args.kernel_size
57
+ self.kernel_size2 = args.kernel_size2
58
+ self.log_attention_weights = None
59
+
60
+ def position_embedding_forward(self, position_ids, **kw_args):
61
+ position = position_ids[..., :self.layout[0]]
62
+ position_plus = position_ids[..., self.layout[0]:] - self.original_sequence_length
63
+ position_embeddings = torch.cat(
64
+ (
65
+ self.transformer.position_embeddings(position),
66
+ self.get_mixin('extra_position_embedding').position_embeddings(position_plus)
67
+ ),
68
+ dim=-2
69
+ )
70
+ return position_embeddings
71
+
72
+ def attention_forward(self, hidden_states, mask,
73
+ layer_id=None, log_attention_weights=None, **kw_args):
74
+ attn_module = self.transformer.layers[layer_id].attention
75
+ # base model qkv
76
+ mixed_raw_layer = attn_module.query_key_value(hidden_states)
77
+ q0, k0, v0 = split_tensor_along_last_dim(mixed_raw_layer[:, :self.layout[0]], 3)
78
+ # cuda2d model qkv
79
+ q1, k1, v1 = split_tensor_along_last_dim(mixed_raw_layer[:, self.layout[0]:], 3)
80
+
81
+ dropout_fn = attn_module.attention_dropout if self.training else None
82
+
83
+ # cuda2d attention
84
+ context_layer = sparse_attention_2d_text(
85
+ q0, k0, v0,
86
+ q1, k1, v1,
87
+ mask,
88
+ n_head=attn_module.num_attention_heads_per_partition,
89
+ text_len=self.layout[0],
90
+ kernel_size=self.kernel_size,
91
+ attention_dropout=dropout_fn,
92
+ log_attention_weights=log_attention_weights,
93
+ )
94
+
95
+ output = attn_module.dense(context_layer)
96
+
97
+ return output
98
+
99
+ def final_forward(self, logits, **kwargs):
100
+ logits_parallel = logits
101
+ logits_parallel = torch.nn.functional.linear(logits_parallel, self.transformer.word_embeddings.weight[:20000]).float()
102
+ # logits_parallel = torch.nn.functional.linear(logits_parallel, self.transformer.word_embeddings.weight[:20000])
103
+ return logits_parallel
104
+
105
+ # def disable_untrainable_params(self):
106
+ # self.transformer.requires_grad_(False)
107
+
108
+ @classmethod
109
+ def add_model_specific_args(cls, parser):
110
+ group = parser.add_argument_group('Cuda2dModel', 'cuda2d model configurations')
111
+ group.add_argument("--kernel-size", type=int, default=5)
112
+ group.add_argument("--kernel-size2", type=int, default=5)
113
+ group.add_argument("--layout", type=str, default='16,3616')
114
+ group.add_argument("--new-sequence-length", type=int, default=4096)
115
+ return parser
116
+
117
+ def sparse_attention_2d_text(q0, k0, v0, q1, k1, v1, attention_mask, n_head, text_len, kernel_size=9, attention_dropout=None, log_attention_weights = None, **kwargs):
118
+ '''
119
+ q0, k0, v0: [batch_size, 16, hidden_size]
120
+ q1, k1, v1: [batch_size, 3600, hidden_size]
121
+ n_head: int
122
+ attention_mask: [batch_size, 16]
123
+ '''
124
+ from SwissArmyTransformer.ops.local_attention_function import f_similar, f_weighting
125
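+ # level 0 (text tokens) uses full attention; level 1 (image tokens) uses local window attention plus dense cross attention onto the text keys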
+ b, s0, h0 = q0.shape
126
+ b, s1, h1 = q1.shape
127
+ h, l1 = h0 // n_head, sqrt(s1)
128
+ assert attention_mask.shape[-1] == s0, f"Mask Shape: {attention_mask.shape}"
129
+
130
+ q0 = q0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
131
+ v0 = v0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
132
+ k0T = k0.reshape(b, s0, n_head, h).permute(0, 2, 3, 1)
133
+
134
+ # standard attention for level 0
135
+ attention_scores = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T)
136
+
137
+ attention_scores = torch.mul(attention_scores, attention_mask) - \
138
+ 10000.0 * (1.0 - attention_mask)
139
+
140
+ attention_probs0 = F.softmax(attention_scores, dim=-1)
141
+
142
+ # local attention for level 1
143
+ q1 = (q1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1) / math.sqrt(h1//n_head)).contiguous().view(b*n_head, h1//n_head, l1, l1)
144
+ k1 = k1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b*n_head, h1//n_head, l1, l1)
145
+ v1 = v1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b*n_head, h1//n_head, l1, l1)
146
+ scores_1_to_1 = f_similar(q1, k1, kernel_size*2-1, kernel_size, False)
147
+
148
+ # cross attention
149
+ scores_1_to_0 = torch.matmul(q1.view(b, n_head, h, s1).transpose(-1, -2), k0T)
150
+ if log_attention_weights is not None:
151
+ scores_1_to_0 += log_attention_weights
152
+ scores_1_to_0 = torch.mul(scores_1_to_0, attention_mask) - \
153
+ 10000.0 * (1.0 - attention_mask)
154
+ scores_1 = torch.cat(
155
+ (
156
+ scores_1_to_0.view(b*n_head, s1, s0),
157
+ scores_1_to_1.view(b*n_head, -1, scores_1_to_1.shape[3])
158
+ ),
159
+ dim=-1)
160
+ attention_probs1 = F.softmax(scores_1, dim=-1)
161
+
162
+ if attention_dropout is not None:
163
+ with get_cuda_rng_tracker().fork():
164
+ attention_probs1 = attention_dropout(attention_probs1)
165
+
166
+ # weighting for level 0
167
+ context0 = torch.matmul(attention_probs0, v0) # [b, n_head, s0, h]
168
+ # weighting for level 1
169
+ probs_1_to_1 = attention_probs1[:, :, -scores_1_to_1.shape[3]:].view_as(scores_1_to_1)
170
+ context1_to_1 = f_weighting(v1, probs_1_to_1.contiguous(), kernel_size*2-1, kernel_size, False)
171
+
172
+ context1 = context1_to_1.view(b, n_head, h, l1**2)
173
+ # weighting for cross attention
174
+ probs_1_to_0 = attention_probs1[:, :, :scores_1_to_0.shape[3]].view(b, n_head, -1, scores_1_to_0.shape[3])
175
+
176
+ context1_to_0 = torch.matmul(probs_1_to_0, v0)
177
+ context1 = context1.transpose(-1, -2) + context1_to_0
178
+
179
+ output = torch.cat((context0, context1), dim=2).transpose(1, 2).reshape(b, s0+s1, h0)
180
+
181
+ return output
182
+
183
+ def sparse_attention_2d_notext(q0, k0, v0, q1, k1, v1, attention_mask, n_head, text_len, kernel_size=9, attention_dropout=None, log_attention_weights = None, **kwargs):
184
+ '''
185
+ q0, k0, v0: [batch_size, 16, hidden_size]
186
+ q1, k1, v1: [batch_size, 3600, hidden_size]
187
+ n_head: int
188
+ attention_mask: [batch_size, 16]
189
+ '''
190
+ from SwissArmyTransformer.mpu.local_attention_function import f_similar, f_weighting
191
+ b, s0, h0 = q0.shape
192
+ b, s1, h1 = q1.shape
193
+ h, l1 = h0 // n_head, sqrt(s1)
194
+ assert len(attention_mask.shape) == 4 and attention_mask.shape[-1] == s0, f"Mask Shape: {attention_mask.shape}"
195
+
196
+ q0 = q0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
197
+ v0 = v0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
198
+ k0T = k0.reshape(b, s0, n_head, h).permute(0, 2, 3, 1)
199
+
200
+ # standard attention for level 0
201
+ attention_scores = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T)
202
+
203
+ attention_scores = torch.mul(attention_scores, attention_mask) - \
204
+ 10000.0 * (1.0 - attention_mask)
205
+
206
+ attention_probs0 = F.softmax(attention_scores, dim=-1)
207
+
208
+ # local attention for level 1
209
+ q1 = (q1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1) / math.sqrt(h1//n_head)).contiguous().view(b*n_head, h1//n_head, l1, l1)
210
+ k1 = k1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b*n_head, h1//n_head, l1, l1)
211
+ v1 = v1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b*n_head, h1//n_head, l1, l1)
212
+ scores_1_to_1 = f_similar(q1, k1, kernel_size*2-1, kernel_size, False)
213
+
214
+ attention_probs1 = F.softmax(scores_1_to_1, dim=-1)
215
+
216
+ if attention_dropout is not None:
217
+ with get_cuda_rng_tracker().fork():
218
+ attention_probs1 = attention_dropout(attention_probs1)
219
+
220
+ # weighting for level 0
221
+ context0 = torch.matmul(attention_probs0, v0) # [b, n_head, s0, h]
222
+ # weighting for level 1
223
+ probs_1_to_1 = attention_probs1
224
+ context1_to_1 = f_weighting(v1, probs_1_to_1.contiguous(), kernel_size*2-1, kernel_size, False)
225
+
226
+ context1 = context1_to_1.view(b, n_head, h, l1**2)
227
+ # weighting for cross attention
228
+ context1 = context1.transpose(-1, -2)
229
+
230
+ output = torch.cat((context0, context1), dim=2).transpose(1, 2).reshape(b, s0+s1, h0)
231
+
232
+ return output
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/itersr_sampling.py ADDED
@@ -0,0 +1,168 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : itersr_sampling.py
4
+ @Time : 2022/03/03 14:24:28
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+ import numpy as np
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from videogen_hub.depend.icetk import icetk as tokenizer
19
+
20
+ def top_k_logits_(logits, top_k=0, filter_value=-float('Inf')):
21
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
22
+ logits[indices_to_remove] = filter_value
23
+ return logits
24
+
25
+ # class IterativeEntfilterStrategy:
26
+ # def __init__(self, invalid_slices=[], temperature=1., topk=10):
27
+ # self.invalid_slices = invalid_slices
28
+ # self.temperature = temperature
29
+ # self.topk = topk
30
+ # self.cluster_labels = torch.tensor(np.load('cluster_label.npy'), device='cuda', dtype=torch.long)
31
+
32
+
33
+ # def forward(self, logits_, tokens, temperature=None, entfilter=None, filter_topk=5, temperature2=None):
34
+ # # In interative strategy, logits are of shape [batch_size, seq_length, hidden_size]
35
+ # if temperature is None:
36
+ # temperature = self.temperature
37
+
38
+ # logits = logits_.float() / temperature
39
+ # for invalid_slice in self.invalid_slices:
40
+ # logits[..., invalid_slice] = -float('Inf')
41
+ # logits = logits.view(-1, logits.shape[-1])
42
+
43
+ # rprobs = F.softmax(logits.float(), dim=-1)
44
+ # c = self.cluster_labels.expand(*rprobs.shape)
45
+ # cprobs = torch.zeros(logits.shape[0], 500, device=logits.device).scatter_add_(1, c, rprobs)
46
+
47
+ # best_scores, best_clusters = cprobs.topk(self.topk)
48
+ # bz = logits.shape[0]
49
+ # best_scores = best_scores / best_scores.sum(dim=-1, keepdim=True)
50
+ # sampled_ids = torch.multinomial(best_scores, num_samples=1)
51
+ # selected_clusters = torch.gather(best_clusters, dim=1, index=sampled_ids)
52
+ # selected_mask = (self.cluster_labels.unsqueeze(0).expand(bz, -1) != selected_clusters) # cluster_labels [1, 20000] \in [0,500)
53
+ # logits[selected_mask] = -65504
54
+ # # for i in range(bz):
55
+ # # selected_cluster = best_clusters[i][torch.multinomial(best_scores[i] / best_scores[i].sum(), num_samples=1)]
56
+ # # logits[i, self.cluster_labels != selected_cluster] = -65504
57
+
58
+ # # logits = top_k_logits(logits, self.topk, self.top_p)
59
+ # probs = F.softmax(logits.float(), dim=-1) # float is essetial, due to a bug in Pytorch
60
+ # pred = torch.multinomial(probs, num_samples=1).view(*logits_.shape[:2])
61
+
62
+ # assert tokens.shape[1] == pred.shape[1]
63
+ # tokens = pred
64
+ # return tokens
65
+
66
+ class IterativeEntfilterStrategy:
67
+ def __init__(self, invalid_slices=[], temperature=1., topk=10):
68
+ self.invalid_slices = invalid_slices
69
+ self.temperature = temperature
70
+ self.topk = topk
71
+
72
+ def forward(self, logits, tokens, temperature=None, entfilter=None, filter_topk=5, temperature2=None):
73
+ # In the iterative strategy, logits are of shape [batch_size, seq_length, vocab_size]
74
+ if temperature is None:
75
+ temperature = self.temperature
76
+ # check entropy filter
77
+ # if entfilter is not None:
78
+ # assert temperature2 is not None
79
+ # topraw = (torch.topk(logits, filter_topk, dim=-1)[0]).softmax(dim=-1)
80
+ # ent = -(topraw * topraw.log()).sum(dim=-1) # [batch_size, seq_length]
81
+ # temperature = torch.tensor([[[temperature - temperature2]]], device=logits.device).expand(*logits.shape[:2], 1) * (ent > entfilter).unsqueeze(-1) + temperature2
82
+
83
+ logits = logits.float() / temperature
84
+ for invalid_slice in self.invalid_slices:
85
+ logits[..., invalid_slice] = -float('Inf')
86
+
87
+ # debiased topk
88
+ # probs = F.softmax(logits, dim=-1)
89
+ # tk_value, tk_idx = torch.topk(probs, self.topk, dim=-1)
90
+ # pred = torch.multinomial(probs.view(-1, logits.shape[-1]), num_samples=1).view(*logits.shape[:2], 1)
91
+ # edge_idx = tk_idx[:, :, -1:]
92
+ # edge_value = tk_value[:, :, -1:]
93
+ # edge_mask = probs.gather(dim=-1, index=pred) < edge_value
94
+ # pred[edge_mask] = edge_idx[edge_mask] # replace outliers as the "filter_topk"-th token
95
+ # pred.squeeze_(-1) # [batch_size, seq_length]
96
+
97
+ top_k_logits_(logits, self.topk)
98
+ probs = F.softmax(logits, dim=-1)
99
+ pred = torch.multinomial(probs.view(-1, logits.shape[-1]), num_samples=1).view(*logits.shape[:2], 1)
100
+ pred.squeeze_(-1)
101
+
102
+ assert tokens.shape[1] == pred.shape[1]
103
+ tokens = pred
104
+ return tokens
105
+
106
+ def filling_sequence_itersr(
107
+ model,
108
+ seq0,
109
+ seq1,
110
+ warmup_steps=3,
111
+ block_hw=(4, 4),
112
+ strategy=IterativeEntfilterStrategy(topk=10),
113
+ ):
114
+ '''
115
+ seq: [PAD]... [ROI1] text ... [BOI1] {layout[0]} 1024 {layout[1]} [EOI1]
116
+ 4095 {layout[2]} final_token.
117
+ Attention:
118
+ The sampling temperatures change across steps; temporarily we hard-code them here.
119
+ The temperature in the strategy is not used.
120
+ '''
121
+ assert hasattr(model, 'layout')
122
+ layout = model.layout
123
+
124
+ device = seq0.device
125
+ # concat and pad sequences
126
+ batch_size = seq0.shape[0]
127
+ n_pad = layout[0] - seq0.shape[1]
128
+ assert n_pad >= 0, "You should truncate long input before filling."
129
+ seq = torch.cat((
130
+ torch.tensor([0]*n_pad, device=device, dtype=seq0.dtype)
131
+ .unsqueeze(0).expand(batch_size, n_pad),
132
+ seq0, seq1), dim=1) # [b, layout[-1]+1]
133
+ assert seq.shape[1] == layout[-1]
134
+
135
+ # build initial tokens, attention_mask, and position_ids
136
+ tokens = seq.clone()
137
+ attention_mask = torch.ones(layout[0]).to(device)
138
+ attention_mask[:n_pad] = 0
139
+ attention_mask = attention_mask.unsqueeze(0).type_as(next(model.parameters())) # if fp16
140
+ position_ids = torch.cat((
141
+ torch.zeros(n_pad, dtype=torch.long),
142
+ torch.arange(0, layout[0] - n_pad),
143
+ torch.arange(1024, 1024+layout[1]-layout[0]))).to(device)
144
+ log_attention_weights = torch.zeros(layout[0], device=device).type_as(next(model.parameters()))
145
+ log_attention_weights[n_pad:layout[0]] = 0.
146
+ log_attention_weights = log_attention_weights.unsqueeze(0)
147
+
148
+ # prepare for iteration
149
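+ # only positions currently holding the <start_of_image> placeholder are re-sampled; all other tokens stay fixed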
+ unfixed = (tokens == tokenizer['<start_of_image>'])
150
+ ll, rr = block_hw
151
+ edge_len = int(math.sqrt(layout[-1] - layout[-2]) + 1e-4)
152
+ num_steps = 1
153
+ # iterative refining
154
+
155
+ # unfixed[..., -(layout[-1] - layout[-2]):].view(
156
+ # batch_size, edge_len//ll, ll, edge_len//rr, rr)[:, :, :, :, -1] = False
157
+
158
+
159
+ ret = []
160
+ # ret.append(tokens[:, layout[-2]:-1].clone())
161
+ for step_cnt in range(1, num_steps+1):
162
+ logits, *_dump = model(tokens, position_ids, attention_mask, log_attention_weights=log_attention_weights)
163
+ real_temp = 1.
164
+ new_tokens = strategy.forward(logits, tokens, real_temp)
165
+ tokens[unfixed] = new_tokens[unfixed]
166
+
167
+ ret.append(tokens[:, layout[-2]:].clone())
168
+ return torch.cat(ret, dim=0)
src/videogen_hub/pipelines/cogvideo/cogvideo_src/sr_pipeline/sr_group.py ADDED
@@ -0,0 +1,49 @@
1
+ # -*- encoding: utf-8 -*-
2
+ '''
3
+ @File : sr_group.py
4
+ @Time : 2022/04/02 01:17:21
5
+ @Author : Ming Ding
6
+ @Contact : [email protected]
7
+ '''
8
+
9
+ # here put the import lib
10
+ import os
11
+ import sys
12
+ import math
13
+ import random
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from SwissArmyTransformer.resources import auto_create
19
+ from .direct_sr import DirectSuperResolution
20
+ from .iterative_sr import IterativeSuperResolution
21
+
22
+ class SRGroup:
23
+ def __init__(self, args, home_path=None,):
24
+ dsr_path = auto_create('cogview2-dsr', path=home_path)
25
+ itersr_path = auto_create('cogview2-itersr', path=home_path)
26
+ dsr = DirectSuperResolution(args, dsr_path)
27
+ itersr = IterativeSuperResolution(args, itersr_path, shared_transformer=dsr.model.transformer)
28
+ self.dsr = dsr
29
+ self.itersr = itersr
30
+
31
+ def sr_base(self, img_tokens, txt_tokens):
32
+ assert img_tokens.shape[-1] == 400 and len(img_tokens.shape) == 2
33
+ batch_size = img_tokens.shape[0]
34
+ txt_len = txt_tokens.shape[-1]
35
+ if len(txt_tokens.shape) == 1:
36
+ txt_tokens = txt_tokens.unsqueeze(0).expand(batch_size, txt_len)
37
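+ # direct SR expands each 20x20 (400-token) frame to 60x60 (3600 tokens); iterative SR then refines those last 3600 tokens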
+ sred_tokens = self.dsr(txt_tokens, img_tokens)
38
+ iter_tokens = self.itersr(txt_tokens, sred_tokens[:, -3600:].clone())
39
+ return iter_tokens[-batch_size:]
40
+
41
+ # def sr_patch(self, img_tokens, txt_tokens):
42
+ # assert img_tokens.shape[-1] == 3600 and len(img_tokens.shape) == 2
43
+ # batch_size = img_tokens.shape[0] * 9
44
+ # txt_len = txt_tokens.shape[-1]
45
+ # if len(txt_tokens.shape) == 1:
46
+ # txt_tokens = txt_tokens.unsqueeze(0).expand(batch_size, txt_len)
47
+ # img_tokens = img_tokens.view(img_tokens.shape[0], 3, 20, 3, 20).permute(0, 1, 3, 2, 4).reshape(batch_size, 400)
48
+ # iter_tokens = self.sr_base(img_tokens, txt_tokens)
49
+ # return iter_tokens
src/videogen_hub/pipelines/consisti2v/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 TIGER Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
src/videogen_hub/pipelines/consisti2v/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/configs/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/configs/inference/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/configs/inference/inference.yaml ADDED
@@ -0,0 +1,48 @@
1
+ output_dir: "samples/inference"
2
+ output_name: "i2v"
3
+
4
+ pretrained_model_path: "TIGER-Lab/ConsistI2V"
5
+ unet_path: null
6
+ unet_ckpt_prefix: "module."
7
+ pipeline_pretrained_path: null
8
+
9
+ sampling_kwargs:
10
+ height: 256
11
+ width: 256
12
+ n_frames: 16
13
+ steps: 50
14
+ ddim_eta: 0.0
15
+ guidance_scale_txt: 7.5
16
+ guidance_scale_img: 1.0
17
+ guidance_rescale: 0.0
18
+ num_videos_per_prompt: 1
19
+ frame_stride: 3
20
+
21
+ unet_additional_kwargs:
22
+ variant: null
23
+ n_temp_heads: 8
24
+ augment_temporal_attention: true
25
+ temp_pos_embedding: "rotary" # "rotary" or "sinusoidal"
26
+ first_frame_condition_mode: "concat"
27
+ use_frame_stride_condition: true
28
+ noise_sampling_method: "pyoco_mixed" # "vanilla" or "pyoco_mixed" or "pyoco_progressive"
29
+ noise_alpha: 1.0
30
+
31
+ noise_scheduler_kwargs:
32
+ beta_start: 0.00085
33
+ beta_end: 0.012
34
+ beta_schedule: "linear"
35
+ steps_offset: 1
36
+ clip_sample: false
37
+ rescale_betas_zero_snr: false # true if using zero terminal snr
38
+ timestep_spacing: "leading" # "trailing" if using zero terminal snr
39
+ prediction_type: "epsilon" # "v_prediction" if using zero terminal snr
40
+
41
+ frameinit_kwargs:
42
+ enable: true
43
+ camera_motion: null
44
+ noise_level: 850
45
+ filter_params:
46
+ method: 'gaussian'
47
+ d_s: 0.25
48
+ d_t: 0.25
src/videogen_hub/pipelines/consisti2v/configs/inference/inference_autoregress.yaml ADDED
@@ -0,0 +1,49 @@
1
+ output_dir: "samples/inference"
2
+ output_name: "long_video"
3
+
4
+ pretrained_model_path: "TIGER-Lab/ConsistI2V"
5
+ unet_path: null
6
+ unet_ckpt_prefix: "module."
7
+ pipeline_pretrained_path: null
8
+
9
+ sampling_kwargs:
10
+ height: 256
11
+ width: 256
12
+ n_frames: 16
13
+ steps: 50
14
+ ddim_eta: 0.0
15
+ guidance_scale_txt: 7.5
16
+ guidance_scale_img: 1.0
17
+ guidance_rescale: 0.0
18
+ num_videos_per_prompt: 1
19
+ frame_stride: 3
20
+ autoregress_steps: 3
21
+
22
+ unet_additional_kwargs:
23
+ variant: null
24
+ n_temp_heads: 8
25
+ augment_temporal_attention: true
26
+ temp_pos_embedding: "rotary" # "rotary" or "sinusoidal"
27
+ first_frame_condition_mode: "concat"
28
+ use_frame_stride_condition: true
29
+ noise_sampling_method: "pyoco_mixed" # "vanilla" or "pyoco_mixed" or "pyoco_progressive"
30
+ noise_alpha: 1.0
31
+
32
+ noise_scheduler_kwargs:
33
+ beta_start: 0.00085
34
+ beta_end: 0.012
35
+ beta_schedule: "linear"
36
+ steps_offset: 1
37
+ clip_sample: false
38
+ rescale_betas_zero_snr: false # true if using zero terminal snr
39
+ timestep_spacing: "leading" # "trailing" if using zero terminal snr
40
+ prediction_type: "epsilon" # "v_prediction" if using zero terminal snr
41
+
42
+
43
+ frameinit_kwargs:
44
+ enable: true
45
+ noise_level: 850
46
+ filter_params:
47
+ method: 'gaussian'
48
+ d_s: 0.25
49
+ d_t: 0.25
src/videogen_hub/pipelines/consisti2v/configs/prompts/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/configs/prompts/default.yaml ADDED
@@ -0,0 +1,16 @@
1
+ seeds: random
2
+
3
+ prompts:
4
+ - "timelapse at the snow land with aurora in the sky."
5
+ - "fireworks."
6
+ - "clown fish swimming through the coral reef."
7
+ - "melting ice cream dripping down the cone."
8
+
9
+ n_prompts:
10
+ - ""
11
+
12
+ path_to_first_frames:
13
+ - "assets/example/example_01.png"
14
+ - "assets/example/example_02.png"
15
+ - "assets/example/example_03.png"
16
+ - "assets/example/example_04.png"
src/videogen_hub/pipelines/consisti2v/configs/training/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/configs/training/training.yaml ADDED
@@ -0,0 +1,92 @@
+ output_dir: "checkpoints"
+ pretrained_model_path: "stabilityai/stable-diffusion-2-1-base"
+
+ noise_scheduler_kwargs:
+   num_train_timesteps: 1000
+   beta_start: 0.00085
+   beta_end: 0.012
+   beta_schedule: "linear"
+   steps_offset: 1
+   clip_sample: false
+   rescale_betas_zero_snr: false # true if using zero terminal snr
+   timestep_spacing: "leading" # "trailing" if using zero terminal snr
+   prediction_type: "epsilon" # "v_prediction" if using zero terminal snr
+
+ train_data:
+   dataset: "joint"
+   pexels_config:
+     enable: false
+     json_path: null
+     caption_json_path: null
+     video_folder: null
+   webvid_config:
+     enable: true
+     json_path: "/path/to/webvid/annotation"
+     video_folder: "/path/to/webvid/data"
+   sample_size: 256
+   sample_duration: null
+   sample_fps: null
+   sample_stride: [1, 5]
+   sample_n_frames: 16
+
+ validation_data:
+   prompts:
+     - "timelapse at the snow land with aurora in the sky."
+     - "fireworks."
+     - "clown fish swimming through the coral reef."
+     - "melting ice cream dripping down the cone."
+
+   path_to_first_frames:
+     - "assets/example/example_01.jpg"
+     - "assets/example/example_02.jpg"
+     - "assets/example/example_03.jpg"
+     - "assets/example/example_04.jpg"
+
+   num_inference_steps: 50
+   ddim_eta: 0.0
+   guidance_scale_txt: 7.5
+   guidance_scale_img: 1.0
+   guidance_rescale: 0.0
+   frame_stride: 3
+
+ trainable_modules:
+   - "all"
+   # - "conv3ds."
+   # - "tempo_attns."
+
+ resume_from_checkpoint: null
+
+ unet_additional_kwargs:
+   variant: null
+   n_temp_heads: 8
+   augment_temporal_attention: true
+   temp_pos_embedding: "rotary" # "rotary" or "sinusoidal"
+   first_frame_condition_mode: "concat"
+   use_frame_stride_condition: true
+   noise_sampling_method: "pyoco_mixed" # "vanilla" or "pyoco_mixed" or "pyoco_progressive"
+   noise_alpha: 1.0
+
+ cfg_random_null_text_ratio: 0.1
+ cfg_random_null_img_ratio: 0.1
+
+ use_ema: false
+ ema_decay: 0.9999
+
+ learning_rate: 5.e-5
+ train_batch_size: 3
+ gradient_accumulation_steps: 1
+ max_grad_norm: 0.5
+
+ max_train_epoch: -1
+ max_train_steps: 200000
+ checkpointing_epochs: -1
+ checkpointing_steps: 2000
+ validation_steps: 1000
+
+ seed: 42
+ mixed_precision: "bf16"
+ num_workers: 32
+ enable_xformers_memory_efficient_attention: true
+
+ is_image: false
+ is_debug: false
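
The throughput fields above combine in the usual way. A quick back-of-the-envelope sketch; the GPU count is an assumption supplied by the launcher, not by this config:

train_batch_size = 3
gradient_accumulation_steps = 1
num_gpus = 8  # assumption: set by the accelerate/torchrun launcher, not by this file

effective_batch_size = train_batch_size * gradient_accumulation_steps * num_gpus
max_train_steps = 200_000
samples_seen = effective_batch_size * max_train_steps
print(effective_batch_size)  # 24 video clips per optimizer step
print(samples_seen)          # 4,800,000 clips sampled over the full run
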
src/videogen_hub/pipelines/consisti2v/consisti2v/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/consisti2v/data/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/consisti2v/data/dataset.py ADDED
@@ -0,0 +1,315 @@
1
+ import os, io, csv, math, random
2
+ import json
3
+ import numpy as np
4
+ from einops import rearrange
5
+ from decord import VideoReader
6
+
7
+ import torch
8
+ import torchvision.transforms as transforms
9
+ from torch.utils.data.dataset import Dataset
10
+
11
+ from diffusers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+ class WebVid10M(Dataset):
16
+ def __init__(
17
+ self,
18
+ json_path, video_folder=None,
19
+ sample_size=256, sample_stride=4, sample_n_frames=16,
20
+ is_image=False,
21
+ **kwargs,
22
+ ):
23
+ logger.info(f"loading annotations from {json_path} ...")
24
+ with open(json_path, 'rb') as json_file:
25
+ json_list = list(json_file)
26
+ self.dataset = [json.loads(json_str) for json_str in json_list]
27
+ self.length = len(self.dataset)
28
+ logger.info(f"data scale: {self.length}")
29
+
30
+ self.video_folder = video_folder
31
+ self.sample_stride = sample_stride if isinstance(sample_stride, int) else tuple(sample_stride)
32
+ self.sample_n_frames = sample_n_frames
33
+ self.is_image = is_image
34
+
35
+ sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
36
+ self.pixel_transforms = transforms.Compose([
37
+ transforms.RandomHorizontalFlip(),
38
+ transforms.Resize(sample_size[0], antialias=None),
39
+ transforms.CenterCrop(sample_size),
40
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
41
+ ])
42
+
43
+ def get_batch(self, idx):
44
+ video_dict = self.dataset[idx]
45
+ video_relative_path, name = video_dict['file'], video_dict['text']
46
+
47
+ if self.video_folder is not None:
48
+ if video_relative_path[0] == '/':
49
+ video_dir = os.path.join(self.video_folder, os.path.basename(video_relative_path))
50
+ else:
51
+ video_dir = os.path.join(self.video_folder, video_relative_path)
52
+ else:
53
+ video_dir = video_relative_path
54
+ video_reader = VideoReader(video_dir)
55
+ video_length = len(video_reader)
56
+
57
+ if not self.is_image:
58
+ if isinstance(self.sample_stride, int):
59
+ stride = self.sample_stride
60
+ elif isinstance(self.sample_stride, tuple):
61
+ stride = random.randint(self.sample_stride[0], self.sample_stride[1])
62
+ clip_length = min(video_length, (self.sample_n_frames - 1) * stride + 1)
63
+ start_idx = random.randint(0, video_length - clip_length)
64
+ batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
65
+ else:
66
+ frame_difference = random.randint(2, self.sample_n_frames)
67
+ clip_length = min(video_length, (frame_difference - 1) * self.sample_stride + 1)
68
+ start_idx = random.randint(0, video_length - clip_length)
69
+ batch_index = [start_idx, start_idx + clip_length - 1]
70
+
71
+ pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous()
72
+ pixel_values = pixel_values / 255.
73
+ del video_reader
74
+
75
+ return pixel_values, name
76
+
77
+ def __len__(self):
78
+ return self.length
79
+
80
+ def __getitem__(self, idx):
81
+ while True:
82
+ try:
83
+ pixel_values, name = self.get_batch(idx)
84
+ break
85
+
86
+ except Exception as e:
87
+ idx = random.randint(0, self.length-1)
88
+
89
+ pixel_values = self.pixel_transforms(pixel_values)
90
+ sample = dict(pixel_values=pixel_values, text=name)
91
+ return sample
92
+
93
+
94
+ class Pexels(Dataset):
95
+ def __init__(
96
+ self,
97
+ json_path, caption_json_path, video_folder=None,
98
+ sample_size=256, sample_duration=1, sample_fps=8,
99
+ is_image=False,
100
+ **kwargs,
101
+ ):
102
+ logger.info(f"loading captions from {caption_json_path} ...")
103
+ with open(caption_json_path, 'rb') as caption_json_file:
104
+ caption_json_list = list(caption_json_file)
105
+ self.caption_dict = {json.loads(json_str)['id']: json.loads(json_str)['text'] for json_str in caption_json_list}
106
+
107
+ logger.info(f"loading annotations from {json_path} ...")
108
+ with open(json_path, 'rb') as json_file:
109
+ json_list = list(json_file)
110
+ dataset = [json.loads(json_str) for json_str in json_list]
111
+
112
+ self.dataset = []
113
+ for data in dataset:
114
+ data['text'] = self.caption_dict[data['id']]
115
+ if data['height'] / data['width'] < 0.625:
116
+ self.dataset.append(data)
117
+ self.length = len(self.dataset)
118
+ logger.info(f"data scale: {self.length}")
119
+
120
+ self.video_folder = video_folder
121
+ self.sample_duration = sample_duration
122
+ self.sample_fps = sample_fps
123
+ self.sample_n_frames = sample_duration * sample_fps
124
+ self.is_image = is_image
125
+
126
+ sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
127
+ self.pixel_transforms = transforms.Compose([
128
+ transforms.RandomHorizontalFlip(),
129
+ transforms.Resize(sample_size[0], antialias=None),
130
+ transforms.CenterCrop(sample_size),
131
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
132
+ ])
133
+
134
+ def get_batch(self, idx):
135
+ video_dict = self.dataset[idx]
136
+ video_relative_path, name = video_dict['file'], video_dict['text']
137
+ fps = video_dict['fps']
138
+
139
+ if self.video_folder is not None:
140
+ if video_relative_path[0] == '/':
141
+ video_dir = os.path.join(self.video_folder, os.path.basename(video_relative_path))
142
+ else:
143
+ video_dir = os.path.join(self.video_folder, video_relative_path)
144
+ else:
145
+ video_dir = video_relative_path
146
+ video_reader = VideoReader(video_dir)
147
+ video_length = len(video_reader)
148
+
149
+ if not self.is_image:
150
+ clip_length = min(video_length, math.ceil(fps * self.sample_duration))
151
+ start_idx = random.randint(0, video_length - clip_length)
152
+ batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
153
+ else:
154
+ frame_difference = random.randint(2, self.sample_n_frames)
155
+ sample_stride = math.ceil((fps * self.sample_duration) / (self.sample_n_frames - 1) - 1)
156
+ clip_length = min(video_length, (frame_difference - 1) * sample_stride + 1)
157
+ start_idx = random.randint(0, video_length - clip_length)
158
+ batch_index = [start_idx, start_idx + clip_length - 1]
159
+
160
+ pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous()
161
+ pixel_values = pixel_values / 255.
162
+ del video_reader
163
+
164
+ return pixel_values, name
165
+
166
+ def __len__(self):
167
+ return self.length
168
+
169
+ def __getitem__(self, idx):
170
+ while True:
171
+ try:
172
+ pixel_values, name = self.get_batch(idx)
173
+ break
174
+
175
+ except Exception as e:
176
+ idx = random.randint(0, self.length-1)
177
+
178
+ pixel_values = self.pixel_transforms(pixel_values)
179
+ sample = dict(pixel_values=pixel_values, text=name)
180
+ return sample
181
+
182
+
183
+ class JointDataset(Dataset):
184
+ def __init__(
185
+ self,
186
+ webvid_config, pexels_config,
187
+ sample_size=256,
188
+ sample_duration=None, sample_fps=None, sample_stride=None, sample_n_frames=None,
189
+ is_image=False,
190
+ **kwargs,
191
+ ):
192
+ assert (sample_duration is None and sample_fps is None) or (sample_duration is not None and sample_fps is not None), "sample_duration and sample_fps should be both None or not None"
193
+ if sample_duration is not None and sample_fps is not None:
194
+ assert sample_stride is None, "when sample_duration and sample_fps are not None, sample_stride should be None"
195
+ if sample_stride is not None:
196
+ assert sample_fps is None and sample_duration is None, "when sample_stride is not None, sample_duration and sample_fps should be both None"
197
+
198
+ self.dataset = []
199
+
200
+ if pexels_config.enable:
201
+ logger.info(f"loading pexels dataset")
202
+ logger.info(f"loading captions from {pexels_config.caption_json_path} ...")
203
+ with open(pexels_config.caption_json_path, 'rb') as caption_json_file:
204
+ caption_json_list = list(caption_json_file)
205
+ self.caption_dict = {json.loads(json_str)['id']: json.loads(json_str)['text'] for json_str in caption_json_list}
206
+
207
+ logger.info(f"loading annotations from {pexels_config.json_path} ...")
208
+ with open(pexels_config.json_path, 'rb') as json_file:
209
+ json_list = list(json_file)
210
+ dataset = [json.loads(json_str) for json_str in json_list]
211
+
212
+ for data in dataset:
213
+ data['text'] = self.caption_dict[data['id']]
214
+ data['dataset'] = 'pexels'
215
+ if data['height'] / data['width'] < 0.625:
216
+ self.dataset.append(data)
217
+
218
+ if webvid_config.enable:
219
+ logger.info(f"loading webvid dataset")
220
+ logger.info(f"loading annotations from {webvid_config.json_path} ...")
221
+ with open(webvid_config.json_path, 'rb') as json_file:
222
+ json_list = list(json_file)
223
+ dataset = [json.loads(json_str) for json_str in json_list]
224
+ for data in dataset:
225
+ data['dataset'] = 'webvid'
226
+ self.dataset.extend(dataset)
227
+
228
+ self.length = len(self.dataset)
229
+ logger.info(f"data scale: {self.length}")
230
+
231
+ self.pexels_folder = pexels_config.video_folder
232
+ self.webvid_folder = webvid_config.video_folder
233
+ self.sample_duration = sample_duration
234
+ self.sample_fps = sample_fps
235
+ self.sample_n_frames = sample_duration * sample_fps if sample_n_frames is None else sample_n_frames
236
+ self.sample_stride = sample_stride if (sample_stride is None) or (sample_stride is not None and isinstance(sample_stride, int)) else tuple(sample_stride)
237
+ self.is_image = is_image
238
+
239
+ sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
240
+ self.pixel_transforms = transforms.Compose([
241
+ transforms.RandomHorizontalFlip(),
242
+ transforms.Resize(sample_size[0], antialias=None),
243
+ transforms.CenterCrop(sample_size),
244
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
245
+ ])
246
+
247
+ def get_batch(self, idx):
248
+ video_dict = self.dataset[idx]
249
+ video_relative_path, name = video_dict['file'], video_dict['text']
250
+
251
+ if video_dict['dataset'] == 'pexels':
252
+ video_folder = self.pexels_folder
253
+ elif video_dict['dataset'] == 'webvid':
254
+ video_folder = self.webvid_folder
255
+ else:
256
+ raise NotImplementedError
257
+
258
+ if video_folder is not None:
259
+ if video_relative_path[0] == '/':
260
+ video_dir = os.path.join(video_folder, os.path.basename(video_relative_path))
261
+ else:
262
+ video_dir = os.path.join(video_folder, video_relative_path)
263
+ else:
264
+ video_dir = video_relative_path
265
+ video_reader = VideoReader(video_dir)
266
+ video_length = len(video_reader)
267
+
268
+ stride = None
269
+ if not self.is_image:
270
+ if self.sample_duration is not None:
271
+ fps = video_dict['fps']
272
+ clip_length = min(video_length, math.ceil(fps * self.sample_duration))
273
+ elif self.sample_stride is not None:
274
+ if isinstance(self.sample_stride, int):
275
+ stride = self.sample_stride
276
+ elif isinstance(self.sample_stride, tuple):
277
+ stride = random.randint(self.sample_stride[0], self.sample_stride[1])
278
+ clip_length = min(video_length, (self.sample_n_frames - 1) * stride + 1)
279
+
280
+ start_idx = random.randint(0, video_length - clip_length)
281
+ batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
282
+
283
+ else:
284
+ frame_difference = random.randint(2, self.sample_n_frames)
285
+ if self.sample_duration is not None:
286
+ fps = video_dict['fps']
287
+ sample_stride = math.ceil((fps * self.sample_duration) / (self.sample_n_frames - 1) - 1)
288
+ elif self.sample_stride is not None:
289
+ sample_stride = self.sample_stride
290
+
291
+ clip_length = min(video_length, (frame_difference - 1) * sample_stride + 1)
292
+ start_idx = random.randint(0, video_length - clip_length)
293
+ batch_index = [start_idx, start_idx + clip_length - 1]
294
+
295
+ pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous()
296
+ pixel_values = pixel_values / 255.
297
+ del video_reader
298
+
299
+ return pixel_values, name, stride
300
+
301
+ def __len__(self):
302
+ return self.length
303
+
304
+ def __getitem__(self, idx):
305
+ while True:
306
+ try:
307
+ pixel_values, name, stride = self.get_batch(idx)
308
+ break
309
+
310
+ except Exception as e:
311
+ idx = random.randint(0, self.length-1)
312
+
313
+ pixel_values = self.pixel_transforms(pixel_values)
314
+ sample = dict(pixel_values=pixel_values, text=name, stride=stride)
315
+ return sample
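
A minimal usage sketch for the datasets above, with placeholder paths; the annotation file is assumed to be JSON-lines with one {"file": ..., "text": ...} record per line, which is the format get_batch reads:

from torch.utils.data import DataLoader

# Placeholder paths; a sample_stride given as a range is resampled per clip.
dataset = WebVid10M(
    json_path="/path/to/webvid/annotation.jsonl",
    video_folder="/path/to/webvid/data",
    sample_size=256,
    sample_stride=[1, 5],
    sample_n_frames=16,
)
loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=4)
batch = next(iter(loader))
print(batch["pixel_values"].shape)  # torch.Size([2, 16, 3, 256, 256])
print(batch["text"])
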
src/videogen_hub/pipelines/consisti2v/consisti2v/models/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/consisti2v/models/rotary_embedding.py ADDED
@@ -0,0 +1,280 @@
1
+ from math import pi, log
2
+
3
+ import torch
4
+ from torch.nn import Module, ModuleList
5
+ from torch.cuda.amp import autocast
6
+ from torch import nn, einsum, broadcast_tensors, Tensor
7
+
8
+ from einops import rearrange, repeat
9
+
10
+ from beartype import beartype
11
+ from beartype.typing import Literal, Union, Optional
12
+
13
+ # helper functions
14
+
15
+ def exists(val):
16
+ return val is not None
17
+
18
+ def default(val, d):
19
+ return val if exists(val) else d
20
+
21
+ # broadcat, as tortoise-tts was using it
22
+
23
+ def broadcat(tensors, dim = -1):
24
+ broadcasted_tensors = broadcast_tensors(*tensors)
25
+ return torch.cat(broadcasted_tensors, dim = dim)
26
+
27
+ # rotary embedding helper functions
28
+
29
+ def rotate_half(x):
30
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
31
+ x1, x2 = x.unbind(dim = -1)
32
+ x = torch.stack((-x2, x1), dim = -1)
33
+ return rearrange(x, '... d r -> ... (d r)')
34
+
35
+ @autocast(enabled = False)
36
+ def apply_rotary_emb(freqs, t, start_index = 0, scale = 1., seq_dim = -2):
37
+ if t.ndim == 3:
38
+ seq_len = t.shape[seq_dim]
39
+ freqs = freqs[-seq_len:].to(t)
40
+
41
+ rot_dim = freqs.shape[-1]
42
+ end_index = start_index + rot_dim
43
+
44
+ assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
45
+
46
+ t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
47
+ t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
48
+ return torch.cat((t_left, t, t_right), dim = -1)
49
+
50
+ # learned rotation helpers
51
+
52
+ def apply_learned_rotations(rotations, t, start_index = 0, freq_ranges = None):
53
+ if exists(freq_ranges):
54
+ rotations = einsum('..., f -> ... f', rotations, freq_ranges)
55
+ rotations = rearrange(rotations, '... r f -> ... (r f)')
56
+
57
+ rotations = repeat(rotations, '... n -> ... (n r)', r = 2)
58
+ return apply_rotary_emb(rotations, t, start_index = start_index)
59
+
60
+ # classes
61
+
62
+ class RotaryEmbedding(Module):
63
+ @beartype
64
+ def __init__(
65
+ self,
66
+ dim,
67
+ custom_freqs: Optional[Tensor] = None,
68
+ freqs_for: Union[
69
+ Literal['lang'],
70
+ Literal['pixel'],
71
+ Literal['constant']
72
+ ] = 'lang',
73
+ theta = 10000,
74
+ max_freq = 10,
75
+ num_freqs = 1,
76
+ learned_freq = False,
77
+ use_xpos = False,
78
+ xpos_scale_base = 512,
79
+ interpolate_factor = 1.,
80
+ theta_rescale_factor = 1.,
81
+ seq_before_head_dim = False
82
+ ):
83
+ super().__init__()
84
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
85
+ # has some connection to NTK literature
86
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
87
+
88
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
89
+
90
+ self.freqs_for = freqs_for
91
+
92
+ if exists(custom_freqs):
93
+ freqs = custom_freqs
94
+ elif freqs_for == 'lang':
95
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
96
+ elif freqs_for == 'pixel':
97
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
98
+ elif freqs_for == 'constant':
99
+ freqs = torch.ones(num_freqs).float()
100
+
101
+ self.tmp_store('cached_freqs', None)
102
+ self.tmp_store('cached_scales', None)
103
+
104
+ self.freqs = nn.Parameter(freqs, requires_grad = learned_freq)
105
+
106
+ self.learned_freq = learned_freq
107
+
108
+ # dummy for device
109
+
110
+ self.tmp_store('dummy', torch.tensor(0))
111
+
112
+ # default sequence dimension
113
+
114
+ self.seq_before_head_dim = seq_before_head_dim
115
+ self.default_seq_dim = -3 if seq_before_head_dim else -2
116
+
117
+ # interpolation factors
118
+
119
+ assert interpolate_factor >= 1.
120
+ self.interpolate_factor = interpolate_factor
121
+
122
+ # xpos
123
+
124
+ self.use_xpos = use_xpos
125
+ if not use_xpos:
126
+ self.tmp_store('scale', None)
127
+ return
128
+
129
+ scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
130
+ self.scale_base = xpos_scale_base
131
+ self.tmp_store('scale', scale)
132
+
133
+ @property
134
+ def device(self):
135
+ return self.dummy.device
136
+
137
+ def tmp_store(self, key, value):
138
+ self.register_buffer(key, value, persistent = False)
139
+
140
+ def get_seq_pos(self, seq_len, device, dtype, offset = 0):
141
+ return (torch.arange(seq_len, device = device, dtype = dtype) + offset) / self.interpolate_factor
142
+
143
+ def rotate_queries_or_keys(self, t, seq_dim = None, offset = 0, freq_seq_len = None, seq_pos = None):
144
+ seq_dim = default(seq_dim, self.default_seq_dim)
145
+
146
+ assert not self.use_xpos, 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
147
+
148
+ device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
149
+
150
+ if exists(freq_seq_len):
151
+ assert freq_seq_len >= seq_len
152
+ seq_len = freq_seq_len
153
+
154
+ if seq_pos is None:
155
+ seq_pos = self.get_seq_pos(seq_len, device = device, dtype = dtype, offset = offset)
156
+ else:
157
+ assert seq_pos.shape[0] == seq_len
158
+
159
+ freqs = self.forward(seq_pos, seq_len = seq_len, offset = offset)
160
+
161
+ if seq_dim == -3:
162
+ freqs = rearrange(freqs, 'n d -> n 1 d')
163
+
164
+ return apply_rotary_emb(freqs, t, seq_dim = seq_dim)
165
+
166
+ def rotate_queries_with_cached_keys(self, q, k, seq_dim = None, offset = 0):
167
+ seq_dim = default(seq_dim, self.default_seq_dim)
168
+
169
+ q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
170
+ assert q_len <= k_len
171
+ rotated_q = self.rotate_queries_or_keys(q, seq_dim = seq_dim, freq_seq_len = k_len)
172
+ rotated_k = self.rotate_queries_or_keys(k, seq_dim = seq_dim)
173
+
174
+ rotated_q = rotated_q.type(q.dtype)
175
+ rotated_k = rotated_k.type(k.dtype)
176
+
177
+ return rotated_q, rotated_k
178
+
179
+ def rotate_queries_and_keys(self, q, k, seq_dim = None):
180
+ seq_dim = default(seq_dim, self.default_seq_dim)
181
+
182
+ assert self.use_xpos
183
+ device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
184
+
185
+ seq = self.get_seq_pos(seq_len, dtype = dtype, device = device)
186
+
187
+ freqs = self.forward(seq, seq_len = seq_len)
188
+ scale = self.get_scale(seq, seq_len = seq_len).to(dtype)
189
+
190
+ if seq_dim == -3:
191
+ freqs = rearrange(freqs, 'n d -> n 1 d')
192
+ scale = rearrange(scale, 'n d -> n 1 d')
193
+
194
+ rotated_q = apply_rotary_emb(freqs, q, scale = scale, seq_dim = seq_dim)
195
+ rotated_k = apply_rotary_emb(freqs, k, scale = scale ** -1, seq_dim = seq_dim)
196
+
197
+ rotated_q = rotated_q.type(q.dtype)
198
+ rotated_k = rotated_k.type(k.dtype)
199
+
200
+ return rotated_q, rotated_k
201
+
202
+ @beartype
203
+ def get_scale(
204
+ self,
205
+ t: Tensor,
206
+ seq_len: Optional[int] = None,
207
+ offset = 0
208
+ ):
209
+ assert self.use_xpos
210
+
211
+ should_cache = exists(seq_len)
212
+
213
+ if (
214
+ should_cache and \
215
+ exists(self.cached_scales) and \
216
+ (seq_len + offset) <= self.cached_scales.shape[0]
217
+ ):
218
+ return self.cached_scales[offset:(offset + seq_len)]
219
+
220
+ scale = 1.
221
+ if self.use_xpos:
222
+ power = (t - len(t) // 2) / self.scale_base
223
+ scale = self.scale ** rearrange(power, 'n -> n 1')
224
+ scale = torch.cat((scale, scale), dim = -1)
225
+
226
+ if should_cache:
227
+ self.tmp_store('cached_scales', scale)
228
+
229
+ return scale
230
+
231
+ def get_axial_freqs(self, *dims):
232
+ Colon = slice(None)
233
+ all_freqs = []
234
+
235
+ for ind, dim in enumerate(dims):
236
+ if self.freqs_for == 'pixel':
237
+ pos = torch.linspace(-1, 1, steps = dim, device = self.device)
238
+ else:
239
+ pos = torch.arange(dim, device = self.device)
240
+
241
+ freqs = self.forward(pos, seq_len = dim)
242
+
243
+ all_axis = [None] * len(dims)
244
+ all_axis[ind] = Colon
245
+
246
+ new_axis_slice = (Ellipsis, *all_axis, Colon)
247
+ all_freqs.append(freqs[new_axis_slice])
248
+
249
+ all_freqs = broadcast_tensors(*all_freqs)
250
+ return torch.cat(all_freqs, dim = -1)
251
+
252
+ @autocast(enabled = False)
253
+ def forward(
254
+ self,
255
+ t: Tensor,
256
+ seq_len = None,
257
+ offset = 0
258
+ ):
259
+ # should_cache = (
260
+ # not self.learned_freq and \
261
+ # exists(seq_len) and \
262
+ # self.freqs_for != 'pixel'
263
+ # )
264
+
265
+ # if (
266
+ # should_cache and \
267
+ # exists(self.cached_freqs) and \
268
+ # (offset + seq_len) <= self.cached_freqs.shape[0]
269
+ # ):
270
+ # return self.cached_freqs[offset:(offset + seq_len)].detach()
271
+
272
+ freqs = self.freqs
273
+
274
+ freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
275
+ freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
276
+
277
+ # if should_cache:
278
+ # self.tmp_store('cached_freqs', freqs.detach())
279
+
280
+ return freqs
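
A short sketch of how this class is used by the temporal attention layers later in this commit: a RotaryEmbedding is built over half of the channel dimension and applied to queries and keys before attention. Shapes below are illustrative:

import torch

dim = 64                              # per-token channel dimension
rotary = RotaryEmbedding(dim // 2)    # mirrors RotaryEmbedding(inner_dim // 2) in the attention code

q = torch.randn(2, 16, dim)           # (batch, frames, channels)
k = torch.randn(2, 16, dim)

# Rotates the first dim // 2 channels of each token by a position-dependent angle.
q_rot = rotary.rotate_queries_or_keys(q)
k_rot = rotary.rotate_queries_or_keys(k)
print(q_rot.shape, k_rot.shape)       # torch.Size([2, 16, 64]) for both
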
src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_attention.py ADDED
@@ -0,0 +1,809 @@
1
+ from importlib import import_module
2
+ from typing import Callable, Optional, Union
3
+ import math
4
+
5
+ from einops import rearrange, repeat
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+
11
+ from diffusers.utils import deprecate, logging
12
+ from diffusers.utils.import_utils import is_xformers_available
13
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
14
+ from diffusers.models.lora import LoRACompatibleLinear, LoRALinearLayer
15
+ from diffusers.models.attention_processor import (
16
+ Attention,
17
+ AttnAddedKVProcessor,
18
+ AttnAddedKVProcessor2_0,
19
+ AttnProcessor,
20
+ AttnProcessor2_0,
21
+ SpatialNorm,
22
+ LORA_ATTENTION_PROCESSORS,
23
+ CustomDiffusionAttnProcessor,
24
+ CustomDiffusionXFormersAttnProcessor,
25
+ SlicedAttnAddedKVProcessor,
26
+ XFormersAttnAddedKVProcessor,
27
+ LoRAAttnAddedKVProcessor,
28
+ XFormersAttnProcessor,
29
+ LoRAXFormersAttnProcessor,
30
+ LoRAAttnProcessor,
31
+ LoRAAttnProcessor2_0,
32
+ SlicedAttnProcessor,
33
+ AttentionProcessor
34
+ )
35
+
36
+ from .rotary_embedding import RotaryEmbedding
37
+
38
+
39
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
40
+
41
+
42
+ if is_xformers_available():
43
+ import xformers
44
+ import xformers.ops
45
+ else:
46
+ xformers = None
47
+
48
+ @maybe_allow_in_graph
49
+ class ConditionalAttention(nn.Module):
50
+ r"""
51
+ A cross attention layer.
52
+
53
+ Parameters:
54
+ query_dim (`int`): The number of channels in the query.
55
+ cross_attention_dim (`int`, *optional*):
56
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
57
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
58
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
59
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
60
+ bias (`bool`, *optional*, defaults to False):
61
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
62
+ """
63
+
64
+ def __init__(
65
+ self,
66
+ query_dim: int,
67
+ cross_attention_dim: Optional[int] = None,
68
+ heads: int = 8,
69
+ dim_head: int = 64,
70
+ dropout: float = 0.0,
71
+ bias=False,
72
+ upcast_attention: bool = False,
73
+ upcast_softmax: bool = False,
74
+ cross_attention_norm: Optional[str] = None,
75
+ cross_attention_norm_num_groups: int = 32,
76
+ added_kv_proj_dim: Optional[int] = None,
77
+ norm_num_groups: Optional[int] = None,
78
+ spatial_norm_dim: Optional[int] = None,
79
+ out_bias: bool = True,
80
+ scale_qk: bool = True,
81
+ only_cross_attention: bool = False,
82
+ eps: float = 1e-5,
83
+ rescale_output_factor: float = 1.0,
84
+ residual_connection: bool = False,
85
+ _from_deprecated_attn_block=False,
86
+ processor: Optional["AttnProcessor"] = None,
87
+ ):
88
+ super().__init__()
89
+ self.inner_dim = dim_head * heads
90
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
91
+ self.upcast_attention = upcast_attention
92
+ self.upcast_softmax = upcast_softmax
93
+ self.rescale_output_factor = rescale_output_factor
94
+ self.residual_connection = residual_connection
95
+ self.dropout = dropout
96
+
97
+ # we make use of this private variable to know whether this class is loaded
98
+ # with an deprecated state dict so that we can convert it on the fly
99
+ self._from_deprecated_attn_block = _from_deprecated_attn_block
100
+
101
+ self.scale_qk = scale_qk
102
+ self.scale = dim_head**-0.5 if self.scale_qk else 1.0
103
+
104
+ self.heads = heads
105
+ # for slice_size > 0 the attention score computation
106
+ # is split across the batch axis to save memory
107
+ # You can set slice_size with `set_attention_slice`
108
+ self.sliceable_head_dim = heads
109
+
110
+ self.added_kv_proj_dim = added_kv_proj_dim
111
+ self.only_cross_attention = only_cross_attention
112
+
113
+ if self.added_kv_proj_dim is None and self.only_cross_attention:
114
+ raise ValueError(
115
+ "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
116
+ )
117
+
118
+ if norm_num_groups is not None:
119
+ self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
120
+ else:
121
+ self.group_norm = None
122
+
123
+ if spatial_norm_dim is not None:
124
+ self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
125
+ else:
126
+ self.spatial_norm = None
127
+
128
+ if cross_attention_norm is None:
129
+ self.norm_cross = None
130
+ elif cross_attention_norm == "layer_norm":
131
+ self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
132
+ elif cross_attention_norm == "group_norm":
133
+ if self.added_kv_proj_dim is not None:
134
+ # The given `encoder_hidden_states` are initially of shape
135
+ # (batch_size, seq_len, added_kv_proj_dim) before being projected
136
+ # to (batch_size, seq_len, cross_attention_dim). The norm is applied
137
+ # before the projection, so we need to use `added_kv_proj_dim` as
138
+ # the number of channels for the group norm.
139
+ norm_cross_num_channels = added_kv_proj_dim
140
+ else:
141
+ norm_cross_num_channels = self.cross_attention_dim
142
+
143
+ self.norm_cross = nn.GroupNorm(
144
+ num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
145
+ )
146
+ else:
147
+ raise ValueError(
148
+ f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
149
+ )
150
+
151
+ self.to_q = LoRACompatibleLinear(query_dim, self.inner_dim, bias=bias)
152
+
153
+ if not self.only_cross_attention:
154
+ # only relevant for the `AddedKVProcessor` classes
155
+ self.to_k = LoRACompatibleLinear(self.cross_attention_dim, self.inner_dim, bias=bias)
156
+ self.to_v = LoRACompatibleLinear(self.cross_attention_dim, self.inner_dim, bias=bias)
157
+ else:
158
+ self.to_k = None
159
+ self.to_v = None
160
+
161
+ if self.added_kv_proj_dim is not None:
162
+ self.add_k_proj = LoRACompatibleLinear(added_kv_proj_dim, self.inner_dim)
163
+ self.add_v_proj = LoRACompatibleLinear(added_kv_proj_dim, self.inner_dim)
164
+
165
+ self.to_out = nn.ModuleList([])
166
+ self.to_out.append(LoRACompatibleLinear(self.inner_dim, query_dim, bias=out_bias))
167
+ self.to_out.append(nn.Dropout(dropout))
168
+
169
+ # set attention processor
170
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
171
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
172
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
173
+ if processor is None:
174
+ processor = (
175
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
176
+ )
177
+ self.set_processor(processor)
178
+
179
+ def set_use_memory_efficient_attention_xformers(
180
+ self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
181
+ ):
182
+ is_lora = hasattr(self, "processor") and isinstance(
183
+ self.processor,
184
+ LORA_ATTENTION_PROCESSORS,
185
+ )
186
+ is_custom_diffusion = hasattr(self, "processor") and isinstance(
187
+ self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)
188
+ )
189
+ is_added_kv_processor = hasattr(self, "processor") and isinstance(
190
+ self.processor,
191
+ (
192
+ AttnAddedKVProcessor,
193
+ AttnAddedKVProcessor2_0,
194
+ SlicedAttnAddedKVProcessor,
195
+ XFormersAttnAddedKVProcessor,
196
+ LoRAAttnAddedKVProcessor,
197
+ ),
198
+ )
199
+
200
+ if use_memory_efficient_attention_xformers:
201
+ if is_added_kv_processor and (is_lora or is_custom_diffusion):
202
+ raise NotImplementedError(
203
+ f"Memory efficient attention is currently not supported for LoRA or custom diffusion for attention processor type {self.processor}"
204
+ )
205
+ if not is_xformers_available():
206
+ raise ModuleNotFoundError(
207
+ (
208
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
209
+ " xformers"
210
+ ),
211
+ name="xformers",
212
+ )
213
+ elif not torch.cuda.is_available():
214
+ raise ValueError(
215
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
216
+ " only available for GPU "
217
+ )
218
+ else:
219
+ try:
220
+ # Make sure we can run the memory efficient attention
221
+ _ = xformers.ops.memory_efficient_attention(
222
+ torch.randn((1, 2, 40), device="cuda"),
223
+ torch.randn((1, 2, 40), device="cuda"),
224
+ torch.randn((1, 2, 40), device="cuda"),
225
+ )
226
+ except Exception as e:
227
+ raise e
228
+
229
+ if is_lora:
230
+ # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers
231
+ # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0?
232
+ processor = LoRAXFormersAttnProcessor(
233
+ hidden_size=self.processor.hidden_size,
234
+ cross_attention_dim=self.processor.cross_attention_dim,
235
+ rank=self.processor.rank,
236
+ attention_op=attention_op,
237
+ )
238
+ processor.load_state_dict(self.processor.state_dict())
239
+ processor.to(self.processor.to_q_lora.up.weight.device)
240
+ elif is_custom_diffusion:
241
+ processor = CustomDiffusionXFormersAttnProcessor(
242
+ train_kv=self.processor.train_kv,
243
+ train_q_out=self.processor.train_q_out,
244
+ hidden_size=self.processor.hidden_size,
245
+ cross_attention_dim=self.processor.cross_attention_dim,
246
+ attention_op=attention_op,
247
+ )
248
+ processor.load_state_dict(self.processor.state_dict())
249
+ if hasattr(self.processor, "to_k_custom_diffusion"):
250
+ processor.to(self.processor.to_k_custom_diffusion.weight.device)
251
+ elif is_added_kv_processor:
252
+ # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
253
+ # which uses this type of cross attention ONLY because the attention mask of format
254
+ # [0, ..., -10.000, ..., 0, ...,] is not supported
255
+ # throw warning
256
+ logger.info(
257
+ "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
258
+ )
259
+ processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
260
+ else:
261
+ processor = XFormersAttnProcessor(attention_op=attention_op)
262
+ else:
263
+ if is_lora:
264
+ attn_processor_class = (
265
+ LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
266
+ )
267
+ processor = attn_processor_class(
268
+ hidden_size=self.processor.hidden_size,
269
+ cross_attention_dim=self.processor.cross_attention_dim,
270
+ rank=self.processor.rank,
271
+ )
272
+ processor.load_state_dict(self.processor.state_dict())
273
+ processor.to(self.processor.to_q_lora.up.weight.device)
274
+ elif is_custom_diffusion:
275
+ processor = CustomDiffusionAttnProcessor(
276
+ train_kv=self.processor.train_kv,
277
+ train_q_out=self.processor.train_q_out,
278
+ hidden_size=self.processor.hidden_size,
279
+ cross_attention_dim=self.processor.cross_attention_dim,
280
+ )
281
+ processor.load_state_dict(self.processor.state_dict())
282
+ if hasattr(self.processor, "to_k_custom_diffusion"):
283
+ processor.to(self.processor.to_k_custom_diffusion.weight.device)
284
+ else:
285
+ # set attention processor
286
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
287
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
288
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
289
+ processor = (
290
+ AttnProcessor2_0()
291
+ if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
292
+ else AttnProcessor()
293
+ )
294
+
295
+ self.set_processor(processor)
296
+
297
+ def set_attention_slice(self, slice_size):
298
+ if slice_size is not None and slice_size > self.sliceable_head_dim:
299
+ raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
300
+
301
+ if slice_size is not None and self.added_kv_proj_dim is not None:
302
+ processor = SlicedAttnAddedKVProcessor(slice_size)
303
+ elif slice_size is not None:
304
+ processor = SlicedAttnProcessor(slice_size)
305
+ elif self.added_kv_proj_dim is not None:
306
+ processor = AttnAddedKVProcessor()
307
+ else:
308
+ # set attention processor
309
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
310
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
311
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
312
+ processor = (
313
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
314
+ )
315
+
316
+ self.set_processor(processor)
317
+
318
+ def set_processor(self, processor: "AttnProcessor"):
319
+ if (
320
+ hasattr(self, "processor")
321
+ and not isinstance(processor, LORA_ATTENTION_PROCESSORS)
322
+ and self.to_q.lora_layer is not None
323
+ ):
324
+ deprecate(
325
+ "set_processor to offload LoRA",
326
+ "0.26.0",
327
+ "In detail, removing LoRA layers via calling `set_processor` or `set_default_attn_processor` is deprecated. Please make sure to call `pipe.unload_lora_weights()` instead.",
328
+ )
329
+ # (Patrick, Sayak) - this can be deprecated once PEFT LoRA integration is complete
330
+ # We need to remove all LoRA layers
331
+ for module in self.modules():
332
+ if hasattr(module, "set_lora_layer"):
333
+ module.set_lora_layer(None)
334
+
335
+ # if current processor is in `self._modules` and if passed `processor` is not, we need to
336
+ # pop `processor` from `self._modules`
337
+ if (
338
+ hasattr(self, "processor")
339
+ and isinstance(self.processor, torch.nn.Module)
340
+ and not isinstance(processor, torch.nn.Module)
341
+ ):
342
+ logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
343
+ self._modules.pop("processor")
344
+
345
+ self.processor = processor
346
+
347
+ def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
348
+ if not return_deprecated_lora:
349
+ return self.processor
350
+
351
+ # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
352
+ # serialization format for LoRA Attention Processors. It should be deleted once the integration
353
+ # with PEFT is completed.
354
+ is_lora_activated = {
355
+ name: module.lora_layer is not None
356
+ for name, module in self.named_modules()
357
+ if hasattr(module, "lora_layer")
358
+ }
359
+
360
+ # 1. if no layer has a LoRA activated we can return the processor as usual
361
+ if not any(is_lora_activated.values()):
362
+ return self.processor
363
+
364
+ # Don't require `add_k_proj` or `add_v_proj` to have LoRA applied
365
+ is_lora_activated.pop("add_k_proj", None)
366
+ is_lora_activated.pop("add_v_proj", None)
367
+ # 2. else it is not possible that only some layers have LoRA activated
368
+ if not all(is_lora_activated.values()):
369
+ raise ValueError(
370
+ f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
371
+ )
372
+
373
+ # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
374
+ non_lora_processor_cls_name = self.processor.__class__.__name__
375
+ lora_processor_cls = getattr(import_module(__name__), "LoRA" + non_lora_processor_cls_name)
376
+
377
+ hidden_size = self.inner_dim
378
+
379
+ # now create a LoRA attention processor from the LoRA layers
380
+ if lora_processor_cls in [LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor]:
381
+ kwargs = {
382
+ "cross_attention_dim": self.cross_attention_dim,
383
+ "rank": self.to_q.lora_layer.rank,
384
+ "network_alpha": self.to_q.lora_layer.network_alpha,
385
+ "q_rank": self.to_q.lora_layer.rank,
386
+ "q_hidden_size": self.to_q.lora_layer.out_features,
387
+ "k_rank": self.to_k.lora_layer.rank,
388
+ "k_hidden_size": self.to_k.lora_layer.out_features,
389
+ "v_rank": self.to_v.lora_layer.rank,
390
+ "v_hidden_size": self.to_v.lora_layer.out_features,
391
+ "out_rank": self.to_out[0].lora_layer.rank,
392
+ "out_hidden_size": self.to_out[0].lora_layer.out_features,
393
+ }
394
+
395
+ if hasattr(self.processor, "attention_op"):
396
+ kwargs["attention_op"] = self.prcoessor.attention_op
397
+
398
+ lora_processor = lora_processor_cls(hidden_size, **kwargs)
399
+ lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
400
+ lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
401
+ lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
402
+ lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())
403
+ elif lora_processor_cls == LoRAAttnAddedKVProcessor:
404
+ lora_processor = lora_processor_cls(
405
+ hidden_size,
406
+ cross_attention_dim=self.add_k_proj.weight.shape[0],
407
+ rank=self.to_q.lora_layer.rank,
408
+ network_alpha=self.to_q.lora_layer.network_alpha,
409
+ )
410
+ lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
411
+ lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
412
+ lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
413
+ lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())
414
+
415
+ # only save if used
416
+ if self.add_k_proj.lora_layer is not None:
417
+ lora_processor.add_k_proj_lora.load_state_dict(self.add_k_proj.lora_layer.state_dict())
418
+ lora_processor.add_v_proj_lora.load_state_dict(self.add_v_proj.lora_layer.state_dict())
419
+ else:
420
+ lora_processor.add_k_proj_lora = None
421
+ lora_processor.add_v_proj_lora = None
422
+ else:
423
+ raise ValueError(f"{lora_processor_cls} does not exist.")
424
+
425
+ return lora_processor
426
+
427
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
428
+ # The `Attention` class can call different attention processors / attention functions
429
+ # here we simply pass along all tensors to the selected processor class
430
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
431
+ return self.processor(
432
+ self,
433
+ hidden_states,
434
+ encoder_hidden_states=encoder_hidden_states,
435
+ attention_mask=attention_mask,
436
+ **cross_attention_kwargs,
437
+ )
438
+
439
+ def batch_to_head_dim(self, tensor):
440
+ head_size = self.heads
441
+ batch_size, seq_len, dim = tensor.shape
442
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
443
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
444
+ return tensor
445
+
446
+ def head_to_batch_dim(self, tensor, out_dim=3):
447
+ head_size = self.heads
448
+ batch_size, seq_len, dim = tensor.shape
449
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
450
+ tensor = tensor.permute(0, 2, 1, 3)
451
+
452
+ if out_dim == 3:
453
+ tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
454
+
455
+ return tensor
456
+
457
+ def get_attention_scores(self, query, key, attention_mask=None):
458
+ dtype = query.dtype
459
+ if self.upcast_attention:
460
+ query = query.float()
461
+ key = key.float()
462
+
463
+ if attention_mask is None:
464
+ baddbmm_input = torch.empty(
465
+ query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
466
+ )
467
+ beta = 0
468
+ else:
469
+ baddbmm_input = attention_mask
470
+ beta = 1
471
+
472
+ attention_scores = torch.baddbmm(
473
+ baddbmm_input,
474
+ query,
475
+ key.transpose(-1, -2),
476
+ beta=beta,
477
+ alpha=self.scale,
478
+ )
479
+ del baddbmm_input
480
+
481
+ if self.upcast_softmax:
482
+ attention_scores = attention_scores.float()
483
+
484
+ attention_probs = attention_scores.softmax(dim=-1)
485
+ del attention_scores
486
+
487
+ attention_probs = attention_probs.to(dtype)
488
+
489
+ return attention_probs
490
+
491
+ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=3):
492
+ if batch_size is None:
493
+ deprecate(
494
+ "batch_size=None",
495
+ "0.22.0",
496
+ (
497
+ "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
498
+ " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
499
+ " `prepare_attention_mask` when preparing the attention_mask."
500
+ ),
501
+ )
502
+ batch_size = 1
503
+
504
+ head_size = self.heads
505
+ if attention_mask is None:
506
+ return attention_mask
507
+
508
+ current_length: int = attention_mask.shape[-1]
509
+ if current_length != target_length:
510
+ if attention_mask.device.type == "mps":
511
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
512
+ # Instead, we can manually construct the padding tensor.
513
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
514
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
515
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
516
+ else:
517
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
518
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
519
+ # remaining_length: int = target_length - current_length
520
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
521
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
522
+
523
+ if out_dim == 3:
524
+ if attention_mask.shape[0] < batch_size * head_size:
525
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
526
+ elif out_dim == 4:
527
+ attention_mask = attention_mask.unsqueeze(1)
528
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
529
+
530
+ return attention_mask
531
+
532
+ def norm_encoder_hidden_states(self, encoder_hidden_states):
533
+ assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
534
+
535
+ if isinstance(self.norm_cross, nn.LayerNorm):
536
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
537
+ elif isinstance(self.norm_cross, nn.GroupNorm):
538
+ # Group norm norms along the channels dimension and expects
539
+ # input to be in the shape of (N, C, *). In this case, we want
540
+ # to norm along the hidden dimension, so we need to move
541
+ # (batch_size, sequence_length, hidden_size) ->
542
+ # (batch_size, hidden_size, sequence_length)
543
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
544
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
545
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
546
+ else:
547
+ assert False
548
+
549
+ return encoder_hidden_states
550
+
551
+
552
+ class TemporalConditionalAttention(Attention):
553
+ def __init__(self, n_frames=8, rotary_emb=False, *args, **kwargs):
554
+ super().__init__(processor=RotaryEmbAttnProcessor2_0() if rotary_emb else None, *args, **kwargs)
555
+
556
+ if not rotary_emb:
557
+ self.pos_enc = PositionalEncoding(self.inner_dim)
558
+ else:
559
+ rotary_bias = RelativePositionBias(heads=kwargs['heads'], max_distance=32)
560
+ self.rotary_bias = rotary_bias
561
+ self.rotary_emb = RotaryEmbedding(self.inner_dim // 2)
562
+
563
+ self.use_rotary_emb = rotary_emb
564
+ self.n_frames = n_frames
565
+
566
+ def forward(
567
+ self,
568
+ hidden_states,
569
+ encoder_hidden_states=None,
570
+ attention_mask=None,
571
+ adjacent_slices=None,
572
+ **cross_attention_kwargs):
573
+
574
+ key_pos_idx = None
575
+
576
+ bt, hw, c = hidden_states.shape
577
+ hidden_states = rearrange(hidden_states, '(b t) hw c -> b hw t c', t=self.n_frames)
578
+ if not self.use_rotary_emb:
579
+ pos_embed = self.pos_enc(self.n_frames)
580
+ hidden_states = hidden_states + pos_embed
581
+ hidden_states = rearrange(hidden_states, 'b hw t c -> (b hw) t c')
582
+
583
+ if encoder_hidden_states is not None:
584
+ assert adjacent_slices is None
585
+ encoder_hidden_states = encoder_hidden_states[::self.n_frames]
586
+ encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b hw) n c', hw=hw)
587
+
588
+ if adjacent_slices is not None:
589
+ assert encoder_hidden_states is None
590
+ adjacent_slices = rearrange(adjacent_slices, 'b c h w n -> b (h w) n c')
591
+ if not self.use_rotary_emb:
592
+ first_frame_pos_embed = pos_embed[0:1, :]
593
+ adjacent_slices = adjacent_slices + first_frame_pos_embed
594
+ else:
595
+ pos_idx = torch.arange(self.n_frames, device=hidden_states.device, dtype=hidden_states.dtype)
596
+ first_frame_pos_pad = torch.zeros(adjacent_slices.shape[2], device=hidden_states.device, dtype=hidden_states.dtype)
597
+ key_pos_idx = torch.cat([pos_idx, first_frame_pos_pad], dim=0)
598
+ adjacent_slices = rearrange(adjacent_slices, 'b hw n c -> (b hw) n c')
599
+ encoder_hidden_states = torch.cat([hidden_states, adjacent_slices], dim=1)
600
+
601
+ if not self.use_rotary_emb:
602
+ out = self.processor(
603
+ self,
604
+ hidden_states,
605
+ encoder_hidden_states=encoder_hidden_states,
606
+ attention_mask=attention_mask,
607
+ **cross_attention_kwargs,
608
+ )
609
+ else:
610
+ out = self.processor(
611
+ self,
612
+ hidden_states,
613
+ encoder_hidden_states=encoder_hidden_states,
614
+ attention_mask=attention_mask,
615
+ key_pos_idx=key_pos_idx,
616
+ **cross_attention_kwargs,
617
+ )
618
+
619
+ out = rearrange(out, '(b hw) t c -> (b t) hw c', hw=hw)
620
+
621
+ return out
622
+
623
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers, attention_op=None):
624
+ if use_memory_efficient_attention_xformers:
625
+ try:
626
+ # Make sure we can run the memory efficient attention
627
+ _ = xformers.ops.memory_efficient_attention(
628
+ torch.randn((1, 2, 40), device="cuda"),
629
+ torch.randn((1, 2, 40), device="cuda"),
630
+ torch.randn((1, 2, 40), device="cuda"),
631
+ )
632
+ except Exception as e:
633
+ raise e
634
+ processor = XFormersAttnProcessor(attention_op=attention_op)
635
+ else:
636
+ processor = (
637
+ AttnProcessor2_0()
638
+ if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
639
+ else AttnProcessor()
640
+ )
641
+ self.set_processor(processor)
642
+
643
+
644
+ class PositionalEncoding(nn.Module):
645
+ def __init__(self, dim, max_pos=512):
646
+ super().__init__()
647
+
648
+ pos = torch.arange(max_pos)
649
+
650
+ freq = torch.arange(dim//2) / dim
651
+ freq = (freq * torch.tensor(10000).log()).exp()
652
+
653
+ x = rearrange(pos, 'L -> L 1') / freq
654
+ x = rearrange(x, 'L d -> L d 1')
655
+
656
+ pe = torch.cat((x.sin(), x.cos()), dim=-1)
657
+ self.pe = rearrange(pe, 'L d sc -> L (d sc)')
658
+
659
+ self.dummy = nn.Parameter(torch.rand(1))
660
+
661
+ def forward(self, length):
662
+ enc = self.pe[:length]
663
+ enc = enc.to(self.dummy.device, self.dummy.dtype)
664
+ return enc
665
+
666
+
667
+ # code taken from https://github.com/Vchitect/LaVie/blob/main/base/models/temporal_attention.py
668
+ class RelativePositionBias(nn.Module):
669
+ def __init__(
670
+ self,
671
+ heads=8,
672
+ num_buckets=32,
673
+ max_distance=128,
674
+ ):
675
+ super().__init__()
676
+ self.num_buckets = num_buckets
677
+ self.max_distance = max_distance
678
+ self.relative_attention_bias = nn.Embedding(num_buckets, heads)
679
+
680
+ @staticmethod
681
+ def _relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
682
+ ret = 0
683
+ n = -relative_position
684
+
685
+ num_buckets //= 2
686
+ ret += (n < 0).long() * num_buckets
687
+ n = torch.abs(n)
688
+
689
+ max_exact = num_buckets // 2
690
+ is_small = n < max_exact
691
+
692
+ val_if_large = max_exact + (
693
+ torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
694
+ ).long()
695
+ val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
696
+
697
+ ret += torch.where(is_small, n, val_if_large)
698
+ return ret
699
+
700
+ def forward(self, qlen, klen, device, dtype):
701
+ q_pos = torch.arange(qlen, dtype = torch.long, device = device)
702
+ k_pos = torch.arange(klen, dtype = torch.long, device = device)
703
+ rel_pos = rearrange(k_pos, 'j -> 1 j') - rearrange(q_pos, 'i -> i 1')
704
+ rp_bucket = self._relative_position_bucket(rel_pos, num_buckets = self.num_buckets, max_distance = self.max_distance)
705
+ values = self.relative_attention_bias(rp_bucket)
706
+ values = values.to(device, dtype)
707
+ return rearrange(values, 'i j h -> h i j') # num_heads, num_frames, num_frames
708
+
709
+
710
+ class RotaryEmbAttnProcessor2_0:
711
+ r"""
712
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
713
+ Add rotary embedding support
714
+ """
715
+
716
+ def __init__(self):
717
+
718
+ if not hasattr(F, "scaled_dot_product_attention"):
719
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
720
+
721
+ def __call__(
722
+ self,
723
+ attn: Attention,
724
+ hidden_states,
725
+ encoder_hidden_states=None,
726
+ attention_mask=None,
727
+ temb=None,
728
+ scale: float = 1.0,
729
+ key_pos_idx: Optional[torch.Tensor] = None,
730
+ ):
731
+ assert attention_mask is None
732
+ residual = hidden_states
733
+
734
+ if attn.spatial_norm is not None:
735
+ hidden_states = attn.spatial_norm(hidden_states, temb)
736
+
737
+ input_ndim = hidden_states.ndim
738
+
739
+ if input_ndim == 4:
740
+ batch_size, channel, height, width = hidden_states.shape
741
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
742
+
743
+ batch_size, sequence_length, _ = (
744
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
745
+ )
746
+
747
+ # if attention_mask is not None:
748
+ # attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
749
+ # # scaled_dot_product_attention expects attention_mask shape to be
750
+ # # (batch, heads, source_length, target_length)
751
+ # attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
752
+
753
+ if attn.group_norm is not None:
754
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
755
+
756
+ query = attn.to_q(hidden_states, scale=scale)
757
+
758
+ if encoder_hidden_states is None:
759
+ encoder_hidden_states = hidden_states
760
+ elif attn.norm_cross:
761
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
762
+
763
+ qlen = hidden_states.shape[1]
764
+ klen = encoder_hidden_states.shape[1]
765
+ # currently we only add a bias for self-attention; relative distance doesn't make sense for cross-attention.
766
+ # if qlen == klen:
767
+ # time_rel_pos_bias = attn.rotary_bias(qlen, klen, device=hidden_states.device, dtype=hidden_states.dtype)
768
+ # attention_mask = repeat(time_rel_pos_bias, "h d1 d2 -> b h d1 d2", b=batch_size)
769
+
770
+ key = attn.to_k(encoder_hidden_states, scale=scale)
771
+ value = attn.to_v(encoder_hidden_states, scale=scale)
772
+
773
+ query = attn.rotary_emb.rotate_queries_or_keys(query)
774
+ if qlen == klen:
775
+ key = attn.rotary_emb.rotate_queries_or_keys(key)
776
+ elif key_pos_idx is not None:
777
+ key = attn.rotary_emb.rotate_queries_or_keys(key, seq_pos=key_pos_idx)
778
+
779
+ inner_dim = key.shape[-1]
780
+ head_dim = inner_dim // attn.heads
781
+
782
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
783
+
784
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
785
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
786
+
787
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
788
+ # TODO: add support for attn.scale when we move to Torch 2.1
789
+ hidden_states = F.scaled_dot_product_attention(
790
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
791
+ )
792
+
793
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
794
+ hidden_states = hidden_states.to(query.dtype)
795
+
796
+ # linear proj
797
+ hidden_states = attn.to_out[0](hidden_states, scale=scale)
798
+ # dropout
799
+ hidden_states = attn.to_out[1](hidden_states)
800
+
801
+ if input_ndim == 4:
802
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
803
+
804
+ if attn.residual_connection:
805
+ hidden_states = hidden_states + residual
806
+
807
+ hidden_states = hidden_states / attn.rescale_output_factor
808
+
809
+ return hidden_states
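A hedged sketch of wiring `RotaryEmbAttnProcessor2_0` into a diffusers `Attention` layer. The `rotary_emb` attribute is an assumption here: the processor expects the attention module to carry an object with a `rotate_queries_or_keys` method (the commented-out code later in this upload points at `rotary_embedding_torch.RotaryEmbedding`), and the projection layers are assumed to be the LoRA-compatible linears used by the pinned diffusers version.

import torch
from diffusers.models.attention_processor import Attention
from rotary_embedding_torch import RotaryEmbedding  # assumed stand-in for attn.rotary_emb

attn = Attention(query_dim=320, heads=8, dim_head=40)
attn.rotary_emb = RotaryEmbedding(32)               # exposes rotate_queries_or_keys()
attn.set_processor(RotaryEmbAttnProcessor2_0())

hidden = torch.randn(2, 16, 320)                    # (batch * spatial, frames, channels) in the temporal layers
out = attn(hidden)                                  # self-attention path: queries and keys are both rotated
print(out.shape)                                    # torch.Size([2, 16, 320])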
src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_transformer_blocks.py ADDED
@@ -0,0 +1,564 @@
1
+ # Modified from https://github.com/huggingface/diffusers/blob/v0.21.0/src/diffusers/models/transformer_2d.py
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, Optional
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+ from einops import rearrange, repeat
10
+
11
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
13
+ from diffusers.utils import BaseOutput, deprecate
14
+ from diffusers.models.attention import AdaLayerNorm, AdaLayerNormZero, FeedForward, GatedSelfAttentionDense
15
+ from diffusers.models.embeddings import PatchEmbed
16
+ from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
17
+ from diffusers.models.modeling_utils import ModelMixin
18
+ from diffusers.models.transformer_2d import Transformer2DModelOutput
19
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
20
+ from diffusers.models.attention_processor import Attention
22
+
23
+ from .videoldm_attention import ConditionalAttention, TemporalConditionalAttention
24
+
25
+
26
+ class Transformer2DConditionModel(ModelMixin, ConfigMixin):
27
+ @register_to_config
28
+ def __init__(
29
+ self,
30
+ num_attention_heads: int = 16,
31
+ attention_head_dim: int = 88,
32
+ in_channels: Optional[int] = None,
33
+ out_channels: Optional[int] = None,
34
+ num_layers: int = 1,
35
+ dropout: float = 0.0,
36
+ norm_num_groups: int = 32,
37
+ cross_attention_dim: Optional[int] = None,
38
+ attention_bias: bool = False,
39
+ sample_size: Optional[int] = None,
40
+ num_vector_embeds: Optional[int] = None,
41
+ patch_size: Optional[int] = None,
42
+ activation_fn: str = "geglu",
43
+ num_embeds_ada_norm: Optional[int] = None,
44
+ use_linear_projection: bool = False,
45
+ only_cross_attention: bool = False,
46
+ double_self_attention: bool = False,
47
+ upcast_attention: bool = False,
48
+ norm_type: str = "layer_norm",
49
+ norm_elementwise_affine: bool = True,
50
+ attention_type: str = "default",
51
+ # additional
52
+ n_frames: int = 8,
53
+ is_temporal: bool = False,
54
+ augment_temporal_attention: bool = False,
55
+ rotary_emb=False,
56
+ ):
57
+ super().__init__()
58
+ self.use_linear_projection = use_linear_projection
59
+ self.num_attention_heads = num_attention_heads
60
+ self.attention_head_dim = attention_head_dim
61
+ inner_dim = num_attention_heads * attention_head_dim
62
+
63
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
64
+ # Define whether input is continuous or discrete depending on configuration
65
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
66
+ self.is_input_vectorized = num_vector_embeds is not None
67
+ self.is_input_patches = in_channels is not None and patch_size is not None
68
+
69
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
70
+ deprecation_message = (
71
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
72
+ " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
73
+ " Please make sure to update the config accordingly as leaving `norm_type` might lead to incorrect"
74
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
75
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
76
+ )
77
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
78
+ norm_type = "ada_norm"
79
+
80
+ if self.is_input_continuous and self.is_input_vectorized:
81
+ raise ValueError(
82
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
83
+ " sure that either `in_channels` or `num_vector_embeds` is None."
84
+ )
85
+ elif self.is_input_vectorized and self.is_input_patches:
86
+ raise ValueError(
87
+ f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
88
+ " sure that either `num_vector_embeds` or `num_patches` is None."
89
+ )
90
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
91
+ raise ValueError(
92
+ f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
93
+ f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
94
+ )
95
+
96
+ # 2. Define input layers
97
+ if self.is_input_continuous:
98
+ self.in_channels = in_channels
99
+
100
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
101
+ if use_linear_projection:
102
+ self.proj_in = LoRACompatibleLinear(in_channels, inner_dim)
103
+ else:
104
+ self.proj_in = LoRACompatibleConv(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
105
+ elif self.is_input_vectorized:
106
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
107
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
108
+
109
+ self.height = sample_size
110
+ self.width = sample_size
111
+ self.num_vector_embeds = num_vector_embeds
112
+ self.num_latent_pixels = self.height * self.width
113
+
114
+ self.latent_image_embedding = ImagePositionalEmbeddings(
115
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
116
+ )
117
+ elif self.is_input_patches:
118
+ assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
119
+
120
+ self.height = sample_size
121
+ self.width = sample_size
122
+
123
+ self.patch_size = patch_size
124
+ self.pos_embed = PatchEmbed(
125
+ height=sample_size,
126
+ width=sample_size,
127
+ patch_size=patch_size,
128
+ in_channels=in_channels,
129
+ embed_dim=inner_dim,
130
+ )
131
+
132
+ # 3. Define transformers blocks
133
+ self.transformer_blocks = nn.ModuleList(
134
+ [
135
+ BasicConditionalTransformerBlock(
136
+ inner_dim,
137
+ num_attention_heads,
138
+ attention_head_dim,
139
+ dropout=dropout,
140
+ cross_attention_dim=cross_attention_dim,
141
+ activation_fn=activation_fn,
142
+ num_embeds_ada_norm=num_embeds_ada_norm,
143
+ attention_bias=attention_bias,
144
+ only_cross_attention=only_cross_attention,
145
+ double_self_attention=double_self_attention,
146
+ upcast_attention=upcast_attention,
147
+ norm_type=norm_type,
148
+ norm_elementwise_affine=norm_elementwise_affine,
149
+ attention_type=attention_type,
150
+ # additional
151
+ n_frames=n_frames,
152
+ is_temporal=is_temporal,
153
+ augment_temporal_attention=augment_temporal_attention,
154
+ rotary_emb=rotary_emb,
155
+ )
156
+ for d in range(num_layers)
157
+ ]
158
+ )
159
+
160
+ # 4. Define output layers
161
+ self.out_channels = in_channels if out_channels is None else out_channels
162
+ if self.is_input_continuous:
163
+ # TODO: should use out_channels for continuous projections
164
+ if use_linear_projection:
165
+ self.proj_out = LoRACompatibleLinear(inner_dim, in_channels)
166
+ else:
167
+ self.proj_out = LoRACompatibleConv(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
168
+ elif self.is_input_vectorized:
169
+ self.norm_out = nn.LayerNorm(inner_dim)
170
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
171
+ elif self.is_input_patches:
172
+ self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
173
+ self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
174
+ self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
175
+
176
+ self.alpha = None
177
+ if is_temporal:
178
+ self.alpha = nn.Parameter(torch.ones(1))
179
+
180
+ self.gradient_checkpointing = False
181
+
182
+ def forward(
183
+ self,
184
+ hidden_states: torch.Tensor,
185
+ encoder_hidden_states: Optional[torch.Tensor] = None,
186
+ timestep: Optional[torch.LongTensor] = None,
187
+ class_labels: Optional[torch.LongTensor] = None,
188
+ cross_attention_kwargs: Dict[str, Any] = None,
189
+ attention_mask: Optional[torch.Tensor] = None,
190
+ encoder_attention_mask: Optional[torch.Tensor] = None,
191
+ return_dict: bool = True,
192
+ condition_on_first_frame: bool = False,
193
+ ):
194
+ input_states = hidden_states
195
+ input_height, input_width = hidden_states.shape[-2:]
196
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
197
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
198
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
199
+ # expects mask of shape:
200
+ # [batch, key_tokens]
201
+ # adds singleton query_tokens dimension:
202
+ # [batch, 1, key_tokens]
203
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
204
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
205
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
206
+ if attention_mask is not None and attention_mask.ndim == 2:
207
+ # assume that mask is expressed as:
208
+ # (1 = keep, 0 = discard)
209
+ # convert mask into a bias that can be added to attention scores:
210
+ # (keep = +0, discard = -10000.0)
211
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
212
+ attention_mask = attention_mask.unsqueeze(1)
213
+
214
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
215
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
216
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
217
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
218
+
219
+ # Retrieve lora scale.
220
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
221
+
222
+ # 1. Input
223
+ if self.is_input_continuous:
224
+ batch, _, height, width = hidden_states.shape
225
+ residual = hidden_states
226
+
227
+ hidden_states = self.norm(hidden_states)
228
+ if not self.use_linear_projection:
229
+ hidden_states = self.proj_in(hidden_states, lora_scale)
230
+ inner_dim = hidden_states.shape[1]
231
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
232
+ else:
233
+ inner_dim = hidden_states.shape[1]
234
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
235
+ hidden_states = self.proj_in(hidden_states, scale=lora_scale)
236
+
237
+ elif self.is_input_vectorized:
238
+ hidden_states = self.latent_image_embedding(hidden_states)
239
+ elif self.is_input_patches:
240
+ hidden_states = self.pos_embed(hidden_states)
241
+
242
+ # 2. Blocks
243
+ for block in self.transformer_blocks:
244
+ if self.training and self.gradient_checkpointing:
245
+ hidden_states = torch.utils.checkpoint.checkpoint(
246
+ block,
247
+ hidden_states,
248
+ attention_mask,
249
+ encoder_hidden_states,
250
+ encoder_attention_mask,
251
+ timestep,
252
+ cross_attention_kwargs,
253
+ class_labels,
254
+ use_reentrant=False,
255
+ )
256
+ else:
257
+ hidden_states = block(
258
+ hidden_states,
259
+ attention_mask=attention_mask,
260
+ encoder_hidden_states=encoder_hidden_states,
261
+ encoder_attention_mask=encoder_attention_mask,
262
+ timestep=timestep,
263
+ cross_attention_kwargs=cross_attention_kwargs,
264
+ class_labels=class_labels,
265
+ # additional
266
+ condition_on_first_frame=condition_on_first_frame,
267
+ input_height=input_height,
268
+ input_width=input_width,
269
+ )
270
+
271
+ # 3. Output
272
+ if self.is_input_continuous:
273
+ if not self.use_linear_projection:
274
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
275
+ hidden_states = self.proj_out(hidden_states, scale=lora_scale)
276
+ else:
277
+ hidden_states = self.proj_out(hidden_states, scale=lora_scale)
278
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
279
+
280
+ output = hidden_states + residual
281
+ elif self.is_input_vectorized:
282
+ hidden_states = self.norm_out(hidden_states)
283
+ logits = self.out(hidden_states)
284
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
285
+ logits = logits.permute(0, 2, 1)
286
+
287
+ # log(p(x_0))
288
+ output = F.log_softmax(logits.double(), dim=1).float()
289
+ elif self.is_input_patches:
290
+ # TODO: cleanup!
291
+ conditioning = self.transformer_blocks[0].norm1.emb(
292
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
293
+ )
294
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
295
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
296
+ hidden_states = self.proj_out_2(hidden_states)
297
+
298
+ # unpatchify
299
+ height = width = int(hidden_states.shape[1] ** 0.5)
300
+ hidden_states = hidden_states.reshape(
301
+ shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
302
+ )
303
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
304
+ output = hidden_states.reshape(
305
+ shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
306
+ )
307
+
308
+ if self.alpha is not None:
309
+ with torch.no_grad():
310
+ self.alpha.clamp_(0, 1)
311
+
312
+ output = self.alpha * input_states + (1 - self.alpha) * output
313
+
314
+ if not return_dict:
315
+ return (output,)
316
+
317
+ return Transformer2DModelOutput(sample=output)
318
+
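A rough instantiation sketch for the temporal variant of this transformer (parameter values are illustrative, not the checkpoint's configuration). The notable design choice is the learnable `alpha` gate, clamped to [0, 1] at every forward pass: with `alpha` initialised to 1 the block starts out as an identity mapping over the pretrained spatial features and only gradually lets the temporal branch contribute.

# Illustrative configuration; the real values come from the UNet blocks.
temporal_transformer = Transformer2DConditionModel(
    num_attention_heads=8,
    attention_head_dim=40,
    in_channels=320,
    cross_attention_dim=768,
    n_frames=8,
    is_temporal=True,
    rotary_emb=True,
)
# In this codebase the batch axis of the input carries batch * frames, i.e. a
# tensor of shape (b * n_frames, in_channels, height, width); the temporal
# output is alpha * input + (1 - alpha) * transformer(input).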
319
+
320
+ @maybe_allow_in_graph
321
+ class BasicConditionalTransformerBlock(nn.Module):
322
+ """Transformer block with first-frame conditioning."""
323
+ def __init__(
324
+ self,
325
+ dim: int,
326
+ num_attention_heads: int,
327
+ attention_head_dim: int,
328
+ dropout=0.0,
329
+ cross_attention_dim: Optional[int] = None,
330
+ activation_fn: str = "geglu",
331
+ num_embeds_ada_norm: Optional[int] = None,
332
+ attention_bias: bool = False,
333
+ only_cross_attention: bool = False,
334
+ double_self_attention: bool = False,
335
+ upcast_attention: bool = False,
336
+ norm_elementwise_affine: bool = True,
337
+ norm_type: str = "layer_norm",
338
+ final_dropout: bool = False,
339
+ attention_type: str = "default",
340
+ # additional
341
+ n_frames: int = 8,
342
+ is_temporal: bool = False,
343
+ augment_temporal_attention: bool = False,
344
+ rotary_emb=False,
345
+ ):
346
+ super().__init__()
347
+ self.n_frames = n_frames
348
+ self.only_cross_attention = only_cross_attention
349
+ self.augment_temporal_attention = augment_temporal_attention
350
+ self.is_temporal = is_temporal
351
+
352
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
353
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
354
+
355
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
356
+ raise ValueError(
357
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
358
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
359
+ )
360
+
361
+ # Define 3 blocks. Each block has its own normalization layer.
362
+ # 1. Self-Attn
363
+ if self.use_ada_layer_norm:
364
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
365
+ elif self.use_ada_layer_norm_zero:
366
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
367
+ else:
368
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
369
+
370
+ if not is_temporal:
371
+ self.attn1 = ConditionalAttention(
372
+ query_dim=dim,
373
+ heads=num_attention_heads,
374
+ dim_head=attention_head_dim,
375
+ dropout=dropout,
376
+ bias=attention_bias,
377
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
378
+ upcast_attention=upcast_attention,
379
+ )
380
+ else:
381
+ self.attn1 = TemporalConditionalAttention(
382
+ query_dim=dim,
383
+ heads=num_attention_heads,
384
+ dim_head=attention_head_dim,
385
+ dropout=dropout,
386
+ bias=attention_bias,
387
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
388
+ upcast_attention=upcast_attention,
389
+ # additional
390
+ n_frames=n_frames,
391
+ rotary_emb=rotary_emb,
392
+ )
393
+
394
+ # 2. Cross-Attn
395
+ if cross_attention_dim is not None or double_self_attention:
396
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
397
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
398
+ # the second cross attention block.
399
+ self.norm2 = (
400
+ AdaLayerNorm(dim, num_embeds_ada_norm)
401
+ if self.use_ada_layer_norm
402
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
403
+ )
404
+ if not is_temporal:
405
+ self.attn2 = ConditionalAttention(
406
+ query_dim=dim,
407
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
408
+ heads=num_attention_heads,
409
+ dim_head=attention_head_dim,
410
+ dropout=dropout,
411
+ bias=attention_bias,
412
+ upcast_attention=upcast_attention,
413
+ ) # is self-attn if encoder_hidden_states is none
414
+ else:
415
+ self.attn2 = TemporalConditionalAttention(
416
+ query_dim=dim,
417
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
418
+ heads=num_attention_heads,
419
+ dim_head=attention_head_dim,
420
+ dropout=dropout,
421
+ bias=attention_bias,
422
+ upcast_attention=upcast_attention,
423
+ # additional
424
+ n_frames=n_frames,
425
+ rotary_emb=rotary_emb,
426
+ )
427
+ else:
428
+ self.norm2 = None
429
+ self.attn2 = None
430
+
431
+ # 3. Feed-forward
432
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
433
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
434
+
435
+ # 4. Fuser
436
+ if attention_type == "gated" or attention_type == "gated-text-image":
437
+ self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
438
+
439
+ # let chunk size default to None
440
+ self._chunk_size = None
441
+ self._chunk_dim = 0
442
+
443
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
444
+ # Sets chunk feed-forward
445
+ self._chunk_size = chunk_size
446
+ self._chunk_dim = dim
447
+
448
+ def forward(
449
+ self,
450
+ hidden_states: torch.FloatTensor,
451
+ attention_mask: Optional[torch.FloatTensor] = None,
452
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
453
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
454
+ timestep: Optional[torch.LongTensor] = None,
455
+ cross_attention_kwargs: Dict[str, Any] = None,
456
+ class_labels: Optional[torch.LongTensor] = None,
457
+ condition_on_first_frame: bool = False,
458
+ input_height: Optional[int] = None,
459
+ input_width: Optional[int] = None,
460
+ ):
461
+ # Notice that normalization is always applied before the real computation in the following blocks.
462
+ # 0. Self-Attention
463
+ if self.use_ada_layer_norm:
464
+ norm_hidden_states = self.norm1(hidden_states, timestep)
465
+ elif self.use_ada_layer_norm_zero:
466
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
467
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
468
+ )
469
+ else:
470
+ norm_hidden_states = self.norm1(hidden_states)
471
+
472
+ # 1. Retrieve lora scale.
473
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
474
+
475
+ # 2. Prepare GLIGEN inputs
476
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
477
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
478
+
479
+ if condition_on_first_frame:
480
+ first_frame_hidden_states = rearrange(norm_hidden_states, '(b f) d h -> b f d h', f=self.n_frames)[:, 0, :, :]
481
+ first_frame_hidden_states = repeat(first_frame_hidden_states, 'b d h -> b f d h', f=self.n_frames)
482
+ first_frame_hidden_states = rearrange(first_frame_hidden_states, 'b f d h -> (b f) d h')
483
+ first_frame_concat_hidden_states = torch.cat((norm_hidden_states, first_frame_hidden_states), dim=1)
484
+ attn_output = self.attn1(
485
+ norm_hidden_states,
486
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else first_frame_concat_hidden_states,
487
+ attention_mask=attention_mask,
488
+ **cross_attention_kwargs,
489
+ )
490
+ elif self.is_temporal and self.augment_temporal_attention:
491
+ first_frame_hidden_states = rearrange(norm_hidden_states, '(b f) d h -> b f d h', f=self.n_frames)[:, 0, :, :]
492
+ first_frame_hidden_states = rearrange(first_frame_hidden_states, 'b (h w) c -> b h w c', h=input_height, w=input_width)
493
+ first_frame_hidden_states = first_frame_hidden_states.permute(0, 3, 1, 2)
494
+ padded_first_frame = torch.nn.functional.pad(first_frame_hidden_states, (1, 1, 1, 1), "replicate")
495
+ first_frame_windows = padded_first_frame.unfold(2, 3, 1).unfold(3, 3, 1)
496
+ mask = torch.tensor([[1, 1, 1], [1, 0, 1], [1, 1, 1]], dtype=torch.bool)
497
+ adjacent_slices = first_frame_windows[:, :, :, :, mask]
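+ # adjacent_slices: for every spatial location, the 8 neighbouring first-frame
+ # features (its 3x3 window with the centre masked out); they are passed to
+ # attn1 below to augment temporal attention with local first-frame context.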
498
+ attn_output = self.attn1(
499
+ norm_hidden_states,
500
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
501
+ attention_mask=attention_mask,
502
+ adjacent_slices=adjacent_slices,
503
+ **cross_attention_kwargs,
504
+ )
505
+ else:
506
+ attn_output = self.attn1(
507
+ norm_hidden_states,
508
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
509
+ attention_mask=attention_mask,
510
+ **cross_attention_kwargs,
511
+ )
512
+ if self.use_ada_layer_norm_zero:
513
+ attn_output = gate_msa.unsqueeze(1) * attn_output
514
+ hidden_states = attn_output + hidden_states
515
+
516
+ # 2.5 GLIGEN Control
517
+ if gligen_kwargs is not None:
518
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
519
+ # 2.5 ends
520
+
521
+ # 3. Cross-Attention
522
+ if self.attn2 is not None:
523
+ norm_hidden_states = (
524
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
525
+ )
526
+
527
+ attn_output = self.attn2(
528
+ norm_hidden_states,
529
+ encoder_hidden_states=encoder_hidden_states,
530
+ attention_mask=encoder_attention_mask,
531
+ **cross_attention_kwargs,
532
+ )
533
+ hidden_states = attn_output + hidden_states
534
+
535
+ # 4. Feed-forward
536
+ norm_hidden_states = self.norm3(hidden_states)
537
+
538
+ if self.use_ada_layer_norm_zero:
539
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
540
+
541
+ if self._chunk_size is not None:
542
+ # "feed_forward_chunk_size" can be used to save memory
543
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
544
+ raise ValueError(
545
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
546
+ )
547
+
548
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
549
+ ff_output = torch.cat(
550
+ [
551
+ self.ff(hid_slice, scale=lora_scale)
552
+ for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)
553
+ ],
554
+ dim=self._chunk_dim,
555
+ )
556
+ else:
557
+ ff_output = self.ff(norm_hidden_states, scale=lora_scale)
558
+
559
+ if self.use_ada_layer_norm_zero:
560
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
561
+
562
+ hidden_states = ff_output + hidden_states
563
+
564
+ return hidden_states
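To make the first-frame conditioning branch above concrete, here is a standalone sketch of its rearrange/repeat/concat pattern (shapes are illustrative):

import torch
from einops import rearrange, repeat

b, f, d, h = 2, 8, 64, 320                  # batch, frames, spatial tokens, hidden size
norm_hidden_states = torch.randn(b * f, d, h)

first = rearrange(norm_hidden_states, '(b f) d h -> b f d h', f=f)[:, 0]  # tokens of frame 0
first = repeat(first, 'b d h -> b f d h', f=f)                            # broadcast to every frame
first = rearrange(first, 'b f d h -> (b f) d h')

kv = torch.cat((norm_hidden_states, first), dim=1)   # each frame attends over itself + frame 0
print(kv.shape)                                      # torch.Size([16, 128, 320])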
src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_unet.py ADDED
@@ -0,0 +1,1371 @@
1
+ import os
2
+ import re
3
+ from typing import Optional, Tuple, Union, Dict, List, Any
4
+ from einops import rearrange, repeat
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from diffusers.loaders import UNet2DConditionLoadersMixin
9
+ from diffusers.models import ModelMixin
10
+ from diffusers.models.unet_2d_condition import UNet2DConditionOutput
11
+ from diffusers.models.unet_2d_blocks import UNetMidBlock2DCrossAttn, UNetMidBlock2DSimpleCrossAttn
12
+ from diffusers.models.embeddings import (
13
+ GaussianFourierProjection,
14
+ ImageHintTimeEmbedding,
15
+ ImageProjection,
16
+ ImageTimeEmbedding,
17
+ PositionNet,
18
+ TextImageProjection,
19
+ TextImageTimeEmbedding,
20
+ TextTimeEmbedding,
21
+ TimestepEmbedding,
22
+ Timesteps,
23
+ )
24
+ from diffusers.models.attention_processor import (
25
+ ADDED_KV_ATTENTION_PROCESSORS,
26
+ CROSS_ATTENTION_PROCESSORS,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from diffusers.models.activations import get_activation
32
+ from diffusers.configuration_utils import register_to_config, ConfigMixin
33
+ from diffusers.models.modeling_utils import load_state_dict, load_model_dict_into_meta
34
+ from diffusers.utils import (
35
+ CONFIG_NAME,
36
+ DIFFUSERS_CACHE,
37
+ FLAX_WEIGHTS_NAME,
38
+ HF_HUB_OFFLINE,
39
+ SAFETENSORS_WEIGHTS_NAME,
40
+ WEIGHTS_NAME,
41
+ _add_variant,
42
+ _get_model_file,
43
+ deprecate,
44
+ is_accelerate_available,
45
+ is_torch_version,
46
+ logging,
47
+ )
48
+ from diffusers import __version__
49
+
50
+ if is_torch_version(">=", "1.9.0"):
51
+ _LOW_CPU_MEM_USAGE_DEFAULT = True
52
+ else:
53
+ _LOW_CPU_MEM_USAGE_DEFAULT = False
54
+
55
+
56
+ if is_accelerate_available():
57
+ import accelerate
58
+ from accelerate.utils import set_module_tensor_to_device
59
+ from accelerate.utils.versions import is_torch_version
60
+
61
+
62
+
63
+ from .videoldm_unet_blocks import get_down_block, get_up_block, VideoLDMUNetMidBlock2DCrossAttn
64
+
65
+ logger = logging.get_logger(__name__)
66
+
67
+
68
+ class VideoLDMUNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
69
+ _supports_gradient_checkpointing = True
70
+ @register_to_config
71
+ def __init__(
72
+ self,
73
+ sample_size: Optional[int] = None,
74
+ in_channels: int = 4,
75
+ out_channels: int = 4,
76
+ center_input_sample: bool = False,
77
+ flip_sin_to_cos: bool = True,
78
+ freq_shift: int = 0,
79
+ down_block_types: Tuple[str] = (
80
+ "CrossAttnDownBlock2D", # -> VideoLDMDownBlock
81
+ "CrossAttnDownBlock2D", # -> VideoLDMDownBlock
82
+ "CrossAttnDownBlock2D", # -> VideoLDMDownBlock
83
+ "DownBlock2D",
84
+ ),
85
+ mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
86
+ up_block_types: Tuple[str] = (
87
+ "UpBlock2D",
88
+ "CrossAttnUpBlock2D", # -> VideoLDMUpBlock
89
+ "CrossAttnUpBlock2D", # -> VideoLDMUpBlock
90
+ "CrossAttnUpBlock2D", # -> VideoLDMUpBlock
91
+ ),
92
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
93
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
94
+ layers_per_block: Union[int, Tuple[int]] = 2,
95
+ downsample_padding: int = 1,
96
+ mid_block_scale_factor: float = 1,
97
+ dropout: float = 0.0,
98
+ act_fn: str = "silu",
99
+ norm_num_groups: Optional[int] = 32,
100
+ norm_eps: float = 1e-5,
101
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
102
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
103
+ encoder_hid_dim: Optional[int] = None,
104
+ encoder_hid_dim_type: Optional[str] = None,
105
+ attention_head_dim: Union[int, Tuple[int]] = 8,
106
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
107
+ dual_cross_attention: bool = False,
108
+ use_linear_projection: bool = False,
109
+ class_embed_type: Optional[str] = None,
110
+ addition_embed_type: Optional[str] = None,
111
+ addition_time_embed_dim: Optional[int] = None,
112
+ num_class_embeds: Optional[int] = None,
113
+ upcast_attention: bool = False,
114
+ resnet_time_scale_shift: str = "default",
115
+ resnet_skip_time_act: bool = False,
116
+ resnet_out_scale_factor: int = 1.0,
117
+ time_embedding_type: str = "positional",
118
+ time_embedding_dim: Optional[int] = None,
119
+ time_embedding_act_fn: Optional[str] = None,
120
+ timestep_post_act: Optional[str] = None,
121
+ time_cond_proj_dim: Optional[int] = None,
122
+ conv_in_kernel: int = 3,
123
+ conv_out_kernel: int = 3,
124
+ projection_class_embeddings_input_dim: Optional[int] = None,
125
+ attention_type: str = "default",
126
+ class_embeddings_concat: bool = False,
127
+ mid_block_only_cross_attention: Optional[bool] = None,
128
+ cross_attention_norm: Optional[str] = None,
129
+ addition_embed_type_num_heads=64,
130
+ # additional
131
+ use_temporal: bool = True,
132
+ n_frames: int = 8,
133
+ n_temp_heads: int = 8,
134
+ first_frame_condition_mode: str = "none",
135
+ augment_temporal_attention: bool = False,
136
+ temp_pos_embedding: str = "sinusoidal",
137
+ use_frame_stride_condition: bool = False,
138
+ ):
139
+ super().__init__()
140
+
141
+ rotary_emb = False
142
+ if temp_pos_embedding == "rotary":
143
+ # from rotary_embedding_torch import RotaryEmbedding
144
+ # rotary_emb = RotaryEmbedding(32)
145
+ # self.rotary_emb = rotary_emb
146
+ rotary_emb = True
147
+ self.rotary_emb = rotary_emb
148
+
149
+ self.use_temporal = use_temporal
150
+ self.augment_temporal_attention = augment_temporal_attention
151
+
152
+ assert first_frame_condition_mode in ["none", "concat", "conv2d", "input_only"], f"first_frame_condition_mode: {first_frame_condition_mode} must be one of ['none', 'concat', 'conv2d', 'input_only']"
153
+ self.first_frame_condition_mode = first_frame_condition_mode
154
+ latent_channels = in_channels
155
+
156
+ self.sample_size = sample_size
157
+
158
+ if num_attention_heads is not None:
159
+ raise ValueError(
160
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
161
+ )
162
+
163
+ num_attention_heads = num_attention_heads or attention_head_dim
164
+
165
+ # Check inputs
166
+ if len(down_block_types) != len(up_block_types):
167
+ raise ValueError(
168
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
169
+ )
170
+
171
+ if len(block_out_channels) != len(down_block_types):
172
+ raise ValueError(
173
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
174
+ )
175
+
176
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
177
+ raise ValueError(
178
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
179
+ )
180
+
181
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
182
+ raise ValueError(
183
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
184
+ )
185
+
186
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
187
+ raise ValueError(
188
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
189
+ )
190
+
191
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
192
+ raise ValueError(
193
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
194
+ )
195
+
196
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
197
+ raise ValueError(
198
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
199
+ )
200
+
201
+ # input
202
+ conv_in_padding = (conv_in_kernel - 1) // 2
203
+ self.conv_in = nn.Conv2d(
204
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
205
+ )
206
+
207
+ # time
208
+ if time_embedding_type == "fourier":
209
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
210
+ if time_embed_dim % 2 != 0:
211
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
212
+ self.time_proj = GaussianFourierProjection(
213
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
214
+ )
215
+ timestep_input_dim = time_embed_dim
216
+ elif time_embedding_type == "positional":
217
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
218
+
219
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
220
+ timestep_input_dim = block_out_channels[0]
221
+ else:
222
+ raise ValueError(
223
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
224
+ )
225
+
226
+ self.time_embedding = TimestepEmbedding(
227
+ timestep_input_dim,
228
+ time_embed_dim,
229
+ act_fn=act_fn,
230
+ post_act_fn=timestep_post_act,
231
+ cond_proj_dim=time_cond_proj_dim,
232
+ )
233
+
234
+ self.use_frame_stride_condition = use_frame_stride_condition
235
+ if self.use_frame_stride_condition:
236
+ self.frame_stride_embedding = TimestepEmbedding(
237
+ timestep_input_dim,
238
+ time_embed_dim,
239
+ act_fn=act_fn,
240
+ post_act_fn=timestep_post_act,
241
+ cond_proj_dim=time_cond_proj_dim,
242
+ )
243
+ # zero init
244
+ nn.init.zeros_(self.frame_stride_embedding.linear_2.weight)
245
+ nn.init.zeros_(self.frame_stride_embedding.linear_2.bias)
246
+
247
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
248
+ encoder_hid_dim_type = "text_proj"
249
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
250
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
251
+
252
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
253
+ raise ValueError(
254
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
255
+ )
256
+
257
+ if encoder_hid_dim_type == "text_proj":
258
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
259
+ elif encoder_hid_dim_type == "text_image_proj":
260
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
261
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
262
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)
263
+ self.encoder_hid_proj = TextImageProjection(
264
+ text_embed_dim=encoder_hid_dim,
265
+ image_embed_dim=cross_attention_dim,
266
+ cross_attention_dim=cross_attention_dim,
267
+ )
268
+ elif encoder_hid_dim_type == "image_proj":
269
+ # Kandinsky 2.2
270
+ self.encoder_hid_proj = ImageProjection(
271
+ image_embed_dim=encoder_hid_dim,
272
+ cross_attention_dim=cross_attention_dim,
273
+ )
274
+ elif encoder_hid_dim_type is not None:
275
+ raise ValueError(
276
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj', 'text_image_proj' or 'image_proj'."
277
+ )
278
+ else:
279
+ self.encoder_hid_proj = None
280
+
281
+ # class embedding
282
+ if class_embed_type is None and num_class_embeds is not None:
283
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
284
+ elif class_embed_type == "timestep":
285
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
286
+ elif class_embed_type == "identity":
287
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
288
+ elif class_embed_type == "projection":
289
+ if projection_class_embeddings_input_dim is None:
290
+ raise ValueError(
291
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
292
+ )
293
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
294
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
295
+ # 2. it projects from an arbitrary input dimension.
296
+ #
297
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
298
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
299
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
300
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
301
+ elif class_embed_type == "simple_projection":
302
+ if projection_class_embeddings_input_dim is None:
303
+ raise ValueError(
304
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
305
+ )
306
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
307
+ else:
308
+ self.class_embedding = None
309
+
310
+ if addition_embed_type == "text":
311
+ if encoder_hid_dim is not None:
312
+ text_time_embedding_from_dim = encoder_hid_dim
313
+ else:
314
+ text_time_embedding_from_dim = cross_attention_dim
315
+
316
+ self.add_embedding = TextTimeEmbedding(
317
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
318
+ )
319
+ elif addition_embed_type == "text_image":
320
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
321
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
322
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
323
+ self.add_embedding = TextImageTimeEmbedding(
324
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
325
+ )
326
+ elif addition_embed_type == "text_time":
327
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
328
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
329
+ elif addition_embed_type == "image":
330
+ # Kandinsky 2.2
331
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
332
+ elif addition_embed_type == "image_hint":
333
+ # Kandinsky 2.2 ControlNet
334
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
335
+ elif addition_embed_type is not None:
336
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text', 'text_image', 'text_time', 'image' or 'image_hint'.")
337
+
338
+ if time_embedding_act_fn is None:
339
+ self.time_embed_act = None
340
+ else:
341
+ self.time_embed_act = get_activation(time_embedding_act_fn)
342
+
343
+ self.down_blocks = nn.ModuleList([])
344
+ self.up_blocks = nn.ModuleList([])
345
+
346
+ if isinstance(only_cross_attention, bool):
347
+ if mid_block_only_cross_attention is None:
348
+ mid_block_only_cross_attention = only_cross_attention
349
+
350
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
351
+
352
+ if mid_block_only_cross_attention is None:
353
+ mid_block_only_cross_attention = False
354
+
355
+ if isinstance(num_attention_heads, int):
356
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
357
+
358
+ if isinstance(attention_head_dim, int):
359
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
360
+
361
+ if isinstance(cross_attention_dim, int):
362
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
363
+
364
+ if isinstance(layers_per_block, int):
365
+ layers_per_block = [layers_per_block] * len(down_block_types)
366
+
367
+ if isinstance(transformer_layers_per_block, int):
368
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
369
+
370
+ if class_embeddings_concat:
371
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
372
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
373
+ # regular time embeddings
374
+ blocks_time_embed_dim = time_embed_dim * 2
375
+ else:
376
+ blocks_time_embed_dim = time_embed_dim
377
+ # down
378
+ output_channel = block_out_channels[0]
379
+ for i, down_block_type in enumerate(down_block_types):
380
+ input_channel = output_channel
381
+ output_channel = block_out_channels[i]
382
+ is_final_block = i == len(block_out_channels) - 1
383
+
384
+ down_block = get_down_block(
385
+ down_block_type,
386
+ num_layers=layers_per_block[i],
387
+ transformer_layers_per_block=transformer_layers_per_block[i],
388
+ in_channels=input_channel,
389
+ out_channels=output_channel,
390
+ temb_channels=blocks_time_embed_dim,
391
+ add_downsample=not is_final_block,
392
+ resnet_eps=norm_eps,
393
+ resnet_act_fn=act_fn,
394
+ resnet_groups=norm_num_groups,
395
+ cross_attention_dim=cross_attention_dim[i],
396
+ num_attention_heads=num_attention_heads[i],
397
+ downsample_padding=downsample_padding,
398
+ dual_cross_attention=dual_cross_attention,
399
+ use_linear_projection=use_linear_projection,
400
+ only_cross_attention=only_cross_attention[i],
401
+ upcast_attention=upcast_attention,
402
+ resnet_time_scale_shift=resnet_time_scale_shift,
403
+ attention_type=attention_type,
404
+ resnet_skip_time_act=resnet_skip_time_act,
405
+ resnet_out_scale_factor=resnet_out_scale_factor,
406
+ cross_attention_norm=cross_attention_norm,
407
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
408
+ dropout=dropout,
409
+ # additional
410
+ use_temporal=use_temporal,
411
+ augment_temporal_attention=augment_temporal_attention,
412
+ n_frames=n_frames,
413
+ n_temp_heads=n_temp_heads,
414
+ first_frame_condition_mode=first_frame_condition_mode,
415
+ latent_channels=latent_channels,
416
+ rotary_emb=rotary_emb,
417
+ )
418
+ self.down_blocks.append(down_block)
419
+
420
+ # mid
421
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
422
+ self.mid_block = VideoLDMUNetMidBlock2DCrossAttn(
423
+ transformer_layers_per_block=transformer_layers_per_block[-1],
424
+ in_channels=block_out_channels[-1],
425
+ temb_channels=blocks_time_embed_dim,
426
+ dropout=dropout,
427
+ resnet_eps=norm_eps,
428
+ resnet_act_fn=act_fn,
429
+ output_scale_factor=mid_block_scale_factor,
430
+ resnet_time_scale_shift=resnet_time_scale_shift,
431
+ cross_attention_dim=cross_attention_dim[-1],
432
+ num_attention_heads=num_attention_heads[-1],
433
+ resnet_groups=norm_num_groups,
434
+ dual_cross_attention=dual_cross_attention,
435
+ use_linear_projection=use_linear_projection,
436
+ upcast_attention=upcast_attention,
437
+ attention_type=attention_type,
438
+ # additional
439
+ use_temporal=use_temporal,
440
+ n_frames=n_frames,
441
+ first_frame_condition_mode=first_frame_condition_mode,
442
+ latent_channels=latent_channels,
443
+ )
444
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
445
+ self.mid_block = UNetMidBlock2DSimpleCrossAttn(
446
+ in_channels=block_out_channels[-1],
447
+ temb_channels=blocks_time_embed_dim,
448
+ dropout=dropout,
449
+ resnet_eps=norm_eps,
450
+ resnet_act_fn=act_fn,
451
+ output_scale_factor=mid_block_scale_factor,
452
+ cross_attention_dim=cross_attention_dim[-1],
453
+ attention_head_dim=attention_head_dim[-1],
454
+ resnet_groups=norm_num_groups,
455
+ resnet_time_scale_shift=resnet_time_scale_shift,
456
+ skip_time_act=resnet_skip_time_act,
457
+ only_cross_attention=mid_block_only_cross_attention,
458
+ cross_attention_norm=cross_attention_norm,
459
+ )
460
+ elif mid_block_type is None:
461
+ self.mid_block = None
462
+ else:
463
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
464
+
465
+ # count how many layers upsample the images
466
+ self.num_upsamplers = 0
467
+
468
+ # up
469
+ reversed_block_out_channels = list(reversed(block_out_channels))
470
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
471
+ reversed_layers_per_block = list(reversed(layers_per_block))
472
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
473
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
474
+ only_cross_attention = list(reversed(only_cross_attention))
475
+
476
+ output_channel = reversed_block_out_channels[0]
477
+ for i, up_block_type in enumerate(up_block_types):
478
+ is_final_block = i == len(block_out_channels) - 1
479
+
480
+ prev_output_channel = output_channel
481
+ output_channel = reversed_block_out_channels[i]
482
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
483
+
484
+ # add upsample block for all BUT final layer
485
+ if not is_final_block:
486
+ add_upsample = True
487
+ self.num_upsamplers += 1
488
+ else:
489
+ add_upsample = False
490
+
491
+ up_block = get_up_block(
492
+ up_block_type,
493
+ num_layers=reversed_layers_per_block[i] + 1,
494
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
495
+ in_channels=input_channel,
496
+ out_channels=output_channel,
497
+ prev_output_channel=prev_output_channel,
498
+ temb_channels=blocks_time_embed_dim,
499
+ add_upsample=add_upsample,
500
+ resnet_eps=norm_eps,
501
+ resnet_act_fn=act_fn,
502
+ resnet_groups=norm_num_groups,
503
+ cross_attention_dim=reversed_cross_attention_dim[i],
504
+ num_attention_heads=reversed_num_attention_heads[i],
505
+ dual_cross_attention=dual_cross_attention,
506
+ use_linear_projection=use_linear_projection,
507
+ only_cross_attention=only_cross_attention[i],
508
+ upcast_attention=upcast_attention,
509
+ resnet_time_scale_shift=resnet_time_scale_shift,
510
+ attention_type=attention_type,
511
+ resnet_skip_time_act=resnet_skip_time_act,
512
+ resnet_out_scale_factor=resnet_out_scale_factor,
513
+ cross_attention_norm=cross_attention_norm,
514
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
515
+ dropout=dropout,
516
+ # additional
517
+ use_temporal=use_temporal,
518
+ augment_temporal_attention=augment_temporal_attention,
519
+ n_frames=n_frames,
520
+ n_temp_heads=n_temp_heads,
521
+ first_frame_condition_mode=first_frame_condition_mode,
522
+ latent_channels=latent_channels,
523
+ rotary_emb=rotary_emb,
524
+ )
525
+ self.up_blocks.append(up_block)
526
+ prev_output_channel = output_channel
527
+
528
+ # out
529
+ if norm_num_groups is not None:
530
+ self.conv_norm_out = nn.GroupNorm(
531
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
532
+ )
533
+
534
+ self.conv_act = get_activation(act_fn)
535
+
536
+ else:
537
+ self.conv_norm_out = None
538
+ self.conv_act = None
539
+
540
+ conv_out_padding = (conv_out_kernel - 1) // 2
541
+ self.conv_out = nn.Conv2d(
542
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
543
+ )
544
+
545
+ @property
546
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
547
+ r"""
548
+ Returns:
549
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
550
+ indexed by its weight name.
551
+ """
552
+ # set recursively
553
+ processors = {}
554
+
555
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
556
+ if hasattr(module, "get_processor"):
557
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
558
+
559
+ for sub_name, child in module.named_children():
560
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
561
+
562
+ return processors
563
+
564
+ for name, module in self.named_children():
565
+ fn_recursive_add_processors(name, module, processors)
566
+
567
+ return processors
568
+
569
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
570
+ r"""
571
+ Sets the attention processor to use to compute attention.
572
+
573
+ Parameters:
574
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
575
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
576
+ for **all** `Attention` layers.
577
+
578
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
579
+ processor. This is strongly recommended when setting trainable attention processors.
580
+
581
+ """
582
+ count = len(self.attn_processors.keys())
583
+
584
+ if isinstance(processor, dict) and len(processor) != count:
585
+ raise ValueError(
586
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
587
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
588
+ )
589
+
590
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
591
+ if hasattr(module, "set_processor"):
592
+ if not isinstance(processor, dict):
593
+ module.set_processor(processor)
594
+ else:
595
+ module.set_processor(processor.pop(f"{name}.processor"))
596
+
597
+ for sub_name, child in module.named_children():
598
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
599
+
600
+ for name, module in self.named_children():
601
+ fn_recursive_attn_processor(name, module, processor)
602
+
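+ # NOTE: minimal usage sketch (hypothetical `unet` instance; mirrors the docstrings above):
+ #   procs = unet.attn_processors               # dict keyed by weight name, e.g. "...attn1.processor"
+ #   unet.set_attn_processor(AttnProcessor())   # a single processor applied to every attention layer
+ #   unet.set_attn_processor(procs)             # or a dict with exactly one entry per attention layer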
603
+ def set_default_attn_processor(self):
604
+ """
605
+ Disables custom attention processors and sets the default attention implementation.
606
+ """
607
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
608
+ processor = AttnAddedKVProcessor()
609
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
610
+ processor = AttnProcessor()
611
+ else:
612
+ raise ValueError(
613
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
614
+ )
615
+
616
+ self.set_attn_processor(processor)
617
+
618
+ def set_attention_slice(self, slice_size):
619
+ r"""
620
+ Enable sliced attention computation.
621
+
622
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
623
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
624
+
625
+ Args:
626
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
627
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
628
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
629
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
630
+ must be a multiple of `slice_size`.
631
+ """
632
+ sliceable_head_dims = []
633
+
634
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
635
+ if hasattr(module, "set_attention_slice"):
636
+ sliceable_head_dims.append(module.sliceable_head_dim)
637
+
638
+ for child in module.children():
639
+ fn_recursive_retrieve_sliceable_dims(child)
640
+
641
+ # retrieve number of attention layers
642
+ for module in self.children():
643
+ fn_recursive_retrieve_sliceable_dims(module)
644
+
645
+ num_sliceable_layers = len(sliceable_head_dims)
646
+
647
+ if slice_size == "auto":
648
+ # half the attention head size is usually a good trade-off between
649
+ # speed and memory
650
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
651
+ elif slice_size == "max":
652
+ # make smallest slice possible
653
+ slice_size = num_sliceable_layers * [1]
654
+
655
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
656
+
657
+ if len(slice_size) != len(sliceable_head_dims):
658
+ raise ValueError(
659
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
660
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
661
+ )
662
+
663
+ for i in range(len(slice_size)):
664
+ size = slice_size[i]
665
+ dim = sliceable_head_dims[i]
666
+ if size is not None and size > dim:
667
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
668
+
669
+ # Recursively walk through all the children.
670
+ # Any children which exposes the set_attention_slice method
671
+ # gets the message
672
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
673
+ if hasattr(module, "set_attention_slice"):
674
+ module.set_attention_slice(slice_size.pop())
675
+
676
+ for child in module.children():
677
+ fn_recursive_set_attention_slice(child, slice_size)
678
+
679
+ reversed_slice_size = list(reversed(slice_size))
680
+ for module in self.children():
681
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
682
+
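+ # NOTE: minimal usage sketch (hypothetical `unet` instance; semantics per the docstring above):
+ #   unet.set_attention_slice("auto")   # halve each sliceable head dim -> attention in two steps
+ #   unet.set_attention_slice("max")    # one slice at a time, lowest memory
+ #   unet.set_attention_slice(2)        # explicit slice size; must not exceed any sliceable head dim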
683
+ def _set_gradient_checkpointing(self, module, value=False):
684
+ if hasattr(module, "gradient_checkpointing"):
685
+ module.gradient_checkpointing = value
686
+
687
+ def forward(
688
+ self,
689
+ sample: torch.FloatTensor,
690
+ timestep: Union[torch.Tensor, float, int],
691
+ encoder_hidden_states: torch.Tensor,
692
+ class_labels: Optional[torch.Tensor] = None,
693
+ timestep_cond: Optional[torch.Tensor] = None,
694
+ attention_mask: Optional[torch.Tensor] = None,
695
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
696
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
697
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
698
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
699
+ encoder_attention_mask: Optional[torch.Tensor] = None,
700
+ return_dict: bool = True,
701
+ # additional
702
+ first_frame_latents: Optional[torch.Tensor] = None,
703
+ frame_stride: Optional[Union[torch.Tensor, float, int]] = None,
704
+ ) -> Union[UNet2DConditionOutput, Tuple]:
705
+ # reshape video data
706
+ assert sample.dim() == 5, f"Expected sample to have ndim=5, but got ndim={sample.dim()}."
707
+ video_length = sample.shape[2]
708
+
709
+ if first_frame_latents is not None:
710
+ assert self.config.first_frame_condition_mode != "none", "first_frame_latents is not None, but first_frame_condition_mode is 'none'."
711
+
712
+ if self.config.first_frame_condition_mode != "none":
713
+ sample = torch.cat([first_frame_latents, sample], dim=2)
714
+ video_length += 1
715
+
716
+ # copy conditioning embeddings for cross attention
717
+ if encoder_hidden_states is not None:
718
+ encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length)
719
+
720
+ sample = rearrange(sample, "b c f h w -> (b f) c h w")
721
+
722
+ # By default samples have to be at least a multiple of the overall upsampling factor.
723
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
724
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
725
+ # on the fly if necessary.
726
+ default_overall_up_factor = 2**self.num_upsamplers
727
+
728
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
729
+ forward_upsample_size = False
730
+ upsample_size = None
731
+
732
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
733
+ logger.info("Forward upsample size to force interpolation output size.")
734
+ forward_upsample_size = True
735
+
736
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
737
+ # expects mask of shape:
738
+ # [batch, key_tokens]
739
+ # adds singleton query_tokens dimension:
740
+ # [batch, 1, key_tokens]
741
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
742
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
743
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
744
+ if attention_mask is not None:
745
+ # assume that mask is expressed as:
746
+ # (1 = keep, 0 = discard)
747
+ # convert mask into a bias that can be added to attention scores:
748
+ # (keep = +0, discard = -10000.0)
749
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
750
+ attention_mask = attention_mask.unsqueeze(1)
751
+
752
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
753
+ if encoder_attention_mask is not None:
754
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
755
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
756
+
757
+ # 0. center input if necessary
758
+ if self.config.center_input_sample:
759
+ sample = 2 * sample - 1.0
760
+
761
+ # 1. time
762
+ timesteps = timestep
763
+ if not torch.is_tensor(timesteps):
764
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
765
+ # This would be a good case for the `match` statement (Python 3.10+)
766
+ is_mps = sample.device.type == "mps"
767
+ if isinstance(timestep, float):
768
+ dtype = torch.float32 if is_mps else torch.float64
769
+ else:
770
+ dtype = torch.int32 if is_mps else torch.int64
771
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
772
+ elif len(timesteps.shape) == 0:
773
+ timesteps = timesteps[None].to(sample.device)
774
+
775
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
776
+ timesteps = timesteps.expand(sample.shape[0])
777
+
778
+ t_emb = self.time_proj(timesteps)
779
+
780
+ # `Timesteps` does not contain any weights and will always return f32 tensors
781
+ # but time_embedding might actually be running in fp16. so we need to cast here.
782
+ # there might be better ways to encapsulate this.
783
+ t_emb = t_emb.to(dtype=sample.dtype)
784
+
785
+ emb = self.time_embedding(t_emb, timestep_cond)
786
+
787
+ if self.use_frame_stride_condition:
788
+ if not torch.is_tensor(frame_stride):
789
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
790
+ # This would be a good case for the `match` statement (Python 3.10+)
791
+ is_mps = sample.device.type == "mps"
792
+ if isinstance(timestep, float):
793
+ dtype = torch.float32 if is_mps else torch.float64
794
+ else:
795
+ dtype = torch.int32 if is_mps else torch.int64
796
+ frame_stride = torch.tensor([frame_stride], dtype=dtype, device=sample.device)
797
+ elif len(frame_stride.shape) == 0:
798
+ frame_stride = frame_stride[None].to(sample.device)
799
+
800
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
801
+ frame_stride = frame_stride.expand(sample.shape[0])
802
+
803
+ fs_emb = self.time_proj(frame_stride)
804
+
805
+ # `Timesteps` does not contain any weights and will always return f32 tensors
806
+ # but time_embedding might actually be running in fp16. so we need to cast here.
807
+ # there might be better ways to encapsulate this.
808
+ fs_emb = fs_emb.to(dtype=sample.dtype)
809
+
810
+ fs_emb = self.frame_stride_embedding(fs_emb, timestep_cond)
811
+ emb = emb + fs_emb
812
+
813
+ aug_emb = None
814
+
815
+ if self.class_embedding is not None:
816
+ if class_labels is None:
817
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
818
+
819
+ if self.config.class_embed_type == "timestep":
820
+ class_labels = self.time_proj(class_labels)
821
+
822
+ # `Timesteps` does not contain any weights and will always return f32 tensors
823
+ # there might be better ways to encapsulate this.
824
+ class_labels = class_labels.to(dtype=sample.dtype)
825
+
826
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
827
+
828
+ if self.config.class_embeddings_concat:
829
+ emb = torch.cat([emb, class_emb], dim=-1)
830
+ else:
831
+ emb = emb + class_emb
832
+
833
+ if self.config.addition_embed_type == "text":
834
+ aug_emb = self.add_embedding(encoder_hidden_states)
835
+ elif self.config.addition_embed_type == "text_image":
836
+ # Kandinsky 2.1 - style
837
+ if "image_embeds" not in added_cond_kwargs:
838
+ raise ValueError(
839
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
840
+ )
841
+
842
+ image_embs = added_cond_kwargs.get("image_embeds")
843
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
844
+ aug_emb = self.add_embedding(text_embs, image_embs)
845
+ elif self.config.addition_embed_type == "text_time":
846
+ # SDXL - style
847
+ if "text_embeds" not in added_cond_kwargs:
848
+ raise ValueError(
849
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
850
+ )
851
+ text_embeds = added_cond_kwargs.get("text_embeds")
852
+ if "time_ids" not in added_cond_kwargs:
853
+ raise ValueError(
854
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
855
+ )
856
+ time_ids = added_cond_kwargs.get("time_ids")
857
+ time_embeds = self.add_time_proj(time_ids.flatten())
858
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
859
+
860
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
861
+ add_embeds = add_embeds.to(emb.dtype)
862
+ aug_emb = self.add_embedding(add_embeds)
863
+ elif self.config.addition_embed_type == "image":
864
+ # Kandinsky 2.2 - style
865
+ if "image_embeds" not in added_cond_kwargs:
866
+ raise ValueError(
867
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
868
+ )
869
+ image_embs = added_cond_kwargs.get("image_embeds")
870
+ aug_emb = self.add_embedding(image_embs)
871
+ elif self.config.addition_embed_type == "image_hint":
872
+ # Kandinsky 2.2 - style
873
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
874
+ raise ValueError(
875
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
876
+ )
877
+ image_embs = added_cond_kwargs.get("image_embeds")
878
+ hint = added_cond_kwargs.get("hint")
879
+ aug_emb, hint = self.add_embedding(image_embs, hint)
880
+ sample = torch.cat([sample, hint], dim=1)
881
+
882
+ emb = emb + aug_emb if aug_emb is not None else emb
883
+
884
+ if self.time_embed_act is not None:
885
+ emb = self.time_embed_act(emb)
886
+
887
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
888
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
889
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
890
+ # Kandinsky 2.1 - style
891
+ if "image_embeds" not in added_cond_kwargs:
892
+ raise ValueError(
893
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
894
+ )
895
+
896
+ image_embeds = added_cond_kwargs.get("image_embeds")
897
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
898
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
899
+ # Kandinsky 2.2 - style
900
+ if "image_embeds" not in added_cond_kwargs:
901
+ raise ValueError(
902
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
903
+ )
904
+ image_embeds = added_cond_kwargs.get("image_embeds")
905
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
906
+ # 2. pre-process
907
+ sample = self.conv_in(sample)
908
+
909
+ # 2.5 GLIGEN position net
910
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
911
+ cross_attention_kwargs = cross_attention_kwargs.copy()
912
+ gligen_args = cross_attention_kwargs.pop("gligen")
913
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
914
+
915
+ # 3. down
916
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
917
+
918
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
919
+ is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
920
+
921
+ down_block_res_samples = (sample,)
922
+ for downsample_block in self.down_blocks:
923
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
924
+ # For t2i-adapter CrossAttnDownBlock2D
925
+ additional_residuals = {}
926
+ if is_adapter and len(down_block_additional_residuals) > 0:
927
+ additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)
928
+
929
+ sample, res_samples = downsample_block(
930
+ hidden_states=sample,
931
+ temb=emb,
932
+ encoder_hidden_states=encoder_hidden_states,
933
+ attention_mask=attention_mask,
934
+ cross_attention_kwargs=cross_attention_kwargs,
935
+ encoder_attention_mask=encoder_attention_mask,
936
+ first_frame_latents=first_frame_latents,
937
+ **additional_residuals,
938
+ )
939
+ else:
940
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale, first_frame_latents=first_frame_latents,)
941
+
942
+ if is_adapter and len(down_block_additional_residuals) > 0:
943
+ sample += down_block_additional_residuals.pop(0)
944
+
945
+ down_block_res_samples += res_samples
946
+
947
+ if is_controlnet:
948
+ new_down_block_res_samples = ()
949
+
950
+ for down_block_res_sample, down_block_additional_residual in zip(
951
+ down_block_res_samples, down_block_additional_residuals
952
+ ):
953
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
954
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
955
+
956
+ down_block_res_samples = new_down_block_res_samples
957
+
958
+ # 4. mid
959
+ if self.mid_block is not None:
960
+ sample = self.mid_block(
961
+ sample,
962
+ emb,
963
+ encoder_hidden_states=encoder_hidden_states,
964
+ attention_mask=attention_mask,
965
+ cross_attention_kwargs=cross_attention_kwargs,
966
+ encoder_attention_mask=encoder_attention_mask,
967
+ # additional
968
+ first_frame_latents=first_frame_latents,
969
+ )
970
+ # To support T2I-Adapter-XL
971
+ if (
972
+ is_adapter
973
+ and len(down_block_additional_residuals) > 0
974
+ and sample.shape == down_block_additional_residuals[0].shape
975
+ ):
976
+ sample += down_block_additional_residuals.pop(0)
977
+
978
+ if is_controlnet:
979
+ sample = sample + mid_block_additional_residual
980
+
981
+ # 5. up
982
+ for i, upsample_block in enumerate(self.up_blocks):
983
+ is_final_block = i == len(self.up_blocks) - 1
984
+
985
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
986
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
987
+
988
+ # if we have not reached the final block and need to forward the
989
+ # upsample size, we do it here
990
+ if not is_final_block and forward_upsample_size:
991
+ upsample_size = down_block_res_samples[-1].shape[2:]
992
+
993
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
994
+ sample = upsample_block(
995
+ hidden_states=sample,
996
+ temb=emb,
997
+ res_hidden_states_tuple=res_samples,
998
+ encoder_hidden_states=encoder_hidden_states,
999
+ cross_attention_kwargs=cross_attention_kwargs,
1000
+ upsample_size=upsample_size,
1001
+ attention_mask=attention_mask,
1002
+ encoder_attention_mask=encoder_attention_mask,
1003
+ first_frame_latents=first_frame_latents,
1004
+ )
1005
+ else:
1006
+ sample = upsample_block(
1007
+ hidden_states=sample,
1008
+ temb=emb,
1009
+ res_hidden_states_tuple=res_samples,
1010
+ upsample_size=upsample_size,
1011
+ scale=lora_scale,
1012
+ first_frame_latents=first_frame_latents,
1013
+ )
1014
+
1015
+ # 6. post-process
1016
+ if self.conv_norm_out:
1017
+ sample = self.conv_norm_out(sample)
1018
+ sample = self.conv_act(sample)
1019
+ sample = self.conv_out(sample)
1020
+
1021
+ sample = rearrange(sample, "(b f) c h w -> b c f h w", f=video_length)
1022
+ if self.config.first_frame_condition_mode != "none":
1023
+ sample = sample[:, :, 1:, :, :]
1024
+
1025
+ if not return_dict:
1026
+ return (sample,)
1027
+
1028
+ return UNet2DConditionOutput(sample=sample)
1029
+
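+ # NOTE: minimal call sketch for forward() (hypothetical shapes; the asserts above require a 5D sample):
+ #   sample                : (b, 4, f, h, w) video latents
+ #   encoder_hidden_states : (b, 77, 768) text embeddings
+ #   first_frame_latents   : (b, 4, 1, h, w), only when first_frame_condition_mode != "none"
+ #   out = unet(sample, timestep, encoder_hidden_states, first_frame_latents=first_frame_latents).sample
+ #   # out is again (b, 4, f, h, w); the concatenated first frame is stripped before returning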
1030
+ @classmethod
1031
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
1032
+
1033
+ kwargs.pop("low_cpu_mem_usage", False)
1034
+ kwargs.pop("device_map", None)
1035
+
1036
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
1037
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
1038
+ force_download = kwargs.pop("force_download", False)
1039
+ from_flax = kwargs.pop("from_flax", False)
1040
+ resume_download = kwargs.pop("resume_download", False)
1041
+ proxies = kwargs.pop("proxies", None)
1042
+ output_loading_info = kwargs.pop("output_loading_info", False)
1043
+ local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
1044
+ use_auth_token = kwargs.pop("use_auth_token", None)
1045
+ revision = kwargs.pop("revision", None)
1046
+ torch_dtype = kwargs.pop("torch_dtype", None)
1047
+ subfolder = kwargs.pop("subfolder", None)
1048
+ device_map = None
1049
+ max_memory = kwargs.pop("max_memory", None)
1050
+ offload_folder = kwargs.pop("offload_folder", None)
1051
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
1052
+ low_cpu_mem_usage = False
1053
+ variant = kwargs.pop("variant", None)
1054
+ use_safetensors = kwargs.pop("use_safetensors", None)
1055
+
1056
+ allow_pickle = False
1057
+ if use_safetensors is None:
1058
+ use_safetensors = True
1059
+ allow_pickle = True
1060
+
1061
+ if low_cpu_mem_usage and not is_accelerate_available():
1062
+ low_cpu_mem_usage = False
1063
+ logger.warning(
1064
+ "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
1065
+ " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
1066
+ " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
1067
+ " install accelerate\n```\n."
1068
+ )
1069
+
1070
+ if device_map is not None and not is_accelerate_available():
1071
+ raise NotImplementedError(
1072
+ "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
1073
+ " `device_map=None`. You can install accelerate with `pip install accelerate`."
1074
+ )
1075
+
1076
+ # Check if we can handle device_map and dispatching the weights
1077
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
1078
+ raise NotImplementedError(
1079
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
1080
+ " `device_map=None`."
1081
+ )
1082
+
1083
+ if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
1084
+ raise NotImplementedError(
1085
+ "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
1086
+ " `low_cpu_mem_usage=False`."
1087
+ )
1088
+
1089
+ if low_cpu_mem_usage is False and device_map is not None:
1090
+ raise ValueError(
1091
+ f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
1092
+ " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
1093
+ )
1094
+
1095
+ # Load config if we don't provide a configuration
1096
+ config_path = pretrained_model_name_or_path
1097
+
1098
+ user_agent = {
1099
+ "diffusers": __version__,
1100
+ "file_type": "model",
1101
+ "framework": "pytorch",
1102
+ }
1103
+
1104
+ # load config
1105
+ config, unused_kwargs, commit_hash = cls.load_config(
1106
+ config_path,
1107
+ cache_dir=cache_dir,
1108
+ return_unused_kwargs=True,
1109
+ return_commit_hash=True,
1110
+ force_download=force_download,
1111
+ resume_download=resume_download,
1112
+ proxies=proxies,
1113
+ local_files_only=local_files_only,
1114
+ use_auth_token=use_auth_token,
1115
+ revision=revision,
1116
+ subfolder=subfolder,
1117
+ device_map=device_map,
1118
+ max_memory=max_memory,
1119
+ offload_folder=offload_folder,
1120
+ offload_state_dict=offload_state_dict,
1121
+ user_agent=user_agent,
1122
+ **kwargs,
1123
+ )
1124
+
1125
+ # load model
1126
+ model_file = None
1127
+ if from_flax:
1128
+ model_file = _get_model_file(
1129
+ pretrained_model_name_or_path,
1130
+ weights_name=FLAX_WEIGHTS_NAME,
1131
+ cache_dir=cache_dir,
1132
+ force_download=force_download,
1133
+ resume_download=resume_download,
1134
+ proxies=proxies,
1135
+ local_files_only=local_files_only,
1136
+ use_auth_token=use_auth_token,
1137
+ revision=revision,
1138
+ subfolder=subfolder,
1139
+ user_agent=user_agent,
1140
+ commit_hash=commit_hash,
1141
+ )
1142
+ model = cls.from_config(config, **unused_kwargs)
1143
+
1144
+ # Convert the weights
1145
+ from diffusers.models.modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
1146
+
1147
+ model = load_flax_checkpoint_in_pytorch_model(model, model_file)
1148
+ else:
1149
+ if use_safetensors:
1150
+ try:
1151
+ model_file = _get_model_file(
1152
+ pretrained_model_name_or_path,
1153
+ weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
1154
+ cache_dir=cache_dir,
1155
+ force_download=force_download,
1156
+ resume_download=resume_download,
1157
+ proxies=proxies,
1158
+ local_files_only=local_files_only,
1159
+ use_auth_token=use_auth_token,
1160
+ revision=revision,
1161
+ subfolder=subfolder,
1162
+ user_agent=user_agent,
1163
+ commit_hash=commit_hash,
1164
+ )
1165
+ except IOError as e:
1166
+ if not allow_pickle:
1167
+ raise e
1168
+ pass
1169
+ if model_file is None:
1170
+ model_file = _get_model_file(
1171
+ pretrained_model_name_or_path,
1172
+ weights_name=_add_variant(WEIGHTS_NAME, variant),
1173
+ cache_dir=cache_dir,
1174
+ force_download=force_download,
1175
+ resume_download=resume_download,
1176
+ proxies=proxies,
1177
+ local_files_only=local_files_only,
1178
+ use_auth_token=use_auth_token,
1179
+ revision=revision,
1180
+ subfolder=subfolder,
1181
+ user_agent=user_agent,
1182
+ commit_hash=commit_hash,
1183
+ )
1184
+
1185
+ if low_cpu_mem_usage:
1186
+ # Instantiate model with empty weights
1187
+ with accelerate.init_empty_weights():
1188
+ model = cls.from_config(config, **unused_kwargs)
1189
+
1190
+ # if device_map is None, load the state dict and move the params from meta device to the cpu
1191
+ if device_map is None:
1192
+ param_device = "cpu"
1193
+ state_dict = load_state_dict(model_file, variant=variant)
1194
+ model._convert_deprecated_attention_blocks(state_dict)
1195
+ # move the params from meta device to cpu
1196
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
1197
+ if len(missing_keys) > 0:
1198
+ raise ValueError(
1199
+ f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
1200
+ f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
1201
+ " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
1202
+ " those weights or else make sure your checkpoint file is correct."
1203
+ )
1204
+
1205
+ unexpected_keys = load_model_dict_into_meta(
1206
+ model,
1207
+ state_dict,
1208
+ device=param_device,
1209
+ dtype=torch_dtype,
1210
+ model_name_or_path=pretrained_model_name_or_path,
1211
+ )
1212
+
1213
+ if cls._keys_to_ignore_on_load_unexpected is not None:
1214
+ for pat in cls._keys_to_ignore_on_load_unexpected:
1215
+ unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
1216
+
1217
+ if len(unexpected_keys) > 0:
1218
+ logger.warn(
1219
+ f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
1220
+ )
1221
+
1222
+ else: # else let accelerate handle loading and dispatching.
1223
+ # Load weights and dispatch according to the device_map
1224
+ # by default the device_map is None and the weights are loaded on the CPU
1225
+ try:
1226
+ accelerate.load_checkpoint_and_dispatch(
1227
+ model,
1228
+ model_file,
1229
+ device_map,
1230
+ max_memory=max_memory,
1231
+ offload_folder=offload_folder,
1232
+ offload_state_dict=offload_state_dict,
1233
+ dtype=torch_dtype,
1234
+ )
1235
+ except AttributeError as e:
1236
+ # When using accelerate loading, we do not have the ability to load the state
1237
+ # dict and rename the weight names manually. Additionally, accelerate skips
1238
+ # torch loading conventions and directly writes into `module.{_buffers, _parameters}`
1239
+ # (which look like they should be private variables?), so we can't use the standard hooks
1240
+ # to rename parameters on load. We need to mimic the original weight names so the correct
1241
+ # attributes are available. After we have loaded the weights, we convert the deprecated
1242
+ # names to the new non-deprecated names. Then we _greatly encourage_ the user to convert
1243
+ # the weights so we don't have to do this again.
1244
+
1245
+ if "'Attention' object has no attribute" in str(e):
1246
+ logger.warn(
1247
+ f"Taking `{str(e)}` while using `accelerate.load_checkpoint_and_dispatch` to mean {pretrained_model_name_or_path}"
1248
+ " was saved with deprecated attention block weight names. We will load it with the deprecated attention block"
1249
+ " names and convert them on the fly to the new attention block format. Please re-save the model after this conversion,"
1250
+ " so we don't have to do the on the fly renaming in the future. If the model is from a hub checkpoint,"
1251
+ " please also re-upload it or open a PR on the original repository."
1252
+ )
1253
+ model._temp_convert_self_to_deprecated_attention_blocks()
1254
+ accelerate.load_checkpoint_and_dispatch(
1255
+ model,
1256
+ model_file,
1257
+ device_map,
1258
+ max_memory=max_memory,
1259
+ offload_folder=offload_folder,
1260
+ offload_state_dict=offload_state_dict,
1261
+ dtype=torch_dtype,
1262
+ )
1263
+ model._undo_temp_convert_self_to_deprecated_attention_blocks()
1264
+ else:
1265
+ raise e
1266
+
1267
+ loading_info = {
1268
+ "missing_keys": [],
1269
+ "unexpected_keys": [],
1270
+ "mismatched_keys": [],
1271
+ "error_msgs": [],
1272
+ }
1273
+ else:
1274
+ model = cls.from_config(config, **unused_kwargs)
1275
+
1276
+ state_dict = load_state_dict(model_file, variant=variant)
1277
+ model._convert_deprecated_attention_blocks(state_dict)
1278
+
1279
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
1280
+ model,
1281
+ state_dict,
1282
+ model_file,
1283
+ pretrained_model_name_or_path,
1284
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
1285
+ )
1286
+
1287
+ loading_info = {
1288
+ "missing_keys": missing_keys,
1289
+ "unexpected_keys": unexpected_keys,
1290
+ "mismatched_keys": mismatched_keys,
1291
+ "error_msgs": error_msgs,
1292
+ }
1293
+
1294
+ if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
1295
+ raise ValueError(
1296
+ f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
1297
+ )
1298
+ elif torch_dtype is not None:
1299
+ model = model.to(torch_dtype)
1300
+
1301
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
1302
+
1303
+ m, u = loading_info["missing_keys"], loading_info["unexpected_keys"]
1304
+ logger.info(f"### missing keys: {len(m)}; unexpected keys: {len(u)};")
1305
+ # print(f"### missing keys:\n{m}\n### unexpected keys:\n{u}\n")
1306
+
1307
+ spatial_params = [p.numel() if "conv3ds" not in n and "tempo_attns" not in n else 0 for n, p in model.named_parameters()]
1308
+ tconv_params = [p.numel() if "conv3ds." in n else 0 for n, p in model.named_parameters()]
1309
+ tattn_params = [p.numel() if "tempo_attns." in n else 0 for n, p in model.named_parameters()]
1310
+ tffconv_params = [p.numel() if "first_frame_conv." in n else 0 for n, p in model.named_parameters()]
1311
+ logger.info(f"### First Frame Convolution Layer Parameters: {sum(tffconv_params) / 1e6} M")
1312
+ logger.info(f"### Spatial UNet Parameters: {sum(spatial_params) / 1e6} M")
1313
+ logger.info(f"### Temporal Convolution Module Parameters: {sum(tconv_params) / 1e6} M")
1314
+ logger.info(f"### Temporal Attention Module Parameters: {sum(tattn_params) / 1e6} M")
1315
+
1316
+ # Set model in evaluation mode to deactivate DropOut modules by default
1317
+ model.eval()
1318
+ if output_loading_info:
1319
+ return model, loading_info
1320
+
1321
+ return model
1322
+
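+ # NOTE (inferred from the loader above, see also the test below): when initialised from a 2D
+ # Stable Diffusion checkpoint, only the spatial weights are present in the state dict; the
+ # temporal conv3ds/tempo_attns parameters show up as missing keys and keep their fresh init,
+ # which is why the spatial and temporal parameter counts are logged separately.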
1323
+ if __name__ == "__main__":
1324
+ # test
1325
+ from diffusers import AutoencoderKL, DDIMScheduler
1326
+ from transformers import CLIPTextModel, CLIPTokenizer
1327
+ from consisti2v.pipelines.pipeline_animation import AnimationPipeline
1328
+ from consisti2v.pipelines.pipeline_conditional_animation import ConditionalAnimationPipeline
1329
+ from consisti2v.utils.util import save_videos_grid
1330
+
1331
+ pretrained_model_path = "models/StableDiffusion/stable-diffusion-v1-5"
1332
+ prompt = "apply eye makeup"
1333
+ first_frame_path = "/ML-A100/home/weiming/datasets/UCF/frames/v_ApplyEyeMakeup_g01_c01_frame_90.jpg"
1334
+
1335
+ tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer", use_safetensors=True)
1336
+ text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
1337
+ vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae", use_safetensors=True)
1338
+ unet = VideoLDMUNet3DConditionModel.from_pretrained(
1339
+ pretrained_model_path,
1340
+ subfolder="unet",
1341
+ use_safetensors=True
1342
+ )
1343
+
1344
+ noise_scheduler_kwargs = {
1345
+ "num_train_timesteps": 1000,
1346
+ "beta_start": 0.00085,
1347
+ "beta_end": 0.012,
1348
+ "beta_schedule": "linear",
1349
+ "steps_offset": 1,
1350
+ "clip_sample": False,
1351
+ }
1352
+ noise_scheduler = DDIMScheduler(**noise_scheduler_kwargs)
1353
+ # latent = torch.randn(1, 4, 8, 64, 64).to("cuda")
1354
+ # text_embedding = torch.randn(1, 77, 768).to("cuda")
1355
+ # timestep = torch.randint(0, 1000, (1,)).to("cuda").squeeze(0)
1356
+ # output = unet(latent, timestep, text_embedding)
1357
+
1358
+ pipeline = ConditionalAnimationPipeline(
1359
+ unet=unet, vae=vae, tokenizer=tokenizer, text_encoder=text_encoder, scheduler=noise_scheduler,
1360
+ ).to("cuda")
1361
+ sample = pipeline(
1362
+ prompt,
1363
+ num_inference_steps = 25,
1364
+ guidance_scale = 8.,
1365
+ video_length = 8,
1366
+ height = 256,
1367
+ width = 256,
1368
+ first_frame_paths = first_frame_path,
1369
+ ).videos
1370
+ print(sample.shape)
1371
+ save_videos_grid(sample, f"samples/videoldm.gif")
src/videogen_hub/pipelines/consisti2v/consisti2v/models/videoldm_unet_blocks.py ADDED
@@ -0,0 +1,1159 @@
1
+ from typing import Optional, Dict, Tuple, Any
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from einops import rearrange, repeat
7
+ from einops.layers.torch import Rearrange
8
+ from diffusers.utils import logging
9
+ from diffusers.models.unet_2d_blocks import (
10
+ DownBlock2D,
11
+ UpBlock2D
12
+ )
13
+ from diffusers.models.resnet import (
14
+ ResnetBlock2D,
15
+ Downsample2D,
16
+ Upsample2D,
17
+ )
18
+ from diffusers.models.transformer_2d import Transformer2DModelOutput
19
+ from diffusers.models.dual_transformer_2d import DualTransformer2DModel
20
+ from diffusers.models.activations import get_activation
21
+ from diffusers.utils import logging, is_torch_version
22
+ from diffusers.utils.import_utils import is_xformers_available
23
+ from .videoldm_transformer_blocks import Transformer2DConditionModel
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+ if is_xformers_available():
28
+ import xformers
29
+ import xformers.ops
30
+ else:
31
+ xformers = None
32
+
33
+
34
+ def get_down_block(
35
+ down_block_type,
36
+ num_layers,
37
+ in_channels,
38
+ out_channels,
39
+ temb_channels,
40
+ add_downsample,
41
+ resnet_eps,
42
+ resnet_act_fn,
43
+ transformer_layers_per_block=1,
44
+ num_attention_heads=None,
45
+ resnet_groups=None,
46
+ cross_attention_dim=None,
47
+ downsample_padding=None,
48
+ dual_cross_attention=False,
49
+ use_linear_projection=False,
50
+ only_cross_attention=False,
51
+ upcast_attention=False,
52
+ resnet_time_scale_shift="default",
53
+ attention_type="default",
54
+ resnet_skip_time_act=False,
55
+ resnet_out_scale_factor=1.0,
56
+ cross_attention_norm=None,
57
+ attention_head_dim=None,
58
+ downsample_type=None,
59
+ dropout=0.0,
60
+ # additional
61
+ use_temporal=True,
62
+ augment_temporal_attention=False,
63
+ n_frames=8,
64
+ n_temp_heads=8,
65
+ first_frame_condition_mode="none",
66
+ latent_channels=4,
67
+ rotary_emb=False,
68
+ ):
69
+ # If attn head dim is not defined, we default it to the number of heads
70
+ if attention_head_dim is None:
71
+ logger.warn(
72
+ f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
73
+ )
74
+ attention_head_dim = num_attention_heads
75
+
76
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
77
+ if down_block_type == "DownBlock2D":
78
+ return VideoLDMDownBlock(
79
+ num_layers=num_layers,
80
+ in_channels=in_channels,
81
+ out_channels=out_channels,
82
+ temb_channels=temb_channels,
83
+ dropout=dropout,
84
+ add_downsample=add_downsample,
85
+ resnet_eps=resnet_eps,
86
+ resnet_act_fn=resnet_act_fn,
87
+ resnet_groups=resnet_groups,
88
+ downsample_padding=downsample_padding,
89
+ resnet_time_scale_shift=resnet_time_scale_shift,
90
+ # additional
91
+ use_temporal=use_temporal,
92
+ n_frames=n_frames,
93
+ first_frame_condition_mode=first_frame_condition_mode,
94
+ latent_channels=latent_channels
95
+ )
96
+ elif down_block_type == "CrossAttnDownBlock2D":
97
+ return VideoLDMCrossAttnDownBlock(
98
+ num_layers=num_layers,
99
+ transformer_layers_per_block=transformer_layers_per_block,
100
+ in_channels=in_channels,
101
+ out_channels=out_channels,
102
+ temb_channels=temb_channels,
103
+ dropout=dropout,
104
+ add_downsample=add_downsample,
105
+ resnet_eps=resnet_eps,
106
+ resnet_act_fn=resnet_act_fn,
107
+ resnet_groups=resnet_groups,
108
+ downsample_padding=downsample_padding,
109
+ cross_attention_dim=cross_attention_dim,
110
+ num_attention_heads=num_attention_heads,
111
+ dual_cross_attention=dual_cross_attention,
112
+ use_linear_projection=use_linear_projection,
113
+ only_cross_attention=only_cross_attention,
114
+ upcast_attention=upcast_attention,
115
+ resnet_time_scale_shift=resnet_time_scale_shift,
116
+ attention_type=attention_type,
117
+ # additional
118
+ use_temporal=use_temporal,
119
+ augment_temporal_attention=augment_temporal_attention,
120
+ n_frames=n_frames,
121
+ n_temp_heads=n_temp_heads,
122
+ first_frame_condition_mode=first_frame_condition_mode,
123
+ latent_channels=latent_channels,
124
+ rotary_emb=rotary_emb,
125
+ )
126
+
127
+ raise ValueError(f'{down_block_type} does not exist.')
128
+
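+ # NOTE: minimal usage sketch (hypothetical SD-1.5-like values; mirrors how the UNet builds its blocks):
+ #   block = get_down_block(
+ #       "CrossAttnDownBlock2D", num_layers=2, in_channels=320, out_channels=320,
+ #       temb_channels=1280, add_downsample=True, resnet_eps=1e-5, resnet_act_fn="silu",
+ #       num_attention_heads=8, cross_attention_dim=768, attention_head_dim=40,
+ #       n_frames=8, first_frame_condition_mode="none",
+ #   )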
129
+
130
+ def get_up_block(
131
+ up_block_type,
132
+ num_layers,
133
+ in_channels,
134
+ out_channels,
135
+ prev_output_channel,
136
+ temb_channels,
137
+ add_upsample,
138
+ resnet_eps,
139
+ resnet_act_fn,
140
+ transformer_layers_per_block=1,
141
+ num_attention_heads=None,
142
+ resnet_groups=None,
143
+ cross_attention_dim=None,
144
+ dual_cross_attention=False,
145
+ use_linear_projection=False,
146
+ only_cross_attention=False,
147
+ upcast_attention=False,
148
+ resnet_time_scale_shift="default",
149
+ attention_type="default",
150
+ resnet_skip_time_act=False,
151
+ resnet_out_scale_factor=1.0,
152
+ cross_attention_norm=None,
153
+ attention_head_dim=None,
154
+ upsample_type=None,
155
+ dropout=0.0,
156
+ # additional
157
+ use_temporal=True,
158
+ augment_temporal_attention=False,
159
+ n_frames=8,
160
+ n_temp_heads=8,
161
+ first_frame_condition_mode="none",
162
+ latent_channels=4,
163
+ rotary_emb=None,
164
+ ):
165
+ if attention_head_dim is None:
166
+ logger.warn(
167
+ f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
168
+ )
169
+ attention_head_dim = num_attention_heads
170
+
171
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
172
+ if up_block_type == "UpBlock2D":
173
+ return VideoLDMUpBlock(
174
+ num_layers=num_layers,
175
+ in_channels=in_channels,
176
+ out_channels=out_channels,
177
+ prev_output_channel=prev_output_channel,
178
+ temb_channels=temb_channels,
179
+ dropout=dropout,
180
+ add_upsample=add_upsample,
181
+ resnet_eps=resnet_eps,
182
+ resnet_act_fn=resnet_act_fn,
183
+ resnet_groups=resnet_groups,
184
+ resnet_time_scale_shift=resnet_time_scale_shift,
185
+ # additional
186
+ use_temporal=use_temporal,
187
+ n_frames=n_frames,
188
+ first_frame_condition_mode=first_frame_condition_mode,
189
+ latent_channels=latent_channels
190
+ )
191
+ elif up_block_type == 'CrossAttnUpBlock2D':
192
+ return VideoLDMCrossAttnUpBlock(
193
+ num_layers=num_layers,
194
+ transformer_layers_per_block=transformer_layers_per_block,
195
+ in_channels=in_channels,
196
+ out_channels=out_channels,
197
+ prev_output_channel=prev_output_channel,
198
+ temb_channels=temb_channels,
199
+ dropout=dropout,
200
+ add_upsample=add_upsample,
201
+ resnet_eps=resnet_eps,
202
+ resnet_act_fn=resnet_act_fn,
203
+ resnet_groups=resnet_groups,
204
+ cross_attention_dim=cross_attention_dim,
205
+ num_attention_heads=num_attention_heads,
206
+ dual_cross_attention=dual_cross_attention,
207
+ use_linear_projection=use_linear_projection,
208
+ only_cross_attention=only_cross_attention,
209
+ upcast_attention=upcast_attention,
210
+ resnet_time_scale_shift=resnet_time_scale_shift,
211
+ attention_type=attention_type,
212
+ # additional
213
+ use_temporal=use_temporal,
214
+ augment_temporal_attention=augment_temporal_attention,
215
+ n_frames=n_frames,
216
+ n_temp_heads=n_temp_heads,
217
+ first_frame_condition_mode=first_frame_condition_mode,
218
+ latent_channels=latent_channels,
219
+ rotary_emb=rotary_emb,
220
+ )
221
+
222
+ raise ValueError(f'{up_block_type} does not exist.')
223
+
224
+
225
+ class TemporalResnetBlock(nn.Module):
226
+ def __init__(
227
+ self,
228
+ *,
229
+ in_channels,
230
+ out_channels=None,
231
+ dropout=0.0,
232
+ temb_channels=512,
233
+ groups=32,
234
+ groups_out=None,
235
+ pre_norm=True,
236
+ eps=1e-6,
237
+ non_linearity="swish",
238
+ time_embedding_norm="default",
239
+ output_scale_factor=1.0,
240
+ # additional
241
+ n_frames=8,
242
+ ):
243
+ super().__init__()
244
+ self.pre_norm = pre_norm
245
+ self.pre_norm = True
246
+ self.in_channels = in_channels
247
+ out_channels = in_channels if out_channels is None else out_channels
248
+ self.out_channels = out_channels
249
+ self.time_embedding_norm = time_embedding_norm
250
+ self.output_scale_factor = output_scale_factor
251
+
252
+ if groups_out is None:
253
+ groups_out = groups
254
+
255
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
256
+
257
+ self.conv1 = Conv3DLayer(in_channels, out_channels, n_frames=n_frames)
258
+
259
+ if temb_channels is not None:
260
+ if self.time_embedding_norm == "default":
261
+ time_emb_proj_out_channels = out_channels
262
+ elif self.time_embedding_norm == "scale_shift":
263
+ time_emb_proj_out_channels = out_channels * 2
264
+ else:
265
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
266
+
267
+ self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)
268
+ else:
269
+ self.time_emb_proj = None
270
+
271
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
272
+
273
+ self.dropout = torch.nn.Dropout(dropout)
274
+ self.conv2 = Conv3DLayer(out_channels, out_channels, n_frames=n_frames)
275
+
276
+ self.nonlinearity = get_activation(non_linearity)
277
+
278
+ self.alpha = nn.Parameter(torch.ones(1))
279
+
280
+ def forward(self, input_tensor, temb=None):
281
+ hidden_states = input_tensor
282
+
283
+ hidden_states = self.norm1(hidden_states)
284
+ hidden_states = self.nonlinearity(hidden_states)
285
+
286
+ hidden_states = self.conv1(hidden_states)
287
+
288
+ if temb is not None:
289
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
290
+
291
+ if temb is not None and self.time_embedding_norm == "default":
292
+ hidden_states = hidden_states + temb
293
+
294
+ hidden_states = self.norm2(hidden_states)
295
+
296
+ if temb is not None and self.time_embedding_norm == "scale_shift":
297
+ scale, shift = torch.chunk(temb, 2, dim=1)
298
+ hidden_states = hidden_states * (1 + scale) + shift
299
+
300
+ hidden_states = self.nonlinearity(hidden_states)
301
+
302
+ hidden_states = self.dropout(hidden_states)
303
+ hidden_states = self.conv2(hidden_states)
304
+
305
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
306
+
307
+ # weighted sum between spatial and temporal features
308
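+ # alpha is clamped to [0, 1]; since it is initialised to 1, the block starts as an identity
+ # mapping over the spatial input and the temporal path is blended in only as alpha is learned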
+ with torch.no_grad():
309
+ self.alpha.clamp_(0, 1)
310
+
311
+ output_tensor = self.alpha * input_tensor + (1 - self.alpha) * output_tensor
312
+
313
+ return output_tensor
314
+
315
+
316
+ class Conv3DLayer(nn.Conv3d):
317
+ def __init__(self, in_dim, out_dim, n_frames):
318
+ k, p = (3, 1, 1), (1, 0, 0)
319
+ super().__init__(in_channels=in_dim, out_channels=out_dim, kernel_size=k, stride=1, padding=p)
320
+
321
+ self.to_3d = Rearrange('(b t) c h w -> b c t h w', t=n_frames)
322
+ self.to_2d = Rearrange('b c t h w -> (b t) c h w')
323
+
324
+ def forward(self, x):
325
+ h = self.to_3d(x)
326
+ h = super().forward(h)
327
+ out = self.to_2d(h)
328
+ return out
329
+
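+ # NOTE: minimal shape sketch (hypothetical sizes): with n_frames=8, an input of shape (b*8, c, h, w)
+ # is reshaped to (b, c, 8, h, w), passed through the (3, 1, 1) kernel with (1, 0, 0) padding
+ # (temporal-only mixing, spatial dims untouched), and flattened back to (b*8, c, h, w).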
330
+
331
+ class IdentityLayer(nn.Identity):
332
+ def __init__(self, return_trans2d_output, *args, **kwargs):
333
+ super().__init__()
334
+ self.return_trans2d_output = return_trans2d_output
335
+
336
+ def forward(self, x, *args, **kwargs):
337
+ if self.return_trans2d_output:
338
+ return Transformer2DModelOutput(sample=x)
339
+ else:
340
+ return x
341
+
342
+
343
+ class VideoLDMCrossAttnDownBlock(nn.Module):
344
+ def __init__(
345
+ self,
346
+ in_channels: int,
347
+ out_channels: int,
348
+ temb_channels: int,
349
+ dropout: float = 0.0,
350
+ num_layers: int = 1,
351
+ transformer_layers_per_block: int = 1,
352
+ resnet_eps: float = 1e-6,
353
+ resnet_time_scale_shift: str = "default",
354
+ resnet_act_fn: str = "swish",
355
+ resnet_groups: int = 32,
356
+ resnet_pre_norm: bool = True,
357
+ num_attention_heads=1,
358
+ cross_attention_dim=1280,
359
+ output_scale_factor=1.0,
360
+ downsample_padding=1,
361
+ add_downsample=True,
362
+ dual_cross_attention=False,
363
+ use_linear_projection=False,
364
+ only_cross_attention=False,
365
+ upcast_attention=False,
366
+ attention_type="default",
367
+ # additional
368
+ use_temporal=True,
369
+ augment_temporal_attention=False,
370
+ n_frames=8,
371
+ n_temp_heads=8,
372
+ first_frame_condition_mode="none",
373
+ latent_channels=4,
374
+ rotary_emb=False,
375
+ ):
376
+ super().__init__()
377
+
378
+ self.use_temporal = use_temporal
379
+
380
+ self.n_frames = n_frames
381
+ self.first_frame_condition_mode = first_frame_condition_mode
382
+ if self.first_frame_condition_mode == "conv2d":
383
+ self.first_frame_conv = nn.Conv2d(latent_channels, in_channels, kernel_size=1)
384
+
385
+ resnets = []
386
+ attentions = []
387
+
388
+ self.n_frames = n_frames
389
+ self.n_temp_heads = n_temp_heads
390
+
391
+ self.has_cross_attention = True
392
+ self.num_attention_heads = num_attention_heads
393
+
394
+ for i in range(num_layers):
395
+ in_channels = in_channels if i == 0 else out_channels
396
+ resnets.append(
397
+ ResnetBlock2D(
398
+ in_channels=in_channels,
399
+ out_channels=out_channels,
400
+ temb_channels=temb_channels,
401
+ eps=resnet_eps,
402
+ groups=resnet_groups,
403
+ dropout=dropout,
404
+ time_embedding_norm=resnet_time_scale_shift,
405
+ non_linearity=resnet_act_fn,
406
+ output_scale_factor=output_scale_factor,
407
+ pre_norm=resnet_pre_norm,
408
+ )
409
+ )
410
+ if not dual_cross_attention:
411
+ attentions.append(
412
+ Transformer2DConditionModel(
413
+ num_attention_heads,
414
+ out_channels // num_attention_heads,
415
+ in_channels=out_channels,
416
+ num_layers=transformer_layers_per_block,
417
+ cross_attention_dim=cross_attention_dim,
418
+ norm_num_groups=resnet_groups,
419
+ use_linear_projection=use_linear_projection,
420
+ only_cross_attention=only_cross_attention,
421
+ upcast_attention=upcast_attention,
422
+ attention_type=attention_type,
423
+ # additional
424
+ n_frames=n_frames,
425
+ )
426
+ )
427
+ else:
428
+ attentions.append(
429
+ DualTransformer2DModel(
430
+ num_attention_heads,
431
+ out_channels // num_attention_heads,
432
+ in_channels=out_channels,
433
+ num_layers=1,
434
+ cross_attention_dim=cross_attention_dim,
435
+ norm_num_groups=resnet_groups,
436
+ )
437
+ )
438
+ self.attentions = nn.ModuleList(attentions)
439
+ self.resnets = nn.ModuleList(resnets)
440
+
441
+ if add_downsample:
442
+ self.downsamplers = nn.ModuleList(
443
+ [
444
+ Downsample2D(
445
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
446
+ )
447
+ ]
448
+ )
449
+ else:
450
+ self.downsamplers = None
451
+
452
+ self.gradient_checkpointing = False
453
+
454
+ # >>> Temporal Layers >>>
455
+ conv3ds = []
456
+ tempo_attns = []
457
+
458
+ for i in range(num_layers):
459
+ if self.use_temporal:
460
+ conv3ds.append(
461
+ TemporalResnetBlock(
462
+ in_channels=out_channels,
463
+ out_channels=out_channels,
464
+ n_frames=n_frames,
465
+ )
466
+ )
467
+
468
+ tempo_attns.append(
469
+ Transformer2DConditionModel(
470
+ n_temp_heads,
471
+ out_channels // n_temp_heads,
472
+ in_channels=out_channels,
473
+ num_layers=transformer_layers_per_block,
474
+ cross_attention_dim=cross_attention_dim,
475
+ norm_num_groups=resnet_groups,
476
+ use_linear_projection=use_linear_projection,
477
+ only_cross_attention=only_cross_attention,
478
+ upcast_attention=upcast_attention,
479
+ attention_type=attention_type,
480
+ # additional
481
+ n_frames=n_frames,
482
+ is_temporal=True,
483
+ augment_temporal_attention=augment_temporal_attention,
484
+ rotary_emb=rotary_emb
485
+ )
486
+ )
487
+ else:
488
+ conv3ds.append(IdentityLayer(return_trans2d_output=False))
489
+ tempo_attns.append(IdentityLayer(return_trans2d_output=True))
490
+
491
+ self.conv3ds = nn.ModuleList(conv3ds)
492
+ self.tempo_attns = nn.ModuleList(tempo_attns)
493
+ # <<< Temporal Layers <<<
494
+
495
+ def forward(
496
+ self,
497
+ hidden_states: torch.FloatTensor,
498
+ temb: Optional[torch.FloatTensor] = None,
499
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
500
+ attention_mask: Optional[torch.FloatTensor] = None,
501
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
502
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
503
+ # additional
504
+ first_frame_latents=None,
505
+ ):
506
+ condition_on_first_frame = (self.first_frame_condition_mode != "none" and self.first_frame_condition_mode != "input_only")
507
+ # input shape: hidden_states = (b f) c h w, first_frame_latents = b c 1 h w
508
+ if self.first_frame_condition_mode == "conv2d":
509
+ hidden_states = rearrange(hidden_states, '(b t) c h w -> b c t h w', t=self.n_frames)
510
+ hidden_height = hidden_states.shape[3]
511
+ first_frame_height = first_frame_latents.shape[3]
512
+ downsample_ratio = hidden_height / first_frame_height
513
+ first_frame_latents = F.interpolate(first_frame_latents.squeeze(2), scale_factor=downsample_ratio, mode="nearest")
514
+ first_frame_latents = self.first_frame_conv(first_frame_latents).unsqueeze(2)
515
+ hidden_states[:, :, 0:1, :, :] = first_frame_latents
516
+ hidden_states = rearrange(hidden_states, 'b c t h w -> (b t) c h w', t=self.n_frames)
517
+
518
+ output_states = ()
519
+
520
+ for resnet, conv3d, attn, tempo_attn in zip(self.resnets, self.conv3ds, self.attentions, self.tempo_attns):
521
+
522
+ hidden_states = resnet(hidden_states, temb)
523
+ hidden_states = conv3d(hidden_states)
524
+ hidden_states = attn(
525
+ hidden_states,
526
+ encoder_hidden_states=encoder_hidden_states,
527
+ cross_attention_kwargs=cross_attention_kwargs,
528
+ condition_on_first_frame=condition_on_first_frame,
529
+ ).sample
530
+ hidden_states = tempo_attn(
531
+ hidden_states,
532
+ encoder_hidden_states=encoder_hidden_states,
533
+ cross_attention_kwargs=cross_attention_kwargs,
534
+ condition_on_first_frame=False,
535
+ ).sample
536
+
537
+ output_states += (hidden_states,)
538
+
539
+ if self.downsamplers is not None:
540
+ for downsampler in self.downsamplers:
541
+ hidden_states = downsampler(hidden_states)
542
+
543
+ output_states += (hidden_states,)
544
+
545
+ return hidden_states, output_states
546
+
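+ # NOTE: per layer the block applies spatial ResNet -> temporal ResNet (conv3d) -> spatial
+ # cross-attention (optionally conditioned on the first frame) -> temporal attention, as in the
+ # loop above; `output_states` collects the residuals consumed by the corresponding up block.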
547
+
548
+ class VideoLDMCrossAttnUpBlock(nn.Module):
549
+ def __init__(
550
+ self,
551
+ in_channels: int,
552
+ out_channels: int,
553
+ prev_output_channel: int,
554
+ temb_channels: int,
555
+ dropout: float = 0.0,
556
+ num_layers: int = 1,
557
+ transformer_layers_per_block: int = 1,
558
+ resnet_eps: float = 1e-6,
559
+ resnet_time_scale_shift: str = "default",
560
+ resnet_act_fn: str = "swish",
561
+ resnet_groups: int = 32,
562
+ resnet_pre_norm: bool = True,
563
+ num_attention_heads=1,
564
+ cross_attention_dim=1280,
565
+ output_scale_factor=1.0,
566
+ add_upsample=True,
567
+ dual_cross_attention=False,
568
+ use_linear_projection=False,
569
+ only_cross_attention=False,
570
+ upcast_attention=False,
571
+ attention_type="default",
572
+ # additional
573
+ use_temporal=True,
574
+ augment_temporal_attention=False,
575
+ n_frames=8,
576
+ n_temp_heads=8,
577
+ first_frame_condition_mode="none",
578
+ latent_channels=4,
579
+ rotary_emb=False,
580
+ ):
581
+ super().__init__()
582
+
583
+ self.use_temporal = use_temporal
584
+
585
+ self.n_frames = n_frames
586
+ self.first_frame_condition_mode = first_frame_condition_mode
587
+ if self.first_frame_condition_mode == "conv2d":
588
+ self.first_frame_conv = nn.Conv2d(latent_channels, prev_output_channel, kernel_size=1)
589
+
590
+ resnets = []
591
+ attentions = []
592
+
593
+ self.n_frames = n_frames
594
+ self.n_temp_heads = n_temp_heads
595
+
596
+ self.has_cross_attention = True
597
+ self.num_attention_heads = num_attention_heads
598
+
599
+ for i in range(num_layers):
600
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
601
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
602
+
603
+ resnets.append(
604
+ ResnetBlock2D(
605
+ in_channels=resnet_in_channels + res_skip_channels,
606
+ out_channels=out_channels,
607
+ temb_channels=temb_channels,
608
+ eps=resnet_eps,
609
+ groups=resnet_groups,
610
+ dropout=dropout,
611
+ time_embedding_norm=resnet_time_scale_shift,
612
+ non_linearity=resnet_act_fn,
613
+ output_scale_factor=output_scale_factor,
614
+ pre_norm=resnet_pre_norm,
615
+ )
616
+ )
617
+ if not dual_cross_attention:
618
+ attentions.append(
619
+ Transformer2DConditionModel(
620
+ num_attention_heads,
621
+ out_channels // num_attention_heads,
622
+ in_channels=out_channels,
623
+ num_layers=transformer_layers_per_block,
624
+ cross_attention_dim=cross_attention_dim,
625
+ norm_num_groups=resnet_groups,
626
+ use_linear_projection=use_linear_projection,
627
+ only_cross_attention=only_cross_attention,
628
+ upcast_attention=upcast_attention,
629
+ attention_type=attention_type,
630
+ # additional
631
+ n_frames=n_frames,
632
+ )
633
+ )
634
+ else:
635
+ attentions.append(
636
+ DualTransformer2DModel(
637
+ num_attention_heads,
638
+ out_channels // num_attention_heads,
639
+ in_channels=out_channels,
640
+ num_layers=1,
641
+ cross_attention_dim=cross_attention_dim,
642
+ norm_num_groups=resnet_groups,
643
+ )
644
+ )
645
+ self.attentions = nn.ModuleList(attentions)
646
+ self.resnets = nn.ModuleList(resnets)
647
+
648
+ if add_upsample:
649
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
650
+ else:
651
+ self.upsamplers = None
652
+
653
+ self.gradient_checkpointing = False
654
+
655
+ # >>> Temporal Layers >>>
656
+ conv3ds = []
657
+ tempo_attns = []
658
+
659
+ for i in range(num_layers):
660
+ if self.use_temporal:
661
+ conv3ds.append(
662
+ TemporalResnetBlock(
663
+ in_channels=out_channels,
664
+ out_channels=out_channels,
665
+ n_frames=n_frames,
666
+ )
667
+ )
668
+
669
+ tempo_attns.append(
670
+ Transformer2DConditionModel(
671
+ n_temp_heads,
672
+ out_channels // n_temp_heads,
673
+ in_channels=out_channels,
674
+ num_layers=transformer_layers_per_block,
675
+ cross_attention_dim=cross_attention_dim,
676
+ norm_num_groups=resnet_groups,
677
+ use_linear_projection=use_linear_projection,
678
+ only_cross_attention=only_cross_attention,
679
+ upcast_attention=upcast_attention,
680
+ attention_type=attention_type,
681
+ # additional
682
+ n_frames=n_frames,
683
+ augment_temporal_attention=augment_temporal_attention,
684
+ is_temporal=True,
685
+ rotary_emb=rotary_emb,
686
+ )
687
+ )
688
+ else:
689
+ conv3ds.append(IdentityLayer(return_trans2d_output=False))
690
+ tempo_attns.append(IdentityLayer(return_trans2d_output=True))
691
+
692
+ self.conv3ds = nn.ModuleList(conv3ds)
693
+ self.tempo_attns = nn.ModuleList(tempo_attns)
694
+ # <<< Temporal Layers <<<
695
+
696
+ def forward(
697
+ self,
698
+ hidden_states: torch.FloatTensor,
699
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
700
+ temb: Optional[torch.FloatTensor] = None,
701
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
702
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
703
+ upsample_size: Optional[int] = None,
704
+ attention_mask: Optional[torch.FloatTensor] = None,
705
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
706
+ # additional
707
+ first_frame_latents=None,
708
+ ):
709
+ condition_on_first_frame = (self.first_frame_condition_mode != "none" and self.first_frame_condition_mode != "input_only")
710
+ # input shape: hidden_states = (b f) c h w, first_frame_latents = b c 1 h w
711
+ if self.first_frame_condition_mode == "conv2d":
712
+ hidden_states = rearrange(hidden_states, '(b t) c h w -> b c t h w', t=self.n_frames)
713
+ hidden_height = hidden_states.shape[3]
714
+ first_frame_height = first_frame_latents.shape[3]
715
+ downsample_ratio = hidden_height / first_frame_height
716
+ first_frame_latents = F.interpolate(first_frame_latents.squeeze(2), scale_factor=downsample_ratio, mode="nearest")
717
+ first_frame_latents = self.first_frame_conv(first_frame_latents).unsqueeze(2)
718
+ hidden_states[:, :, 0:1, :, :] = first_frame_latents
719
+ hidden_states = rearrange(hidden_states, 'b c t h w -> (b t) c h w', t=self.n_frames)
720
+
721
+ for resnet, conv3d, attn, tempo_attn in zip(self.resnets, self.conv3ds, self.attentions, self.tempo_attns):
722
+
723
+ res_hidden_states = res_hidden_states_tuple[-1]
724
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
725
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
726
+
727
+ hidden_states = resnet(hidden_states, temb)
728
+ hidden_states = conv3d(hidden_states)
729
+ hidden_states = attn(
730
+ hidden_states,
731
+ encoder_hidden_states=encoder_hidden_states,
732
+ cross_attention_kwargs=cross_attention_kwargs,
733
+ condition_on_first_frame=condition_on_first_frame,
734
+ ).sample
735
+ hidden_states = tempo_attn(
736
+ hidden_states,
737
+ encoder_hidden_states=encoder_hidden_states,
738
+ cross_attention_kwargs=cross_attention_kwargs,
739
+ condition_on_first_frame=False,
740
+ ).sample
741
+
742
+ if self.upsamplers is not None:
743
+ for upsampler in self.upsamplers:
744
+ hidden_states = upsampler(hidden_states, upsample_size)
745
+ return hidden_states
746
+
747
+
748
+ class VideoLDMUNetMidBlock2DCrossAttn(nn.Module):
749
+ def __init__(
750
+ self,
751
+ in_channels: int,
752
+ temb_channels: int,
753
+ dropout: float = 0.0,
754
+ num_layers: int = 1,
755
+ transformer_layers_per_block: int = 1,
756
+ resnet_eps: float = 1e-6,
757
+ resnet_time_scale_shift: str = "default",
758
+ resnet_act_fn: str = "swish",
759
+ resnet_groups: int = 32,
760
+ resnet_pre_norm: bool = True,
761
+ num_attention_heads=1,
762
+ output_scale_factor=1.0,
763
+ cross_attention_dim=1280,
764
+ dual_cross_attention=False,
765
+ use_linear_projection=False,
766
+ upcast_attention=False,
767
+ attention_type="default",
768
+ # additional
769
+ use_temporal=True,
770
+ n_frames: int = 8,
771
+ first_frame_condition_mode="none",
772
+ latent_channels=4,
773
+ ):
774
+ super().__init__()
775
+
776
+ self.use_temporal = use_temporal
777
+
778
+ self.n_frames = n_frames
779
+ self.first_frame_condition_mode = first_frame_condition_mode
780
+ if self.first_frame_condition_mode == "conv2d":
781
+ self.first_frame_conv = nn.Conv2d(latent_channels, in_channels, kernel_size=1)
782
+
783
+ self.has_cross_attention = True
784
+ self.num_attention_heads = num_attention_heads
785
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
786
+
787
+ # there is always at least one resnet
788
+ resnets = [
789
+ ResnetBlock2D(
790
+ in_channels=in_channels,
791
+ out_channels=in_channels,
792
+ temb_channels=temb_channels,
793
+ eps=resnet_eps,
794
+ groups=resnet_groups,
795
+ dropout=dropout,
796
+ time_embedding_norm=resnet_time_scale_shift,
797
+ non_linearity=resnet_act_fn,
798
+ output_scale_factor=output_scale_factor,
799
+ pre_norm=resnet_pre_norm,
800
+ )
801
+ ]
802
+ if self.use_temporal:
803
+ conv3ds = [
804
+ TemporalResnetBlock(
805
+ in_channels=in_channels,
806
+ out_channels=in_channels,
807
+ n_frames=n_frames,
808
+ )
809
+ ]
810
+ else:
811
+ conv3ds = [IdentityLayer(return_trans2d_output=False)]
812
+
813
+ attentions = []
814
+
815
+ for _ in range(num_layers):
816
+ if not dual_cross_attention:
817
+ attentions.append(
818
+ Transformer2DConditionModel(
819
+ num_attention_heads,
820
+ in_channels // num_attention_heads,
821
+ in_channels=in_channels,
822
+ num_layers=transformer_layers_per_block,
823
+ cross_attention_dim=cross_attention_dim,
824
+ norm_num_groups=resnet_groups,
825
+ use_linear_projection=use_linear_projection,
826
+ upcast_attention=upcast_attention,
827
+ attention_type=attention_type,
828
+ # additional
829
+ n_frames=n_frames,
830
+ )
831
+ )
832
+ else:
833
+ attentions.append(
834
+ DualTransformer2DModel(
835
+ num_attention_heads,
836
+ in_channels // num_attention_heads,
837
+ in_channels=in_channels,
838
+ num_layers=1,
839
+ cross_attention_dim=cross_attention_dim,
840
+ norm_num_groups=resnet_groups,
841
+ )
842
+ )
843
+ resnets.append(
844
+ ResnetBlock2D(
845
+ in_channels=in_channels,
846
+ out_channels=in_channels,
847
+ temb_channels=temb_channels,
848
+ eps=resnet_eps,
849
+ groups=resnet_groups,
850
+ dropout=dropout,
851
+ time_embedding_norm=resnet_time_scale_shift,
852
+ non_linearity=resnet_act_fn,
853
+ output_scale_factor=output_scale_factor,
854
+ pre_norm=resnet_pre_norm,
855
+ )
856
+ )
857
+ if self.use_temporal:
858
+ conv3ds.append(
859
+ TemporalResnetBlock(
860
+ in_channels=in_channels,
861
+ out_channels=in_channels,
862
+ n_frames=n_frames,
863
+ )
864
+ )
865
+ else:
866
+ conv3ds.append(IdentityLayer(return_trans2d_output=False))
867
+
868
+ self.attentions = nn.ModuleList(attentions)
869
+ self.resnets = nn.ModuleList(resnets)
870
+ self.conv3ds = nn.ModuleList(conv3ds)
871
+
872
+ self.gradient_checkpointing = False
873
+
874
+ def forward(
875
+ self,
876
+ hidden_states: torch.FloatTensor,
877
+ temb: Optional[torch.FloatTensor] = None,
878
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
879
+ attention_mask: Optional[torch.FloatTensor] = None,
880
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
881
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
882
+ # additional
883
+ first_frame_latents=None,
884
+ ) -> torch.FloatTensor:
885
+ condition_on_first_frame = (self.first_frame_condition_mode != "none" and self.first_frame_condition_mode != "input_only")
886
+ # input shape: hidden_states = (b f) c h w, first_frame_latents = b c 1 h w
887
+ if self.first_frame_condition_mode == "conv2d":
888
+ hidden_states = rearrange(hidden_states, '(b t) c h w -> b c t h w', t=self.n_frames)
889
+ hidden_height = hidden_states.shape[3]
890
+ first_frame_height = first_frame_latents.shape[3]
891
+ downsample_ratio = hidden_height / first_frame_height
892
+ first_frame_latents = F.interpolate(first_frame_latents.squeeze(2), scale_factor=downsample_ratio, mode="nearest")
893
+ first_frame_latents = self.first_frame_conv(first_frame_latents).unsqueeze(2)
894
+ hidden_states[:, :, 0:1, :, :] = first_frame_latents
895
+ hidden_states = rearrange(hidden_states, 'b c t h w -> (b t) c h w', t=self.n_frames)
896
+
897
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
898
+ hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
899
+ hidden_states = self.conv3ds[0](hidden_states)
900
+ for attn, resnet, conv3d in zip(self.attentions, self.resnets[1:], self.conv3ds[1:]):
901
+ if self.training and self.gradient_checkpointing:
902
+
903
+ def create_custom_forward(module, return_dict=None):
904
+ def custom_forward(*inputs):
905
+ if return_dict is not None:
906
+ return module(*inputs, return_dict=return_dict)
907
+ else:
908
+ return module(*inputs)
909
+
910
+ return custom_forward
911
+
912
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
913
+ hidden_states = attn(
914
+ hidden_states,
915
+ encoder_hidden_states=encoder_hidden_states,
916
+ cross_attention_kwargs=cross_attention_kwargs,
917
+ attention_mask=attention_mask,
918
+ encoder_attention_mask=encoder_attention_mask,
919
+ return_dict=False,
920
+ # additional
921
+ condition_on_first_frame=condition_on_first_frame,
922
+ )[0]
923
+ hidden_states = torch.utils.checkpoint.checkpoint(
924
+ create_custom_forward(resnet),
925
+ hidden_states,
926
+ temb,
927
+ **ckpt_kwargs,
928
+ )
929
+ hidden_states = conv3d(hidden_states)
930
+ else:
931
+ hidden_states = attn(
932
+ hidden_states,
933
+ encoder_hidden_states=encoder_hidden_states,
934
+ cross_attention_kwargs=cross_attention_kwargs,
935
+ attention_mask=attention_mask,
936
+ encoder_attention_mask=encoder_attention_mask,
937
+ return_dict=False,
938
+ # additional
939
+ condition_on_first_frame=condition_on_first_frame,
940
+ )[0]
941
+ hidden_states = resnet(hidden_states, temb, scale=lora_scale)
942
+ hidden_states = conv3d(hidden_states)
943
+
944
+ return hidden_states
945
+
946
+
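The training path above wraps each resnet in torch.utils.checkpoint and only passes `use_reentrant=False` on torch >= 1.11. A stripped-down, self-contained sketch of that pattern, using a plain nn.Sequential stand-in for ResnetBlock2D and `packaging.version` in place of diffusers' is_torch_version helper:

import torch
import torch.nn as nn
import torch.utils.checkpoint
from packaging import version

def create_custom_forward(module):
    # checkpoint() re-runs this closure during backward to recompute activations
    def custom_forward(*inputs):
        return module(*inputs)
    return custom_forward

block = nn.Sequential(nn.Linear(16, 16), nn.SiLU(), nn.Linear(16, 16))
x = torch.randn(4, 16, requires_grad=True)

# newer torch versions expose the non-reentrant checkpoint implementation
ckpt_kwargs = {"use_reentrant": False} if version.parse(torch.__version__) >= version.parse("1.11.0") else {}
y = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, **ckpt_kwargs)
y.sum().backward()
print(x.grad.shape)  # torch.Size([4, 16]); activations were recomputed, not stored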
947
+ class VideoLDMDownBlock(DownBlock2D):
948
+ def __init__(
949
+ self,
950
+ in_channels: int,
951
+ out_channels: int,
952
+ temb_channels: int,
953
+ dropout: float = 0.0,
954
+ num_layers: int = 1,
955
+ resnet_eps: float = 1e-6,
956
+ resnet_time_scale_shift: str = "default",
957
+ resnet_act_fn: str = "swish",
958
+ resnet_groups: int = 32,
959
+ resnet_pre_norm: bool = True,
960
+ output_scale_factor=1.0,
961
+ add_downsample=True,
962
+ downsample_padding=1,
963
+ # additional
964
+ use_temporal=True,
965
+ n_frames: int = 8,
966
+ first_frame_condition_mode="none",
967
+ latent_channels=4,
968
+ ):
969
+ super().__init__(
970
+ in_channels,
971
+ out_channels,
972
+ temb_channels,
973
+ dropout,
974
+ num_layers,
975
+ resnet_eps,
976
+ resnet_time_scale_shift,
977
+ resnet_act_fn,
978
+ resnet_groups,
979
+ resnet_pre_norm,
980
+ output_scale_factor,
981
+ add_downsample,
982
+ downsample_padding,)
983
+
984
+ self.use_temporal = use_temporal
985
+
986
+ self.n_frames = n_frames
987
+ self.first_frame_condition_mode = first_frame_condition_mode
988
+ if self.first_frame_condition_mode == "conv2d":
989
+ self.first_frame_conv = nn.Conv2d(latent_channels, in_channels, kernel_size=1)
990
+
991
+ # >>> Temporal Layers >>>
992
+ conv3ds = []
993
+ for i in range(num_layers):
994
+ if self.use_temporal:
995
+ conv3ds.append(
996
+ TemporalResnetBlock(
997
+ in_channels=out_channels,
998
+ out_channels=out_channels,
999
+ n_frames=n_frames,
1000
+ )
1001
+ )
1002
+ else:
1003
+ conv3ds.append(IdentityLayer(return_trans2d_output=False))
1004
+ self.conv3ds = nn.ModuleList(conv3ds)
1005
+ # <<< Temporal Layers <<<
1006
+
1007
+ def forward(self, hidden_states, temb=None, scale: float = 1, first_frame_latents=None):
1008
+ # input shape: hidden_states = (b f) c h w, first_frame_latents = b c 1 h w
1009
+ if self.first_frame_condition_mode == "conv2d":
1010
+ hidden_states = rearrange(hidden_states, '(b t) c h w -> b c t h w', t=self.n_frames)
1011
+ hidden_height = hidden_states.shape[3]
1012
+ first_frame_height = first_frame_latents.shape[3]
1013
+ downsample_ratio = hidden_height / first_frame_height
1014
+ first_frame_latents = F.interpolate(first_frame_latents.squeeze(2), scale_factor=downsample_ratio, mode="nearest")
1015
+ first_frame_latents = self.first_frame_conv(first_frame_latents).unsqueeze(2)
1016
+ hidden_states[:, :, 0:1, :, :] = first_frame_latents
1017
+ hidden_states = rearrange(hidden_states, 'b c t h w -> (b t) c h w', t=self.n_frames)
1018
+
1019
+ output_states = ()
1020
+
1021
+ for resnet, conv3d in zip(self.resnets, self.conv3ds):
1022
+ if self.training and self.gradient_checkpointing:
1023
+
1024
+ def create_custom_forward(module):
1025
+ def custom_forward(*inputs):
1026
+ return module(*inputs)
1027
+
1028
+ return custom_forward
1029
+
1030
+ if is_torch_version(">=", "1.11.0"):
1031
+ hidden_states = torch.utils.checkpoint.checkpoint(
1032
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
1033
+ )
1034
+ else:
1035
+ hidden_states = torch.utils.checkpoint.checkpoint(
1036
+ create_custom_forward(resnet), hidden_states, temb
1037
+ )
1038
+ else:
1039
+ hidden_states = resnet(hidden_states, temb, scale=scale)
1040
+
1041
+ hidden_states = conv3d(hidden_states)
1042
+
1043
+ output_states = output_states + (hidden_states,)
1044
+
1045
+ if self.downsamplers is not None:
1046
+ for downsampler in self.downsamplers:
1047
+ hidden_states = downsampler(hidden_states, scale=scale)
1048
+
1049
+ output_states = output_states + (hidden_states,)
1050
+
1051
+ return hidden_states, output_states
1052
+
1053
+
1054
+ class VideoLDMUpBlock(UpBlock2D):
1055
+ def __init__(
1056
+ self,
1057
+ in_channels: int,
1058
+ prev_output_channel: int,
1059
+ out_channels: int,
1060
+ temb_channels: int,
1061
+ dropout: float = 0.0,
1062
+ num_layers: int = 1,
1063
+ resnet_eps: float = 1e-6,
1064
+ resnet_time_scale_shift: str = "default",
1065
+ resnet_act_fn: str = "swish",
1066
+ resnet_groups: int = 32,
1067
+ resnet_pre_norm: bool = True,
1068
+ output_scale_factor=1.0,
1069
+ add_upsample=True,
1070
+ # additional
1071
+ use_temporal=True,
1072
+ n_frames: int = 8,
1073
+ first_frame_condition_mode="none",
1074
+ latent_channels=4,
1075
+ ):
1076
+ super().__init__(
1077
+ in_channels,
1078
+ prev_output_channel,
1079
+ out_channels,
1080
+ temb_channels,
1081
+ dropout,
1082
+ num_layers,
1083
+ resnet_eps,
1084
+ resnet_time_scale_shift,
1085
+ resnet_act_fn,
1086
+ resnet_groups,
1087
+ resnet_pre_norm,
1088
+ output_scale_factor,
1089
+ add_upsample,
1090
+ )
1091
+
1092
+ self.use_temporal = use_temporal
1093
+
1094
+ self.n_frames = n_frames
1095
+ self.first_frame_condition_mode = first_frame_condition_mode
1096
+ if self.first_frame_condition_mode == "conv2d":
1097
+ self.first_frame_conv = nn.Conv2d(latent_channels, prev_output_channel, kernel_size=1)
1098
+
1099
+ # >>> Temporal Layers >>>
1100
+ conv3ds = []
1101
+ for i in range(num_layers):
1102
+ if self.use_temporal:
1103
+ conv3ds.append(
1104
+ TemporalResnetBlock(
1105
+ in_channels=out_channels,
1106
+ out_channels=out_channels,
1107
+ n_frames=n_frames,
1108
+ )
1109
+ )
1110
+ else:
1111
+ conv3ds.append(IdentityLayer(return_trans2d_output=False))
1112
+
1113
+ self.conv3ds = nn.ModuleList(conv3ds)
1114
+ # <<< Temporal Layers <<<
1115
+
1116
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale: float = 1, first_frame_latents=None):
1117
+ # input shape: hidden_states = (b f) c h w, first_frame_latents = b c 1 h w
1118
+ if self.first_frame_condition_mode == "conv2d":
1119
+ hidden_states = rearrange(hidden_states, '(b t) c h w -> b c t h w', t=self.n_frames)
1120
+ hidden_height = hidden_states.shape[3]
1121
+ first_frame_height = first_frame_latents.shape[3]
1122
+ downsample_ratio = hidden_height / first_frame_height
1123
+ first_frame_latents = F.interpolate(first_frame_latents.squeeze(2), scale_factor=downsample_ratio, mode="nearest")
1124
+ first_frame_latents = self.first_frame_conv(first_frame_latents).unsqueeze(2)
1125
+ hidden_states[:, :, 0:1, :, :] = first_frame_latents
1126
+ hidden_states = rearrange(hidden_states, 'b c t h w -> (b t) c h w', t=self.n_frames)
1127
+
1128
+ for resnet, conv3d in zip(self.resnets, self.conv3ds):
1129
+ # pop res hidden states
1130
+ res_hidden_states = res_hidden_states_tuple[-1]
1131
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1132
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1133
+
1134
+ if self.training and self.gradient_checkpointing:
1135
+
1136
+ def create_custom_forward(module):
1137
+ def custom_forward(*inputs):
1138
+ return module(*inputs)
1139
+
1140
+ return custom_forward
1141
+
1142
+ if is_torch_version(">=", "1.11.0"):
1143
+ hidden_states = torch.utils.checkpoint.checkpoint(
1144
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
1145
+ )
1146
+ else:
1147
+ hidden_states = torch.utils.checkpoint.checkpoint(
1148
+ create_custom_forward(resnet), hidden_states, temb
1149
+ )
1150
+ else:
1151
+ hidden_states = resnet(hidden_states, temb, scale=scale)
1152
+
1153
+ hidden_states = conv3d(hidden_states)
1154
+
1155
+ if self.upsamplers is not None:
1156
+ for upsampler in self.upsamplers:
1157
+ hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
1158
+
1159
+ return hidden_states
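As a toy illustration (plain tensors, no real blocks) of how the `output_states` tuples returned by the down blocks above pair with the `res_hidden_states_tuple` popping in the up blocks: the down path appends one skip entry per resnet (plus one per downsampler), and the up path consumes them last-in-first-out, concatenating along the channel dimension before each resnet.

import torch

# down path: collect a skip activation after every layer
skips = ()
h = torch.randn(2, 8, 16, 16)
for _ in range(3):              # stand-in for the resnet/conv3d/attention loop
    h = h + 1                   # placeholder computation
    skips += (h,)

# up path: pop skips in reverse and concatenate along channels before each resnet
for _ in range(3):
    res = skips[-1]
    skips = skips[:-1]
    h = torch.cat([h, res], dim=1)   # channels grow to resnet_in_channels + res_skip_channels
    h = h[:, :8]                     # placeholder "resnet" restoring the channel count
print(h.shape)                        # torch.Size([2, 8, 16, 16])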
src/videogen_hub/pipelines/consisti2v/consisti2v/pipelines/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/consisti2v/pipelines/pipeline_autoregress_animation.py ADDED
@@ -0,0 +1,615 @@
1
+ # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
2
+
3
+ import inspect
4
+ from typing import Callable, List, Optional, Union
5
+ from dataclasses import dataclass
6
+
7
+ import math
8
+ import numpy as np
9
+ import torch
10
+ from tqdm import tqdm
11
+
12
+ from torchvision import transforms as T
13
+ from PIL import Image
14
+
15
+ from diffusers.utils import is_accelerate_available
16
+ from packaging import version
17
+ from transformers import CLIPTextModel, CLIPTokenizer
18
+
19
+ from diffusers.configuration_utils import FrozenDict
20
+ from diffusers.models import AutoencoderKL
21
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
22
+ from diffusers.schedulers import (
23
+ DDIMScheduler,
24
+ DPMSolverMultistepScheduler,
25
+ EulerAncestralDiscreteScheduler,
26
+ EulerDiscreteScheduler,
27
+ LMSDiscreteScheduler,
28
+ PNDMScheduler,
29
+ )
30
+ from diffusers.utils import deprecate, logging, BaseOutput
31
+
32
+ from einops import rearrange, repeat
33
+
34
+ from ..models.unet import UNet3DConditionModel
35
+ from ..utils.frameinit_utils import freq_mix_3d, get_freq_filter
36
+
37
+
38
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
39
+
40
+ # copied from https://github.com/huggingface/diffusers/blob/v0.23.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L59C1-L70C21
41
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
42
+ """
43
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
44
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
45
+ """
46
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
47
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
48
+ # rescale the results from guidance (fixes overexposure)
49
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
50
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
51
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
52
+ return noise_cfg
53
+
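A quick sanity check of rescale_noise_cfg on random tensors (assuming the function above is in scope): with guidance_rescale=1.0 the guided prediction is rescaled so its per-sample std matches the text branch, and with 0.0 it is returned unchanged.

import torch

torch.manual_seed(0)
noise_pred_text = torch.randn(2, 4, 8, 32, 32)
noise_cfg = noise_pred_text * 3.0                     # pretend CFG inflated the scale

out_full = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0)
out_none = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0)

print(round(out_full.std().item(), 2))                # ~1.0, i.e. back to the text branch's std
print(torch.allclose(out_none, noise_cfg))            # True: rescale factor 0 is a no-op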
54
+
55
+ @dataclass
56
+ class AnimationPipelineOutput(BaseOutput):
57
+ videos: Union[torch.Tensor, np.ndarray]
58
+
59
+
60
+ class AutoregressiveAnimationPipeline(DiffusionPipeline):
61
+ _optional_components = []
62
+
63
+ def __init__(
64
+ self,
65
+ vae: AutoencoderKL,
66
+ text_encoder: CLIPTextModel,
67
+ tokenizer: CLIPTokenizer,
68
+ unet: UNet3DConditionModel,
69
+ scheduler: Union[
70
+ DDIMScheduler,
71
+ PNDMScheduler,
72
+ LMSDiscreteScheduler,
73
+ EulerDiscreteScheduler,
74
+ EulerAncestralDiscreteScheduler,
75
+ DPMSolverMultistepScheduler,
76
+ ],
77
+ ):
78
+ super().__init__()
79
+
80
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
81
+ deprecation_message = (
82
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
83
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
84
+ "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
85
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
86
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
87
+ " file"
88
+ )
89
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
90
+ new_config = dict(scheduler.config)
91
+ new_config["steps_offset"] = 1
92
+ scheduler._internal_dict = FrozenDict(new_config)
93
+
94
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
95
+ deprecation_message = (
96
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
97
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
98
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
99
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
100
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
101
+ )
102
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
103
+ new_config = dict(scheduler.config)
104
+ new_config["clip_sample"] = False
105
+ scheduler._internal_dict = FrozenDict(new_config)
106
+
107
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
108
+ version.parse(unet.config._diffusers_version).base_version
109
+ ) < version.parse("0.9.0.dev0")
110
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
111
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
112
+ deprecation_message = (
113
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
114
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
115
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
116
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
117
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
118
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
119
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
120
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
121
+ " the `unet/config.json` file"
122
+ )
123
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
124
+ new_config = dict(unet.config)
125
+ new_config["sample_size"] = 64
126
+ unet._internal_dict = FrozenDict(new_config)
127
+
128
+ self.register_modules(
129
+ vae=vae,
130
+ text_encoder=text_encoder,
131
+ tokenizer=tokenizer,
132
+ unet=unet,
133
+ scheduler=scheduler,
134
+ )
135
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
136
+
137
+ self.freq_filter = None
138
+
139
+ @torch.no_grad()
140
+ def init_filter(self, video_length, height, width, filter_params):
141
+ # initialize frequency filter for noise reinitialization
142
+ batch_size = 1
143
+ num_channels_latents = self.unet.config.in_channels
144
+ filter_shape = [
145
+ batch_size,
146
+ num_channels_latents,
147
+ video_length,
148
+ height // self.vae_scale_factor,
149
+ width // self.vae_scale_factor
150
+ ]
151
+ # self.freq_filter = get_freq_filter(filter_shape, device=self._execution_device, params=filter_params)
152
+ self.freq_filter = get_freq_filter(
153
+ filter_shape,
154
+ device=self._execution_device,
155
+ filter_type=filter_params.method,
156
+ n=filter_params.n if filter_params.method=="butterworth" else None,
157
+ d_s=filter_params.d_s,
158
+ d_t=filter_params.d_t
159
+ )
160
+
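The filter built by init_filter is consumed further down in __call__: the clean first-frame latent is tiled into a static video, diffused to frameinit_noise_level with scheduler.add_noise, and its low-frequency band is merged with freshly sampled noise via freq_mix_3d (FreeInit-style noise reinitialization). The actual helpers live in ..utils.frameinit_utils and are not reproduced here; below is a self-contained, simplified stand-in using torch.fft and a hard box low-pass mask, purely to illustrate the idea.

import torch

def toy_freq_mix_3d(low_src, high_src, d_s=0.25, d_t=0.25):
    # simplified stand-in for freq_mix_3d: box low-pass over the (t, h, w) frequencies
    b, c, t, h, w = low_src.shape
    ft = torch.fft.fftshift(torch.fft.fftn(low_src, dim=(-3, -2, -1)), dim=(-3, -2, -1))
    fh = torch.fft.fftshift(torch.fft.fftn(high_src, dim=(-3, -2, -1)), dim=(-3, -2, -1))
    tt = torch.linspace(-1, 1, t).abs().view(t, 1, 1)
    hh = torch.linspace(-1, 1, h).abs().view(1, h, 1)
    ww = torch.linspace(-1, 1, w).abs().view(1, 1, w)
    lpf = ((tt <= d_t) & (hh <= d_s) & (ww <= d_s)).to(low_src.dtype)   # 1 inside the low band
    mixed = ft * lpf + fh * (1 - lpf)                                    # low freqs from low_src, high freqs from high_src
    mixed = torch.fft.ifftn(torch.fft.ifftshift(mixed, dim=(-3, -2, -1)), dim=(-3, -2, -1))
    return mixed.real

static_video = torch.randn(1, 4, 1, 32, 32).expand(-1, -1, 8, -1, -1)   # first frame tiled over time
fresh_noise = torch.randn(1, 4, 8, 32, 32)
mixed = toy_freq_mix_3d(static_video, fresh_noise)
print(mixed.shape)                                                       # torch.Size([1, 4, 8, 32, 32])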
161
+ def enable_vae_slicing(self):
162
+ self.vae.enable_slicing()
163
+
164
+ def disable_vae_slicing(self):
165
+ self.vae.disable_slicing()
166
+
167
+ def enable_sequential_cpu_offload(self, gpu_id=0):
168
+ if is_accelerate_available():
169
+ from accelerate import cpu_offload
170
+ else:
171
+ raise ImportError("Please install accelerate via `pip install accelerate`")
172
+
173
+ device = torch.device(f"cuda:{gpu_id}")
174
+
175
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
176
+ if cpu_offloaded_model is not None:
177
+ cpu_offload(cpu_offloaded_model, device)
178
+
179
+
180
+ @property
181
+ def _execution_device(self):
182
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
183
+ return self.device
184
+ for module in self.unet.modules():
185
+ if (
186
+ hasattr(module, "_hf_hook")
187
+ and hasattr(module._hf_hook, "execution_device")
188
+ and module._hf_hook.execution_device is not None
189
+ ):
190
+ return torch.device(module._hf_hook.execution_device)
191
+ return self.device
192
+
193
+ def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
194
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
195
+
196
+ text_inputs = self.tokenizer(
197
+ prompt,
198
+ padding="max_length",
199
+ max_length=self.tokenizer.model_max_length,
200
+ truncation=True,
201
+ return_tensors="pt",
202
+ )
203
+ text_input_ids = text_inputs.input_ids
204
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
205
+
206
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
207
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
208
+ logger.warning(
209
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
210
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
211
+ )
212
+
213
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
214
+ attention_mask = text_inputs.attention_mask.to(device)
215
+ else:
216
+ attention_mask = None
217
+
218
+ text_embeddings = self.text_encoder(
219
+ text_input_ids.to(device),
220
+ attention_mask=attention_mask,
221
+ )
222
+ text_embeddings = text_embeddings[0]
223
+
224
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
225
+ bs_embed, seq_len, _ = text_embeddings.shape
226
+ text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1)
227
+ text_embeddings = text_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
228
+
229
+ # get unconditional embeddings for classifier free guidance
230
+ if do_classifier_free_guidance is not None:
231
+ uncond_tokens: List[str]
232
+ if negative_prompt is None:
233
+ uncond_tokens = [""] * batch_size
234
+ elif type(prompt) is not type(negative_prompt):
235
+ raise TypeError(
236
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
237
+ f" {type(prompt)}."
238
+ )
239
+ elif isinstance(negative_prompt, str):
240
+ uncond_tokens = [negative_prompt]
241
+ elif batch_size != len(negative_prompt):
242
+ raise ValueError(
243
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
244
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
245
+ " the batch size of `prompt`."
246
+ )
247
+ else:
248
+ uncond_tokens = negative_prompt
249
+
250
+ max_length = text_input_ids.shape[-1]
251
+ uncond_input = self.tokenizer(
252
+ uncond_tokens,
253
+ padding="max_length",
254
+ max_length=max_length,
255
+ truncation=True,
256
+ return_tensors="pt",
257
+ )
258
+
259
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
260
+ attention_mask = uncond_input.attention_mask.to(device)
261
+ else:
262
+ attention_mask = None
263
+
264
+ uncond_embeddings = self.text_encoder(
265
+ uncond_input.input_ids.to(device),
266
+ attention_mask=attention_mask,
267
+ )
268
+ uncond_embeddings = uncond_embeddings[0]
269
+
270
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
271
+ seq_len = uncond_embeddings.shape[1]
272
+ uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1)
273
+ uncond_embeddings = uncond_embeddings.view(batch_size * num_videos_per_prompt, seq_len, -1)
274
+
275
+ # For classifier free guidance, we need to do two forward passes.
276
+ # Here we concatenate the unconditional and text embeddings into a single batch
277
+ # to avoid doing two forward passes
278
+ if do_classifier_free_guidance == "text":
279
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
280
+ elif do_classifier_free_guidance == "both":
281
+ text_embeddings = torch.cat([uncond_embeddings, uncond_embeddings, text_embeddings])
282
+
283
+ return text_embeddings
284
+
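Under the "both" guidance mode above, the embeddings are stacked as [uncond, uncond, text]; later in __call__ the UNet output is chunked into (unconditional, image-conditioned, text+image-conditioned) predictions and combined with two guidance scales. A small numeric sketch of that combination rule, with made-up scalars standing in for the noise tensors:

import torch

guidance_scale_img, guidance_scale_txt = 2.0, 7.5
# pretend these are the three chunks of the UNet output under "both" guidance
noise_pred_uncond = torch.tensor(0.10)
noise_pred_img = torch.tensor(0.30)      # conditioned on the first frame only
noise_pred_both = torch.tensor(0.50)     # conditioned on the first frame and the text

noise_pred = (noise_pred_uncond
              + guidance_scale_img * (noise_pred_img - noise_pred_uncond)
              + guidance_scale_txt * (noise_pred_both - noise_pred_img))
print(noise_pred.item())                  # 0.1 + 2.0*0.2 + 7.5*0.2 = 2.0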
285
+ def decode_latents(self, latents, first_frames=None):
286
+ video_length = latents.shape[2]
287
+ latents = 1 / self.vae.config.scaling_factor * latents
288
+ latents = rearrange(latents, "b c f h w -> (b f) c h w")
289
+ # video = self.vae.decode(latents).sample
290
+ video = []
291
+ for frame_idx in tqdm(range(latents.shape[0]), **self._progress_bar_config):
292
+ video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample)
293
+ video = torch.cat(video)
294
+ video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length)
295
+
296
+ if first_frames is not None:
297
+ first_frames = first_frames.unsqueeze(2)
298
+ video = torch.cat([first_frames, video], dim=2)
299
+
300
+ video = (video / 2 + 0.5).clamp(0, 1)
301
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
302
+ video = video.cpu().float().numpy()
303
+ return video
304
+
305
+ def prepare_extra_step_kwargs(self, generator, eta):
306
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
307
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
308
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
309
+ # and should be between [0, 1]
310
+
311
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
312
+ extra_step_kwargs = {}
313
+ if accepts_eta:
314
+ extra_step_kwargs["eta"] = eta
315
+
316
+ # check if the scheduler accepts generator
317
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
318
+ if accepts_generator:
319
+ extra_step_kwargs["generator"] = generator
320
+ return extra_step_kwargs
321
+
322
+ def check_inputs(self, prompt, height, width, callback_steps, first_frame_paths=None):
323
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
324
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
325
+
326
+ if first_frame_paths is not None and (not isinstance(first_frame_paths, str) and not isinstance(first_frame_paths, list)):
327
+ raise ValueError(f"`first_frame_paths` has to be of type `str` or `list` but is {type(first_frame_paths)}")
328
+
329
+ if height % 8 != 0 or width % 8 != 0:
330
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
331
+
332
+ if (callback_steps is None) or (
333
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
334
+ ):
335
+ raise ValueError(
336
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
337
+ f" {type(callback_steps)}."
338
+ )
339
+
340
+ def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None, noise_sampling_method="vanilla", noise_alpha=1.0):
341
+ shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
342
+ if isinstance(generator, list) and len(generator) != batch_size:
343
+ raise ValueError(
344
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
345
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
346
+ )
347
+ if latents is None:
348
+ rand_device = "cpu" if device.type == "mps" else device
349
+
350
+ if isinstance(generator, list):
351
+ # shape = shape
352
+ shape = (1,) + shape[1:]
353
+ if noise_sampling_method == "vanilla":
354
+ latents = [
355
+ torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
356
+ for i in range(batch_size)
357
+ ]
358
+ elif noise_sampling_method == "pyoco_mixed":
359
+ base_shape = (batch_size, num_channels_latents, 1, height // self.vae_scale_factor, width // self.vae_scale_factor)
360
+ latents = []
361
+ noise_alpha_squared = noise_alpha ** 2
362
+ for i in range(batch_size):
363
+ base_latent = torch.randn(base_shape, generator=generator[i], device=rand_device, dtype=dtype) * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared))
364
+ ind_latent = torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
365
+ latents.append(base_latent + ind_latent)
366
+ elif noise_sampling_method == "pyoco_progressive":
367
+ latents = []
368
+ noise_alpha_squared = noise_alpha ** 2
369
+ for i in range(batch_size):
370
+ latent = torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
371
+ ind_latent = torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
372
+ for j in range(1, video_length):
373
+ latent[:, :, j, :, :] = latent[:, :, j - 1, :, :] * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared)) + ind_latent[:, :, j, :, :]
374
+ latents.append(latent)
375
+ latents = torch.cat(latents, dim=0).to(device)
376
+ else:
377
+ if noise_sampling_method == "vanilla":
378
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
379
+ elif noise_sampling_method == "pyoco_mixed":
380
+ noise_alpha_squared = noise_alpha ** 2
381
+ base_shape = (batch_size, num_channels_latents, 1, height // self.vae_scale_factor, width // self.vae_scale_factor)
382
+ base_latents = torch.randn(base_shape, generator=generator, device=rand_device, dtype=dtype) * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared))
383
+ ind_latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
384
+ latents = base_latents + ind_latents
385
+ elif noise_sampling_method == "pyoco_progressive":
386
+ noise_alpha_squared = noise_alpha ** 2
387
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype)
388
+ ind_latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
389
+ for j in range(1, video_length):
390
+ latents[:, :, j, :, :] = latents[:, :, j - 1, :, :] * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared)) + ind_latents[:, :, j, :, :]
391
+ else:
392
+ if latents.shape != shape:
393
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
394
+ latents = latents.to(device)
395
+
396
+ # scale the initial noise by the standard deviation required by the scheduler
397
+ latents = latents * self.scheduler.init_noise_sigma
398
+ return latents
399
+
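A self-contained sketch of the "pyoco_mixed" branch of prepare_latents above: every frame shares one base noise tensor plus an independent per-frame component, and noise_alpha controls how much of the total (unit) variance is shared across frames. Shapes below are arbitrary.

import math
import torch

b, c, t, h, w = 1, 4, 16, 40, 64
noise_alpha = 1.0
a2 = noise_alpha ** 2

base = torch.randn(b, c, 1, h, w) * math.sqrt(a2 / (1 + a2))   # shared part, broadcast over time
ind = torch.randn(b, c, t, h, w) * math.sqrt(1 / (1 + a2))     # independent per-frame part
latents = base + ind                                            # unit variance overall

# the correlation between any two frames is roughly a2 / (1 + a2)
f0, f1 = latents[:, :, 0].flatten(), latents[:, :, 1].flatten()
print(torch.corrcoef(torch.stack([f0, f1]))[0, 1].item())       # ~0.5 for noise_alpha = 1.0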
400
+ @torch.no_grad()
401
+ def __call__(
402
+ self,
403
+ prompt: Union[str, List[str]],
404
+ video_length: Optional[int],
405
+ height: Optional[int] = None,
406
+ width: Optional[int] = None,
407
+ num_inference_steps: int = 50,
408
+ guidance_scale_txt: float = 7.5,
409
+ guidance_scale_img: float = 2.0,
410
+ negative_prompt: Optional[Union[str, List[str]]] = None,
411
+ num_videos_per_prompt: Optional[int] = 1,
412
+ eta: float = 0.0,
413
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
414
+ latents: Optional[torch.FloatTensor] = None,
415
+ output_type: Optional[str] = "tensor",
416
+ return_dict: bool = True,
417
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
418
+ callback_steps: Optional[int] = 1,
419
+ # additional
420
+ first_frame_paths: Optional[Union[str, List[str]]] = None,
421
+ first_frames: Optional[torch.FloatTensor] = None,
422
+ noise_sampling_method: str = "vanilla",
423
+ noise_alpha: float = 1.0,
424
+ guidance_rescale: float = 0.0,
425
+ frame_stride: Optional[int] = None,
426
+ autoregress_steps: int = 3,
427
+ use_frameinit: bool = False,
428
+ frameinit_noise_level: int = 999,
429
+ **kwargs,
430
+ ):
431
+ if first_frame_paths is not None and first_frames is not None:
432
+ raise ValueError("Only one of `first_frame_paths` and `first_frames` can be passed.")
433
+ # Default height and width to unet
434
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
435
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
436
+
437
+ # Check inputs. Raise error if not correct
438
+ self.check_inputs(prompt, height, width, callback_steps, first_frame_paths)
439
+
440
+ # Define call parameters
441
+ # batch_size = 1 if isinstance(prompt, str) else len(prompt)
442
+ batch_size = 1
443
+ if latents is not None:
444
+ batch_size = latents.shape[0]
445
+ if isinstance(prompt, list):
446
+ batch_size = len(prompt)
447
+ first_frame_input = first_frame_paths if first_frame_paths is not None else first_frames
448
+ if first_frame_input is not None:
449
+ assert len(prompt) == len(first_frame_input), "prompt and first_frame_paths should have the same length"
450
+
451
+ device = self._execution_device
452
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
453
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
454
+ # corresponds to doing no classifier free guidance.
455
+ do_classifier_free_guidance = None
456
+ # two guidance mode: text and text+image
457
+ if guidance_scale_txt > 1.0:
458
+ do_classifier_free_guidance = "text"
459
+ if guidance_scale_img > 1.0:
460
+ do_classifier_free_guidance = "both"
461
+
462
+ # Encode input prompt
463
+ prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
464
+ if negative_prompt is not None:
465
+ negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size
466
+ text_embeddings = self._encode_prompt(
467
+ prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
468
+ )
469
+
470
+ # Encode input first frame
471
+ first_frame_latents = None
472
+ if first_frame_paths is not None:
473
+ first_frame_paths = first_frame_paths if isinstance(first_frame_paths, list) else [first_frame_paths] * batch_size
474
+ img_transform = T.Compose([
475
+ T.ToTensor(),
476
+ T.Resize(height, antialias=None),
477
+ T.CenterCrop((height, width)),
478
+ T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
479
+ ])
480
+ first_frames = []
481
+ for first_frame_path in first_frame_paths:
482
+ first_frame = Image.open(first_frame_path).convert('RGB')
483
+ first_frame = img_transform(first_frame).unsqueeze(0)
484
+ first_frames.append(first_frame)
485
+ first_frames = torch.cat(first_frames, dim=0)
486
+ if first_frames is not None:
487
+ first_frames = first_frames.to(device, dtype=self.vae.dtype)
488
+ first_frame_latents = self.vae.encode(first_frames).latent_dist
489
+ first_frame_latents = first_frame_latents.sample()
490
+ first_frame_latents = first_frame_latents * self.vae.config.scaling_factor # b, c, h, w
491
+ first_frame_latents = repeat(first_frame_latents, "b c h w -> (b n) c h w", n=num_videos_per_prompt)
492
+ first_frames = repeat(first_frames, "b c h w -> (b n) c h w", n=num_videos_per_prompt)
493
+
494
+ full_video_latent = torch.zeros(batch_size * num_videos_per_prompt, self.unet.config.in_channels, video_length * autoregress_steps - autoregress_steps + 1, height // self.vae_scale_factor, width // self.vae_scale_factor, device=device, dtype=self.vae.dtype)
495
+
496
+ start_idx = 0
497
+ for ar_step in range(autoregress_steps):
498
+ # Prepare timesteps
499
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
500
+ timesteps = self.scheduler.timesteps
501
+
502
+ # Prepare latent variables
503
+ num_channels_latents = self.unet.config.in_channels
504
+ latents = self.prepare_latents(
505
+ batch_size * num_videos_per_prompt,
506
+ num_channels_latents,
507
+ video_length,
508
+ height,
509
+ width,
510
+ text_embeddings.dtype,
511
+ device,
512
+ generator,
513
+ latents,
514
+ noise_sampling_method,
515
+ noise_alpha,
516
+ )
517
+ latents_dtype = latents.dtype
518
+
519
+ if use_frameinit:
520
+ current_diffuse_timestep = frameinit_noise_level # diffuse to noise level
521
+ diffuse_timesteps = torch.full((batch_size,),int(current_diffuse_timestep))
522
+ diffuse_timesteps = diffuse_timesteps.long()
523
+ first_frames_static_vid = repeat(first_frame_latents, "b c h w -> b c t h w", t=video_length)
524
+ z_T = self.scheduler.add_noise(
525
+ original_samples=first_frames_static_vid.to(device),
526
+ noise=latents.to(device),
527
+ timesteps=diffuse_timesteps.to(device)
528
+ )
529
+ latents = freq_mix_3d(z_T.to(dtype=torch.float32), latents, LPF=self.freq_filter)
530
+ latents = latents.to(dtype=latents_dtype)
531
+
532
+ if first_frame_latents is not None:
533
+ first_frame_noisy_latent = latents[:, :, 0, :, :]
534
+ latents = latents[:, :, 1:, :, :]
535
+
536
+ # Prepare extra step kwargs.
537
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
538
+
539
+ # Denoising loop
540
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
541
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
542
+ for i, t in enumerate(timesteps):
543
+ # expand the latents if we are doing classifier free guidance
544
+ if do_classifier_free_guidance is None:
545
+ latent_model_input = latents
546
+ elif do_classifier_free_guidance == "text":
547
+ latent_model_input = torch.cat([latents] * 2)
548
+ elif do_classifier_free_guidance == "both":
549
+ latent_model_input = torch.cat([latents] * 3)
550
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
551
+ if first_frame_latents is not None:
552
+ if do_classifier_free_guidance is None:
553
+ first_frame_latents_input = first_frame_latents
554
+ elif do_classifier_free_guidance == "text":
555
+ first_frame_latents_input = torch.cat([first_frame_latents] * 2)
556
+ elif do_classifier_free_guidance == "both":
557
+ first_frame_latents_input = torch.cat([first_frame_noisy_latent, first_frame_latents, first_frame_latents])
558
+
559
+ first_frame_latents_input = first_frame_latents_input.unsqueeze(2)
560
+
561
+ # predict the noise residual
562
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, first_frame_latents=first_frame_latents_input, frame_stride=frame_stride).sample.to(dtype=latents_dtype)
563
+ else:
564
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample.to(dtype=latents_dtype)
565
+ # noise_pred = []
566
+ # import pdb
567
+ # pdb.set_trace()
568
+ # for batch_idx in range(latent_model_input.shape[0]):
569
+ # noise_pred_single = self.unet(latent_model_input[batch_idx:batch_idx+1], t, encoder_hidden_states=text_embeddings[batch_idx:batch_idx+1]).sample.to(dtype=latents_dtype)
570
+ # noise_pred.append(noise_pred_single)
571
+ # noise_pred = torch.cat(noise_pred)
572
+
573
+ # perform guidance
574
+ if do_classifier_free_guidance:
575
+ if do_classifier_free_guidance == "text":
576
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
577
+ noise_pred = noise_pred_uncond + guidance_scale_txt * (noise_pred_text - noise_pred_uncond)
578
+ elif do_classifier_free_guidance == "both":
579
+ noise_pred_uncond, noise_pred_img, noise_pred_both = noise_pred.chunk(3)
580
+ noise_pred = noise_pred_uncond + guidance_scale_img * (noise_pred_img - noise_pred_uncond) + guidance_scale_txt * (noise_pred_both - noise_pred_img)
581
+
582
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
583
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
584
+ # currently only support text guidance
585
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
586
+
587
+ # compute the previous noisy sample x_t -> x_t-1
588
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
589
+
590
+ # call the callback, if provided
591
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
592
+ progress_bar.update()
593
+ if callback is not None and i % callback_steps == 0:
594
+ callback(i, t, latents)
595
+
596
+ # Post-processing
597
+
598
+ latents = torch.cat([first_frame_latents.unsqueeze(2), latents], dim=2)
599
+ first_frame_latents = latents[:, :, -1, :, :]
600
+ full_video_latent[:, :, start_idx:start_idx + video_length, :, :] = latents
601
+
602
+ latents = None
603
+ start_idx += (video_length - 1)
604
+
605
+ # video = self.decode_latents(latents, first_frames)
606
+ video = self.decode_latents(full_video_latent)
607
+
608
+ # Convert to tensor
609
+ if output_type == "tensor":
610
+ video = torch.from_numpy(video)
611
+
612
+ if not return_dict:
613
+ return video
614
+
615
+ return AnimationPipelineOutput(videos=video)
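A hedged usage sketch for the pipeline above, assuming a locally saved pipeline in Diffusers format; the checkpoint path, image path, and argument values are placeholders, but the keyword names follow __call__ as defined in this file.

import torch

pipe = AutoregressiveAnimationPipeline.from_pretrained(
    "path/to/consisti2v_checkpoint",              # placeholder, not a real repo id
    torch_dtype=torch.float16,
).to("cuda")

out = pipe(
    prompt=["a timelapse of clouds over a mountain lake"],
    first_frame_paths=["first_frame.png"],        # conditioning image on disk (placeholder path)
    video_length=16,                              # frames per autoregressive chunk
    height=256, width=256,
    num_inference_steps=50,
    guidance_scale_txt=7.5,
    guidance_scale_img=1.0,
    autoregress_steps=3,                          # chunks chained by reusing the last generated frame
    generator=torch.Generator("cuda").manual_seed(42),
)
video = out.videos                                # (b, c, f, h, w) tensor in [0, 1]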
src/videogen_hub/pipelines/consisti2v/consisti2v/pipelines/pipeline_conditional_animation.py ADDED
@@ -0,0 +1,695 @@
1
+ # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
2
+
3
+ import inspect
4
+ from typing import Callable, List, Optional, Union
5
+ from dataclasses import dataclass
6
+
7
+ import math
8
+ import numpy as np
9
+ import torch
10
+ from tqdm import tqdm
11
+
12
+ from torchvision import transforms as T
13
+ from torchvision.transforms import functional as F
14
+ from PIL import Image
15
+
16
+ from diffusers.utils import is_accelerate_available
17
+ from packaging import version
18
+ from transformers import CLIPTextModel, CLIPTokenizer
19
+
20
+ from diffusers.configuration_utils import FrozenDict
21
+ from diffusers.models import AutoencoderKL
22
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
23
+ from diffusers.schedulers import (
24
+ DDIMScheduler,
25
+ DPMSolverMultistepScheduler,
26
+ EulerAncestralDiscreteScheduler,
27
+ EulerDiscreteScheduler,
28
+ LMSDiscreteScheduler,
29
+ PNDMScheduler,
30
+ )
31
+ from diffusers.utils import deprecate, logging, BaseOutput
32
+
33
+ from einops import rearrange, repeat
34
+
35
+ from ..models.videoldm_unet import VideoLDMUNet3DConditionModel
36
+
37
+ from ..utils.frameinit_utils import get_freq_filter, freq_mix_3d
38
+
39
+
40
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
+
42
+ # copied from https://github.com/huggingface/diffusers/blob/v0.23.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L59C1-L70C21
43
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
44
+ """
45
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
46
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
47
+ """
48
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
49
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
50
+ # rescale the results from guidance (fixes overexposure)
51
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
52
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
53
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
54
+ return noise_cfg
55
+
56
+ def pan_right(image, num_frames=16, crop_width=256):
57
+ frames = []
58
+ height, width = image.shape[-2:]
59
+
60
+ for i in range(num_frames):
61
+ # Calculate the start position of the crop
62
+ start_x = int((width - crop_width) * (i / num_frames))
63
+ crop = F.crop(image, 0, start_x, height, crop_width)
64
+ frames.append(crop.unsqueeze(0))
65
+
66
+ return torch.cat(frames, dim=0)
67
+
68
+
69
+ def pan_left(image, num_frames=16, crop_width=256):
70
+ frames = []
71
+ height, width = image.shape[-2:]
72
+
73
+ for i in range(num_frames):
74
+ # Start position moves from right to left
75
+ start_x = int((width - crop_width) * (1 - (i / num_frames)))
76
+ crop = F.crop(image, 0, start_x, height, crop_width)
77
+ frames.append(crop.unsqueeze(0))
78
+
79
+ return torch.cat(frames, dim=0)
80
+
81
+
82
+ def zoom_in(image, num_frames=16, crop_width=256, ratio=1.5):
83
+ frames = []
84
+ height, width = image.shape[-2:]
85
+ max_crop_size = min(width, height)
86
+
87
+ for i in range(num_frames):
88
+ # Calculate the size of the crop
89
+ crop_size = max_crop_size - int((max_crop_size - max_crop_size // ratio) * (i / num_frames))
90
+ start_x = (width - crop_size) // 2
91
+ start_y = (height - crop_size) // 2
92
+ crop = F.crop(image, start_y, start_x, crop_size, crop_size)
93
+ resized_crop = F.resize(crop, (crop_width, crop_width), antialias=None) # Resize back to original size
94
+ frames.append(resized_crop.unsqueeze(0))
95
+
96
+ return torch.cat(frames, dim=0)
97
+
98
+
99
+ def zoom_out(image, num_frames=16, crop_width=256, ratio=1.5):
100
+ frames = []
101
+ height, width = image.shape[-2:]
102
+ min_crop_size = int(min(width, height) // ratio) # Start from 1/ratio of the full size (int so F.crop gets integer sizes)
103
+
104
+ for i in range(num_frames):
105
+ # Calculate the size of the crop
106
+ crop_size = min_crop_size + int((min(width, height) - min_crop_size) * (i / num_frames))
107
+ start_x = (width - crop_size) // 2
108
+ start_y = (height - crop_size) // 2
109
+ crop = F.crop(image, start_y, start_x, crop_size, crop_size)
110
+ resized_crop = F.resize(crop, (crop_width, crop_width), antialias=None) # Resize back to original size
111
+ frames.append(resized_crop.unsqueeze(0))
112
+
113
+ return torch.cat(frames, dim=0)
114
+
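A quick usage sketch for the camera-motion helpers above (assuming they are in scope): the input is any C x H x W tensor, and each helper returns a stack of num_frames crops at crop_width x crop_width, simulating a pan or zoom from a single still image.

import torch

image = torch.rand(3, 256, 512)                   # C, H, W; wider than tall so panning has room

pan_r = pan_right(image, num_frames=16, crop_width=256)
pan_l = pan_left(image, num_frames=16, crop_width=256)
zoomed = zoom_in(image, num_frames=16, crop_width=256, ratio=1.5)

print(pan_r.shape, zoomed.shape)                  # torch.Size([16, 3, 256, 256]) for both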
115
+
116
+ @dataclass
117
+ class AnimationPipelineOutput(BaseOutput):
118
+ videos: Union[torch.Tensor, np.ndarray]
119
+
120
+
121
+ class ConditionalAnimationPipeline(DiffusionPipeline):
122
+ _optional_components = []
123
+
124
+ def __init__(
125
+ self,
126
+ vae: AutoencoderKL,
127
+ text_encoder: CLIPTextModel,
128
+ tokenizer: CLIPTokenizer,
129
+ unet: VideoLDMUNet3DConditionModel,
130
+ scheduler: Union[
131
+ DDIMScheduler,
132
+ PNDMScheduler,
133
+ LMSDiscreteScheduler,
134
+ EulerDiscreteScheduler,
135
+ EulerAncestralDiscreteScheduler,
136
+ DPMSolverMultistepScheduler,
137
+ ],
138
+ ):
139
+ super().__init__()
140
+
141
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
142
+ deprecation_message = (
143
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
144
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
145
+ "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
146
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
147
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
148
+ " file"
149
+ )
150
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
151
+ new_config = dict(scheduler.config)
152
+ new_config["steps_offset"] = 1
153
+ scheduler._internal_dict = FrozenDict(new_config)
154
+
155
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
156
+ deprecation_message = (
157
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
158
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
159
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
160
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
161
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
162
+ )
163
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
164
+ new_config = dict(scheduler.config)
165
+ new_config["clip_sample"] = False
166
+ scheduler._internal_dict = FrozenDict(new_config)
167
+
168
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
169
+ version.parse(unet.config._diffusers_version).base_version
170
+ ) < version.parse("0.9.0.dev0")
171
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
172
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
173
+ deprecation_message = (
174
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
175
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
176
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
177
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
178
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
179
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
180
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
181
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
182
+ " the `unet/config.json` file"
183
+ )
184
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
185
+ new_config = dict(unet.config)
186
+ new_config["sample_size"] = 64
187
+ unet._internal_dict = FrozenDict(new_config)
188
+
189
+ self.register_modules(
190
+ vae=vae,
191
+ text_encoder=text_encoder,
192
+ tokenizer=tokenizer,
193
+ unet=unet,
194
+ scheduler=scheduler,
195
+ )
196
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
197
+
198
+ self.freq_filter = None
199
+
200
+ @torch.no_grad()
201
+ def init_filter(self, video_length, height, width, filter_params):
202
+ # initialize frequency filter for noise reinitialization
203
+ batch_size = 1
204
+ num_channels_latents = self.unet.config.in_channels
205
+ filter_shape = [
206
+ batch_size,
207
+ num_channels_latents,
208
+ video_length,
209
+ height // self.vae_scale_factor,
210
+ width // self.vae_scale_factor
211
+ ]
212
+ # self.freq_filter = get_freq_filter(filter_shape, device=self._execution_device, params=filter_params)
213
+ self.freq_filter = get_freq_filter(
214
+ filter_shape,
215
+ device=self._execution_device,
216
+ filter_type=filter_params.method,
217
+ n=filter_params.n if filter_params.method=="butterworth" else None,
218
+ d_s=filter_params.d_s,
219
+ d_t=filter_params.d_t
220
+ )
221
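+ # Note: `filter_params` is expected to expose `method` ("gaussian" | "ideal" | "box" |
+ # "butterworth"), `n` (Butterworth order), and `d_s` / `d_t` (normalized spatial / temporal
+ # cutoffs). Illustrative sketch with an OmegaConf node (values are placeholders):
+ #   filter_params = OmegaConf.create({"method": "gaussian", "n": 4, "d_s": 0.25, "d_t": 0.25})
+ #   pipeline.init_filter(video_length=16, height=256, width=256, filter_params=filter_params)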
+
222
+ def enable_vae_slicing(self):
223
+ self.vae.enable_slicing()
224
+
225
+ def disable_vae_slicing(self):
226
+ self.vae.disable_slicing()
227
+
228
+ def enable_sequential_cpu_offload(self, gpu_id=0):
229
+ if is_accelerate_available():
230
+ from accelerate import cpu_offload
231
+ else:
232
+ raise ImportError("Please install accelerate via `pip install accelerate`")
233
+
234
+ device = torch.device(f"cuda:{gpu_id}")
235
+
236
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
237
+ if cpu_offloaded_model is not None:
238
+ cpu_offload(cpu_offloaded_model, device)
239
+
240
+
241
+ @property
242
+ def _execution_device(self):
243
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
244
+ return self.device
245
+ for module in self.unet.modules():
246
+ if (
247
+ hasattr(module, "_hf_hook")
248
+ and hasattr(module._hf_hook, "execution_device")
249
+ and module._hf_hook.execution_device is not None
250
+ ):
251
+ return torch.device(module._hf_hook.execution_device)
252
+ return self.device
253
+
254
+ def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
255
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
256
+
257
+ text_inputs = self.tokenizer(
258
+ prompt,
259
+ padding="max_length",
260
+ max_length=self.tokenizer.model_max_length,
261
+ truncation=True,
262
+ return_tensors="pt",
263
+ )
264
+ text_input_ids = text_inputs.input_ids
265
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
266
+
267
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
268
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
269
+ logger.warning(
270
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
271
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
272
+ )
273
+
274
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
275
+ attention_mask = text_inputs.attention_mask.to(device)
276
+ else:
277
+ attention_mask = None
278
+
279
+ text_embeddings = self.text_encoder(
280
+ text_input_ids.to(device),
281
+ attention_mask=attention_mask,
282
+ )
283
+ text_embeddings = text_embeddings[0]
284
+
285
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
286
+ bs_embed, seq_len, _ = text_embeddings.shape
287
+ text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1)
288
+ text_embeddings = text_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
289
+
290
+ # get unconditional embeddings for classifier free guidance
291
+ if do_classifier_free_guidance is not None:
292
+ uncond_tokens: List[str]
293
+ if negative_prompt is None:
294
+ uncond_tokens = [""] * batch_size
295
+ elif type(prompt) is not type(negative_prompt):
296
+ raise TypeError(
297
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
298
+ f" {type(prompt)}."
299
+ )
300
+ elif isinstance(negative_prompt, str):
301
+ uncond_tokens = [negative_prompt]
302
+ elif batch_size != len(negative_prompt):
303
+ raise ValueError(
304
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
305
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
306
+ " the batch size of `prompt`."
307
+ )
308
+ else:
309
+ uncond_tokens = negative_prompt
310
+
311
+ max_length = text_input_ids.shape[-1]
312
+ uncond_input = self.tokenizer(
313
+ uncond_tokens,
314
+ padding="max_length",
315
+ max_length=max_length,
316
+ truncation=True,
317
+ return_tensors="pt",
318
+ )
319
+
320
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
321
+ attention_mask = uncond_input.attention_mask.to(device)
322
+ else:
323
+ attention_mask = None
324
+
325
+ uncond_embeddings = self.text_encoder(
326
+ uncond_input.input_ids.to(device),
327
+ attention_mask=attention_mask,
328
+ )
329
+ uncond_embeddings = uncond_embeddings[0]
330
+
331
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
332
+ seq_len = uncond_embeddings.shape[1]
333
+ uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1)
334
+ uncond_embeddings = uncond_embeddings.view(batch_size * num_videos_per_prompt, seq_len, -1)
335
+
336
+ # For classifier free guidance, we need to do two forward passes.
337
+ # Here we concatenate the unconditional and text embeddings into a single batch
338
+ # to avoid doing two forward passes
339
+ if do_classifier_free_guidance == "text":
340
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
341
+ elif do_classifier_free_guidance == "both":
342
+ text_embeddings = torch.cat([uncond_embeddings, uncond_embeddings, text_embeddings])
343
+
344
+ return text_embeddings
345
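+ # Note: depending on the guidance mode, the returned embedding batch is laid out as
+ #   None   -> [text]
+ #   "text" -> [uncond, text]          (2x batch, a single UNet forward pass)
+ #   "both" -> [uncond, uncond, text]  (3x batch; the first-frame condition differs per chunk)
+ # and is chunked in the same order inside the denoising loop below.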
+
346
+ def decode_latents(self, latents, first_frames=None):
347
+ video_length = latents.shape[2]
348
+ latents = 1 / self.vae.config.scaling_factor * latents
349
+ latents = rearrange(latents, "b c f h w -> (b f) c h w")
350
+ # video = self.vae.decode(latents).sample
351
+ video = []
352
+ for frame_idx in tqdm(range(latents.shape[0]), **self._progress_bar_config):
353
+ video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample)
354
+ video = torch.cat(video)
355
+ video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length)
356
+
357
+ if first_frames is not None:
358
+ first_frames = first_frames.unsqueeze(2)
359
+ video = torch.cat([first_frames, video], dim=2)
360
+
361
+ video = (video / 2 + 0.5).clamp(0, 1)
362
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
363
+ video = video.cpu().float().numpy()
364
+ return video
365
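+ # Note: frames are pushed through the VAE decoder one at a time to bound peak memory; the
+ # returned array has shape (batch, channels, frames, height, width) with values in [0, 1].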
+
366
+ def prepare_extra_step_kwargs(self, generator, eta):
367
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
368
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
369
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
370
+ # and should be between [0, 1]
371
+
372
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
373
+ extra_step_kwargs = {}
374
+ if accepts_eta:
375
+ extra_step_kwargs["eta"] = eta
376
+
377
+ # check if the scheduler accepts generator
378
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
379
+ if accepts_generator:
380
+ extra_step_kwargs["generator"] = generator
381
+ return extra_step_kwargs
382
+
383
+ def check_inputs(self, prompt, height, width, callback_steps, first_frame_paths=None):
384
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
385
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
386
+
387
+ if first_frame_paths is not None and (not isinstance(first_frame_paths, str) and not isinstance(first_frame_paths, list)):
388
+ raise ValueError(f"`first_frame_paths` has to be of type `str` or `list` but is {type(first_frame_paths)}")
389
+
390
+ if height % 8 != 0 or width % 8 != 0:
391
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
392
+
393
+ if (callback_steps is None) or (
394
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
395
+ ):
396
+ raise ValueError(
397
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
398
+ f" {type(callback_steps)}."
399
+ )
400
+
401
+ def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None, noise_sampling_method="vanilla", noise_alpha=1.0):
402
+ shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
403
+ if isinstance(generator, list) and len(generator) != batch_size:
404
+ raise ValueError(
405
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
406
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
407
+ )
408
+ if latents is None:
409
+ rand_device = "cpu" if device.type == "mps" else device
410
+
411
+ if isinstance(generator, list):
412
+ # shape = shape
413
+ shape = (1,) + shape[1:]
414
+ if noise_sampling_method == "vanilla":
415
+ latents = [
416
+ torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
417
+ for i in range(batch_size)
418
+ ]
419
+ elif noise_sampling_method == "pyoco_mixed":
420
+ base_shape = (1, num_channels_latents, 1, height // self.vae_scale_factor, width // self.vae_scale_factor)  # one sample at a time; the loop below assembles the batch
421
+ latents = []
422
+ noise_alpha_squared = noise_alpha ** 2
423
+ for i in range(batch_size):
424
+ base_latent = torch.randn(base_shape, generator=generator[i], device=rand_device, dtype=dtype) * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared))
425
+ ind_latent = torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
426
+ latents.append(base_latent + ind_latent)
427
+ elif noise_sampling_method == "pyoco_progressive":
428
+ latents = []
429
+ noise_alpha_squared = noise_alpha ** 2
430
+ for i in range(batch_size):
431
+ latent = torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
432
+ ind_latent = torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
433
+ for j in range(1, video_length):
434
+ latent[:, :, j, :, :] = latent[:, :, j - 1, :, :] * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared)) + ind_latent[:, :, j, :, :]
435
+ latents.append(latent)
436
+ latents = torch.cat(latents, dim=0).to(device)
437
+ else:
438
+ if noise_sampling_method == "vanilla":
439
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
440
+ elif noise_sampling_method == "pyoco_mixed":
441
+ noise_alpha_squared = noise_alpha ** 2
442
+ base_shape = (batch_size, num_channels_latents, 1, height // self.vae_scale_factor, width // self.vae_scale_factor)
443
+ base_latents = torch.randn(base_shape, generator=generator, device=rand_device, dtype=dtype) * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared))
444
+ ind_latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
445
+ latents = base_latents + ind_latents
446
+ elif noise_sampling_method == "pyoco_progressive":
447
+ noise_alpha_squared = noise_alpha ** 2
448
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype)
449
+ ind_latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype) * math.sqrt(1 / (1 + noise_alpha_squared))
450
+ for j in range(1, video_length):
451
+ latents[:, :, j, :, :] = latents[:, :, j - 1, :, :] * math.sqrt((noise_alpha_squared) / (1 + noise_alpha_squared)) + ind_latents[:, :, j, :, :]
452
+ else:
453
+ if latents.shape != shape:
454
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
455
+ latents = latents.to(device)
456
+
457
+ # scale the initial noise by the standard deviation required by the scheduler
458
+ latents = latents * self.scheduler.init_noise_sigma
459
+ return latents
460
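+ # Note on the "pyoco" noise options above (sketch of the underlying formula): with
+ # alpha = noise_alpha and eps_base, eps_ind_t ~ N(0, I),
+ #   pyoco_mixed:       eps_t = sqrt(alpha^2 / (1 + alpha^2)) * eps_base  + sqrt(1 / (1 + alpha^2)) * eps_ind_t
+ #   pyoco_progressive: eps_t = sqrt(alpha^2 / (1 + alpha^2)) * eps_{t-1} + sqrt(1 / (1 + alpha^2)) * eps_ind_t
+ # Both keep each frame's noise at unit variance (alpha^2/(1+alpha^2) + 1/(1+alpha^2) = 1)
+ # while correlating the noise across frames.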
+
461
+ @torch.no_grad()
462
+ def __call__(
463
+ self,
464
+ prompt: Union[str, List[str]],
465
+ video_length: Optional[int],
466
+ height: Optional[int] = None,
467
+ width: Optional[int] = None,
468
+ num_inference_steps: int = 50,
469
+ guidance_scale_txt: float = 7.5,
470
+ guidance_scale_img: float = 2.0,
471
+ negative_prompt: Optional[Union[str, List[str]]] = None,
472
+ num_videos_per_prompt: Optional[int] = 1,
473
+ eta: float = 0.0,
474
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
475
+ latents: Optional[torch.FloatTensor] = None,
476
+ output_type: Optional[str] = "tensor",
477
+ return_dict: bool = True,
478
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
479
+ callback_steps: Optional[int] = 1,
480
+ # additional
481
+ first_frame_paths: Optional[Union[str, List[str]]] = None,
482
+ first_frames: Optional[torch.FloatTensor] = None,
483
+ noise_sampling_method: str = "vanilla",
484
+ noise_alpha: float = 1.0,
485
+ guidance_rescale: float = 0.0,
486
+ frame_stride: Optional[int] = None,
487
+ use_frameinit: bool = False,
488
+ frameinit_noise_level: int = 999,
489
+ camera_motion: str = None,
490
+ **kwargs,
491
+ ):
492
+ if first_frame_paths is not None and first_frames is not None:
493
+ raise ValueError("Only one of `first_frame_paths` and `first_frames` can be passed.")
494
+ # Default height and width to unet
495
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
496
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
497
+
498
+ # Check inputs. Raise error if not correct
499
+ self.check_inputs(prompt, height, width, callback_steps, first_frame_paths)
500
+
501
+ # Define call parameters
502
+ # batch_size = 1 if isinstance(prompt, str) else len(prompt)
503
+ batch_size = 1
504
+ if latents is not None:
505
+ batch_size = latents.shape[0]
506
+ if isinstance(prompt, list):
507
+ batch_size = len(prompt)
508
+ first_frame_input = first_frame_paths if first_frame_paths is not None else first_frames
509
+ if first_frame_input is not None:
510
+ assert len(prompt) == len(first_frame_input), "prompt and first_frame_paths should have the same length"
511
+
512
+ device = self._execution_device
513
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
514
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
515
+ # corresponds to doing no classifier free guidance.
516
+ do_classifier_free_guidance = None
517
+ # two guidance mode: text and text+image
518
+ if guidance_scale_txt > 1.0:
519
+ do_classifier_free_guidance = "text"
520
+ if guidance_scale_img > 1.0:
521
+ do_classifier_free_guidance = "both"
522
+
523
+ # Encode input prompt
524
+ prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
525
+ if negative_prompt is not None:
526
+ negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size
527
+ text_embeddings = self._encode_prompt(
528
+ prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
529
+ )
530
+
531
+ # Encode input first frame
532
+ first_frame_latents = None
533
+ if first_frame_paths is not None:
534
+ first_frame_paths = first_frame_paths if isinstance(first_frame_paths, list) else [first_frame_paths] * batch_size
535
+ if camera_motion is None:
536
+ img_transform = T.Compose([
537
+ T.ToTensor(),
538
+ T.Resize(height, antialias=None),
539
+ T.CenterCrop((height, width)),
540
+ T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
541
+ ])
542
+ elif camera_motion == "pan_left" or camera_motion == "pan_right":
543
+ img_transform = T.Compose([
544
+ T.ToTensor(),
545
+ T.Resize(height, antialias=None),
546
+ T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
547
+ ])
548
+ elif camera_motion == "zoom_out" or camera_motion == "zoom_in":
549
+ img_transform = T.Compose([
550
+ T.ToTensor(),
551
+ T.Resize(height * 2, antialias=None),
552
+ T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
553
+ ])
554
+
555
+ first_frames = []
556
+ for first_frame_path in first_frame_paths:
557
+ first_frame = Image.open(first_frame_path).convert('RGB')
558
+ first_frame = img_transform(first_frame)
559
+ if camera_motion is not None:
560
+ if camera_motion == "pan_left":
561
+ first_frame = pan_left(first_frame, num_frames=video_length, crop_width=width)
562
+ elif camera_motion == "pan_right":
563
+ first_frame = pan_right(first_frame, num_frames=video_length, crop_width=width)
564
+ elif camera_motion == "zoom_in":
565
+ first_frame = zoom_in(first_frame, num_frames=video_length, crop_width=width)
566
+ elif camera_motion == "zoom_out":
567
+ first_frame = zoom_out(first_frame, num_frames=video_length, crop_width=width)
568
+ else:
569
+ raise NotImplementedError(f"camera_motion: {camera_motion} is not implemented.")
570
+ first_frames.append(first_frame.unsqueeze(0))
571
+ first_frames = torch.cat(first_frames, dim=0)
572
+ if first_frames is not None:
573
+ first_frames = first_frames.to(device, dtype=self.vae.dtype)
574
+ if camera_motion is not None:
575
+ first_frames = rearrange(first_frames, "b f c h w -> (b f) c h w")
576
+ first_frame_latents = self.vae.encode(first_frames).latent_dist
577
+ first_frame_latents = first_frame_latents.sample()
578
+ first_frame_latents = first_frame_latents * self.vae.config.scaling_factor # b, c, h, w
579
+ first_frame_static_vid = rearrange(first_frame_latents, "(b f) c h w -> b c f h w", f=video_length if camera_motion is not None else 1)
580
+ first_frame_latents = first_frame_static_vid[:, :, 0, :, :]
581
+ first_frame_latents = repeat(first_frame_latents, "b c h w -> (b n) c h w", n=num_videos_per_prompt)
582
+ first_frames = repeat(first_frames, "b c h w -> (b n) c h w", n=num_videos_per_prompt)
583
+
584
+ if use_frameinit and camera_motion is None:
585
+ first_frame_static_vid = repeat(first_frame_static_vid, "b c 1 h w -> b c t h w", t=video_length)
586
+
587
+ # self._progress_bar_config = {}
588
+ # vid = self.decode_latents(first_frame_static_vid)
589
+ # vid = torch.from_numpy(vid)
590
+ # from ..utils.util import save_videos_grid
591
+ # save_videos_grid(vid, "samples/debug/camera_motion/first_frame_static_vid.mp4", fps=8)
592
+
593
+ # Prepare timesteps
594
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
595
+ timesteps = self.scheduler.timesteps
596
+
597
+ # Prepare latent variables
598
+ num_channels_latents = self.unet.config.in_channels
599
+ latents = self.prepare_latents(
600
+ batch_size * num_videos_per_prompt,
601
+ num_channels_latents,
602
+ video_length,
603
+ height,
604
+ width,
605
+ text_embeddings.dtype,
606
+ device,
607
+ generator,
608
+ latents,
609
+ noise_sampling_method,
610
+ noise_alpha,
611
+ )
612
+ latents_dtype = latents.dtype
613
+
614
+ if use_frameinit:
615
+ current_diffuse_timestep = frameinit_noise_level # diffuse to t noise level
616
+ diffuse_timesteps = torch.full((batch_size,),int(current_diffuse_timestep))
617
+ diffuse_timesteps = diffuse_timesteps.long()
618
+ z_T = self.scheduler.add_noise(
619
+ original_samples=first_frame_static_vid.to(device),
620
+ noise=latents.to(device),
621
+ timesteps=diffuse_timesteps.to(device)
622
+ )
623
+ latents = freq_mix_3d(z_T.to(dtype=torch.float32), latents.to(dtype=torch.float32), LPF=self.freq_filter)
624
+ latents = latents.to(dtype=latents_dtype)
625
+
626
+ if first_frame_latents is not None:
627
+ first_frame_noisy_latent = latents[:, :, 0, :, :]
628
+ latents = latents[:, :, 1:, :, :]
629
+
630
+ # Prepare extra step kwargs.
631
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
632
+
633
+ # Denoising loop
634
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
635
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
636
+ for i, t in enumerate(timesteps):
637
+ # expand the latents if we are doing classifier free guidance
638
+ if do_classifier_free_guidance is None:
639
+ latent_model_input = latents
640
+ elif do_classifier_free_guidance == "text":
641
+ latent_model_input = torch.cat([latents] * 2)
642
+ elif do_classifier_free_guidance == "both":
643
+ latent_model_input = torch.cat([latents] * 3)
644
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
645
+ if first_frame_latents is not None:
646
+ if do_classifier_free_guidance is None:
647
+ first_frame_latents_input = first_frame_latents
648
+ elif do_classifier_free_guidance == "text":
649
+ first_frame_latents_input = torch.cat([first_frame_latents] * 2)
650
+ elif do_classifier_free_guidance == "both":
651
+ first_frame_latents_input = torch.cat([first_frame_noisy_latent, first_frame_latents, first_frame_latents])
652
+
653
+ first_frame_latents_input = first_frame_latents_input.unsqueeze(2)
654
+
655
+ # predict the noise residual
656
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, first_frame_latents=first_frame_latents_input, frame_stride=frame_stride).sample.to(dtype=latents_dtype)
657
+ else:
658
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample.to(dtype=latents_dtype)
659
+
660
+ # perform guidance
661
+ if do_classifier_free_guidance:
662
+ if do_classifier_free_guidance == "text":
663
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
664
+ noise_pred = noise_pred_uncond + guidance_scale_txt * (noise_pred_text - noise_pred_uncond)
665
+ elif do_classifier_free_guidance == "both":
666
+ noise_pred_uncond, noise_pred_img, noise_pred_both = noise_pred.chunk(3)
667
+ noise_pred = noise_pred_uncond + guidance_scale_img * (noise_pred_img - noise_pred_uncond) + guidance_scale_txt * (noise_pred_both - noise_pred_img)
668
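+ # Note: in the "both" mode the two scales compose sequentially, i.e.
+ #   eps = eps_uncond + w_img * (eps_img - eps_uncond) + w_txt * (eps_txt_img - eps_img),
+ # so image guidance is applied first and text guidance is applied on top of the
+ # image-conditioned prediction (analogous to InstructPix2Pix-style two-condition CFG).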
+
669
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
670
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
671
+ # currently only support text guidance
672
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
673
+
674
+ # compute the previous noisy sample x_t -> x_t-1
675
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
676
+
677
+ # call the callback, if provided
678
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
679
+ progress_bar.update()
680
+ if callback is not None and i % callback_steps == 0:
681
+ callback(i, t, latents)
682
+
683
+ # Post-processing
684
+ latents = torch.cat([first_frame_latents.unsqueeze(2), latents], dim=2)
685
+ # video = self.decode_latents(latents, first_frames)
686
+ video = self.decode_latents(latents)
687
+
688
+ # Convert to tensor
689
+ if output_type == "tensor":
690
+ video = torch.from_numpy(video)
691
+
692
+ if not return_dict:
693
+ return video
694
+
695
+ return AnimationPipelineOutput(videos=video)
src/videogen_hub/pipelines/consisti2v/consisti2v/utils/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/consisti2v/utils/frameinit_utils.py ADDED
@@ -0,0 +1,142 @@
1
+ # modified from https://github.com/TianxingWu/FreeInit/blob/master/freeinit_utils.py
2
+ import torch
3
+ import torch.fft as fft
4
+ import math
5
+
6
+
7
+ def freq_mix_3d(x, noise, LPF):
8
+ """
9
+ Noise reinitialization.
10
+
11
+ Args:
12
+ x: diffused latent
13
+ noise: randomly sampled noise
14
+ LPF: low pass filter
15
+ """
16
+ # FFT
17
+ x_freq = fft.fftn(x, dim=(-3, -2, -1))
18
+ x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
19
+ noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
20
+ noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
21
+
22
+ # frequency mix
23
+ HPF = 1 - LPF
24
+ x_freq_low = x_freq * LPF
25
+ noise_freq_high = noise_freq * HPF
26
+ x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain
27
+
28
+ # IFFT
29
+ x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
30
+ x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
31
+
32
+ return x_mixed
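+ # Usage sketch (illustrative, shapes are placeholders): the pipeline builds a low-pass filter
+ # once, diffuses the static first-frame video to a high noise level, and keeps its
+ # low-frequency band while taking the high-frequency band from fresh noise:
+ #   LPF = get_freq_filter((1, 4, 16, 32, 32), device, filter_type="gaussian", n=None, d_s=0.25, d_t=0.25)
+ #   z_T = scheduler.add_noise(first_frame_static_video, noise, timesteps)
+ #   latents = freq_mix_3d(z_T.float(), noise.float(), LPF)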
33
+
34
+
35
+ def get_freq_filter(shape, device, filter_type, n, d_s, d_t):
36
+ """
37
+ Form the frequency filter for noise reinitialization.
38
+
39
+ Args:
40
+ shape: shape of latent (B, C, T, H, W)
41
+ filter_type: type of the freq filter
42
+ n: (only for butterworth) order of the filter, larger n ~ ideal, smaller n ~ gaussian
43
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
44
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
45
+ """
46
+ if filter_type == "gaussian":
47
+ return gaussian_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device)
48
+ elif filter_type == "ideal":
49
+ return ideal_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device)
50
+ elif filter_type == "box":
51
+ return box_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device)
52
+ elif filter_type == "butterworth":
53
+ return butterworth_low_pass_filter(shape=shape, n=n, d_s=d_s, d_t=d_t).to(device)
54
+ else:
55
+ raise NotImplementedError
56
+
57
+
58
+ def gaussian_low_pass_filter(shape, d_s=0.25, d_t=0.25):
59
+ """
60
+ Compute the gaussian low pass filter mask.
61
+
62
+ Args:
63
+ shape: shape of the filter (volume)
64
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
65
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
66
+ """
67
+ T, H, W = shape[-3], shape[-2], shape[-1]
68
+ mask = torch.zeros(shape)
69
+ if d_s==0 or d_t==0:
70
+ return mask
71
+ for t in range(T):
72
+ for h in range(H):
73
+ for w in range(W):
74
+ d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2)
75
+ mask[..., t,h,w] = math.exp(-1/(2*d_s**2) * d_square)
76
+ return mask
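+ # Note: the triple Python loop above is O(T*H*W); a vectorized sketch that produces the same
+ # mask (same normalized-distance definition, kept here only as a comment):
+ #   ts = (torch.arange(T) * 2 / T - 1).view(T, 1, 1) * (d_s / d_t)
+ #   hs = (torch.arange(H) * 2 / H - 1).view(1, H, 1)
+ #   ws = (torch.arange(W) * 2 / W - 1).view(1, 1, W)
+ #   mask[...] = torch.exp(-(ts**2 + hs**2 + ws**2) / (2 * d_s**2))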
77
+
78
+
79
+ def butterworth_low_pass_filter(shape, n=4, d_s=0.25, d_t=0.25):
80
+ """
81
+ Compute the butterworth low pass filter mask.
82
+
83
+ Args:
84
+ shape: shape of the filter (volume)
85
+ n: order of the filter, larger n ~ ideal, smaller n ~ gaussian
86
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
87
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
88
+ """
89
+ T, H, W = shape[-3], shape[-2], shape[-1]
90
+ mask = torch.zeros(shape)
91
+ if d_s==0 or d_t==0:
92
+ return mask
93
+ for t in range(T):
94
+ for h in range(H):
95
+ for w in range(W):
96
+ d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2)
97
+ mask[..., t,h,w] = 1 / (1 + (d_square / d_s**2)**n)
98
+ return mask
99
+
100
+
101
+ def ideal_low_pass_filter(shape, d_s=0.25, d_t=0.25):
102
+ """
103
+ Compute the ideal low pass filter mask.
104
+
105
+ Args:
106
+ shape: shape of the filter (volume)
107
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
108
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
109
+ """
110
+ T, H, W = shape[-3], shape[-2], shape[-1]
111
+ mask = torch.zeros(shape)
112
+ if d_s==0 or d_t==0:
113
+ return mask
114
+ for t in range(T):
115
+ for h in range(H):
116
+ for w in range(W):
117
+ d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2)
118
+ mask[..., t,h,w] = 1 if d_square <= d_s*2 else 0
119
+ return mask
120
+
121
+
122
+ def box_low_pass_filter(shape, d_s=0.25, d_t=0.25):
123
+ """
124
+ Compute the ideal low pass filter mask (approximated version).
125
+
126
+ Args:
127
+ shape: shape of the filter (volume)
128
+ d_s: normalized stop frequency for spatial dimensions (0.0-1.0)
129
+ d_t: normalized stop frequency for temporal dimension (0.0-1.0)
130
+ """
131
+ T, H, W = shape[-3], shape[-2], shape[-1]
132
+ mask = torch.zeros(shape)
133
+ if d_s==0 or d_t==0:
134
+ return mask
135
+
136
+ threshold_s = round(int(H // 2) * d_s)
137
+ threshold_t = round(T // 2 * d_t)
138
+
139
+ cframe, crow, ccol = T // 2, H // 2, W //2
140
+ mask[..., cframe - threshold_t:cframe + threshold_t, crow - threshold_s:crow + threshold_s, ccol - threshold_s:ccol + threshold_s] = 1.0
141
+
142
+ return mask
src/videogen_hub/pipelines/consisti2v/consisti2v/utils/util.py ADDED
@@ -0,0 +1,165 @@
1
+ import os
2
+ import imageio
3
+ import numpy as np
4
+ from typing import Union
5
+
6
+ import torch
7
+ import torchvision
8
+ import torch.distributed as dist
9
+ import wandb as wandb_lib  # aliased so the boolean `wandb` argument of save_videos_grid does not shadow the module
10
+
11
+ from tqdm import tqdm
12
+ from einops import rearrange
13
+
14
+ from torchmetrics.image.fid import _compute_fid
15
+
16
+
17
+ def zero_rank_print(s):
18
+ if (not dist.is_initialized()) or (dist.is_initialized() and dist.get_rank() == 0): print("### " + s)
19
+
20
+
21
+ def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, wandb=False, global_step=0, format="gif"):
22
+ videos = rearrange(videos, "b c t h w -> t b c h w")
23
+ outputs = []
24
+ for x in videos:
25
+ x = torchvision.utils.make_grid(x, nrow=n_rows)
26
+ x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
27
+ if rescale:
28
+ x = (x + 1.0) / 2.0 # -1,1 -> 0,1
29
+ x = (x * 255).numpy().astype(np.uint8)
30
+ outputs.append(x)
31
+
32
+ if wandb:
33
+ wandb_video = wandb_lib.Video(outputs, fps=fps)
34
+ wandb_lib.log({"val_videos": wandb_video}, step=global_step)
35
+
36
+ os.makedirs(os.path.dirname(path), exist_ok=True)
37
+ if format == "gif":
38
+ imageio.mimsave(path, outputs, fps=fps)
39
+ elif format == "mp4":
40
+ torchvision.io.write_video(path, np.array(outputs), fps=fps, video_codec='h264', options={'crf': '10'})
41
+
42
+ # DDIM Inversion
43
+ @torch.no_grad()
44
+ def init_prompt(prompt, pipeline):
45
+ uncond_input = pipeline.tokenizer(
46
+ [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length,
47
+ return_tensors="pt"
48
+ )
49
+ uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0]
50
+ text_input = pipeline.tokenizer(
51
+ [prompt],
52
+ padding="max_length",
53
+ max_length=pipeline.tokenizer.model_max_length,
54
+ truncation=True,
55
+ return_tensors="pt",
56
+ )
57
+ text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0]
58
+ context = torch.cat([uncond_embeddings, text_embeddings])
59
+
60
+ return context
61
+
62
+
63
+ def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int,
64
+ sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler):
65
+ timestep, next_timestep = min(
66
+ timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999), timestep
67
+ alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod
68
+ alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep]
69
+ beta_prod_t = 1 - alpha_prod_t
70
+ next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
71
+ next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
72
+ next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction
73
+ return next_sample
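+ # Note: this is one reverse-DDIM (inversion) step; with a_t = alphas_cumprod[t],
+ #   x0_hat  = (x_t - sqrt(1 - a_t) * eps) / sqrt(a_t)
+ #   x_{t+1} = sqrt(a_{t+1}) * x0_hat + sqrt(1 - a_{t+1}) * eps
+ # `ddim_loop` below applies it repeatedly to map clean latents back towards noise.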
74
+
75
+
76
+ def get_noise_pred_single(latents, t, context, first_frame_latents, frame_stride, unet):
77
+ noise_pred = unet(latents, t, encoder_hidden_states=context, first_frame_latents=first_frame_latents, frame_stride=frame_stride).sample
78
+ return noise_pred
79
+
80
+
81
+ @torch.no_grad()
82
+ def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt, first_frame_latents, frame_stride):
83
+ context = init_prompt(prompt, pipeline)
84
+ uncond_embeddings, cond_embeddings = context.chunk(2)
85
+ all_latent = [latent]
86
+ latent = latent.clone().detach()
87
+ for i in tqdm(range(num_inv_steps)):
88
+ t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1]
89
+ noise_pred = get_noise_pred_single(latent, t, cond_embeddings, first_frame_latents, frame_stride, pipeline.unet)
90
+ latent = next_step(noise_pred, t, latent, ddim_scheduler)
91
+ all_latent.append(latent)
92
+ return all_latent
93
+
94
+
95
+ @torch.no_grad()
96
+ def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt="", first_frame_latents=None, frame_stride=3):
97
+ ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt, first_frame_latents, frame_stride)
98
+ return ddim_latents
99
+
100
+
101
+ def compute_fid(real_features, fake_features, num_features, device):
102
+ orig_dtype = real_features.dtype
103
+
104
+ mx_num_feats = (num_features, num_features)
105
+ real_features_sum = torch.zeros(num_features).double().to(device)
106
+ real_features_cov_sum = torch.zeros(mx_num_feats).double().to(device)
107
+ real_features_num_samples = torch.tensor(0).long().to(device)
108
+
109
+ fake_features_sum = torch.zeros(num_features).double().to(device)
110
+ fake_features_cov_sum = torch.zeros(mx_num_feats).double().to(device)
111
+ fake_features_num_samples = torch.tensor(0).long().to(device)
112
+
113
+ real_features = real_features.double()
114
+ fake_features = fake_features.double()
115
+
116
+ real_features_sum += real_features.sum(dim=0)
117
+ real_features_cov_sum += real_features.t().mm(real_features)
118
+ real_features_num_samples += real_features.shape[0]
119
+
120
+ fake_features_sum += fake_features.sum(dim=0)
121
+ fake_features_cov_sum += fake_features.t().mm(fake_features)
122
+ fake_features_num_samples += fake_features.shape[0]
123
+
124
+ """Calculate FID score based on accumulated extracted features from the two distributions."""
125
+ if real_features_num_samples < 2 or fake_features_num_samples < 2:
126
+ raise RuntimeError("More than one sample is required for both the real and fake distributions to compute FID")
127
+ mean_real = (real_features_sum / real_features_num_samples).unsqueeze(0)
128
+ mean_fake = (fake_features_sum / fake_features_num_samples).unsqueeze(0)
129
+
130
+ cov_real_num = real_features_cov_sum - real_features_num_samples * mean_real.t().mm(mean_real)
131
+ cov_real = cov_real_num / (real_features_num_samples - 1)
132
+ cov_fake_num = fake_features_cov_sum - fake_features_num_samples * mean_fake.t().mm(mean_fake)
133
+ cov_fake = cov_fake_num / (fake_features_num_samples - 1)
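+ # _compute_fid (torchmetrics' internal helper) evaluates the Frechet distance
+ #   ||mu_r - mu_f||^2 + Tr(cov_r + cov_f - 2 * (cov_r @ cov_f)^(1/2))
+ # from the means and covariances accumulated above.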
134
+ return _compute_fid(mean_real.squeeze(0), cov_real, mean_fake.squeeze(0), cov_fake).to(orig_dtype)
135
+
136
+
137
+ def compute_inception_score(gen_probs, num_splits=10):
138
+ num_gen = gen_probs.shape[0]
139
+ gen_probs = gen_probs.detach().cpu().numpy()
140
+ scores = []
141
+ np.random.RandomState(42).shuffle(gen_probs)
142
+ for i in range(num_splits):
143
+ part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits]
144
+ kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True)))
145
+ kl = np.mean(np.sum(kl, axis=1))
146
+ scores.append(np.exp(kl))
147
+ return float(np.mean(scores)), float(np.std(scores))
148
+ # idx = torch.randperm(features.shape[0])
149
+ # features = features[idx]
150
+ # # calculate probs and logits
151
+ # prob = features.softmax(dim=1)
152
+ # log_prob = features.log_softmax(dim=1)
153
+
154
+ # # split into groups
155
+ # prob = prob.chunk(splits, dim=0)
156
+ # log_prob = log_prob.chunk(splits, dim=0)
157
+
158
+ # # calculate score per split
159
+ # mean_prob = [p.mean(dim=0, keepdim=True) for p in prob]
160
+ # kl_ = [p * (log_p - m_p.log()) for p, log_p, m_p in zip(prob, log_prob, mean_prob)]
161
+ # kl_ = [k.sum(dim=1).mean().exp() for k in kl_]
162
+ # kl = torch.stack(kl_)
163
+
164
+ # return mean and std
165
+ # return kl.mean(), kl.std()
src/videogen_hub/pipelines/consisti2v/scripts/__init__.py ADDED
File without changes
src/videogen_hub/pipelines/consisti2v/scripts/animate.py ADDED
@@ -0,0 +1,247 @@
1
+ import argparse
2
+ import datetime
3
+ import random
4
+ import os
5
+ import logging
6
+ from omegaconf import OmegaConf
7
+
8
+ import torch
9
+
10
+ import diffusers
11
+ from diffusers import AutoencoderKL, DDIMScheduler
12
+
13
+ from transformers import CLIPTextModel, CLIPTokenizer
14
+
15
+ from consisti2v.models.videoldm_unet import VideoLDMUNet3DConditionModel
16
+ from consisti2v.pipelines.pipeline_conditional_animation import (
17
+ ConditionalAnimationPipeline,
18
+ )
19
+ from consisti2v.utils.util import save_videos_grid
20
+ from diffusers.utils.import_utils import is_xformers_available
21
+
22
+
23
+ def main(args, config):
24
+ logging.basicConfig(
25
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
26
+ datefmt="%m/%d/%Y %H:%M:%S",
27
+ level=logging.INFO,
28
+ )
29
+ diffusers.utils.logging.set_verbosity_info()
30
+
31
+ time_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
32
+ savedir = f"{config.output_dir}/{config.output_name}-{time_str}"
33
+ os.makedirs(savedir)
34
+
35
+ samples = []
36
+ sample_idx = 0
37
+
38
+ ### >>> create validation pipeline >>> ###
39
+ if config.pipeline_pretrained_path is None:
40
+ noise_scheduler = DDIMScheduler(
41
+ **OmegaConf.to_container(config.noise_scheduler_kwargs)
42
+ )
43
+ tokenizer = CLIPTokenizer.from_pretrained(
44
+ config.pretrained_model_path, subfolder="tokenizer", use_safetensors=True
45
+ )
46
+ text_encoder = CLIPTextModel.from_pretrained(
47
+ config.pretrained_model_path, subfolder="text_encoder"
48
+ )
49
+ vae = AutoencoderKL.from_pretrained(
50
+ config.pretrained_model_path, subfolder="vae", use_safetensors=True
51
+ )
52
+ unet = VideoLDMUNet3DConditionModel.from_pretrained(
53
+ config.pretrained_model_path,
54
+ subfolder="unet",
55
+ variant=config.unet_additional_kwargs["variant"],
56
+ temp_pos_embedding=config.unet_additional_kwargs["temp_pos_embedding"],
57
+ augment_temporal_attention=config.unet_additional_kwargs[
58
+ "augment_temporal_attention"
59
+ ],
60
+ use_temporal=True,
61
+ n_frames=config.sampling_kwargs["n_frames"],
62
+ n_temp_heads=config.unet_additional_kwargs["n_temp_heads"],
63
+ first_frame_condition_mode=config.unet_additional_kwargs[
64
+ "first_frame_condition_mode"
65
+ ],
66
+ use_frame_stride_condition=config.unet_additional_kwargs[
67
+ "use_frame_stride_condition"
68
+ ],
69
+ use_safetensors=True,
70
+ )
71
+
72
+ # 1. unet ckpt
73
+ if config.unet_path is not None:
74
+ if os.path.isdir(config.unet_path):
75
+ unet_dict = VideoLDMUNet3DConditionModel.from_pretrained(
76
+ config.unet_path
77
+ )
78
+ m, u = unet.load_state_dict(unet_dict.state_dict(), strict=False)
79
+ assert len(u) == 0
80
+ del unet_dict
81
+ else:
82
+ checkpoint_dict = torch.load(config.unet_path, map_location="cpu")
83
+ state_dict = (
84
+ checkpoint_dict["state_dict"]
85
+ if "state_dict" in checkpoint_dict
86
+ else checkpoint_dict
87
+ )
88
+ if config.unet_ckpt_prefix is not None:
89
+ state_dict = {
90
+ k.replace(config.unet_ckpt_prefix, ""): v
91
+ for k, v in state_dict.items()
92
+ }
93
+ m, u = unet.load_state_dict(state_dict, strict=False)
94
+ assert len(u) == 0
95
+
96
+ if is_xformers_available() and int(torch.__version__.split(".")[0]) < 2:
97
+ unet.enable_xformers_memory_efficient_attention()
98
+
99
+ pipeline = ConditionalAnimationPipeline(
100
+ vae=vae,
101
+ text_encoder=text_encoder,
102
+ tokenizer=tokenizer,
103
+ unet=unet,
104
+ scheduler=noise_scheduler,
105
+ )
106
+
107
+ else:
108
+ pipeline = ConditionalAnimationPipeline.from_pretrained(
109
+ config.pipeline_pretrained_path
110
+ )
111
+
112
+ pipeline.to("cuda")
113
+
114
+ # (frameinit) initialize frequency filter for noise reinitialization -------------
115
+ if config.frameinit_kwargs.enable:
116
+ pipeline.init_filter(
117
+ width=config.sampling_kwargs.width,
118
+ height=config.sampling_kwargs.height,
119
+ video_length=config.sampling_kwargs.n_frames,
120
+ filter_params=config.frameinit_kwargs.filter_params,
121
+ )
122
+ # -------------------------------------------------------------------------------
123
+ ### <<< create validation pipeline <<< ###
124
+
125
+ if args.prompt is not None:
126
+ prompts = [args.prompt]
127
+ n_prompts = [args.n_prompt]
128
+ first_frame_paths = [args.path_to_first_frame]
129
+ random_seeds = [int(args.seed)] if args.seed != "random" else "random"
130
+ else:
131
+ prompt_config = OmegaConf.load(args.prompt_config)
132
+ prompts = prompt_config.prompts
133
+ n_prompts = (
134
+ list(prompt_config.n_prompts) * len(prompts)
135
+ if len(prompt_config.n_prompts) == 1
136
+ else prompt_config.n_prompts
137
+ )
138
+ first_frame_paths = prompt_config.path_to_first_frames
139
+ random_seeds = prompt_config.seeds
140
+
141
+ if random_seeds == "random":
142
+ random_seeds = [random.randint(0, 100000) for _ in range(len(prompts))]  # randint requires integer bounds
143
+ else:
144
+ random_seeds = (
145
+ [random_seeds] if isinstance(random_seeds, int) else list(random_seeds)
146
+ )
147
+ random_seeds = (
148
+ random_seeds * len(prompts) if len(random_seeds) == 1 else random_seeds
149
+ )
150
+
151
+ config.prompt_kwargs = OmegaConf.create(
152
+ {
153
+ "random_seeds": [],
154
+ "prompts": prompts,
155
+ "n_prompts": n_prompts,
156
+ "first_frame_paths": first_frame_paths,
157
+ }
158
+ )
159
+ for prompt_idx, (prompt, n_prompt, first_frame_path, random_seed) in enumerate(
160
+ zip(prompts, n_prompts, first_frame_paths, random_seeds)
161
+ ):
162
+ # manually set random seed for reproduction
163
+ if random_seed != -1:
164
+ torch.manual_seed(random_seed)
165
+ else:
166
+ torch.seed()
167
+ config.prompt_kwargs.random_seeds.append(torch.initial_seed())
168
+
169
+ print(f"current seed: {torch.initial_seed()}")
170
+ print(f"sampling {prompt} ...")
171
+ sample = pipeline(
172
+ prompt,
173
+ negative_prompt=n_prompt,
174
+ first_frame_paths=first_frame_path,
175
+ num_inference_steps=config.sampling_kwargs.steps,
176
+ guidance_scale_txt=config.sampling_kwargs.guidance_scale_txt,
177
+ guidance_scale_img=config.sampling_kwargs.guidance_scale_img,
178
+ width=config.sampling_kwargs.width,
179
+ height=config.sampling_kwargs.height,
180
+ video_length=config.sampling_kwargs.n_frames,
181
+ noise_sampling_method=config.unet_additional_kwargs[
182
+ "noise_sampling_method"
183
+ ],
184
+ noise_alpha=float(config.unet_additional_kwargs["noise_alpha"]),
185
+ eta=config.sampling_kwargs.ddim_eta,
186
+ frame_stride=config.sampling_kwargs.frame_stride,
187
+ guidance_rescale=config.sampling_kwargs.guidance_rescale,
188
+ num_videos_per_prompt=config.sampling_kwargs.num_videos_per_prompt,
189
+ use_frameinit=config.frameinit_kwargs.enable,
190
+ frameinit_noise_level=config.frameinit_kwargs.noise_level,
191
+ camera_motion=config.frameinit_kwargs.camera_motion,
192
+ ).videos
193
+ samples.append(sample)
194
+
195
+ prompt = "-".join((prompt.replace("/", "").split(" ")[:10])).replace(":", "")
196
+ if sample.shape[0] > 1:
197
+ for cnt, samp in enumerate(sample):
198
+ save_videos_grid(
199
+ samp.unsqueeze(0),
200
+ f"{savedir}/sample/{sample_idx}-{cnt + 1}-{prompt}.{args.format}",
201
+ format=args.format,
202
+ )
203
+ else:
204
+ save_videos_grid(
205
+ sample,
206
+ f"{savedir}/sample/{sample_idx}-{prompt}.{args.format}",
207
+ format=args.format,
208
+ )
209
+ print(f"save to {savedir}/sample/{prompt}.{args.format}")
210
+
211
+ sample_idx += 1
212
+
213
+ samples = torch.concat(samples)
214
+ # save_videos_grid(samples, f"{savedir}/sample.{args.format}", n_rows=4, format=args.format)
215
+
216
+ # OmegaConf.save(config, f"{savedir}/config.yaml")
217
+
218
+ # if args.save_model:
219
+ # pipeline.save_pretrained(f"{savedir}/model")
220
+
221
+ return samples
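+ # Example invocation (paths and prompt below are placeholders, not files shipped with the repo):
+ #   python scripts/animate.py --inference_config configs/inference/inference.yaml \
+ #       -p "a dog running on the beach" -f path/to/first_frame.png --format mp4 --seed 42
+ # Omitting -p falls back to the batch prompts in --prompt_config (configs/prompts/default.yaml).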
222
+
223
+
224
+ if __name__ == "__main__":
225
+ parser = argparse.ArgumentParser()
226
+ parser.add_argument(
227
+ "--inference_config", type=str, default="configs/inference/inference.yaml"
228
+ )
229
+ parser.add_argument("--prompt", "-p", type=str, default=None)
230
+ parser.add_argument("--n_prompt", "-n", type=str, default="")
231
+ parser.add_argument("--seed", type=str, default="random")
232
+ parser.add_argument("--path_to_first_frame", "-f", type=str, default=None)
233
+ parser.add_argument(
234
+ "--prompt_config", type=str, default="configs/prompts/default.yaml"
235
+ )
236
+ parser.add_argument("--format", type=str, default="mp4", choices=["gif", "mp4"])
237
+ parser.add_argument("--save_model", action="store_true")
238
+ parser.add_argument("optional_args", nargs="*", default=[])
239
+ args = parser.parse_args()
240
+
241
+ config = OmegaConf.load(args.inference_config)
242
+
243
+ if args.optional_args:
244
+ modified_config = OmegaConf.from_dotlist(args.optional_args)
245
+ config = OmegaConf.merge(config, modified_config)
246
+
247
+ main(args, config)