import os
import tyro
import imageio
import numpy as np
import tqdm
import torch
import torch.nn.functional as F
from safetensors.torch import load_file
import rembg
import gradio as gr

import kiui
from kiui.op import recenter
from kiui.cam import orbit_camera

from core.utils import get_rays
from core.options import AllConfigs, Options
from core.models import LTRFM_Mesh, LTRFM_NeRF
from core.instant_utils.mesh_util import save_obj, save_obj_with_mtl
from mvdream.pipeline_mvdream import MVDreamPipeline
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
from huggingface_hub import hf_hub_download

import spaces

IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

GRADIO_VIDEO_PATH = 'gradio_output.mp4'
GRADIO_OBJ_PATH = 'gradio_output_rgb.obj'
GRADIO_OBJ_ALBEDO_PATH = 'gradio_output_albedo.obj'
GRADIO_OBJ_SHADING_PATH = 'gradio_output_shading.obj'

# Download the reconstruction-model checkpoint from the Hugging Face Hub
ckpt_path = hf_hub_download(repo_id="rgxie/LDM", filename="LDM_6V_SDF.ckpt")

opt = Options(
    input_size=512,
    down_channels=(32, 64, 128, 256, 512),
    down_attention=(False, False, False, False, True),
    up_channels=(512, 256, 128),
    up_attention=(True, False, False, False),
    volume_mode='TRF_NeRF',
    splat_size=64,
    output_size=62,  # crop patch
    data_mode='s5',
    num_views=8,
    gradient_accumulation_steps=1,
    mixed_precision='bf16',
    resume=ckpt_path,
)

# Model selection
if opt.volume_mode == 'TRF_Mesh':
    model = LTRFM_Mesh(opt)
elif opt.volume_mode == 'TRF_NeRF':
    model = LTRFM_NeRF(opt)
else:
    model = None

# Resume pretrained checkpoint
if opt.resume:
    if opt.resume.endswith('safetensors'):
        ckpt = load_file(opt.resume, device='cpu')
    else:
        ckpt_dict = torch.load(opt.resume, map_location='cpu')
        ckpt = ckpt_dict["model"]

    # Copy matching parameters into the model, warning about renamed or mismatched ones
    state_dict = model.state_dict()
    for k, v in ckpt.items():
        k = k.replace('module.', '')
        if k in state_dict:
            if state_dict[k].shape == v.shape:
                state_dict[k].copy_(v)
            else:
                print(f'[WARN] mismatching shape for param {k}: ckpt {v.shape} != model {state_dict[k].shape}, ignored.')
        else:
            print(f'[WARN] unexpected param {k}: {v.shape}')
    print('[INFO] load resume success!')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.half().to(device)
model.eval()

# Perspective projection matrix shared by all rendering calls
tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
proj_matrix = torch.zeros(4, 4, dtype=torch.float32).to(device)
proj_matrix[0, 0] = 1 / tan_half_fov
proj_matrix[1, 1] = 1 / tan_half_fov
proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
proj_matrix[3, 2] = -(opt.zfar * opt.znear) / (opt.zfar - opt.znear)
proj_matrix[2, 3] = 1

# Load the multi-view diffusion pipelines (text-to-MV, image-to-MV, and Zero123Plus)
pipe_text = MVDreamPipeline.from_pretrained(
    'ashawkey/mvdream-sd2.1-diffusers',
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_text = pipe_text.to(device)

pipe_image = MVDreamPipeline.from_pretrained(
    "ashawkey/imagedream-ipmv-diffusers",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_image = pipe_image.to(device)

pipe_image_plus = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="zero123plus",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_image_plus.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipe_image_plus.scheduler.config, timestep_spacing='trailing'
)

# Use a local Zero123Plus UNet checkpoint if present, otherwise download InstantMesh's fine-tuned one
unet_path = './pretrained/diffusion_pytorch_model.bin'
if os.path.exists(unet_path):
    unet_ckpt_path = unet_path
else:
    unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model")
state_dict = torch.load(unet_ckpt_path, map_location='cpu')
pipe_image_plus.unet.load_state_dict(state_dict, strict=True)
pipe_image_plus = pipe_image_plus.to(device)

# Load rembg background-removal session
bg_remover = rembg.new_session()


@spaces.GPU
def generate_mv(condition_input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42, mv_model_option=None):
    """Generate multi-view images from a text prompt or a single condition image."""
    kiui.seed_everything(input_seed)
    os.makedirs(os.path.join(opt.workspace, "gradio"), exist_ok=True)

    if condition_input_image is None:
        # Text-to-multi-view with MVDream
        mv_image_uint8 = pipe_text(prompt, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=7.5, elevation=input_elevation)
        mv_image_uint8 = (mv_image_uint8 * 255).astype(np.uint8)
        # Remove backgrounds, recenter, and composite each view onto white
        mv_image = []
        for i in range(4):
            image = rembg.remove(mv_image_uint8[i], session=bg_remover)
            image = image.astype(np.float32) / 255
            image = recenter(image, image[..., 0] > 0, border_ratio=0.2)
            image = image[..., :3] * image[..., -1:] + (1 - image[..., -1:])
            mv_image.append(image)
        mv_image_grid = np.concatenate([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=1)
        input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)
        processed_image = None
    else:
        # Image-conditioned generation: remove background and recenter the input first
        condition_input_image = np.array(condition_input_image)
        carved_image = rembg.remove(condition_input_image, session=bg_remover)
        mask = carved_image[..., -1] > 0
        image = recenter(carved_image, mask, border_ratio=0.2)
        image = image.astype(np.float32) / 255.0
        processed_image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])

        if mv_model_option == 'mvdream':
            mv_image = pipe_image(prompt, processed_image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0, elevation=input_elevation)
            mv_image_grid = np.concatenate([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=1)
            input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)
        else:
            from PIL import Image
            from einops import rearrange
            # Zero123Plus returns a single 3x2 grid image; split it into 6 separate views
            processed_image = Image.fromarray((processed_image * 255).astype(np.uint8))
            mv_image = pipe_image_plus(processed_image, num_inference_steps=input_num_steps).images[0]
            mv_image = np.asarray(mv_image, dtype=np.float32) / 255.0
            mv_image = torch.from_numpy(mv_image).permute(2, 0, 1).contiguous().float()
            mv_image_grid = rearrange(mv_image, 'c (n h) (m w) -> (m h) (n w) c', n=3, m=2).numpy()
            mv_image = rearrange(mv_image, 'c (n h) (m w) -> (n m) h w c', n=3, m=2).numpy()
            input_image = mv_image

    return mv_image_grid, processed_image, input_image


@spaces.GPU
def generate_3d(input_image, condition_input_image, mv_model_option=None, input_seed=42):
    """Reconstruct a 3D model (and a turntable video) from the generated multi-view images."""
    kiui.seed_everything(input_seed)

    output_obj_rgb_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_PATH)
    output_obj_albedo_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_ALBEDO_PATH)
    output_obj_shading_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_SHADING_PATH)
    output_video_path = os.path.join(opt.workspace, "gradio", GRADIO_VIDEO_PATH)

    # [V, H, W, 3] -> [V, 3, H, W], resized to the reconstruction model's input size
    input_image = torch.from_numpy(input_image).permute(0, 3, 1, 2).float().to(device)
    input_image = F.interpolate(input_image, size=(opt.input_size, opt.input_size), mode='bilinear', align_corners=False)

    input_rays_o, input_rays_d = get_rays(opt, proj_matrix, device, 'center')
    with torch.no_grad():
        preds = model(
            cond_img=input_image,
            rays=(input_rays_o, input_rays_d)
        )
    pred_rgb = preds[0].permute(0, 2, 3, 1).contiguous().cpu().numpy()
    pred_albedo = preds[1].permute(0, 2, 3, 1).contiguous().cpu().numpy()
    pred_shading = preds[2].permute(0, 2, 3, 1).contiguous().cpu().numpy()

    save_obj(output_obj_rgb_path, pred_rgb)
    save_obj_with_mtl(output_obj_albedo_path, pred_albedo, mode="albedo")
    save_obj_with_mtl(output_obj_shading_path, pred_shading, mode="shading")

    # Render a 360-degree turntable video.
    # Note: kiui.cam.orbit_camera takes (elevation, azimuth, radius); a fixed 0-degree
    # elevation is assumed here while sweeping the azimuth in 10-degree steps.
    azimuths = np.arange(0, 360, 10, dtype=np.int32)
    output_frames = []
    for azimuth in tqdm.tqdm(azimuths, ncols=0):
        pose = orbit_camera(0, azimuth, radius=2.5)
        with torch.no_grad():
            preds = model(cond_img=input_image, rays=get_rays(opt, proj_matrix, device, pose))
        pred_rgb = preds[0].permute(0, 2, 3, 1).contiguous().cpu().numpy()
        output_frames.append(pred_rgb[0])
    # Convert [0, 1] floats to uint8 frames before writing the mp4
    output_frames = (np.stack(output_frames, axis=0) * 255).astype(np.uint8)
    imageio.mimwrite(output_video_path, output_frames, fps=24, quality=8)

    return output_obj_rgb_path, output_obj_albedo_path, output_obj_shading_path, output_video_path


def update_mv_model(mv_model_option):
    # Toggle visibility of model-specific controls when the multi-view model changes
    if mv_model_option == 'mvdream':
        return gr.update(visible=False)
    else:
        return gr.update(visible=True)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        "## Generate 3D object from text or image prompt"
    )
    with gr.Row():
        with gr.Column():
            input_prompt = gr.Textbox(label="Prompt", lines=3)
            input_neg_prompt = gr.Textbox(label="Negative Prompt", lines=1, value="")
            input_image = gr.Image(label="Input Image", type='numpy')
            input_seed = gr.Slider(minimum=0, maximum=65535, step=1, label="Random Seed", value=42)
            input_elevation = gr.Slider(minimum=-10, maximum=10, step=1, label="Elevation", value=0)
            input_num_steps = gr.Slider(minimum=1, maximum=150, step=1, label="Number of Inference Steps", value=30)
            mv_model_option = gr.Radio(
                ["mvdream", "zero123plus"], label="Model Option", value="mvdream", interactive=True
            )
            generate_mv_button = gr.Button(value="Generate Multi-View Images")
        with gr.Column():
            gr.Markdown("### Multi-View Images")
            multi_view_output = gr.Image()
            gr.Markdown("### Input Image (Processed)")
            processed_image_output = gr.Image()
            # The stacked multi-view array is kept in a State (a gr.Image cannot hold a [V, H, W, 3] batch)
            input_image_output = gr.State()
            generate_3d_button = gr.Button(value="Generate 3D Model")
            output_obj_rgb = gr.File(label="RGB 3D Model (.obj)")
            output_obj_albedo = gr.File(label="Albedo 3D Model (.obj)")
            output_obj_shading = gr.File(label="Shading 3D Model (.obj)")
            output_video = gr.Video(label="360° View of the Generated 3D Model (.mp4)")

    # Wire the buttons after all components exist so inputs/outputs reference actual components
    generate_mv_button.click(
        fn=generate_mv,
        inputs=[input_image, input_prompt, input_neg_prompt, input_elevation, input_num_steps, input_seed, mv_model_option],
        outputs=[multi_view_output, processed_image_output, input_image_output],
    )
    generate_3d_button.click(
        fn=generate_3d,
        inputs=[input_image_output, processed_image_output, mv_model_option, input_seed],
        outputs=[output_obj_rgb, output_obj_albedo, output_obj_shading, output_video],
    )

demo.launch()
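# Optional deployment note (assumption, not part of the original script): on Hugging Face
# Spaces it is common to enable Gradio's request queue so long-running GPU calls are
# serialized rather than timing out. If desired, queue() can be called before launching;
# the max_size value below is an arbitrary example.
#
#   demo.queue(max_size=10)
#   demo.launch()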