import os
import tyro
import imageio
import numpy as np
import tqdm
import torch
import torch.nn.functional as F
from safetensors.torch import load_file
import rembg
import gradio as gr

import kiui
from kiui.op import recenter
from kiui.cam import orbit_camera

from core.utils import get_rays
from core.options import AllConfigs, Options
from core.models import LTRFM_Mesh, LTRFM_NeRF
from core.instant_utils.mesh_util import save_obj, save_obj_with_mtl
from mvdream.pipeline_mvdream import MVDreamPipeline
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
from huggingface_hub import hf_hub_download

import spaces

IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

GRADIO_VIDEO_PATH = 'gradio_output.mp4'
GRADIO_OBJ_PATH = 'gradio_output_rgb.obj'
GRADIO_OBJ_ALBEDO_PATH = 'gradio_output_albedo.obj'
GRADIO_OBJ_SHADING_PATH = 'gradio_output_shading.obj'

# Download the reconstruction-model checkpoint from the Hugging Face Hub
ckpt_path = hf_hub_download(repo_id="rgxie/LDM", filename="LDM_6V_SDF.ckpt")

opt = Options(
    input_size=512,
    down_channels=(32, 64, 128, 256, 512),
    down_attention=(False, False, False, False, True),
    up_channels=(512, 256, 128),
    up_attention=(True, False, False, False),
    volume_mode='TRF_NeRF',
    splat_size=64,
    output_size=62,  # crop patch
    data_mode='s5',
    num_views=8,
    gradient_accumulation_steps=1,
    mixed_precision='bf16',
    resume=ckpt_path,
)

# Model selection
if opt.volume_mode == 'TRF_Mesh':
    model = LTRFM_Mesh(opt)
elif opt.volume_mode == 'TRF_NeRF':
    model = LTRFM_NeRF(opt)
else:
    model = None

# Resume pretrained checkpoint
if opt.resume:
    if opt.resume.endswith('safetensors'):
        ckpt = load_file(opt.resume, device='cpu')
    else:
        ckpt_dict = torch.load(opt.resume, map_location='cpu')
        ckpt = ckpt_dict["model"]

    # Copy matching parameters into the model, warning about renamed or mismatched ones
    state_dict = model.state_dict()
    for k, v in ckpt.items():
        k = k.replace('module.', '')
        if k in state_dict:
            if state_dict[k].shape == v.shape:
                state_dict[k].copy_(v)
            else:
                print(f'[WARN] mismatching shape for param {k}: ckpt {v.shape} != model {state_dict[k].shape}, ignored.')
        else:
            print(f'[WARN] unexpected param {k}: {v.shape}')
    print('[INFO] load resume success!')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.half().to(device)
model.eval()

# Perspective projection matrix shared by all rendering calls
tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
proj_matrix = torch.zeros(4, 4, dtype=torch.float32).to(device)
proj_matrix[0, 0] = 1 / tan_half_fov
proj_matrix[1, 1] = 1 / tan_half_fov
proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
proj_matrix[3, 2] = -(opt.zfar * opt.znear) / (opt.zfar - opt.znear)
proj_matrix[2, 3] = 1

# Load the multi-view diffusion pipelines (text-to-MV, image-to-MV, and Zero123Plus)
pipe_text = MVDreamPipeline.from_pretrained(
    'ashawkey/mvdream-sd2.1-diffusers',
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_text = pipe_text.to(device)

pipe_image = MVDreamPipeline.from_pretrained(
    "ashawkey/imagedream-ipmv-diffusers",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_image = pipe_image.to(device)

pipe_image_plus = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="zero123plus",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_image_plus.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipe_image_plus.scheduler.config, timestep_spacing='trailing'
)

# Use a local Zero123Plus UNet checkpoint if present, otherwise download InstantMesh's fine-tuned one
unet_path = './pretrained/diffusion_pytorch_model.bin'
if os.path.exists(unet_path):
    unet_ckpt_path = unet_path
else:
    unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model")
state_dict = torch.load(unet_ckpt_path, map_location='cpu')
pipe_image_plus.unet.load_state_dict(state_dict, strict=True)
pipe_image_plus = pipe_image_plus.to(device)

# Load rembg background-removal session
bg_remover = rembg.new_session()


@spaces.GPU
def generate_mv(condition_input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42, mv_model_option=None):
    """Generate multi-view images from a text prompt or a single condition image."""
    kiui.seed_everything(input_seed)
    os.makedirs(os.path.join(opt.workspace, "gradio"), exist_ok=True)

    if condition_input_image is None:
        # Text-to-multi-view with MVDream
        mv_image_uint8 = pipe_text(prompt, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=7.5, elevation=input_elevation)
        mv_image_uint8 = (mv_image_uint8 * 255).astype(np.uint8)
        # Remove backgrounds, recenter, and composite each view onto white
        mv_image = []
        for i in range(4):
            image = rembg.remove(mv_image_uint8[i], session=bg_remover)
            image = image.astype(np.float32) / 255
            image = recenter(image, image[..., 0] > 0, border_ratio=0.2)
            image = image[..., :3] * image[..., -1:] + (1 - image[..., -1:])
            mv_image.append(image)
        mv_image_grid = np.concatenate([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=1)
        input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)
        processed_image = None
    else:
        # Image-conditioned generation: remove background and recenter the input first
        condition_input_image = np.array(condition_input_image)
        carved_image = rembg.remove(condition_input_image, session=bg_remover)
        mask = carved_image[..., -1] > 0
        image = recenter(carved_image, mask, border_ratio=0.2)
        image = image.astype(np.float32) / 255.0
        processed_image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])

        if mv_model_option == 'mvdream':
            mv_image = pipe_image(prompt, processed_image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0, elevation=input_elevation)
            mv_image_grid = np.concatenate([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=1)
            input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)
        else:
            from PIL import Image
            from einops import rearrange
            # Zero123Plus returns a single 3x2 grid image; split it into 6 separate views
            processed_image = Image.fromarray((processed_image * 255).astype(np.uint8))
            mv_image = pipe_image_plus(processed_image, num_inference_steps=input_num_steps).images[0]
            mv_image = np.asarray(mv_image, dtype=np.float32) / 255.0
            mv_image = torch.from_numpy(mv_image).permute(2, 0, 1).contiguous().float()
            mv_image_grid = rearrange(mv_image, 'c (n h) (m w) -> (m h) (n w) c', n=3, m=2).numpy()
            mv_image = rearrange(mv_image, 'c (n h) (m w) -> (n m) h w c', n=3, m=2).numpy()
            input_image = mv_image

    return mv_image_grid, processed_image, input_image


@spaces.GPU
def generate_3d(input_image, condition_input_image, mv_model_option=None, input_seed=42):
    """Reconstruct a 3D model (and a turntable video) from the generated multi-view images."""
    kiui.seed_everything(input_seed)

    output_obj_rgb_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_PATH)
    output_obj_albedo_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_ALBEDO_PATH)
    output_obj_shading_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_SHADING_PATH)
    output_video_path = os.path.join(opt.workspace, "gradio", GRADIO_VIDEO_PATH)

    # [V, H, W, 3] -> [V, 3, H, W], resized to the reconstruction model's input size
    input_image = torch.from_numpy(input_image).permute(0, 3, 1, 2).float().to(device)
    input_image = F.interpolate(input_image, size=(opt.input_size, opt.input_size), mode='bilinear', align_corners=False)

    input_rays_o, input_rays_d = get_rays(opt, proj_matrix, device, 'center')
    with torch.no_grad():
        preds = model(
            cond_img=input_image,
            rays=(input_rays_o, input_rays_d)
        )
    pred_rgb = preds[0].permute(0, 2, 3, 1).contiguous().cpu().numpy()
    pred_albedo = preds[1].permute(0, 2, 3, 1).contiguous().cpu().numpy()
    pred_shading = preds[2].permute(0, 2, 3, 1).contiguous().cpu().numpy()

    save_obj(output_obj_rgb_path, pred_rgb)
    save_obj_with_mtl(output_obj_albedo_path, pred_albedo, mode="albedo")
    save_obj_with_mtl(output_obj_shading_path, pred_shading, mode="shading")

    # Render a 360-degree turntable video.
    # Note: kiui.cam.orbit_camera takes (elevation, azimuth, radius); a fixed 0-degree
    # elevation is assumed here while sweeping the azimuth in 10-degree steps.
    azimuths = np.arange(0, 360, 10, dtype=np.int32)
    output_frames = []
    for azimuth in tqdm.tqdm(azimuths, ncols=0):
        pose = orbit_camera(0, azimuth, radius=2.5)
        with torch.no_grad():
            preds = model(cond_img=input_image, rays=get_rays(opt, proj_matrix, device, pose))
        pred_rgb = preds[0].permute(0, 2, 3, 1).contiguous().cpu().numpy()
        output_frames.append(pred_rgb[0])
    # Convert [0, 1] floats to uint8 frames before writing the mp4
    output_frames = (np.stack(output_frames, axis=0) * 255).astype(np.uint8)
    imageio.mimwrite(output_video_path, output_frames, fps=24, quality=8)

    return output_obj_rgb_path, output_obj_albedo_path, output_obj_shading_path, output_video_path


def update_mv_model(mv_model_option):
    # Toggle visibility of model-specific controls when the multi-view model changes
    if mv_model_option == 'mvdream':
        return gr.update(visible=False)
    else:
        return gr.update(visible=True)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        "## Generate 3D object from text or image prompt"
    )
    with gr.Row():
        with gr.Column():
            input_prompt = gr.Textbox(label="Prompt", lines=3)
            input_neg_prompt = gr.Textbox(label="Negative Prompt", lines=1, value="")
            input_image = gr.Image(label="Input Image", type='numpy')
            input_seed = gr.Slider(minimum=0, maximum=65535, step=1, label="Random Seed", value=42)
            input_elevation = gr.Slider(minimum=-10, maximum=10, step=1, label="Elevation", value=0)
            input_num_steps = gr.Slider(minimum=1, maximum=150, step=1, label="Number of Inference Steps", value=30)
            mv_model_option = gr.Radio(
                ["mvdream", "zero123plus"], label="Model Option", value="mvdream", interactive=True
            )
            generate_mv_button = gr.Button(value="Generate Multi-View Images")
        with gr.Column():
            gr.Markdown("### Multi-View Images")
            multi_view_output = gr.Image()
            gr.Markdown("### Input Image (Processed)")
            processed_image_output = gr.Image()
            # The stacked multi-view array is kept in a State (a gr.Image cannot hold a [V, H, W, 3] batch)
            input_image_output = gr.State()
            generate_3d_button = gr.Button(value="Generate 3D Model")
            output_obj_rgb = gr.File(label="RGB 3D Model (.obj)")
            output_obj_albedo = gr.File(label="Albedo 3D Model (.obj)")
            output_obj_shading = gr.File(label="Shading 3D Model (.obj)")
            output_video = gr.Video(label="360° View of the Generated 3D Model (.mp4)")

    # Wire the buttons after all components exist so inputs/outputs reference actual components
    generate_mv_button.click(
        fn=generate_mv,
        inputs=[input_image, input_prompt, input_neg_prompt, input_elevation, input_num_steps, input_seed, mv_model_option],
        outputs=[multi_view_output, processed_image_output, input_image_output],
    )
    generate_3d_button.click(
        fn=generate_3d,
        inputs=[input_image_output, processed_image_output, mv_model_option, input_seed],
        outputs=[output_obj_rgb, output_obj_albedo, output_obj_shading, output_video],
    )

demo.launch()
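# Optional deployment note (assumption, not part of the original script): on Hugging Face
# Spaces it is common to enable Gradio's request queue so long-running GPU calls are
# serialized rather than timing out. If desired, queue() can be called before launching;
# the max_size value below is an arbitrary example.
#
#   demo.queue(max_size=10)
#   demo.launch()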