sketch-to-3d

Running on Zero

App Files Files Community

sketch-to-3d / app.py

linoyts HF staff

Update app.py

0ad8efc verified 9 days ago

raw

history blame

22 kB

	import gradio as gr
	import spaces
	from gradio_litmodel3d import LitModel3D

	import os
	import shutil
	os.environ['SPCONV_ALGO'] = 'native'
	from typing import *
	import torch
	import numpy as np
	import imageio
	from easydict import EasyDict as edict
	from PIL import Image
	from trellis.pipelines import TrellisImageTo3DPipeline
	from trellis.representations import Gaussian, MeshExtractResult
	from trellis.utils import render_utils, postprocessing_utils



	import os
	import random
	import torch
	import torchvision.transforms.functional as TF

	from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
	from diffusers import DDIMScheduler, EulerAncestralDiscreteScheduler
	from controlnet_aux import PidiNetDetector, HEDdetector
	from diffusers.utils import load_image
	from huggingface_hub import HfApi
	from pathlib import Path
	from PIL import Image, ImageOps
	import torch
	import numpy as np
	import cv2
	import os
	import random

	# JavaScript snippet handed to `gr.Blocks(js=...)` below. It checks the page
	# URL's `__theme` query parameter and, unless it is already 'dark', sets it to
	# 'dark' and navigates to the updated URL.
	js_func = """
	function refresh() {
	const url = new URL(window.location);

	if (url.searchParams.get('__theme') !== 'dark') {
	url.searchParams.set('__theme', 'dark');
	window.location.href = url.href;
	}
	}
	"""

	# Prompt style presets. Each entry has a display "name", a "prompt" template
	# containing the literal placeholder `{prompt}`, and a preset "negative_prompt".
	style_list = [
	{
	"name": "(No style)",
	"prompt": "{prompt}",
	"negative_prompt": "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
	},
	{
	"name": "Cinematic",
	"prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
	"negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
	},
	{
	"name": "3D Model",
	"prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
	"negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
	},
	{
	"name": "Anime",
	"prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
	"negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
	},
	{
	"name": "Digital Art",
	"prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
	"negative_prompt": "photo, photorealistic, realism, ugly",
	},
	{
	"name": "Photographic",
	"prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
	"negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
	},
	{
	"name": "Pixel art",
	"prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
	"negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
	},
	{
	"name": "Fantasy art",
	"prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
	"negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
	},
	{
	"name": "Neonpunk",
	"prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
	"negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
	},
	{
	"name": "Manga",
	"prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
	"negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
	},
	]
	# Lookup table: style name -> (prompt template, preset negative prompt).
	styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
	# Style names in declaration order; used as the choices of the "Style" dropdown.
	STYLE_NAMES = list(styles.keys())
	# Preset used as the dropdown default and as the fallback for unknown style names.
	DEFAULT_STYLE_NAME = "(No style)"

	# Largest value offered by the seed slider / drawn by get_seed (int32 maximum).
	MAX_SEED = np.iinfo(np.int32).max

	# Scratch directory that sits next to this file; per-session sub-directories
	# are created beneath it. It is created eagerly at import time.
	_APP_DIR = os.path.dirname(os.path.abspath(__file__))
	TMP_DIR = os.path.join(_APP_DIR, 'tmp')
	os.makedirs(TMP_DIR, exist_ok=True)



	def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
	    """
	    Expand a style preset into the final positive and negative prompts.

	    Args:
	        style_name (str): Key into `styles`; unknown names fall back to `DEFAULT_STYLE_NAME`.
	        positive (str): The user prompt, substituted for the `{prompt}` placeholder.
	        negative (str): Extra user negative prompt, appended after the preset's.

	    Returns:
	        tuple[str, str]: The styled prompt and the combined negative prompt.
	    """
	    template, preset_negative = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
	    # Join with a separator: plain concatenation fused the preset's last term
	    # with the user's first one (e.g. "low qualityblurry"). Empty parts are
	    # skipped so no dangling ", " is produced.
	    combined_negative = ", ".join(part for part in (preset_negative, negative) if part)
	    return template.replace("{prompt}", positive), combined_negative


	def start_session(req: gr.Request):
	    """
	    Create the scratch directory for the current session.

	    Args:
	        req (gr.Request): The request; its `session_hash` names the directory under `TMP_DIR`.
	    """
	    session_dir = os.path.join(TMP_DIR, str(req.session_hash))
	    os.makedirs(session_dir, exist_ok=True)


	def end_session(req: gr.Request):
	    """
	    Remove the scratch directory of the session that is ending.

	    Args:
	        req (gr.Request): The request; its `session_hash` names the directory under `TMP_DIR`.
	    """
	    user_dir = os.path.join(TMP_DIR, str(req.session_hash))
	    # Cleanup is best-effort: the directory may never have been created or may
	    # already be gone, and that must not raise inside the unload handler.
	    shutil.rmtree(user_dir, ignore_errors=True)

	@spaces.GPU
	def preprocess_image(image: Union[Image.Image, dict],
	                     prompt: str = "",
	                     negative_prompt: str = "",
	                     style_name: str = "",
	                     num_steps: int = 25,
	                     guidance_scale: float = 5,
	                     controlnet_conditioning_scale: float = 1.0,
	                     do_preprocess: bool = True) -> Tuple[Image.Image, bool]:
	    """
	    Turn a sketch into an image with the ControlNet pipeline and preprocess it for TRELLIS.

	    Relies on the module-level `pipe_control` and `pipeline` objects, which are
	    only created inside the `__main__` guard of this file.

	    Args:
	        image (Image.Image | dict): The sketch, either a PIL image or an image-editor
	            value dict whose 'composite' entry holds the PIL image.
	        prompt (str): The user prompt.
	        negative_prompt (str): The user negative prompt.
	        style_name (str): Name of the style preset passed to `apply_style`.
	        num_steps (int): Number of inference steps for the ControlNet pipeline.
	        guidance_scale (float): Guidance scale for the ControlNet pipeline.
	        controlnet_conditioning_scale (float): Conditioning scale of the ControlNet.
	        do_preprocess (bool): When False, `image` is returned untouched.

	    Returns:
	        Image.Image: The preprocessed image (or the unchanged input when `do_preprocess` is False).
	        bool: Always False; wired to the `do_preprocess` state by the UI.
	    """
	    if not do_preprocess:
	        return image, False

	    # The image editor delivers a dict; other callers hand over a PIL image directly.
	    sketch = image['composite'] if isinstance(image, dict) else image
	    if sketch is None:
	        raise gr.Error("Please draw or upload a sketch first.")

	    # Rescale so the pixel count is roughly 1024 * 1024 while keeping the aspect ratio.
	    width, height = sketch.size
	    ratio = np.sqrt(1024. * 1024. / (width * height))
	    new_width, new_height = int(width * ratio), int(height * ratio)
	    sketch = sketch.resize((new_width, new_height))

	    print("image:", type(sketch))

	    prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)

	    print("params:", prompt, negative_prompt, style_name, num_steps, guidance_scale, controlnet_conditioning_scale)
	    generated = pipe_control(
	        prompt=prompt,
	        negative_prompt=negative_prompt,
	        image=sketch,
	        num_inference_steps=num_steps,
	        controlnet_conditioning_scale=controlnet_conditioning_scale,
	        guidance_scale=guidance_scale,
	        width=new_width,
	        height=new_height).images[0]

	    processed_image = pipeline.preprocess_image(generated)
	    return processed_image, False


	def preprocess_images(images: List[Tuple[Image.Image, str]], do_preprocess = True) -> List[Image.Image]:
	    """
	    Run the TRELLIS pipeline's image preprocessing over a gallery of images.

	    Args:
	        images (List[Tuple[Image.Image, str]]): Gallery items; only the first element of each is used.
	        do_preprocess: Accepted but not consulted by this function.

	    Returns:
	        The list of preprocessed images together with the flag False
	        (the function returns both, as a 2-tuple).
	    """
	    results = []
	    for item in images:
	        results.append(pipeline.preprocess_image(item[0]))
	    return results, False


	def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
	    """
	    Convert a generated Gaussian and mesh into a dict of plain NumPy arrays.

	    Args:
	        gs (Gaussian): The Gaussian; its `init_params` and tensor attributes are copied.
	        mesh (MeshExtractResult): The mesh; its `vertices` and `faces` are copied.

	    Returns:
	        dict: {'gaussian': {...init_params, '_xyz', '_features_dc', '_scaling',
	        '_rotation', '_opacity'}, 'mesh': {'vertices', 'faces'}}.
	    """
	    gaussian_state = dict(gs.init_params)
	    # Each tensor is moved to the CPU and stored as a NumPy array.
	    for attr in ('_xyz', '_features_dc', '_scaling', '_rotation', '_opacity'):
	        gaussian_state[attr] = getattr(gs, attr).cpu().numpy()

	    mesh_state = {
	        'vertices': mesh.vertices.cpu().numpy(),
	        'faces': mesh.faces.cpu().numpy(),
	    }
	    return {'gaussian': gaussian_state, 'mesh': mesh_state}


	def unpack_state(state: dict) -> Tuple[Gaussian, edict]:
	    """
	    Rebuild the Gaussian and mesh objects from a dict produced by `pack_state`.

	    Args:
	        state (dict): Dict with a 'gaussian' entry (constructor parameters plus
	            array data) and a 'mesh' entry ('vertices' and 'faces' arrays).

	    Returns:
	        Gaussian: The reconstructed Gaussian with its tensors on the CUDA device.
	        edict: A mesh holder exposing `vertices` and `faces` tensors on the CUDA device.
	    """
	    packed_gs = state['gaussian']
	    gs = Gaussian(
	        aabb=packed_gs['aabb'],
	        sh_degree=packed_gs['sh_degree'],
	        # 'mininum' (sic) is the keyword this file has always used for Gaussian.
	        mininum_kernel_size=packed_gs['mininum_kernel_size'],
	        scaling_bias=packed_gs['scaling_bias'],
	        opacity_bias=packed_gs['opacity_bias'],
	        scaling_activation=packed_gs['scaling_activation'],
	    )
	    # Restore the tensors that pack_state flattened to NumPy arrays.
	    for attr in ('_xyz', '_features_dc', '_scaling', '_rotation', '_opacity'):
	        setattr(gs, attr, torch.tensor(packed_gs[attr], device='cuda'))

	    mesh = edict(
	        vertices=torch.tensor(state['mesh']['vertices'], device='cuda'),
	        faces=torch.tensor(state['mesh']['faces'], device='cuda'),
	    )

	    return gs, mesh


	def get_seed(randomize_seed: bool, seed: int) -> int:
	    """
	    Pick the seed for the next generation.

	    Args:
	        randomize_seed (bool): Whether to draw a fresh random seed.
	        seed (int): The seed to use when not randomizing.

	    Returns:
	        int: A random value in [0, MAX_SEED) when randomizing, otherwise `seed`.
	    """
	    if randomize_seed:
	        return np.random.randint(0, MAX_SEED)
	    return seed


	@spaces.GPU
	def image_to_3d(
	    image: Image.Image,
	    multiimages: List[Tuple[Image.Image, str]],
	    is_multiimage: bool,
	    seed: int,
	    ss_guidance_strength: float,
	    ss_sampling_steps: int,
	    slat_guidance_strength: float,
	    slat_sampling_steps: int,
	    multiimage_algo: Literal["multidiffusion", "stochastic"],
	    req: gr.Request,
	) -> Tuple[dict, str]:
	    """
	    Convert an image to a 3D model.

	    Args:
	        image (Image.Image): The input image. An image-editor value dict is also
	            accepted, in which case its 'composite' entry is used.
	        multiimages (List[Tuple[Image.Image, str]]): The input images in multi-image mode.
	        is_multiimage (bool): Whether is in multi-image mode.
	        seed (int): The random seed.
	        ss_guidance_strength (float): The guidance strength for sparse structure generation.
	        ss_sampling_steps (int): The number of sampling steps for sparse structure generation.
	        slat_guidance_strength (float): The guidance strength for structured latent generation.
	        slat_sampling_steps (int): The number of sampling steps for structured latent generation.
	        multiimage_algo (Literal["multidiffusion", "stochastic"]): The algorithm for multi-image generation.
	        req (gr.Request): The request; its `session_hash` selects the output directory.

	    Returns:
	        dict: The information of the generated 3D model.
	        str: The path to the video of the 3D model.
	    """
	    user_dir = os.path.join(TMP_DIR, str(req.session_hash))
	    # The preview video is written here; make sure the directory exists even
	    # if the session-start handler did not run for this request.
	    os.makedirs(user_dir, exist_ok=True)

	    # Settings shared by the single-image and the multi-image entry points.
	    run_kwargs = dict(
	        seed=seed,
	        formats=["gaussian", "mesh"],
	        preprocess_image=False,
	        sparse_structure_sampler_params={
	            "steps": ss_sampling_steps,
	            "cfg_strength": ss_guidance_strength,
	        },
	        slat_sampler_params={
	            "steps": slat_sampling_steps,
	            "cfg_strength": slat_guidance_strength,
	        },
	    )
	    if is_multiimage:
	        outputs = pipeline.run_multi_image(
	            [view[0] for view in multiimages],
	            mode=multiimage_algo,
	            **run_kwargs,
	        )
	    else:
	        # The single-image input is an image editor, whose value is a dict;
	        # the pipeline needs the flattened image stored under 'composite'.
	        if isinstance(image, dict):
	            image = image['composite']
	        outputs = pipeline.run(image, **run_kwargs)

	    gaussian, mesh = outputs['gaussian'][0], outputs['mesh'][0]
	    color_frames = render_utils.render_video(gaussian, num_frames=120)['color']
	    normal_frames = render_utils.render_video(mesh, num_frames=120)['normal']
	    # Put the colour render and the normal render side by side in every frame.
	    video = [
	        np.concatenate([color, normal], axis=1)
	        for color, normal in zip(color_frames, normal_frames)
	    ]
	    video_path = os.path.join(user_dir, 'sample.mp4')
	    imageio.mimsave(video_path, video, fps=15)
	    state = pack_state(gaussian, mesh)
	    torch.cuda.empty_cache()
	    return state, video_path


	@spaces.GPU(duration=90)
	def extract_glb(
	    state: dict,
	    mesh_simplify: float,
	    texture_size: int,
	    req: gr.Request,
	) -> Tuple[str, str]:
	    """
	    Export the generated 3D model as a GLB file in the session directory.

	    Args:
	        state (dict): The packed state of the generated 3D model.
	        mesh_simplify (float): The mesh simplification factor.
	        texture_size (int): The texture resolution.
	        req (gr.Request): The request; its `session_hash` selects the output directory.

	    Returns:
	        Tuple[str, str]: The GLB path twice (one for the viewer, one for the download button).
	    """
	    glb_path = os.path.join(TMP_DIR, str(req.session_hash), 'sample.glb')
	    gaussian, mesh = unpack_state(state)
	    glb = postprocessing_utils.to_glb(
	        gaussian,
	        mesh,
	        simplify=mesh_simplify,
	        texture_size=texture_size,
	        verbose=False,
	    )
	    glb.export(glb_path)
	    torch.cuda.empty_cache()
	    return glb_path, glb_path


	def reset_do_preprocess():
	    """Return the value that re-enables preprocessing (always True)."""
	    return True

	@spaces.GPU
	def extract_gaussian(state: dict, req: gr.Request) -> Tuple[str, str]:
	    """
	    Save the generated Gaussian as a PLY file in the session directory.

	    Args:
	        state (dict): The packed state of the generated 3D model.
	        req (gr.Request): The request; its `session_hash` selects the output directory.

	    Returns:
	        Tuple[str, str]: The PLY path twice (one for the viewer, one for the download button).
	    """
	    ply_path = os.path.join(TMP_DIR, str(req.session_hash), 'sample.ply')
	    # Only the Gaussian is needed here; the mesh half of the state is discarded.
	    gaussian = unpack_state(state)[0]
	    gaussian.save_ply(ply_path)
	    torch.cuda.empty_cache()
	    return ply_path, ply_path


	def prepare_multi_example() -> List[Image.Image]:
	    """
	    Build one side-by-side strip per multi-view example case.

	    Files in "assets/example_multi_image" are grouped by the part of their name
	    before the first underscore; views 1 to 3 of each case (`{case}_{i}.png`)
	    are resized to a height of 512 and concatenated horizontally.

	    Returns:
	        List[Image.Image]: One concatenated image per case, ordered by case name.
	    """
	    example_dir = "assets/example_multi_image"
	    # Sorted so the example order is the same on every start; iterating a set
	    # of strings directly gives a different order from run to run.
	    cases = sorted({name.split('_')[0] for name in os.listdir(example_dir)})
	    strips = []
	    for case in cases:
	        views = []
	        for i in range(1, 4):
	            # The context manager closes the file once the pixels are copied out.
	            with Image.open(f'{example_dir}/{case}_{i}.png') as img:
	                W, H = img.size
	                views.append(np.array(img.resize((int(W / H * 512), 512))))
	        strips.append(Image.fromarray(np.concatenate(views, axis=1)))
	    return strips


	def split_image(image: Image.Image) -> List[Image.Image]:
	    """
	    Split a horizontal strip of views into separate preprocessed images.

	    Views are runs of columns containing at least one pixel with non-zero
	    alpha, separated by fully transparent columns.

	    Args:
	        image (Image.Image): The strip; converted to RGBA so an alpha channel always exists.

	    Returns:
	        List[Image.Image]: One image per view, each passed through `pipeline.preprocess_image`.
	    """
	    pixels = np.array(image.convert('RGBA'))
	    # True for every column that contains at least one non-transparent pixel.
	    opaque = np.any(pixels[..., 3] > 0, axis=0)
	    # Pad with a transparent column on both sides so a view touching the left
	    # or right edge still yields a start and an end (otherwise the two lists
	    # get out of step and views are paired wrongly).
	    padded = np.concatenate(([False], opaque, [False]))
	    starts = np.where(~padded[:-1] & padded[1:])[0].tolist()
	    stops = np.where(padded[:-1] & ~padded[1:])[0].tolist()
	    # Each slice keeps one transparent column on the left (when there is one)
	    # and ends on the last opaque column.
	    views = [Image.fromarray(pixels[:, max(s - 1, 0):e]) for s, e in zip(starts, stops)]
	    # Use the pipeline's preprocessing directly: the module-level
	    # `preprocess_image` is the sketch handler and returns an (image, flag) tuple.
	    return [pipeline.preprocess_image(view) for view in views]


	# UI layout and event wiring. Event inputs are bound to handler parameters
	# positionally, so every inputs/outputs list below must match the handler's
	# signature and return arity.
	with gr.Blocks(delete_cache=(600, 600), js=js_func) as demo:
	    gr.Markdown("""
	    ## Sketch to 3D Asset with [TRELLIS](https://trellis3d.github.io/)
	    * draw or upload a sketch and click "Generate" to create a 3D asset.
	    """)

	    with gr.Row():
	        with gr.Column():
	            with gr.Tabs() as input_tabs:
	                with gr.Tab(label="Single Image", id=0) as single_image_input_tab:
	                    image_prompt = gr.ImageEditor(type="pil", image_mode="L", crop_size=(512, 512))
	                    sketch_btn = gr.Button("process sketch")
	                    with gr.Column():
	                        prompt = gr.Textbox(label="Prompt")
	                        with gr.Row():
	                            style = gr.Dropdown(label="Style", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
	                            negative_prompt = gr.Textbox(label="Negative prompt")
	                with gr.Tab(label="Multiple Images", id=1, visible=False) as multiimage_input_tab:
	                    multiimage_prompt = gr.Gallery(label="Image Prompt", format="png", type="pil", height=300, columns=3)
	                    gr.Markdown("""
	                    Input different views of the object in separate images.

	                    NOTE: this is an experimental algorithm without training a specialized model. It may not produce the best results for all images, especially those having different poses or inconsistent details.
	                    """)

	            with gr.Accordion(label="Generation Settings", open=False):
	                seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
	                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
	                gr.Markdown("Stage 1: Sparse Structure Generation")
	                with gr.Row():
	                    ss_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
	                    ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
	                gr.Markdown("Stage 2: Structured Latent Generation")
	                with gr.Row():
	                    slat_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=3.0, step=0.1)
	                    slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
	                multiimage_algo = gr.Radio(["stochastic", "multidiffusion"], label="Multi-image Algorithm", value="stochastic")

	            generate_btn = gr.Button("Generate")

	            with gr.Accordion(label="GLB Extraction Settings", open=False):
	                mesh_simplify = gr.Slider(0.9, 0.98, label="Simplify", value=0.95, step=0.01)
	                texture_size = gr.Slider(512, 2048, label="Texture Size", value=1024, step=512)

	            with gr.Row():
	                extract_glb_btn = gr.Button("Extract GLB", interactive=False)
	                extract_gs_btn = gr.Button("Extract Gaussian", interactive=False)
	            gr.Markdown("""
	            NOTE: Gaussian file can be very large (~50MB), it will take a while to display and download.
	            """)

	        with gr.Column():
	            video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
	            model_output = LitModel3D(label="Extracted GLB/Gaussian", exposure=10.0, height=300)

	            with gr.Row():
	                download_glb = gr.DownloadButton(label="Download GLB", interactive=False)
	                download_gs = gr.DownloadButton(label="Download Gaussian", interactive=False)

	    is_multiimage = gr.State(False)
	    do_preprocess = gr.State(True)
	    output_buf = gr.State()

	    # Example images at the bottom of the page
	    with gr.Row() as single_image_example:
	        examples = gr.Examples(
	            examples=[
	                f'assets/example_image/{image}'
	                for image in os.listdir("assets/example_image")
	            ],
	            inputs=[image_prompt],
	            fn=preprocess_image,
	            # preprocess_image returns (image, flag), so two outputs are required.
	            outputs=[image_prompt, do_preprocess],
	            run_on_click=True,
	            examples_per_page=64,
	        )
	    with gr.Row(visible=False) as multiimage_example:
	        examples_multi = gr.Examples(
	            examples=prepare_multi_example(),
	            inputs=[image_prompt],
	            fn=split_image,
	            outputs=[multiimage_prompt],
	            run_on_click=True,
	            examples_per_page=8,
	        )

	    # Handlers
	    demo.load(start_session)
	    demo.unload(end_session)

	    # Switching tabs toggles multi-image mode and shows the matching example row.
	    single_image_input_tab.select(
	        lambda: (False, gr.update(visible=True), gr.update(visible=False)),
	        outputs=[is_multiimage, single_image_example, multiimage_example]
	    )
	    multiimage_input_tab.select(
	        lambda: (True, gr.update(visible=False), gr.update(visible=True)),
	        outputs=[is_multiimage, single_image_example, multiimage_example]
	    )

	    # Only the first four parameters of preprocess_image are supplied; a fifth
	    # input would be bound to `num_steps`, not to `do_preprocess`, so the
	    # remaining parameters are left at their defaults.
	    sketch_btn.click(
	        preprocess_image,
	        inputs=[image_prompt, prompt, negative_prompt, style],
	        outputs=[image_prompt, do_preprocess],
	    )
	    multiimage_prompt.upload(
	        preprocess_images,
	        inputs=[multiimage_prompt],
	        outputs=[multiimage_prompt, do_preprocess],
	    )

	    # image_to_3d takes nine inputs (plus the injected request) and returns
	    # exactly (state, video_path); the lists below mirror that.
	    generate_btn.click(
	        get_seed,
	        inputs=[randomize_seed, seed],
	        outputs=[seed],
	    ).then(
	        image_to_3d,
	        inputs=[image_prompt, multiimage_prompt, is_multiimage, seed, ss_guidance_strength, ss_sampling_steps, slat_guidance_strength, slat_sampling_steps, multiimage_algo],
	        outputs=[output_buf, video_output],
	    ).then(
	        lambda: (gr.Button(interactive=True), gr.Button(interactive=True)),
	        outputs=[extract_glb_btn, extract_gs_btn],
	    )

	    video_output.clear(
	        lambda: (gr.Button(interactive=False), gr.Button(interactive=False)),
	        outputs=[extract_glb_btn, extract_gs_btn],
	    )

	    extract_glb_btn.click(
	        extract_glb,
	        inputs=[output_buf, mesh_simplify, texture_size],
	        outputs=[model_output, download_glb],
	    ).then(
	        lambda: gr.Button(interactive=True),
	        outputs=[download_glb],
	    )

	    extract_gs_btn.click(
	        extract_gaussian,
	        inputs=[output_buf],
	        outputs=[model_output, download_gs],
	    ).then(
	        lambda: gr.Button(interactive=True),
	        outputs=[download_gs],
	    )

	    # Clearing the viewer invalidates both downloads, not just the GLB one.
	    model_output.clear(
	        lambda: (gr.Button(interactive=False), gr.Button(interactive=False)),
	        outputs=[download_glb, download_gs],
	    )


	# Launch the Gradio app
	if __name__ == "__main__":
	    # `pipeline` and `pipe_control` are module-level names read by the handlers
	    # above; they only exist when this file is run as a script.
	    pipeline = TrellisImageTo3DPipeline.from_pretrained("JeffreyXiang/TRELLIS-image-large")
	    pipeline.cuda()

	    device = "cuda" if torch.cuda.is_available() else "cpu"

	    # scribble controlnet
	    controlnet = ControlNetModel.from_pretrained(
	        "xinsir/controlnet-scribble-sdxl-1.0",
	        torch_dtype=torch.float16
	    )
	    vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)

	    pipe_control = StableDiffusionXLControlNetPipeline.from_pretrained(
	        "sd-community/sdxl-flash",
	        controlnet=controlnet,
	        vae=vae,
	        torch_dtype=torch.float16,
	    )
	    pipe_control.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe_control.scheduler.config)
	    pipe_control.to(device)

	    try:
	        pipeline.preprocess_image(Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8)))  # Preload rembg
	    except Exception as exc:
	        # The warm-up is best-effort, so a failure must not stop the app, but
	        # report it. `Exception` (not a bare except) lets Ctrl-C and
	        # SystemExit propagate.
	        print(f"Warm-up preprocessing failed, continuing without it: {exc!r}")
	    demo.launch()