Spaces:
Running
on
Zero
Running
on
Zero
# Copyright 2024 Anton Obukhov, ETH Zurich. All rights reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# -------------------------------------------------------------------------- | |
# If you find this code useful, we kindly ask you to cite our paper in your work. | |
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation | |
# More information about the method can be found at https://marigoldmonodepth.github.io | |
# -------------------------------------------------------------------------- | |
import functools | |
import os | |
import sys | |
import tempfile | |
import av | |
import numpy as np | |
import spaces | |
import gradio as gr | |
import torch as torch | |
import einops | |
from huggingface_hub import login | |
from colorize import colorize_depth_multi_thread | |
from video_io import get_video_fps, write_video_from_numpy | |
VERBOSE = False | |
MAX_FRAMES = 100 | |
def process(pipe, device, path_input): | |
print(f"Processing {path_input}") | |
path_output_dir = tempfile.mkdtemp() | |
os.makedirs(path_output_dir, exist_ok=True) | |
name_base = os.path.splitext(os.path.basename(path_input))[0] | |
path_out_in = os.path.join(path_output_dir, f"{name_base}_depth_input.mp4") | |
path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.mp4") | |
output_fps = int(get_video_fps(path_input)) | |
container = av.open(path_input) | |
stream = container.streams.video[0] | |
fps = float(stream.average_rate) | |
duration_sec = float(stream.duration * stream.time_base) if stream.duration else 0 | |
total_frames = int(duration_sec * fps) | |
if total_frames > MAX_FRAMES: | |
gr.Warning( | |
f"Only the first {MAX_FRAMES} frames (~{MAX_FRAMES / fps:.1f} sec.) will be processed for demonstration; " | |
f"use the code from GitHub for full processing" | |
) | |
generator = torch.Generator(device=device) | |
generator.manual_seed(2024) | |
pipe_out: RollingDepthOutput = pipe( | |
# input setting | |
input_video_path=path_input, | |
start_frame=0, | |
frame_count=min(MAX_FRAMES, total_frames), # 0 = all | |
processing_res=768, | |
# infer setting | |
dilations=[1, 25], | |
cap_dilation=True, | |
snippet_lengths=[3], | |
init_infer_steps=[1], | |
strides=[1], | |
coalign_kwargs=None, | |
refine_step=0, # 0 = off | |
max_vae_bs=8, # batch size for encoder/decoder | |
# other settings | |
generator=generator, | |
verbose=VERBOSE, | |
# output settings | |
restore_res=False, | |
unload_snippet=False, | |
) | |
depth_pred = pipe_out.depth_pred # [N 1 H W] | |
# Colorize results | |
cmap = "Spectral_r" | |
colored_np = colorize_depth_multi_thread( | |
depth=depth_pred.numpy(), | |
valid_mask=None, | |
chunk_size=4, | |
num_threads=4, | |
color_map=cmap, | |
verbose=VERBOSE, | |
) # [n h w 3], in [0, 255] | |
write_video_from_numpy( | |
frames=colored_np, | |
output_path=path_out_vis, | |
fps=output_fps, | |
crf=23, | |
preset="medium", | |
verbose=VERBOSE, | |
) | |
# Save rgb | |
rgb = (pipe_out.input_rgb.numpy() * 255).astype(np.uint8) # [N 3 H W] | |
rgb = einops.rearrange(rgb, "n c h w -> n h w c") | |
write_video_from_numpy( | |
frames=rgb, | |
output_path=path_out_in, | |
fps=output_fps, | |
crf=23, | |
preset="medium", | |
verbose=VERBOSE, | |
) | |
return path_out_in, path_out_vis | |
def run_demo_server(pipe, device): | |
process_pipe = spaces.GPU(functools.partial(process, pipe, device), duration=120) | |
os.environ["GRADIO_ALLOW_FLAGGING"] = "never" | |
with gr.Blocks( | |
analytics_enabled=False, | |
title="RollingDepth", | |
css=""" | |
h1 { | |
text-align: center; | |
display: block; | |
} | |
h2 { | |
text-align: center; | |
display: block; | |
} | |
h3 { | |
text-align: center; | |
display: block; | |
} | |
""", | |
) as demo: | |
gr.HTML( | |
""" | |
<h1>๐น RollingDepth ๐น: Video Depth without Video Models</h1> | |
<div style="text-align: center; margin-top: 20px;"> | |
<a title="Website" href="https://rollingdepth.github.io" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;"> | |
<img src="https://www.obukhov.ai/img/badges/badge-website.svg" alt="Website Badge"> | |
</a> | |
<a title="arXiv" href="https://arxiv.org/abs/2411.19189" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;"> | |
<img src="https://www.obukhov.ai/img/badges/badge-pdf.svg" alt="arXiv Badge"> | |
</a> | |
<a title="GitHub" href="https://github.com/prs-eth/rollingdepth" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;"> | |
<img src="https://img.shields.io/github/stars/prs-eth/rollingdepth?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="GitHub Stars Badge"> | |
</a> | |
<a title="Social" href="https://twitter.com/antonobukhov1" target="_blank" rel="noopener noreferrer" style="display: inline-block; margin-right: 4px;"> | |
<img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social"> | |
</a> | |
</div> | |
<p style="margin-top: 20px; text-align: justify;"> | |
RollingDepth is the state-of-the-art depth estimator for videos in the wild. Upload your video into the | |
<b>left</b> pane, or click any of the <b>examples</b> below. The result preview will be computed and | |
appear in the <b>right</b> panes. For full functionality, use the code on GitHub. | |
<b>TIP:</b> When running out of GPU time, fork the demo. | |
</p> | |
""" | |
) | |
with gr.Row(equal_height=True): | |
with gr.Column(scale=1): | |
input_video = gr.Video(label="Input Video") | |
with gr.Column(scale=2): | |
with gr.Row(equal_height=True): | |
output_video_1 = gr.Video( | |
label="Preprocessed video", | |
interactive=False, | |
autoplay=True, | |
loop=True, | |
show_share_button=True, | |
scale=5, | |
) | |
output_video_2 = gr.Video( | |
label="Generated Depth Video", | |
interactive=False, | |
autoplay=True, | |
loop=True, | |
show_share_button=True, | |
scale=5, | |
) | |
with gr.Row(equal_height=True): | |
with gr.Column(scale=1): | |
with gr.Row(equal_height=False): | |
generate_btn = gr.Button("Generate") | |
with gr.Column(scale=2): | |
pass | |
gr.Examples( | |
examples=[ | |
["files/gokart.mp4"], | |
["files/horse.mp4"], | |
["files/walking.mp4"], | |
], | |
inputs=[input_video], | |
outputs=[output_video_1, output_video_2], | |
fn=process_pipe, | |
cache_examples=True, | |
cache_mode="eager", | |
) | |
generate_btn.click( | |
fn=process_pipe, | |
inputs=[input_video], | |
outputs=[output_video_1, output_video_2], | |
) | |
demo.queue( | |
api_open=False, | |
).launch( | |
server_name="0.0.0.0", | |
server_port=7860, | |
) | |
def main(): | |
os.system("pip freeze") | |
os.system("pip uninstall -y diffusers") | |
os.system("pip install rollingdepth_src/diffusers") | |
os.system("pip freeze") | |
if "HF_TOKEN_LOGIN" in os.environ: | |
login(token=os.environ["HF_TOKEN_LOGIN"]) | |
if torch.cuda.is_available(): | |
device = torch.device("cuda") | |
elif torch.backends.mps.is_available(): | |
device = torch.device("mps") | |
else: | |
device = torch.device("cpu") | |
sys.path.append(os.path.join(os.path.dirname(__file__), "rollingdepth_src")) | |
from rollingdepth import RollingDepthOutput, RollingDepthPipeline | |
pipe: RollingDepthPipeline = RollingDepthPipeline.from_pretrained( | |
"prs-eth/rollingdepth-v1-0", | |
torch_dtype=torch.float16, | |
) | |
pipe.set_progress_bar_config(disable=True) | |
try: | |
import xformers | |
pipe.enable_xformers_memory_efficient_attention() | |
except: | |
pass # run without xformers | |
pipe = pipe.to(device) | |
run_demo_server(pipe, device) | |
if __name__ == "__main__": | |
main() | |