import os
import sys
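# Make the bundled submodule directories ('depth', 'refer', 'stable-diffusion',
# 'taming-transformers') importable and switch the working directory to 'depth' so that
# the relative imports, configs, and checkpoint paths used below resolve.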
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'refer')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'stable-diffusion')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'taming-transformers')))
os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))
import cv2
import numpy as np
import torch
from depth.models_depth.model import EVPDepth
from models_refer.model import EVPRefer
from depth.configs.train_options import TrainOptions
from depth.configs.test_options import TestOptions
import glob
import utils
import torchvision.transforms as transforms
from utils_depth.misc import colorize
from PIL import Image
import torch.nn.functional as F
import gradio as gr
import tempfile
from transformers import CLIPTokenizer
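# Gradio demo for EVP with two tabs: single-image depth prediction (EVPDepth, loaded from
# 'best_model_nyu.ckpt') and referring segmentation (EVPRefer, loaded from
# 'best_model_refcoco.pth'). The CSS below limits the display height of the image
# components referenced by elem_id.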
css = """
#img-display-container {
max-height: 50vh;
}
#img-display-input {
max-height: 40vh;
}
#img-display-output {
max-height: 40vh;
}
"""
def create_depth_demo(model, device):
    gr.Markdown("### Depth Prediction demo")
    with gr.Row():
        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
        depth_image = gr.Image(label="Depth Map", elem_id='img-display-output')
    raw_file = gr.File(label="16-bit raw depth, multiplier:256")
    submit = gr.Button("Submit")

    def on_submit(image):
        # Batch the image and match the model's input size: resize to 440x480,
        # then pad 40 px on top to reach 480x480.
        transform = transforms.ToTensor()
        image = transform(image).unsqueeze(0).to(device)
        shape = image.shape
        image = torch.nn.functional.interpolate(image, (440, 480), mode='bilinear', align_corners=True)
        image = F.pad(image, (0, 0, 40, 0))
        with torch.no_grad():
            pred = model(image)['pred_d']
        # Drop the padded rows and resize the prediction back to the original resolution.
        pred = pred[:, :, 40:, :]
        pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
        pred_d_numpy = pred.squeeze().cpu().numpy()
        colored_depth, _, _ = colorize(pred_d_numpy, cmap='gray_r')
        # Save the raw depth as a 16-bit PNG scaled by 256 for download.
        tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        raw_depth = Image.fromarray((pred_d_numpy * 256).astype('uint16'))
        raw_depth.save(tmp.name)
        return [colored_depth, tmp.name]

    submit.click(on_submit, inputs=[input_image], outputs=[depth_image, raw_file])
    examples = gr.Examples(examples=["imgs/test_img1.jpg", "imgs/test_img2.jpg", "imgs/test_img3.jpg",
                                     "imgs/test_img4.jpg", "imgs/test_img5.jpg"],
                           inputs=[input_image])

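# Builds the "Referring Segmentation" tab: the image is normalized and resized to 512x512, the
# text prompt is tokenized with the CLIP tokenizer, and the predicted mask is visualized by
# dimming the background and outlining the referred object in green.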
def create_refseg_demo(model, tokenizer, device):
    gr.Markdown("### Referring Segmentation demo")
    with gr.Row():
        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
        refseg_image = gr.Image(label="Output Mask", elem_id='img-display-output')
    input_text = gr.Textbox(label='Prompt', placeholder='Please upload your image first', lines=2)
    submit = gr.Button("Submit")

    def on_submit(image, text):
        image = np.array(image)
        # Normalize the image to [-1, 1] and resize it to the 512x512 input expected by the model.
        image_t = transforms.ToTensor()(image).unsqueeze(0).to(device)
        image_t = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(image_t)
        shape = image_t.shape
        image_t = torch.nn.functional.interpolate(image_t, (512, 512), mode='bilinear', align_corners=True)
        # Tokenize the referring expression, padded/truncated to 40 tokens.
        input_ids = tokenizer(text=text, truncation=True, max_length=40, return_length=True,
                              return_overflowing_tokens=False, padding="max_length",
                              return_tensors="pt")['input_ids'].to(device)
        with torch.no_grad():
            pred = model(image_t, input_ids)
        pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
        output_mask = pred.cpu().argmax(1).data.numpy().squeeze()
        # Dim the background and draw the contour of the predicted region in green.
        alpha = 0.65
        image[output_mask == 0] = (image[output_mask == 0] * alpha).astype(np.uint8)
        contours, _ = cv2.findContours(output_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(image, contours, -1, (0, 255, 0), 2)
        return Image.fromarray(image)

    submit.click(on_submit, inputs=[input_image, input_text], outputs=refseg_image)
    examples = gr.Examples(examples=[["imgs/test_img2.jpg", "green plant"], ["imgs/test_img3.jpg", "chair"],
                                     ["imgs/test_img4.jpg", "left green plant"], ["imgs/test_img5.jpg", "man walking on foot"],
                                     ["imgs/test_img5.jpg", "the rightest camel"]],
                           inputs=[input_image, input_text])

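# Loads the checkpoints, wires the two tabs into a gr.Blocks app, and launches it with a request
# queue. Setting upload_2_models to False skips the depth model and its tab.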
def main():
    upload_2_models = True
    opt = TestOptions().initialize()
    args = opt.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if upload_2_models:
        model = EVPDepth(args=args, caption_aggregation=True)
        model.to(device)
        model_weight = torch.load('best_model_nyu.ckpt', map_location=device)['model']
        model.load_state_dict(model_weight, strict=False)
        model.eval()

    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    model_refseg = EVPRefer()
    model_refseg.to(device)
    model_weight = torch.load('best_model_refcoco.pth', map_location=device)['model']
    model_refseg.load_state_dict(model_weight, strict=False)
    model_refseg.eval()
    del model_weight
    print('Models loaded successfully')

    title = "# EVP"
    description = """Official demo for **EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature
Refinement and Regularized Image-Text Alignment**.
EVP is a deep learning model for metric depth estimation from a single image, as well as for referring segmentation.
Please refer to our [project page](https://lavreniuk.github.io/EVP), [paper](https://arxiv.org/abs/2312.08548), or [github](https://github.com/Lavreniuk/EVP) for more details."""

    # Pass the module-level CSS so the elem_id height limits take effect.
    with gr.Blocks(css=css) as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        if upload_2_models:
            with gr.Tab("Depth Prediction"):
                create_depth_demo(model, device)
        with gr.Tab("Referring Segmentation"):
            create_refseg_demo(model_refseg, tokenizer, device)
        gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/MykolaL/evp?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
<p><img src="https://visitor-badge.glitch.me/badge?page_id=MykolaL/evp" alt="visitors"></p></center>''')

    demo.queue().launch(share=True)


if __name__ == '__main__':
    main()