|
import cv2 |
|
import torch |
|
import os, glob |
|
import numpy as np |
|
import gradio as gr |
|
from PIL import Image |
|
from omegaconf import OmegaConf |
|
from contextlib import nullcontext |
|
from pytorch_lightning import seed_everything |
|
from os.path import join as ospj |
|
|
|
from util import * |
|
|
|
|
|
def predict(cfgs, model, sampler, batch):
    """Run one sampling pass and decode the sampled latents.

    Args:
        cfgs: Run configuration. Fields read here: ``aae_enabled``,
            ``dual_conditioner``, ``force_uc_zero_embeddings`` and
            ``detailed``.
        model: Object exposing ``conditioner.get_unconditional_conditioning``
            and ``decode_first_stage``.
        sampler: Callable sampler that also exposes ``get_init_noise``.
        batch: Input batch; it is passed through ``prepare_batch`` first.

    Returns:
        A ``(samples, samples_z)`` tuple: the decoded output mapped through
        ``(x + 1) / 2`` and clamped to ``[0, 1]``, and the raw sampler output.
    """
    # Gradient tracking is only left on when cfgs.aae_enabled is set;
    # otherwise the whole pass runs under torch.no_grad.
    grad_context = nullcontext if cfgs.aae_enabled else torch.no_grad

    with grad_context():
        batch, batch_uc_1, batch_uc_2 = prepare_batch(cfgs, batch)

        # Collect the unconditional-conditioning keyword arguments once so the
        # noise initialisation and the sampler call below share them.
        if cfgs.dual_conditioner:
            c, uc_1, uc_2 = model.conditioner.get_unconditional_conditioning(
                batch,
                batch_uc_1=batch_uc_1,
                batch_uc_2=batch_uc_2,
                force_uc_zero_embeddings=cfgs.force_uc_zero_embeddings,
            )
            uc_kwargs = {"uc_1": uc_1, "uc_2": uc_2}
        else:
            c, uc_1 = model.conditioner.get_unconditional_conditioning(
                batch,
                batch_uc=batch_uc_1,
                force_uc_zero_embeddings=cfgs.force_uc_zero_embeddings,
            )
            uc_kwargs = {"uc": uc_1}

        x = sampler.get_init_noise(cfgs, model, cond=c, batch=batch, **uc_kwargs)
        samples_z = sampler(
            model,
            x,
            cond=c,
            batch=batch,
            **uc_kwargs,
            init_step=0,
            aae_enabled=cfgs.aae_enabled,
            detailed=cfgs.detailed,
        )

        decoded = model.decode_first_stage(samples_z)
        samples = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)

        return samples, samples_z
|
|
|
|
|
def demo_predict(input_blk, text, num_samples, steps, scale, seed, show_detail):
    """Gradio callback: render ``text`` into an image and return the results.

    Reads the module-level ``cfgs`` and ``model`` and increments the
    module-level ``global_index``; ``cfgs`` is updated in place with the
    per-request settings.

    Args:
        input_blk: Mapping with ``"image"`` and ``"mask"`` arrays. Both are
            transposed with ``(2, 0, 1)``, so they must be 3-D (H, W, C).
        text: The string to render. Must not be longer than ``cfgs.seq_len``.
        num_samples: Number of images to generate (used as the batch size).
        steps: Stored in ``cfgs.steps``.
        scale: Stored in ``cfgs.scale[0]``.
        seed: Passed to ``seed_everything``.
        show_detail: Stored in ``cfgs.detailed``; when truthy the attention
            map and segmentation maps for this request are loaded from
            ``./temp``.

    Returns:
        ``(results, attn_map, seg)`` where ``results`` is a list of PIL images,
        ``attn_map`` is a PIL image or ``None``, and ``seg`` is
        ``(results[0], [(seg_map, character), ...])`` or ``None``.

    Raises:
        gr.Error: If no image/mask was supplied, or ``text`` is longer than
            ``cfgs.seq_len``.
    """
    global cfgs, global_index

    # Validate before touching any global state. Without these checks a missing
    # upload fails with a TypeError on ``input_blk["image"]`` and an over-long
    # text fails inside ``torch.zeros`` with a negative size; gr.Error surfaces
    # a readable message in the UI instead.
    if (
        input_blk is None
        or input_blk.get("image") is None
        or input_blk.get("mask") is None
    ):
        raise gr.Error("Please provide an input image and draw a mask on it first.")
    if len(text) > cfgs.seq_len:
        raise gr.Error(
            f"The text has {len(text)} characters, "
            f"but at most {cfgs.seq_len} are supported."
        )

    global_index += 1

    if num_samples > 1:
        cfgs.noise_iters = 0

    cfgs.batch_size = num_samples
    cfgs.steps = steps
    cfgs.scale[0] = scale
    cfgs.detailed = show_detail
    seed_everything(seed)

    sampler = init_sampling(cfgs)

    image = cv2.resize(input_blk["image"], (cfgs.W, cfgs.H))
    mask = cv2.resize(input_blk["mask"], (cfgs.W, cfgs.H))

    # 1 wherever the mask layer is exactly zero, 0 elsewhere
    # (presumably "zero" means not painted by the user -- confirm).
    mask = (mask == 0).astype(np.int32)

    # HWC -> CHW; the image is mapped through x / 127.5 - 1.
    image = torch.from_numpy(image.transpose(2, 0, 1)).to(dtype=torch.float32) / 127.5 - 1.0
    # Collapse the mask channels into a single channel by averaging.
    mask = torch.from_numpy(mask.transpose(2, 0, 1)).to(dtype=torch.float32).mean(dim=0, keepdim=True)
    masked = image * mask
    mask = 1 - mask

    # One entry per sequence slot: 1 for each character of the text, 0 for the
    # remaining slots up to cfgs.seq_len.
    seg_mask = torch.cat((torch.ones(len(text)), torch.zeros(cfgs.seq_len - len(text))))

    txt = f"\"{text}\""
    original_size_as_tuple = torch.tensor((cfgs.H, cfgs.W))
    crop_coords_top_left = torch.tensor((0, 0))
    target_size_as_tuple = torch.tensor((cfgs.H, cfgs.W))

    def repeat(tensor):
        # Add a leading batch axis and repeat the tensor num_samples times.
        return torch.tile(tensor[None], (num_samples,) + (1,) * tensor.dim())

    batch = {
        "image": repeat(image),
        "mask": repeat(mask),
        "masked": repeat(masked),
        "seg_mask": repeat(seg_mask),
        "label": [text] * num_samples,
        "txt": [txt] * num_samples,
        "original_size_as_tuple": repeat(original_size_as_tuple),
        "crop_coords_top_left": repeat(crop_coords_top_left),
        "target_size_as_tuple": repeat(target_size_as_tuple),
        # Every sample of this request carries the same request index as name.
        "name": [str(global_index)] * num_samples,
    }

    samples, _ = predict(cfgs, model, sampler, batch)
    # NCHW -> NHWC, scaled by 255 for conversion to 8-bit images.
    samples = samples.cpu().numpy().transpose(0, 2, 3, 1) * 255
    results = [Image.fromarray(sample.astype(np.uint8)) for sample in samples]

    if not cfgs.detailed:
        return results, None, None

    # These files are keyed by the request index; presumably they are written
    # during sampling when cfgs.detailed is set -- confirm against the sampler.
    attn_map = Image.open(f"./temp/attn_map/attn_map_{global_index}.png")
    seg_maps = np.load(f"./temp/seg_map/seg_{global_index}.npy")

    # Pair the i-th segmentation map with the i-th character of the text.
    sections = [
        (cv2.resize(seg_map, (cfgs.W, cfgs.H)), text[i])
        for i, seg_map in enumerate(seg_maps)
    ]
    seg = (results[0], sections)

    return results, attn_map, seg
|
|
|
|
|
if __name__ == "__main__":

    # Module-level state: demo_predict reads cfgs and model and increments
    # global_index on every request.
    cfgs = OmegaConf.load("./configs/demo.yaml")
    model = init_model(cfgs)
    global_index = 0

    block = gr.Blocks().queue()

    with block:

        # Page header.
        with gr.Row():
            gr.HTML(
                """
                <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
                <h1 style="font-weight: 600; font-size: 2rem; margin: 0rem">
                    UDiffText: A Unified Framework for High-quality Text Synthesis in Arbitrary Images via Character-aware Diffusion Models
                </h1>
                <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                    [<a href="" style="color:blue;">arXiv</a>]
                    [<a href="" style="color:blue;">Code</a>]
                    [<a href="" style="color:blue;">ProjectPage</a>]
                </h3>
                <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                    Our proposed UDiffText is capable of synthesizing accurate and harmonious text in either synthetic or real-word images, thus can be applied to tasks like scene text editing (a), arbitrary text generation (b) and accurate T2I generation (c)
                </h2>
                <div align=center><img src="file/demo/teaser.png" alt="UDiffText" width="80%"></div>
                </div>
                """
            )

        with gr.Row():

            # Left column: inputs and sampling options.
            with gr.Column():

                input_blk = gr.Image(source='upload', tool='sketch', type="numpy", label="Input", height=512)
                text = gr.Textbox(label="Text to render:", info="the text you want to render at the masked region")
                run_button = gr.Button(variant="primary")

                with gr.Accordion("Advanced options", open=False):

                    num_samples = gr.Slider(
                        label="Images",
                        info="number of generated images, locked as 1",
                        minimum=1,
                        maximum=1,
                        value=1,
                        step=1,
                    )
                    steps = gr.Slider(
                        label="Steps",
                        info="denoising sampling steps",
                        minimum=1,
                        maximum=200,
                        value=50,
                        step=1,
                    )
                    scale = gr.Slider(
                        label="Guidance Scale",
                        info="the scale of classifier-free guidance (CFG)",
                        minimum=0.0,
                        maximum=10.0,
                        value=4.0,
                        step=0.1,
                    )
                    seed = gr.Slider(
                        label="Seed",
                        info="random seed for noise initialization",
                        minimum=0,
                        maximum=2147483647,
                        step=1,
                        randomize=True,
                    )
                    show_detail = gr.Checkbox(
                        label="Show Detail",
                        info="show the additional visualization results",
                        value=True,
                    )

            # Right column: generated images and optional visualizations.
            with gr.Column():

                gallery = gr.Gallery(label="Output", height=512, preview=True)

                with gr.Accordion("Visualization results", open=True):

                    with gr.Tab(label="Attention Maps"):
                        gr.Markdown("### Attention maps for each character (extracted from middle blocks at intermediate sampling step):")
                        attn_map = gr.Image(show_label=False, show_download_button=False)

                    with gr.Tab(label="Segmentation Maps"):
                        gr.Markdown("### Character-level segmentation maps (using upscaled attention maps):")
                        seg_map = gr.AnnotatedImage(height=384, show_label=False, show_download_button=False)

        # Each example pairs a file under ./demo/examples with a label taken
        # from its file name: the part before the first "." and then before
        # the first "_".
        examples = [
            [path, os.path.basename(path).partition(".")[0].partition("_")[0]]
            for path in sorted(glob.glob(ospj("./demo/examples", "*")))
        ]

        gr.Markdown("## Examples:")
        gr.Examples(examples=examples, inputs=[input_blk, text])

        run_button.click(
            fn=demo_predict,
            inputs=[input_blk, text, num_samples, steps, scale, seed, show_detail],
            outputs=[gallery, attn_map, seg_map],
        )

    block.launch()