|
import gradio as gr
import clip
import numpy as np
import torch
from PIL import Image
from transformers import GPT2Tokenizer
from huggingface_hub import hf_hub_download

from model import ClipCaptionModel
from predict import generate2, generate_beam

CPU = torch.device('cpu')
device = "cpu"
|
|
|
# CLIP RN50x4 backbone for image features; GPT-2 tokenizer for decoding captions
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# CapDec checkpoints trained with different noise levels
# (hf_hub_download fetches each file from the Hub once and caches it locally)
noise_level_models = {
    "0.0": hf_hub_download('johko/capdec_0', 'model.pt'),
    "0.001": hf_hub_download('johko/capdec_001', 'model.pt'),
    "0.005": hf_hub_download('johko/capdec_005', 'model.pt'),
    "0.015": hf_hub_download('johko/capdec_015', 'model.pt'),
    "0.025": hf_hub_download('johko/capdec_025', 'model.pt'),
    "0.05": hf_hub_download('johko/capdec_05', 'model.pt'),
}
|
|
|
|
|
def load_noise_level_model(noise_level: str) -> ClipCaptionModel:
    """Load the CapDec checkpoint trained with the given noise level."""
    if noise_level not in noise_level_models:
        raise ValueError(f"Unknown noise level: {noise_level}")

    model = ClipCaptionModel()
    model.load_state_dict(torch.load(noise_level_models[noise_level], map_location=CPU))
    model = model.eval()
    return model.to(device)
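# Note: this reloads the checkpoint from disk on every request. A simple
# memoization (an addition, not part of the original app) would avoid the
# repeated loads, e.g.:
#
#   from functools import lru_cache
#   load_noise_level_model = lru_cache(maxsize=None)(load_noise_level_model)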
|
|
|
def infer(input_image: np.ndarray, noise_level: str):
    use_beam_search = True

    model = load_noise_level_model(noise_level)
    pil_image = Image.fromarray(input_image)
    image = preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Encode the image with CLIP, then project it to a 40-token GPT-2 prefix
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix).reshape(1, 40, -1)
        if use_beam_search:
            generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
        else:
            generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)

    return input_image, generated_text_prefix
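# Quick sanity check without the UI (assumes the bundled example image exists):
#
#   img = np.array(Image.open("examples/flickr_ex2.jpg"))
#   _, caption = infer(img, "0.015")
#   print(caption)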
|
|
|
description="""This space is a demo for the paper [*Text-Only Training for Image Captioning using Noise-Injected CLIP*](https://arxiv.org/pdf/2211.00575.pdf) |
|
by David Nukrai, Ron Mokady and Amir Globerson. |
|
|
|
The paper is about training an Image Captioning model by only using text. It leverages the usage of noise injections at different Noise Levels, |
|
with which you can experiment as well in this demo. The text caption will change depending on the Noise Level you choose.""" |
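# For reference, the core CapDec training idea (not used at inference time in
# this demo): the decoder learns to reconstruct a caption from its CLIP *text*
# embedding perturbed with zero-mean Gaussian noise, so that CLIP *image*
# embeddings fall inside the region the decoder has seen. A minimal sketch,
# assuming a caption embedding `emb` and a noise variance `nv` (names are
# illustrative; see the paper for the exact formulation):
#
#   noisy = emb + torch.randn_like(emb) * nv ** 0.5
#   noisy = noisy / noisy.norm(dim=-1, keepdim=True)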
|
|
|
dropdown = gr.components.Dropdown(["0.0", "0.001", "0.005", "0.015", "0.025", "0.05"], value="0.015", label="Noise Level")
input_image = gr.components.Image(label="Input Image")
output_image = gr.components.Image(label="Image")
output_text = gr.components.Textbox(label="Generated Caption")
|
|
|
iface = gr.Interface(
    title="CapDec Image Captioning",
    description=description,
    fn=infer,
    inputs=[input_image, dropdown],
    outputs=[output_image, output_text],
    examples=[["examples/flickr_ex2.jpg", "0.015"], ["examples/web_ex3.jpeg", "0.015"]],
)

iface.launch()
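# When running locally, Gradio can also expose a temporary public link:
#   iface.launch(share=True)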