# Repository page residue (not part of the program): HeliosZhao · update groundedsam · afe31ff · raw · history blame · 11.9 kB
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
import warnings
# os.system("cd Make-A-Protagonist/experts/GroundedSAM")
# os.system("python -m pip install -e segment_anything")
# os.system("python -m pip install -e GroundingDINO")
# os.system("cd ../../..")
# os.system("python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/GroundingDINO")
# os.system("pip install --upgrade diffusers[torch]")
warnings.filterwarnings("ignore")
import gradio as gr
from inference import InferencePipeline
class InferenceUtil:
    """Fetch the metadata shown next to the model selector in the demo UI."""

    def __init__(self, hf_token: str | None):
        """Remember the access token used for model-card lookups.

        Args:
            hf_token: Token forwarded to ``InferencePipeline.get_model_card``;
                may be ``None``.
        """
        self.hf_token = hf_token

    def load_model_info(self, model_id: str) -> tuple[str, str]:
        """Return the protagonist and training prompt recorded for a model.

        Args:
            model_id: Identifier passed to
                ``InferencePipeline.get_model_card``.

        Returns:
            ``(protagonist, training_prompt)`` read from the attributes of the
            card's ``data`` object. An attribute that is absent yields ``''``;
            if the card cannot be fetched at all, ``('', '')`` is returned.
        """
        import logging

        # TODO: the model card lives in the README of the Hugging Face repo;
        # the expected card format still has to be written down.
        try:
            card = InferencePipeline.get_model_card(model_id, self.hf_token)
        except Exception as exc:
            # Best effort: the UI must keep working without card metadata, so
            # fall back to empty fields, but leave a trace of what went wrong.
            logging.getLogger(__name__).warning(
                'Could not load model card for %r: %s', model_id, exc)
            return '', ''

        card_data = card.data
        protagonist = getattr(card_data, 'protagonist', '')
        training_prompt = getattr(card_data, 'training_prompt', '')
        return protagonist, training_prompt
# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get('HF_TOKEN')

# Objects shared by the UI callbacks: `pipe.run` produces the video and
# `app.load_model_info` fills the read-only model-info boxes.
pipe = InferencePipeline(HF_TOKEN)
app = InferenceUtil(HF_TOKEN)
# Build the demo page: header, input controls, result video, examples and
# event wiring.
# NOTE(review): the container nesting below (which widgets sit inside which
# Box / Row / Accordion) was reconstructed from a copy of this file without
# indentation -- confirm it against the rendered layout.
with gr.Blocks(css='style.css') as demo:
    # Static page header: title, authors, affiliations and project links.
    gr.HTML(
        """
<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
<h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
Make-A-Protagonist:
<br>
Generic Video Editing with An Ensemble of Experts
</h1>
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<a href="https://yuyangzhao.com">Yuyang Zhao</a><sup>1</sup>
<a href="https://xieenze.github.io/">Enze Xie</a><sup>2</sup>
<a href="https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ&hl=en">Lanqing Hong</a><sup>2</sup>
<a href="https://scholar.google.com.sg/citations?user=XboZC1AAAAAJ&hl=en">Zhenguo Li</a><sup>2</sup>
<a href="https://www.comp.nus.edu.sg/~leegh/">Gim Hee Lee</a><sup>1</sup>
</h2>
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<sup>1 </sup>National University of Singapore
<sup>2 </sup>Huawei Noah's Ark Lab
</h2>
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<span class="link-block">
[<a href="https://arxiv.org/abs/2305.08850" target="_blank"
class="external-link ">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>]
</span>
<!-- Github link -->
<span class="link-block">
[<a href="https://github.com/Make-A-Protagonist/Make-A-Protagonist" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>]
</span>
<!-- Github link -->
<span class="link-block">
[<a href="https://make-a-protagonist.github.io/" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Homepage</span>
</a>]
</span>
</h2>
<h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
TL;DR: The first framework for generic video editing with both visual and textual clues.
</h2>
</div>
""")

    with gr.Row():
        # Left column: every input control plus the Generate button.
        with gr.Column():
            with gr.Box():
                model_id = gr.Dropdown(
                    label='Model ID',
                    choices=[
                        'Make-A-Protagonist/ikun',
                        'Make-A-Protagonist/huaqiang',
                        'Make-A-Protagonist/yanzi',
                        'Make-A-Protagonist/car-turn',
                    ],
                    value='Make-A-Protagonist/ikun')
                with gr.Row():
                    # Read-only info boxes, refreshed by `model_id.change`
                    # below. Despite its name, the first one displays the
                    # model's protagonist (see its label).
                    base_model_used_for_training = gr.Textbox(
                        label='Protagonist',
                        interactive=False,
                        value='man')
                    prompt_used_for_training = gr.Textbox(
                        label='Training prompt',
                        interactive=False,
                        value='A man is playing basketball')
            with gr.Box():
                ref_image = gr.Image(label='Reference Image',
                                     type='pil',
                                     visible=True).style(height="auto")
                ref_pro_prompt = gr.Textbox(
                    label='Reference Image Protagonist Prompt',
                    max_lines=1,
                    placeholder='Example: "man"')
            prompt = gr.Textbox(label='Prompt',
                                max_lines=1,
                                placeholder='Example: "A panda is surfing"')
            video_length = gr.Slider(label='Video length',
                                     minimum=4,
                                     maximum=8,
                                     step=1,
                                     value=8)
            fps = gr.Slider(label='FPS',
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=4)
            seed = gr.Slider(label='Seed',
                             minimum=0,
                             maximum=100000,
                             step=1,
                             value=0)
            with gr.Accordion('ControlNet Parameters', open=True):
                control_pose = gr.Slider(label='Pose',
                                         minimum=0,
                                         maximum=1,
                                         step=0.1,
                                         value=0.5)
                control_depth = gr.Slider(label='Depth',
                                          minimum=0,
                                          maximum=1,
                                          step=0.1,
                                          value=0.5)
            with gr.Accordion('Editing Function', open=True):
                with gr.Row():
                    # Both sliders only take the values 0 and 1 (step=1).
                    source_pro = gr.Slider(label='Source Protagonist',
                                           minimum=0,
                                           maximum=1,
                                           step=1,
                                           value=0)
                    source_bg = gr.Slider(label='Source Background',
                                          minimum=0,
                                          maximum=1,
                                          step=1,
                                          value=0)
            with gr.Accordion('Other Parameters', open=False):
                num_steps = gr.Slider(label='Number of Steps',
                                      minimum=0,
                                      maximum=100,
                                      step=1,
                                      value=50)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.5)
                noise_level = gr.Slider(label='Noise Level',
                                        minimum=0,
                                        maximum=999,
                                        step=1,
                                        value=0)
            run_button = gr.Button('Generate')
            gr.Markdown('''
- It takes a few minutes to download model first.
- It takes one minute to load model and conduct DDIM inverse
''')
        # Right column: the generated video.
        with gr.Column():
            result = gr.Video(label='Result')

    # Components handed to `pipe.run`, in argument order. Defined once so the
    # example table and the event handlers below cannot drift apart.
    inputs = [
        model_id,
        prompt,
        video_length,
        fps,
        seed,
        num_steps,
        guidance_scale,
        ref_image,
        ref_pro_prompt,
        noise_level,
        control_pose,
        control_depth,
        source_pro,
        source_bg,
    ]

    with gr.Row():
        # Each example row lists its values in the same order as `inputs`.
        examples = [
            [
                'Make-A-Protagonist/ikun',
                'A man is playing basketball on the beach, anime style.',
                8, 4, 33, 50, 12.5,  # video_length, fps, seed, num_steps, guidance_scale
                'data/ikun/reference_images/zhongli.jpg',
                'man',
                0, 0.5, 0.5, 0, 0,  # noise_level, control_pose, control_depth, source_pro, source_bg
            ],
            [
                'Make-A-Protagonist/huaqiang',
                'Elon Musk walking down the street.',
                8, 4, 33, 50, 12.5,
                'data/huaqiang/reference_images/musk.jpg',
                'man',
                0, 0.5, 0.5, 0, 1,
            ],
            [
                'Make-A-Protagonist/yanzi',
                'A panda walking down the snowy street.',
                8, 4, 33, 50, 12.5,
                'data/yanzi/reference_images/panda.jpeg',
                'panda',
                0, 0.5, 0.5, 0, 0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A car moving in the desert.',
                8, 4, 33, 50, 12.5,
                'data/car-turn/reference_images/audi.jpeg',
                'car',
                0, 0.0, 1.0, 0, 0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A Suzuki Jimny driving down a mountain road in the rain.',
                8, 4, 33, 50, 12.5,
                'data/car-turn/images/0000.jpg',
                'car',
                0, 0.0, 1.0, 1, 0,
            ],
        ]
        # Example caching is enabled only when the SYSTEM environment
        # variable equals 'spaces'.
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=os.getenv('SYSTEM') == 'spaces')

    # Selecting another model refreshes the two read-only info boxes.
    model_id.change(fn=app.load_model_info,
                    inputs=model_id,
                    outputs=[
                        base_model_used_for_training,
                        prompt_used_for_training,
                    ])

    # Submitting the prompt box and clicking Generate both call `pipe.run`.
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

# Enable request queuing, then start the server with a public share link.
demo.queue().launch(share=True)