DongfuJiang
committed on
Commit
•
a915791
1
Parent(s):
0afd9ef
update
Browse files- README.md +1 -1
- app_idefics2.py +224 -0
- app.py → app_mllava.py +0 -0
- requirements.txt +1 -1
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
|
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.24.0
|
8 |
-
app_file:
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
short_description: Multimodal Language Model
|
|
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.24.0
|
8 |
+
app_file: app_idefics2.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
short_description: Multimodal Language Model
|
app_idefics2.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import spaces
|
3 |
+
import time
|
4 |
+
from PIL import Image
|
5 |
+
from transformers import AutoProcessor, AutoModelForVision2Seq
|
6 |
+
from transformers.image_utils import load_image
|
7 |
+
from typing import List
|
8 |
+
# Processor and model for the TIGER-Lab/Mantis-8B-Idefics2 checkpoint, created
# once when the module is imported. generate_stream() reads both as globals.
processor = AutoProcessor.from_pretrained("TIGER-Lab/Mantis-8B-Idefics2")
model = AutoModelForVision2Seq.from_pretrained("TIGER-Lab/Mantis-8B-Idefics2")
|
10 |
+
|
11 |
+
@spaces.GPU
def generate_stream(text:str, images:List[Image.Image], history: List[dict], **kwargs):
    """Stream the model's reply for a chat history.

    Args:
        text: Unused. The prompt is built entirely from ``history``; the
            parameter is kept so existing callers keep working.
        images: Images to hand to the processor together with the prompt.
            An empty/falsy value is passed on as ``None``.
        history: Chat messages given to ``processor.apply_chat_template``.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.
            A ``streamer`` entry, if present, is replaced by the streamer
            created here.

    Yields:
        The accumulated decoded text after each chunk produced by the
        streamer (every yield contains the full reply so far).
    """
    global processor, model
    # Function-scope imports, as in the original implementation.
    from threading import Thread
    from transformers import TextIteratorStreamer

    model = model.to("cuda")
    if not images:
        images = None

    prompt = processor.apply_chat_template(history, add_generation_prompt=True)
    print("Prompt: ")
    print(prompt)
    print("Images: ")
    print(images)
    inputs = processor(text=prompt, images=images, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # Same precedence as before: caller kwargs override processor outputs and
    # our streamer overrides any caller-supplied one.
    generate_kwargs = {**inputs, **kwargs, "streamer": streamer}

    # generate() blocks, so it runs in a worker thread while this generator
    # drains the streamer.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()
    output = ""
    for chunk in streamer:
        output += chunk
        yield output
    # Reap the worker once the stream is drained so no generation thread
    # outlives this call.
    thread.join()
|
37 |
+
|
38 |
+
def enable_next_image(uploaded_images, image):
    """Append ``image`` to ``uploaded_images`` (in place) and lock the textbox.

    Returns:
        A tuple of the same ``uploaded_images`` list and a cleared,
        non-interactive ``gr.MultimodalTextbox``.
    """
    uploaded_images.append(image)
    locked_textbox = gr.MultimodalTextbox(value=None, interactive=False)
    return uploaded_images, locked_textbox
|
41 |
+
|
42 |
+
def add_message(history, message):
    """Append a submitted multimodal message to the chat history in place.

    Each entry of ``message["files"]`` becomes its own ``[(file,), None]``
    row, followed by one ``[text, None]`` row when ``message["text"]`` is
    non-empty.

    Returns:
        The updated ``history`` and a cleared ``gr.MultimodalTextbox``.
    """
    uploaded = message["files"]
    if uploaded:
        # One history row per uploaded file, wrapped in a 1-tuple.
        history.extend([(path,), None] for path in uploaded)

    typed = message["text"]
    if typed:
        history.append([typed, None])

    cleared_textbox = gr.MultimodalTextbox(value=None)
    return history, cleared_textbox
|
49 |
+
|
50 |
+
def print_like_dislike(x: gr.LikeData):
    """Print the index, value and liked flag of a chatbot like/dislike event."""
    feedback = (x.index, x.value, x.liked)
    print(*feedback)
|
52 |
+
|
53 |
+
|
54 |
+
def get_chat_images(history):
    """Load every image referenced by the chat history, in history order.

    A history row whose first element is a tuple is treated as an image row;
    the tuple's first item is passed to ``load_image``.

    Returns:
        A list with one loaded image per image row.
    """
    return [
        load_image(entry[0][0])
        for entry in history
        if isinstance(entry[0], tuple)
    ]
|
61 |
+
|
62 |
+
def get_chat_history(history):
    """Convert chatbot history rows into chat-template messages plus images.

    Each row is ``[user_part, bot_part]``. Rows whose ``user_part`` is a
    string become a ``"user"`` message; the text is split on ``<image>`` and
    an ``{"type": "image"}`` entry is placed exactly where each placeholder
    was. Rows whose ``user_part`` is a tuple (image uploads) add no message;
    their images are returned via ``get_chat_images``. When a text row already
    has a reply, it is added as an ``"assistant"`` message so that earlier
    turns stay in the prompt.

    Args:
        history: List of ``[user_part, bot_part]`` rows.

    Returns:
        ``(messages, images)``: the message dicts and the images loaded from
        the history.

    Raises:
        gr.Error: If the text references more ``<image>`` placeholders than
            there are images available.
    """
    images = get_chat_images(history)
    messages = []
    cur_image_idx = 0
    for message in history:
        user_part = message[0]
        if not isinstance(user_part, str):
            # Image rows carry no text; their images are already in `images`.
            continue

        num_images = user_part.count("<image>")
        # Explicit raise instead of `assert`, which is stripped under -O.
        if num_images + cur_image_idx > len(images):
            raise gr.Error("Number of images uploaded is less than the number of <image> placeholders in the text. Please upload more images.")

        content = []
        if num_images > 0:
            # split() yields num_images + 1 segments, so a placeholder follows
            # every segment except the last. Emitting one after the trailing
            # segment too would consume images that belong to later messages.
            for seg_idx, sub_text in enumerate(user_part.split("<image>")):
                stripped = sub_text.strip()
                if stripped:
                    content.append({"type": "text", "text": stripped})
                if seg_idx < num_images:
                    content.append({"type": "image"})
                    cur_image_idx += 1
        else:
            content.append({"type": "text", "text": user_part})
        messages.append({"role": "user", "content": content})

        # Keep earlier replies so multi-turn prompts alternate user/assistant.
        if message[1]:
            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": message[1]}],
                }
            )
    return messages, images
|
89 |
+
|
90 |
+
|
91 |
+
def bot(history):
    """Generate the reply for the pending user turn, streaming into history.

    Walks the history backwards until a row that already has a reply,
    collecting the pending text and image items. The number of ``<image>``
    placeholders in the text is then reconciled with the number of pending
    images (missing placeholders are prepended, surplus ones are removed from
    the end), the last row's text is updated accordingly, and the model output
    is streamed into ``history[-1][1]``.

    Yields:
        ``history`` after each streamed update of the last row's reply.

    Raises:
        gr.Error: If the pending turn contains no text.
    """
    pending_text = ""
    pending_images = []
    for entry in reversed(history):
        if entry[1]:
            # First row (from the end) that already has a reply: stop.
            break
        user_part = entry[0]
        if isinstance(user_part, str):
            pending_text = user_part + " " + pending_text
        elif isinstance(user_part, tuple):
            pending_images.extend(user_part)
    pending_text = pending_text.strip()
    # Items were gathered newest-first; restore chronological order.
    pending_images.reverse()

    if not pending_text:
        raise gr.Error("Please enter a message")

    placeholder_count = pending_text.count("<image>")
    image_count = len(pending_images)
    if placeholder_count < image_count:
        gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
        pending_text = "<image> " * (image_count - placeholder_count) + pending_text
        history[-1][0] = pending_text
    elif placeholder_count > image_count:
        gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
        surplus = placeholder_count - image_count
        # Reverse the string so str.replace drops the *last* `surplus` placeholders.
        pending_text = pending_text[::-1].replace("<image>"[::-1], "", surplus)[::-1]
        history[-1][0] = pending_text

    chat_history, chat_images = get_chat_history(history)

    stream = generate_stream(
        None,
        chat_images,
        chat_history,
        max_new_tokens=4096,
        num_beams=1,
        do_sample=False,
    )
    for partial_reply in stream:
        history[-1][1] = partial_reply
        time.sleep(0.05)
        yield history
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
def build_demo():
    """Assemble the Gradio Blocks UI for the Mantis chat demo.

    The layout contains introductory Markdown, a chatbot, a multimodal
    textbox, Send/Clear buttons, example prompts and a citation block.
    Submitting the textbox or clicking "Send" first runs ``add_message`` and
    then ``bot``.

    Returns:
        The ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks() as demo:

        gr.Markdown(""" # Mantis
Mantis is a multimodal conversational AI model that can chat with users about images and text. It's optimized for multi-image reasoning, where interleaved text and images can be used to generate responses.

### [Paper](https://arxiv.org/abs/2405.01483) | [Github](https://github.com/TIGER-AI-Lab/Mantis) | [Models](https://huggingface.co/collections/TIGER-Lab/mantis-6619b0834594c878cdb1d6e4) | [Dataset](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct) | [Website](https://tiger-ai-lab.github.io/Mantis/)
""")

        gr.Markdown("""## Chat with Mantis
Mantis supports interleaved text-image input format, where you can simply use the placeholder `<image>` to indicate the position of uploaded images.
The model is optimized for multi-image reasoning, while preserving the ability to chat about text and images in a single conversation.
(The model currently serving is [🤗 TIGER-Lab/Mantis-8B-Idefics2](https://huggingface.co/TIGER-Lab/Mantis-8B-Idefics2))
""")

        chatbot = gr.Chatbot(line_breaks=True)
        chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload images. Please use <image> to indicate the position of uploaded images", show_label=True)

        chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])

        # Disabled "Advanced options" controls, kept for reference. bot()
        # currently uses fixed generation settings and does not read them.
        #
        # with gr.Accordion(label='Advanced options', open=False):
        #     temperature = gr.Slider(
        #         label='Temperature',
        #         minimum=0.1,
        #         maximum=2.0,
        #         step=0.1,
        #         value=0.2,
        #         interactive=True
        #     )
        #     top_p = gr.Slider(
        #         label='Top-p',
        #         minimum=0.05,
        #         maximum=1.0,
        #         step=0.05,
        #         value=1.0,
        #         interactive=True
        #     )

        # Only generate a reply when add_message completed successfully.
        bot_msg = chat_msg.success(bot, chatbot, chatbot, api_name="bot_response")

        chatbot.like(print_like_dislike, None, None)

        with gr.Row():
            send_button = gr.Button("Send")
            clear_button = gr.ClearButton([chatbot, chat_input])

        # The Send button mirrors the textbox submit flow.
        # NOTE(review): this reuses api_name="bot_response" from the submit
        # chain above; confirm how Gradio resolves the duplicate name.
        send_button.click(
            add_message, [chatbot, chat_input], [chatbot, chat_input]
        ).then(
            bot, chatbot, chatbot, api_name="bot_response"
        )

        gr.Examples(
            examples=[
                {
                    "text": "<image> <image> <image> Which image shows a different mood of character from the others?",
                    "files": ["./examples/image12.jpg", "./examples/image13.jpg", "./examples/image14.jpg"]
                },
                {
                    "text": "<image> <image> What's the difference between these two images? Please describe as much as you can.",
                    "files": ["./examples/image1.jpg", "./examples/image2.jpg"]
                },
                {
                    "text": "<image> <image> Which image shows an older dog?",
                    "files": ["./examples/image8.jpg", "./examples/image9.jpg"]
                },
                {
                    "text": "Write a description for the given image sequence in a single paragraph, what is happening in this episode?",
                    "files": ["./examples/image3.jpg", "./examples/image4.jpg", "./examples/image5.jpg", "./examples/image6.jpg", "./examples/image7.jpg"]
                },
                {
                    "text": "<image> <image> How many dices are there in image 1 and image 2 respectively?",
                    "files": ["./examples/image10.jpg", "./examples/image15.jpg"]
                },
            ],
            inputs=[chat_input],
        )

        gr.Markdown("""
## Citation
```
@article{jiang2024mantis,
  title={MANTIS: Interleaved Multi-Image Instruction Tuning},
  author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
  journal={arXiv preprint arXiv:2405.01483},
  year={2024}
}
```""")
    return demo
|
220 |
+
|
221 |
+
|
222 |
+
# Build and launch the demo only when this file is run as a script.
if __name__ == "__main__":
    demo = build_demo()
    demo.launch()
|
app.py → app_mllava.py
RENAMED
File without changes
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
torch
|
2 |
-
transformers
|
3 |
Pillow
|
4 |
gradio
|
5 |
spaces
|
|
|
1 |
torch
|
2 |
+
transformers>=4.41.0
|
3 |
Pillow
|
4 |
gradio
|
5 |
spaces
|