chiayewken
committed on
Commit
·
ded79ae
1
Parent(s):
596d336
Update qwen2-vl inference (transformers instead of swift)
Browse files
app.py
CHANGED
@@ -1,6 +1,9 @@
|
|
|
|
1 |
import hashlib
|
|
|
2 |
import os
|
3 |
from pathlib import Path
|
|
|
4 |
from typing import Iterator, Optional, List, Union
|
5 |
|
6 |
import gradio as gr
|
@@ -8,6 +11,7 @@ import spaces
|
|
8 |
import torch
|
9 |
from PIL import Image
|
10 |
from pydantic import BaseModel
|
|
|
11 |
from swift.llm import (
|
12 |
ModelType,
|
13 |
get_model_tokenizer,
|
@@ -19,6 +23,9 @@ from swift.llm import (
|
|
19 |
from transformers import (
|
20 |
Qwen2VLForConditionalGeneration,
|
21 |
PreTrainedTokenizer,
|
|
|
|
|
|
|
22 |
)
|
23 |
|
24 |
MAX_MAX_NEW_TOKENS = 2048
|
@@ -42,6 +49,19 @@ this demo is governed by the original [license](https://huggingface.co/meta-llam
|
|
42 |
"""
|
43 |
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
def save_image(image: Image.Image, folder: str) -> str:
|
46 |
image_hash = hashlib.md5(image.tobytes()).hexdigest()
|
47 |
path = Path(folder, f"{image_hash}.png")
|
@@ -139,12 +159,122 @@ class SwiftQwenModel(EvalModel):
|
|
139 |
yield delta
|
140 |
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
if not torch.cuda.is_available():
|
143 |
DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
|
144 |
|
145 |
|
146 |
if torch.cuda.is_available():
|
147 |
-
model =
|
|
|
148 |
|
149 |
|
150 |
@spaces.GPU
|
@@ -158,10 +288,8 @@ def generate(
|
|
158 |
top_k: int = 50,
|
159 |
repetition_penalty: float = 1.2,
|
160 |
) -> Iterator[str]:
|
161 |
-
|
162 |
-
|
163 |
-
outputs.append(text)
|
164 |
-
yield "".join(outputs)
|
165 |
|
166 |
|
167 |
chat_interface = gr.ChatInterface(
|
|
|
1 |
+
import base64
|
2 |
import hashlib
|
3 |
+
import io
|
4 |
import os
|
5 |
from pathlib import Path
|
6 |
+
from threading import Thread
|
7 |
from typing import Iterator, Optional, List, Union
|
8 |
|
9 |
import gradio as gr
|
|
|
11 |
import torch
|
12 |
from PIL import Image
|
13 |
from pydantic import BaseModel
|
14 |
+
from qwen_vl_utils import process_vision_info
|
15 |
from swift.llm import (
|
16 |
ModelType,
|
17 |
get_model_tokenizer,
|
|
|
23 |
from transformers import (
|
24 |
Qwen2VLForConditionalGeneration,
|
25 |
PreTrainedTokenizer,
|
26 |
+
Qwen2VLProcessor,
|
27 |
+
TextIteratorStreamer,
|
28 |
+
AutoTokenizer,
|
29 |
)
|
30 |
|
31 |
MAX_MAX_NEW_TOKENS = 2048
|
|
|
49 |
"""
|
50 |
|
51 |
|
52 |
+
def convert_image_to_text(image: Image.Image) -> str:
    """Serialize *image* as PNG and return the bytes as a base64 string.

    Args:
        image: The image to encode; it is written with ``image.save`` using
            the PNG format.

    Returns:
        The base64 encoding of the PNG bytes, decoded to a UTF-8 ``str``.
    """
    # This is also how OpenAI encodes images: https://platform.openai.com/docs/guides/vision
    with io.BytesIO() as output:
        image.save(output, format="PNG")
        data = output.getvalue()
    return base64.b64encode(data).decode("utf-8")
|
58 |
+
|
59 |
+
|
60 |
+
def convert_text_to_image(text: str) -> Image.Image:
    """Decode a base64 string back into a PIL image.

    Args:
        text: Base64-encoded image bytes, such as the string produced by
            ``convert_image_to_text``.

    Returns:
        The image opened by ``Image.open`` from the decoded bytes.
    """
    data = base64.b64decode(text.encode("utf-8"))
    return Image.open(io.BytesIO(data))
|
63 |
+
|
64 |
+
|
65 |
def save_image(image: Image.Image, folder: str) -> str:
|
66 |
image_hash = hashlib.md5(image.tobytes()).hexdigest()
|
67 |
path = Path(folder, f"{image_hash}.png")
|
|
|
159 |
yield delta
|
160 |
|
161 |
|
162 |
+
class QwenModel(EvalModel):
    """Qwen2-VL model run through Hugging Face ``transformers``.

    Weights are loaded lazily by :meth:`load`. :meth:`run` returns the whole
    reply at once, while :meth:`run_stream` yields the reply as it grows.
    """

    # Local checkpoint directory; used only when it exists on disk.
    path: str = "models/qwen"
    # Hub identifier used as the fallback for weights, and always used for
    # the tokenizer and processor.
    engine: str = "Qwen/Qwen2-VL-7B-Instruct"
    model: Optional[Qwen2VLForConditionalGeneration] = None
    processor: Optional[Qwen2VLProcessor] = None
    # Explicit ``None`` default, matching ``model`` and ``processor``: an
    # ``Optional`` annotation without a default is a *required* field in
    # pydantic v2, which would make ``QwenModel()`` fail validation.
    # NOTE(review): assumes EvalModel is a pydantic model - confirm.
    tokenizer: Optional[AutoTokenizer] = None
    device: str = "cuda"
    # Passed to ``resize_image`` for every input image.
    image_size: int = 768
    # Optional adapter path; empty string means no adapter is loaded.
    lora_path: str = ""

    def load(self):
        """Load the model, tokenizer and processor once; later calls are no-ops."""
        if self.model is not None:
            return

        path = self.path if os.path.exists(self.path) else self.engine
        print(dict(load_path=path))
        # noinspection PyTypeChecker
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            path, torch_dtype="auto", device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.engine)

        if self.lora_path:
            print("Loading LORA from", self.lora_path)
            self.model.load_adapter(self.lora_path)

        self.model = self.model.to(self.device).eval()
        self.processor = Qwen2VLProcessor.from_pretrained(self.engine)
        torch.manual_seed(0)
        torch.cuda.manual_seed_all(0)

    def make_messages(self, inputs: List[Union[str, Image.Image]]) -> List[dict]:
        """Build a single-turn user message from mixed text and image inputs.

        All string inputs are joined with blank lines into one text entry,
        which is placed after the image entries. Each image is resized with
        ``resize_image`` and embedded as a base64 data URI.
        """
        text = "\n\n".join([x for x in inputs if isinstance(x, str)])
        content = [
            dict(
                type="image",
                image=f"data:image;base64,{convert_image_to_text(resize_image(x, self.image_size))}",
            )
            for x in inputs
            if isinstance(x, Image.Image)
        ]
        content.append(dict(type="text", text=text))
        return [dict(role="user", content=content)]

    def _prepare_inputs(self, inputs: List[Union[str, Image.Image]]):
        """Turn raw inputs into processor output moved to ``self.device``."""
        messages = self.make_messages(inputs)
        prompt = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)
        # noinspection PyTypeChecker
        return self.processor(
            text=[prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)

    def run(self, inputs: List[Union[str, Image.Image]]) -> str:
        """Generate a complete reply for *inputs* and return it as one string."""
        self.load()
        model_inputs = self._prepare_inputs(inputs)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **model_inputs, max_new_tokens=self.max_output_tokens
            )

        # Slice off the prompt tokens so only the newly generated ids are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return output_text[0]

    def run_stream(self, inputs: List[Union[str, Image.Image]]) -> Iterator[str]:
        """Generate a reply for *inputs*, yielding the accumulated text so far.

        Generation runs on a worker thread that feeds a
        ``TextIteratorStreamer``; each yielded value is the full reply
        produced up to that point, not just the newest piece.
        """
        self.load()
        model_inputs = self._prepare_inputs(inputs)

        streamer = TextIteratorStreamer(
            self.tokenizer,
            timeout=10.0,
            skip_prompt=True,
            skip_special_tokens=True,
        )

        # The streamer must be handed to ``generate``; otherwise nothing is
        # ever pushed to it and iterating below fails at the timeout. The
        # token limit mirrors the one used by ``run``.
        generate_kwargs = dict(
            **model_inputs,
            streamer=streamer,
            max_new_tokens=self.max_output_tokens,
        )
        worker = Thread(target=self.model.generate, kwargs=generate_kwargs)
        worker.start()

        pieces = []
        for piece in streamer:
            pieces.append(piece)
            yield "".join(pieces)
        worker.join()
|
269 |
+
|
270 |
+
|
271 |
# The model is only created and loaded when CUDA is available; otherwise
# the page description gets a notice that the demo cannot run on CPU.
if torch.cuda.is_available():
    model = QwenModel()
    model.load()
else:
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
|
278 |
|
279 |
|
280 |
@spaces.GPU
|
|
|
288 |
top_k: int = 50,
|
289 |
repetition_penalty: float = 1.2,
|
290 |
) -> Iterator[str]:
|
291 |
+
for text in model.run_stream([message]):
|
292 |
+
yield text
|
|
|
|
|
293 |
|
294 |
|
295 |
chat_interface = gr.ChatInterface(
|