chiayewken committed
Commit ded79ae · 1 Parent(s): 596d336

Update qwen2-vl inference (transformers instead of swift)

Files changed (1)
  1. app.py +133 -5
app.py CHANGED
@@ -1,6 +1,9 @@
+import base64
 import hashlib
+import io
 import os
 from pathlib import Path
+from threading import Thread
 from typing import Iterator, Optional, List, Union
 
 import gradio as gr
@@ -8,6 +11,7 @@ import spaces
 import torch
 from PIL import Image
 from pydantic import BaseModel
+from qwen_vl_utils import process_vision_info
 from swift.llm import (
     ModelType,
     get_model_tokenizer,
@@ -19,6 +23,9 @@ from swift.llm import (
 from transformers import (
     Qwen2VLForConditionalGeneration,
     PreTrainedTokenizer,
+    Qwen2VLProcessor,
+    TextIteratorStreamer,
+    AutoTokenizer,
 )
 
 MAX_MAX_NEW_TOKENS = 2048
@@ -42,6 +49,19 @@ this demo is governed by the original [license](https://huggingface.co/meta-llam
 """
 
 
+def convert_image_to_text(image: Image) -> str:
+    # This is also how OpenAI encodes images: https://platform.openai.com/docs/guides/vision
+    with io.BytesIO() as output:
+        image.save(output, format="PNG")
+        data = output.getvalue()
+    return base64.b64encode(data).decode("utf-8")
+
+
+def convert_text_to_image(text: str) -> Image:
+    data = base64.b64decode(text.encode("utf-8"))
+    return Image.open(io.BytesIO(data))
+
+
 def save_image(image: Image.Image, folder: str) -> str:
     image_hash = hashlib.md5(image.tobytes()).hexdigest()
     path = Path(folder, f"{image_hash}.png")
@@ -139,12 +159,122 @@ class SwiftQwenModel(EvalModel):
             yield delta
 
 
+class QwenModel(EvalModel):
+    path: str = "models/qwen"
+    engine: str = "Qwen/Qwen2-VL-7B-Instruct"
+    model: Optional[Qwen2VLForConditionalGeneration] = None
+    processor: Optional[Qwen2VLProcessor] = None
+    tokenizer: Optional[AutoTokenizer]
+    device: str = "cuda"
+    image_size: int = 768
+    lora_path: str = ""
+
+    def load(self):
+        if self.model is None:
+            path = self.path if os.path.exists(self.path) else self.engine
+            print(dict(load_path=path))
+            # noinspection PyTypeChecker
+            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+                path, torch_dtype="auto", device_map="auto"
+            )
+            self.tokenizer = AutoTokenizer.from_pretrained(self.engine)
+
+            if self.lora_path:
+                print("Loading LORA from", self.lora_path)
+                self.model.load_adapter(self.lora_path)
+
+            self.model = self.model.to(self.device).eval()
+            self.processor = Qwen2VLProcessor.from_pretrained(self.engine)
+            torch.manual_seed(0)
+            torch.cuda.manual_seed_all(0)
+
+    def make_messages(self, inputs: List[Union[str, Image.Image]]) -> List[dict]:
+        text = "\n\n".join([x for x in inputs if isinstance(x, str)])
+        content = [
+            dict(
+                type="image",
+                image=f"data:image;base64,{convert_image_to_text(resize_image(x, self.image_size))}",
+            )
+            for x in inputs
+            if isinstance(x, Image.Image)
+        ]
+        content.append(dict(type="text", text=text))
+        return [dict(role="user", content=content)]
+
+    def run(self, inputs: List[Union[str, Image.Image]]) -> str:
+        self.load()
+        messages = self.make_messages(inputs)
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        image_inputs, video_inputs = process_vision_info(messages)
+        # noinspection PyTypeChecker
+        model_inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        ).to(self.device)
+
+        with torch.inference_mode():
+            generated_ids = self.model.generate(
+                **model_inputs, max_new_tokens=self.max_output_tokens
+            )
+
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0]
+
+    def run_stream(self, inputs: List[Union[str, Image.Image]]) -> Iterator[str]:
+        self.load()
+        messages = self.make_messages(inputs)
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        image_inputs, video_inputs = process_vision_info(messages)
+        # noinspection PyTypeChecker
+        model_inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        ).to(self.device)
+
+        streamer = TextIteratorStreamer(
+            self.tokenizer,
+            timeout=10.0,
+            skip_prompt=True,
+            skip_special_tokens=True,
+        )
+
+        generate_kwargs = dict(**model_inputs, streamer=streamer, max_new_tokens=self.max_output_tokens)
+        t = Thread(target=self.model.generate, kwargs=generate_kwargs)
+        t.start()
+
+        outputs = []
+        for text in streamer:
+            outputs.append(text)
+            yield "".join(outputs)
+
+
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 
 
 if torch.cuda.is_available():
-    model = SwiftQwenModel()
+    model = QwenModel()
+    model.load()
 
 
 @spaces.GPU
@@ -158,10 +288,8 @@ def generate(
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
-    outputs = []
-    for text in model.run_stream(inputs=[message]):
-        outputs.append(text)
-        yield "".join(outputs)
+    for text in model.run_stream([message]):
+        yield text
 
 
 chat_interface = gr.ChatInterface(
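
For reference, a minimal standalone sketch of the plain-transformers inference path this commit switches to (the non-streaming QwenModel.run case). The model id and API calls mirror the diff above; the image path "demo.png", the prompt, and max_new_tokens=512 are placeholder assumptions, and qwen_vl_utils must be installed.

# Minimal sketch of the transformers-based Qwen2-VL path (assumptions: "demo.png" exists locally).
import torch
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Same message format as QwenModel.make_messages: image entries may be a local path,
# a URL, or a base64 data URI, followed by a single text entry.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image="demo.png"),
            dict(type="text", text="Describe this image."),
        ],
    )
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
model_inputs = processor(
    text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
).to(model.device)

with torch.inference_mode():
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)

# Trim the prompt tokens before decoding, as QwenModel.run does.
trimmed = [out[len(inp):] for inp, out in zip(model_inputs.input_ids, generated_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])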
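
And a sketch of the streaming variant behind QwenModel.run_stream: generation runs on a background thread while decoded text is read from a TextIteratorStreamer. It reuses model and model_inputs from the previous sketch; note that the streamer (and a token budget) must be passed through to generate for text to be emitted as it is produced.

# Streaming sketch (continues from the example above; max_new_tokens=512 is again a placeholder).
from threading import Thread
from transformers import AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
streamer = TextIteratorStreamer(
    tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
)

# generate must receive the streamer as a keyword argument.
generate_kwargs = dict(**model_inputs, streamer=streamer, max_new_tokens=512)
Thread(target=model.generate, kwargs=generate_kwargs).start()

outputs = []
for chunk in streamer:
    outputs.append(chunk)
    print("".join(outputs))  # the Gradio generate() callback yields this accumulated string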