Paul Dufour
committed on
Commit
·
e3b994d
1
Parent(s):
9c0577e
update non-slim versions of models
Browse files- Makefile +24 -10
- infer.py +120 -0
- onnx/QwenVL_A.onnx +2 -2
- onnx/QwenVL_A.onnx.data +2 -2
- onnx/QwenVL_B.onnx +2 -2
- onnx/QwenVL_C.onnx +2 -2
- onnx/QwenVL_D.onnx +2 -2
Makefile
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
SHELL := /bin/bash
|
2 |
|
3 |
# Configuration variables
|
4 |
-
NATIVE_ANDROID = ../Native-LLM-for-Android
|
5 |
QWEN_VL_DIR = $(NATIVE_ANDROID)/Export_ONNX/QwenVL
|
6 |
ONNX_SRC_DIR = $(QWEN_VL_DIR)/onnx
|
7 |
ONNX_DEST_DIR = $(QWEN_VL_DIR)/onnx-dist
|
@@ -60,11 +60,12 @@ export-merged-source-models-second-pass:
|
|
60 |
$(NATIVE_PYTHON) -c 'import onnx, os, sys; \
|
61 |
src = """'"$$item"'"""; \
|
62 |
total_size = os.path.getsize(src); \
|
|
|
63 |
total_size += os.path.getsize(src + ".data") if os.path.exists(src + ".data") else 0; \
|
64 |
needs_external = total_size > 2e9; \
|
65 |
onnx.save_model( \
|
66 |
onnx.load(src), \
|
67 |
-
|
68 |
save_as_external_data=needs_external, \
|
69 |
all_tensors_to_one_file=True, \
|
70 |
location=(os.path.basename(src) + ".data") if needs_external else None \
|
@@ -75,7 +76,7 @@ export-merged-source-models-second-pass:
|
|
75 |
echo "✅ Done second models"
|
76 |
|
77 |
|
78 |
-
all-in-one: export quantize clean-large-files
|
79 |
@echo "✨ All done! ONNX models exported, slimmed, quantized and fixed"
|
80 |
|
81 |
export: export-abcd export-e
|
@@ -93,9 +94,16 @@ export-e:
|
|
93 |
|
94 |
slim:
|
95 |
@echo "🗜️ Slimming ONNX models..."
|
96 |
-
@files
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
quantize:
|
101 |
@echo "⚡ Starting quantization..."
|
@@ -144,7 +152,13 @@ clean-large-files:
|
|
144 |
|
145 |
fix-gpu-buffers:
|
146 |
@echo "🔧 Fixing GPU buffers for E models..."
|
147 |
-
|
148 |
-
files
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
SHELL := /bin/bash
|
2 |
|
3 |
# Configuration variables
|
4 |
+
NATIVE_ANDROID = $(abspath ../Native-LLM-for-Android)
|
5 |
QWEN_VL_DIR = $(NATIVE_ANDROID)/Export_ONNX/QwenVL
|
6 |
ONNX_SRC_DIR = $(QWEN_VL_DIR)/onnx
|
7 |
ONNX_DEST_DIR = $(QWEN_VL_DIR)/onnx-dist
|
|
|
60 |
$(NATIVE_PYTHON) -c 'import onnx, os, sys; \
|
61 |
src = """'"$$item"'"""; \
|
62 |
total_size = os.path.getsize(src); \
|
63 |
+
d = os.path.join(dest_dir, os.path.basename(src)); \
|
64 |
total_size += os.path.getsize(src + ".data") if os.path.exists(src + ".data") else 0; \
|
65 |
needs_external = total_size > 2e9; \
|
66 |
onnx.save_model( \
|
67 |
onnx.load(src), \
|
68 |
+
d, \
|
69 |
save_as_external_data=needs_external, \
|
70 |
all_tensors_to_one_file=True, \
|
71 |
location=(os.path.basename(src) + ".data") if needs_external else None \
|
|
|
76 |
echo "✅ Done second models"
|
77 |
|
78 |
|
79 |
+
all-in-one: export quantize clean-large-files fix-gpu-buffers export-merged-source-models
|
80 |
@echo "✨ All done! ONNX models exported, slimmed, quantized and fixed"
|
81 |
|
82 |
export: export-abcd export-e
|
|
|
94 |
|
95 |
# slim: run onnxslim in place on every *.onnx file under $(ONNX_SRC_DIR),
# skipping QwenVL_E.onnx. The whole recipe is one shell invocation (lines are
# joined with backslashes) so the counter survives across loop iterations;
# the first onnxslim failure aborts the recipe with exit status 1.
slim:
	@echo "🗜️ Slimming ONNX models..."
	@onnx_files=$$(find $(ONNX_SRC_DIR) -name "*.onnx" -type f ! -name "QwenVL_E.onnx"); \
	total=$$(echo "$$onnx_files" | wc -w | tr -d ' '); \
	echo "Files found: $$total"; \
	current=0; \
	for item in $$onnx_files; do \
		current=$$((current + 1)); \
		$(call progress_bar,$$current,$$total,$$item); \
		onnxslim --verbose "$$item" "$$item" || exit 1; \
	done; \
	echo "✅ Slimming complete"
|
107 |
|
108 |
quantize:
|
109 |
@echo "⚡ Starting quantization..."
|
|
|
152 |
|
153 |
# fix-gpu-buffers: run ONNX_Tools/clamp_for_gpu_buffers.py --overwrite on every
# QwenVL_E_*.onnx file found under $(ONNX_DEST_DIR). The script is launched
# from $(NATIVE_ANDROID) with that checkout's .venv interpreter. The recipe is
# a single shell invocation; the first failing file aborts it with status 1.
fix-gpu-buffers:
	@echo "🔧 Fixing GPU buffers for E models..."
	@e_models=$$(find $(ONNX_DEST_DIR) -name "QwenVL_E_*.onnx" -type f); \
	total=$$(echo "$$e_models" | wc -w | tr -d ' '); \
	echo "Files found: $$total"; \
	current=0; \
	for item in $$e_models; do \
		current=$$((current + 1)); \
		$(call progress_bar,$$current,$$total,$$item); \
		cd $(NATIVE_ANDROID) && .venv/bin/python3 ONNX_Tools/clamp_for_gpu_buffers.py --overwrite "$$item" || exit 1; \
	done; \
	echo "✅ GPU buffer fixes complete"
|
infer.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import sys
import time
import torch
import numpy as np
import requests
import onnxruntime as ort
from PIL import Image
from io import BytesIO
from transformers import Qwen2VLConfig, AutoTokenizer

# Smoke-test script: runs the five exported QwenVL ONNX graphs (A-E) on one
# sample image and prints up to 12 generated tokens plus the elapsed time.
#
# Usage: python infer.py <model_path> <onnx_path>
#   model_path: directory/id passed to Qwen2VLConfig / AutoTokenizer
#   onnx_path:  directory holding QwenVL_{A..E}_q4f16.onnx

# Command line arguments
if len(sys.argv) < 3:
    # Fail with a usage message instead of a bare IndexError.
    sys.exit(f"Usage: {sys.argv[0]} <model_path> <onnx_path>")
model_path = sys.argv[1]
onnx_path = sys.argv[2]

# Initialize model config and tokenizer
model_config = Qwen2VLConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Model configuration
max_length = 1024
num_attention_heads = model_config.num_attention_heads
num_key_value_heads = model_config.num_key_value_heads
head_dim = model_config.hidden_size // num_attention_heads
num_layers = model_config.num_hidden_layers

# Setup ONNX sessions
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Model paths and sessions
models = ['A', 'B', 'C', 'D', 'E']
model_paths = {m: os.path.join(onnx_path, f'QwenVL_{m}_q4f16.onnx') for m in models}
sessions = {m: ort.InferenceSession(path, sess_options=session_options) for m, path in model_paths.items()}

# Input/output names
# A and C are fed a single input; B is fed its first two inputs; D and E are
# fed positionally, so the order of the value lists below must match the
# order in which each graph declares its inputs.
inputs = {
    'A': sessions['A'].get_inputs()[0].name,
    'B': [sessions['B'].get_inputs()[i].name for i in range(2)],
    'C': sessions['C'].get_inputs()[0].name,
    'D': [inp.name for inp in sessions['D'].get_inputs()],
    'E': [inp.name for inp in sessions['E'].get_inputs()]
}

outputs = {
    'A': sessions['A'].get_outputs()[0].name,
    'B': sessions['B'].get_outputs()[0].name,
    'C': sessions['C'].get_outputs()[0].name,
    'D': [out.name for out in sessions['D'].get_outputs()],
    'E': [out.name for out in sessions['E'].get_outputs()]
}

# Process image
image_url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
# Bound the download and surface HTTP errors here rather than as an opaque
# PIL decoding failure further down.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content)).resize((960, 960)).convert('RGB')
# HWC uint8 -> 1xCxHxW float32 scaled to [0, 1]
image_array = np.expand_dims(np.transpose(np.array(image).astype(np.float32), (2, 0, 1)), axis=0) / 255.

# Prepare inputs
prompt = "Describe this image."
formatted_prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{prompt}<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer(formatted_prompt, return_tensors='pt')['input_ids']
input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
# Fixed-size token buffer; only the first input_lengths[0] entries are filled.
tokens = np.zeros(max_length, dtype=np.int32)
tokens[:input_ids.shape[1]] = input_ids[0, :]
position = np.zeros(1, dtype=np.int64)

# Initialize caches
key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
value_cache = key_cache.copy()

# Process initial inputs
# Graph B maps (tokens, length) to hidden states.
hidden_states = sessions['B'].run(
    [outputs['B']],
    {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
)[0]

# Graph C is seeded with 0 and its single output is reused as `batch_size`.
# NOTE(review): exact meaning of this value is defined by the export script - confirm.
batch_size = np.array(0, dtype=np.int32)
batch_size, = sessions['C'].run([outputs['C']], {inputs['C']: batch_size})

# Process image features
image_features = sessions['A'].run([outputs['A']], {inputs['A']: image_array})[0]
total_ids = 100  # 10 * 10 from original factors
input_lengths += total_ids
# NOTE(review): total_ids is subtracted again although input_lengths already
# includes it; kept as in the original - confirm against the export script.
remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
# NOTE(review): the offset 5 presumably matches the prompt-head length - confirm.
tokens_to_stop = np.array(input_lengths[0] - 5, dtype=np.int32)

# Graph D combines the text hidden states with the image features.
hidden_states, batch_size = sessions['D'].run(
    outputs['D'],
    dict(zip(inputs['D'],
             [hidden_states, image_features, input_lengths, tokens_to_stop, remaining_tokens]))
)

# Generate tokens
start_time = time.time()
for i in range(12):  # MAX_ITERATIONS
    # Second input is -65504 (lowest finite float16) on the first step and 0
    # afterwards - presumably the attention-mask fill value; confirm.
    token, key_cache, value_cache = sessions['E'].run(
        outputs['E'],
        dict(zip(inputs['E'],
                 [hidden_states, np.array([-65504. if i==0 else 0.], dtype=np.float16),
                  key_cache, value_cache, position, input_lengths, batch_size,
                  np.array([1-total_ids+10 if i==0 else position[0]+1], dtype=np.float16)]))
    )

    if token in [151643, 151645]:  # End tokens
        break

    if i < 1:
        # After the first step, advance past the whole prompt and switch to
        # single-token decoding.
        position += input_lengths[0]
        input_lengths[0] = 1
    else:
        position += 1

    # Feed the new token back through graph B for the next step.
    tokens[0] = token
    hidden_states = sessions['B'].run(
        [outputs['B']],
        {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
    )[0]
    print(tokenizer.decode(token), end='', flush=True)

print(f"\nTotal time: {time.time() - start_time:.2f}s")
|
onnx/QwenVL_A.onnx
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9359181d8a217fd066b6201ca88d39ceef8d84464e886fa3af3634b767807967
|
3 |
+
size 22863481
|
onnx/QwenVL_A.onnx.data
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:48c2e8d0ebb88762b324860ca74abd35d4848b08f84619e71acc5122a0e46c8f
|
3 |
+
size 5322170368
|
onnx/QwenVL_B.onnx
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b752394955396a0684cb491ebf802645ad6e73a29f4f2392c6bfd77759d7d86
|
3 |
+
size 234019162
|
onnx/QwenVL_C.onnx
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09090f067d75cbfb62f90fc1f783529ede85e07006da80681fbb6f535baa29d6
|
3 |
+
size 10335
|
onnx/QwenVL_D.onnx
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f4393146a8d328f1eae43e9058f391a1ef07048d6793747dab948838fcdfd1e6
|
3 |
+
size 26762
|