Paul Dufour committed on
Commit
e3b994d
·
1 Parent(s): 9c0577e

update non-slim versions of models

Browse files
Makefile CHANGED
@@ -1,7 +1,7 @@
1
  SHELL := /bin/bash
2
 
3
  # Configuration variables
4
- NATIVE_ANDROID = ../Native-LLM-for-Android
5
  QWEN_VL_DIR = $(NATIVE_ANDROID)/Export_ONNX/QwenVL
6
  ONNX_SRC_DIR = $(QWEN_VL_DIR)/onnx
7
  ONNX_DEST_DIR = $(QWEN_VL_DIR)/onnx-dist
@@ -60,11 +60,12 @@ export-merged-source-models-second-pass:
60
  $(NATIVE_PYTHON) -c 'import onnx, os, sys; \
61
  src = """'"$$item"'"""; \
62
  total_size = os.path.getsize(src); \
 
63
  total_size += os.path.getsize(src + ".data") if os.path.exists(src + ".data") else 0; \
64
  needs_external = total_size > 2e9; \
65
  onnx.save_model( \
66
  onnx.load(src), \
67
- src, \
68
  save_as_external_data=needs_external, \
69
  all_tensors_to_one_file=True, \
70
  location=(os.path.basename(src) + ".data") if needs_external else None \
@@ -75,7 +76,7 @@ export-merged-source-models-second-pass:
75
  echo "✅ Done second models"
76
 
77
 
78
- all-in-one: export quantize clean-large-files slim fix-gpu-buffers
79
  @echo "✨ All done! ONNX models exported, slimmed, quantized and fixed"
80
 
81
  export: export-abcd export-e
@@ -93,9 +94,16 @@ export-e:
93
 
94
  slim:
95
  @echo "🗜️ Slimming ONNX models..."
96
- @files=$$(find $(ONNX_SRC_DIR) -name "*.onnx" -type f ! -name "QwenVL_E.onnx"); \
97
- $(call progress_bar,$$files,onnxslim --verbose {} {})
98
- @echo " Slimming complete"
 
 
 
 
 
 
 
99
 
100
  quantize:
101
  @echo "⚡ Starting quantization..."
@@ -144,7 +152,13 @@ clean-large-files:
144
 
145
  fix-gpu-buffers:
146
  @echo "🔧 Fixing GPU buffers for E models..."
147
- cd $(NATIVE_ANDROID) && \
148
- files=$$(find $(ONNX_DEST_DIR) -name "QwenVL_E_*.onnx" -type f); \
149
- $(call progress_bar,$$files, .venv/bin/python3 ONNX_Tools/clamp_for_gpu_buffers.py --overwrite {})
150
- @echo "✅ GPU buffer fixes complete"
 
 
 
 
 
 
 
1
  SHELL := /bin/bash
2
 
3
  # Configuration variables
4
+ NATIVE_ANDROID = $(abspath ../Native-LLM-for-Android)
5
  QWEN_VL_DIR = $(NATIVE_ANDROID)/Export_ONNX/QwenVL
6
  ONNX_SRC_DIR = $(QWEN_VL_DIR)/onnx
7
  ONNX_DEST_DIR = $(QWEN_VL_DIR)/onnx-dist
 
60
  $(NATIVE_PYTHON) -c 'import onnx, os, sys; \
61
  src = """'"$$item"'"""; \
62
  total_size = os.path.getsize(src); \
63
+ d = os.path.join(dest_dir, os.path.basename(src)); \
64
  total_size += os.path.getsize(src + ".data") if os.path.exists(src + ".data") else 0; \
65
  needs_external = total_size > 2e9; \
66
  onnx.save_model( \
67
  onnx.load(src), \
68
+ d, \
69
  save_as_external_data=needs_external, \
70
  all_tensors_to_one_file=True, \
71
  location=(os.path.basename(src) + ".data") if needs_external else None \
 
76
  echo "✅ Done second models"
77
 
78
 
79
+ all-in-one: export quantize clean-large-files fix-gpu-buffers export-merged-source-models
80
  @echo "✨ All done! ONNX models exported, slimmed, quantized and fixed"
81
 
82
  export: export-abcd export-e
 
94
 
95
  slim:
96
  @echo "🗜️ Slimming ONNX models..."
97
+ @files=`find $(ONNX_SRC_DIR) -name "*.onnx" -type f ! -name "QwenVL_E.onnx"`; \
98
+ total=`echo "$$files" | wc -w | tr -d ' '`; \
99
+ echo "Files found: $$total"; \
100
+ current=0; \
101
+ for item in $$files; do \
102
+ current=$$((current + 1)); \
103
+ $(call progress_bar,$$current,$$total,$$item); \
104
+ onnxslim --verbose "$$item" "$$item" || exit 1; \
105
+ done; \
106
+ echo "✅ Slimming complete"
107
 
108
  quantize:
109
  @echo "⚡ Starting quantization..."
 
152
 
153
  fix-gpu-buffers:
154
  @echo "🔧 Fixing GPU buffers for E models..."
155
+ @files=`find $(ONNX_DEST_DIR) -name "QwenVL_E_*.onnx" -type f`; \
156
+ total=`echo "$$files" | wc -w | tr -d ' '`; \
157
+ echo "Files found: $$total"; \
158
+ current=0; \
159
+ for item in $$files; do \
160
+ current=$$((current + 1)); \
161
+ $(call progress_bar,$$current,$$total,$$item); \
162
+ cd $(NATIVE_ANDROID) && .venv/bin/python3 ONNX_Tools/clamp_for_gpu_buffers.py --overwrite "$$item" || exit 1; \
163
+ done; \
164
+ echo "✅ GPU buffer fixes complete"
infer.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ import torch
5
+ import numpy as np
6
+ import requests
7
+ import onnxruntime as ort
8
+ from PIL import Image
9
+ from io import BytesIO
10
+ from transformers import Qwen2VLConfig, AutoTokenizer
11
+
12
+ # Command line arguments
13
+ model_path = sys.argv[1]
14
+ onnx_path = sys.argv[2]
15
+
16
+ # Initialize model config and tokenizer
17
+ model_config = Qwen2VLConfig.from_pretrained(model_path)
18
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
19
+
20
+ # Model configuration
21
+ max_length = 1024
22
+ num_attention_heads = model_config.num_attention_heads
23
+ num_key_value_heads = model_config.num_key_value_heads
24
+ head_dim = model_config.hidden_size // num_attention_heads
25
+ num_layers = model_config.num_hidden_layers
26
+
27
+ # Setup ONNX sessions
28
+ session_options = ort.SessionOptions()
29
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
30
+
31
+ # Model paths and sessions
32
+ models = ['A', 'B', 'C', 'D', 'E']
33
+ model_paths = {m: os.path.join(onnx_path, f'QwenVL_{m}_q4f16.onnx') for m in models}
34
+ sessions = {m: ort.InferenceSession(path, sess_options=session_options) for m, path in model_paths.items()}
35
+
36
+ # Input/output names
37
+ inputs = {
38
+ 'A': sessions['A'].get_inputs()[0].name,
39
+ 'B': [sessions['B'].get_inputs()[i].name for i in range(2)],
40
+ 'C': sessions['C'].get_inputs()[0].name,
41
+ 'D': [inp.name for inp in sessions['D'].get_inputs()],
42
+ 'E': [inp.name for inp in sessions['E'].get_inputs()]
43
+ }
44
+
45
+ outputs = {
46
+ 'A': sessions['A'].get_outputs()[0].name,
47
+ 'B': sessions['B'].get_outputs()[0].name,
48
+ 'C': sessions['C'].get_outputs()[0].name,
49
+ 'D': [out.name for out in sessions['D'].get_outputs()],
50
+ 'E': [out.name for out in sessions['E'].get_outputs()]
51
+ }
52
+
53
+ # Process image
54
+ image_url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
55
+ image = Image.open(BytesIO(requests.get(image_url).content)).resize((960, 960)).convert('RGB')
56
+ image_array = np.expand_dims(np.transpose(np.array(image).astype(np.float32), (2, 0, 1)), axis=0) / 255.
57
+
58
+ # Prepare inputs
59
+ prompt = "Describe this image."
60
+ formatted_prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{prompt}<|im_end|>\n<|im_start|>assistant\n"
61
+ input_ids = tokenizer(formatted_prompt, return_tensors='pt')['input_ids']
62
+ input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
63
+ tokens = np.zeros(max_length, dtype=np.int32)
64
+ tokens[:input_ids.shape[1]] = input_ids[0, :]
65
+ position = np.zeros(1, dtype=np.int64)
66
+
67
+ # Initialize caches
68
+ key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
69
+ value_cache = key_cache.copy()
70
+
71
+ # Process initial inputs
72
+ hidden_states = sessions['B'].run(
73
+ [outputs['B']],
74
+ {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
75
+ )[0]
76
+
77
+ batch_size = np.array(0, dtype=np.int32)
78
+ batch_size, = sessions['C'].run([outputs['C']], {inputs['C']: batch_size})
79
+
80
+ # Process image features
81
+ image_features = sessions['A'].run([outputs['A']], {inputs['A']: image_array})[0]
82
+ total_ids = 100 # 10 * 10 from original factors
83
+ input_lengths += total_ids
84
+ remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
85
+ tokens_to_stop = np.array(input_lengths[0] - 5, dtype=np.int32)
86
+
87
+ hidden_states, batch_size = sessions['D'].run(
88
+ outputs['D'],
89
+ dict(zip(inputs['D'],
90
+ [hidden_states, image_features, input_lengths, tokens_to_stop, remaining_tokens]))
91
+ )
92
+
93
+ # Generate tokens
94
+ start_time = time.time()
95
+ for i in range(12): # MAX_ITERATIONS
96
+ token, key_cache, value_cache = sessions['E'].run(
97
+ outputs['E'],
98
+ dict(zip(inputs['E'],
99
+ [hidden_states, np.array([-65504. if i==0 else 0.], dtype=np.float16),
100
+ key_cache, value_cache, position, input_lengths, batch_size,
101
+ np.array([1-total_ids+10 if i==0 else position[0]+1], dtype=np.float16)]))
102
+ )
103
+
104
+ if token in [151643, 151645]: # End tokens
105
+ break
106
+
107
+ if i < 1:
108
+ position += input_lengths[0]
109
+ input_lengths[0] = 1
110
+ else:
111
+ position += 1
112
+
113
+ tokens[0] = token
114
+ hidden_states = sessions['B'].run(
115
+ [outputs['B']],
116
+ {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
117
+ )[0]
118
+ print(tokenizer.decode(token), end='', flush=True)
119
+
120
+ print(f"\nTotal time: {time.time() - start_time:.2f}s")
onnx/QwenVL_A.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7668776b6d8a7dbbd5344de5948f9e7040cce04ac4fafff9155204dd2e0ef561
3
- size 341395
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9359181d8a217fd066b6201ca88d39ceef8d84464e886fa3af3634b767807967
3
+ size 22863481
onnx/QwenVL_A.onnx.data CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1bdde323eb76c15f6eab14966d5b802c51a8d9559d5260ad3cf9e868ef160bf
3
- size 5322682368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48c2e8d0ebb88762b324860ca74abd35d4848b08f84619e71acc5122a0e46c8f
3
+ size 5322170368
onnx/QwenVL_B.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b2a741d6586465346e5c552c1d375da0b8321dd76a4d5498c0dd267ccd523b6
3
- size 233983352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b752394955396a0684cb491ebf802645ad6e73a29f4f2392c6bfd77759d7d86
3
+ size 234019162
onnx/QwenVL_C.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a239bb5f47b6589f4db8d9a3b57ada13cabee3508851769d473f3bd2338da732
3
- size 6384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09090f067d75cbfb62f90fc1f783529ede85e07006da80681fbb6f535baa29d6
3
+ size 10335
onnx/QwenVL_D.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d70b7429fc137486f82683d68953dd8a60d72466071fd22104bf5ff77e4460e
3
- size 25215
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4393146a8d328f1eae43e9058f391a1ef07048d6793747dab948838fcdfd1e6
3
+ size 26762