Paul Dufour committed
Commit 9c0577e · 2 Parent(s): b86a532 8407b6b

Merge branch 'main' of hf.co:pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16

Files changed (2):
  1. EXPORT.md +57 -0
  2. README.md +142 -1
EXPORT.md ADDED
@@ -0,0 +1,57 @@
# Export

The original model was exported using the following process.

The following repos were used:
* https://github.com/pdufour/Native-LLM-for-Android
* https://github.com/pdufour/transformers.js/tree/add-block-list

If you clone this repo and the above two repos into the same directory, you can run the following commands.

**From `Qwen2-VL-2B-Instruct-ONNX-Q4-F16`, run:**

`make all-in-one`

This will create an export of the ONNX models.
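If the export succeeds, all five parts should be present in the output directory. As a quick sanity check (a minimal sketch; the `onnx/` output directory and the `QwenVL_<part>_q4f16.onnx` naming are taken from the inference script in the README and may differ in your setup):

```
# Sanity check: confirm the five exported parts exist (naming taken from the
# README inference script; the output directory is an assumption).
import os

onnx_dir = "onnx"  # adjust to your actual export destination
for part in "ABCDE":
    path = os.path.join(onnx_dir, f"QwenVL_{part}_q4f16.onnx")
    print(f"{path}: {'found' if os.path.exists(path) else 'missing'}")
```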
**The following is a list of all available commands:**

**all-in-one**

Runs all steps (exporting, slimming, quantizing, cleaning, fixing GPU buffers) to produce fully prepared ONNX models.

**export**

Combines export-abcd and export-e to generate ONNX models for all parts.

**export-abcd**

Exports model parts A, B, C, and D by running QwenVL_Export_ABCD.py.

**export-e**

Exports model part E by running QwenVL_Export_E.py.

**slim**

Reduces ONNX model size by removing unnecessary elements for optimized deployment.

**quantize**

Quantizes all model parts (A, B, C, D, and E) to optimize size and performance.

**quantize-%**

Quantizes a specific model part (% can be A, B, C, D, or E) with targeted configurations.

**clean-large-files**

Deletes ONNX files larger than 2 GB from the destination directory so that only models usable in ONNX runtime environments are kept (see the sketch after this list).

**fix-gpu-buffers**

Applies fixes to GPU buffers in the ONNX files for part E to ensure GPU memory compatibility.

**all**

Alias for all-in-one; runs the full ONNX model preparation pipeline.
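For reference, the clean-large-files step amounts to deleting any ONNX file over the 2 GB limit. A minimal sketch of that behaviour (the destination directory name is an assumption, and this is not the Makefile target itself):

```
# Sketch of the clean-large-files behaviour: drop ONNX files larger than 2 GB
# so only models loadable by typical ONNX runtimes remain.
import os

dest_dir = "onnx"  # hypothetical destination directory
size_limit = 2 * 1024 ** 3  # 2 GB

for name in os.listdir(dest_dir):
    path = os.path.join(dest_dir, name)
    if name.endswith(".onnx") and os.path.getsize(path) > size_limit:
        os.remove(path)
        print(f"Removed {path}")
```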
README.md CHANGED
@@ -2,4 +2,145 @@
license: apache-2.0
base_model:
- Qwen/Qwen2-VL-2B-Instruct
---
# Requirements

This model is compatible with any ONNX runtime.
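For example, a minimal load check with the Python `onnxruntime` package (the file path below assumes the `QwenVL_A_q4f16.onnx` naming used by the inference script further down):

```
# Minimal compatibility check: load one exported part and list its I/O names.
# The path is an assumption based on the naming in the script below.
import onnxruntime as ort

session = ort.InferenceSession("onnx/QwenVL_A_q4f16.onnx")
print([inp.name for inp in session.get_inputs()])
print([out.name for out in session.get_outputs()])
```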
# Running this model

**Javascript**

See https://huggingface.co/spaces/pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16 for a demo.

**Python**

Download the following script as `./infer.py`, then run it like so:

`python3 infer.py Qwen/Qwen2-VL-2B-Instruct 'path-to/Qwen2-VL-2B-Instruct-onnx/onnx'`
```
import os
import sys
import time
import torch
import numpy as np
import requests
import onnxruntime as ort
from PIL import Image
from io import BytesIO
from transformers import Qwen2VLConfig, AutoTokenizer

# Command line arguments
model_path = sys.argv[1]
onnx_path = sys.argv[2]

# Initialize model config and tokenizer
model_config = Qwen2VLConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Model configuration
max_length = 1024
num_attention_heads = model_config.num_attention_heads
num_key_value_heads = model_config.num_key_value_heads
head_dim = model_config.hidden_size // num_attention_heads
num_layers = model_config.num_hidden_layers

# Setup ONNX sessions
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Model paths and sessions: one session per exported part (A-E)
models = ['A', 'B', 'C', 'D', 'E']
model_paths = {m: os.path.join(onnx_path, f'QwenVL_{m}_q4f16.onnx') for m in models}
sessions = {m: ort.InferenceSession(path, sess_options=session_options) for m, path in model_paths.items()}

# Input/output names
inputs = {
    'A': sessions['A'].get_inputs()[0].name,
    'B': [sessions['B'].get_inputs()[i].name for i in range(2)],
    'C': sessions['C'].get_inputs()[0].name,
    'D': [inp.name for inp in sessions['D'].get_inputs()],
    'E': [inp.name for inp in sessions['E'].get_inputs()],
}

outputs = {
    'A': sessions['A'].get_outputs()[0].name,
    'B': sessions['B'].get_outputs()[0].name,
    'C': sessions['C'].get_outputs()[0].name,
    'D': [out.name for out in sessions['D'].get_outputs()],
    'E': [out.name for out in sessions['E'].get_outputs()],
}

# Process image: download, resize to 960x960, convert to a normalized NCHW float array
image_url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
image = Image.open(BytesIO(requests.get(image_url).content)).resize((960, 960)).convert('RGB')
image_array = np.expand_dims(np.transpose(np.array(image).astype(np.float32), (2, 0, 1)), axis=0) / 255.

# Prepare inputs
prompt = "Describe this image."
formatted_prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{prompt}<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer(formatted_prompt, return_tensors='pt')['input_ids']
input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
tokens = np.zeros(max_length, dtype=np.int32)
tokens[:input_ids.shape[1]] = input_ids[0, :]
position = np.zeros(1, dtype=np.int64)

# Initialize key/value caches
key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
value_cache = key_cache.copy()

# Process initial inputs: part B embeds the prompt tokens
hidden_states = sessions['B'].run(
    [outputs['B']],
    {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
)[0]

batch_size = np.array(0, dtype=np.int32)
batch_size, = sessions['C'].run([outputs['C']], {inputs['C']: batch_size})

# Process image features with part A, then merge them with the text via part D
image_features = sessions['A'].run([outputs['A']], {inputs['A']: image_array})[0]
total_ids = 100  # 10 * 10 from original factors
input_lengths += total_ids
remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
tokens_to_stop = np.array(input_lengths[0] - 5, dtype=np.int32)

hidden_states, batch_size = sessions['D'].run(
    outputs['D'],
    dict(zip(inputs['D'],
             [hidden_states, image_features, input_lengths, tokens_to_stop, remaining_tokens]))
)

# Generate tokens: part E runs one decode step per iteration and updates the caches
start_time = time.time()
for i in range(12):  # MAX_ITERATIONS
    token, key_cache, value_cache = sessions['E'].run(
        outputs['E'],
        dict(zip(inputs['E'],
                 [hidden_states, np.array([-65504. if i == 0 else 0.], dtype=np.float16),
                  key_cache, value_cache, position, input_lengths, batch_size,
                  np.array([1 - total_ids + 10 if i == 0 else position[0] + 1], dtype=np.float16)]))
    )

    if token in [151643, 151645]:  # End tokens
        break

    if i < 1:
        # After the prefill step, switch to single-token decoding
        position += input_lengths[0]
        input_lengths[0] = 1
    else:
        position += 1

    # Feed the new token back through part B to get its hidden state
    tokens[0] = token
    hidden_states = sessions['B'].run(
        [outputs['B']],
        {inputs['B'][0]: tokens, inputs['B'][1]: input_lengths}
    )[0]
    print(tokenizer.decode(token), end='', flush=True)

print(f"\nTotal time: {time.time() - start_time:.2f}s")
```

# Technical Information
- [EXPORT.md](EXPORT.md)