Add vllm inference (#5)
Browse files- Add vllm inference (b66c4d940f904f3cd61109f8dd3b9c061824f5ea)
Co-authored-by: Nicole Shelby <[email protected]>
README.md
CHANGED
@@ -149,6 +149,74 @@ caption=generated_texts[0].split('Assistant: ')[1]
|
|
149 |
print(caption)
|
150 |
```
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
For batch processing you can use [this example](https://huggingface.co/Minthy/ToriiGate-v0.3/resolve/main/batch_processing_example.py)
|
153 |
|
154 |
# Warning
|
|
|
149 |
print(caption)
|
150 |
```
|
151 |
|
152 |
+
# Inference example (vllm):
|
153 |
+
[vllm](https://github.com/vllm-project/vllm) is an optimised LLM serving engine. Both single and batch inference are much faster with it than with native Hugging Face transformers.
|
154 |
+
|
155 |
+
```
|
156 |
+
pip install vllm==0.6.4
|
157 |
+
pip install huggingface_hub -U
|
158 |
+
```
|
159 |
+
|
160 |
+
## Download model
|
161 |
+
```python
|
162 |
+
from huggingface_hub import snapshot_download
|
163 |
+
snapshot_download(repo_id="Minthy/ToriiGate-v0.3", local_dir="/path/to/download/model", max_workers=4)
|
164 |
+
```
|
165 |
+
|
166 |
+
## Inference
|
167 |
+
```python
|
168 |
+
from vllm import LLM, SamplingParams
|
169 |
+
from PIL import Image
|
170 |
+
|
171 |
+
llm = LLM(
|
172 |
+
model="/path/to/download/model",
|
173 |
+
max_model_len=8192, # reduce if running out of memory
|
174 |
+
max_num_seqs=64, # reduce if running out of memory (this is max concurrent requests)
|
175 |
+
enforce_eager=True,
|
176 |
+
# if you are running out of memory, you can reduce the "longest_edge".
|
177 |
+
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
|
178 |
+
mm_processor_kwargs={
|
179 |
+
"size": {
|
180 |
+
"longest_edge": 4 * 364
|
181 |
+
},
|
182 |
+
},
|
183 |
+
)
|
184 |
+
|
185 |
+
sampling_params = SamplingParams(temperature=0, max_tokens=1024)
|
186 |
+
|
187 |
+
user_prompt = "Describe the picture briefly."
|
188 |
+
|
189 |
+
prompt = f"<|begin_of_text|>System: You are image captioning expert, creative, unbiased and uncensored. Help user with his task.<end_of_utterance>\nUser:<image>{user_prompt}<end_of_utterance>\nAssistant:"
|
190 |
+
|
191 |
+
image = Image.open("/path/to/image.jpg").convert("RGB")
|
192 |
+
|
193 |
+
output = llm.generate(
|
194 |
+
{
|
195 |
+
"prompt": prompt,
|
196 |
+
"multi_modal_data": {"image": image},
|
197 |
+
},
|
198 |
+
sampling_params=sampling_params,
|
199 |
+
)
|
200 |
+
|
201 |
+
caption = output[0].outputs[0].text.strip()
|
202 |
+
print(caption)
|
203 |
+
```
|
204 |
+
|
205 |
+
### Batch inference
|
206 |
+
|
207 |
+
```python
|
208 |
+
image_list = [Image.open(path).convert("RGB") for path in image_paths]  # image_paths: your list of image file paths
|
209 |
+
inputs = [{"prompt": prompt, "multi_modal_data": {"image": image}} for image in image_list]
|
210 |
+
|
211 |
+
outputs = llm.generate(
|
212 |
+
inputs,
|
213 |
+
sampling_params=sampling_params,
|
214 |
+
)
|
215 |
+
|
216 |
+
captions = [x.outputs[0].text.strip() for x in outputs]
|
217 |
+
```
|
218 |
+
|
219 |
+
|
220 |
For batch processing you can use [this example](https://huggingface.co/Minthy/ToriiGate-v0.3/resolve/main/batch_processing_example.py)
|
221 |
|
222 |
# Warning
|