---
license: apache-2.0
---

## Model Details

The process of creating an int8-quantized version of Qwen1.5-0.5B-Chat with Optimum (optimum-intel): the sections below cover exporting the model to OpenVINO IR and running inference with it.

## Requirements

```bash
pip install "openvino-dev[pytorch]==2022.3.0"
pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"
pip install --upgrade --upgrade-strategy eager "optimum[openvino]"
pip install --upgrade --upgrade-strategy eager "optimum[ipex]"
```
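
Before exporting, it can help to confirm the toolchain imports cleanly. A minimal sanity check (assuming the packages above installed without conflicts):

```python
# Report the installed OpenVINO runtime and optimum-intel versions
from importlib.metadata import version
from openvino.runtime import get_version

print("OpenVINO runtime:", get_version())
print("optimum-intel:", version("optimum-intel"))
```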

## Export OpenVINO Model

```python
from pathlib import Path

from transformers import AutoTokenizer
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino import OVModelForCausalLM
from optimum.exporters.openvino.convert import export_tokenizer

# Target precision: "fp16", "int8", or "int4"
precision = "int8"

# Output directory for the exported IR model
ir_model_path = Path("./qwen0.5b-ov")
ir_model_path.mkdir(parents=True, exist_ok=True)

# int4 weight-compression settings: asymmetric quantization in groups of 128,
# with 80% of the weights compressed to 4 bits (the rest stay at 8 bits)
compression_configs = {
    "sym": False,
    "group_size": 128,
    "ratio": 0.8,
}

# Source model to convert
model_path = "Qwen/Qwen1.5-0.5B-Chat"

print("====Exporting IR=====")
if precision == "int4":
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path,
        export=True,
        compile=False,
        quantization_config=OVWeightQuantizationConfig(bits=4, **compression_configs),
    )
elif precision == "int8":
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path,
        export=True,
        compile=False,
        load_in_8bit=True,
    )
else:
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path,
        export=True,
        compile=False,
        load_in_8bit=False,
    )

ov_model.save_pretrained(ir_model_path)

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.save_pretrained(ir_model_path)

print("====Exporting IR tokenizer=====")
export_tokenizer(tokenizer, ir_model_path)
```
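
Once the export completes, the output directory should contain the IR graph and weights (openvino_model.xml / openvino_model.bin in optimum-intel's usual layout) alongside the tokenizer files. A quick way to inspect what was written:

```python
# List everything the export step wrote to the output directory
from pathlib import Path

for f in sorted(Path("./qwen0.5b-ov").iterdir()):
    print(f.name, f.stat().st_size, "bytes")
```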

## Usage

```python
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer

# Path to the exported model
model_dir = "./qwen0.5b-ov"

# OpenVINO runtime hints: optimize for latency with a single inference
# stream and disable model caching
ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

tokenizer = AutoTokenizer.from_pretrained(model_dir)
ov_model = OVModelForCausalLM.from_pretrained(
    model_dir,
    device="cpu",
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(model_dir),
    trust_remote_code=True,
)

streamer = TextIteratorStreamer(
    tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
)

prompt = "How is the weather today?"
messages = [
    {"role": "user", "content": prompt},
]
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

generate_kwargs = dict(
    input_ids=model_inputs,
    max_new_tokens=500,
    temperature=0.1,
    do_sample=True,
    top_p=1.0,
    top_k=50,
    repetition_penalty=1.1,
    streamer=streamer,
    pad_token_id=151645,  # <|im_end|> for the Qwen tokenizer
)

generated_ids = ov_model.generate(**generate_kwargs)
# Strip the prompt tokens so only the newly generated answer is decoded
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
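
As written, `generate()` runs synchronously and the streamer's queue is never read, so the answer only appears via the final `print(response)`. To stream tokens as they are produced, call `generate()` from a background thread and iterate over the streamer instead (a minimal sketch reusing `ov_model`, `streamer`, and `generate_kwargs` from above):

```python
from threading import Thread

# Run generation in the background so the main thread can consume the stream
thread = Thread(target=ov_model.generate, kwargs=generate_kwargs)
thread.start()

# TextIteratorStreamer yields decoded text chunks as they arrive
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()
```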