Model Details

通过 optimum 将 Qwen1.5-0.5B-Chat 导出为 int8 量化的 OpenVINO 模型的过程

Requirements

pip install openvino-dev[pytorch]==2022.3.0
pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"
pip install --upgrade --upgrade-strategy eager "optimum[openvino]"
pip install --upgrade --upgrade-strategy eager "optimum[ipex]"

Export OpenVINO Model

from transformers import AutoTokenizer
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino import OVModelForCausalLM
from optimum.exporters.openvino.convert import export_tokenizer
from pathlib import Path
import os
# Export script: converts the Hugging Face checkpoint to OpenVINO IR with the
# selected weight precision, then saves the model, the Hugging Face tokenizer
# files and the exported tokenizer into ``ir_model_path``.

# Target weight precision. "int4" and "int8" select weight quantization; any
# other value (e.g. "fp16") falls through to the non-quantized export branch.
precision = "int8"

# Directory the exported model is written to. Create it (and any missing
# parents) up front; exist_ok makes re-running the script safe.
ir_model_path = Path("./qwen0.5b-ov")
ir_model_path.mkdir(parents=True, exist_ok=True)

# Extra arguments forwarded to OVWeightQuantizationConfig; only used for int4.
compression_configs = {
    "sym": False,
    "group_size": 128,
    "ratio": 0.8,
}

# Hugging Face model id of the checkpoint to export.
model_path = "Qwen/Qwen1.5-0.5B-Chat"

print("====Exporting IR=====")
if precision == "int4":
    export_kwargs = {
        "quantization_config": OVWeightQuantizationConfig(
            bits=4, **compression_configs
        ),
    }
elif precision == "int8":
    export_kwargs = {"load_in_8bit": True}
else:
    export_kwargs = {"load_in_8bit": False}

# compile=False for every precision: the model is only converted and saved
# here, so compiling it for an inference device would be wasted work.
ov_model = OVModelForCausalLM.from_pretrained(
    model_path,
    export=True,
    compile=False,
    **export_kwargs,
)
ov_model.save_pretrained(ir_model_path)

# Store the Hugging Face tokenizer files next to the IR so the output
# directory can be loaded on its own.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.save_pretrained(ir_model_path)

print("====Exporting IR tokenizer=====")
export_tokenizer(tokenizer, ir_model_path)

Usage

from optimum.intel.openvino import OVModelForCausalLM
from transformers import (AutoTokenizer, AutoConfig,
                          TextIteratorStreamer)
# Usage script: loads the exported OpenVINO model, sends a single chat prompt
# through it and prints the decoded reply.

# Directory the model and tokenizer were exported to.
model_dir = "./qwen0.5b-ov"

# Upper bound on the number of tokens generated for the reply. This must not
# be derived from the prompt's character count: a short prompt would then cut
# the reply off after a handful of tokens.
max_new_tokens = 500

ov_config = {
    "PERFORMANCE_HINT": "LATENCY",
    "NUM_STREAMS": "1",
    "CACHE_DIR": "",
}

tokenizer = AutoTokenizer.from_pretrained(model_dir)
ov_model = OVModelForCausalLM.from_pretrained(
    model_dir,
    device="cpu",
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(model_dir),
    trust_remote_code=True,
)

prompt = "今天天气如何?"
messages = [
    {"role": "user", "content": prompt},
]
# Render the conversation with the model's chat template and tokenize it.
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

# Only max_new_tokens limits the output length; max_length is deliberately not
# passed as well, since setting both is redundant. No streamer is passed
# either: generate() runs synchronously here and the full reply is decoded
# and printed once it returns.
generate_kwargs = dict(
    input_ids=model_inputs,
    max_new_tokens=max_new_tokens,
    temperature=0.1,
    do_sample=True,
    top_p=1.0,
    top_k=50,
    repetition_penalty=1.1,
    # NOTE(review): hard-coded token id, presumably the tokenizer's
    # end-of-turn token -- confirm against tokenizer.eos_token_id.
    pad_token_id=151645,
)
generated_ids = ov_model.generate(**generate_kwargs)

# generate() returns prompt + completion; drop the prompt tokens from each
# sequence so that only the newly generated reply is decoded.
completion_ids = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(model_inputs, generated_ids)
]
response = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]
print(response)
Downloads last month
18
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.