---
license: apache-2.0
---

## Model Details

An int8-quantized version of Qwen1.5-0.5B-Chat, produced with Optimum Intel and OpenVINO. The sections below document the export process and show how to run the exported model.

## Requirements

```bash
pip install "openvino-dev[pytorch]==2022.3.0"
pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"
pip install --upgrade --upgrade-strategy eager "optimum[openvino]"
pip install --upgrade --upgrade-strategy eager "optimum[ipex]"
```

## Export OpenVINO Model

```python
from pathlib import Path

from transformers import AutoTokenizer
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino import OVModelForCausalLM
from optimum.exporters.openvino.convert import export_tokenizer

# choose one of: "fp16", "int8", "int4"
precision = "int8"

# output directory for the exported IR model
ir_model_path = Path("./qwen0.5b-ov")
ir_model_path.mkdir(exist_ok=True)

# int4 weight-compression settings: asymmetric, group-wise quantization,
# with 80% of the weights compressed to 4 bits and the rest kept at 8 bits
compression_configs = {
    "sym": False,
    "group_size": 128,
    "ratio": 0.8,
}

# source model
model_path = "Qwen/Qwen1.5-0.5B-Chat"

print("====Exporting IR=====")
if precision == "int4":
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path,
        export=True,
        compile=False,
        quantization_config=OVWeightQuantizationConfig(bits=4, **compression_configs),
    )
elif precision == "int8":
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path, export=True, compile=False, load_in_8bit=True
    )
else:
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path, export=True, compile=False, load_in_8bit=False
    )
ov_model.save_pretrained(ir_model_path)

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.save_pretrained(ir_model_path)

print("====Exporting IR tokenizer=====")
export_tokenizer(tokenizer, ir_model_path)
```

## Usage

```python
from transformers import AutoConfig, AutoTokenizer, TextStreamer
from optimum.intel.openvino import OVModelForCausalLM

# directory of the exported model
model_dir = "./qwen0.5b-ov"

ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

tokenizer = AutoTokenizer.from_pretrained(model_dir)
ov_model = OVModelForCausalLM.from_pretrained(
    model_dir,
    device="cpu",
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(model_dir),
    trust_remote_code=True,
)

# TextStreamer prints tokens to stdout as they are generated;
# skip_prompt avoids echoing the input back
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

prompt = "今天天气如何?"
messages = [
    {"role": "user", "content": prompt}
]
model_inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
)

generate_kwargs = dict(
    input_ids=model_inputs,
    max_new_tokens=500,
    temperature=0.1,
    do_sample=True,
    top_p=1.0,
    top_k=50,
    repetition_penalty=1.1,
    streamer=streamer,
    pad_token_id=151645,  # <|im_end|> for Qwen1.5
)

generated_ids = ov_model.generate(**generate_kwargs)
# strip the prompt tokens from the output before decoding
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
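
## Streaming Output

If you want to consume the generated text incrementally (e.g. in a web UI) rather than print it to stdout, `TextIteratorStreamer` can be used instead of `TextStreamer`: `generate()` runs on a background thread while the main thread iterates over decoded text chunks. A minimal sketch, reusing the `tokenizer`, `ov_model`, and `model_inputs` from the Usage example above:

```python
from threading import Thread

from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(
    tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
)
generate_kwargs = dict(
    input_ids=model_inputs, max_new_tokens=500, streamer=streamer
)

# generate() blocks until completion, so run it on a worker thread
# and consume decoded text chunks from the streamer on the main thread
thread = Thread(target=ov_model.generate, kwargs=generate_kwargs)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()
```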
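
## Comparing with the Original Model

To eyeball the accuracy impact of int8 weight compression, you can compare greedy completions from the original checkpoint and the exported model. This is a minimal sketch, not part of the original workflow; it assumes the export example above has already been run:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM

model_path = "Qwen/Qwen1.5-0.5B-Chat"
model_dir = "./qwen0.5b-ov"

tokenizer = AutoTokenizer.from_pretrained(model_path)
messages = [{"role": "user", "content": "今天天气如何?"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

# greedy decoding so both models are deterministic and comparable
for model in (
    AutoModelForCausalLM.from_pretrained(model_path),
    OVModelForCausalLM.from_pretrained(model_dir),
):
    out = model.generate(inputs, max_new_tokens=64, do_sample=False)
    print(tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True))
```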