Commit e56debf (verified) by Anthonyfhd · Parent(s): e853924

Update README.md

Files changed (1): README.md (+108 −3)

README.md (after this commit):
---
license: apache-2.0
---
## Model Details
An int8-quantized version of Qwen1.5-0.5B-Chat, produced with optimum (OpenVINO). The steps below document the quantization process.

## Requirements
```bash
pip install "openvino-dev[pytorch]==2022.3.0"
pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"
pip install --upgrade --upgrade-strategy eager "optimum[openvino]"
pip install --upgrade --upgrade-strategy eager "optimum[ipex]"
```
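Before exporting, it may be worth confirming the stack imports cleanly. A minimal smoke test, assuming only the packages installed above (`get_version` is part of the OpenVINO runtime API):

```python
# Smoke test: the OpenVINO runtime and the optimum-intel integration should both import.
from openvino.runtime import get_version
from optimum.intel.openvino import OVModelForCausalLM

print("OpenVINO runtime:", get_version())
print("Causal LM wrapper:", OVModelForCausalLM.__name__)
```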
## Export OpenVINO Model
```python
from transformers import AutoTokenizer
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino import OVModelForCausalLM
from optimum.exporters.openvino.convert import export_tokenizer
from pathlib import Path
import os

# Target precision: fp16, int8, or int4
precision = "int8"
# Directory for the exported IR model
ir_model_path = Path("./qwen0.5b-ov")
if not ir_model_path.exists():
    os.mkdir(ir_model_path)

# Weight-compression settings used for the int4 export
compression_configs = {
    "sym": False,
    "group_size": 128,
    "ratio": 0.8,
}

# Source model to load and convert
model_path = "Qwen/Qwen1.5-0.5B-Chat"

print("====Exporting IR=====")
if precision == "int4":
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path, export=True, compile=False,
        quantization_config=OVWeightQuantizationConfig(bits=4, **compression_configs))
elif precision == "int8":
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path, export=True, compile=True, load_in_8bit=True)
else:
    ov_model = OVModelForCausalLM.from_pretrained(
        model_path, export=True, compile=False, load_in_8bit=False)

ov_model.save_pretrained(ir_model_path)

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.save_pretrained(ir_model_path)

print("====Exporting IR tokenizer=====")
export_tokenizer(tokenizer, ir_model_path)
```
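Once the export completes, the target directory should hold the OpenVINO IR plus the tokenizer files. A quick way to check (a sketch; the exact file list depends on the installed optimum/openvino versions, but `openvino_model.xml`/`openvino_model.bin` are the expected core artifacts):

```python
from pathlib import Path

# List everything the export wrote; expect openvino_model.xml/.bin and tokenizer files.
for f in sorted(Path("./qwen0.5b-ov").iterdir()):
    print(f.name)
```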
## Usage
```python
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer

# Directory of the exported IR model
model_dir = "./qwen0.5b-ov"
ov_config = {"PERFORMANCE_HINT": "LATENCY",
             "NUM_STREAMS": "1", "CACHE_DIR": ""}
tokenizer = AutoTokenizer.from_pretrained(model_dir)
ov_model = OVModelForCausalLM.from_pretrained(
    model_dir,
    device="cpu",
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(model_dir),
    trust_remote_code=True,
)
streamer = TextIteratorStreamer(
    tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
)
prompt = "今天天气如何?"  # "How is the weather today?"
messages = [
    {"role": "user", "content": prompt}
]
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
)
generate_kwargs = dict(
    input_ids=model_inputs,
    max_new_tokens=500,  # upper bound on generated tokens
    temperature=0.1,
    do_sample=True,
    top_p=1.0,
    top_k=50,
    repetition_penalty=1.1,
    streamer=streamer,
    pad_token_id=151645,  # <|im_end|> in the Qwen1.5 vocabulary
)
generated_ids = ov_model.generate(**generate_kwargs)
# Strip the prompt tokens from the output before decoding
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
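Note that the `TextIteratorStreamer` above only collects tokens; nothing reads from it, and the reply is taken from `generated_ids` instead. To actually stream, generation has to run on a worker thread while the main thread drains the streamer. A minimal sketch reusing the names defined above (in practice, create a fresh `TextIteratorStreamer` for each `generate` call):

```python
from threading import Thread

# Generate in the background; the streamer yields decoded text chunks as they arrive.
thread = Thread(target=ov_model.generate, kwargs=generate_kwargs)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()
```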