update readme
README.md

It is further fine-tuned on OASST1 and Dolly2 to enhance chatting ability.

# Quick Start

Please note that the model requires `transformers` version >= 4.25.1.
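
The examples below enforce this with a plain string comparison on `transformers.__version__`. String comparison is lexicographic and can misorder versions (e.g. `'4.9.0' >= '4.25.1'` is true as strings even though 4.9.0 is the older release), so as a hedged alternative, here is a minimal sketch using the `packaging` library (an assumption: `packaging` is installed, as it ships with `pip`):

```python
import transformers
from packaging import version  # assumption: packaging is available in your environment

MIN_TRANSFORMERS_VERSION = '4.25.1'

# semantic version comparison instead of lexicographic string comparison
assert version.parse(transformers.__version__) >= version.parse(MIN_TRANSFORMERS_VERSION), \
    f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'
```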

To prompt the chat model, use the following format:
```
<human>: [Instruction]
<bot>:
```
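
For multi-turn conversations, the same format is simply repeated, with the model completing the final `<bot>:` turn. A minimal sketch (the `build_prompt` helper is illustrative, not part of the model's API):

```python
# illustrative helper: assemble a multi-turn prompt in the <human>/<bot> format
def build_prompt(turns):
    """turns: list of (human_message, bot_reply) pairs; use None for the
    reply the model should generate."""
    lines = []
    for human, bot in turns:
        lines.append(f"<human>: {human}")
        lines.append("<bot>:" if bot is None else f"<bot>: {bot}")
    return "\n".join(lines)

prompt = build_prompt([
    ("Who is Alan Turing?", "Alan Turing was an English computer scientist."),
    ("When was he born?", None),  # the model completes this turn
])
```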

## GPU Inference

This requires a GPU with 8GB memory.
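
As a quick sanity check before loading the fp16 weights, you can query the device's total memory; a minimal sketch (assumes a CUDA build of PyTorch):

```python
import torch

# report total memory of GPU 0 before attempting to load the model
assert torch.cuda.is_available(), 'This example requires a CUDA-capable GPU.'
total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f'GPU 0 total memory: {total_gib:.1f} GiB')
```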

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

MIN_TRANSFORMERS_VERSION = '4.25.1'

# check transformers version
assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'

# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1", torch_dtype=torch.float16)
model = model.to('cuda:0')
# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)
"""
Alan Mathison Turing (23 June 1912 – 7 June 1954) was an English computer scientist, mathematician, logician, cryptanalyst, philosopher, and theoretical biologist.
"""
```
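
With `return_dict_in_generate=True`, `generate` returns an object whose `sequences` field contains the prompt tokens followed by the newly generated ones; slicing from `input_length` decodes only the model's reply rather than echoing the prompt.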

## GPU Inference in Int8

This requires a GPU with 6GB memory.

To run inference with int8, please ensure you have installed `accelerate` and `bitsandbytes`. You can install them with the following commands:

```bash
pip install accelerate
pip install bitsandbytes
```

Then you can run inference with int8 as follows:

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

MIN_TRANSFORMERS_VERSION = '4.25.1'

# check transformers version
assert transformers.__version__ >= MIN_TRANSFORMERS_VERSION, f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'

# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1", device_map='auto', torch_dtype=torch.float16, load_in_8bit=True)

# infer
prompt = "<human>: Who is Alan Turing?\n<bot>:"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)
"""
Alan Mathison Turing (23 June 1912 – 7 June 1954) was an English computer scientist, mathematician, logician, cryptanalyst, philosopher, and theoretical biologist.
"""
```
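
To confirm the int8 savings, `get_memory_footprint()` on the loaded model reports the memory taken by its weights; a minimal sketch (run after the loading code above):

```python
# report the in-memory size of the quantized weights
footprint_gib = model.get_memory_footprint() / 1024**3
print(f'Model weight footprint: {footprint_gib:.2f} GiB')
```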

## CPU Inference

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# init
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-Chat-INCITE-6.9B-v1", torch_dtype=torch.bfloat16)
# infer
inputs = tokenizer("<human>: Who is Alan Turing?\n<bot>:", return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
print(output_str)
"""
Alan Turing was a British mathematician and computer scientist. He was one of the key figures in the development of computer science and artificial intelligence. He is widely regarded as the father of computer science and artificial intelligence.
"""
```

Please note that since `LayerNormKernelImpl` is not implemented in fp16 for CPU, we use `bfloat16` for CPU inference.
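
To see the underlying limitation directly, here is a minimal sketch in plain PyTorch, independent of the model, that triggers the fp16 error on CPU and shows that bfloat16 works:

```python
import torch

ln = torch.nn.LayerNorm(8)
x = torch.randn(2, 8)

# fp16 LayerNorm is not implemented on CPU: this raises
# RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'
try:
    ln.to(torch.float16)(x.to(torch.float16))
except RuntimeError as e:
    print(e)

# bfloat16 LayerNorm is implemented on CPU, so this succeeds
out = ln.to(torch.bfloat16)(x.to(torch.bfloat16))
print(out.dtype)  # torch.bfloat16
```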

# Uses