Update README.md

It is the result of quantising to 4bit using [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ).

* [2, 3, 4, 5, 6, 8-bit GGML models for CPU+GPU inference](https://huggingface.co/TheBloke/falcon-40b-instruct-GGML)
* [Unquantised fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/tiiuae/falcon-40b-instruct)

## Prompt template
```
A helpful assistant who helps the user with any questions asked.
User: prompt
Assistant:
```
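
For example, here is a minimal sketch of filling in this template from Python (the `build_prompt` helper is just for illustration and is not part of this repo):

```python
def build_prompt(user_message: str) -> str:
    # Substitute the user's question into the template the model was tuned on
    return (
        "A helpful assistant who helps the user with any questions asked.\n"
        f"User: {user_message}\n"
        "Assistant:"
    )

print(build_prompt("What is GPTQ quantisation?"))
```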
## EXPERIMENTAL
Please note this is an experimental GPTQ model. Support for it is currently quite limited.

You will first need to install einops: `pip install einops`

You can then run this example code:
```python
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
model_name_or_path = "TheBloke/falcon-40b-instruct-GPTQ"
# You could also download the model locally, and access it there
# model_name_or_path = "/path/to/TheBloke_falcon-40b-instruct-GPTQ"
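# For example, one way to fetch it first (illustrative, not part of the
# original example; assumes the huggingface_hub package is installed):
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id="TheBloke/falcon-40b-instruct-GPTQ",
#                     local_dir="TheBloke_falcon-40b-instruct-GPTQ")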
model_basename = "gptq_model-4bit--1g"
use_triton = False
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=use_triton,
        quantize_config=None)
prompt = "Tell me about AI"
prompt_template = f'''A helpful assistant who helps the user with any questions asked.
User: {prompt}
Assistant:'''
print("\n\n*** Generate:")
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))
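# Note: output[0] also contains the prompt tokens. To print only the
# continuation, you could slice them off before decoding (illustrative):
# print(tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True))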
# Inference can also be done using transformers' pipeline
# Note that if you use pipeline, you will see a spurious error message saying the model type is not supported
# This can be ignored! Or you can hide it with the following logging line:
# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)
print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)
print(pipe(prompt_template)[0]['generated_text'])
```
## Provided files
# Model Card for Falcon-40B-Instruct
## Model Details