Update README.md

README.md CHANGED
@@ -25,6 +25,54 @@ Gemma is a family of lightweight, state-of-the-art open models from Google, buil
## Model Details
context window = 8192
SYSTEM MESSAGE NOT SUPPORTED
+```bash
+llama_model_loader: - kv 0: general.architecture str = gemma2
+llama_model_loader: - kv 1: general.type str = model
+llama_model_loader: - kv 2: general.name str = Gemma 2 2b It
+llama_model_loader: - kv 3: general.finetune str = it
+llama_model_loader: - kv 4: general.basename str = gemma-2
+llama_model_loader: - kv 5: general.size_label str = 2B
+llama_model_loader: - kv 6: general.license str = gemma
+llama_model_loader: - kv 7: general.base_model.count u32 = 1
+llama_model_loader: - kv 8: general.base_model.0.name str = Gemma 2 2b
+llama_model_loader: - kv 9: general.base_model.0.organization str = Google
+llm_load_print_meta: format = GGUF V3 (latest)
+llm_load_print_meta: arch = gemma2
+llm_load_print_meta: vocab type = SPM
+llm_load_print_meta: n_vocab = 256000
+llm_load_print_meta: n_merges = 0
+llm_load_print_meta: vocab_only = 0
+llm_load_print_meta: n_ctx_train = 8192
+llm_load_print_meta: n_embd = 2304
+llm_load_print_meta: n_layer = 26
+llm_load_print_meta: n_head = 8
+llm_load_print_meta: n_head_kv = 4
+llm_load_print_meta: model type = 2B
+llm_load_print_meta: model ftype = Q5_K - Medium
+llm_load_print_meta: model params = 2.61 B
+llm_load_print_meta: model size = 1.79 GiB (5.87 BPW)
+llm_load_print_meta: general.name = Gemma 2 2b It
+llm_load_print_meta: BOS token = 2 '<bos>'
+llm_load_print_meta: EOS token = 1 '<eos>'
+llm_load_print_meta: UNK token = 3 '<unk>'
+llm_load_print_meta: PAD token = 0 '<pad>'
+llm_load_print_meta: LF token = 227 '<0x0A>'
+llm_load_print_meta: EOT token = 107 '<end_of_turn>'
+llm_load_print_meta: EOG token = 1 '<eos>'
+llm_load_print_meta: EOG token = 107 '<end_of_turn>'
+
+>>> System role not supported
+Available chat formats from metadata: chat_template.default
+Using gguf chat template: {{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '
+' + message['content'] | trim + '<end_of_turn>
+' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model
+'}}{% endif %}
+Using chat eos_token: <eos>
+Using chat bos_token: <bos>
+
+```
+
+
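+Since the bundled chat template raises `System role not supported`, a common workaround is to fold any system-style instructions into the first user turn. A minimal sketch (the instruction text below is only an illustrative placeholder, not part of the model card):
+```python
+from llama_cpp import Llama
+
+llm = Llama(model_path='gemma-2-2b-it-q5_k_m.gguf', n_ctx=8192, verbose=False)
+
+system_prompt = 'You are a concise assistant.'  # illustrative placeholder
+user_prompt = 'What is science?'
+
+# Prepend the would-be system message to the first user message,
+# because a {'role': 'system', ...} entry would make the template raise.
+messages = [{'role': 'user', 'content': f'{system_prompt}\n\n{user_prompt}'}]
+
+response = llm.create_chat_completion(messages=messages, temperature=0.15, max_tokens=300)
+print(response['choices'][0]['message']['content'])
+```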

### Prompt Format
```python

@@ -55,7 +103,9 @@ wget https://huggingface.co/FM-1976/gemma-2-2b-it-Q5_K_M-GGUF/resolve/main/gemma

```

-Open your Python REPL
+### Open your Python REPL
+
+#### Using chat_template
```python
from llama_cpp import Llama
nCTX = 8192

@@ -78,17 +128,130 @@ response = llm.create_chat_completion(
    repeat_penalty= 1.178,
    stop=sTOPS,
    max_tokens=500)
-print(response)
+print(response['choices'][0]['message']['content'])
```

-
-```
-
+#### Using create_completion
+```python
+from llama_cpp import Llama
+nCTX = 8192
+sTOPS = ['<eos>']
+llm = Llama(
+    model_path='gemma-2-2b-it-q5_k_m.gguf',
+    temperature=0.24,
+    n_ctx=nCTX,
+    max_tokens=600,
+    repeat_penalty=1.176,
+    stop=sTOPS,
+    verbose=False,
+)
+prompt = 'Explain Science in one sentence.'
+# Gemma has no system role: wrap the prompt in user/model turn markers by hand.
+template = f'''<bos><start_of_turn>user
+{prompt}<end_of_turn>
+<start_of_turn>model
+'''
+# Pass the formatted template (not the raw prompt) to create_completion.
+res = llm.create_completion(template, temperature=0.15, max_tokens=500, repeat_penalty=1.178, stop=['<eos>'])
+print(res['choices'][0]['text'])
```

-
-
-llama-
+
+### Streaming text
+llama-cpp-python also allows you to stream text during inference.<br>
+Tokens are decoded and printed as soon as they are generated, so you don't have to wait until the entire inference is done.
+<br><br>
+You can use both the `create_chat_completion()` and `create_completion()` methods.
+<br>
+
+#### Streaming with `create_chat_completion()` method
+```python
+import datetime
+from llama_cpp import Llama
+nCTX = 8192
+sTOPS = ['<eos>']
+llm = Llama(
+    model_path='gemma-2-2b-it-q5_k_m.gguf',
+    temperature=0.24,
+    n_ctx=nCTX,
+    max_tokens=600,
+    repeat_penalty=1.176,
+    stop=sTOPS,
+    verbose=False,
+)
+firstround = 0
+full_response = ''
+message = [{'role': 'user', 'content': 'what is science?'}]
+start = datetime.datetime.now()
+for chunk in llm.create_chat_completion(
+    messages=message,
+    temperature=0.15,
+    repeat_penalty=1.31,
+    stop=['<eos>'],
+    max_tokens=500,
+    stream=True,):
+    try:
+        if chunk["choices"][0]["delta"]["content"]:
+            if firstround == 0:
+                print(chunk["choices"][0]["delta"]["content"], end="", flush=True)
+                full_response += chunk["choices"][0]["delta"]["content"]
+                ttftoken = datetime.datetime.now() - start  # time to first token
+                firstround = 1
+            else:
+                print(chunk["choices"][0]["delta"]["content"], end="", flush=True)
+                full_response += chunk["choices"][0]["delta"]["content"]
+    except:
+        # the first (role-only) and last (finish) chunks carry no 'content'
+        pass
+first_token_time = ttftoken.total_seconds()
+print(f'Time to first token: {first_token_time:.2f} seconds')
```

-
+#### Streaming with `create_completion()` method
+
+```python
+import datetime
+from llama_cpp import Llama
+nCTX = 8192
+sTOPS = ['<eos>']
+llm = Llama(
+    model_path='gemma-2-2b-it-q5_k_m.gguf',
+    temperature=0.24,
+    n_ctx=nCTX,
+    max_tokens=600,
+    repeat_penalty=1.176,
+    stop=sTOPS,
+    verbose=False,
+)
+firstround = 0
+full_response = ''
+prompt = 'Explain Science in one sentence.'
+template = f'''<bos><start_of_turn>user
+{prompt}<end_of_turn>
+<start_of_turn>model
+'''
+start = datetime.datetime.now()
+# Stream the manually formatted template (not the raw prompt).
+for chunk in llm.create_completion(
+    template,
+    temperature=0.15,
+    repeat_penalty=1.178,
+    stop=['<eos>'],
+    max_tokens=500,
+    stream=True,):
+    if firstround == 0:
+        print(chunk["choices"][0]["text"], end="", flush=True)
+        full_response += chunk["choices"][0]["text"]
+        ttftoken = datetime.datetime.now() - start  # time to first token
+        firstround = 1
+    else:
+        print(chunk["choices"][0]["text"], end="", flush=True)
+        full_response += chunk["choices"][0]["text"]
+
+first_token_time = ttftoken.total_seconds()
+print(f'Time to first token: {first_token_time:.2f} seconds')
+```
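+
+The streaming loops above already record the time to first token; a small follow-up sketch (reusing `llm`, `full_response`, `start`, and `first_token_time` from either example, and approximating the token count with the model tokenizer) can also report total time and rough decoding speed:
+```python
+import datetime
+
+# Assumes llm, full_response, start and first_token_time exist from a streaming example above.
+total_time = (datetime.datetime.now() - start).total_seconds()
+n_tokens = len(llm.tokenize(full_response.encode('utf-8')))  # Llama.tokenize() expects bytes
+decode_time = max(total_time - first_token_time, 1e-6)
+print(f'Generated {n_tokens} tokens in {total_time:.2f} s (~{n_tokens / decode_time:.1f} tokens/s after the first token)')
+```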
+
+### Further exploration
+You can also serve the model with an OpenAI-compatible API server.<br>
+This can be done with both `llama-cpp-python[server]` and `llamafile`.
+
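+As a minimal sketch of the server route (assuming `llama-cpp-python[server]` and the `openai` client package are installed, and the server defaults of `localhost:8000`):
+```python
+# First start the server in a terminal:
+#   python -m llama_cpp.server --model gemma-2-2b-it-q5_k_m.gguf --n_ctx 8192
+# Then point the standard OpenAI client at the local endpoint.
+from openai import OpenAI
+
+client = OpenAI(base_url='http://localhost:8000/v1', api_key='not-needed')
+resp = client.chat.completions.create(
+    model='gemma-2-2b-it-q5_k_m.gguf',  # use the model id reported by GET /v1/models
+    messages=[{'role': 'user', 'content': 'What is science?'}],
+    max_tokens=300,
+)
+print(resp.choices[0].message.content)
+```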