NeoChen1024 committed on
Commit
7b0c3c9
·
verified ·
1 Parent(s): e318b2e

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ benchmarks.png filter=lfs diff=lfs merge=lfs -text
37
+ winrates.png filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
Aya_23_notebook.ipynb ADDED
@@ -0,0 +1,1264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "A100",
8
+ "machine_shape": "hm"
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ },
17
+ "accelerator": "GPU",
18
+ "widgets": {
19
+ "application/vnd.jupyter.widget-state+json": {
20
+ "531def06b1f7430983a2e4ba33f41f7f": {
21
+ "model_module": "@jupyter-widgets/controls",
22
+ "model_name": "HBoxModel",
23
+ "model_module_version": "1.5.0",
24
+ "state": {
25
+ "_dom_classes": [],
26
+ "_model_module": "@jupyter-widgets/controls",
27
+ "_model_module_version": "1.5.0",
28
+ "_model_name": "HBoxModel",
29
+ "_view_count": null,
30
+ "_view_module": "@jupyter-widgets/controls",
31
+ "_view_module_version": "1.5.0",
32
+ "_view_name": "HBoxView",
33
+ "box_style": "",
34
+ "children": [
35
+ "IPY_MODEL_847b6b899bfc4e9b89b6ecb136a21385",
36
+ "IPY_MODEL_412da2e9912f4eb0ab89d44f0bb09cec",
37
+ "IPY_MODEL_1d56fddc294241f6a7cb4a300cb09afd"
38
+ ],
39
+ "layout": "IPY_MODEL_6f83c639357f4729873f6897119532f0"
40
+ }
41
+ },
42
+ "847b6b899bfc4e9b89b6ecb136a21385": {
43
+ "model_module": "@jupyter-widgets/controls",
44
+ "model_name": "HTMLModel",
45
+ "model_module_version": "1.5.0",
46
+ "state": {
47
+ "_dom_classes": [],
48
+ "_model_module": "@jupyter-widgets/controls",
49
+ "_model_module_version": "1.5.0",
50
+ "_model_name": "HTMLModel",
51
+ "_view_count": null,
52
+ "_view_module": "@jupyter-widgets/controls",
53
+ "_view_module_version": "1.5.0",
54
+ "_view_name": "HTMLView",
55
+ "description": "",
56
+ "description_tooltip": null,
57
+ "layout": "IPY_MODEL_2551b382eca04537a3a11cd70aaf574c",
58
+ "placeholder": "​",
59
+ "style": "IPY_MODEL_93e6cbabc77f4fd69ddc3dee9012cb8e",
60
+ "value": "Loading checkpoint shards: 100%"
61
+ }
62
+ },
63
+ "412da2e9912f4eb0ab89d44f0bb09cec": {
64
+ "model_module": "@jupyter-widgets/controls",
65
+ "model_name": "FloatProgressModel",
66
+ "model_module_version": "1.5.0",
67
+ "state": {
68
+ "_dom_classes": [],
69
+ "_model_module": "@jupyter-widgets/controls",
70
+ "_model_module_version": "1.5.0",
71
+ "_model_name": "FloatProgressModel",
72
+ "_view_count": null,
73
+ "_view_module": "@jupyter-widgets/controls",
74
+ "_view_module_version": "1.5.0",
75
+ "_view_name": "ProgressView",
76
+ "bar_style": "success",
77
+ "description": "",
78
+ "description_tooltip": null,
79
+ "layout": "IPY_MODEL_da2997c847b84a32b43c377137f64b5e",
80
+ "max": 4,
81
+ "min": 0,
82
+ "orientation": "horizontal",
83
+ "style": "IPY_MODEL_24f16c1efe8547f1ab36efcccda46b59",
84
+ "value": 4
85
+ }
86
+ },
87
+ "1d56fddc294241f6a7cb4a300cb09afd": {
88
+ "model_module": "@jupyter-widgets/controls",
89
+ "model_name": "HTMLModel",
90
+ "model_module_version": "1.5.0",
91
+ "state": {
92
+ "_dom_classes": [],
93
+ "_model_module": "@jupyter-widgets/controls",
94
+ "_model_module_version": "1.5.0",
95
+ "_model_name": "HTMLModel",
96
+ "_view_count": null,
97
+ "_view_module": "@jupyter-widgets/controls",
98
+ "_view_module_version": "1.5.0",
99
+ "_view_name": "HTMLView",
100
+ "description": "",
101
+ "description_tooltip": null,
102
+ "layout": "IPY_MODEL_cc8cb81531344463aa881093fff8c2f0",
103
+ "placeholder": "​",
104
+ "style": "IPY_MODEL_f4c45b260e7a4feaaeef4c50c560641a",
105
+ "value": " 4/4 [00:12<00:00,  2.77s/it]"
106
+ }
107
+ },
108
+ "6f83c639357f4729873f6897119532f0": {
109
+ "model_module": "@jupyter-widgets/base",
110
+ "model_name": "LayoutModel",
111
+ "model_module_version": "1.2.0",
112
+ "state": {
113
+ "_model_module": "@jupyter-widgets/base",
114
+ "_model_module_version": "1.2.0",
115
+ "_model_name": "LayoutModel",
116
+ "_view_count": null,
117
+ "_view_module": "@jupyter-widgets/base",
118
+ "_view_module_version": "1.2.0",
119
+ "_view_name": "LayoutView",
120
+ "align_content": null,
121
+ "align_items": null,
122
+ "align_self": null,
123
+ "border": null,
124
+ "bottom": null,
125
+ "display": null,
126
+ "flex": null,
127
+ "flex_flow": null,
128
+ "grid_area": null,
129
+ "grid_auto_columns": null,
130
+ "grid_auto_flow": null,
131
+ "grid_auto_rows": null,
132
+ "grid_column": null,
133
+ "grid_gap": null,
134
+ "grid_row": null,
135
+ "grid_template_areas": null,
136
+ "grid_template_columns": null,
137
+ "grid_template_rows": null,
138
+ "height": null,
139
+ "justify_content": null,
140
+ "justify_items": null,
141
+ "left": null,
142
+ "margin": null,
143
+ "max_height": null,
144
+ "max_width": null,
145
+ "min_height": null,
146
+ "min_width": null,
147
+ "object_fit": null,
148
+ "object_position": null,
149
+ "order": null,
150
+ "overflow": null,
151
+ "overflow_x": null,
152
+ "overflow_y": null,
153
+ "padding": null,
154
+ "right": null,
155
+ "top": null,
156
+ "visibility": null,
157
+ "width": null
158
+ }
159
+ },
160
+ "2551b382eca04537a3a11cd70aaf574c": {
161
+ "model_module": "@jupyter-widgets/base",
162
+ "model_name": "LayoutModel",
163
+ "model_module_version": "1.2.0",
164
+ "state": {
165
+ "_model_module": "@jupyter-widgets/base",
166
+ "_model_module_version": "1.2.0",
167
+ "_model_name": "LayoutModel",
168
+ "_view_count": null,
169
+ "_view_module": "@jupyter-widgets/base",
170
+ "_view_module_version": "1.2.0",
171
+ "_view_name": "LayoutView",
172
+ "align_content": null,
173
+ "align_items": null,
174
+ "align_self": null,
175
+ "border": null,
176
+ "bottom": null,
177
+ "display": null,
178
+ "flex": null,
179
+ "flex_flow": null,
180
+ "grid_area": null,
181
+ "grid_auto_columns": null,
182
+ "grid_auto_flow": null,
183
+ "grid_auto_rows": null,
184
+ "grid_column": null,
185
+ "grid_gap": null,
186
+ "grid_row": null,
187
+ "grid_template_areas": null,
188
+ "grid_template_columns": null,
189
+ "grid_template_rows": null,
190
+ "height": null,
191
+ "justify_content": null,
192
+ "justify_items": null,
193
+ "left": null,
194
+ "margin": null,
195
+ "max_height": null,
196
+ "max_width": null,
197
+ "min_height": null,
198
+ "min_width": null,
199
+ "object_fit": null,
200
+ "object_position": null,
201
+ "order": null,
202
+ "overflow": null,
203
+ "overflow_x": null,
204
+ "overflow_y": null,
205
+ "padding": null,
206
+ "right": null,
207
+ "top": null,
208
+ "visibility": null,
209
+ "width": null
210
+ }
211
+ },
212
+ "93e6cbabc77f4fd69ddc3dee9012cb8e": {
213
+ "model_module": "@jupyter-widgets/controls",
214
+ "model_name": "DescriptionStyleModel",
215
+ "model_module_version": "1.5.0",
216
+ "state": {
217
+ "_model_module": "@jupyter-widgets/controls",
218
+ "_model_module_version": "1.5.0",
219
+ "_model_name": "DescriptionStyleModel",
220
+ "_view_count": null,
221
+ "_view_module": "@jupyter-widgets/base",
222
+ "_view_module_version": "1.2.0",
223
+ "_view_name": "StyleView",
224
+ "description_width": ""
225
+ }
226
+ },
227
+ "da2997c847b84a32b43c377137f64b5e": {
228
+ "model_module": "@jupyter-widgets/base",
229
+ "model_name": "LayoutModel",
230
+ "model_module_version": "1.2.0",
231
+ "state": {
232
+ "_model_module": "@jupyter-widgets/base",
233
+ "_model_module_version": "1.2.0",
234
+ "_model_name": "LayoutModel",
235
+ "_view_count": null,
236
+ "_view_module": "@jupyter-widgets/base",
237
+ "_view_module_version": "1.2.0",
238
+ "_view_name": "LayoutView",
239
+ "align_content": null,
240
+ "align_items": null,
241
+ "align_self": null,
242
+ "border": null,
243
+ "bottom": null,
244
+ "display": null,
245
+ "flex": null,
246
+ "flex_flow": null,
247
+ "grid_area": null,
248
+ "grid_auto_columns": null,
249
+ "grid_auto_flow": null,
250
+ "grid_auto_rows": null,
251
+ "grid_column": null,
252
+ "grid_gap": null,
253
+ "grid_row": null,
254
+ "grid_template_areas": null,
255
+ "grid_template_columns": null,
256
+ "grid_template_rows": null,
257
+ "height": null,
258
+ "justify_content": null,
259
+ "justify_items": null,
260
+ "left": null,
261
+ "margin": null,
262
+ "max_height": null,
263
+ "max_width": null,
264
+ "min_height": null,
265
+ "min_width": null,
266
+ "object_fit": null,
267
+ "object_position": null,
268
+ "order": null,
269
+ "overflow": null,
270
+ "overflow_x": null,
271
+ "overflow_y": null,
272
+ "padding": null,
273
+ "right": null,
274
+ "top": null,
275
+ "visibility": null,
276
+ "width": null
277
+ }
278
+ },
279
+ "24f16c1efe8547f1ab36efcccda46b59": {
280
+ "model_module": "@jupyter-widgets/controls",
281
+ "model_name": "ProgressStyleModel",
282
+ "model_module_version": "1.5.0",
283
+ "state": {
284
+ "_model_module": "@jupyter-widgets/controls",
285
+ "_model_module_version": "1.5.0",
286
+ "_model_name": "ProgressStyleModel",
287
+ "_view_count": null,
288
+ "_view_module": "@jupyter-widgets/base",
289
+ "_view_module_version": "1.2.0",
290
+ "_view_name": "StyleView",
291
+ "bar_color": null,
292
+ "description_width": ""
293
+ }
294
+ },
295
+ "cc8cb81531344463aa881093fff8c2f0": {
296
+ "model_module": "@jupyter-widgets/base",
297
+ "model_name": "LayoutModel",
298
+ "model_module_version": "1.2.0",
299
+ "state": {
300
+ "_model_module": "@jupyter-widgets/base",
301
+ "_model_module_version": "1.2.0",
302
+ "_model_name": "LayoutModel",
303
+ "_view_count": null,
304
+ "_view_module": "@jupyter-widgets/base",
305
+ "_view_module_version": "1.2.0",
306
+ "_view_name": "LayoutView",
307
+ "align_content": null,
308
+ "align_items": null,
309
+ "align_self": null,
310
+ "border": null,
311
+ "bottom": null,
312
+ "display": null,
313
+ "flex": null,
314
+ "flex_flow": null,
315
+ "grid_area": null,
316
+ "grid_auto_columns": null,
317
+ "grid_auto_flow": null,
318
+ "grid_auto_rows": null,
319
+ "grid_column": null,
320
+ "grid_gap": null,
321
+ "grid_row": null,
322
+ "grid_template_areas": null,
323
+ "grid_template_columns": null,
324
+ "grid_template_rows": null,
325
+ "height": null,
326
+ "justify_content": null,
327
+ "justify_items": null,
328
+ "left": null,
329
+ "margin": null,
330
+ "max_height": null,
331
+ "max_width": null,
332
+ "min_height": null,
333
+ "min_width": null,
334
+ "object_fit": null,
335
+ "object_position": null,
336
+ "order": null,
337
+ "overflow": null,
338
+ "overflow_x": null,
339
+ "overflow_y": null,
340
+ "padding": null,
341
+ "right": null,
342
+ "top": null,
343
+ "visibility": null,
344
+ "width": null
345
+ }
346
+ },
347
+ "f4c45b260e7a4feaaeef4c50c560641a": {
348
+ "model_module": "@jupyter-widgets/controls",
349
+ "model_name": "DescriptionStyleModel",
350
+ "model_module_version": "1.5.0",
351
+ "state": {
352
+ "_model_module": "@jupyter-widgets/controls",
353
+ "_model_module_version": "1.5.0",
354
+ "_model_name": "DescriptionStyleModel",
355
+ "_view_count": null,
356
+ "_view_module": "@jupyter-widgets/base",
357
+ "_view_module_version": "1.2.0",
358
+ "_view_name": "StyleView",
359
+ "description_width": ""
360
+ }
361
+ },
362
+ "0272ba7f31a2441ab1cb5b8f77dbaacb": {
363
+ "model_module": "@jupyter-widgets/controls",
364
+ "model_name": "HBoxModel",
365
+ "model_module_version": "1.5.0",
366
+ "state": {
367
+ "_dom_classes": [],
368
+ "_model_module": "@jupyter-widgets/controls",
369
+ "_model_module_version": "1.5.0",
370
+ "_model_name": "HBoxModel",
371
+ "_view_count": null,
372
+ "_view_module": "@jupyter-widgets/controls",
373
+ "_view_module_version": "1.5.0",
374
+ "_view_name": "HBoxView",
375
+ "box_style": "",
376
+ "children": [
377
+ "IPY_MODEL_d1bb171ddebd4f4bbeb4ed5d4b8b7076",
378
+ "IPY_MODEL_33b4fc55703746778511265e28160837",
379
+ "IPY_MODEL_7548c151f8764276ad7951e2ac80d981"
380
+ ],
381
+ "layout": "IPY_MODEL_d972c72fef7c45998469550318661e71"
382
+ }
383
+ },
384
+ "d1bb171ddebd4f4bbeb4ed5d4b8b7076": {
385
+ "model_module": "@jupyter-widgets/controls",
386
+ "model_name": "HTMLModel",
387
+ "model_module_version": "1.5.0",
388
+ "state": {
389
+ "_dom_classes": [],
390
+ "_model_module": "@jupyter-widgets/controls",
391
+ "_model_module_version": "1.5.0",
392
+ "_model_name": "HTMLModel",
393
+ "_view_count": null,
394
+ "_view_module": "@jupyter-widgets/controls",
395
+ "_view_module_version": "1.5.0",
396
+ "_view_name": "HTMLView",
397
+ "description": "",
398
+ "description_tooltip": null,
399
+ "layout": "IPY_MODEL_2811b7c68a7b4c95b91bd5690cf06577",
400
+ "placeholder": "​",
401
+ "style": "IPY_MODEL_a33ccfdb735948e98a19d901d8091319",
402
+ "value": "Loading checkpoint shards: 100%"
403
+ }
404
+ },
405
+ "33b4fc55703746778511265e28160837": {
406
+ "model_module": "@jupyter-widgets/controls",
407
+ "model_name": "FloatProgressModel",
408
+ "model_module_version": "1.5.0",
409
+ "state": {
410
+ "_dom_classes": [],
411
+ "_model_module": "@jupyter-widgets/controls",
412
+ "_model_module_version": "1.5.0",
413
+ "_model_name": "FloatProgressModel",
414
+ "_view_count": null,
415
+ "_view_module": "@jupyter-widgets/controls",
416
+ "_view_module_version": "1.5.0",
417
+ "_view_name": "ProgressView",
418
+ "bar_style": "success",
419
+ "description": "",
420
+ "description_tooltip": null,
421
+ "layout": "IPY_MODEL_c1103244cec74a299265729e630faffd",
422
+ "max": 4,
423
+ "min": 0,
424
+ "orientation": "horizontal",
425
+ "style": "IPY_MODEL_340941cfc49e4ab983b73fb48c30dfe8",
426
+ "value": 4
427
+ }
428
+ },
429
+ "7548c151f8764276ad7951e2ac80d981": {
430
+ "model_module": "@jupyter-widgets/controls",
431
+ "model_name": "HTMLModel",
432
+ "model_module_version": "1.5.0",
433
+ "state": {
434
+ "_dom_classes": [],
435
+ "_model_module": "@jupyter-widgets/controls",
436
+ "_model_module_version": "1.5.0",
437
+ "_model_name": "HTMLModel",
438
+ "_view_count": null,
439
+ "_view_module": "@jupyter-widgets/controls",
440
+ "_view_module_version": "1.5.0",
441
+ "_view_name": "HTMLView",
442
+ "description": "",
443
+ "description_tooltip": null,
444
+ "layout": "IPY_MODEL_8bb42aa84f4b4a9ab6417aed92132063",
445
+ "placeholder": "​",
446
+ "style": "IPY_MODEL_b0cf428afc21468caeb664428627aaf6",
447
+ "value": " 4/4 [00:11<00:00,  2.57s/it]"
448
+ }
449
+ },
450
+ "d972c72fef7c45998469550318661e71": {
451
+ "model_module": "@jupyter-widgets/base",
452
+ "model_name": "LayoutModel",
453
+ "model_module_version": "1.2.0",
454
+ "state": {
455
+ "_model_module": "@jupyter-widgets/base",
456
+ "_model_module_version": "1.2.0",
457
+ "_model_name": "LayoutModel",
458
+ "_view_count": null,
459
+ "_view_module": "@jupyter-widgets/base",
460
+ "_view_module_version": "1.2.0",
461
+ "_view_name": "LayoutView",
462
+ "align_content": null,
463
+ "align_items": null,
464
+ "align_self": null,
465
+ "border": null,
466
+ "bottom": null,
467
+ "display": null,
468
+ "flex": null,
469
+ "flex_flow": null,
470
+ "grid_area": null,
471
+ "grid_auto_columns": null,
472
+ "grid_auto_flow": null,
473
+ "grid_auto_rows": null,
474
+ "grid_column": null,
475
+ "grid_gap": null,
476
+ "grid_row": null,
477
+ "grid_template_areas": null,
478
+ "grid_template_columns": null,
479
+ "grid_template_rows": null,
480
+ "height": null,
481
+ "justify_content": null,
482
+ "justify_items": null,
483
+ "left": null,
484
+ "margin": null,
485
+ "max_height": null,
486
+ "max_width": null,
487
+ "min_height": null,
488
+ "min_width": null,
489
+ "object_fit": null,
490
+ "object_position": null,
491
+ "order": null,
492
+ "overflow": null,
493
+ "overflow_x": null,
494
+ "overflow_y": null,
495
+ "padding": null,
496
+ "right": null,
497
+ "top": null,
498
+ "visibility": null,
499
+ "width": null
500
+ }
501
+ },
502
+ "2811b7c68a7b4c95b91bd5690cf06577": {
503
+ "model_module": "@jupyter-widgets/base",
504
+ "model_name": "LayoutModel",
505
+ "model_module_version": "1.2.0",
506
+ "state": {
507
+ "_model_module": "@jupyter-widgets/base",
508
+ "_model_module_version": "1.2.0",
509
+ "_model_name": "LayoutModel",
510
+ "_view_count": null,
511
+ "_view_module": "@jupyter-widgets/base",
512
+ "_view_module_version": "1.2.0",
513
+ "_view_name": "LayoutView",
514
+ "align_content": null,
515
+ "align_items": null,
516
+ "align_self": null,
517
+ "border": null,
518
+ "bottom": null,
519
+ "display": null,
520
+ "flex": null,
521
+ "flex_flow": null,
522
+ "grid_area": null,
523
+ "grid_auto_columns": null,
524
+ "grid_auto_flow": null,
525
+ "grid_auto_rows": null,
526
+ "grid_column": null,
527
+ "grid_gap": null,
528
+ "grid_row": null,
529
+ "grid_template_areas": null,
530
+ "grid_template_columns": null,
531
+ "grid_template_rows": null,
532
+ "height": null,
533
+ "justify_content": null,
534
+ "justify_items": null,
535
+ "left": null,
536
+ "margin": null,
537
+ "max_height": null,
538
+ "max_width": null,
539
+ "min_height": null,
540
+ "min_width": null,
541
+ "object_fit": null,
542
+ "object_position": null,
543
+ "order": null,
544
+ "overflow": null,
545
+ "overflow_x": null,
546
+ "overflow_y": null,
547
+ "padding": null,
548
+ "right": null,
549
+ "top": null,
550
+ "visibility": null,
551
+ "width": null
552
+ }
553
+ },
554
+ "a33ccfdb735948e98a19d901d8091319": {
555
+ "model_module": "@jupyter-widgets/controls",
556
+ "model_name": "DescriptionStyleModel",
557
+ "model_module_version": "1.5.0",
558
+ "state": {
559
+ "_model_module": "@jupyter-widgets/controls",
560
+ "_model_module_version": "1.5.0",
561
+ "_model_name": "DescriptionStyleModel",
562
+ "_view_count": null,
563
+ "_view_module": "@jupyter-widgets/base",
564
+ "_view_module_version": "1.2.0",
565
+ "_view_name": "StyleView",
566
+ "description_width": ""
567
+ }
568
+ },
569
+ "c1103244cec74a299265729e630faffd": {
570
+ "model_module": "@jupyter-widgets/base",
571
+ "model_name": "LayoutModel",
572
+ "model_module_version": "1.2.0",
573
+ "state": {
574
+ "_model_module": "@jupyter-widgets/base",
575
+ "_model_module_version": "1.2.0",
576
+ "_model_name": "LayoutModel",
577
+ "_view_count": null,
578
+ "_view_module": "@jupyter-widgets/base",
579
+ "_view_module_version": "1.2.0",
580
+ "_view_name": "LayoutView",
581
+ "align_content": null,
582
+ "align_items": null,
583
+ "align_self": null,
584
+ "border": null,
585
+ "bottom": null,
586
+ "display": null,
587
+ "flex": null,
588
+ "flex_flow": null,
589
+ "grid_area": null,
590
+ "grid_auto_columns": null,
591
+ "grid_auto_flow": null,
592
+ "grid_auto_rows": null,
593
+ "grid_column": null,
594
+ "grid_gap": null,
595
+ "grid_row": null,
596
+ "grid_template_areas": null,
597
+ "grid_template_columns": null,
598
+ "grid_template_rows": null,
599
+ "height": null,
600
+ "justify_content": null,
601
+ "justify_items": null,
602
+ "left": null,
603
+ "margin": null,
604
+ "max_height": null,
605
+ "max_width": null,
606
+ "min_height": null,
607
+ "min_width": null,
608
+ "object_fit": null,
609
+ "object_position": null,
610
+ "order": null,
611
+ "overflow": null,
612
+ "overflow_x": null,
613
+ "overflow_y": null,
614
+ "padding": null,
615
+ "right": null,
616
+ "top": null,
617
+ "visibility": null,
618
+ "width": null
619
+ }
620
+ },
621
+ "340941cfc49e4ab983b73fb48c30dfe8": {
622
+ "model_module": "@jupyter-widgets/controls",
623
+ "model_name": "ProgressStyleModel",
624
+ "model_module_version": "1.5.0",
625
+ "state": {
626
+ "_model_module": "@jupyter-widgets/controls",
627
+ "_model_module_version": "1.5.0",
628
+ "_model_name": "ProgressStyleModel",
629
+ "_view_count": null,
630
+ "_view_module": "@jupyter-widgets/base",
631
+ "_view_module_version": "1.2.0",
632
+ "_view_name": "StyleView",
633
+ "bar_color": null,
634
+ "description_width": ""
635
+ }
636
+ },
637
+ "8bb42aa84f4b4a9ab6417aed92132063": {
638
+ "model_module": "@jupyter-widgets/base",
639
+ "model_name": "LayoutModel",
640
+ "model_module_version": "1.2.0",
641
+ "state": {
642
+ "_model_module": "@jupyter-widgets/base",
643
+ "_model_module_version": "1.2.0",
644
+ "_model_name": "LayoutModel",
645
+ "_view_count": null,
646
+ "_view_module": "@jupyter-widgets/base",
647
+ "_view_module_version": "1.2.0",
648
+ "_view_name": "LayoutView",
649
+ "align_content": null,
650
+ "align_items": null,
651
+ "align_self": null,
652
+ "border": null,
653
+ "bottom": null,
654
+ "display": null,
655
+ "flex": null,
656
+ "flex_flow": null,
657
+ "grid_area": null,
658
+ "grid_auto_columns": null,
659
+ "grid_auto_flow": null,
660
+ "grid_auto_rows": null,
661
+ "grid_column": null,
662
+ "grid_gap": null,
663
+ "grid_row": null,
664
+ "grid_template_areas": null,
665
+ "grid_template_columns": null,
666
+ "grid_template_rows": null,
667
+ "height": null,
668
+ "justify_content": null,
669
+ "justify_items": null,
670
+ "left": null,
671
+ "margin": null,
672
+ "max_height": null,
673
+ "max_width": null,
674
+ "min_height": null,
675
+ "min_width": null,
676
+ "object_fit": null,
677
+ "object_position": null,
678
+ "order": null,
679
+ "overflow": null,
680
+ "overflow_x": null,
681
+ "overflow_y": null,
682
+ "padding": null,
683
+ "right": null,
684
+ "top": null,
685
+ "visibility": null,
686
+ "width": null
687
+ }
688
+ },
689
+ "b0cf428afc21468caeb664428627aaf6": {
690
+ "model_module": "@jupyter-widgets/controls",
691
+ "model_name": "DescriptionStyleModel",
692
+ "model_module_version": "1.5.0",
693
+ "state": {
694
+ "_model_module": "@jupyter-widgets/controls",
695
+ "_model_module_version": "1.5.0",
696
+ "_model_name": "DescriptionStyleModel",
697
+ "_view_count": null,
698
+ "_view_module": "@jupyter-widgets/base",
699
+ "_view_module_version": "1.2.0",
700
+ "_view_name": "StyleView",
701
+ "description_width": ""
702
+ }
703
+ }
704
+ }
705
+ }
706
+ },
707
+ "cells": [
708
+ {
709
+ "cell_type": "code",
710
+ "source": [
711
+ "!pip install -U bitsandbytes transformers peft accelerate trl datasets sentencepiece wandb\n",
712
+ "!pip install flash-attn --no-build-isolation"
713
+ ],
714
+ "metadata": {
715
+ "id": "tg1moVggj5sk",
716
+ "collapsed": true
717
+ },
718
+ "execution_count": null,
719
+ "outputs": []
720
+ },
721
+ {
722
+ "cell_type": "code",
723
+ "source": [
724
+ "MODEL_NAME = \"CohereForAI/aya-23-8b\"\n",
725
+ "\n",
726
+ "# you may want to change the following parameters depending on your GPU configuration\n",
727
+ "\n",
728
+ "# free T4 instance\n",
729
+ "# QUANTIZE_4BIT = True\n",
730
+ "# USE_GRAD_CHECKPOINTING = True\n",
731
+ "# TRAIN_BATCH_SIZE = 2\n",
732
+ "# TRAIN_MAX_SEQ_LENGTH = 512\n",
733
+ "# USE_FLASH_ATTENTION = False\n",
734
+ "# GRAD_ACC_STEPS = 16\n",
735
+ "\n",
736
+ "# equivalent A100 setting\n",
737
+ "QUANTIZE_4BIT = True\n",
738
+ "USE_GRAD_CHECKPOINTING = True\n",
739
+ "TRAIN_BATCH_SIZE = 16\n",
740
+ "TRAIN_MAX_SEQ_LENGTH = 512\n",
741
+ "USE_FLASH_ATTENTION = True\n",
742
+ "GRAD_ACC_STEPS = 2"
743
+ ],
744
+ "metadata": {
745
+ "id": "Izn6BYEYw4um"
746
+ },
747
+ "execution_count": null,
748
+ "outputs": []
749
+ },
750
+ {
751
+ "cell_type": "code",
752
+ "source": [
753
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging\n",
754
+ "from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model\n",
755
+ "import os,torch\n",
756
+ "import bitsandbytes as bnb\n",
757
+ "from datasets import load_dataset\n",
758
+ "from trl import SFTTrainer\n",
759
+ "from datasets import Dataset\n",
760
+ "import pyarrow as pa\n",
761
+ "import pyarrow.dataset as ds\n",
762
+ "import pandas as pd\n",
763
+ "import re\n",
764
+ "import wandb"
765
+ ],
766
+ "metadata": {
767
+ "id": "wMs9uNDMHL6R"
768
+ },
769
+ "execution_count": null,
770
+ "outputs": []
771
+ },
772
+ {
773
+ "cell_type": "code",
774
+ "source": [
775
+ "# Load Model\n",
776
+ "quantization_config = None\n",
777
+ "if QUANTIZE_4BIT:\n",
778
+ " quantization_config = BitsAndBytesConfig(\n",
779
+ " load_in_4bit=True,\n",
780
+ " bnb_4bit_quant_type=\"nf4\",\n",
781
+ " bnb_4bit_use_double_quant=True,\n",
782
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
783
+ " )\n",
784
+ "\n",
785
+ "attn_implementation = None\n",
786
+ "if USE_FLASH_ATTENTION:\n",
787
+ " attn_implementation=\"flash_attention_2\"\n",
788
+ "\n",
789
+ "model = AutoModelForCausalLM.from_pretrained(\n",
790
+ " MODEL_NAME,\n",
791
+ " quantization_config=quantization_config,\n",
792
+ " attn_implementation=attn_implementation,\n",
793
+ " torch_dtype=torch.bfloat16,\n",
794
+ " device_map=\"auto\",\n",
795
+ " )"
796
+ ],
797
+ "metadata": {
798
+ "colab": {
799
+ "base_uri": "https://localhost:8080/",
800
+ "height": 176,
801
+ "referenced_widgets": [
802
+ "531def06b1f7430983a2e4ba33f41f7f",
803
+ "847b6b899bfc4e9b89b6ecb136a21385",
804
+ "412da2e9912f4eb0ab89d44f0bb09cec",
805
+ "1d56fddc294241f6a7cb4a300cb09afd",
806
+ "6f83c639357f4729873f6897119532f0",
807
+ "2551b382eca04537a3a11cd70aaf574c",
808
+ "93e6cbabc77f4fd69ddc3dee9012cb8e",
809
+ "da2997c847b84a32b43c377137f64b5e",
810
+ "24f16c1efe8547f1ab36efcccda46b59",
811
+ "cc8cb81531344463aa881093fff8c2f0",
812
+ "f4c45b260e7a4feaaeef4c50c560641a"
813
+ ]
814
+ },
815
+ "id": "d9a23_jiC-qG",
816
+ "outputId": "3cf0666d-f23d-4382-b17b-c29cbe91d2f6"
817
+ },
818
+ "execution_count": null,
819
+ "outputs": [
820
+ {
821
+ "output_type": "stream",
822
+ "name": "stderr",
823
+ "text": [
824
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
825
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
826
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
827
+ "You will be able to reuse this secret in all of your notebooks.\n",
828
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
829
+ " warnings.warn(\n"
830
+ ]
831
+ },
832
+ {
833
+ "output_type": "display_data",
834
+ "data": {
835
+ "text/plain": [
836
+ "Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]"
837
+ ],
838
+ "application/vnd.jupyter.widget-view+json": {
839
+ "version_major": 2,
840
+ "version_minor": 0,
841
+ "model_id": "531def06b1f7430983a2e4ba33f41f7f"
842
+ }
843
+ },
844
+ "metadata": {}
845
+ }
846
+ ]
847
+ },
848
+ {
849
+ "cell_type": "code",
850
+ "source": [
851
+ "# Load tokenizer\n",
852
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)"
853
+ ],
854
+ "metadata": {
855
+ "colab": {
856
+ "base_uri": "https://localhost:8080/"
857
+ },
858
+ "id": "YuqAA8GhYSdO",
859
+ "outputId": "14553887-8142-492e-ca23-aeddac002815"
860
+ },
861
+ "execution_count": null,
862
+ "outputs": [
863
+ {
864
+ "output_type": "stream",
865
+ "name": "stderr",
866
+ "text": [
867
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
868
+ ]
869
+ }
870
+ ]
871
+ },
872
+ {
873
+ "cell_type": "code",
874
+ "source": [
875
+ "def get_message_format(prompts):\n",
876
+ " messages = []\n",
877
+ "\n",
878
+ " for p in prompts:\n",
879
+ " messages.append(\n",
880
+ " [{\"role\": \"user\", \"content\": p}]\n",
881
+ " )\n",
882
+ "\n",
883
+ " return messages\n",
884
+ "\n",
885
+ "def generate_aya_23(\n",
886
+ " prompts,\n",
887
+ " model,\n",
888
+ " temperature=0.3,\n",
889
+ " top_p=0.75,\n",
890
+ " top_k=0,\n",
891
+ " max_new_tokens=1024\n",
892
+ " ):\n",
893
+ "\n",
894
+ " messages = get_message_format(prompts)\n",
895
+ "\n",
896
+ " input_ids = tokenizer.apply_chat_template(\n",
897
+ " messages,\n",
898
+ " tokenize=True,\n",
899
+ " add_generation_prompt=True,\n",
900
+ " padding=True,\n",
901
+ " return_tensors=\"pt\",\n",
902
+ " )\n",
903
+ " input_ids = input_ids.to(model.device)\n",
904
+ " prompt_padded_len = len(input_ids[0])\n",
905
+ "\n",
906
+ " gen_tokens = model.generate(\n",
907
+ " input_ids,\n",
908
+ " temperature=temperature,\n",
909
+ " top_p=top_p,\n",
910
+ " top_k=top_k,\n",
911
+ " max_new_tokens=max_new_tokens,\n",
912
+ " do_sample=True,\n",
913
+ " )\n",
914
+ "\n",
915
+ " # get only generated tokens\n",
916
+ " gen_tokens = [\n",
917
+ " gt[prompt_padded_len:] for gt in gen_tokens\n",
918
+ " ]\n",
919
+ "\n",
920
+ " gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)\n",
921
+ " return gen_text"
922
+ ],
923
+ "metadata": {
924
+ "id": "s75a8Vda-eqx"
925
+ },
926
+ "execution_count": null,
927
+ "outputs": []
928
+ },
929
+ {
930
+ "cell_type": "code",
931
+ "source": [
932
+ "# Test generations on languages in Aya 23 set\n",
933
+ "prompts = [\n",
934
+ " \"Write a list of three fruits and tell me about each of them\", # English\n",
935
+ " \"Viết danh sách ba loại trái cây và kể cho tôi nghe về từng loại trái cây đó\", # Vietnamese\n",
936
+ " \"3 つの果物のリストを書いて、それぞれについて教えてください\", # Japanese\n",
937
+ " \"Üç meyveden oluşan bir liste yazın ve bana her birini anlatın\" # Turkish\n",
938
+ "]\n",
939
+ "\n",
940
+ "generations = generate_aya_23(prompts, model)\n",
941
+ "\n",
942
+ "for p, g in zip(prompts, generations):\n",
943
+ " print(\n",
944
+ " \"PROMPT\", p ,\"RESPONSE\", g, \"\\n\", sep=\"\\n\"\n",
945
+ " )"
946
+ ],
947
+ "metadata": {
948
+ "id": "4l12EC7q-h3I",
949
+ "colab": {
950
+ "base_uri": "https://localhost:8080/"
951
+ },
952
+ "outputId": "e32ee1a4-9d91-447f-9bde-c8c71c727d80"
953
+ },
954
+ "execution_count": null,
955
+ "outputs": [
956
+ {
957
+ "output_type": "stream",
958
+ "name": "stdout",
959
+ "text": [
960
+ "PROMPT\n",
961
+ "Write a list of three fruits and tell me about each of them\n",
962
+ "RESPONSE\n",
963
+ "Sure! Here is a list of three fruits, along with some information about each of them:\n",
964
+ "\n",
965
+ "1. Apple: Apples are a popular fruit that are widely cultivated across the world. They are typically round or oval in shape and come in a variety of colors, including red, green, yellow, and a blend of these colors. Apples are known for their crisp texture and sweet or tart taste. They are a good source of dietary fiber, vitamins, and antioxidants.\n",
966
+ "\n",
967
+ "2. Banana: Bananas are long, curved fruits that come in a range of colors, from yellow to brown. They are a good source of potassium, vitamins, and fiber. Bananas have a sweet taste and are often eaten raw, but they can also be used in baking or blended into smoothies.\n",
968
+ "\n",
969
+ "3. Orange: Oranges are citrus fruits known for their vibrant orange color and sweet, tangy taste. They are a good source of vitamin C and other nutrients. Oranges can be eaten fresh, juiced, or used in various dishes, such as salads, desserts, and marmalades.\n",
970
+ "\n",
971
+ "These fruits are not only delicious but also provide various health benefits and are commonly used in various cuisines worldwide.\n",
972
+ "\n",
973
+ "\n",
974
+ "PROMPT\n",
975
+ "Viết danh sách ba loại trái cây và kể cho tôi nghe về từng loại trái cây đó\n",
976
+ "RESPONSE\n",
977
+ "Dưới đây là ba loại trái cây phổ biến, mỗi loại có hương vị và đặc điểm riêng:\n",
978
+ "\n",
979
+ "1. Táo: Táo là một loại trái cây quen thuộc và phổ biến trên toàn thế giới. Chúng có nguồn gốc từ Châu Á nhưng hiện nay được trồng ở nhiều nơi. Táo có hình tròn hoặc oval, với nhiều loại khác nhau về kích thước và màu sắc. Vỏ táo có thể có màu đỏ, xanh hoặc vàng, trong khi phần thịt thường có màu trắng hoặc hồng nhạt. Táo có hương vị ngọt ngào và tươi mát, với một chút giòn khi ăn. Chúng chứa nhiều vitamin và chất xơ, làm cho táo trở thành một món ăn vặt lành mạnh. Táo cũng thường được sử dụng trong các món tráng miệng và nước ép.\n",
980
+ "\n",
981
+ "2. Cam: Cam là một loại trái cây nhiệt đới có nguồn gốc từ Châu Phi và hiện nay được trồng rộng rãi trên toàn thế giới. Chúng có hình tròn hoặc oval, với vỏ cam hoặc vàng và thịt màu cam tươi sáng. Cam có hương vị ngọt ngào và chua nhẹ, với một chút giòn khi ăn. Chúng chứa nhiều vitamin C và có thể được ăn tươi hoặc ép lấy nước. Cam cũng thường được sử dụng trong các món salad, nước ép và các món tráng miệng.\n",
982
+ "\n",
983
+ "3. Dâu tây: Dâu tây là một loại trái cây mọng nước có nguồn gốc từ Châu Âu và hiện nay được trồng rộng rãi trên toàn thế giới. Chúng có hình tròn hoặc oval, với màu đỏ tươi hoặc hồng nhạt và thịt trắng hoặc hồng nhạt. Dâu tây có hương vị ngọt ngào và tươi mát, với một chút giòn. Chúng thường được sử dụng trong các món tráng miệng, bánh ngọt và salad. Dâu tây cũng chứa nhiều vitamin và chất chống oxy hóa, làm cho chúng trở thành một lựa chọn lành mạnh.\n",
984
+ "\n",
985
+ "Mỗi loại trái cây này đều có hương vị và đặc điểm riêng, nhưng tất cả đều là những lựa chọn lành mạnh và ngon miệng cho bữa ăn nhẹ hoặc món tráng miệng.\n",
986
+ "\n",
987
+ "\n",
988
+ "PROMPT\n",
989
+ "3 つの果物のリストを書いて、それぞれについて教えてください\n",
990
+ "RESPONSE\n",
991
+ "もちろんです! 3 つの果物は次のとおりです。\n",
992
+ "\n",
993
+ "1. リンゴ: リンゴは世界中で広く栽培されている人気のある果物です。甘くてジューシーな味と食感で知られ、赤、緑、黄色などさまざまな品種があります。リンゴはビタミンや食物繊維が豊富で、健康的なスナックとしてよく食べられています。\n",
994
+ "\n",
995
+ "2. オレンジ: オレンジは柑橘類の一種で、ビタミン C が豊富に含まれています。甘酸っぱい味わいとジューシーな食感が特徴で、世界中で広く消費されています。オレンジは免疫力を高め、健康な皮膚と髪を維持するのに役立つと考えられています。\n",
996
+ "\n",
997
+ "3. スターフルーツ: スターフルーツは、その名前が示すように、星形をした独特の形をした果物です。甘くて爽やかな味わいで、ビタミン C と食物繊維が豊富です。スターフルーツは通常、生として食べられますが、ジュースやデザートにも使われます。\n",
998
+ "\n",
999
+ "これらの果物はすべて、栄養価が高く、さまざまな健康上の利点を提供します。世界中で広く利用可能で、さまざまな方法で楽しむことができます。\n",
1000
+ "\n",
1001
+ "\n",
1002
+ "PROMPT\n",
1003
+ "Üç meyveden oluşan bir liste yazın ve bana her birini anlatın\n",
1004
+ "RESPONSE\n",
1005
+ "Elma, armut ve çilek.\n",
1006
+ "\n",
1007
+ "Elma: Elma, dünyanın birçok bölgesinde yetişen popüler ve yaygın bir meyvedir. Genellikle kırmızı veya yeşil kabuğu ve sulu, tatlı eti vardır. Elma, vitamin C ve lif bakımından zengindir ve sağlıklı bir atıştırmalık olarak kabul edilir.\n",
1008
+ "\n",
1009
+ "Armut: Armut, yaz aylarında hasat edilen ve genellikle sarı, yeşil veya mor renkte olan bir meyvedir. Armut, elmaya benzer bir tada sahiptir, ancak daha yumuşak ve sulu bir dokuya sahiptir. Armut da vitamin C ve K bakımından zengindir ve sindirimi kolay bir meyve olarak bilinir.\n",
1010
+ "\n",
1011
+ "Çilek: Çilek, bahar ve yaz aylarında hasat edilen ve tatlı ve aromatik bir tada sahip kırmızı meyvelerdir. Çilekler genellikle taze olarak yenir, ancak dondurulmuş veya kurutulmuş olarak da tüketilebilir. Vitamin C ve antioksidanlar bakımından zengindir ve kalp sağlığını destekleyebileceği düşünülmektedir.\n",
1012
+ "\n",
1013
+ "Bu üç meyve, her birinin kendine has özellikleri ve faydaları olan lezzetli ve besleyici seçenekler sunar.\n",
1014
+ "\n",
1015
+ "\n"
1016
+ ]
1017
+ }
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "source": [
1023
+ "# Test Bengali (not in Aya 23 set) inference on base model\n",
1024
+ "\n",
1025
+ "prompts = [\n",
1026
+ " 'Translate from English to Bengali: \"Rates are competitive, almost always the best in the market\"'\n",
1027
+ "]\n",
1028
+ "\n",
1029
+ "generations = generate_aya_23(prompts, model)\n",
1030
+ "\n",
1031
+ "for p, g in zip(prompts, generations):\n",
1032
+ " print(\n",
1033
+ " \"PROMPT\", p ,\"RESPONSE\", g, \"\\n\", sep=\"\\n\"\n",
1034
+ " )"
1035
+ ],
1036
+ "metadata": {
1037
+ "colab": {
1038
+ "base_uri": "https://localhost:8080/"
1039
+ },
1040
+ "id": "tkEl3__Mwd8N",
1041
+ "outputId": "d4cf3e07-f148-4a57-cd69-b72acfc15b54"
1042
+ },
1043
+ "execution_count": null,
1044
+ "outputs": [
1045
+ {
1046
+ "output_type": "stream",
1047
+ "name": "stdout",
1048
+ "text": [
1049
+ "PROMPT\n",
1050
+ "Translate from English to Bengali: \"Rates are competitive, almost always the best in the market\"\n",
1051
+ "RESPONSE\n",
1052
+ "\"পরিণতি সংসাধানকরি, বাজারের সম্পর্কে সম্প্রতি সবচেয়ে বেশি\"\n",
1053
+ "\n",
1054
+ "\n"
1055
+ ]
1056
+ }
1057
+ ]
1058
+ },
1059
+ {
1060
+ "cell_type": "code",
1061
+ "source": [
1062
+ "# Load an English to Bengali translation dataset from Aya Collection\n",
1063
+ "dataset = load_dataset(\"CohereForAI/aya_collection\", \"templated_indic_sentiment\")['train']\n",
1064
+ "dataset = dataset.filter(lambda example: example['language']=='ben')\n",
1065
+ "\n",
1066
+ "def formatting_prompts_func(example):\n",
1067
+ " output_texts = []\n",
1068
+ " for i in range(len(example['inputs'])):\n",
1069
+ " text = f\"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{example['inputs'][i]}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{example['targets'][i]}\"\n",
1070
+ " output_texts.append(text)\n",
1071
+ " return output_texts"
1072
+ ],
1073
+ "metadata": {
1074
+ "id": "CHXm3Io5zCrk"
1075
+ },
1076
+ "execution_count": null,
1077
+ "outputs": []
1078
+ },
1079
+ {
1080
+ "cell_type": "code",
1081
+ "source": [
1082
+ "# Training Arguments\n",
1083
+ "training_arguments = TrainingArguments(\n",
1084
+ " output_dir=\"results\",\n",
1085
+ " num_train_epochs=20,\n",
1086
+ " per_device_train_batch_size=TRAIN_BATCH_SIZE,\n",
1087
+ " gradient_accumulation_steps=GRAD_ACC_STEPS,\n",
1088
+ " gradient_checkpointing=USE_GRAD_CHECKPOINTING,\n",
1089
+ " optim=\"paged_adamw_32bit\",\n",
1090
+ " save_steps=50,\n",
1091
+ " logging_steps=10,\n",
1092
+ " learning_rate=1e-3,\n",
1093
+ " weight_decay=0.001,\n",
1094
+ " fp16=False,\n",
1095
+ " bf16=True,\n",
1096
+ " warmup_ratio=0.05,\n",
1097
+ " group_by_length=True,\n",
1098
+ " lr_scheduler_type=\"constant\",\n",
1099
+ " report_to=\"none\"\n",
1100
+ ")\n",
1101
+ "\n",
1102
+ "peft_config = LoraConfig(\n",
1103
+ " lora_alpha=32,\n",
1104
+ " r=32,\n",
1105
+ " bias=\"none\",\n",
1106
+ " task_type=\"CAUSAL_LM\",\n",
1107
+ " target_modules=[\"q_proj\", \"v_proj\", \"k_proj\", \"o_proj\"]\n",
1108
+ ")\n",
1109
+ "\n",
1110
+ "trainer = SFTTrainer(\n",
1111
+ " model=model,\n",
1112
+ " train_dataset=dataset,\n",
1113
+ " peft_config=peft_config,\n",
1114
+ " max_seq_length=TRAIN_MAX_SEQ_LENGTH,\n",
1115
+ " tokenizer=tokenizer,\n",
1116
+ " args=training_arguments,\n",
1117
+ " formatting_func=formatting_prompts_func\n",
1118
+ ")"
1119
+ ],
1120
+ "metadata": {
1121
+ "id": "A9OdyDDEy7rM",
1122
+ "colab": {
1123
+ "base_uri": "https://localhost:8080/"
1124
+ },
1125
+ "outputId": "49592f25-4aaf-4e21-f612-a6fe5c5865e1"
1126
+ },
1127
+ "execution_count": null,
1128
+ "outputs": [
1129
+ {
1130
+ "output_type": "stream",
1131
+ "name": "stderr",
1132
+ "text": [
1133
+ "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py:318: UserWarning: You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code.\n",
1134
+ " warnings.warn(\n"
1135
+ ]
1136
+ }
1137
+ ]
1138
+ },
1139
+ {
1140
+ "cell_type": "code",
1141
+ "source": [
1142
+ "trainer.train()"
1143
+ ],
1144
+ "metadata": {
1145
+ "id": "9BvK-3eYiwhx"
1146
+ },
1147
+ "execution_count": null,
1148
+ "outputs": []
1149
+ },
1150
+ {
1151
+ "cell_type": "code",
1152
+ "source": [
1153
+ "# Save the model to disk\n",
1154
+ "trainer.model.save_pretrained(save_directory='aya-qlora')\n",
1155
+ "model.config.use_cache = True\n",
1156
+ "model.eval()"
1157
+ ],
1158
+ "metadata": {
1159
+ "id": "X3Lqfwo-8CCG"
1160
+ },
1161
+ "execution_count": null,
1162
+ "outputs": []
1163
+ },
1164
+ {
1165
+ "cell_type": "code",
1166
+ "source": [
1167
+ "# Test Bengali inference on loaded fine-tuned model\n",
1168
+ "\n",
1169
+ "# Load Model and LoRA Adapter\n",
1170
+ "quantization_config = None\n",
1171
+ "if QUANTIZE_4BIT:\n",
1172
+ " quantization_config = BitsAndBytesConfig(\n",
1173
+ " load_in_4bit=True,\n",
1174
+ " bnb_4bit_quant_type=\"nf4\",\n",
1175
+ " bnb_4bit_use_double_quant=True,\n",
1176
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
1177
+ " )\n",
1178
+ "\n",
1179
+ "attn_implementation = None\n",
1180
+ "if USE_FLASH_ATTENTION:\n",
1181
+ " attn_implementation=\"flash_attention_2\"\n",
1182
+ "\n",
1183
+ "loaded_model = AutoModelForCausalLM.from_pretrained(\n",
1184
+ " MODEL_NAME,\n",
1185
+ " quantization_config=quantization_config,\n",
1186
+ " attn_implementation=attn_implementation,\n",
1187
+ " torch_dtype=torch.bfloat16,\n",
1188
+ " device_map=\"auto\",\n",
1189
+ " )\n",
1190
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
1191
+ "loaded_model.load_adapter(\"aya-qlora\")\n",
1192
+ "\n",
1193
+ "\n",
1194
+ "prompts = [\n",
1195
+ " 'Translate from English to Bengali: \"Rates are competitive, almost always the best in the market\"'\n",
1196
+ "]\n",
1197
+ "\n",
1198
+ "generations = generate_aya_23(prompts, loaded_model)\n",
1199
+ "\n",
1200
+ "for p, g in zip(prompts, generations):\n",
1201
+ " print(\n",
1202
+ " \"PROMPT\", p ,\"RESPONSE\", g, \"\\n\", sep=\"\\n\"\n",
1203
+ " )"
1204
+ ],
1205
+ "metadata": {
1206
+ "colab": {
1207
+ "base_uri": "https://localhost:8080/",
1208
+ "height": 174,
1209
+ "referenced_widgets": [
1210
+ "0272ba7f31a2441ab1cb5b8f77dbaacb",
1211
+ "d1bb171ddebd4f4bbeb4ed5d4b8b7076",
1212
+ "33b4fc55703746778511265e28160837",
1213
+ "7548c151f8764276ad7951e2ac80d981",
1214
+ "d972c72fef7c45998469550318661e71",
1215
+ "2811b7c68a7b4c95b91bd5690cf06577",
1216
+ "a33ccfdb735948e98a19d901d8091319",
1217
+ "c1103244cec74a299265729e630faffd",
1218
+ "340941cfc49e4ab983b73fb48c30dfe8",
1219
+ "8bb42aa84f4b4a9ab6417aed92132063",
1220
+ "b0cf428afc21468caeb664428627aaf6"
1221
+ ]
1222
+ },
1223
+ "id": "w5HGIJtRJN-y",
1224
+ "outputId": "441193fe-89fa-40ad-8585-d1f2dcf124e5"
1225
+ },
1226
+ "execution_count": null,
1227
+ "outputs": [
1228
+ {
1229
+ "output_type": "display_data",
1230
+ "data": {
1231
+ "text/plain": [
1232
+ "Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]"
1233
+ ],
1234
+ "application/vnd.jupyter.widget-view+json": {
1235
+ "version_major": 2,
1236
+ "version_minor": 0,
1237
+ "model_id": "0272ba7f31a2441ab1cb5b8f77dbaacb"
1238
+ }
1239
+ },
1240
+ "metadata": {}
1241
+ },
1242
+ {
1243
+ "output_type": "stream",
1244
+ "name": "stderr",
1245
+ "text": [
1246
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
1247
+ ]
1248
+ },
1249
+ {
1250
+ "output_type": "stream",
1251
+ "name": "stdout",
1252
+ "text": [
1253
+ "PROMPT\n",
1254
+ "Translate from English to Bengali: \"Rates are competitive, almost always the best in the market\"\n",
1255
+ "RESPONSE\n",
1256
+ "\"দরগুলি প্রতিযোগিতামূলক, প্রায় সবসময় বাজারে সেরা\"\n",
1257
+ "\n",
1258
+ "\n"
1259
+ ]
1260
+ }
1261
+ ]
1262
+ }
1263
+ ]
1264
+ }
README.md ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ inference: false
3
+ library_name: transformers
4
+ language:
5
+ - en
6
+ - fr
7
+ - de
8
+ - es
9
+ - it
10
+ - pt
11
+ - ja
12
+ - ko
13
+ - zh
14
+ - ar
15
+ - el
16
+ - fa
17
+ - pl
18
+ - id
19
+ - cs
20
+ - he
21
+ - hi
22
+ - nl
23
+ - ro
24
+ - ru
25
+ - tr
26
+ - uk
27
+ - vi
28
+ license: cc-by-nc-4.0
29
+ extra_gated_prompt: "By submitting this form, you agree to the [License Agreement](https://cohere.com/c4ai-cc-by-nc-license) and acknowledge that the information you provide will be collected, used, and shared in accordance with Cohere’s [Privacy Policy](https://cohere.com/privacy). You’ll receive email updates about C4AI and Cohere research, events, products and services. You can unsubscribe at any time."
30
+ extra_gated_fields:
31
+ Name: text
32
+ Affiliation: text
33
+ Country:
34
+ type: select
35
+ options:
36
+ - Aruba
37
+ - Afghanistan
38
+ - Angola
39
+ - Anguilla
40
+ - Åland Islands
41
+ - Albania
42
+ - Andorra
43
+ - United Arab Emirates
44
+ - Argentina
45
+ - Armenia
46
+ - American Samoa
47
+ - Antarctica
48
+ - French Southern Territories
49
+ - Antigua and Barbuda
50
+ - Australia
51
+ - Austria
52
+ - Azerbaijan
53
+ - Burundi
54
+ - Belgium
55
+ - Benin
56
+ - Bonaire Sint Eustatius and Saba
57
+ - Burkina Faso
58
+ - Bangladesh
59
+ - Bulgaria
60
+ - Bahrain
61
+ - Bahamas
62
+ - Bosnia and Herzegovina
63
+ - Saint Barthélemy
64
+ - Belarus
65
+ - Belize
66
+ - Bermuda
67
+ - Plurinational State of Bolivia
68
+ - Brazil
69
+ - Barbados
70
+ - Brunei-Darussalam
71
+ - Bhutan
72
+ - Bouvet-Island
73
+ - Botswana
74
+ - Central African Republic
75
+ - Canada
76
+ - Cocos (Keeling) Islands
77
+ - Switzerland
78
+ - Chile
79
+ - China
80
+ - Côte-dIvoire
81
+ - Cameroon
82
+ - Democratic Republic of the Congo
83
+ - Cook Islands
84
+ - Colombia
85
+ - Comoros
86
+ - Cabo Verde
87
+ - Costa Rica
88
+ - Cuba
89
+ - Curaçao
90
+ - Christmas Island
91
+ - Cayman Islands
92
+ - Cyprus
93
+ - Czechia
94
+ - Germany
95
+ - Djibouti
96
+ - Dominica
97
+ - Denmark
98
+ - Dominican Republic
99
+ - Algeria
100
+ - Ecuador
101
+ - Egypt
102
+ - Eritrea
103
+ - Western Sahara
104
+ - Spain
105
+ - Estonia
106
+ - Ethiopia
107
+ - Finland
108
+ - Fiji
109
+ - Falkland Islands (Malvinas)
110
+ - France
111
+ - Faroe Islands
112
+ - Federated States of Micronesia
113
+ - Gabon
114
+ - United Kingdom
115
+ - Georgia
116
+ - Guernsey
117
+ - Ghana
118
+ - Gibraltar
119
+ - Guinea
120
+ - Guadeloupe
121
+ - Gambia
122
+ - Guinea Bissau
123
+ - Equatorial Guinea
124
+ - Greece
125
+ - Grenada
126
+ - Greenland
127
+ - Guatemala
128
+ - French Guiana
129
+ - Guam
130
+ - Guyana
131
+ - Hong Kong
132
+ - Heard Island and McDonald Islands
133
+ - Honduras
134
+ - Croatia
135
+ - Haiti
136
+ - Hungary
137
+ - Indonesia
138
+ - Isle of Man
139
+ - India
140
+ - British Indian Ocean Territory
141
+ - Ireland
142
+ - Islamic Republic of Iran
143
+ - Iraq
144
+ - Iceland
145
+ - Israel
146
+ - Italy
147
+ - Jamaica
148
+ - Jersey
149
+ - Jordan
150
+ - Japan
151
+ - Kazakhstan
152
+ - Kenya
153
+ - Kyrgyzstan
154
+ - Cambodia
155
+ - Kiribati
156
+ - Saint-Kitts-and-Nevis
157
+ - South Korea
158
+ - Kuwait
159
+ - Lao-Peoples-Democratic-Republic
160
+ - Lebanon
161
+ - Liberia
162
+ - Libya
163
+ - Saint-Lucia
164
+ - Liechtenstein
165
+ - Sri Lanka
166
+ - Lesotho
167
+ - Lithuania
168
+ - Luxembourg
169
+ - Latvia
170
+ - Macao
171
+ - Saint Martin (French-part)
172
+ - Morocco
173
+ - Monaco
174
+ - Republic of Moldova
175
+ - Madagascar
176
+ - Maldives
177
+ - Mexico
178
+ - Marshall Islands
179
+ - North Macedonia
180
+ - Mali
181
+ - Malta
182
+ - Myanmar
183
+ - Montenegro
184
+ - Mongolia
185
+ - Northern Mariana Islands
186
+ - Mozambique
187
+ - Mauritania
188
+ - Montserrat
189
+ - Martinique
190
+ - Mauritius
191
+ - Malawi
192
+ - Malaysia
193
+ - Mayotte
194
+ - Namibia
195
+ - New Caledonia
196
+ - Niger
197
+ - Norfolk Island
198
+ - Nigeria
199
+ - Nicaragua
200
+ - Niue
201
+ - Netherlands
202
+ - Norway
203
+ - Nepal
204
+ - Nauru
205
+ - New Zealand
206
+ - Oman
207
+ - Pakistan
208
+ - Panama
209
+ - Pitcairn
210
+ - Peru
211
+ - Philippines
212
+ - Palau
213
+ - Papua New Guinea
214
+ - Poland
215
+ - Puerto Rico
216
+ - North Korea
217
+ - Portugal
218
+ - Paraguay
219
+ - State of Palestine
220
+ - French Polynesia
221
+ - Qatar
222
+ - Réunion
223
+ - Romania
224
+ - Russia
225
+ - Rwanda
226
+ - Saudi Arabia
227
+ - Sudan
228
+ - Senegal
229
+ - Singapore
230
+ - South Georgia and the South Sandwich Islands
231
+ - Saint Helena Ascension and Tristan da Cunha
232
+ - Svalbard and Jan Mayen
233
+ - Solomon Islands
234
+ - Sierra Leone
235
+ - El Salvador
236
+ - San Marino
237
+ - Somalia
238
+ - Saint Pierre and Miquelon
239
+ - Serbia
240
+ - South Sudan
241
+ - Sao Tome and Principe
242
+ - Suriname
243
+ - Slovakia
244
+ - Slovenia
245
+ - Sweden
246
+ - Eswatini
247
+ - Sint Maarten (Dutch-part)
248
+ - Seychelles
249
+ - Syrian Arab Republic
250
+ - Turks and Caicos Islands
251
+ - Chad
252
+ - Togo
253
+ - Thailand
254
+ - Tajikistan
255
+ - Tokelau
256
+ - Turkmenistan
257
+ - Timor Leste
258
+ - Tonga
259
+ - Trinidad and Tobago
260
+ - Tunisia
261
+ - Turkey
262
+ - Tuvalu
263
+ - Taiwan
264
+ - United Republic of Tanzania
265
+ - Uganda
266
+ - Ukraine
267
+ - United States Minor Outlying Islands
268
+ - Uruguay
269
+ - United-States
270
+ - Uzbekistan
271
+ - Holy See (Vatican City State)
272
+ - Saint Vincent and the Grenadines
273
+ - Bolivarian Republic of Venezuela
274
+ - Virgin Islands British
275
+ - Virgin Islands U.S.
276
+ - VietNam
277
+ - Vanuatu
278
+ - Wallis and Futuna
279
+ - Samoa
280
+ - Yemen
281
+ - South Africa
282
+ - Zambia
283
+ - Zimbabwe
284
+
285
+ I agree to use this model for non-commercial use ONLY: checkbox
286
+ ---
287
+
288
+ # Model Card for Aya-23-8B
289
+
290
+ **Try Aya 23**
291
+
292
+ You can try out Aya 23 (35B) before downloading the weights in our hosted Hugging Face Space [here](https://huggingface.co/spaces/CohereForAI/aya-23).
293
+
294
+ ## Model Summary
295
+
296
+ Aya 23 is an open weights research release of an instruction fine-tuned model with highly advanced multilingual capabilities. Aya 23 focuses on pairing a highly performant pre-trained [Command family](https://huggingface.co/CohereForAI/c4ai-command-r-plus) of models with the recently released [Aya Collection](https://huggingface.co/datasets/CohereForAI/aya_collection). The result is a powerful multilingual large language model serving 23 languages.
297
+
298
+ This model card corresponds to the 8-billion parameter version of the Aya 23 model. We also released a 35-billion parameter version which you can find [here](https://huggingface.co/CohereForAI/aya-23-35B).
299
+
300
+ We cover 23 languages: Arabic, Chinese (simplified & traditional), Czech, Dutch, English, French, German, Greek, Hebrew, Hindi, Indonesian, Italian, Japanese, Korean, Persian, Polish, Portuguese, Romanian, Russian, Spanish, Turkish, Ukrainian, and Vietnamese.
301
+
302
+ Developed by: [Cohere For AI](https://cohere.for.ai) and [Cohere](https://cohere.com/)
303
+
304
+ - Point of Contact: Cohere For AI: [cohere.for.ai](https://cohere.for.ai/)
305
+ - License: [CC-BY-NC](https://cohere.com/c4ai-cc-by-nc-license), which also requires adhering to [C4AI's Acceptable Use Policy](https://docs.cohere.com/docs/c4ai-acceptable-use-policy)
306
+ - Model: aya-23-8B
307
+ - Model Size: 8 billion parameters
308
+
309
+ ### Usage
310
+
311
+ Please install a version of transformers that includes the necessary changes for this model, for example the release pinned in the snippet below:
312
+
313
+ ```python
314
+ # pip install transformers==4.41.1
315
+ from transformers import AutoTokenizer, AutoModelForCausalLM
316
+
317
+ model_id = "CohereForAI/aya-23-8B"
318
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
319
+ model = AutoModelForCausalLM.from_pretrained(model_id)
320
+
321
+ # Format message with the command-r-plus chat template
322
+ messages = [{"role": "user", "content": "Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz"}]
323
+ input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
324
+ ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
325
+
326
+ gen_tokens = model.generate(
327
+ input_ids,
328
+ max_new_tokens=100,
329
+ do_sample=True,
330
+ temperature=0.3,
331
+ )
332
+
333
+ gen_text = tokenizer.decode(gen_tokens[0])
334
+ print(gen_text)
335
+ ```
336
+
337
+ ### Example Notebook
338
+
339
+ [This notebook](https://huggingface.co/CohereForAI/aya-23-8B/blob/main/Aya_23_notebook.ipynb) showcases a detailed use of Aya 23 (8B) including inference and fine-tuning with [QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
340
+
341
+ ## Model Details
342
+
343
+ **Input**: The model accepts text input only.
344
+
345
+ **Output**: The model generates text only.
346
+
347
+ **Model Architecture**: Aya-23-8B is an auto-regressive language model that uses an optimized transformer architecture. After pretraining, this model is fine-tuned (IFT) to follow human instructions.
348
+
349
+ **Languages covered**: The model is particularly optimized for multilinguality and supports the following languages: Arabic, Chinese (simplified & traditional), Czech, Dutch, English, French, German, Greek, Hebrew, Hindi, Indonesian, Italian, Japanese, Korean, Persian, Polish, Portuguese, Romanian, Russian, Spanish, Turkish, Ukrainian, and Vietnamese.
350
+
351
+ **Context length**: 8192
352
+
353
+ ### Evaluation
354
+
355
+ <img src="benchmarks.png" alt="multilingual benchmarks" width="650" style="margin-left:auto; margin-right:auto; display:block;"/>
356
+ <img src="winrates.png" alt="average win rates" width="650" style="margin-left:auto; margin-right:auto; display:block;"/>
357
+
358
+ Please refer to the [Aya 23 technical report](https://cohere.com/research/papers/aya-command-23-8b-and-35b-technical-report-2024-05-23) for further details about the base model, data, instruction tuning, and evaluation.
359
+
360
+ ### Model Card Contact
361
+
362
+ For errors or additional questions about details in this model card, contact info@for.ai.
363
+
364
+ ### Terms of Use
365
+
366
+ We hope that the release of this model will make community-based research efforts more accessible, by releasing the weights of a highly performant multilingual model to researchers all over the world. This model is governed by a [CC-BY-NC](https://cohere.com/c4ai-cc-by-nc-license) License with an acceptable use addendum, and also requires adhering to [C4AI's Acceptable Use Policy](https://docs.cohere.com/docs/c4ai-acceptable-use-policy).
367
+
368
+ ### Try the model today
369
+
370
+ You can try Aya 23 in the Cohere [playground](https://dashboard.cohere.com/playground/chat). You can also use it in our dedicated Hugging Face Space [here](https://huggingface.co/spaces/CohereForAI/aya-23).
371
+
372
+ ### Citation info
373
+ ```bibtex
374
+ @misc{aryabumi2024aya,
375
+ title={Aya 23: Open Weight Releases to Further Multilingual Progress},
376
+ author={Viraat Aryabumi and John Dang and Dwarak Talupuru and Saurabh Dash and David Cairuz and Hangyu Lin and Bharat Venkitesh and Madeline Smith and Kelly Marchisio and Sebastian Ruder and Acyr Locatelli and Julia Kreutzer and Nick Frosst and Phil Blunsom and Marzieh Fadaee and Ahmet Üstün and Sara Hooker},
377
+ year={2024},
378
+ eprint={2405.15032},
379
+ archivePrefix={arXiv},
380
+ primaryClass={cs.CL}
381
+ }
382
+
383
+ ```
benchmarks.png ADDED

Git LFS Details

  • SHA256: e2adc7895ac0db2e66a6af65ff86f06e836d399667ecb293218d49342a77abd6
  • Pointer size: 133 Bytes
  • Size of remote file: 19 MB
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CohereForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 5,
8
+ "eos_token_id": 255001,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 14336,
13
+ "layer_norm_eps": 1e-05,
14
+ "logit_scale": 0.0625,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "cohere",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 8,
20
+ "pad_token_id": 0,
21
+ "rope_theta": 10000,
22
+ "torch_dtype": "float16",
23
+ "transformers_version": "4.40.0.dev0",
24
+ "use_cache": true,
25
+ "use_qk_norm": false,
26
+ "vocab_size": 256000,
27
+ "quantization_config": {
28
+ "quant_method": "exl2",
29
+ "version": "0.2.2",
30
+ "bits": 6.0,
31
+ "head_bits": 8,
32
+ "calibration": {
33
+ "rows": 115,
34
+ "length": 2048,
35
+ "dataset": "(default)"
36
+ }
37
+ }
38
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 5,
4
+ "eos_token_id": 255001,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.40.0.dev0"
7
+ }
measurement.json ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16056066048
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
24
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
25
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
26
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
27
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
28
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
80
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
81
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
88
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
89
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
90
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
91
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
92
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
93
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
94
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
95
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
96
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
97
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
98
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
99
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
100
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
101
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
102
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
103
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
104
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
105
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
106
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
107
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
108
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
109
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
110
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
111
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
112
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
113
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
114
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
115
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
116
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
117
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
119
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
120
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
121
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
122
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
123
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
124
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
126
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
184
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
185
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
186
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
187
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
192
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
193
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
194
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
195
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
196
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
197
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
198
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
199
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
200
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
201
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
202
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
203
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
204
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
205
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
206
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
207
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
208
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
209
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
210
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
211
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
212
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
213
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
214
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
215
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
216
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
224
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
225
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
226
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
227
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
228
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
229
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
230
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
231
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
232
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
233
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
234
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
235
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
236
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
237
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
238
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
239
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
240
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
241
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
242
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
244
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
245
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
246
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
247
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
248
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
249
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
250
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
251
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
252
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
253
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
254
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
255
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
256
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
257
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
258
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
259
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
260
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
261
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
262
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
263
+ "model.norm.weight": "model-00004-of-00004.safetensors"
264
+ }
265
+ }
output-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa479eef0fca56060359df82fba8f0096eff9f77e019b6362f032f7dfdc538a2
3
+ size 8121996536
output-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed2cc21ee3414943e4d22e11d9f1dc71a9a46eeeffa16dc5aa6eea79407856c
3
+ size 1048576096
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9619890aebac311d644236f49462d7f8618ebef7c7020c52645ccb597434a3c9
3
+ size 16543645
tokenizer_config.json ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<PAD>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<UNK>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "<CLS>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "3": {
31
+ "content": "<SEP>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "4": {
39
+ "content": "<MASK_TOKEN>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "5": {
47
+ "content": "<BOS_TOKEN>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "6": {
55
+ "content": "<EOS_TOKEN>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "7": {
63
+ "content": "<EOP_TOKEN>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "255000": {
71
+ "content": "<|START_OF_TURN_TOKEN|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "255001": {
79
+ "content": "<|END_OF_TURN_TOKEN|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "255002": {
87
+ "content": "<|YES_TOKEN|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "255003": {
95
+ "content": "<|NO_TOKEN|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "255004": {
103
+ "content": "<|GOOD_TOKEN|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "255005": {
111
+ "content": "<|BAD_TOKEN|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "255006": {
119
+ "content": "<|USER_TOKEN|>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "255007": {
127
+ "content": "<|CHATBOT_TOKEN|>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "255008": {
135
+ "content": "<|SYSTEM_TOKEN|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "255009": {
143
+ "content": "<|USER_0_TOKEN|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "255010": {
151
+ "content": "<|USER_1_TOKEN|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "255011": {
159
+ "content": "<|USER_2_TOKEN|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "255012": {
167
+ "content": "<|USER_3_TOKEN|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "255013": {
175
+ "content": "<|USER_4_TOKEN|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "255014": {
183
+ "content": "<|USER_5_TOKEN|>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "255015": {
191
+ "content": "<|USER_6_TOKEN|>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "255016": {
199
+ "content": "<|USER_7_TOKEN|>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "255017": {
207
+ "content": "<|USER_8_TOKEN|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "255018": {
215
+ "content": "<|USER_9_TOKEN|>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ },
222
+ "255019": {
223
+ "content": "<|EXTRA_0_TOKEN|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": false
229
+ },
230
+ "255020": {
231
+ "content": "<|EXTRA_1_TOKEN|>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": false
237
+ },
238
+ "255021": {
239
+ "content": "<|EXTRA_2_TOKEN|>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": false
245
+ },
246
+ "255022": {
247
+ "content": "<|EXTRA_3_TOKEN|>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": false
253
+ },
254
+ "255023": {
255
+ "content": "<|EXTRA_4_TOKEN|>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": false
261
+ },
262
+ "255024": {
263
+ "content": "<|EXTRA_5_TOKEN|>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": false
269
+ },
270
+ "255025": {
271
+ "content": "<|EXTRA_6_TOKEN|>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": false
277
+ },
278
+ "255026": {
279
+ "content": "<|EXTRA_7_TOKEN|>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": false
285
+ },
286
+ "255027": {
287
+ "content": "<|EXTRA_8_TOKEN|>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": false
293
+ },
294
+ "255028": {
295
+ "content": "<|EXTRA_9_TOKEN|>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": false
301
+ }
302
+ },
303
+ "bos_token": "<BOS_TOKEN>",
304
+ "chat_template": [
305
+ {
306
+ "name": "default",
307
+ "template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}"
308
+ },
309
+ {
310
+ "name": "tool_use",
311
+ "template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = '## Task and Context\\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user\\'s needs as best you can, which will be wide-ranging.\\n\\n## Style Guide\\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.' %}{% endif %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}{{ '# Safety Preamble' }}{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}{{ '\n\n# System Preamble' }}{{ '\n## Basic Rules' }}{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' 
}}{{ '\n\n# User Preamble' }}{{ '\n' + system_message }}{{'\n\n## Available Tools\nHere is a list of tools that you have available to you:\n\n'}}{% for tool in tools %}{% if loop.index0 != 0 %}{{ '\n\n'}}{% endif %}{{'```python\ndef ' + tool.name + '('}}{% for param_name, param_fields in tool.parameter_definitions.items() %}{% if loop.index0 != 0 %}{{ ', '}}{% endif %}{{param_name}}: {% if not param_fields.required %}{{'Optional[' + param_fields.type + '] = None'}}{% else %}{{ param_fields.type }}{% endif %}{% endfor %}{{ ') -> List[Dict]:\n \"\"\"'}}{{ tool.description }}{% if tool.parameter_definitions|length != 0 %}{{ '\n\n Args:\n '}}{% for param_name, param_fields in tool.parameter_definitions.items() %}{% if loop.index0 != 0 %}{{ '\n ' }}{% endif %}{{ param_name + ' ('}}{% if not param_fields.required %}{{'Optional[' + param_fields.type + ']'}}{% else %}{{ param_fields.type }}{% endif %}{{ '): ' + param_fields.description }}{% endfor %}{% endif %}{{ '\n \"\"\"\n pass\n```' }}{% endfor %}{{ '<|END_OF_TURN_TOKEN|>'}}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'system' %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{{'<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write \\'Action:\\' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user\\'s last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. 
The list of actions you want to call should be formatted as a list of json objects, for example:\n```json\n[\n {\n \"tool_name\": title of the tool in the specification,\n \"parameters\": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters\n }\n]```<|END_OF_TURN_TOKEN|>'}}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}"
312
+ },
313
+ {
314
+ "name": "rag",
315
+ "template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = '## Task and Context\\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user\\'s needs as best you can, which will be wide-ranging.\\n\\n## Style Guide\\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.' %}{% endif %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}{{ '# Safety Preamble' }}{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}{{ '\n\n# System Preamble' }}{{ '\n## Basic Rules' }}{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' 
}}{{ '\n\n# User Preamble' }}{{ '\n' + system_message }}{{ '<|END_OF_TURN_TOKEN|>'}}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'system' %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>'}}{{ '<results>' }}{% for document in documents %}{{ '\nDocument: ' }}{{ loop.index0 }}\n{% for key, value in document.items() %}{{ key }}: {{value}}\n{% endfor %}{% endfor %}{{ '</results>'}}{{ '<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}{{ 'Carefully perform the following instructions, in order, starting each with a new line.\n' }}{{ 'Firstly, Decide which of the retrieved documents are relevant to the user\\'s last input by writing \\'Relevant Documents:\\' followed by comma-separated list of document numbers. If none are relevant, you should instead write \\'None\\'.\n' }}{{ 'Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user\\'s last input by writing \\'Cited Documents:\\' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write \\'None\\'.\n' }}{% if citation_mode=='accurate' %}{{ 'Thirdly, Write \\'Answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.\n' }}{% endif %}{{ 'Finally, Write \\'Grounded answer:\\' followed by a response to the user\\'s last input in high quality natural english. 
Use the symbols <co: doc> and </co: doc> to indicate when a fact comes from a document in the search result, e.g <co: 0>my fact</co: 0> for a fact from document 0.' }}{{ '<|END_OF_TURN_TOKEN|>' }}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}"
316
+ }
317
+ ],
318
+ "clean_up_tokenization_spaces": false,
319
+ "eos_token": "<|END_OF_TURN_TOKEN|>",
320
+ "legacy": true,
321
+ "merges_file": null,
322
+ "model_max_length": 1000000000000000019884624838656,
323
+ "pad_token": "<PAD>",
324
+ "sp_model_kwargs": {},
325
+ "spaces_between_special_tokens": false,
326
+ "tokenizer_class": "CohereTokenizer",
327
+ "unk_token": null,
328
+ "use_default_system_prompt": false,
329
+ "vocab_file": null
330
+ }
winrates.png ADDED