hiiamsid committed on
Commit
5280241
·
verified ·
1 Parent(s): 4ef324f

Model save

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  license: apache-2.0
3
- base_model: mistralai/Mistral-7B-Instruct-v0.2
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,9 +13,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # mistral_channel_targeter
15
 
16
- This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0435
19
 
20
  ## Model description
21
 
@@ -35,27 +35,28 @@ More information needed
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 5e-07
38
- - train_batch_size: 4
39
- - eval_batch_size: 4
40
  - seed: 42
41
  - distributed_type: multi-GPU
42
  - num_devices: 3
43
- - gradient_accumulation_steps: 2
44
  - total_train_batch_size: 24
45
- - total_eval_batch_size: 12
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: cosine
48
  - lr_scheduler_warmup_ratio: 0.1
49
- - num_epochs: 4
50
 
51
  ### Training results
52
 
53
  | Training Loss | Epoch | Step | Validation Loss |
54
  |:-------------:|:-----:|:----:|:---------------:|
55
- | 0.0493 | 1.0 | 158 | 0.0488 |
56
- | 0.0401 | 2.0 | 317 | 0.0401 |
57
- | 0.0256 | 3.0 | 475 | 0.0430 |
58
- | 0.0249 | 3.99 | 632 | 0.0435 |
 
 
59
 
60
 
61
  ### Framework versions
 
1
  ---
2
  license: apache-2.0
3
+ base_model: BioMistral/BioMistral-7B
4
  tags:
5
  - generated_from_trainer
6
  model-index:
 
13
 
14
  # mistral_channel_targeter
15
 
16
+ This model is a fine-tuned version of [BioMistral/BioMistral-7B](https://huggingface.co/BioMistral/BioMistral-7B) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.0708
19
 
20
  ## Model description
21
 
 
35
 
36
  The following hyperparameters were used during training:
37
  - learning_rate: 5e-07
38
+ - train_batch_size: 8
39
+ - eval_batch_size: 8
40
  - seed: 42
41
  - distributed_type: multi-GPU
42
  - num_devices: 3
 
43
  - total_train_batch_size: 24
44
+ - total_eval_batch_size: 24
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: cosine
47
  - lr_scheduler_warmup_ratio: 0.1
48
+ - num_epochs: 6
49
 
50
  ### Training results
51
 
52
  | Training Loss | Epoch | Step | Validation Loss |
53
  |:-------------:|:-----:|:----:|:---------------:|
54
+ | 0.0526 | 1.0 | 159 | 0.0547 |
55
+ | 0.0421 | 2.0 | 318 | 0.0447 |
56
+ | 0.0285 | 3.0 | 477 | 0.0385 |
57
+ | 0.0165 | 4.0 | 636 | 0.0465 |
58
+ | 0.0021 | 5.0 | 795 | 0.0659 |
59
+ | 0.0008 | 6.0 | 954 | 0.0708 |
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 3.99,
3
- "eval_loss": 0.04349230229854584,
4
- "eval_runtime": 21.7581,
5
- "eval_samples": 205,
6
- "eval_samples_per_second": 9.422,
7
- "eval_steps_per_second": 0.827,
8
- "train_loss": 0.2913021882878074,
9
- "train_runtime": 5985.1883,
10
- "train_samples": 3793,
11
- "train_samples_per_second": 2.535,
12
- "train_steps_per_second": 0.106
13
  }
 
1
  {
2
+ "epoch": 6.0,
3
+ "eval_loss": 0.07076797634363174,
4
+ "eval_runtime": 22.043,
5
+ "eval_samples": 206,
6
+ "eval_samples_per_second": 9.345,
7
+ "eval_steps_per_second": 0.408,
8
+ "train_loss": 0.18272512886366676,
9
+ "train_runtime": 9319.8502,
10
+ "train_samples": 3795,
11
+ "train_samples_per_second": 2.443,
12
+ "train_steps_per_second": 0.102
13
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
@@ -16,8 +16,8 @@
16
  "num_hidden_layers": 32,
17
  "num_key_value_heads": 8,
18
  "rms_norm_eps": 1e-05,
19
- "rope_theta": 1000000.0,
20
- "sliding_window": null,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
  "transformers_version": "4.36.0",
 
1
  {
2
+ "_name_or_path": "BioMistral/BioMistral-7B",
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
 
16
  "num_hidden_layers": 32,
17
  "num_key_value_heads": 8,
18
  "rms_norm_eps": 1e-05,
19
+ "rope_theta": 10000.0,
20
+ "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
  "transformers_version": "4.36.0",
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.99,
3
- "eval_loss": 0.04349230229854584,
4
- "eval_runtime": 21.7581,
5
- "eval_samples": 205,
6
- "eval_samples_per_second": 9.422,
7
- "eval_steps_per_second": 0.827
8
  }
 
1
  {
2
+ "epoch": 6.0,
3
+ "eval_loss": 0.07076797634363174,
4
+ "eval_runtime": 22.043,
5
+ "eval_samples": 206,
6
+ "eval_samples_per_second": 9.345,
7
+ "eval_steps_per_second": 0.408
8
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f13aa6ca4c805d82509ea4fd28952c56690304047ea285baf9b0da946dd196e6
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92afcd4e1fb769e701a48bf65d7750b51820f9f5a06061b2c6899dcc4a6f685f
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03f7c08a8cebca32ab7301b2ea9f060e8549bf94876bc3871c3614b3912197d9
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41cf70dcb5493928483654d17938b754ba42fc66b87167bce580acd30dddb91a
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:922173b16483b3ece4d5bb441e8c6042ecda1cfffdc9eb54db8916c372276d26
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70519e3fb132eb05253592faf31ad4116681d32d6b102810c5cb276ac15ce06c
3
  size 4540516344
tokenizer_config.json CHANGED
@@ -29,7 +29,7 @@
29
  },
30
  "additional_special_tokens": [],
31
  "bos_token": "<s>",
32
- "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '\n###Response :\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '###Response :\n' }}\n{% endif %}\n{% endfor %}",
33
  "clean_up_tokenization_spaces": false,
34
  "eos_token": "</s>",
35
  "legacy": true,
 
29
  },
30
  "additional_special_tokens": [],
31
  "bos_token": "<s>",
32
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### Input:\n\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### Instruction:\n\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '\n###Response :\n\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '###Response :\n\n' }}\n{% endif %}\n{% endfor %}",
33
  "clean_up_tokenization_spaces": false,
34
  "eos_token": "</s>",
35
  "legacy": true,
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.99,
3
- "train_loss": 0.2913021882878074,
4
- "train_runtime": 5985.1883,
5
- "train_samples": 3793,
6
- "train_samples_per_second": 2.535,
7
- "train_steps_per_second": 0.106
8
  }
 
1
  {
2
+ "epoch": 6.0,
3
+ "train_loss": 0.18272512886366676,
4
+ "train_runtime": 9319.8502,
5
+ "train_samples": 3795,
6
+ "train_samples_per_second": 2.443,
7
+ "train_steps_per_second": 0.102
8
  }
trainer_state.json CHANGED
@@ -1,440 +1,648 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.9873817034700316,
5
  "eval_steps": 500,
6
- "global_step": 632,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
- "learning_rate": 7.812499999999999e-08,
14
- "loss": 6.7461,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.13,
19
- "learning_rate": 1.5624999999999999e-07,
20
- "loss": 5.4679,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.19,
25
- "learning_rate": 2.3437499999999998e-07,
26
- "loss": 2.9019,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.25,
31
- "learning_rate": 3.1249999999999997e-07,
32
- "loss": 0.8112,
33
  "step": 40
34
  },
35
  {
36
- "epoch": 0.32,
37
- "learning_rate": 3.9062499999999997e-07,
38
- "loss": 0.2134,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.38,
43
- "learning_rate": 4.6874999999999996e-07,
44
- "loss": 0.1487,
45
  "step": 60
46
  },
47
  {
48
  "epoch": 0.44,
49
- "learning_rate": 4.998623501539504e-07,
50
- "loss": 0.1156,
51
  "step": 70
52
  },
53
  {
54
  "epoch": 0.5,
55
- "learning_rate": 4.990217055187362e-07,
56
- "loss": 0.0821,
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.57,
61
- "learning_rate": 4.974194562818019e-07,
62
- "loss": 0.0675,
63
  "step": 90
64
  },
65
  {
66
  "epoch": 0.63,
67
- "learning_rate": 4.950605027404507e-07,
68
- "loss": 0.066,
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.69,
73
- "learning_rate": 4.91952059486152e-07,
74
- "loss": 0.0627,
75
  "step": 110
76
  },
77
  {
78
- "epoch": 0.76,
79
- "learning_rate": 4.881036333395328e-07,
80
- "loss": 0.0583,
81
  "step": 120
82
  },
83
  {
84
  "epoch": 0.82,
85
- "learning_rate": 4.835269942748475e-07,
86
- "loss": 0.0534,
87
  "step": 130
88
  },
89
  {
90
  "epoch": 0.88,
91
- "learning_rate": 4.782361394228472e-07,
92
- "loss": 0.0524,
93
  "step": 140
94
  },
95
  {
96
- "epoch": 0.95,
97
- "learning_rate": 4.7224725026214615e-07,
98
- "loss": 0.0493,
99
  "step": 150
100
  },
101
  {
102
  "epoch": 1.0,
103
- "eval_loss": 0.04877917468547821,
104
- "eval_runtime": 21.7578,
105
- "eval_samples_per_second": 9.422,
106
- "eval_steps_per_second": 0.827,
107
- "step": 158
108
  },
109
  {
110
  "epoch": 1.01,
111
- "learning_rate": 4.655786431300069e-07,
112
- "loss": 0.0434,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 1.07,
117
- "learning_rate": 4.582507132039034e-07,
118
- "loss": 0.0411,
119
  "step": 170
120
  },
121
  {
122
- "epoch": 1.14,
123
- "learning_rate": 4.5028587212518697e-07,
124
- "loss": 0.0423,
125
  "step": 180
126
  },
127
  {
128
- "epoch": 1.2,
129
- "learning_rate": 4.4170847945562717e-07,
130
- "loss": 0.0416,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 1.26,
135
- "learning_rate": 4.325447681764586e-07,
136
- "loss": 0.0417,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 1.32,
141
- "learning_rate": 4.228227644577875e-07,
142
- "loss": 0.039,
143
  "step": 210
144
  },
145
  {
146
- "epoch": 1.39,
147
- "learning_rate": 4.1257220194373424e-07,
148
- "loss": 0.0406,
149
  "step": 220
150
  },
151
  {
152
  "epoch": 1.45,
153
- "learning_rate": 4.0182443081545917e-07,
154
- "loss": 0.0448,
155
  "step": 230
156
  },
157
  {
158
  "epoch": 1.51,
159
- "learning_rate": 3.9061232191019517e-07,
160
- "loss": 0.0408,
161
  "step": 240
162
  },
163
  {
164
- "epoch": 1.58,
165
- "learning_rate": 3.78970166189525e-07,
166
- "loss": 0.0438,
167
  "step": 250
168
  },
169
  {
170
  "epoch": 1.64,
171
- "learning_rate": 3.669335698643704e-07,
172
- "loss": 0.0357,
173
  "step": 260
174
  },
175
  {
176
  "epoch": 1.7,
177
- "learning_rate": 3.5453934549744043e-07,
178
- "loss": 0.0373,
179
  "step": 270
180
  },
181
  {
182
- "epoch": 1.77,
183
- "learning_rate": 3.418253994161892e-07,
184
- "loss": 0.0446,
185
  "step": 280
186
  },
187
  {
188
- "epoch": 1.83,
189
- "learning_rate": 3.288306157806193e-07,
190
- "loss": 0.039,
191
  "step": 290
192
  },
193
  {
194
  "epoch": 1.89,
195
- "learning_rate": 3.1559473766049476e-07,
196
- "loss": 0.0382,
197
  "step": 300
198
  },
199
  {
200
- "epoch": 1.96,
201
- "learning_rate": 3.021582454856766e-07,
202
- "loss": 0.0401,
203
  "step": 310
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_loss": 0.040096793323755264,
208
- "eval_runtime": 21.7544,
209
- "eval_samples_per_second": 9.423,
210
- "eval_steps_per_second": 0.827,
211
- "step": 317
212
  },
213
  {
214
- "epoch": 2.02,
215
- "learning_rate": 2.8856223324132555e-07,
216
- "loss": 0.0351,
217
  "step": 320
218
  },
219
  {
220
  "epoch": 2.08,
221
- "learning_rate": 2.748482827866165e-07,
222
- "loss": 0.0239,
223
  "step": 330
224
  },
225
  {
226
- "epoch": 2.15,
227
- "learning_rate": 2.610583366813447e-07,
228
- "loss": 0.0318,
229
  "step": 340
230
  },
231
  {
232
- "epoch": 2.21,
233
- "learning_rate": 2.472345699093711e-07,
234
  "loss": 0.0304,
235
  "step": 350
236
  },
237
  {
238
- "epoch": 2.27,
239
- "learning_rate": 2.3341926089122408e-07,
240
- "loss": 0.0318,
241
  "step": 360
242
  },
243
  {
244
  "epoch": 2.33,
245
- "learning_rate": 2.1965466218035267e-07,
246
- "loss": 0.03,
247
  "step": 370
248
  },
249
  {
250
- "epoch": 2.4,
251
- "learning_rate": 2.0598287123849092e-07,
252
- "loss": 0.034,
253
  "step": 380
254
  },
255
  {
256
- "epoch": 2.46,
257
- "learning_rate": 1.9244570168535297e-07,
258
  "loss": 0.0295,
259
  "step": 390
260
  },
261
  {
262
  "epoch": 2.52,
263
- "learning_rate": 1.7908455541642582e-07,
264
- "loss": 0.0312,
265
  "step": 400
266
  },
267
  {
268
- "epoch": 2.59,
269
- "learning_rate": 1.659402959799753e-07,
270
- "loss": 0.0269,
271
  "step": 410
272
  },
273
  {
274
- "epoch": 2.65,
275
- "learning_rate": 1.5305312360052443e-07,
276
- "loss": 0.0289,
277
  "step": 420
278
  },
279
  {
280
- "epoch": 2.71,
281
- "learning_rate": 1.4046245223103348e-07,
282
- "loss": 0.0362,
283
  "step": 430
284
  },
285
  {
286
- "epoch": 2.78,
287
- "learning_rate": 1.2820678900980092e-07,
288
  "loss": 0.0314,
289
  "step": 440
290
  },
291
  {
292
- "epoch": 2.84,
293
- "learning_rate": 1.1632361649075498e-07,
294
- "loss": 0.0302,
295
  "step": 450
296
  },
297
  {
298
- "epoch": 2.9,
299
- "learning_rate": 1.0484927800731982e-07,
300
- "loss": 0.0305,
301
  "step": 460
302
  },
303
  {
304
- "epoch": 2.97,
305
- "learning_rate": 9.381886652045845e-08,
306
- "loss": 0.0256,
307
  "step": 470
308
  },
309
  {
310
  "epoch": 3.0,
311
- "eval_loss": 0.042997319251298904,
312
- "eval_runtime": 21.7318,
313
- "eval_samples_per_second": 9.433,
314
- "eval_steps_per_second": 0.828,
315
- "step": 475
316
  },
317
  {
318
- "epoch": 3.03,
319
- "learning_rate": 8.32661172908373e-08,
320
- "loss": 0.026,
321
  "step": 480
322
  },
323
  {
324
- "epoch": 3.09,
325
- "learning_rate": 7.322330470336313e-08,
326
- "loss": 0.022,
327
  "step": 490
328
  },
329
  {
330
- "epoch": 3.15,
331
- "learning_rate": 6.372114355964292e-08,
332
- "loss": 0.0253,
333
  "step": 500
334
  },
335
  {
336
- "epoch": 3.22,
337
- "learning_rate": 5.4788695140251637e-08,
338
- "loss": 0.0216,
339
  "step": 510
340
  },
341
  {
342
- "epoch": 3.28,
343
- "learning_rate": 4.645327832410648e-08,
344
- "loss": 0.0194,
345
  "step": 520
346
  },
347
  {
348
- "epoch": 3.34,
349
- "learning_rate": 3.874038603677882e-08,
350
- "loss": 0.0247,
351
  "step": 530
352
  },
353
  {
354
- "epoch": 3.41,
355
- "learning_rate": 3.167360728327681e-08,
356
- "loss": 0.021,
357
  "step": 540
358
  },
359
  {
360
- "epoch": 3.47,
361
- "learning_rate": 2.5274555003752697e-08,
362
- "loss": 0.0245,
363
  "step": 550
364
  },
365
  {
366
- "epoch": 3.53,
367
- "learning_rate": 1.956279997278043e-08,
368
- "loss": 0.0226,
369
  "step": 560
370
  },
371
  {
372
- "epoch": 3.6,
373
- "learning_rate": 1.4555810944364478e-08,
374
- "loss": 0.023,
375
  "step": 570
376
  },
377
  {
378
- "epoch": 3.66,
379
- "learning_rate": 1.0268901225739979e-08,
380
- "loss": 0.0209,
381
  "step": 580
382
  },
383
  {
384
- "epoch": 3.72,
385
- "learning_rate": 6.715181843361617e-09,
386
- "loss": 0.0258,
387
  "step": 590
388
  },
389
  {
390
- "epoch": 3.79,
391
- "learning_rate": 3.905521444318604e-09,
392
- "loss": 0.0204,
393
  "step": 600
394
  },
395
  {
396
- "epoch": 3.85,
397
- "learning_rate": 1.8485130558120454e-09,
398
- "loss": 0.0215,
399
  "step": 610
400
  },
401
  {
402
- "epoch": 3.91,
403
- "learning_rate": 5.504478043572291e-10,
404
- "loss": 0.0204,
405
  "step": 620
406
  },
407
  {
408
- "epoch": 3.97,
409
- "learning_rate": 1.5295675087678705e-11,
410
- "loss": 0.0249,
411
  "step": 630
412
  },
413
  {
414
- "epoch": 3.99,
415
- "eval_loss": 0.04349230229854584,
416
- "eval_runtime": 21.7702,
417
- "eval_samples_per_second": 9.417,
418
- "eval_steps_per_second": 0.827,
419
- "step": 632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  },
421
  {
422
- "epoch": 3.99,
423
- "step": 632,
424
- "total_flos": 72589258260480.0,
425
- "train_loss": 0.2913021882878074,
426
- "train_runtime": 5985.1883,
427
- "train_samples_per_second": 2.535,
428
- "train_steps_per_second": 0.106
429
  }
430
  ],
431
  "logging_steps": 10,
432
- "max_steps": 632,
433
  "num_input_tokens_seen": 0,
434
- "num_train_epochs": 4,
435
  "save_steps": 500,
436
- "total_flos": 72589258260480.0,
437
- "train_batch_size": 4,
438
  "trial_name": null,
439
  "trial_params": null
440
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 6.0,
5
  "eval_steps": 500,
6
+ "global_step": 954,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
+ "learning_rate": 5.208333333333333e-08,
14
+ "loss": 5.1406,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.13,
19
+ "learning_rate": 1.0416666666666667e-07,
20
+ "loss": 4.8324,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.19,
25
+ "learning_rate": 1.5624999999999999e-07,
26
+ "loss": 3.282,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.25,
31
+ "learning_rate": 2.0833333333333333e-07,
32
+ "loss": 1.166,
33
  "step": 40
34
  },
35
  {
36
+ "epoch": 0.31,
37
+ "learning_rate": 2.604166666666667e-07,
38
+ "loss": 0.3744,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.38,
43
+ "learning_rate": 3.1249999999999997e-07,
44
+ "loss": 0.2785,
45
  "step": 60
46
  },
47
  {
48
  "epoch": 0.44,
49
+ "learning_rate": 3.645833333333333e-07,
50
+ "loss": 0.2278,
51
  "step": 70
52
  },
53
  {
54
  "epoch": 0.5,
55
+ "learning_rate": 4.1666666666666667e-07,
56
+ "loss": 0.1459,
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.57,
61
+ "learning_rate": 4.6874999999999996e-07,
62
+ "loss": 0.0876,
63
  "step": 90
64
  },
65
  {
66
  "epoch": 0.63,
67
+ "learning_rate": 4.999731868769026e-07,
68
+ "loss": 0.071,
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.69,
73
+ "learning_rate": 4.996716052911017e-07,
74
+ "loss": 0.0626,
75
  "step": 110
76
  },
77
  {
78
+ "epoch": 0.75,
79
+ "learning_rate": 4.990353313429303e-07,
80
+ "loss": 0.0586,
81
  "step": 120
82
  },
83
  {
84
  "epoch": 0.82,
85
+ "learning_rate": 4.980652179769217e-07,
86
+ "loss": 0.055,
87
  "step": 130
88
  },
89
  {
90
  "epoch": 0.88,
91
+ "learning_rate": 4.967625656594781e-07,
92
+ "loss": 0.0547,
93
  "step": 140
94
  },
95
  {
96
+ "epoch": 0.94,
97
+ "learning_rate": 4.951291206355559e-07,
98
+ "loss": 0.0526,
99
  "step": 150
100
  },
101
  {
102
  "epoch": 1.0,
103
+ "eval_loss": 0.054711490869522095,
104
+ "eval_runtime": 21.9924,
105
+ "eval_samples_per_second": 9.367,
106
+ "eval_steps_per_second": 0.409,
107
+ "step": 159
108
  },
109
  {
110
  "epoch": 1.01,
111
+ "learning_rate": 4.93167072587771e-07,
112
+ "loss": 0.0468,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 1.07,
117
+ "learning_rate": 4.908790517010636e-07,
118
+ "loss": 0.0447,
119
  "step": 170
120
  },
121
  {
122
+ "epoch": 1.13,
123
+ "learning_rate": 4.882681251368548e-07,
124
+ "loss": 0.0447,
125
  "step": 180
126
  },
127
  {
128
+ "epoch": 1.19,
129
+ "learning_rate": 4.853377929214243e-07,
130
+ "loss": 0.0498,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 1.26,
135
+ "learning_rate": 4.820919832540181e-07,
136
+ "loss": 0.0468,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 1.32,
141
+ "learning_rate": 4.785350472409791e-07,
142
+ "loss": 0.0458,
143
  "step": 210
144
  },
145
  {
146
+ "epoch": 1.38,
147
+ "learning_rate": 4.7467175306295647e-07,
148
+ "loss": 0.046,
149
  "step": 220
150
  },
151
  {
152
  "epoch": 1.45,
153
+ "learning_rate": 4.70507279583015e-07,
154
+ "loss": 0.043,
155
  "step": 230
156
  },
157
  {
158
  "epoch": 1.51,
159
+ "learning_rate": 4.6604720940421207e-07,
160
+ "loss": 0.0431,
161
  "step": 240
162
  },
163
  {
164
+ "epoch": 1.57,
165
+ "learning_rate": 4.612975213859487e-07,
166
+ "loss": 0.0428,
167
  "step": 250
168
  },
169
  {
170
  "epoch": 1.64,
171
+ "learning_rate": 4.5626458262912735e-07,
172
+ "loss": 0.0468,
173
  "step": 260
174
  },
175
  {
176
  "epoch": 1.7,
177
+ "learning_rate": 4.5095513994085974e-07,
178
+ "loss": 0.0418,
179
  "step": 270
180
  },
181
  {
182
+ "epoch": 1.76,
183
+ "learning_rate": 4.453763107901675e-07,
184
+ "loss": 0.0437,
185
  "step": 280
186
  },
187
  {
188
+ "epoch": 1.82,
189
+ "learning_rate": 4.395355737667985e-07,
190
+ "loss": 0.0429,
191
  "step": 290
192
  },
193
  {
194
  "epoch": 1.89,
195
+ "learning_rate": 4.3344075855595097e-07,
196
+ "loss": 0.0408,
197
  "step": 300
198
  },
199
  {
200
+ "epoch": 1.95,
201
+ "learning_rate": 4.271000354423425e-07,
202
+ "loss": 0.0421,
203
  "step": 310
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_loss": 0.04473445564508438,
208
+ "eval_runtime": 22.0105,
209
+ "eval_samples_per_second": 9.359,
210
+ "eval_steps_per_second": 0.409,
211
+ "step": 318
212
  },
213
  {
214
+ "epoch": 2.01,
215
+ "learning_rate": 4.2052190435769554e-07,
216
+ "loss": 0.0355,
217
  "step": 320
218
  },
219
  {
220
  "epoch": 2.08,
221
+ "learning_rate": 4.137151834863213e-07,
222
+ "loss": 0.0348,
223
  "step": 330
224
  },
225
  {
226
+ "epoch": 2.14,
227
+ "learning_rate": 4.0668899744407567e-07,
228
+ "loss": 0.0326,
229
  "step": 340
230
  },
231
  {
232
+ "epoch": 2.2,
233
+ "learning_rate": 3.994527650465352e-07,
234
  "loss": 0.0304,
235
  "step": 350
236
  },
237
  {
238
+ "epoch": 2.26,
239
+ "learning_rate": 3.920161866827889e-07,
240
+ "loss": 0.0258,
241
  "step": 360
242
  },
243
  {
244
  "epoch": 2.33,
245
+ "learning_rate": 3.8438923131177237e-07,
246
+ "loss": 0.0317,
247
  "step": 370
248
  },
249
  {
250
+ "epoch": 2.39,
251
+ "learning_rate": 3.765821230985757e-07,
252
+ "loss": 0.0326,
253
  "step": 380
254
  },
255
  {
256
+ "epoch": 2.45,
257
+ "learning_rate": 3.6860532770864005e-07,
258
  "loss": 0.0295,
259
  "step": 390
260
  },
261
  {
262
  "epoch": 2.52,
263
+ "learning_rate": 3.604695382782159e-07,
264
+ "loss": 0.0275,
265
  "step": 400
266
  },
267
  {
268
+ "epoch": 2.58,
269
+ "learning_rate": 3.5218566107988867e-07,
270
+ "loss": 0.0344,
271
  "step": 410
272
  },
273
  {
274
+ "epoch": 2.64,
275
+ "learning_rate": 3.4376480090239047e-07,
276
+ "loss": 0.0341,
277
  "step": 420
278
  },
279
  {
280
+ "epoch": 2.7,
281
+ "learning_rate": 3.3521824616429284e-07,
282
+ "loss": 0.0273,
283
  "step": 430
284
  },
285
  {
286
+ "epoch": 2.77,
287
+ "learning_rate": 3.265574537815398e-07,
288
  "loss": 0.0314,
289
  "step": 440
290
  },
291
  {
292
+ "epoch": 2.83,
293
+ "learning_rate": 3.1779403380910425e-07,
294
+ "loss": 0.0354,
295
  "step": 450
296
  },
297
  {
298
+ "epoch": 2.89,
299
+ "learning_rate": 3.0893973387735683e-07,
300
+ "loss": 0.0336,
301
  "step": 460
302
  },
303
  {
304
+ "epoch": 2.96,
305
+ "learning_rate": 3.000064234440111e-07,
306
+ "loss": 0.0285,
307
  "step": 470
308
  },
309
  {
310
  "epoch": 3.0,
311
+ "eval_loss": 0.03846818581223488,
312
+ "eval_runtime": 22.5077,
313
+ "eval_samples_per_second": 9.152,
314
+ "eval_steps_per_second": 0.4,
315
+ "step": 477
316
  },
317
  {
318
+ "epoch": 3.02,
319
+ "learning_rate": 2.910060778827554e-07,
320
+ "loss": 0.0259,
321
  "step": 480
322
  },
323
  {
324
+ "epoch": 3.08,
325
+ "learning_rate": 2.8195076242990116e-07,
326
+ "loss": 0.0098,
327
  "step": 490
328
  },
329
  {
330
+ "epoch": 3.14,
331
+ "learning_rate": 2.7285261601056697e-07,
332
+ "loss": 0.0153,
333
  "step": 500
334
  },
335
  {
336
+ "epoch": 3.21,
337
+ "learning_rate": 2.6372383496608186e-07,
338
+ "loss": 0.0162,
339
  "step": 510
340
  },
341
  {
342
+ "epoch": 3.27,
343
+ "learning_rate": 2.5457665670441937e-07,
344
+ "loss": 0.0155,
345
  "step": 520
346
  },
347
  {
348
+ "epoch": 3.33,
349
+ "learning_rate": 2.454233432955807e-07,
350
+ "loss": 0.0185,
351
  "step": 530
352
  },
353
  {
354
+ "epoch": 3.4,
355
+ "learning_rate": 2.3627616503391812e-07,
356
+ "loss": 0.0138,
357
  "step": 540
358
  },
359
  {
360
+ "epoch": 3.46,
361
+ "learning_rate": 2.2714738398943308e-07,
362
+ "loss": 0.0136,
363
  "step": 550
364
  },
365
  {
366
+ "epoch": 3.52,
367
+ "learning_rate": 2.1804923757009882e-07,
368
+ "loss": 0.016,
369
  "step": 560
370
  },
371
  {
372
+ "epoch": 3.58,
373
+ "learning_rate": 2.089939221172446e-07,
374
+ "loss": 0.0155,
375
  "step": 570
376
  },
377
  {
378
+ "epoch": 3.65,
379
+ "learning_rate": 1.9999357655598891e-07,
380
+ "loss": 0.009,
381
  "step": 580
382
  },
383
  {
384
+ "epoch": 3.71,
385
+ "learning_rate": 1.9106026612264315e-07,
386
+ "loss": 0.0186,
387
  "step": 590
388
  },
389
  {
390
+ "epoch": 3.77,
391
+ "learning_rate": 1.8220596619089573e-07,
392
+ "loss": 0.0109,
393
  "step": 600
394
  },
395
  {
396
+ "epoch": 3.84,
397
+ "learning_rate": 1.7344254621846017e-07,
398
+ "loss": 0.016,
399
  "step": 610
400
  },
401
  {
402
+ "epoch": 3.9,
403
+ "learning_rate": 1.647817538357072e-07,
404
+ "loss": 0.0111,
405
  "step": 620
406
  },
407
  {
408
+ "epoch": 3.96,
409
+ "learning_rate": 1.562351990976095e-07,
410
+ "loss": 0.0165,
411
  "step": 630
412
  },
413
  {
414
+ "epoch": 4.0,
415
+ "eval_loss": 0.04646956920623779,
416
+ "eval_runtime": 22.0005,
417
+ "eval_samples_per_second": 9.363,
418
+ "eval_steps_per_second": 0.409,
419
+ "step": 636
420
+ },
421
+ {
422
+ "epoch": 4.03,
423
+ "learning_rate": 1.478143389201113e-07,
424
+ "loss": 0.0104,
425
+ "step": 640
426
+ },
427
+ {
428
+ "epoch": 4.09,
429
+ "learning_rate": 1.3953046172178413e-07,
430
+ "loss": 0.0041,
431
+ "step": 650
432
+ },
433
+ {
434
+ "epoch": 4.15,
435
+ "learning_rate": 1.3139467229135998e-07,
436
+ "loss": 0.0041,
437
+ "step": 660
438
+ },
439
+ {
440
+ "epoch": 4.21,
441
+ "learning_rate": 1.2341787690142435e-07,
442
+ "loss": 0.0011,
443
+ "step": 670
444
+ },
445
+ {
446
+ "epoch": 4.28,
447
+ "learning_rate": 1.1561076868822755e-07,
448
+ "loss": 0.0028,
449
+ "step": 680
450
+ },
451
+ {
452
+ "epoch": 4.34,
453
+ "learning_rate": 1.0798381331721107e-07,
454
+ "loss": 0.0024,
455
+ "step": 690
456
+ },
457
+ {
458
+ "epoch": 4.4,
459
+ "learning_rate": 1.0054723495346482e-07,
460
+ "loss": 0.0021,
461
+ "step": 700
462
+ },
463
+ {
464
+ "epoch": 4.47,
465
+ "learning_rate": 9.331100255592436e-08,
466
+ "loss": 0.0038,
467
+ "step": 710
468
+ },
469
+ {
470
+ "epoch": 4.53,
471
+ "learning_rate": 8.628481651367875e-08,
472
+ "loss": 0.0068,
473
+ "step": 720
474
+ },
475
+ {
476
+ "epoch": 4.59,
477
+ "learning_rate": 7.947809564230445e-08,
478
+ "loss": 0.0018,
479
+ "step": 730
480
+ },
481
+ {
482
+ "epoch": 4.65,
483
+ "learning_rate": 7.289996455765748e-08,
484
+ "loss": 0.0035,
485
+ "step": 740
486
+ },
487
+ {
488
+ "epoch": 4.72,
489
+ "learning_rate": 6.655924144404906e-08,
490
+ "loss": 0.0032,
491
+ "step": 750
492
+ },
493
+ {
494
+ "epoch": 4.78,
495
+ "learning_rate": 6.046442623320145e-08,
496
+ "loss": 0.0048,
497
+ "step": 760
498
+ },
499
+ {
500
+ "epoch": 4.84,
501
+ "learning_rate": 5.4623689209832484e-08,
502
+ "loss": 0.0034,
503
+ "step": 770
504
+ },
505
+ {
506
+ "epoch": 4.91,
507
+ "learning_rate": 4.904486005914027e-08,
508
+ "loss": 0.0031,
509
+ "step": 780
510
+ },
511
+ {
512
+ "epoch": 4.97,
513
+ "learning_rate": 4.373541737087263e-08,
514
+ "loss": 0.0021,
515
+ "step": 790
516
+ },
517
+ {
518
+ "epoch": 5.0,
519
+ "eval_loss": 0.06586528569459915,
520
+ "eval_runtime": 22.2256,
521
+ "eval_samples_per_second": 9.269,
522
+ "eval_steps_per_second": 0.405,
523
+ "step": 795
524
+ },
525
+ {
526
+ "epoch": 5.03,
527
+ "learning_rate": 3.8702478614051345e-08,
528
+ "loss": 0.0037,
529
+ "step": 800
530
+ },
531
+ {
532
+ "epoch": 5.09,
533
+ "learning_rate": 3.3952790595787986e-08,
534
+ "loss": 0.0015,
535
+ "step": 810
536
+ },
537
+ {
538
+ "epoch": 5.16,
539
+ "learning_rate": 2.9492720416985e-08,
540
+ "loss": 0.0007,
541
+ "step": 820
542
+ },
543
+ {
544
+ "epoch": 5.22,
545
+ "learning_rate": 2.5328246937043525e-08,
546
+ "loss": 0.002,
547
+ "step": 830
548
+ },
549
+ {
550
+ "epoch": 5.28,
551
+ "learning_rate": 2.1464952759020856e-08,
552
+ "loss": 0.0005,
553
+ "step": 840
554
+ },
555
+ {
556
+ "epoch": 5.35,
557
+ "learning_rate": 1.7908016745981856e-08,
558
+ "loss": 0.0012,
559
+ "step": 850
560
+ },
561
+ {
562
+ "epoch": 5.41,
563
+ "learning_rate": 1.4662207078575684e-08,
564
+ "loss": 0.0005,
565
+ "step": 860
566
+ },
567
+ {
568
+ "epoch": 5.47,
569
+ "learning_rate": 1.1731874863145142e-08,
570
+ "loss": 0.0024,
571
+ "step": 870
572
+ },
573
+ {
574
+ "epoch": 5.53,
575
+ "learning_rate": 9.12094829893642e-09,
576
+ "loss": 0.0011,
577
+ "step": 880
578
+ },
579
+ {
580
+ "epoch": 5.6,
581
+ "learning_rate": 6.832927412229017e-09,
582
+ "loss": 0.0006,
583
+ "step": 890
584
+ },
585
+ {
586
+ "epoch": 5.66,
587
+ "learning_rate": 4.8708793644441086e-09,
588
+ "loss": 0.0023,
589
+ "step": 900
590
+ },
591
+ {
592
+ "epoch": 5.72,
593
+ "learning_rate": 3.2374343405217884e-09,
594
+ "loss": 0.0023,
595
+ "step": 910
596
+ },
597
+ {
598
+ "epoch": 5.79,
599
+ "learning_rate": 1.9347820230782295e-09,
600
+ "loss": 0.0005,
601
+ "step": 920
602
+ },
603
+ {
604
+ "epoch": 5.85,
605
+ "learning_rate": 9.64668657069706e-10,
606
+ "loss": 0.0023,
607
+ "step": 930
608
+ },
609
+ {
610
+ "epoch": 5.91,
611
+ "learning_rate": 3.2839470889836627e-10,
612
+ "loss": 0.0011,
613
+ "step": 940
614
+ },
615
+ {
616
+ "epoch": 5.97,
617
+ "learning_rate": 2.6813123097352287e-11,
618
+ "loss": 0.0008,
619
+ "step": 950
620
+ },
621
+ {
622
+ "epoch": 6.0,
623
+ "eval_loss": 0.07076797634363174,
624
+ "eval_runtime": 21.958,
625
+ "eval_samples_per_second": 9.382,
626
+ "eval_steps_per_second": 0.41,
627
+ "step": 954
628
  },
629
  {
630
+ "epoch": 6.0,
631
+ "step": 954,
632
+ "total_flos": 126473593159680.0,
633
+ "train_loss": 0.18272512886366676,
634
+ "train_runtime": 9319.8502,
635
+ "train_samples_per_second": 2.443,
636
+ "train_steps_per_second": 0.102
637
  }
638
  ],
639
  "logging_steps": 10,
640
+ "max_steps": 954,
641
  "num_input_tokens_seen": 0,
642
+ "num_train_epochs": 6,
643
  "save_steps": 500,
644
+ "total_flos": 126473593159680.0,
645
+ "train_batch_size": 8,
646
  "trial_name": null,
647
  "trial_params": null
648
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5b2affc5d29584c63baf83c2c1002a4fb75eed308d418c32cba8a909ddcc19f
3
  size 5371
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55228b00492b67368a9b7ba885cdae50dcc19467b0776ba18e45dcad51e7aa44
3
  size 5371