pyb-camag commited on
Commit
9729da0
·
verified ·
1 Parent(s): c658ace

Training in progress, epoch 0

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 2.9918404351767904,
3
- "eval_accuracy": 0.45,
4
- "eval_loss": 1.8726001977920532,
5
- "eval_runtime": 14.5467,
6
- "eval_samples_per_second": 269.476,
7
- "eval_steps_per_second": 8.455,
8
  "total_flos": 2.6244369700391485e+18,
9
- "train_loss": 0.38805453755638813,
10
- "train_runtime": 888.6407,
11
- "train_samples_per_second": 119.103,
12
- "train_steps_per_second": 0.928
13
  }
 
1
  {
2
  "epoch": 2.9918404351767904,
3
+ "eval_accuracy": 0.5242346938775511,
4
+ "eval_loss": 1.3872867822647095,
5
+ "eval_runtime": 14.9938,
6
+ "eval_samples_per_second": 261.442,
7
+ "eval_steps_per_second": 8.203,
8
  "total_flos": 2.6244369700391485e+18,
9
+ "train_loss": 0.26469580238515683,
10
+ "train_runtime": 846.2486,
11
+ "train_samples_per_second": 125.07,
12
+ "train_steps_per_second": 0.975
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9918404351767904,
3
- "eval_accuracy": 0.45,
4
- "eval_loss": 1.8726001977920532,
5
- "eval_runtime": 14.5467,
6
- "eval_samples_per_second": 269.476,
7
- "eval_steps_per_second": 8.455
8
  }
 
1
  {
2
  "epoch": 2.9918404351767904,
3
+ "eval_accuracy": 0.5242346938775511,
4
+ "eval_loss": 1.3872867822647095,
5
+ "eval_runtime": 14.9938,
6
+ "eval_samples_per_second": 261.442,
7
+ "eval_steps_per_second": 8.203
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c6a15bb95415941597a628576b95115fa6d5879e3753213b6529831fe24a203
3
  size 110361288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beb20f00e454e7258299c3194ba829eae8d403661b817d5d5deeb7fb2e5304d3
3
  size 110361288
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9918404351767904,
3
  "total_flos": 2.6244369700391485e+18,
4
- "train_loss": 0.38805453755638813,
5
- "train_runtime": 888.6407,
6
- "train_samples_per_second": 119.103,
7
- "train_steps_per_second": 0.928
8
  }
 
1
  {
2
  "epoch": 2.9918404351767904,
3
  "total_flos": 2.6244369700391485e+18,
4
+ "train_loss": 0.26469580238515683,
5
+ "train_runtime": 846.2486,
6
+ "train_samples_per_second": 125.07,
7
+ "train_steps_per_second": 0.975
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.45,
3
  "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-ginger\\checkpoint-551",
4
  "epoch": 2.9918404351767904,
5
  "eval_steps": 500,
@@ -10,613 +10,613 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.03626473254759746,
13
- "grad_norm": 33.50652313232422,
14
  "learning_rate": 6.024096385542169e-06,
15
- "loss": 2.0565,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07252946509519492,
20
- "grad_norm": 123.1854248046875,
21
  "learning_rate": 1.2048192771084338e-05,
22
- "loss": 1.9305,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.10879419764279238,
27
- "grad_norm": 129.15440368652344,
28
  "learning_rate": 1.8072289156626505e-05,
29
- "loss": 1.657,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.14505893019038985,
34
- "grad_norm": 243.6415252685547,
35
  "learning_rate": 2.4096385542168677e-05,
36
- "loss": 1.221,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.1813236627379873,
41
- "grad_norm": 439.5745849609375,
42
  "learning_rate": 3.012048192771085e-05,
43
- "loss": 0.9468,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.21758839528558477,
48
- "grad_norm": 667.8916625976562,
49
  "learning_rate": 3.614457831325301e-05,
50
- "loss": 0.8172,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.25385312783318226,
55
- "grad_norm": 428.6040344238281,
56
  "learning_rate": 4.2168674698795186e-05,
57
- "loss": 0.7226,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2901178603807797,
62
- "grad_norm": 560.4403686523438,
63
  "learning_rate": 4.8192771084337354e-05,
64
- "loss": 0.66,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.3263825929283772,
69
- "grad_norm": 479.2410888671875,
70
  "learning_rate": 4.952830188679246e-05,
71
- "loss": 0.6481,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.3626473254759746,
76
- "grad_norm": 610.5564575195312,
77
  "learning_rate": 4.88544474393531e-05,
78
- "loss": 0.5897,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3989120580235721,
83
- "grad_norm": 220.13214111328125,
84
  "learning_rate": 4.818059299191375e-05,
85
- "loss": 0.577,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.43517679057116954,
90
- "grad_norm": 380.0888366699219,
91
  "learning_rate": 4.750673854447439e-05,
92
- "loss": 0.5937,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.471441523118767,
97
- "grad_norm": 213.3901824951172,
98
  "learning_rate": 4.683288409703504e-05,
99
- "loss": 0.4978,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.5077062556663645,
104
- "grad_norm": 513.8497314453125,
105
  "learning_rate": 4.615902964959569e-05,
106
- "loss": 0.5274,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.543970988213962,
111
- "grad_norm": 299.3127136230469,
112
  "learning_rate": 4.548517520215634e-05,
113
- "loss": 0.4712,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.5802357207615594,
118
- "grad_norm": 143.81353759765625,
119
  "learning_rate": 4.4811320754716985e-05,
120
- "loss": 0.473,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.6165004533091568,
125
- "grad_norm": 420.0848693847656,
126
  "learning_rate": 4.413746630727763e-05,
127
- "loss": 0.4134,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.6527651858567544,
132
- "grad_norm": 365.6310119628906,
133
  "learning_rate": 4.3463611859838275e-05,
134
- "loss": 0.4046,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.6890299184043518,
139
- "grad_norm": 259.5730285644531,
140
  "learning_rate": 4.2789757412398926e-05,
141
- "loss": 0.36,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.7252946509519492,
146
- "grad_norm": 220.11085510253906,
147
  "learning_rate": 4.211590296495957e-05,
148
- "loss": 0.4181,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.7615593834995467,
153
- "grad_norm": 1192.25732421875,
154
  "learning_rate": 4.1442048517520216e-05,
155
- "loss": 0.4025,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.7978241160471442,
160
- "grad_norm": 228.13970947265625,
161
  "learning_rate": 4.076819407008086e-05,
162
- "loss": 0.3847,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.8340888485947416,
167
- "grad_norm": 420.40789794921875,
168
  "learning_rate": 4.009433962264151e-05,
169
- "loss": 0.3677,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.8703535811423391,
174
- "grad_norm": 495.6370849609375,
175
  "learning_rate": 3.942048517520216e-05,
176
- "loss": 0.3645,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.9066183136899365,
181
- "grad_norm": 554.6434936523438,
182
  "learning_rate": 3.874663072776281e-05,
183
- "loss": 0.3892,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.942883046237534,
188
- "grad_norm": 310.8186950683594,
189
  "learning_rate": 3.807277628032345e-05,
190
- "loss": 0.3135,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.9791477787851315,
195
- "grad_norm": 613.1405029296875,
196
  "learning_rate": 3.73989218328841e-05,
197
- "loss": 0.3419,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.9972801450589301,
202
- "eval_accuracy": 0.3704081632653061,
203
- "eval_loss": 1.927472710609436,
204
- "eval_runtime": 17.4792,
205
- "eval_samples_per_second": 224.267,
206
- "eval_steps_per_second": 7.037,
207
  "step": 275
208
  },
209
  {
210
  "epoch": 1.015412511332729,
211
- "grad_norm": 207.4573211669922,
212
  "learning_rate": 3.672506738544474e-05,
213
- "loss": 0.3532,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.0516772438803264,
218
- "grad_norm": 457.7559509277344,
219
  "learning_rate": 3.605121293800539e-05,
220
- "loss": 0.3507,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.087941976427924,
225
- "grad_norm": 121.54537200927734,
226
  "learning_rate": 3.537735849056604e-05,
227
- "loss": 0.3277,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.1242067089755212,
232
- "grad_norm": 320.1291198730469,
233
  "learning_rate": 3.470350404312669e-05,
234
- "loss": 0.2949,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.1604714415231188,
239
- "grad_norm": 212.68765258789062,
240
  "learning_rate": 3.4029649595687336e-05,
241
- "loss": 0.312,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.1967361740707163,
246
- "grad_norm": 588.8690185546875,
247
  "learning_rate": 3.335579514824798e-05,
248
- "loss": 0.2919,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.2330009066183136,
253
- "grad_norm": 174.75848388671875,
254
  "learning_rate": 3.2681940700808625e-05,
255
- "loss": 0.2844,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.2692656391659112,
260
- "grad_norm": 211.03993225097656,
261
  "learning_rate": 3.200808625336928e-05,
262
- "loss": 0.3096,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.3055303717135085,
267
- "grad_norm": 592.6405029296875,
268
  "learning_rate": 3.133423180592992e-05,
269
- "loss": 0.3195,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.341795104261106,
274
- "grad_norm": 297.369384765625,
275
  "learning_rate": 3.0660377358490567e-05,
276
- "loss": 0.2699,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.3780598368087036,
281
- "grad_norm": 131.08090209960938,
282
  "learning_rate": 2.998652291105121e-05,
283
- "loss": 0.2723,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.414324569356301,
288
- "grad_norm": 378.4498291015625,
289
  "learning_rate": 2.931266846361186e-05,
290
- "loss": 0.2602,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.4505893019038985,
295
- "grad_norm": 297.8955383300781,
296
  "learning_rate": 2.863881401617251e-05,
297
- "loss": 0.324,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.486854034451496,
302
- "grad_norm": 277.98309326171875,
303
  "learning_rate": 2.7964959568733156e-05,
304
- "loss": 0.2682,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 1.5231187669990933,
309
- "grad_norm": 528.5087890625,
310
  "learning_rate": 2.7291105121293804e-05,
311
- "loss": 0.2986,
312
  "step": 420
313
  },
314
  {
315
  "epoch": 1.5593834995466909,
316
- "grad_norm": 445.7499084472656,
317
  "learning_rate": 2.661725067385445e-05,
318
- "loss": 0.2864,
319
  "step": 430
320
  },
321
  {
322
  "epoch": 1.5956482320942884,
323
- "grad_norm": 292.2099914550781,
324
  "learning_rate": 2.5943396226415094e-05,
325
- "loss": 0.2598,
326
  "step": 440
327
  },
328
  {
329
  "epoch": 1.6319129646418857,
330
- "grad_norm": 540.9222412109375,
331
  "learning_rate": 2.5269541778975742e-05,
332
- "loss": 0.2441,
333
  "step": 450
334
  },
335
  {
336
  "epoch": 1.6681776971894833,
337
- "grad_norm": 215.79940795898438,
338
  "learning_rate": 2.459568733153639e-05,
339
- "loss": 0.2635,
340
  "step": 460
341
  },
342
  {
343
  "epoch": 1.7044424297370808,
344
- "grad_norm": 278.9331359863281,
345
  "learning_rate": 2.3921832884097038e-05,
346
- "loss": 0.2584,
347
  "step": 470
348
  },
349
  {
350
  "epoch": 1.7407071622846781,
351
- "grad_norm": 313.8738098144531,
352
  "learning_rate": 2.3247978436657683e-05,
353
- "loss": 0.2563,
354
  "step": 480
355
  },
356
  {
357
  "epoch": 1.7769718948322755,
358
- "grad_norm": 269.33984375,
359
  "learning_rate": 2.2574123989218328e-05,
360
- "loss": 0.2452,
361
  "step": 490
362
  },
363
  {
364
  "epoch": 1.8132366273798732,
365
- "grad_norm": 219.2152099609375,
366
  "learning_rate": 2.1900269541778976e-05,
367
- "loss": 0.2439,
368
  "step": 500
369
  },
370
  {
371
  "epoch": 1.8495013599274706,
372
- "grad_norm": 566.9930419921875,
373
  "learning_rate": 2.1226415094339624e-05,
374
- "loss": 0.2262,
375
  "step": 510
376
  },
377
  {
378
  "epoch": 1.8857660924750679,
379
- "grad_norm": 355.5133056640625,
380
  "learning_rate": 2.055256064690027e-05,
381
- "loss": 0.2727,
382
  "step": 520
383
  },
384
  {
385
  "epoch": 1.9220308250226654,
386
- "grad_norm": 123.38850402832031,
387
  "learning_rate": 1.9878706199460917e-05,
388
- "loss": 0.2318,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 1.958295557570263,
393
- "grad_norm": 451.4879150390625,
394
  "learning_rate": 1.9204851752021562e-05,
395
- "loss": 0.2715,
396
  "step": 540
397
  },
398
  {
399
  "epoch": 1.9945602901178603,
400
- "grad_norm": 105.41907501220703,
401
  "learning_rate": 1.8530997304582214e-05,
402
- "loss": 0.2402,
403
  "step": 550
404
  },
405
  {
406
  "epoch": 1.9981867633726202,
407
- "eval_accuracy": 0.45,
408
- "eval_loss": 1.8726001977920532,
409
- "eval_runtime": 14.3162,
410
- "eval_samples_per_second": 273.815,
411
- "eval_steps_per_second": 8.592,
412
  "step": 551
413
  },
414
  {
415
  "epoch": 2.030825022665458,
416
- "grad_norm": 179.87648010253906,
417
  "learning_rate": 1.785714285714286e-05,
418
- "loss": 0.2354,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.0670897552130554,
423
- "grad_norm": 144.1634063720703,
424
  "learning_rate": 1.7183288409703503e-05,
425
- "loss": 0.2269,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.1033544877606527,
430
- "grad_norm": 217.34768676757812,
431
  "learning_rate": 1.650943396226415e-05,
432
- "loss": 0.2245,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.13961922030825,
437
- "grad_norm": 289.987548828125,
438
  "learning_rate": 1.58355795148248e-05,
439
- "loss": 0.2301,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.175883952855848,
444
- "grad_norm": 509.3897705078125,
445
  "learning_rate": 1.5161725067385446e-05,
446
- "loss": 0.2029,
447
  "step": 600
448
  },
449
  {
450
  "epoch": 2.212148685403445,
451
- "grad_norm": 772.5884399414062,
452
  "learning_rate": 1.4487870619946093e-05,
453
- "loss": 0.2547,
454
  "step": 610
455
  },
456
  {
457
  "epoch": 2.2484134179510424,
458
- "grad_norm": 143.57704162597656,
459
  "learning_rate": 1.381401617250674e-05,
460
- "loss": 0.2337,
461
  "step": 620
462
  },
463
  {
464
  "epoch": 2.28467815049864,
465
- "grad_norm": 139.83460998535156,
466
  "learning_rate": 1.3140161725067384e-05,
467
- "loss": 0.2306,
468
  "step": 630
469
  },
470
  {
471
  "epoch": 2.3209428830462375,
472
- "grad_norm": 102.90089416503906,
473
  "learning_rate": 1.2466307277628032e-05,
474
- "loss": 0.2282,
475
  "step": 640
476
  },
477
  {
478
  "epoch": 2.357207615593835,
479
- "grad_norm": 218.74929809570312,
480
  "learning_rate": 1.179245283018868e-05,
481
- "loss": 0.2098,
482
  "step": 650
483
  },
484
  {
485
  "epoch": 2.3934723481414326,
486
- "grad_norm": 236.75039672851562,
487
  "learning_rate": 1.1118598382749327e-05,
488
- "loss": 0.2054,
489
  "step": 660
490
  },
491
  {
492
  "epoch": 2.42973708068903,
493
- "grad_norm": 248.0030975341797,
494
  "learning_rate": 1.0444743935309973e-05,
495
- "loss": 0.1875,
496
  "step": 670
497
  },
498
  {
499
  "epoch": 2.4660018132366273,
500
- "grad_norm": 120.9859390258789,
501
  "learning_rate": 9.77088948787062e-06,
502
- "loss": 0.2062,
503
  "step": 680
504
  },
505
  {
506
  "epoch": 2.5022665457842246,
507
- "grad_norm": 346.58111572265625,
508
  "learning_rate": 9.097035040431268e-06,
509
- "loss": 0.2082,
510
  "step": 690
511
  },
512
  {
513
  "epoch": 2.5385312783318223,
514
- "grad_norm": 299.8227844238281,
515
  "learning_rate": 8.423180592991915e-06,
516
- "loss": 0.2111,
517
  "step": 700
518
  },
519
  {
520
  "epoch": 2.5747960108794197,
521
- "grad_norm": 423.92626953125,
522
  "learning_rate": 7.749326145552561e-06,
523
- "loss": 0.1825,
524
  "step": 710
525
  },
526
  {
527
  "epoch": 2.611060743427017,
528
- "grad_norm": 270.41375732421875,
529
  "learning_rate": 7.0754716981132075e-06,
530
- "loss": 0.18,
531
  "step": 720
532
  },
533
  {
534
  "epoch": 2.6473254759746148,
535
- "grad_norm": 236.77076721191406,
536
  "learning_rate": 6.401617250673856e-06,
537
- "loss": 0.1985,
538
  "step": 730
539
  },
540
  {
541
  "epoch": 2.683590208522212,
542
- "grad_norm": 121.75785827636719,
543
  "learning_rate": 5.727762803234501e-06,
544
- "loss": 0.196,
545
  "step": 740
546
  },
547
  {
548
  "epoch": 2.7198549410698094,
549
- "grad_norm": 134.50650024414062,
550
  "learning_rate": 5.053908355795149e-06,
551
- "loss": 0.1948,
552
  "step": 750
553
  },
554
  {
555
  "epoch": 2.756119673617407,
556
- "grad_norm": 142.56356811523438,
557
  "learning_rate": 4.380053908355795e-06,
558
- "loss": 0.1804,
559
  "step": 760
560
  },
561
  {
562
  "epoch": 2.7923844061650045,
563
- "grad_norm": 106.05587005615234,
564
  "learning_rate": 3.706199460916442e-06,
565
- "loss": 0.1722,
566
  "step": 770
567
  },
568
  {
569
  "epoch": 2.828649138712602,
570
- "grad_norm": 216.68280029296875,
571
  "learning_rate": 3.032345013477089e-06,
572
- "loss": 0.2002,
573
  "step": 780
574
  },
575
  {
576
  "epoch": 2.8649138712601996,
577
- "grad_norm": 270.9617004394531,
578
  "learning_rate": 2.358490566037736e-06,
579
- "loss": 0.199,
580
  "step": 790
581
  },
582
  {
583
  "epoch": 2.901178603807797,
584
- "grad_norm": 147.06790161132812,
585
  "learning_rate": 1.6846361185983827e-06,
586
- "loss": 0.1859,
587
  "step": 800
588
  },
589
  {
590
  "epoch": 2.9374433363553942,
591
- "grad_norm": 224.23472595214844,
592
  "learning_rate": 1.0107816711590296e-06,
593
- "loss": 0.1857,
594
  "step": 810
595
  },
596
  {
597
  "epoch": 2.973708068902992,
598
- "grad_norm": 73.98612213134766,
599
  "learning_rate": 3.369272237196766e-07,
600
- "loss": 0.1841,
601
  "step": 820
602
  },
603
  {
604
  "epoch": 2.9918404351767904,
605
- "eval_accuracy": 0.3683673469387755,
606
- "eval_loss": 2.2334840297698975,
607
- "eval_runtime": 14.2473,
608
- "eval_samples_per_second": 275.139,
609
- "eval_steps_per_second": 8.633,
610
  "step": 825
611
  },
612
  {
613
  "epoch": 2.9918404351767904,
614
  "step": 825,
615
  "total_flos": 2.6244369700391485e+18,
616
- "train_loss": 0.38805453755638813,
617
- "train_runtime": 888.6407,
618
- "train_samples_per_second": 119.103,
619
- "train_steps_per_second": 0.928
620
  }
621
  ],
622
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.5242346938775511,
3
  "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-ginger\\checkpoint-551",
4
  "epoch": 2.9918404351767904,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.03626473254759746,
13
+ "grad_norm": 66.77265930175781,
14
  "learning_rate": 6.024096385542169e-06,
15
+ "loss": 2.0316,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07252946509519492,
20
+ "grad_norm": 98.40676879882812,
21
  "learning_rate": 1.2048192771084338e-05,
22
+ "loss": 1.8051,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.10879419764279238,
27
+ "grad_norm": 75.0258560180664,
28
  "learning_rate": 1.8072289156626505e-05,
29
+ "loss": 1.2977,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.14505893019038985,
34
+ "grad_norm": 193.25062561035156,
35
  "learning_rate": 2.4096385542168677e-05,
36
+ "loss": 0.7561,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.1813236627379873,
41
+ "grad_norm": 448.8358459472656,
42
  "learning_rate": 3.012048192771085e-05,
43
+ "loss": 0.5479,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.21758839528558477,
48
+ "grad_norm": 450.2969970703125,
49
  "learning_rate": 3.614457831325301e-05,
50
+ "loss": 0.5176,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.25385312783318226,
55
+ "grad_norm": 179.40008544921875,
56
  "learning_rate": 4.2168674698795186e-05,
57
+ "loss": 0.4498,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2901178603807797,
62
+ "grad_norm": 431.70550537109375,
63
  "learning_rate": 4.8192771084337354e-05,
64
+ "loss": 0.3653,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.3263825929283772,
69
+ "grad_norm": 448.7202453613281,
70
  "learning_rate": 4.952830188679246e-05,
71
+ "loss": 0.3826,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.3626473254759746,
76
+ "grad_norm": 146.0322723388672,
77
  "learning_rate": 4.88544474393531e-05,
78
+ "loss": 0.3937,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3989120580235721,
83
+ "grad_norm": 230.89141845703125,
84
  "learning_rate": 4.818059299191375e-05,
85
+ "loss": 0.3891,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.43517679057116954,
90
+ "grad_norm": 291.6907043457031,
91
  "learning_rate": 4.750673854447439e-05,
92
+ "loss": 0.4013,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.471441523118767,
97
+ "grad_norm": 299.92999267578125,
98
  "learning_rate": 4.683288409703504e-05,
99
+ "loss": 0.3297,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.5077062556663645,
104
+ "grad_norm": 158.54531860351562,
105
  "learning_rate": 4.615902964959569e-05,
106
+ "loss": 0.317,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.543970988213962,
111
+ "grad_norm": 171.34927368164062,
112
  "learning_rate": 4.548517520215634e-05,
113
+ "loss": 0.3041,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.5802357207615594,
118
+ "grad_norm": 359.9927978515625,
119
  "learning_rate": 4.4811320754716985e-05,
120
+ "loss": 0.3252,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.6165004533091568,
125
+ "grad_norm": 191.6513671875,
126
  "learning_rate": 4.413746630727763e-05,
127
+ "loss": 0.3096,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.6527651858567544,
132
+ "grad_norm": 212.29483032226562,
133
  "learning_rate": 4.3463611859838275e-05,
134
+ "loss": 0.2636,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.6890299184043518,
139
+ "grad_norm": 141.14498901367188,
140
  "learning_rate": 4.2789757412398926e-05,
141
+ "loss": 0.2549,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.7252946509519492,
146
+ "grad_norm": 146.6569061279297,
147
  "learning_rate": 4.211590296495957e-05,
148
+ "loss": 0.2518,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.7615593834995467,
153
+ "grad_norm": 126.00825500488281,
154
  "learning_rate": 4.1442048517520216e-05,
155
+ "loss": 0.2204,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.7978241160471442,
160
+ "grad_norm": 181.69412231445312,
161
  "learning_rate": 4.076819407008086e-05,
162
+ "loss": 0.2087,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.8340888485947416,
167
+ "grad_norm": 555.3983154296875,
168
  "learning_rate": 4.009433962264151e-05,
169
+ "loss": 0.2254,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.8703535811423391,
174
+ "grad_norm": 289.5895690917969,
175
  "learning_rate": 3.942048517520216e-05,
176
+ "loss": 0.2591,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.9066183136899365,
181
+ "grad_norm": 171.5518798828125,
182
  "learning_rate": 3.874663072776281e-05,
183
+ "loss": 0.2474,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.942883046237534,
188
+ "grad_norm": 245.42327880859375,
189
  "learning_rate": 3.807277628032345e-05,
190
+ "loss": 0.249,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.9791477787851315,
195
+ "grad_norm": 481.2547302246094,
196
  "learning_rate": 3.73989218328841e-05,
197
+ "loss": 0.241,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.9972801450589301,
202
+ "eval_accuracy": 0.510969387755102,
203
+ "eval_loss": 1.5232751369476318,
204
+ "eval_runtime": 14.3392,
205
+ "eval_samples_per_second": 273.376,
206
+ "eval_steps_per_second": 8.578,
207
  "step": 275
208
  },
209
  {
210
  "epoch": 1.015412511332729,
211
+ "grad_norm": 127.02649688720703,
212
  "learning_rate": 3.672506738544474e-05,
213
+ "loss": 0.2281,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.0516772438803264,
218
+ "grad_norm": 288.0721130371094,
219
  "learning_rate": 3.605121293800539e-05,
220
+ "loss": 0.2058,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.087941976427924,
225
+ "grad_norm": 349.9566345214844,
226
  "learning_rate": 3.537735849056604e-05,
227
+ "loss": 0.2228,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.1242067089755212,
232
+ "grad_norm": 268.7991027832031,
233
  "learning_rate": 3.470350404312669e-05,
234
+ "loss": 0.2157,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.1604714415231188,
239
+ "grad_norm": 119.46910858154297,
240
  "learning_rate": 3.4029649595687336e-05,
241
+ "loss": 0.1892,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.1967361740707163,
246
+ "grad_norm": 151.6316680908203,
247
  "learning_rate": 3.335579514824798e-05,
248
+ "loss": 0.2094,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.2330009066183136,
253
+ "grad_norm": 179.56629943847656,
254
  "learning_rate": 3.2681940700808625e-05,
255
+ "loss": 0.2061,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.2692656391659112,
260
+ "grad_norm": 111.70588684082031,
261
  "learning_rate": 3.200808625336928e-05,
262
+ "loss": 0.2413,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.3055303717135085,
267
+ "grad_norm": 349.2808837890625,
268
  "learning_rate": 3.133423180592992e-05,
269
+ "loss": 0.1837,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.341795104261106,
274
+ "grad_norm": 188.50990295410156,
275
  "learning_rate": 3.0660377358490567e-05,
276
+ "loss": 0.2154,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.3780598368087036,
281
+ "grad_norm": 190.62208557128906,
282
  "learning_rate": 2.998652291105121e-05,
283
+ "loss": 0.1856,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.414324569356301,
288
+ "grad_norm": 493.2279968261719,
289
  "learning_rate": 2.931266846361186e-05,
290
+ "loss": 0.1999,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.4505893019038985,
295
+ "grad_norm": 244.08197021484375,
296
  "learning_rate": 2.863881401617251e-05,
297
+ "loss": 0.1687,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.486854034451496,
302
+ "grad_norm": 85.91020965576172,
303
  "learning_rate": 2.7964959568733156e-05,
304
+ "loss": 0.1607,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 1.5231187669990933,
309
+ "grad_norm": 80.52980041503906,
310
  "learning_rate": 2.7291105121293804e-05,
311
+ "loss": 0.1967,
312
  "step": 420
313
  },
314
  {
315
  "epoch": 1.5593834995466909,
316
+ "grad_norm": 623.0601806640625,
317
  "learning_rate": 2.661725067385445e-05,
318
+ "loss": 0.1779,
319
  "step": 430
320
  },
321
  {
322
  "epoch": 1.5956482320942884,
323
+ "grad_norm": 77.6041488647461,
324
  "learning_rate": 2.5943396226415094e-05,
325
+ "loss": 0.1588,
326
  "step": 440
327
  },
328
  {
329
  "epoch": 1.6319129646418857,
330
+ "grad_norm": 126.40304565429688,
331
  "learning_rate": 2.5269541778975742e-05,
332
+ "loss": 0.1516,
333
  "step": 450
334
  },
335
  {
336
  "epoch": 1.6681776971894833,
337
+ "grad_norm": 219.29595947265625,
338
  "learning_rate": 2.459568733153639e-05,
339
+ "loss": 0.1477,
340
  "step": 460
341
  },
342
  {
343
  "epoch": 1.7044424297370808,
344
+ "grad_norm": 144.26341247558594,
345
  "learning_rate": 2.3921832884097038e-05,
346
+ "loss": 0.1714,
347
  "step": 470
348
  },
349
  {
350
  "epoch": 1.7407071622846781,
351
+ "grad_norm": 177.91326904296875,
352
  "learning_rate": 2.3247978436657683e-05,
353
+ "loss": 0.1446,
354
  "step": 480
355
  },
356
  {
357
  "epoch": 1.7769718948322755,
358
+ "grad_norm": 160.88905334472656,
359
  "learning_rate": 2.2574123989218328e-05,
360
+ "loss": 0.1681,
361
  "step": 490
362
  },
363
  {
364
  "epoch": 1.8132366273798732,
365
+ "grad_norm": 118.04940032958984,
366
  "learning_rate": 2.1900269541778976e-05,
367
+ "loss": 0.153,
368
  "step": 500
369
  },
370
  {
371
  "epoch": 1.8495013599274706,
372
+ "grad_norm": 42.61888122558594,
373
  "learning_rate": 2.1226415094339624e-05,
374
+ "loss": 0.1268,
375
  "step": 510
376
  },
377
  {
378
  "epoch": 1.8857660924750679,
379
+ "grad_norm": 383.0787353515625,
380
  "learning_rate": 2.055256064690027e-05,
381
+ "loss": 0.1489,
382
  "step": 520
383
  },
384
  {
385
  "epoch": 1.9220308250226654,
386
+ "grad_norm": 281.9739685058594,
387
  "learning_rate": 1.9878706199460917e-05,
388
+ "loss": 0.1966,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 1.958295557570263,
393
+ "grad_norm": 175.12257385253906,
394
  "learning_rate": 1.9204851752021562e-05,
395
+ "loss": 0.1479,
396
  "step": 540
397
  },
398
  {
399
  "epoch": 1.9945602901178603,
400
+ "grad_norm": 250.56533813476562,
401
  "learning_rate": 1.8530997304582214e-05,
402
+ "loss": 0.1661,
403
  "step": 550
404
  },
405
  {
406
  "epoch": 1.9981867633726202,
407
+ "eval_accuracy": 0.5242346938775511,
408
+ "eval_loss": 1.3872867822647095,
409
+ "eval_runtime": 14.3219,
410
+ "eval_samples_per_second": 273.707,
411
+ "eval_steps_per_second": 8.588,
412
  "step": 551
413
  },
414
  {
415
  "epoch": 2.030825022665458,
416
+ "grad_norm": 128.275146484375,
417
  "learning_rate": 1.785714285714286e-05,
418
+ "loss": 0.1605,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.0670897552130554,
423
+ "grad_norm": 267.3365478515625,
424
  "learning_rate": 1.7183288409703503e-05,
425
+ "loss": 0.1295,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.1033544877606527,
430
+ "grad_norm": 62.166568756103516,
431
  "learning_rate": 1.650943396226415e-05,
432
+ "loss": 0.1209,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.13961922030825,
437
+ "grad_norm": 398.4904479980469,
438
  "learning_rate": 1.58355795148248e-05,
439
+ "loss": 0.1541,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.175883952855848,
444
+ "grad_norm": 89.24071502685547,
445
  "learning_rate": 1.5161725067385446e-05,
446
+ "loss": 0.1427,
447
  "step": 600
448
  },
449
  {
450
  "epoch": 2.212148685403445,
451
+ "grad_norm": 227.34228515625,
452
  "learning_rate": 1.4487870619946093e-05,
453
+ "loss": 0.1557,
454
  "step": 610
455
  },
456
  {
457
  "epoch": 2.2484134179510424,
458
+ "grad_norm": 333.01739501953125,
459
  "learning_rate": 1.381401617250674e-05,
460
+ "loss": 0.1194,
461
  "step": 620
462
  },
463
  {
464
  "epoch": 2.28467815049864,
465
+ "grad_norm": 250.48745727539062,
466
  "learning_rate": 1.3140161725067384e-05,
467
+ "loss": 0.1585,
468
  "step": 630
469
  },
470
  {
471
  "epoch": 2.3209428830462375,
472
+ "grad_norm": 48.325416564941406,
473
  "learning_rate": 1.2466307277628032e-05,
474
+ "loss": 0.1384,
475
  "step": 640
476
  },
477
  {
478
  "epoch": 2.357207615593835,
479
+ "grad_norm": 111.5351791381836,
480
  "learning_rate": 1.179245283018868e-05,
481
+ "loss": 0.1096,
482
  "step": 650
483
  },
484
  {
485
  "epoch": 2.3934723481414326,
486
+ "grad_norm": 149.7464141845703,
487
  "learning_rate": 1.1118598382749327e-05,
488
+ "loss": 0.1327,
489
  "step": 660
490
  },
491
  {
492
  "epoch": 2.42973708068903,
493
+ "grad_norm": 145.99148559570312,
494
  "learning_rate": 1.0444743935309973e-05,
495
+ "loss": 0.099,
496
  "step": 670
497
  },
498
  {
499
  "epoch": 2.4660018132366273,
500
+ "grad_norm": 67.97899627685547,
501
  "learning_rate": 9.77088948787062e-06,
502
+ "loss": 0.1545,
503
  "step": 680
504
  },
505
  {
506
  "epoch": 2.5022665457842246,
507
+ "grad_norm": 288.68731689453125,
508
  "learning_rate": 9.097035040431268e-06,
509
+ "loss": 0.1155,
510
  "step": 690
511
  },
512
  {
513
  "epoch": 2.5385312783318223,
514
+ "grad_norm": 70.22994232177734,
515
  "learning_rate": 8.423180592991915e-06,
516
+ "loss": 0.1244,
517
  "step": 700
518
  },
519
  {
520
  "epoch": 2.5747960108794197,
521
+ "grad_norm": 98.32154083251953,
522
  "learning_rate": 7.749326145552561e-06,
523
+ "loss": 0.1182,
524
  "step": 710
525
  },
526
  {
527
  "epoch": 2.611060743427017,
528
+ "grad_norm": 107.6678466796875,
529
  "learning_rate": 7.0754716981132075e-06,
530
+ "loss": 0.1247,
531
  "step": 720
532
  },
533
  {
534
  "epoch": 2.6473254759746148,
535
+ "grad_norm": 61.7956657409668,
536
  "learning_rate": 6.401617250673856e-06,
537
+ "loss": 0.1066,
538
  "step": 730
539
  },
540
  {
541
  "epoch": 2.683590208522212,
542
+ "grad_norm": 105.1893539428711,
543
  "learning_rate": 5.727762803234501e-06,
544
+ "loss": 0.1216,
545
  "step": 740
546
  },
547
  {
548
  "epoch": 2.7198549410698094,
549
+ "grad_norm": 170.1961212158203,
550
  "learning_rate": 5.053908355795149e-06,
551
+ "loss": 0.1394,
552
  "step": 750
553
  },
554
  {
555
  "epoch": 2.756119673617407,
556
+ "grad_norm": 202.67616271972656,
557
  "learning_rate": 4.380053908355795e-06,
558
+ "loss": 0.0923,
559
  "step": 760
560
  },
561
  {
562
  "epoch": 2.7923844061650045,
563
+ "grad_norm": 73.72498321533203,
564
  "learning_rate": 3.706199460916442e-06,
565
+ "loss": 0.112,
566
  "step": 770
567
  },
568
  {
569
  "epoch": 2.828649138712602,
570
+ "grad_norm": 50.71122741699219,
571
  "learning_rate": 3.032345013477089e-06,
572
+ "loss": 0.1025,
573
  "step": 780
574
  },
575
  {
576
  "epoch": 2.8649138712601996,
577
+ "grad_norm": 65.91510009765625,
578
  "learning_rate": 2.358490566037736e-06,
579
+ "loss": 0.0911,
580
  "step": 790
581
  },
582
  {
583
  "epoch": 2.901178603807797,
584
+ "grad_norm": 72.48229217529297,
585
  "learning_rate": 1.6846361185983827e-06,
586
+ "loss": 0.1068,
587
  "step": 800
588
  },
589
  {
590
  "epoch": 2.9374433363553942,
591
+ "grad_norm": 156.2762451171875,
592
  "learning_rate": 1.0107816711590296e-06,
593
+ "loss": 0.1022,
594
  "step": 810
595
  },
596
  {
597
  "epoch": 2.973708068902992,
598
+ "grad_norm": 270.1289367675781,
599
  "learning_rate": 3.369272237196766e-07,
600
+ "loss": 0.1177,
601
  "step": 820
602
  },
603
  {
604
  "epoch": 2.9918404351767904,
605
+ "eval_accuracy": 0.5104591836734694,
606
+ "eval_loss": 1.3372142314910889,
607
+ "eval_runtime": 14.3839,
608
+ "eval_samples_per_second": 272.527,
609
+ "eval_steps_per_second": 8.551,
610
  "step": 825
611
  },
612
  {
613
  "epoch": 2.9918404351767904,
614
  "step": 825,
615
  "total_flos": 2.6244369700391485e+18,
616
+ "train_loss": 0.26469580238515683,
617
+ "train_runtime": 846.2486,
618
+ "train_samples_per_second": 125.07,
619
+ "train_steps_per_second": 0.975
620
  }
621
  ],
622
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:172c6a3fa7c2f542ca19b97c7bf3d9c39587e357496558e45a67c14ca7104778
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c0cbed784c599b1c9edec1045bf523b5b71aaf97708fcf7e53655a5a0a9ec75
3
  size 5304