FuseChat-Gemma-2-9B-SFT / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.996954314720812,
"eval_steps": 500,
"global_step": 2214,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00676818950930626,
"grad_norm": 7.503111624582946,
"learning_rate": 4.504504504504504e-08,
"loss": 0.7511,
"step": 5
},
{
"epoch": 0.01353637901861252,
"grad_norm": 7.290232887177767,
"learning_rate": 9.009009009009008e-08,
"loss": 0.7468,
"step": 10
},
{
"epoch": 0.02030456852791878,
"grad_norm": 6.95340550567438,
"learning_rate": 1.3513513513513515e-07,
"loss": 0.7048,
"step": 15
},
{
"epoch": 0.02707275803722504,
"grad_norm": 6.910978359875767,
"learning_rate": 1.8018018018018017e-07,
"loss": 0.734,
"step": 20
},
{
"epoch": 0.0338409475465313,
"grad_norm": 5.990453108110331,
"learning_rate": 2.2522522522522522e-07,
"loss": 0.7075,
"step": 25
},
{
"epoch": 0.04060913705583756,
"grad_norm": 6.315135953523189,
"learning_rate": 2.702702702702703e-07,
"loss": 0.7034,
"step": 30
},
{
"epoch": 0.047377326565143825,
"grad_norm": 4.938125726164199,
"learning_rate": 3.153153153153153e-07,
"loss": 0.6955,
"step": 35
},
{
"epoch": 0.05414551607445008,
"grad_norm": 4.514212008344891,
"learning_rate": 3.6036036036036033e-07,
"loss": 0.6675,
"step": 40
},
{
"epoch": 0.06091370558375635,
"grad_norm": 2.5175394937632234,
"learning_rate": 4.054054054054054e-07,
"loss": 0.6121,
"step": 45
},
{
"epoch": 0.0676818950930626,
"grad_norm": 2.044111131735211,
"learning_rate": 4.5045045045045043e-07,
"loss": 0.5689,
"step": 50
},
{
"epoch": 0.07445008460236886,
"grad_norm": 1.9607335603547595,
"learning_rate": 4.954954954954955e-07,
"loss": 0.5681,
"step": 55
},
{
"epoch": 0.08121827411167512,
"grad_norm": 1.7055910206402423,
"learning_rate": 5.405405405405406e-07,
"loss": 0.5632,
"step": 60
},
{
"epoch": 0.08798646362098139,
"grad_norm": 1.6356611418313087,
"learning_rate": 5.855855855855856e-07,
"loss": 0.5455,
"step": 65
},
{
"epoch": 0.09475465313028765,
"grad_norm": 1.5898009813340985,
"learning_rate": 6.306306306306306e-07,
"loss": 0.5124,
"step": 70
},
{
"epoch": 0.10152284263959391,
"grad_norm": 1.6241164657421379,
"learning_rate": 6.756756756756756e-07,
"loss": 0.5235,
"step": 75
},
{
"epoch": 0.10829103214890017,
"grad_norm": 1.5281172683623996,
"learning_rate": 7.207207207207207e-07,
"loss": 0.5274,
"step": 80
},
{
"epoch": 0.11505922165820642,
"grad_norm": 1.467101912720914,
"learning_rate": 7.657657657657657e-07,
"loss": 0.5152,
"step": 85
},
{
"epoch": 0.1218274111675127,
"grad_norm": 1.4144943645830494,
"learning_rate": 8.108108108108108e-07,
"loss": 0.4903,
"step": 90
},
{
"epoch": 0.12859560067681894,
"grad_norm": 1.5168733264951235,
"learning_rate": 8.558558558558558e-07,
"loss": 0.5235,
"step": 95
},
{
"epoch": 0.1353637901861252,
"grad_norm": 1.4719778866974211,
"learning_rate": 9.009009009009009e-07,
"loss": 0.4955,
"step": 100
},
{
"epoch": 0.14213197969543148,
"grad_norm": 1.5617750862395916,
"learning_rate": 9.459459459459459e-07,
"loss": 0.5023,
"step": 105
},
{
"epoch": 0.14890016920473773,
"grad_norm": 1.608427652038671,
"learning_rate": 9.90990990990991e-07,
"loss": 0.5161,
"step": 110
},
{
"epoch": 0.155668358714044,
"grad_norm": 1.470675404237294,
"learning_rate": 1.0360360360360361e-06,
"loss": 0.4629,
"step": 115
},
{
"epoch": 0.16243654822335024,
"grad_norm": 1.5907906676415646,
"learning_rate": 1.0810810810810812e-06,
"loss": 0.5272,
"step": 120
},
{
"epoch": 0.1692047377326565,
"grad_norm": 1.4139201116022069,
"learning_rate": 1.1261261261261262e-06,
"loss": 0.4828,
"step": 125
},
{
"epoch": 0.17597292724196278,
"grad_norm": 1.572210590864309,
"learning_rate": 1.1711711711711712e-06,
"loss": 0.4932,
"step": 130
},
{
"epoch": 0.18274111675126903,
"grad_norm": 1.8317582344702474,
"learning_rate": 1.2162162162162162e-06,
"loss": 0.4851,
"step": 135
},
{
"epoch": 0.1895093062605753,
"grad_norm": 1.4834369904658524,
"learning_rate": 1.2612612612612613e-06,
"loss": 0.4823,
"step": 140
},
{
"epoch": 0.19627749576988154,
"grad_norm": 1.487385439852692,
"learning_rate": 1.3063063063063063e-06,
"loss": 0.4835,
"step": 145
},
{
"epoch": 0.20304568527918782,
"grad_norm": 1.6282121013160264,
"learning_rate": 1.3513513513513513e-06,
"loss": 0.4845,
"step": 150
},
{
"epoch": 0.2098138747884941,
"grad_norm": 1.410710327011968,
"learning_rate": 1.3963963963963963e-06,
"loss": 0.4604,
"step": 155
},
{
"epoch": 0.21658206429780033,
"grad_norm": 1.5959737364732594,
"learning_rate": 1.4414414414414413e-06,
"loss": 0.49,
"step": 160
},
{
"epoch": 0.2233502538071066,
"grad_norm": 1.5229055220751309,
"learning_rate": 1.4864864864864864e-06,
"loss": 0.4907,
"step": 165
},
{
"epoch": 0.23011844331641285,
"grad_norm": 1.5262092330780643,
"learning_rate": 1.5315315315315314e-06,
"loss": 0.4603,
"step": 170
},
{
"epoch": 0.23688663282571912,
"grad_norm": 1.6342082189425051,
"learning_rate": 1.5765765765765766e-06,
"loss": 0.4872,
"step": 175
},
{
"epoch": 0.2436548223350254,
"grad_norm": 1.4374951298863459,
"learning_rate": 1.6216216216216216e-06,
"loss": 0.4648,
"step": 180
},
{
"epoch": 0.25042301184433163,
"grad_norm": 1.5280129895322156,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.4467,
"step": 185
},
{
"epoch": 0.2571912013536379,
"grad_norm": 1.5525834339213638,
"learning_rate": 1.7117117117117117e-06,
"loss": 0.4738,
"step": 190
},
{
"epoch": 0.2639593908629442,
"grad_norm": 1.5093982724761799,
"learning_rate": 1.7567567567567567e-06,
"loss": 0.483,
"step": 195
},
{
"epoch": 0.2707275803722504,
"grad_norm": 1.6721932829256383,
"learning_rate": 1.8018018018018017e-06,
"loss": 0.463,
"step": 200
},
{
"epoch": 0.27749576988155666,
"grad_norm": 1.5493470193453733,
"learning_rate": 1.8468468468468467e-06,
"loss": 0.4432,
"step": 205
},
{
"epoch": 0.28426395939086296,
"grad_norm": 1.5282256717827059,
"learning_rate": 1.8918918918918918e-06,
"loss": 0.4659,
"step": 210
},
{
"epoch": 0.2910321489001692,
"grad_norm": 1.493666122191364,
"learning_rate": 1.936936936936937e-06,
"loss": 0.4789,
"step": 215
},
{
"epoch": 0.29780033840947545,
"grad_norm": 1.7222146450605926,
"learning_rate": 1.981981981981982e-06,
"loss": 0.4636,
"step": 220
},
{
"epoch": 0.30456852791878175,
"grad_norm": 1.6787579180811572,
"learning_rate": 1.999988807353673e-06,
"loss": 0.4697,
"step": 225
},
{
"epoch": 0.311336717428088,
"grad_norm": 1.5374929160562005,
"learning_rate": 1.999920408755684e-06,
"loss": 0.4702,
"step": 230
},
{
"epoch": 0.31810490693739424,
"grad_norm": 1.5486529898439687,
"learning_rate": 1.9997898339445025e-06,
"loss": 0.4545,
"step": 235
},
{
"epoch": 0.3248730964467005,
"grad_norm": 1.5982741872566966,
"learning_rate": 1.9995970910394226e-06,
"loss": 0.4769,
"step": 240
},
{
"epoch": 0.3316412859560068,
"grad_norm": 1.5356403333143016,
"learning_rate": 1.999342192025422e-06,
"loss": 0.456,
"step": 245
},
{
"epoch": 0.338409475465313,
"grad_norm": 1.5906824266272626,
"learning_rate": 1.9990251527524177e-06,
"loss": 0.4803,
"step": 250
},
{
"epoch": 0.34517766497461927,
"grad_norm": 1.5359520161986076,
"learning_rate": 1.99864599293428e-06,
"loss": 0.4475,
"step": 255
},
{
"epoch": 0.35194585448392557,
"grad_norm": 1.5036681203646087,
"learning_rate": 1.9982047361476074e-06,
"loss": 0.464,
"step": 260
},
{
"epoch": 0.3587140439932318,
"grad_norm": 1.461884922691461,
"learning_rate": 1.9977014098302594e-06,
"loss": 0.4593,
"step": 265
},
{
"epoch": 0.36548223350253806,
"grad_norm": 1.434636405242703,
"learning_rate": 1.997136045279652e-06,
"loss": 0.4524,
"step": 270
},
{
"epoch": 0.37225042301184436,
"grad_norm": 1.4806026186739891,
"learning_rate": 1.996508677650809e-06,
"loss": 0.455,
"step": 275
},
{
"epoch": 0.3790186125211506,
"grad_norm": 1.5577278096965192,
"learning_rate": 1.9958193459541803e-06,
"loss": 0.4715,
"step": 280
},
{
"epoch": 0.38578680203045684,
"grad_norm": 1.6997591630335047,
"learning_rate": 1.9950680930532106e-06,
"loss": 0.4613,
"step": 285
},
{
"epoch": 0.3925549915397631,
"grad_norm": 1.750274534222129,
"learning_rate": 1.9942549656616785e-06,
"loss": 0.4654,
"step": 290
},
{
"epoch": 0.3993231810490694,
"grad_norm": 1.8009950734484617,
"learning_rate": 1.9933800143407914e-06,
"loss": 0.449,
"step": 295
},
{
"epoch": 0.40609137055837563,
"grad_norm": 1.5530961439855866,
"learning_rate": 1.992443293496038e-06,
"loss": 0.4458,
"step": 300
},
{
"epoch": 0.4128595600676819,
"grad_norm": 1.5081709456372263,
"learning_rate": 1.9914448613738106e-06,
"loss": 0.4565,
"step": 305
},
{
"epoch": 0.4196277495769882,
"grad_norm": 1.6772043840290556,
"learning_rate": 1.9903847800577777e-06,
"loss": 0.4804,
"step": 310
},
{
"epoch": 0.4263959390862944,
"grad_norm": 1.727246671277696,
"learning_rate": 1.9892631154650277e-06,
"loss": 0.4576,
"step": 315
},
{
"epoch": 0.43316412859560066,
"grad_norm": 1.5355208962246418,
"learning_rate": 1.9880799373419697e-06,
"loss": 0.4453,
"step": 320
},
{
"epoch": 0.43993231810490696,
"grad_norm": 1.5063427073085303,
"learning_rate": 1.986835319259994e-06,
"loss": 0.4452,
"step": 325
},
{
"epoch": 0.4467005076142132,
"grad_norm": 1.5195960228713277,
"learning_rate": 1.985529338610899e-06,
"loss": 0.465,
"step": 330
},
{
"epoch": 0.45346869712351945,
"grad_norm": 1.5343721052947665,
"learning_rate": 1.98416207660208e-06,
"loss": 0.436,
"step": 335
},
{
"epoch": 0.4602368866328257,
"grad_norm": 1.5094089297695503,
"learning_rate": 1.982733618251478e-06,
"loss": 0.4583,
"step": 340
},
{
"epoch": 0.467005076142132,
"grad_norm": 1.3787923632280212,
"learning_rate": 1.981244052382293e-06,
"loss": 0.4512,
"step": 345
},
{
"epoch": 0.47377326565143824,
"grad_norm": 1.5868715153799644,
"learning_rate": 1.9796934716174616e-06,
"loss": 0.427,
"step": 350
},
{
"epoch": 0.4805414551607445,
"grad_norm": 1.5348278464191702,
"learning_rate": 1.978081972373899e-06,
"loss": 0.4593,
"step": 355
},
{
"epoch": 0.4873096446700508,
"grad_norm": 1.587810620443399,
"learning_rate": 1.976409654856501e-06,
"loss": 0.4453,
"step": 360
},
{
"epoch": 0.494077834179357,
"grad_norm": 1.6669439346571866,
"learning_rate": 1.9746766230519137e-06,
"loss": 0.4726,
"step": 365
},
{
"epoch": 0.5008460236886633,
"grad_norm": 1.6972318618519455,
"learning_rate": 1.9728829847220696e-06,
"loss": 0.454,
"step": 370
},
{
"epoch": 0.5076142131979695,
"grad_norm": 1.5739750068827298,
"learning_rate": 1.9710288513974846e-06,
"loss": 0.4592,
"step": 375
},
{
"epoch": 0.5143824027072758,
"grad_norm": 1.5240777641912773,
"learning_rate": 1.969114338370324e-06,
"loss": 0.4297,
"step": 380
},
{
"epoch": 0.5211505922165821,
"grad_norm": 1.6105894445618845,
"learning_rate": 1.9671395646872323e-06,
"loss": 0.447,
"step": 385
},
{
"epoch": 0.5279187817258884,
"grad_norm": 1.7397039341357436,
"learning_rate": 1.965104653141933e-06,
"loss": 0.4516,
"step": 390
},
{
"epoch": 0.5346869712351946,
"grad_norm": 1.667337409755205,
"learning_rate": 1.9630097302675913e-06,
"loss": 0.4497,
"step": 395
},
{
"epoch": 0.5414551607445008,
"grad_norm": 1.472174322913246,
"learning_rate": 1.9608549263289456e-06,
"loss": 0.4396,
"step": 400
},
{
"epoch": 0.5482233502538071,
"grad_norm": 1.390664891183549,
"learning_rate": 1.95864037531421e-06,
"loss": 0.4584,
"step": 405
},
{
"epoch": 0.5549915397631133,
"grad_norm": 1.7716596272823615,
"learning_rate": 1.9563662149267405e-06,
"loss": 0.4417,
"step": 410
},
{
"epoch": 0.5617597292724196,
"grad_norm": 1.7181597011497438,
"learning_rate": 1.9540325865764725e-06,
"loss": 0.4566,
"step": 415
},
{
"epoch": 0.5685279187817259,
"grad_norm": 1.5659915803916853,
"learning_rate": 1.951639635371129e-06,
"loss": 0.4636,
"step": 420
},
{
"epoch": 0.5752961082910322,
"grad_norm": 1.3420492760813085,
"learning_rate": 1.9491875101071985e-06,
"loss": 0.4392,
"step": 425
},
{
"epoch": 0.5820642978003384,
"grad_norm": 1.7068688103479133,
"learning_rate": 1.946676363260679e-06,
"loss": 0.4442,
"step": 430
},
{
"epoch": 0.5888324873096447,
"grad_norm": 1.5986442251841577,
"learning_rate": 1.9441063509776e-06,
"loss": 0.4461,
"step": 435
},
{
"epoch": 0.5956006768189509,
"grad_norm": 1.3829526404505894,
"learning_rate": 1.9414776330643123e-06,
"loss": 0.4342,
"step": 440
},
{
"epoch": 0.6023688663282571,
"grad_norm": 1.5703877223063394,
"learning_rate": 1.9387903729775516e-06,
"loss": 0.4508,
"step": 445
},
{
"epoch": 0.6091370558375635,
"grad_norm": 1.6558038588752109,
"learning_rate": 1.9360447378142724e-06,
"loss": 0.4223,
"step": 450
},
{
"epoch": 0.6159052453468697,
"grad_norm": 1.4868682565255178,
"learning_rate": 1.9332408983012616e-06,
"loss": 0.4452,
"step": 455
},
{
"epoch": 0.622673434856176,
"grad_norm": 1.7326589310573917,
"learning_rate": 1.930379028784518e-06,
"loss": 0.4488,
"step": 460
},
{
"epoch": 0.6294416243654822,
"grad_norm": 1.5873739274578285,
"learning_rate": 1.9274593072184147e-06,
"loss": 0.4605,
"step": 465
},
{
"epoch": 0.6362098138747885,
"grad_norm": 1.334324454741449,
"learning_rate": 1.924481915154632e-06,
"loss": 0.4098,
"step": 470
},
{
"epoch": 0.6429780033840947,
"grad_norm": 1.4711516821260495,
"learning_rate": 1.9214470377308698e-06,
"loss": 0.4512,
"step": 475
},
{
"epoch": 0.649746192893401,
"grad_norm": 1.5422978596049304,
"learning_rate": 1.918354863659332e-06,
"loss": 0.4434,
"step": 480
},
{
"epoch": 0.6565143824027073,
"grad_norm": 1.6734420314479854,
"learning_rate": 1.915205585214998e-06,
"loss": 0.4656,
"step": 485
},
{
"epoch": 0.6632825719120136,
"grad_norm": 1.4869130554051895,
"learning_rate": 1.9119993982236605e-06,
"loss": 0.4658,
"step": 490
},
{
"epoch": 0.6700507614213198,
"grad_norm": 1.4648439577812231,
"learning_rate": 1.908736502049754e-06,
"loss": 0.4485,
"step": 495
},
{
"epoch": 0.676818950930626,
"grad_norm": 1.4924798254155862,
"learning_rate": 1.9054170995839543e-06,
"loss": 0.4434,
"step": 500
},
{
"epoch": 0.6835871404399323,
"grad_norm": 1.5718324313516365,
"learning_rate": 1.9020413972305652e-06,
"loss": 0.4141,
"step": 505
},
{
"epoch": 0.6903553299492385,
"grad_norm": 1.6927420013265664,
"learning_rate": 1.8986096048946822e-06,
"loss": 0.4188,
"step": 510
},
{
"epoch": 0.6971235194585449,
"grad_norm": 1.9638689317654023,
"learning_rate": 1.8951219359691416e-06,
"loss": 0.4085,
"step": 515
},
{
"epoch": 0.7038917089678511,
"grad_norm": 1.7008350118895288,
"learning_rate": 1.8915786073212506e-06,
"loss": 0.425,
"step": 520
},
{
"epoch": 0.7106598984771574,
"grad_norm": 1.5729307770372762,
"learning_rate": 1.887979839279303e-06,
"loss": 0.4486,
"step": 525
},
{
"epoch": 0.7174280879864636,
"grad_norm": 1.6239899543583527,
"learning_rate": 1.8843258556188783e-06,
"loss": 0.4314,
"step": 530
},
{
"epoch": 0.7241962774957699,
"grad_norm": 1.5552777279734928,
"learning_rate": 1.8806168835489277e-06,
"loss": 0.426,
"step": 535
},
{
"epoch": 0.7309644670050761,
"grad_norm": 1.4332159863043232,
"learning_rate": 1.876853153697645e-06,
"loss": 0.4297,
"step": 540
},
{
"epoch": 0.7377326565143824,
"grad_norm": 1.3873749896369056,
"learning_rate": 1.8730349000981267e-06,
"loss": 0.445,
"step": 545
},
{
"epoch": 0.7445008460236887,
"grad_norm": 1.4780965073836605,
"learning_rate": 1.8691623601738197e-06,
"loss": 0.458,
"step": 550
},
{
"epoch": 0.751269035532995,
"grad_norm": 1.5268831705229313,
"learning_rate": 1.8652357747237578e-06,
"loss": 0.4354,
"step": 555
},
{
"epoch": 0.7580372250423012,
"grad_norm": 1.777684181196839,
"learning_rate": 1.8612553879075873e-06,
"loss": 0.4521,
"step": 560
},
{
"epoch": 0.7648054145516074,
"grad_norm": 1.5369674855501423,
"learning_rate": 1.8572214472303868e-06,
"loss": 0.4403,
"step": 565
},
{
"epoch": 0.7715736040609137,
"grad_norm": 1.4081724058322669,
"learning_rate": 1.8531342035272765e-06,
"loss": 0.432,
"step": 570
},
{
"epoch": 0.7783417935702199,
"grad_norm": 1.6328252561599401,
"learning_rate": 1.8489939109478203e-06,
"loss": 0.4447,
"step": 575
},
{
"epoch": 0.7851099830795262,
"grad_norm": 1.4953859268513063,
"learning_rate": 1.8448008269402226e-06,
"loss": 0.4239,
"step": 580
},
{
"epoch": 0.7918781725888325,
"grad_norm": 1.5304225811890608,
"learning_rate": 1.840555212235321e-06,
"loss": 0.4293,
"step": 585
},
{
"epoch": 0.7986463620981388,
"grad_norm": 1.4486153048909152,
"learning_rate": 1.8362573308303717e-06,
"loss": 0.4304,
"step": 590
},
{
"epoch": 0.805414551607445,
"grad_norm": 1.4602386588010192,
"learning_rate": 1.831907449972636e-06,
"loss": 0.4472,
"step": 595
},
{
"epoch": 0.8121827411167513,
"grad_norm": 1.4965634086484065,
"learning_rate": 1.8275058401427618e-06,
"loss": 0.4409,
"step": 600
},
{
"epoch": 0.8189509306260575,
"grad_norm": 1.5872590957255623,
"learning_rate": 1.823052775037964e-06,
"loss": 0.4522,
"step": 605
},
{
"epoch": 0.8257191201353637,
"grad_norm": 1.4920706655623288,
"learning_rate": 1.818548531555006e-06,
"loss": 0.4289,
"step": 610
},
{
"epoch": 0.8324873096446701,
"grad_norm": 1.4808293841541855,
"learning_rate": 1.8139933897729832e-06,
"loss": 0.4404,
"step": 615
},
{
"epoch": 0.8392554991539763,
"grad_norm": 1.788578348932924,
"learning_rate": 1.8093876329359058e-06,
"loss": 0.4199,
"step": 620
},
{
"epoch": 0.8460236886632826,
"grad_norm": 1.5479379288714532,
"learning_rate": 1.8047315474350868e-06,
"loss": 0.4199,
"step": 625
},
{
"epoch": 0.8527918781725888,
"grad_norm": 1.4758005354779442,
"learning_rate": 1.8000254227913344e-06,
"loss": 0.4503,
"step": 630
},
{
"epoch": 0.8595600676818951,
"grad_norm": 1.5709164106643387,
"learning_rate": 1.7952695516369488e-06,
"loss": 0.436,
"step": 635
},
{
"epoch": 0.8663282571912013,
"grad_norm": 1.4869937326587306,
"learning_rate": 1.7904642296975261e-06,
"loss": 0.4416,
"step": 640
},
{
"epoch": 0.8730964467005076,
"grad_norm": 1.5189251087228999,
"learning_rate": 1.7856097557735694e-06,
"loss": 0.4367,
"step": 645
},
{
"epoch": 0.8798646362098139,
"grad_norm": 1.5390825637238734,
"learning_rate": 1.7807064317219093e-06,
"loss": 0.4385,
"step": 650
},
{
"epoch": 0.8866328257191202,
"grad_norm": 1.500939175744208,
"learning_rate": 1.7757545624369347e-06,
"loss": 0.4112,
"step": 655
},
{
"epoch": 0.8934010152284264,
"grad_norm": 1.5891603689470157,
"learning_rate": 1.770754455831633e-06,
"loss": 0.4496,
"step": 660
},
{
"epoch": 0.9001692047377327,
"grad_norm": 1.6353444834776032,
"learning_rate": 1.7657064228184444e-06,
"loss": 0.4222,
"step": 665
},
{
"epoch": 0.9069373942470389,
"grad_norm": 1.6181070946717884,
"learning_rate": 1.7606107772899285e-06,
"loss": 0.4296,
"step": 670
},
{
"epoch": 0.9137055837563451,
"grad_norm": 1.633235008124872,
"learning_rate": 1.7554678360992475e-06,
"loss": 0.4308,
"step": 675
},
{
"epoch": 0.9204737732656514,
"grad_norm": 1.4850942271908985,
"learning_rate": 1.7502779190404611e-06,
"loss": 0.425,
"step": 680
},
{
"epoch": 0.9272419627749577,
"grad_norm": 1.553921180752237,
"learning_rate": 1.745041348828645e-06,
"loss": 0.4228,
"step": 685
},
{
"epoch": 0.934010152284264,
"grad_norm": 1.5091342123918963,
"learning_rate": 1.7397584510798206e-06,
"loss": 0.4447,
"step": 690
},
{
"epoch": 0.9407783417935702,
"grad_norm": 1.5470959471430534,
"learning_rate": 1.7344295542907105e-06,
"loss": 0.4515,
"step": 695
},
{
"epoch": 0.9475465313028765,
"grad_norm": 1.584257899917925,
"learning_rate": 1.7290549898183109e-06,
"loss": 0.4276,
"step": 700
},
{
"epoch": 0.9543147208121827,
"grad_norm": 1.5896926257834991,
"learning_rate": 1.7236350918592866e-06,
"loss": 0.4429,
"step": 705
},
{
"epoch": 0.961082910321489,
"grad_norm": 1.4741862947637046,
"learning_rate": 1.7181701974291928e-06,
"loss": 0.417,
"step": 710
},
{
"epoch": 0.9678510998307953,
"grad_norm": 1.5194416495298846,
"learning_rate": 1.7126606463415164e-06,
"loss": 0.4299,
"step": 715
},
{
"epoch": 0.9746192893401016,
"grad_norm": 1.6092727303865384,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.4351,
"step": 720
},
{
"epoch": 0.9813874788494078,
"grad_norm": 1.4151798722540438,
"learning_rate": 1.701508947310077e-06,
"loss": 0.4194,
"step": 725
},
{
"epoch": 0.988155668358714,
"grad_norm": 1.5504577940513988,
"learning_rate": 1.695867492791921e-06,
"loss": 0.4269,
"step": 730
},
{
"epoch": 0.9949238578680203,
"grad_norm": 1.5420736691888262,
"learning_rate": 1.690182768424279e-06,
"loss": 0.4406,
"step": 735
},
{
"epoch": 1.0016920473773265,
"grad_norm": 1.7092059209864483,
"learning_rate": 1.6844551276899184e-06,
"loss": 0.3988,
"step": 740
},
{
"epoch": 1.0084602368866329,
"grad_norm": 1.5056006562387958,
"learning_rate": 1.6786849267401978e-06,
"loss": 0.3558,
"step": 745
},
{
"epoch": 1.015228426395939,
"grad_norm": 1.767638909765125,
"learning_rate": 1.6728725243729187e-06,
"loss": 0.3396,
"step": 750
},
{
"epoch": 1.0219966159052454,
"grad_norm": 1.6366144961909075,
"learning_rate": 1.667018282010016e-06,
"loss": 0.3238,
"step": 755
},
{
"epoch": 1.0287648054145515,
"grad_norm": 1.5321955881585025,
"learning_rate": 1.6611225636750836e-06,
"loss": 0.3422,
"step": 760
},
{
"epoch": 1.0355329949238579,
"grad_norm": 1.680806770640699,
"learning_rate": 1.6551857359707405e-06,
"loss": 0.3309,
"step": 765
},
{
"epoch": 1.0423011844331642,
"grad_norm": 1.5471253871545012,
"learning_rate": 1.649208168055833e-06,
"loss": 0.324,
"step": 770
},
{
"epoch": 1.0490693739424704,
"grad_norm": 1.4947975015447832,
"learning_rate": 1.6431902316224818e-06,
"loss": 0.3329,
"step": 775
},
{
"epoch": 1.0558375634517767,
"grad_norm": 1.6086982480144758,
"learning_rate": 1.6371323008729687e-06,
"loss": 0.3196,
"step": 780
},
{
"epoch": 1.0626057529610828,
"grad_norm": 1.580457843877164,
"learning_rate": 1.6310347524964687e-06,
"loss": 0.3262,
"step": 785
},
{
"epoch": 1.0693739424703892,
"grad_norm": 1.6566966310786062,
"learning_rate": 1.6248979656456272e-06,
"loss": 0.3438,
"step": 790
},
{
"epoch": 1.0761421319796955,
"grad_norm": 1.6737061189158637,
"learning_rate": 1.6187223219129823e-06,
"loss": 0.343,
"step": 795
},
{
"epoch": 1.0829103214890017,
"grad_norm": 1.7888201683292473,
"learning_rate": 1.6125082053072405e-06,
"loss": 0.3381,
"step": 800
},
{
"epoch": 1.089678510998308,
"grad_norm": 1.6140042434311925,
"learning_rate": 1.6062560022293933e-06,
"loss": 0.3299,
"step": 805
},
{
"epoch": 1.0964467005076142,
"grad_norm": 1.5841270947078938,
"learning_rate": 1.5999661014486955e-06,
"loss": 0.3312,
"step": 810
},
{
"epoch": 1.1032148900169205,
"grad_norm": 1.6193416591297873,
"learning_rate": 1.5936388940784883e-06,
"loss": 0.3523,
"step": 815
},
{
"epoch": 1.1099830795262267,
"grad_norm": 1.550619017130258,
"learning_rate": 1.5872747735518798e-06,
"loss": 0.3228,
"step": 820
},
{
"epoch": 1.116751269035533,
"grad_norm": 1.7508231230240185,
"learning_rate": 1.5808741355972807e-06,
"loss": 0.3324,
"step": 825
},
{
"epoch": 1.1235194585448394,
"grad_norm": 1.6682501120844164,
"learning_rate": 1.574437378213799e-06,
"loss": 0.3239,
"step": 830
},
{
"epoch": 1.1302876480541455,
"grad_norm": 1.8314883569950966,
"learning_rate": 1.5679649016464895e-06,
"loss": 0.3296,
"step": 835
},
{
"epoch": 1.1370558375634519,
"grad_norm": 1.59630852333473,
"learning_rate": 1.561457108361468e-06,
"loss": 0.3289,
"step": 840
},
{
"epoch": 1.143824027072758,
"grad_norm": 1.5574912791068813,
"learning_rate": 1.5549144030208855e-06,
"loss": 0.3346,
"step": 845
},
{
"epoch": 1.1505922165820643,
"grad_norm": 1.6059497129644689,
"learning_rate": 1.5483371924577634e-06,
"loss": 0.3381,
"step": 850
},
{
"epoch": 1.1573604060913705,
"grad_norm": 1.7201015765204164,
"learning_rate": 1.5417258856506994e-06,
"loss": 0.3271,
"step": 855
},
{
"epoch": 1.1641285956006768,
"grad_norm": 1.6165299860217694,
"learning_rate": 1.535080893698435e-06,
"loss": 0.3312,
"step": 860
},
{
"epoch": 1.1708967851099832,
"grad_norm": 1.6502865565791853,
"learning_rate": 1.5284026297942926e-06,
"loss": 0.3362,
"step": 865
},
{
"epoch": 1.1776649746192893,
"grad_norm": 1.7002581521681002,
"learning_rate": 1.5216915092004844e-06,
"loss": 0.3215,
"step": 870
},
{
"epoch": 1.1844331641285957,
"grad_norm": 1.7420409861182249,
"learning_rate": 1.5149479492222886e-06,
"loss": 0.3464,
"step": 875
},
{
"epoch": 1.1912013536379018,
"grad_norm": 1.707048857027911,
"learning_rate": 1.5081723691821026e-06,
"loss": 0.3455,
"step": 880
},
{
"epoch": 1.1979695431472082,
"grad_norm": 1.6420304279171187,
"learning_rate": 1.5013651903933683e-06,
"loss": 0.3332,
"step": 885
},
{
"epoch": 1.2047377326565143,
"grad_norm": 1.8125800875620734,
"learning_rate": 1.4945268361343746e-06,
"loss": 0.3382,
"step": 890
},
{
"epoch": 1.2115059221658206,
"grad_norm": 1.6640857688992343,
"learning_rate": 1.4876577316219374e-06,
"loss": 0.3369,
"step": 895
},
{
"epoch": 1.218274111675127,
"grad_norm": 1.6451257316850152,
"learning_rate": 1.4807583039849586e-06,
"loss": 0.3539,
"step": 900
},
{
"epoch": 1.2250423011844331,
"grad_norm": 1.6610764662131192,
"learning_rate": 1.4738289822378683e-06,
"loss": 0.3369,
"step": 905
},
{
"epoch": 1.2318104906937395,
"grad_norm": 1.74944774821556,
"learning_rate": 1.4668701972539456e-06,
"loss": 0.3414,
"step": 910
},
{
"epoch": 1.2385786802030456,
"grad_norm": 1.6889905704095276,
"learning_rate": 1.4598823817385296e-06,
"loss": 0.3462,
"step": 915
},
{
"epoch": 1.245346869712352,
"grad_norm": 1.87748003800123,
"learning_rate": 1.4528659702021106e-06,
"loss": 0.347,
"step": 920
},
{
"epoch": 1.252115059221658,
"grad_norm": 1.7676519334092846,
"learning_rate": 1.4458213989333125e-06,
"loss": 0.3344,
"step": 925
},
{
"epoch": 1.2588832487309645,
"grad_norm": 1.8625421673744915,
"learning_rate": 1.4387491059717651e-06,
"loss": 0.3259,
"step": 930
},
{
"epoch": 1.2656514382402708,
"grad_norm": 1.5243120020428504,
"learning_rate": 1.431649531080864e-06,
"loss": 0.3286,
"step": 935
},
{
"epoch": 1.272419627749577,
"grad_norm": 1.773494740626271,
"learning_rate": 1.424523115720428e-06,
"loss": 0.3366,
"step": 940
},
{
"epoch": 1.2791878172588833,
"grad_norm": 1.642323556539902,
"learning_rate": 1.4173703030192466e-06,
"loss": 0.3381,
"step": 945
},
{
"epoch": 1.2859560067681894,
"grad_norm": 1.8298130052806405,
"learning_rate": 1.4101915377475273e-06,
"loss": 0.3472,
"step": 950
},
{
"epoch": 1.2927241962774958,
"grad_norm": 1.5564171598002208,
"learning_rate": 1.4029872662892382e-06,
"loss": 0.3378,
"step": 955
},
{
"epoch": 1.299492385786802,
"grad_norm": 1.7850954669361399,
"learning_rate": 1.3957579366143519e-06,
"loss": 0.3363,
"step": 960
},
{
"epoch": 1.3062605752961083,
"grad_norm": 1.614939575319601,
"learning_rate": 1.3885039982509905e-06,
"loss": 0.3166,
"step": 965
},
{
"epoch": 1.3130287648054146,
"grad_norm": 1.7656883518798847,
"learning_rate": 1.3812259022574715e-06,
"loss": 0.3426,
"step": 970
},
{
"epoch": 1.3197969543147208,
"grad_norm": 1.4996842720105086,
"learning_rate": 1.373924101194264e-06,
"loss": 0.3377,
"step": 975
},
{
"epoch": 1.3265651438240271,
"grad_norm": 1.834478494924892,
"learning_rate": 1.3665990490958437e-06,
"loss": 0.3408,
"step": 980
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.687498482197505,
"learning_rate": 1.3592512014424644e-06,
"loss": 0.3341,
"step": 985
},
{
"epoch": 1.3401015228426396,
"grad_norm": 1.4779395904473713,
"learning_rate": 1.351881015131833e-06,
"loss": 0.3319,
"step": 990
},
{
"epoch": 1.3468697123519457,
"grad_norm": 1.5491258438326576,
"learning_rate": 1.3444889484507009e-06,
"loss": 0.3287,
"step": 995
},
{
"epoch": 1.353637901861252,
"grad_norm": 1.753194944328746,
"learning_rate": 1.3370754610463652e-06,
"loss": 0.3264,
"step": 1000
},
{
"epoch": 1.3604060913705585,
"grad_norm": 1.8046926820280387,
"learning_rate": 1.32964101389809e-06,
"loss": 0.3453,
"step": 1005
},
{
"epoch": 1.3671742808798646,
"grad_norm": 1.5582819679996394,
"learning_rate": 1.3221860692884395e-06,
"loss": 0.3185,
"step": 1010
},
{
"epoch": 1.373942470389171,
"grad_norm": 1.734615015555365,
"learning_rate": 1.3147110907745336e-06,
"loss": 0.3209,
"step": 1015
},
{
"epoch": 1.380710659898477,
"grad_norm": 1.8370614645263001,
"learning_rate": 1.3072165431592248e-06,
"loss": 0.3389,
"step": 1020
},
{
"epoch": 1.3874788494077834,
"grad_norm": 1.6416288844308489,
"learning_rate": 1.2997028924621943e-06,
"loss": 0.3465,
"step": 1025
},
{
"epoch": 1.3942470389170896,
"grad_norm": 1.7141835707827855,
"learning_rate": 1.2921706058909756e-06,
"loss": 0.3379,
"step": 1030
},
{
"epoch": 1.401015228426396,
"grad_norm": 1.7703336159956253,
"learning_rate": 1.2846201518119017e-06,
"loss": 0.3331,
"step": 1035
},
{
"epoch": 1.4077834179357023,
"grad_norm": 1.7164709273217806,
"learning_rate": 1.2770519997209835e-06,
"loss": 0.3316,
"step": 1040
},
{
"epoch": 1.4145516074450084,
"grad_norm": 1.698294459133158,
"learning_rate": 1.2694666202147137e-06,
"loss": 0.3407,
"step": 1045
},
{
"epoch": 1.4213197969543148,
"grad_norm": 1.7231395084021628,
"learning_rate": 1.2618644849608067e-06,
"loss": 0.3383,
"step": 1050
},
{
"epoch": 1.4280879864636211,
"grad_norm": 1.6225747755972384,
"learning_rate": 1.2542460666688678e-06,
"loss": 0.3272,
"step": 1055
},
{
"epoch": 1.4348561759729273,
"grad_norm": 1.6273808164138512,
"learning_rate": 1.246611839061002e-06,
"loss": 0.3307,
"step": 1060
},
{
"epoch": 1.4416243654822334,
"grad_norm": 1.640412382244569,
"learning_rate": 1.2389622768423536e-06,
"loss": 0.3326,
"step": 1065
},
{
"epoch": 1.4483925549915397,
"grad_norm": 1.5742322851792212,
"learning_rate": 1.231297855671593e-06,
"loss": 0.311,
"step": 1070
},
{
"epoch": 1.455160744500846,
"grad_norm": 1.6398609226586531,
"learning_rate": 1.223619052131337e-06,
"loss": 0.3417,
"step": 1075
},
{
"epoch": 1.4619289340101522,
"grad_norm": 1.5919173149091699,
"learning_rate": 1.2159263436985136e-06,
"loss": 0.3311,
"step": 1080
},
{
"epoch": 1.4686971235194586,
"grad_norm": 1.7663834242591079,
"learning_rate": 1.2082202087146751e-06,
"loss": 0.3404,
"step": 1085
},
{
"epoch": 1.475465313028765,
"grad_norm": 1.6365555810498733,
"learning_rate": 1.2005011263562513e-06,
"loss": 0.3211,
"step": 1090
},
{
"epoch": 1.482233502538071,
"grad_norm": 1.7543606709062083,
"learning_rate": 1.1927695766047538e-06,
"loss": 0.3345,
"step": 1095
},
{
"epoch": 1.4890016920473772,
"grad_norm": 1.6454656998875175,
"learning_rate": 1.185026040216934e-06,
"loss": 0.329,
"step": 1100
},
{
"epoch": 1.4957698815566836,
"grad_norm": 1.6242171627203073,
"learning_rate": 1.1772709986948827e-06,
"loss": 0.3274,
"step": 1105
},
{
"epoch": 1.50253807106599,
"grad_norm": 2.0678978985333596,
"learning_rate": 1.1695049342560967e-06,
"loss": 0.3544,
"step": 1110
},
{
"epoch": 1.509306260575296,
"grad_norm": 1.7262447342718426,
"learning_rate": 1.161728329803488e-06,
"loss": 0.341,
"step": 1115
},
{
"epoch": 1.5160744500846024,
"grad_norm": 1.6907890988982508,
"learning_rate": 1.153941668895361e-06,
"loss": 0.3292,
"step": 1120
},
{
"epoch": 1.5228426395939088,
"grad_norm": 1.6131818091865402,
"learning_rate": 1.1461454357153406e-06,
"loss": 0.3273,
"step": 1125
},
{
"epoch": 1.5296108291032149,
"grad_norm": 1.970023298749538,
"learning_rate": 1.1383401150422668e-06,
"loss": 0.3389,
"step": 1130
},
{
"epoch": 1.536379018612521,
"grad_norm": 1.7477654667475575,
"learning_rate": 1.1305261922200517e-06,
"loss": 0.336,
"step": 1135
},
{
"epoch": 1.5431472081218274,
"grad_norm": 1.8260233194529998,
"learning_rate": 1.1227041531274977e-06,
"loss": 0.3394,
"step": 1140
},
{
"epoch": 1.5499153976311337,
"grad_norm": 1.533061734694472,
"learning_rate": 1.1148744841480873e-06,
"loss": 0.3274,
"step": 1145
},
{
"epoch": 1.5566835871404399,
"grad_norm": 1.769403392681689,
"learning_rate": 1.1070376721397372e-06,
"loss": 0.3438,
"step": 1150
},
{
"epoch": 1.5634517766497462,
"grad_norm": 1.6263236224467823,
"learning_rate": 1.0991942044045274e-06,
"loss": 0.3437,
"step": 1155
},
{
"epoch": 1.5702199661590526,
"grad_norm": 1.9050438393576472,
"learning_rate": 1.0913445686583974e-06,
"loss": 0.3208,
"step": 1160
},
{
"epoch": 1.5769881556683587,
"grad_norm": 1.7107041476766611,
"learning_rate": 1.0834892530008214e-06,
"loss": 0.3192,
"step": 1165
},
{
"epoch": 1.5837563451776648,
"grad_norm": 1.5694513216338701,
"learning_rate": 1.0756287458844569e-06,
"loss": 0.3339,
"step": 1170
},
{
"epoch": 1.5905245346869712,
"grad_norm": 1.5469737030155013,
"learning_rate": 1.0677635360847722e-06,
"loss": 0.3323,
"step": 1175
},
{
"epoch": 1.5972927241962775,
"grad_norm": 1.652088656809816,
"learning_rate": 1.0598941126696543e-06,
"loss": 0.3331,
"step": 1180
},
{
"epoch": 1.6040609137055837,
"grad_norm": 1.696570545718917,
"learning_rate": 1.0520209649689977e-06,
"loss": 0.3258,
"step": 1185
},
{
"epoch": 1.61082910321489,
"grad_norm": 1.759912674332406,
"learning_rate": 1.0441445825442771e-06,
"loss": 0.3379,
"step": 1190
},
{
"epoch": 1.6175972927241964,
"grad_norm": 1.7089027285892213,
"learning_rate": 1.0362654551581062e-06,
"loss": 0.3449,
"step": 1195
},
{
"epoch": 1.6243654822335025,
"grad_norm": 1.5565566000298192,
"learning_rate": 1.0283840727437832e-06,
"loss": 0.338,
"step": 1200
},
{
"epoch": 1.6311336717428087,
"grad_norm": 1.6464360907618232,
"learning_rate": 1.0205009253748272e-06,
"loss": 0.3327,
"step": 1205
},
{
"epoch": 1.637901861252115,
"grad_norm": 1.6450744022431256,
"learning_rate": 1.0126165032345037e-06,
"loss": 0.3411,
"step": 1210
},
{
"epoch": 1.6446700507614214,
"grad_norm": 1.8547448481156448,
"learning_rate": 1.0047312965853454e-06,
"loss": 0.3383,
"step": 1215
},
{
"epoch": 1.6514382402707275,
"grad_norm": 1.6495177484989427,
"learning_rate": 9.968457957386662e-07,
"loss": 0.3263,
"step": 1220
},
{
"epoch": 1.6582064297800339,
"grad_norm": 1.6895491290871958,
"learning_rate": 9.88960491024074e-07,
"loss": 0.3325,
"step": 1225
},
{
"epoch": 1.6649746192893402,
"grad_norm": 1.627812788457914,
"learning_rate": 9.810758727589813e-07,
"loss": 0.3291,
"step": 1230
},
{
"epoch": 1.6717428087986463,
"grad_norm": 1.7996421239110534,
"learning_rate": 9.731924312181148e-07,
"loss": 0.3354,
"step": 1235
},
{
"epoch": 1.6785109983079525,
"grad_norm": 1.8622024185022505,
"learning_rate": 9.653106566030328e-07,
"loss": 0.3459,
"step": 1240
},
{
"epoch": 1.6852791878172588,
"grad_norm": 1.7964469673524814,
"learning_rate": 9.574310390116418e-07,
"loss": 0.3205,
"step": 1245
},
{
"epoch": 1.6920473773265652,
"grad_norm": 1.6712003471107053,
"learning_rate": 9.495540684077214e-07,
"loss": 0.3368,
"step": 1250
},
{
"epoch": 1.6988155668358713,
"grad_norm": 1.6956557822203489,
"learning_rate": 9.41680234590459e-07,
"loss": 0.3249,
"step": 1255
},
{
"epoch": 1.7055837563451777,
"grad_norm": 1.694408958921992,
"learning_rate": 9.338100271639931e-07,
"loss": 0.3498,
"step": 1260
},
{
"epoch": 1.712351945854484,
"grad_norm": 1.6701624264401975,
"learning_rate": 9.25943935506969e-07,
"loss": 0.3257,
"step": 1265
},
{
"epoch": 1.7191201353637902,
"grad_norm": 1.7930456020095138,
"learning_rate": 9.180824487421076e-07,
"loss": 0.3261,
"step": 1270
},
{
"epoch": 1.7258883248730963,
"grad_norm": 1.5441215575090625,
"learning_rate": 9.102260557057935e-07,
"loss": 0.336,
"step": 1275
},
{
"epoch": 1.7326565143824029,
"grad_norm": 1.64455257154265,
"learning_rate": 9.023752449176772e-07,
"loss": 0.3269,
"step": 1280
},
{
"epoch": 1.739424703891709,
"grad_norm": 1.5448960299784444,
"learning_rate": 8.945305045502984e-07,
"loss": 0.3288,
"step": 1285
},
{
"epoch": 1.7461928934010151,
"grad_norm": 1.6868399015307785,
"learning_rate": 8.866923223987302e-07,
"loss": 0.3196,
"step": 1290
},
{
"epoch": 1.7529610829103215,
"grad_norm": 1.6844238790169903,
"learning_rate": 8.788611858502489e-07,
"loss": 0.3524,
"step": 1295
},
{
"epoch": 1.7597292724196278,
"grad_norm": 1.536238947791522,
"learning_rate": 8.710375818540279e-07,
"loss": 0.323,
"step": 1300
},
{
"epoch": 1.766497461928934,
"grad_norm": 1.6594155852789063,
"learning_rate": 8.632219968908555e-07,
"loss": 0.3388,
"step": 1305
},
{
"epoch": 1.77326565143824,
"grad_norm": 1.8357910067878633,
"learning_rate": 8.554149169428892e-07,
"loss": 0.319,
"step": 1310
},
{
"epoch": 1.7800338409475467,
"grad_norm": 1.770060103182254,
"learning_rate": 8.476168274634341e-07,
"loss": 0.3533,
"step": 1315
},
{
"epoch": 1.7868020304568528,
"grad_norm": 1.6650622145152638,
"learning_rate": 8.398282133467578e-07,
"loss": 0.3313,
"step": 1320
},
{
"epoch": 1.793570219966159,
"grad_norm": 1.6934630286297077,
"learning_rate": 8.320495588979377e-07,
"loss": 0.3273,
"step": 1325
},
{
"epoch": 1.8003384094754653,
"grad_norm": 1.6802792500913968,
"learning_rate": 8.242813478027491e-07,
"loss": 0.3425,
"step": 1330
},
{
"epoch": 1.8071065989847717,
"grad_norm": 1.6829953200882048,
"learning_rate": 8.165240630975861e-07,
"loss": 0.351,
"step": 1335
},
{
"epoch": 1.8138747884940778,
"grad_norm": 1.7159022109793864,
"learning_rate": 8.087781871394279e-07,
"loss": 0.3211,
"step": 1340
},
{
"epoch": 1.8206429780033841,
"grad_norm": 1.6151668145042095,
"learning_rate": 8.010442015758445e-07,
"loss": 0.316,
"step": 1345
},
{
"epoch": 1.8274111675126905,
"grad_norm": 1.6174069637779651,
"learning_rate": 7.93322587315047e-07,
"loss": 0.332,
"step": 1350
},
{
"epoch": 1.8341793570219966,
"grad_norm": 1.6680877411690365,
"learning_rate": 7.856138244959849e-07,
"loss": 0.3243,
"step": 1355
},
{
"epoch": 1.8409475465313028,
"grad_norm": 1.7974753943714166,
"learning_rate": 7.7791839245849e-07,
"loss": 0.3294,
"step": 1360
},
{
"epoch": 1.8477157360406091,
"grad_norm": 1.646510776278098,
"learning_rate": 7.702367697134701e-07,
"loss": 0.3304,
"step": 1365
},
{
"epoch": 1.8544839255499155,
"grad_norm": 1.7474734525141256,
"learning_rate": 7.625694339131563e-07,
"loss": 0.3588,
"step": 1370
},
{
"epoch": 1.8612521150592216,
"grad_norm": 1.658323098173442,
"learning_rate": 7.549168618213994e-07,
"loss": 0.3362,
"step": 1375
},
{
"epoch": 1.868020304568528,
"grad_norm": 1.67431452331962,
"learning_rate": 7.472795292840269e-07,
"loss": 0.3427,
"step": 1380
},
{
"epoch": 1.8747884940778343,
"grad_norm": 1.6444897700367918,
"learning_rate": 7.396579111992522e-07,
"loss": 0.3552,
"step": 1385
},
{
"epoch": 1.8815566835871405,
"grad_norm": 1.8283646757854843,
"learning_rate": 7.32052481488147e-07,
"loss": 0.3312,
"step": 1390
},
{
"epoch": 1.8883248730964466,
"grad_norm": 1.755531433154494,
"learning_rate": 7.244637130651693e-07,
"loss": 0.3366,
"step": 1395
},
{
"epoch": 1.895093062605753,
"grad_norm": 1.634727248259316,
"learning_rate": 7.168920778087601e-07,
"loss": 0.3323,
"step": 1400
},
{
"epoch": 1.9018612521150593,
"grad_norm": 1.6187492591322967,
"learning_rate": 7.093380465320008e-07,
"loss": 0.345,
"step": 1405
},
{
"epoch": 1.9086294416243654,
"grad_norm": 2.046018137628728,
"learning_rate": 7.018020889533347e-07,
"loss": 0.3316,
"step": 1410
},
{
"epoch": 1.9153976311336718,
"grad_norm": 1.7585386765032196,
"learning_rate": 6.942846736673633e-07,
"loss": 0.3404,
"step": 1415
},
{
"epoch": 1.9221658206429781,
"grad_norm": 1.600209455454873,
"learning_rate": 6.867862681157066e-07,
"loss": 0.3319,
"step": 1420
},
{
"epoch": 1.9289340101522843,
"grad_norm": 1.6939356089504074,
"learning_rate": 6.793073385579372e-07,
"loss": 0.3353,
"step": 1425
},
{
"epoch": 1.9357021996615904,
"grad_norm": 1.730441931386039,
"learning_rate": 6.718483500425866e-07,
"loss": 0.3448,
"step": 1430
},
{
"epoch": 1.9424703891708968,
"grad_norm": 1.602246560839999,
"learning_rate": 6.644097663782308e-07,
"loss": 0.3207,
"step": 1435
},
{
"epoch": 1.9492385786802031,
"grad_norm": 1.5255973835779064,
"learning_rate": 6.569920501046473e-07,
"loss": 0.3211,
"step": 1440
},
{
"epoch": 1.9560067681895092,
"grad_norm": 1.787975436248423,
"learning_rate": 6.495956624640558e-07,
"loss": 0.3331,
"step": 1445
},
{
"epoch": 1.9627749576988156,
"grad_norm": 1.635182796509772,
"learning_rate": 6.422210633724359e-07,
"loss": 0.3151,
"step": 1450
},
{
"epoch": 1.969543147208122,
"grad_norm": 1.745570057757413,
"learning_rate": 6.348687113909303e-07,
"loss": 0.3166,
"step": 1455
},
{
"epoch": 1.976311336717428,
"grad_norm": 1.666654337456338,
"learning_rate": 6.275390636973315e-07,
"loss": 0.3287,
"step": 1460
},
{
"epoch": 1.9830795262267342,
"grad_norm": 1.7830502067885774,
"learning_rate": 6.20232576057651e-07,
"loss": 0.3374,
"step": 1465
},
{
"epoch": 1.9898477157360406,
"grad_norm": 1.6063864832196357,
"learning_rate": 6.129497027977828e-07,
"loss": 0.3333,
"step": 1470
},
{
"epoch": 1.996615905245347,
"grad_norm": 1.84518355863186,
"learning_rate": 6.05690896775251e-07,
"loss": 0.3338,
"step": 1475
},
{
"epoch": 2.003384094754653,
"grad_norm": 1.996243516154583,
"learning_rate": 5.984566093510508e-07,
"loss": 0.3076,
"step": 1480
},
{
"epoch": 2.010152284263959,
"grad_norm": 1.6441418715481781,
"learning_rate": 5.91247290361582e-07,
"loss": 0.2734,
"step": 1485
},
{
"epoch": 2.0169204737732658,
"grad_norm": 1.7461011995129512,
"learning_rate": 5.840633880906787e-07,
"loss": 0.2483,
"step": 1490
},
{
"epoch": 2.023688663282572,
"grad_norm": 1.780208702006572,
"learning_rate": 5.769053492417341e-07,
"loss": 0.2597,
"step": 1495
},
{
"epoch": 2.030456852791878,
"grad_norm": 1.6957870173539042,
"learning_rate": 5.69773618909923e-07,
"loss": 0.2534,
"step": 1500
},
{
"epoch": 2.0372250423011846,
"grad_norm": 1.7614796776092347,
"learning_rate": 5.62668640554526e-07,
"loss": 0.2684,
"step": 1505
},
{
"epoch": 2.0439932318104908,
"grad_norm": 1.8361067674268434,
"learning_rate": 5.55590855971356e-07,
"loss": 0.2645,
"step": 1510
},
{
"epoch": 2.050761421319797,
"grad_norm": 1.803442468077519,
"learning_rate": 5.485407052652844e-07,
"loss": 0.2637,
"step": 1515
},
{
"epoch": 2.057529610829103,
"grad_norm": 1.6146644628968327,
"learning_rate": 5.415186268228762e-07,
"loss": 0.2657,
"step": 1520
},
{
"epoch": 2.0642978003384096,
"grad_norm": 2.478325459542897,
"learning_rate": 5.3452505728513e-07,
"loss": 0.2528,
"step": 1525
},
{
"epoch": 2.0710659898477157,
"grad_norm": 1.902805376038303,
"learning_rate": 5.275604315203292e-07,
"loss": 0.2625,
"step": 1530
},
{
"epoch": 2.077834179357022,
"grad_norm": 1.707278374587984,
"learning_rate": 5.206251825969973e-07,
"loss": 0.2557,
"step": 1535
},
{
"epoch": 2.0846023688663284,
"grad_norm": 1.7063185136076124,
"learning_rate": 5.137197417569738e-07,
"loss": 0.2397,
"step": 1540
},
{
"epoch": 2.0913705583756346,
"grad_norm": 1.8210841183819555,
"learning_rate": 5.068445383885961e-07,
"loss": 0.2511,
"step": 1545
},
{
"epoch": 2.0981387478849407,
"grad_norm": 1.8554223864028092,
"learning_rate": 5.000000000000002e-07,
"loss": 0.2553,
"step": 1550
},
{
"epoch": 2.104906937394247,
"grad_norm": 1.7188069419320222,
"learning_rate": 4.931865521925383e-07,
"loss": 0.2454,
"step": 1555
},
{
"epoch": 2.1116751269035534,
"grad_norm": 1.8040405864378724,
"learning_rate": 4.864046186343139e-07,
"loss": 0.2608,
"step": 1560
},
{
"epoch": 2.1184433164128595,
"grad_norm": 1.9594461805438623,
"learning_rate": 4.796546210338387e-07,
"loss": 0.262,
"step": 1565
},
{
"epoch": 2.1252115059221657,
"grad_norm": 1.6763587503898223,
"learning_rate": 4.7293697911380846e-07,
"loss": 0.2622,
"step": 1570
},
{
"epoch": 2.1319796954314723,
"grad_norm": 1.966220531714013,
"learning_rate": 4.662521105850055e-07,
"loss": 0.2512,
"step": 1575
},
{
"epoch": 2.1387478849407784,
"grad_norm": 1.755764907373717,
"learning_rate": 4.596004311203242e-07,
"loss": 0.249,
"step": 1580
},
{
"epoch": 2.1455160744500845,
"grad_norm": 1.8223089125739613,
"learning_rate": 4.5298235432892575e-07,
"loss": 0.2465,
"step": 1585
},
{
"epoch": 2.152284263959391,
"grad_norm": 1.7206718832666286,
"learning_rate": 4.463982917305155e-07,
"loss": 0.2458,
"step": 1590
},
{
"epoch": 2.1590524534686972,
"grad_norm": 1.8397041856544176,
"learning_rate": 4.398486527297595e-07,
"loss": 0.2577,
"step": 1595
},
{
"epoch": 2.1658206429780034,
"grad_norm": 1.8789732250278144,
"learning_rate": 4.3333384459082247e-07,
"loss": 0.2547,
"step": 1600
},
{
"epoch": 2.1725888324873095,
"grad_norm": 1.9842491893922207,
"learning_rate": 4.268542724120475e-07,
"loss": 0.2407,
"step": 1605
},
{
"epoch": 2.179357021996616,
"grad_norm": 1.6840880785917902,
"learning_rate": 4.204103391007623e-07,
"loss": 0.2453,
"step": 1610
},
{
"epoch": 2.186125211505922,
"grad_norm": 1.7102283764854913,
"learning_rate": 4.140024453482307e-07,
"loss": 0.2531,
"step": 1615
},
{
"epoch": 2.1928934010152283,
"grad_norm": 1.9705580867409258,
"learning_rate": 4.076309896047336e-07,
"loss": 0.239,
"step": 1620
},
{
"epoch": 2.199661590524535,
"grad_norm": 1.793405155653999,
"learning_rate": 4.012963680547946e-07,
"loss": 0.2565,
"step": 1625
},
{
"epoch": 2.206429780033841,
"grad_norm": 1.720247219871775,
"learning_rate": 3.949989745925437e-07,
"loss": 0.2675,
"step": 1630
},
{
"epoch": 2.213197969543147,
"grad_norm": 1.6800247601578724,
"learning_rate": 3.8873920079722644e-07,
"loss": 0.2568,
"step": 1635
},
{
"epoch": 2.2199661590524533,
"grad_norm": 1.8843811057218098,
"learning_rate": 3.8251743590885256e-07,
"loss": 0.2431,
"step": 1640
},
{
"epoch": 2.22673434856176,
"grad_norm": 1.6769092970745525,
"learning_rate": 3.7633406680399416e-07,
"loss": 0.2513,
"step": 1645
},
{
"epoch": 2.233502538071066,
"grad_norm": 1.9149891397189558,
"learning_rate": 3.701894779717286e-07,
"loss": 0.2441,
"step": 1650
},
{
"epoch": 2.240270727580372,
"grad_norm": 1.8352949058879366,
"learning_rate": 3.640840514897322e-07,
"loss": 0.2512,
"step": 1655
},
{
"epoch": 2.2470389170896787,
"grad_norm": 1.7191969563079663,
"learning_rate": 3.580181670005182e-07,
"loss": 0.2514,
"step": 1660
},
{
"epoch": 2.253807106598985,
"grad_norm": 1.7862905463536183,
"learning_rate": 3.519922016878356e-07,
"loss": 0.2523,
"step": 1665
},
{
"epoch": 2.260575296108291,
"grad_norm": 1.7759093954687228,
"learning_rate": 3.460065302532108e-07,
"loss": 0.2455,
"step": 1670
},
{
"epoch": 2.267343485617597,
"grad_norm": 1.8251542509381542,
"learning_rate": 3.400615248926506e-07,
"loss": 0.2628,
"step": 1675
},
{
"epoch": 2.2741116751269037,
"grad_norm": 1.8854432012171292,
"learning_rate": 3.341575552734978e-07,
"loss": 0.2496,
"step": 1680
},
{
"epoch": 2.28087986463621,
"grad_norm": 1.7098687857977533,
"learning_rate": 3.2829498851144577e-07,
"loss": 0.2486,
"step": 1685
},
{
"epoch": 2.287648054145516,
"grad_norm": 1.9407175434129038,
"learning_rate": 3.224741891477095e-07,
"loss": 0.254,
"step": 1690
},
{
"epoch": 2.2944162436548226,
"grad_norm": 1.8048961105532955,
"learning_rate": 3.166955191263587e-07,
"loss": 0.2596,
"step": 1695
},
{
"epoch": 2.3011844331641287,
"grad_norm": 1.9500522244148442,
"learning_rate": 3.109593377718116e-07,
"loss": 0.2674,
"step": 1700
},
{
"epoch": 2.307952622673435,
"grad_norm": 1.7430933854279251,
"learning_rate": 3.0526600176649153e-07,
"loss": 0.2426,
"step": 1705
},
{
"epoch": 2.314720812182741,
"grad_norm": 1.7044330031218278,
"learning_rate": 2.9961586512864944e-07,
"loss": 0.2545,
"step": 1710
},
{
"epoch": 2.3214890016920475,
"grad_norm": 1.750375443426282,
"learning_rate": 2.9400927919034726e-07,
"loss": 0.2408,
"step": 1715
},
{
"epoch": 2.3282571912013537,
"grad_norm": 1.8950517301756609,
"learning_rate": 2.884465925756159e-07,
"loss": 0.2489,
"step": 1720
},
{
"epoch": 2.33502538071066,
"grad_norm": 1.8482803305522688,
"learning_rate": 2.829281511787739e-07,
"loss": 0.2625,
"step": 1725
},
{
"epoch": 2.3417935702199664,
"grad_norm": 1.9547518786899865,
"learning_rate": 2.774542981429214e-07,
"loss": 0.2539,
"step": 1730
},
{
"epoch": 2.3485617597292725,
"grad_norm": 1.681029190992801,
"learning_rate": 2.7202537383860193e-07,
"loss": 0.2569,
"step": 1735
},
{
"epoch": 2.3553299492385786,
"grad_norm": 1.8386556962717924,
"learning_rate": 2.6664171584263927e-07,
"loss": 0.2738,
"step": 1740
},
{
"epoch": 2.3620981387478848,
"grad_norm": 1.8580649687840483,
"learning_rate": 2.613036589171443e-07,
"loss": 0.2587,
"step": 1745
},
{
"epoch": 2.3688663282571913,
"grad_norm": 1.8932490915132734,
"learning_rate": 2.560115349887013e-07,
"loss": 0.2597,
"step": 1750
},
{
"epoch": 2.3756345177664975,
"grad_norm": 1.8291342664374226,
"learning_rate": 2.5076567312772636e-07,
"loss": 0.2514,
"step": 1755
},
{
"epoch": 2.3824027072758036,
"grad_norm": 1.8170289718176122,
"learning_rate": 2.4556639952800784e-07,
"loss": 0.2508,
"step": 1760
},
{
"epoch": 2.38917089678511,
"grad_norm": 1.770454255610108,
"learning_rate": 2.4041403748642085e-07,
"loss": 0.2607,
"step": 1765
},
{
"epoch": 2.3959390862944163,
"grad_norm": 1.920241090922087,
"learning_rate": 2.353089073828255e-07,
"loss": 0.2497,
"step": 1770
},
{
"epoch": 2.4027072758037225,
"grad_norm": 1.9199730244133009,
"learning_rate": 2.302513266601449e-07,
"loss": 0.2534,
"step": 1775
},
{
"epoch": 2.4094754653130286,
"grad_norm": 1.770783110899942,
"learning_rate": 2.2524160980462747e-07,
"loss": 0.2577,
"step": 1780
},
{
"epoch": 2.416243654822335,
"grad_norm": 1.779049283308491,
"learning_rate": 2.2028006832628876e-07,
"loss": 0.2648,
"step": 1785
},
{
"epoch": 2.4230118443316413,
"grad_norm": 1.8951806130885847,
"learning_rate": 2.1536701073954556e-07,
"loss": 0.2552,
"step": 1790
},
{
"epoch": 2.4297800338409474,
"grad_norm": 1.890797368455652,
"learning_rate": 2.1050274254402812e-07,
"loss": 0.2533,
"step": 1795
},
{
"epoch": 2.436548223350254,
"grad_norm": 1.8879449556370194,
"learning_rate": 2.0568756620558736e-07,
"loss": 0.2621,
"step": 1800
},
{
"epoch": 2.44331641285956,
"grad_norm": 1.9716199735140634,
"learning_rate": 2.0092178113748348e-07,
"loss": 0.251,
"step": 1805
},
{
"epoch": 2.4500846023688663,
"grad_norm": 1.664484183169836,
"learning_rate": 1.962056836817718e-07,
"loss": 0.2451,
"step": 1810
},
{
"epoch": 2.4568527918781724,
"grad_norm": 1.7827762385482862,
"learning_rate": 1.9153956709087337e-07,
"loss": 0.2561,
"step": 1815
},
{
"epoch": 2.463620981387479,
"grad_norm": 1.776234828523817,
"learning_rate": 1.8692372150934111e-07,
"loss": 0.2396,
"step": 1820
},
{
"epoch": 2.470389170896785,
"grad_norm": 1.745340346927932,
"learning_rate": 1.8235843395581795e-07,
"loss": 0.2494,
"step": 1825
},
{
"epoch": 2.4771573604060912,
"grad_norm": 1.8045126818551005,
"learning_rate": 1.7784398830519e-07,
"loss": 0.2522,
"step": 1830
},
{
"epoch": 2.483925549915398,
"grad_norm": 1.89151528655817,
"learning_rate": 1.733806652709351e-07,
"loss": 0.2528,
"step": 1835
},
{
"epoch": 2.490693739424704,
"grad_norm": 1.7633235435458878,
"learning_rate": 1.68968742387667e-07,
"loss": 0.2518,
"step": 1840
},
{
"epoch": 2.49746192893401,
"grad_norm": 1.7902731019901965,
"learning_rate": 1.6460849399387845e-07,
"loss": 0.2552,
"step": 1845
},
{
"epoch": 2.504230118443316,
"grad_norm": 1.8529889195491502,
"learning_rate": 1.6030019121488226e-07,
"loss": 0.2588,
"step": 1850
},
{
"epoch": 2.510998307952623,
"grad_norm": 1.7334848080126557,
"learning_rate": 1.5604410194595264e-07,
"loss": 0.2495,
"step": 1855
},
{
"epoch": 2.517766497461929,
"grad_norm": 1.8562652968231852,
"learning_rate": 1.5184049083566687e-07,
"loss": 0.2468,
"step": 1860
},
{
"epoch": 2.524534686971235,
"grad_norm": 1.7483934506844119,
"learning_rate": 1.476896192694499e-07,
"loss": 0.2527,
"step": 1865
},
{
"epoch": 2.5313028764805416,
"grad_norm": 1.8012786807901255,
"learning_rate": 1.4359174535331998e-07,
"loss": 0.2495,
"step": 1870
},
{
"epoch": 2.5380710659898478,
"grad_norm": 1.882303438003816,
"learning_rate": 1.3954712389783996e-07,
"loss": 0.2633,
"step": 1875
},
{
"epoch": 2.544839255499154,
"grad_norm": 1.9004686167123348,
"learning_rate": 1.3555600640227283e-07,
"loss": 0.2482,
"step": 1880
},
{
"epoch": 2.55160744500846,
"grad_norm": 1.9008963364397549,
"learning_rate": 1.3161864103894361e-07,
"loss": 0.2601,
"step": 1885
},
{
"epoch": 2.5583756345177666,
"grad_norm": 1.7910128571435278,
"learning_rate": 1.2773527263780626e-07,
"loss": 0.2483,
"step": 1890
},
{
"epoch": 2.5651438240270727,
"grad_norm": 1.7108145043264886,
"learning_rate": 1.23906142671222e-07,
"loss": 0.2468,
"step": 1895
},
{
"epoch": 2.571912013536379,
"grad_norm": 1.8225256818804154,
"learning_rate": 1.2013148923894212e-07,
"loss": 0.2543,
"step": 1900
},
{
"epoch": 2.5786802030456855,
"grad_norm": 1.79411507183842,
"learning_rate": 1.1641154705330502e-07,
"loss": 0.2409,
"step": 1905
},
{
"epoch": 2.5854483925549916,
"grad_norm": 1.7242811103863596,
"learning_rate": 1.127465474246384e-07,
"loss": 0.2571,
"step": 1910
},
{
"epoch": 2.5922165820642977,
"grad_norm": 1.9407815471081673,
"learning_rate": 1.0913671824687953e-07,
"loss": 0.251,
"step": 1915
},
{
"epoch": 2.598984771573604,
"grad_norm": 1.6296473201174055,
"learning_rate": 1.0558228398340186e-07,
"loss": 0.2388,
"step": 1920
},
{
"epoch": 2.6057529610829104,
"grad_norm": 1.6835802138496847,
"learning_rate": 1.020834656530597e-07,
"loss": 0.2427,
"step": 1925
},
{
"epoch": 2.6125211505922166,
"grad_norm": 1.8446991621937052,
"learning_rate": 9.86404808164426e-08,
"loss": 0.24,
"step": 1930
},
{
"epoch": 2.6192893401015227,
"grad_norm": 1.762353539890661,
"learning_rate": 9.525354356235004e-08,
"loss": 0.24,
"step": 1935
},
{
"epoch": 2.6260575296108293,
"grad_norm": 1.9831610169369156,
"learning_rate": 9.192286449447684e-08,
"loss": 0.2451,
"step": 1940
},
{
"epoch": 2.6328257191201354,
"grad_norm": 1.6135822284867207,
"learning_rate": 8.864865071831829e-08,
"loss": 0.2534,
"step": 1945
},
{
"epoch": 2.6395939086294415,
"grad_norm": 1.6037304176671419,
"learning_rate": 8.543110582829272e-08,
"loss": 0.243,
"step": 1950
},
{
"epoch": 2.6463620981387477,
"grad_norm": 1.7531920017607536,
"learning_rate": 8.227042989508104e-08,
"loss": 0.2482,
"step": 1955
},
{
"epoch": 2.6531302876480543,
"grad_norm": 1.723522478352686,
"learning_rate": 7.916681945318648e-08,
"loss": 0.2477,
"step": 1960
},
{
"epoch": 2.6598984771573604,
"grad_norm": 1.7614779368425475,
"learning_rate": 7.612046748871326e-08,
"loss": 0.253,
"step": 1965
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.887871045430256,
"learning_rate": 7.313156342736738e-08,
"loss": 0.2508,
"step": 1970
},
{
"epoch": 2.673434856175973,
"grad_norm": 1.9069055836407365,
"learning_rate": 7.020029312267727e-08,
"loss": 0.267,
"step": 1975
},
{
"epoch": 2.6802030456852792,
"grad_norm": 1.8939133923811016,
"learning_rate": 6.732683884443735e-08,
"loss": 0.2692,
"step": 1980
},
{
"epoch": 2.6869712351945854,
"grad_norm": 1.8130887567930416,
"learning_rate": 6.451137926737415e-08,
"loss": 0.2527,
"step": 1985
},
{
"epoch": 2.6937394247038915,
"grad_norm": 2.101117018466412,
"learning_rate": 6.175408946003702e-08,
"loss": 0.2497,
"step": 1990
},
{
"epoch": 2.700507614213198,
"grad_norm": 1.654995259770582,
"learning_rate": 5.90551408739105e-08,
"loss": 0.2306,
"step": 1995
},
{
"epoch": 2.707275803722504,
"grad_norm": 1.83930610346023,
"learning_rate": 5.641470133275472e-08,
"loss": 0.2573,
"step": 2000
},
{
"epoch": 2.7140439932318103,
"grad_norm": 1.810068940701123,
"learning_rate": 5.3832935022169015e-08,
"loss": 0.236,
"step": 2005
},
{
"epoch": 2.720812182741117,
"grad_norm": 1.8885737097550355,
"learning_rate": 5.1310002479383665e-08,
"loss": 0.2543,
"step": 2010
},
{
"epoch": 2.727580372250423,
"grad_norm": 1.722305340814142,
"learning_rate": 4.884606058327612e-08,
"loss": 0.2472,
"step": 2015
},
{
"epoch": 2.734348561759729,
"grad_norm": 1.9391080536086678,
"learning_rate": 4.644126254461755e-08,
"loss": 0.259,
"step": 2020
},
{
"epoch": 2.7411167512690353,
"grad_norm": 1.6499649417235132,
"learning_rate": 4.409575789654474e-08,
"loss": 0.2566,
"step": 2025
},
{
"epoch": 2.747884940778342,
"grad_norm": 1.6917223527602334,
"learning_rate": 4.180969248526334e-08,
"loss": 0.2626,
"step": 2030
},
{
"epoch": 2.754653130287648,
"grad_norm": 1.598065120848905,
"learning_rate": 3.958320846097685e-08,
"loss": 0.2428,
"step": 2035
},
{
"epoch": 2.761421319796954,
"grad_norm": 1.9249388241762717,
"learning_rate": 3.7416444269050326e-08,
"loss": 0.2589,
"step": 2040
},
{
"epoch": 2.7681895093062607,
"grad_norm": 1.7249000353236292,
"learning_rate": 3.530953464139919e-08,
"loss": 0.2381,
"step": 2045
},
{
"epoch": 2.774957698815567,
"grad_norm": 1.8216343143692146,
"learning_rate": 3.3262610588113305e-08,
"loss": 0.2519,
"step": 2050
},
{
"epoch": 2.781725888324873,
"grad_norm": 1.7835249600977168,
"learning_rate": 3.127579938930891e-08,
"loss": 0.2498,
"step": 2055
},
{
"epoch": 2.788494077834179,
"grad_norm": 1.9602449026673896,
"learning_rate": 2.934922458721578e-08,
"loss": 0.2609,
"step": 2060
},
{
"epoch": 2.7952622673434857,
"grad_norm": 1.7932782220063819,
"learning_rate": 2.748300597849429e-08,
"loss": 0.2463,
"step": 2065
},
{
"epoch": 2.802030456852792,
"grad_norm": 1.8373837478135608,
"learning_rate": 2.5677259606786682e-08,
"loss": 0.2587,
"step": 2070
},
{
"epoch": 2.808798646362098,
"grad_norm": 1.7579583567771166,
"learning_rate": 2.393209775550087e-08,
"loss": 0.2409,
"step": 2075
},
{
"epoch": 2.8155668358714045,
"grad_norm": 1.7193835474755472,
"learning_rate": 2.224762894082921e-08,
"loss": 0.2558,
"step": 2080
},
{
"epoch": 2.8223350253807107,
"grad_norm": 1.9046287715832801,
"learning_rate": 2.06239579050006e-08,
"loss": 0.2551,
"step": 2085
},
{
"epoch": 2.829103214890017,
"grad_norm": 2.0268943381701585,
"learning_rate": 1.9061185609766995e-08,
"loss": 0.2412,
"step": 2090
},
{
"epoch": 2.835871404399323,
"grad_norm": 1.9877508850932446,
"learning_rate": 1.7559409230125997e-08,
"loss": 0.2554,
"step": 2095
},
{
"epoch": 2.8426395939086295,
"grad_norm": 1.8920868869430731,
"learning_rate": 1.6118722148278584e-08,
"loss": 0.2563,
"step": 2100
},
{
"epoch": 2.8494077834179357,
"grad_norm": 2.075343149112532,
"learning_rate": 1.4739213947821737e-08,
"loss": 0.2524,
"step": 2105
},
{
"epoch": 2.8561759729272422,
"grad_norm": 2.055837498560825,
"learning_rate": 1.342097040817891e-08,
"loss": 0.2502,
"step": 2110
},
{
"epoch": 2.8629441624365484,
"grad_norm": 1.7560367998970217,
"learning_rate": 1.2164073499265403e-08,
"loss": 0.2619,
"step": 2115
},
{
"epoch": 2.8697123519458545,
"grad_norm": 1.892609707767642,
"learning_rate": 1.0968601376391995e-08,
"loss": 0.2583,
"step": 2120
},
{
"epoch": 2.8764805414551606,
"grad_norm": 1.990394034627918,
"learning_rate": 9.834628375404718e-09,
"loss": 0.2644,
"step": 2125
},
{
"epoch": 2.8832487309644668,
"grad_norm": 1.828132568376944,
"learning_rate": 8.762225008062673e-09,
"loss": 0.2532,
"step": 2130
},
{
"epoch": 2.8900169204737733,
"grad_norm": 1.912512170265383,
"learning_rate": 7.75145795765375e-09,
"loss": 0.2561,
"step": 2135
},
{
"epoch": 2.8967851099830795,
"grad_norm": 1.8627242489328315,
"learning_rate": 6.80239007484773e-09,
"loss": 0.2524,
"step": 2140
},
{
"epoch": 2.903553299492386,
"grad_norm": 1.532454933071209,
"learning_rate": 5.915080373788961e-09,
"loss": 0.2369,
"step": 2145
},
{
"epoch": 2.910321489001692,
"grad_norm": 1.799514001407587,
"learning_rate": 5.089584028425742e-09,
"loss": 0.2561,
"step": 2150
},
{
"epoch": 2.9170896785109983,
"grad_norm": 1.6517806725559891,
"learning_rate": 4.325952369080288e-09,
"loss": 0.2491,
"step": 2155
},
{
"epoch": 2.9238578680203045,
"grad_norm": 1.8524564319779193,
"learning_rate": 3.6242328792567278e-09,
"loss": 0.2548,
"step": 2160
},
{
"epoch": 2.9306260575296106,
"grad_norm": 1.7195279301746678,
"learning_rate": 2.984469192688577e-09,
"loss": 0.2588,
"step": 2165
},
{
"epoch": 2.937394247038917,
"grad_norm": 1.9136832821313436,
"learning_rate": 2.4067010906254628e-09,
"loss": 0.2537,
"step": 2170
},
{
"epoch": 2.9441624365482233,
"grad_norm": 2.0082222078514844,
"learning_rate": 1.8909644993593267e-09,
"loss": 0.2654,
"step": 2175
},
{
"epoch": 2.95093062605753,
"grad_norm": 1.8775828520860633,
"learning_rate": 1.4372914879909881e-09,
"loss": 0.2486,
"step": 2180
},
{
"epoch": 2.957698815566836,
"grad_norm": 1.8912270182535766,
"learning_rate": 1.0457102664356288e-09,
"loss": 0.2471,
"step": 2185
},
{
"epoch": 2.964467005076142,
"grad_norm": 1.8579123732190646,
"learning_rate": 7.162451836685291e-10,
"loss": 0.2366,
"step": 2190
},
{
"epoch": 2.9712351945854483,
"grad_norm": 1.7842037052182547,
"learning_rate": 4.4891672621161226e-10,
"loss": 0.2467,
"step": 2195
},
{
"epoch": 2.9780033840947544,
"grad_norm": 1.875106833427518,
"learning_rate": 2.4374151685913057e-10,
"loss": 0.2626,
"step": 2200
},
{
"epoch": 2.984771573604061,
"grad_norm": 1.7694776387255406,
"learning_rate": 1.007323136438254e-10,
"loss": 0.2515,
"step": 2205
},
{
"epoch": 2.991539763113367,
"grad_norm": 1.8122490703088023,
"learning_rate": 1.9898009044450048e-11,
"loss": 0.2448,
"step": 2210
},
{
"epoch": 2.996954314720812,
"step": 2214,
"total_flos": 865276955983872.0,
"train_loss": 0.3529874208604525,
"train_runtime": 40097.4129,
"train_samples_per_second": 7.073,
"train_steps_per_second": 0.055
}
],
"logging_steps": 5,
"max_steps": 2214,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10086,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 865276955983872.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
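
Note: log_history above is a flat list of per-step logging entries (epoch, grad_norm, learning_rate, loss, step), followed by one final summary entry with the run totals (total_flos, train_loss, train_runtime, ...). A minimal sketch, assuming a local copy of this file saved as trainer_state.json and matplotlib installed, for loading the state and plotting the loss curve:

import json

import matplotlib.pyplot as plt

# Load the trainer state (hypothetical local path; adjust as needed).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the final summary entry has no "loss" key.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("FuseChat-Gemma-2-9B-SFT training loss")
plt.show()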