{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.991296779808529,
"eval_steps": 500,
"global_step": 5740,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017406440382941688,
"grad_norm": 600.0,
"learning_rate": 3.4843205574912896e-07,
"loss": 32.6735,
"step": 1
},
{
"epoch": 0.008703220191470844,
"grad_norm": 548.0,
"learning_rate": 1.7421602787456445e-06,
"loss": 30.8839,
"step": 5
},
{
"epoch": 0.017406440382941687,
"grad_norm": 338.0,
"learning_rate": 3.484320557491289e-06,
"loss": 28.8366,
"step": 10
},
{
"epoch": 0.02610966057441253,
"grad_norm": 197.0,
"learning_rate": 5.226480836236934e-06,
"loss": 24.2042,
"step": 15
},
{
"epoch": 0.034812880765883375,
"grad_norm": 82.5,
"learning_rate": 6.968641114982578e-06,
"loss": 20.0294,
"step": 20
},
{
"epoch": 0.04351610095735422,
"grad_norm": 52.75,
"learning_rate": 8.710801393728225e-06,
"loss": 18.1416,
"step": 25
},
{
"epoch": 0.05221932114882506,
"grad_norm": 25.375,
"learning_rate": 1.0452961672473868e-05,
"loss": 16.3408,
"step": 30
},
{
"epoch": 0.060922541340295906,
"grad_norm": 11.5625,
"learning_rate": 1.2195121951219513e-05,
"loss": 15.4579,
"step": 35
},
{
"epoch": 0.06962576153176675,
"grad_norm": 8.1875,
"learning_rate": 1.3937282229965156e-05,
"loss": 14.8065,
"step": 40
},
{
"epoch": 0.0783289817232376,
"grad_norm": 6.8125,
"learning_rate": 1.56794425087108e-05,
"loss": 14.3068,
"step": 45
},
{
"epoch": 0.08703220191470844,
"grad_norm": 8.4375,
"learning_rate": 1.742160278745645e-05,
"loss": 13.9,
"step": 50
},
{
"epoch": 0.09573542210617929,
"grad_norm": 12.875,
"learning_rate": 1.9163763066202093e-05,
"loss": 13.1213,
"step": 55
},
{
"epoch": 0.10443864229765012,
"grad_norm": 21.25,
"learning_rate": 2.0905923344947736e-05,
"loss": 12.0425,
"step": 60
},
{
"epoch": 0.11314186248912098,
"grad_norm": 44.5,
"learning_rate": 2.264808362369338e-05,
"loss": 10.0004,
"step": 65
},
{
"epoch": 0.12184508268059181,
"grad_norm": 42.75,
"learning_rate": 2.4390243902439026e-05,
"loss": 5.8601,
"step": 70
},
{
"epoch": 0.13054830287206268,
"grad_norm": 5.03125,
"learning_rate": 2.6132404181184672e-05,
"loss": 2.3269,
"step": 75
},
{
"epoch": 0.1392515230635335,
"grad_norm": 3.046875,
"learning_rate": 2.7874564459930312e-05,
"loss": 1.8038,
"step": 80
},
{
"epoch": 0.14795474325500435,
"grad_norm": 2.890625,
"learning_rate": 2.9616724738675962e-05,
"loss": 1.6296,
"step": 85
},
{
"epoch": 0.1566579634464752,
"grad_norm": 2.421875,
"learning_rate": 3.13588850174216e-05,
"loss": 1.4928,
"step": 90
},
{
"epoch": 0.16536118363794605,
"grad_norm": 8.0,
"learning_rate": 3.310104529616725e-05,
"loss": 1.4006,
"step": 95
},
{
"epoch": 0.17406440382941687,
"grad_norm": 3.328125,
"learning_rate": 3.48432055749129e-05,
"loss": 1.366,
"step": 100
},
{
"epoch": 0.18276762402088773,
"grad_norm": 1.6875,
"learning_rate": 3.6585365853658535e-05,
"loss": 1.2998,
"step": 105
},
{
"epoch": 0.19147084421235858,
"grad_norm": 5.40625,
"learning_rate": 3.8327526132404185e-05,
"loss": 1.2497,
"step": 110
},
{
"epoch": 0.20017406440382943,
"grad_norm": 2.1875,
"learning_rate": 4.006968641114983e-05,
"loss": 1.2106,
"step": 115
},
{
"epoch": 0.20887728459530025,
"grad_norm": 5.65625,
"learning_rate": 4.181184668989547e-05,
"loss": 1.2119,
"step": 120
},
{
"epoch": 0.2175805047867711,
"grad_norm": 22.75,
"learning_rate": 4.3554006968641115e-05,
"loss": 1.1897,
"step": 125
},
{
"epoch": 0.22628372497824195,
"grad_norm": 16.375,
"learning_rate": 4.529616724738676e-05,
"loss": 1.1921,
"step": 130
},
{
"epoch": 0.2349869451697128,
"grad_norm": 11.1875,
"learning_rate": 4.703832752613241e-05,
"loss": 1.1743,
"step": 135
},
{
"epoch": 0.24369016536118362,
"grad_norm": 9.6875,
"learning_rate": 4.878048780487805e-05,
"loss": 1.1545,
"step": 140
},
{
"epoch": 0.2523933855526545,
"grad_norm": 5.21875,
"learning_rate": 5.0522648083623695e-05,
"loss": 1.1263,
"step": 145
},
{
"epoch": 0.26109660574412535,
"grad_norm": 2.34375,
"learning_rate": 5.2264808362369345e-05,
"loss": 1.1232,
"step": 150
},
{
"epoch": 0.26979982593559615,
"grad_norm": 8.4375,
"learning_rate": 5.400696864111499e-05,
"loss": 1.0888,
"step": 155
},
{
"epoch": 0.278503046127067,
"grad_norm": 3.3125,
"learning_rate": 5.5749128919860624e-05,
"loss": 1.0734,
"step": 160
},
{
"epoch": 0.28720626631853785,
"grad_norm": 1.5078125,
"learning_rate": 5.749128919860628e-05,
"loss": 1.0545,
"step": 165
},
{
"epoch": 0.2959094865100087,
"grad_norm": 5.5625,
"learning_rate": 5.9233449477351924e-05,
"loss": 1.0451,
"step": 170
},
{
"epoch": 0.30461270670147955,
"grad_norm": 6.03125,
"learning_rate": 6.097560975609756e-05,
"loss": 1.023,
"step": 175
},
{
"epoch": 0.3133159268929504,
"grad_norm": 3.78125,
"learning_rate": 6.27177700348432e-05,
"loss": 1.0475,
"step": 180
},
{
"epoch": 0.32201914708442125,
"grad_norm": 1.8984375,
"learning_rate": 6.445993031358886e-05,
"loss": 1.008,
"step": 185
},
{
"epoch": 0.3307223672758921,
"grad_norm": 6.46875,
"learning_rate": 6.62020905923345e-05,
"loss": 0.9875,
"step": 190
},
{
"epoch": 0.3394255874673629,
"grad_norm": 3.3125,
"learning_rate": 6.794425087108013e-05,
"loss": 1.0139,
"step": 195
},
{
"epoch": 0.34812880765883375,
"grad_norm": 1.8828125,
"learning_rate": 6.96864111498258e-05,
"loss": 0.9675,
"step": 200
},
{
"epoch": 0.3568320278503046,
"grad_norm": 2.25,
"learning_rate": 7.142857142857143e-05,
"loss": 0.9668,
"step": 205
},
{
"epoch": 0.36553524804177545,
"grad_norm": 2.28125,
"learning_rate": 7.317073170731707e-05,
"loss": 0.9589,
"step": 210
},
{
"epoch": 0.3742384682332463,
"grad_norm": 1.6484375,
"learning_rate": 7.491289198606272e-05,
"loss": 0.9686,
"step": 215
},
{
"epoch": 0.38294168842471715,
"grad_norm": 1.328125,
"learning_rate": 7.665505226480837e-05,
"loss": 0.9702,
"step": 220
},
{
"epoch": 0.391644908616188,
"grad_norm": 2.875,
"learning_rate": 7.839721254355401e-05,
"loss": 0.9665,
"step": 225
},
{
"epoch": 0.40034812880765885,
"grad_norm": 4.9375,
"learning_rate": 8.013937282229966e-05,
"loss": 0.9598,
"step": 230
},
{
"epoch": 0.4090513489991297,
"grad_norm": 3.359375,
"learning_rate": 8.188153310104531e-05,
"loss": 0.944,
"step": 235
},
{
"epoch": 0.4177545691906005,
"grad_norm": 2.484375,
"learning_rate": 8.362369337979094e-05,
"loss": 0.9368,
"step": 240
},
{
"epoch": 0.42645778938207135,
"grad_norm": 3.625,
"learning_rate": 8.53658536585366e-05,
"loss": 0.9467,
"step": 245
},
{
"epoch": 0.4351610095735422,
"grad_norm": 2.015625,
"learning_rate": 8.710801393728223e-05,
"loss": 0.9282,
"step": 250
},
{
"epoch": 0.44386422976501305,
"grad_norm": 5.0,
"learning_rate": 8.885017421602788e-05,
"loss": 0.9065,
"step": 255
},
{
"epoch": 0.4525674499564839,
"grad_norm": 1.6796875,
"learning_rate": 9.059233449477352e-05,
"loss": 0.911,
"step": 260
},
{
"epoch": 0.46127067014795475,
"grad_norm": 1.1328125,
"learning_rate": 9.233449477351917e-05,
"loss": 0.9055,
"step": 265
},
{
"epoch": 0.4699738903394256,
"grad_norm": 1.0625,
"learning_rate": 9.407665505226482e-05,
"loss": 0.9053,
"step": 270
},
{
"epoch": 0.47867711053089645,
"grad_norm": 2.703125,
"learning_rate": 9.581881533101045e-05,
"loss": 0.8986,
"step": 275
},
{
"epoch": 0.48738033072236725,
"grad_norm": 2.734375,
"learning_rate": 9.75609756097561e-05,
"loss": 0.8968,
"step": 280
},
{
"epoch": 0.4960835509138381,
"grad_norm": 2.53125,
"learning_rate": 9.930313588850174e-05,
"loss": 0.9147,
"step": 285
},
{
"epoch": 0.504786771105309,
"grad_norm": 2.84375,
"learning_rate": 0.00010104529616724739,
"loss": 0.8892,
"step": 290
},
{
"epoch": 0.5134899912967799,
"grad_norm": 2.296875,
"learning_rate": 0.00010278745644599304,
"loss": 0.8896,
"step": 295
},
{
"epoch": 0.5221932114882507,
"grad_norm": 0.87109375,
"learning_rate": 0.00010452961672473869,
"loss": 0.8926,
"step": 300
},
{
"epoch": 0.5308964316797214,
"grad_norm": 1.2890625,
"learning_rate": 0.00010627177700348431,
"loss": 0.8943,
"step": 305
},
{
"epoch": 0.5395996518711923,
"grad_norm": 0.82421875,
"learning_rate": 0.00010801393728222998,
"loss": 0.8807,
"step": 310
},
{
"epoch": 0.5483028720626631,
"grad_norm": 20.125,
"learning_rate": 0.00010975609756097563,
"loss": 0.882,
"step": 315
},
{
"epoch": 0.557006092254134,
"grad_norm": 3.609375,
"learning_rate": 0.00011149825783972125,
"loss": 0.9149,
"step": 320
},
{
"epoch": 0.5657093124456049,
"grad_norm": 4.1875,
"learning_rate": 0.00011324041811846691,
"loss": 0.8841,
"step": 325
},
{
"epoch": 0.5744125326370757,
"grad_norm": 2.078125,
"learning_rate": 0.00011498257839721256,
"loss": 0.891,
"step": 330
},
{
"epoch": 0.5831157528285466,
"grad_norm": 2.4375,
"learning_rate": 0.00011672473867595819,
"loss": 0.8778,
"step": 335
},
{
"epoch": 0.5918189730200174,
"grad_norm": 1.5078125,
"learning_rate": 0.00011846689895470385,
"loss": 0.8664,
"step": 340
},
{
"epoch": 0.6005221932114883,
"grad_norm": 1.6640625,
"learning_rate": 0.00012020905923344947,
"loss": 0.8838,
"step": 345
},
{
"epoch": 0.6092254134029591,
"grad_norm": 0.82421875,
"learning_rate": 0.00012195121951219512,
"loss": 0.8502,
"step": 350
},
{
"epoch": 0.61792863359443,
"grad_norm": 1.703125,
"learning_rate": 0.00012369337979094077,
"loss": 0.8579,
"step": 355
},
{
"epoch": 0.6266318537859008,
"grad_norm": 0.890625,
"learning_rate": 0.0001254355400696864,
"loss": 0.8601,
"step": 360
},
{
"epoch": 0.6353350739773717,
"grad_norm": 1.2578125,
"learning_rate": 0.00012717770034843207,
"loss": 0.8575,
"step": 365
},
{
"epoch": 0.6440382941688425,
"grad_norm": 1.9375,
"learning_rate": 0.00012891986062717772,
"loss": 0.8595,
"step": 370
},
{
"epoch": 0.6527415143603134,
"grad_norm": 1.359375,
"learning_rate": 0.00013066202090592334,
"loss": 0.8814,
"step": 375
},
{
"epoch": 0.6614447345517842,
"grad_norm": 0.93359375,
"learning_rate": 0.000132404181184669,
"loss": 0.8418,
"step": 380
},
{
"epoch": 0.6701479547432551,
"grad_norm": 0.94921875,
"learning_rate": 0.00013414634146341464,
"loss": 0.846,
"step": 385
},
{
"epoch": 0.6788511749347258,
"grad_norm": 1.0,
"learning_rate": 0.00013588850174216027,
"loss": 0.8663,
"step": 390
},
{
"epoch": 0.6875543951261966,
"grad_norm": 2.625,
"learning_rate": 0.00013763066202090594,
"loss": 0.8521,
"step": 395
},
{
"epoch": 0.6962576153176675,
"grad_norm": 3.46875,
"learning_rate": 0.0001393728222996516,
"loss": 0.8726,
"step": 400
},
{
"epoch": 0.7049608355091384,
"grad_norm": 3.0625,
"learning_rate": 0.00014111498257839722,
"loss": 0.8606,
"step": 405
},
{
"epoch": 0.7136640557006092,
"grad_norm": 1.25,
"learning_rate": 0.00014285714285714287,
"loss": 0.8517,
"step": 410
},
{
"epoch": 0.72236727589208,
"grad_norm": 2.515625,
"learning_rate": 0.00014459930313588852,
"loss": 0.8638,
"step": 415
},
{
"epoch": 0.7310704960835509,
"grad_norm": 1.4296875,
"learning_rate": 0.00014634146341463414,
"loss": 0.8448,
"step": 420
},
{
"epoch": 0.7397737162750218,
"grad_norm": 1.5390625,
"learning_rate": 0.0001480836236933798,
"loss": 0.8351,
"step": 425
},
{
"epoch": 0.7484769364664926,
"grad_norm": 3.203125,
"learning_rate": 0.00014982578397212544,
"loss": 0.8616,
"step": 430
},
{
"epoch": 0.7571801566579635,
"grad_norm": 2.109375,
"learning_rate": 0.0001515679442508711,
"loss": 0.8569,
"step": 435
},
{
"epoch": 0.7658833768494343,
"grad_norm": 3.265625,
"learning_rate": 0.00015331010452961674,
"loss": 0.8531,
"step": 440
},
{
"epoch": 0.7745865970409052,
"grad_norm": 1.2734375,
"learning_rate": 0.00015505226480836236,
"loss": 0.8483,
"step": 445
},
{
"epoch": 0.783289817232376,
"grad_norm": 1.8046875,
"learning_rate": 0.00015679442508710801,
"loss": 0.8444,
"step": 450
},
{
"epoch": 0.7919930374238469,
"grad_norm": 1.984375,
"learning_rate": 0.00015853658536585366,
"loss": 0.8513,
"step": 455
},
{
"epoch": 0.8006962576153177,
"grad_norm": 1.8125,
"learning_rate": 0.00016027874564459931,
"loss": 0.8326,
"step": 460
},
{
"epoch": 0.8093994778067886,
"grad_norm": 1.671875,
"learning_rate": 0.00016202090592334496,
"loss": 0.8554,
"step": 465
},
{
"epoch": 0.8181026979982594,
"grad_norm": 3.0,
"learning_rate": 0.00016376306620209061,
"loss": 0.8334,
"step": 470
},
{
"epoch": 0.8268059181897301,
"grad_norm": 1.2109375,
"learning_rate": 0.00016550522648083624,
"loss": 0.8547,
"step": 475
},
{
"epoch": 0.835509138381201,
"grad_norm": 1.6328125,
"learning_rate": 0.0001672473867595819,
"loss": 0.8496,
"step": 480
},
{
"epoch": 0.8442123585726719,
"grad_norm": 42.75,
"learning_rate": 0.00016898954703832754,
"loss": 0.8443,
"step": 485
},
{
"epoch": 0.8529155787641427,
"grad_norm": 1.5,
"learning_rate": 0.0001707317073170732,
"loss": 0.8419,
"step": 490
},
{
"epoch": 0.8616187989556136,
"grad_norm": 1.4375,
"learning_rate": 0.00017247386759581884,
"loss": 0.8447,
"step": 495
},
{
"epoch": 0.8703220191470844,
"grad_norm": 1.65625,
"learning_rate": 0.00017421602787456446,
"loss": 0.836,
"step": 500
},
{
"epoch": 0.8790252393385553,
"grad_norm": 2.40625,
"learning_rate": 0.0001759581881533101,
"loss": 0.8499,
"step": 505
},
{
"epoch": 0.8877284595300261,
"grad_norm": 1.7421875,
"learning_rate": 0.00017770034843205576,
"loss": 0.8429,
"step": 510
},
{
"epoch": 0.896431679721497,
"grad_norm": 1.875,
"learning_rate": 0.00017944250871080138,
"loss": 0.8562,
"step": 515
},
{
"epoch": 0.9051348999129678,
"grad_norm": 2.578125,
"learning_rate": 0.00018118466898954703,
"loss": 0.8553,
"step": 520
},
{
"epoch": 0.9138381201044387,
"grad_norm": 8.25,
"learning_rate": 0.0001829268292682927,
"loss": 0.8297,
"step": 525
},
{
"epoch": 0.9225413402959095,
"grad_norm": 0.97265625,
"learning_rate": 0.00018466898954703833,
"loss": 0.8288,
"step": 530
},
{
"epoch": 0.9312445604873804,
"grad_norm": 0.78125,
"learning_rate": 0.00018641114982578398,
"loss": 0.8325,
"step": 535
},
{
"epoch": 0.9399477806788512,
"grad_norm": 1.171875,
"learning_rate": 0.00018815331010452963,
"loss": 0.8317,
"step": 540
},
{
"epoch": 0.9486510008703221,
"grad_norm": 2.3125,
"learning_rate": 0.00018989547038327526,
"loss": 0.8422,
"step": 545
},
{
"epoch": 0.9573542210617929,
"grad_norm": 1.53125,
"learning_rate": 0.0001916376306620209,
"loss": 0.8528,
"step": 550
},
{
"epoch": 0.9660574412532638,
"grad_norm": 1.046875,
"learning_rate": 0.00019337979094076658,
"loss": 0.8385,
"step": 555
},
{
"epoch": 0.9747606614447345,
"grad_norm": 2.28125,
"learning_rate": 0.0001951219512195122,
"loss": 0.8434,
"step": 560
},
{
"epoch": 0.9834638816362054,
"grad_norm": 2.296875,
"learning_rate": 0.00019686411149825786,
"loss": 0.8416,
"step": 565
},
{
"epoch": 0.9921671018276762,
"grad_norm": 6.90625,
"learning_rate": 0.00019860627177700348,
"loss": 0.8445,
"step": 570
},
{
"epoch": 0.999129677980853,
"eval_loss": 2.1106536388397217,
"eval_runtime": 1.1027,
"eval_samples_per_second": 5.441,
"eval_steps_per_second": 0.907,
"step": 574
},
{
"epoch": 1.000870322019147,
"grad_norm": 1.0546875,
"learning_rate": 0.00019999998150897728,
"loss": 0.8531,
"step": 575
},
{
"epoch": 1.009573542210618,
"grad_norm": 2.625,
"learning_rate": 0.00019999933432389942,
"loss": 0.7707,
"step": 580
},
{
"epoch": 1.0182767624020888,
"grad_norm": 3.84375,
"learning_rate": 0.00019999776259452297,
"loss": 0.7908,
"step": 585
},
{
"epoch": 1.0269799825935597,
"grad_norm": 3.234375,
"learning_rate": 0.00019999526633537938,
"loss": 0.7832,
"step": 590
},
{
"epoch": 1.0356832027850305,
"grad_norm": 2.328125,
"learning_rate": 0.00019999184556954776,
"loss": 0.7502,
"step": 595
},
{
"epoch": 1.0443864229765012,
"grad_norm": 4.84375,
"learning_rate": 0.00019998750032865483,
"loss": 0.7704,
"step": 600
},
{
"epoch": 1.0530896431679722,
"grad_norm": 1.4765625,
"learning_rate": 0.00019998223065287456,
"loss": 0.7887,
"step": 605
},
{
"epoch": 1.061792863359443,
"grad_norm": 1.8046875,
"learning_rate": 0.00019997603659092773,
"loss": 0.7848,
"step": 610
},
{
"epoch": 1.0704960835509139,
"grad_norm": 1.515625,
"learning_rate": 0.00019996891820008164,
"loss": 0.7635,
"step": 615
},
{
"epoch": 1.0791993037423846,
"grad_norm": 1.109375,
"learning_rate": 0.00019996087554614934,
"loss": 0.7591,
"step": 620
},
{
"epoch": 1.0879025239338556,
"grad_norm": 1.2734375,
"learning_rate": 0.00019995190870348922,
"loss": 0.7569,
"step": 625
},
{
"epoch": 1.0966057441253263,
"grad_norm": 1.3125,
"learning_rate": 0.0001999420177550043,
"loss": 0.7677,
"step": 630
},
{
"epoch": 1.1053089643167973,
"grad_norm": 1.109375,
"learning_rate": 0.00019993120279214135,
"loss": 0.7648,
"step": 635
},
{
"epoch": 1.114012184508268,
"grad_norm": 0.984375,
"learning_rate": 0.00019991946391489018,
"loss": 0.7819,
"step": 640
},
{
"epoch": 1.122715404699739,
"grad_norm": 9.4375,
"learning_rate": 0.00019990680123178263,
"loss": 0.7606,
"step": 645
},
{
"epoch": 1.1314186248912097,
"grad_norm": 1.0234375,
"learning_rate": 0.00019989321485989163,
"loss": 0.796,
"step": 650
},
{
"epoch": 1.1401218450826807,
"grad_norm": 1.9296875,
"learning_rate": 0.00019987870492482997,
"loss": 0.7866,
"step": 655
},
{
"epoch": 1.1488250652741514,
"grad_norm": 1.46875,
"learning_rate": 0.00019986327156074939,
"loss": 0.7824,
"step": 660
},
{
"epoch": 1.1575282854656224,
"grad_norm": 3.234375,
"learning_rate": 0.00019984691491033906,
"loss": 0.7748,
"step": 665
},
{
"epoch": 1.166231505657093,
"grad_norm": 2.09375,
"learning_rate": 0.00019982963512482453,
"loss": 0.794,
"step": 670
},
{
"epoch": 1.174934725848564,
"grad_norm": 7.3125,
"learning_rate": 0.00019981143236396612,
"loss": 0.7733,
"step": 675
},
{
"epoch": 1.1836379460400348,
"grad_norm": 1.515625,
"learning_rate": 0.00019979230679605749,
"loss": 0.7919,
"step": 680
},
{
"epoch": 1.1923411662315058,
"grad_norm": 1.2265625,
"learning_rate": 0.0001997722585979242,
"loss": 0.7668,
"step": 685
},
{
"epoch": 1.2010443864229765,
"grad_norm": 0.578125,
"learning_rate": 0.000199751287954922,
"loss": 0.7746,
"step": 690
},
{
"epoch": 1.2097476066144472,
"grad_norm": 1.0859375,
"learning_rate": 0.000199729395060935,
"loss": 0.778,
"step": 695
},
{
"epoch": 1.2184508268059182,
"grad_norm": 0.6484375,
"learning_rate": 0.00019970658011837404,
"loss": 0.7742,
"step": 700
},
{
"epoch": 1.227154046997389,
"grad_norm": 0.71875,
"learning_rate": 0.00019968284333817486,
"loss": 0.7856,
"step": 705
},
{
"epoch": 1.23585726718886,
"grad_norm": 0.953125,
"learning_rate": 0.00019965818493979586,
"loss": 0.78,
"step": 710
},
{
"epoch": 1.2445604873803306,
"grad_norm": 0.63671875,
"learning_rate": 0.00019963260515121648,
"loss": 0.804,
"step": 715
},
{
"epoch": 1.2532637075718016,
"grad_norm": 0.71484375,
"learning_rate": 0.0001996061042089347,
"loss": 0.7713,
"step": 720
},
{
"epoch": 1.2619669277632724,
"grad_norm": 0.859375,
"learning_rate": 0.00019957868235796514,
"loss": 0.7725,
"step": 725
},
{
"epoch": 1.2706701479547433,
"grad_norm": 0.703125,
"learning_rate": 0.0001995503398518366,
"loss": 0.7738,
"step": 730
},
{
"epoch": 1.279373368146214,
"grad_norm": 1.234375,
"learning_rate": 0.00019952107695258992,
"loss": 0.7935,
"step": 735
},
{
"epoch": 1.288076588337685,
"grad_norm": 0.93359375,
"learning_rate": 0.0001994908939307753,
"loss": 0.7573,
"step": 740
},
{
"epoch": 1.2967798085291558,
"grad_norm": 0.734375,
"learning_rate": 0.00019945979106545002,
"loss": 0.8069,
"step": 745
},
{
"epoch": 1.3054830287206267,
"grad_norm": 1.6796875,
"learning_rate": 0.0001994277686441758,
"loss": 0.7752,
"step": 750
},
{
"epoch": 1.3141862489120975,
"grad_norm": 1.8671875,
"learning_rate": 0.00019939482696301606,
"loss": 0.7989,
"step": 755
},
{
"epoch": 1.3228894691035684,
"grad_norm": 0.83984375,
"learning_rate": 0.00019936096632653324,
"loss": 0.7946,
"step": 760
},
{
"epoch": 1.3315926892950392,
"grad_norm": 1.7421875,
"learning_rate": 0.000199326187047786,
"loss": 0.7781,
"step": 765
},
{
"epoch": 1.34029590948651,
"grad_norm": 1.859375,
"learning_rate": 0.00019929048944832638,
"loss": 0.7819,
"step": 770
},
{
"epoch": 1.3489991296779809,
"grad_norm": 1.2890625,
"learning_rate": 0.00019925387385819664,
"loss": 0.7702,
"step": 775
},
{
"epoch": 1.3577023498694518,
"grad_norm": 0.85546875,
"learning_rate": 0.00019921634061592644,
"loss": 0.7759,
"step": 780
},
{
"epoch": 1.3664055700609226,
"grad_norm": 0.8359375,
"learning_rate": 0.0001991778900685295,
"loss": 0.7683,
"step": 785
},
{
"epoch": 1.3751087902523933,
"grad_norm": 0.73828125,
"learning_rate": 0.00019913852257150052,
"loss": 0.7831,
"step": 790
},
{
"epoch": 1.3838120104438643,
"grad_norm": 0.80078125,
"learning_rate": 0.0001990982384888119,
"loss": 0.7823,
"step": 795
},
{
"epoch": 1.392515230635335,
"grad_norm": 1.046875,
"learning_rate": 0.0001990570381929103,
"loss": 0.7698,
"step": 800
},
{
"epoch": 1.401218450826806,
"grad_norm": 0.7421875,
"learning_rate": 0.00019901492206471325,
"loss": 0.7663,
"step": 805
},
{
"epoch": 1.4099216710182767,
"grad_norm": 0.734375,
"learning_rate": 0.00019897189049360557,
"loss": 0.7966,
"step": 810
},
{
"epoch": 1.4186248912097477,
"grad_norm": 1.0625,
"learning_rate": 0.00019892794387743593,
"loss": 0.7792,
"step": 815
},
{
"epoch": 1.4273281114012184,
"grad_norm": 0.73046875,
"learning_rate": 0.00019888308262251285,
"loss": 0.7761,
"step": 820
},
{
"epoch": 1.4360313315926894,
"grad_norm": 1.5390625,
"learning_rate": 0.00019883730714360137,
"loss": 0.772,
"step": 825
},
{
"epoch": 1.44473455178416,
"grad_norm": 3.578125,
"learning_rate": 0.00019879061786391881,
"loss": 0.7705,
"step": 830
},
{
"epoch": 1.453437771975631,
"grad_norm": 2.03125,
"learning_rate": 0.0001987430152151312,
"loss": 0.7637,
"step": 835
},
{
"epoch": 1.4621409921671018,
"grad_norm": 0.9765625,
"learning_rate": 0.00019869449963734893,
"loss": 0.7647,
"step": 840
},
{
"epoch": 1.4708442123585725,
"grad_norm": 1.3125,
"learning_rate": 0.0001986450715791231,
"loss": 0.7772,
"step": 845
},
{
"epoch": 1.4795474325500435,
"grad_norm": 0.86328125,
"learning_rate": 0.000198594731497441,
"loss": 0.7538,
"step": 850
},
{
"epoch": 1.4882506527415145,
"grad_norm": 3.5,
"learning_rate": 0.00019854347985772208,
"loss": 0.7732,
"step": 855
},
{
"epoch": 1.4969538729329852,
"grad_norm": 0.96484375,
"learning_rate": 0.00019849131713381364,
"loss": 0.7777,
"step": 860
},
{
"epoch": 1.505657093124456,
"grad_norm": 1.5703125,
"learning_rate": 0.00019843824380798633,
"loss": 0.7742,
"step": 865
},
{
"epoch": 1.514360313315927,
"grad_norm": 1.3828125,
"learning_rate": 0.00019838426037092988,
"loss": 0.7596,
"step": 870
},
{
"epoch": 1.5230635335073979,
"grad_norm": 5.03125,
"learning_rate": 0.00019832936732174834,
"loss": 0.7668,
"step": 875
},
{
"epoch": 1.5317667536988686,
"grad_norm": 1.1640625,
"learning_rate": 0.0001982735651679557,
"loss": 0.7635,
"step": 880
},
{
"epoch": 1.5404699738903394,
"grad_norm": 0.71875,
"learning_rate": 0.000198216854425471,
"loss": 0.7745,
"step": 885
},
{
"epoch": 1.5491731940818103,
"grad_norm": 0.9296875,
"learning_rate": 0.0001981592356186137,
"loss": 0.7905,
"step": 890
},
{
"epoch": 1.5578764142732813,
"grad_norm": 0.75,
"learning_rate": 0.00019810070928009867,
"loss": 0.7773,
"step": 895
},
{
"epoch": 1.566579634464752,
"grad_norm": 1.546875,
"learning_rate": 0.0001980412759510315,
"loss": 0.7611,
"step": 900
},
{
"epoch": 1.5752828546562228,
"grad_norm": 0.7421875,
"learning_rate": 0.00019798093618090328,
"loss": 0.7705,
"step": 905
},
{
"epoch": 1.5839860748476937,
"grad_norm": 0.7578125,
"learning_rate": 0.00019791969052758562,
"loss": 0.7895,
"step": 910
},
{
"epoch": 1.5926892950391645,
"grad_norm": 3.40625,
"learning_rate": 0.0001978575395573255,
"loss": 0.7738,
"step": 915
},
{
"epoch": 1.6013925152306352,
"grad_norm": 1.3515625,
"learning_rate": 0.00019779448384474,
"loss": 0.7661,
"step": 920
},
{
"epoch": 1.6100957354221062,
"grad_norm": 1.359375,
"learning_rate": 0.000197730523972811,
"loss": 0.7561,
"step": 925
},
{
"epoch": 1.6187989556135771,
"grad_norm": 1.0078125,
"learning_rate": 0.00019766566053287975,
"loss": 0.7742,
"step": 930
},
{
"epoch": 1.6275021758050479,
"grad_norm": 1.03125,
"learning_rate": 0.00019759989412464153,
"loss": 0.7742,
"step": 935
},
{
"epoch": 1.6362053959965186,
"grad_norm": 0.8359375,
"learning_rate": 0.0001975332253561399,
"loss": 0.769,
"step": 940
},
{
"epoch": 1.6449086161879896,
"grad_norm": 0.6953125,
"learning_rate": 0.00019746565484376132,
"loss": 0.7564,
"step": 945
},
{
"epoch": 1.6536118363794605,
"grad_norm": 0.87890625,
"learning_rate": 0.00019739718321222928,
"loss": 0.7574,
"step": 950
},
{
"epoch": 1.6623150565709313,
"grad_norm": 0.796875,
"learning_rate": 0.00019732781109459846,
"loss": 0.7702,
"step": 955
},
{
"epoch": 1.671018276762402,
"grad_norm": 0.73828125,
"learning_rate": 0.00019725753913224918,
"loss": 0.7785,
"step": 960
},
{
"epoch": 1.679721496953873,
"grad_norm": 1.8828125,
"learning_rate": 0.0001971863679748812,
"loss": 0.7694,
"step": 965
},
{
"epoch": 1.688424717145344,
"grad_norm": 0.96875,
"learning_rate": 0.00019711429828050769,
"loss": 0.7802,
"step": 970
},
{
"epoch": 1.6971279373368147,
"grad_norm": 1.1171875,
"learning_rate": 0.00019704133071544942,
"loss": 0.7629,
"step": 975
},
{
"epoch": 1.7058311575282854,
"grad_norm": 0.72265625,
"learning_rate": 0.00019696746595432828,
"loss": 0.7739,
"step": 980
},
{
"epoch": 1.7145343777197564,
"grad_norm": 0.85546875,
"learning_rate": 0.00019689270468006132,
"loss": 0.7794,
"step": 985
},
{
"epoch": 1.723237597911227,
"grad_norm": 1.0078125,
"learning_rate": 0.00019681704758385418,
"loss": 0.7575,
"step": 990
},
{
"epoch": 1.7319408181026978,
"grad_norm": 1.0546875,
"learning_rate": 0.0001967404953651949,
"loss": 0.7673,
"step": 995
},
{
"epoch": 1.7406440382941688,
"grad_norm": 0.96484375,
"learning_rate": 0.00019666304873184739,
"loss": 0.7734,
"step": 1000
},
{
"epoch": 1.7493472584856398,
"grad_norm": 1.6171875,
"learning_rate": 0.0001965847083998448,
"loss": 0.7785,
"step": 1005
},
{
"epoch": 1.7580504786771105,
"grad_norm": 1.640625,
"learning_rate": 0.00019650547509348306,
"loss": 0.7652,
"step": 1010
},
{
"epoch": 1.7667536988685812,
"grad_norm": 2.34375,
"learning_rate": 0.0001964253495453141,
"loss": 0.7631,
"step": 1015
},
{
"epoch": 1.7754569190600522,
"grad_norm": 0.83203125,
"learning_rate": 0.00019634433249613898,
"loss": 0.7819,
"step": 1020
},
{
"epoch": 1.7841601392515232,
"grad_norm": 0.67578125,
"learning_rate": 0.0001962624246950012,
"loss": 0.7774,
"step": 1025
},
{
"epoch": 1.792863359442994,
"grad_norm": 0.609375,
"learning_rate": 0.00019617962689917975,
"loss": 0.7723,
"step": 1030
},
{
"epoch": 1.8015665796344646,
"grad_norm": 0.8359375,
"learning_rate": 0.00019609593987418198,
"loss": 0.7645,
"step": 1035
},
{
"epoch": 1.8102697998259356,
"grad_norm": 0.80078125,
"learning_rate": 0.00019601136439373668,
"loss": 0.7653,
"step": 1040
},
{
"epoch": 1.8189730200174066,
"grad_norm": 0.69921875,
"learning_rate": 0.0001959259012397868,
"loss": 0.7756,
"step": 1045
},
{
"epoch": 1.8276762402088773,
"grad_norm": 0.69921875,
"learning_rate": 0.00019583955120248237,
"loss": 0.7656,
"step": 1050
},
{
"epoch": 1.836379460400348,
"grad_norm": 0.90625,
"learning_rate": 0.00019575231508017307,
"loss": 0.761,
"step": 1055
},
{
"epoch": 1.845082680591819,
"grad_norm": 0.87890625,
"learning_rate": 0.0001956641936794008,
"loss": 0.7584,
"step": 1060
},
{
"epoch": 1.85378590078329,
"grad_norm": 1.234375,
"learning_rate": 0.00019557518781489238,
"loss": 0.749,
"step": 1065
},
{
"epoch": 1.8624891209747607,
"grad_norm": 0.6484375,
"learning_rate": 0.00019548529830955196,
"loss": 0.7635,
"step": 1070
},
{
"epoch": 1.8711923411662315,
"grad_norm": 1.0703125,
"learning_rate": 0.00019539452599445336,
"loss": 0.7601,
"step": 1075
},
{
"epoch": 1.8798955613577024,
"grad_norm": 0.90625,
"learning_rate": 0.0001953028717088324,
"loss": 0.7869,
"step": 1080
},
{
"epoch": 1.8885987815491732,
"grad_norm": 2.046875,
"learning_rate": 0.00019521033630007928,
"loss": 0.766,
"step": 1085
},
{
"epoch": 1.897302001740644,
"grad_norm": 1.4375,
"learning_rate": 0.00019511692062373044,
"loss": 0.7744,
"step": 1090
},
{
"epoch": 1.9060052219321149,
"grad_norm": 1.03125,
"learning_rate": 0.000195022625543461,
"loss": 0.7749,
"step": 1095
},
{
"epoch": 1.9147084421235858,
"grad_norm": 0.6328125,
"learning_rate": 0.0001949274519310765,
"loss": 0.7684,
"step": 1100
},
{
"epoch": 1.9234116623150566,
"grad_norm": 0.703125,
"learning_rate": 0.00019483140066650507,
"loss": 0.7596,
"step": 1105
},
{
"epoch": 1.9321148825065273,
"grad_norm": 0.77734375,
"learning_rate": 0.00019473447263778905,
"loss": 0.768,
"step": 1110
},
{
"epoch": 1.9408181026979983,
"grad_norm": 6.03125,
"learning_rate": 0.00019463666874107704,
"loss": 0.7563,
"step": 1115
},
{
"epoch": 1.9495213228894692,
"grad_norm": 0.80078125,
"learning_rate": 0.00019453798988061535,
"loss": 0.7834,
"step": 1120
},
{
"epoch": 1.95822454308094,
"grad_norm": 7.6875,
"learning_rate": 0.00019443843696873985,
"loss": 0.7471,
"step": 1125
},
{
"epoch": 1.9669277632724107,
"grad_norm": 2.28125,
"learning_rate": 0.00019433801092586742,
"loss": 0.768,
"step": 1130
},
{
"epoch": 1.9756309834638817,
"grad_norm": 1.1953125,
"learning_rate": 0.00019423671268048754,
"loss": 0.7806,
"step": 1135
},
{
"epoch": 1.9843342036553526,
"grad_norm": 0.68359375,
"learning_rate": 0.00019413454316915356,
"loss": 0.7543,
"step": 1140
},
{
"epoch": 1.9930374238468234,
"grad_norm": 0.6875,
"learning_rate": 0.00019403150333647417,
"loss": 0.784,
"step": 1145
},
{
"epoch": 2.0,
"eval_loss": 2.230104684829712,
"eval_runtime": 0.7759,
"eval_samples_per_second": 7.733,
"eval_steps_per_second": 1.289,
"step": 1149
},
{
"epoch": 2.001740644038294,
"grad_norm": 0.77734375,
"learning_rate": 0.0001939275941351046,
"loss": 0.7099,
"step": 1150
},
{
"epoch": 2.010443864229765,
"grad_norm": 0.87890625,
"learning_rate": 0.00019382281652573785,
"loss": 0.6306,
"step": 1155
},
{
"epoch": 2.019147084421236,
"grad_norm": 0.96484375,
"learning_rate": 0.00019371717147709583,
"loss": 0.6241,
"step": 1160
},
{
"epoch": 2.0278503046127065,
"grad_norm": 1.1328125,
"learning_rate": 0.0001936106599659202,
"loss": 0.6167,
"step": 1165
},
{
"epoch": 2.0365535248041775,
"grad_norm": 1.015625,
"learning_rate": 0.00019350328297696373,
"loss": 0.6173,
"step": 1170
},
{
"epoch": 2.0452567449956485,
"grad_norm": 0.67578125,
"learning_rate": 0.00019339504150298084,
"loss": 0.6234,
"step": 1175
},
{
"epoch": 2.0539599651871194,
"grad_norm": 1.265625,
"learning_rate": 0.00019328593654471848,
"loss": 0.6151,
"step": 1180
},
{
"epoch": 2.06266318537859,
"grad_norm": 0.75390625,
"learning_rate": 0.00019317596911090713,
"loss": 0.6386,
"step": 1185
},
{
"epoch": 2.071366405570061,
"grad_norm": 0.62890625,
"learning_rate": 0.00019306514021825118,
"loss": 0.6209,
"step": 1190
},
{
"epoch": 2.080069625761532,
"grad_norm": 0.75390625,
"learning_rate": 0.00019295345089141963,
"loss": 0.625,
"step": 1195
},
{
"epoch": 2.0887728459530024,
"grad_norm": 0.703125,
"learning_rate": 0.00019284090216303666,
"loss": 0.6336,
"step": 1200
},
{
"epoch": 2.0974760661444734,
"grad_norm": 0.8828125,
"learning_rate": 0.00019272749507367212,
"loss": 0.6266,
"step": 1205
},
{
"epoch": 2.1061792863359443,
"grad_norm": 0.76171875,
"learning_rate": 0.00019261323067183166,
"loss": 0.6286,
"step": 1210
},
{
"epoch": 2.1148825065274153,
"grad_norm": 0.7421875,
"learning_rate": 0.0001924981100139474,
"loss": 0.6458,
"step": 1215
},
{
"epoch": 2.123585726718886,
"grad_norm": 2.03125,
"learning_rate": 0.00019238213416436785,
"loss": 0.6328,
"step": 1220
},
{
"epoch": 2.1322889469103568,
"grad_norm": 1.1328125,
"learning_rate": 0.00019226530419534833,
"loss": 0.6398,
"step": 1225
},
{
"epoch": 2.1409921671018277,
"grad_norm": 1.78125,
"learning_rate": 0.00019214762118704076,
"loss": 0.6361,
"step": 1230
},
{
"epoch": 2.1496953872932987,
"grad_norm": 1.1875,
"learning_rate": 0.000192029086227484,
"loss": 0.6357,
"step": 1235
},
{
"epoch": 2.158398607484769,
"grad_norm": 0.7421875,
"learning_rate": 0.00019190970041259352,
"loss": 0.6277,
"step": 1240
},
{
"epoch": 2.16710182767624,
"grad_norm": 1.2734375,
"learning_rate": 0.0001917894648461514,
"loss": 0.6455,
"step": 1245
},
{
"epoch": 2.175805047867711,
"grad_norm": 0.7578125,
"learning_rate": 0.00019166838063979614,
"loss": 0.6374,
"step": 1250
},
{
"epoch": 2.184508268059182,
"grad_norm": 0.76953125,
"learning_rate": 0.0001915464489130123,
"loss": 0.6343,
"step": 1255
},
{
"epoch": 2.1932114882506526,
"grad_norm": 1.0703125,
"learning_rate": 0.00019142367079312021,
"loss": 0.623,
"step": 1260
},
{
"epoch": 2.2019147084421236,
"grad_norm": 0.828125,
"learning_rate": 0.00019130004741526558,
"loss": 0.6359,
"step": 1265
},
{
"epoch": 2.2106179286335945,
"grad_norm": 0.68359375,
"learning_rate": 0.00019117557992240887,
"loss": 0.6344,
"step": 1270
},
{
"epoch": 2.2193211488250655,
"grad_norm": 0.75,
"learning_rate": 0.00019105026946531482,
"loss": 0.6511,
"step": 1275
},
{
"epoch": 2.228024369016536,
"grad_norm": 0.83203125,
"learning_rate": 0.0001909241172025419,
"loss": 0.636,
"step": 1280
},
{
"epoch": 2.236727589208007,
"grad_norm": 0.7578125,
"learning_rate": 0.00019079712430043134,
"loss": 0.6374,
"step": 1285
},
{
"epoch": 2.245430809399478,
"grad_norm": 1.3671875,
"learning_rate": 0.0001906692919330967,
"loss": 0.6359,
"step": 1290
},
{
"epoch": 2.254134029590949,
"grad_norm": 1.0859375,
"learning_rate": 0.00019054062128241264,
"loss": 0.6518,
"step": 1295
},
{
"epoch": 2.2628372497824194,
"grad_norm": 1.140625,
"learning_rate": 0.00019041111353800425,
"loss": 0.6428,
"step": 1300
},
{
"epoch": 2.2715404699738904,
"grad_norm": 1.0546875,
"learning_rate": 0.00019028076989723597,
"loss": 0.6562,
"step": 1305
},
{
"epoch": 2.2802436901653613,
"grad_norm": 0.78125,
"learning_rate": 0.00019014959156520052,
"loss": 0.6495,
"step": 1310
},
{
"epoch": 2.288946910356832,
"grad_norm": 0.6796875,
"learning_rate": 0.0001900175797547078,
"loss": 0.6466,
"step": 1315
},
{
"epoch": 2.297650130548303,
"grad_norm": 0.78515625,
"learning_rate": 0.00018988473568627354,
"loss": 0.6603,
"step": 1320
},
{
"epoch": 2.3063533507397738,
"grad_norm": 0.703125,
"learning_rate": 0.00018975106058810823,
"loss": 0.6352,
"step": 1325
},
{
"epoch": 2.3150565709312447,
"grad_norm": 0.90234375,
"learning_rate": 0.00018961655569610557,
"loss": 0.6592,
"step": 1330
},
{
"epoch": 2.3237597911227152,
"grad_norm": 0.8359375,
"learning_rate": 0.00018948122225383114,
"loss": 0.6515,
"step": 1335
},
{
"epoch": 2.332463011314186,
"grad_norm": 0.82421875,
"learning_rate": 0.00018934506151251093,
"loss": 0.6534,
"step": 1340
},
{
"epoch": 2.341166231505657,
"grad_norm": 1.0,
"learning_rate": 0.00018920807473101964,
"loss": 0.6558,
"step": 1345
},
{
"epoch": 2.349869451697128,
"grad_norm": 0.7421875,
"learning_rate": 0.00018907026317586923,
"loss": 0.6547,
"step": 1350
},
{
"epoch": 2.3585726718885986,
"grad_norm": 0.875,
"learning_rate": 0.00018893162812119702,
"loss": 0.6541,
"step": 1355
},
{
"epoch": 2.3672758920800696,
"grad_norm": 0.80859375,
"learning_rate": 0.00018879217084875408,
"loss": 0.655,
"step": 1360
},
{
"epoch": 2.3759791122715406,
"grad_norm": 0.75390625,
"learning_rate": 0.0001886518926478932,
"loss": 0.648,
"step": 1365
},
{
"epoch": 2.3846823324630115,
"grad_norm": 0.60546875,
"learning_rate": 0.00018851079481555714,
"loss": 0.6474,
"step": 1370
},
{
"epoch": 2.393385552654482,
"grad_norm": 0.78125,
"learning_rate": 0.00018836887865626654,
"loss": 0.6543,
"step": 1375
},
{
"epoch": 2.402088772845953,
"grad_norm": 0.8203125,
"learning_rate": 0.00018822614548210797,
"loss": 0.6529,
"step": 1380
},
{
"epoch": 2.410791993037424,
"grad_norm": 1.0078125,
"learning_rate": 0.00018808259661272153,
"loss": 0.6612,
"step": 1385
},
{
"epoch": 2.4194952132288945,
"grad_norm": 1.0078125,
"learning_rate": 0.000187938233375289,
"loss": 0.6519,
"step": 1390
},
{
"epoch": 2.4281984334203655,
"grad_norm": 1.5625,
"learning_rate": 0.00018779305710452132,
"loss": 0.6558,
"step": 1395
},
{
"epoch": 2.4369016536118364,
"grad_norm": 0.88671875,
"learning_rate": 0.00018764706914264635,
"loss": 0.6532,
"step": 1400
},
{
"epoch": 2.4456048738033074,
"grad_norm": 0.82421875,
"learning_rate": 0.00018750027083939654,
"loss": 0.6443,
"step": 1405
},
{
"epoch": 2.454308093994778,
"grad_norm": 0.8828125,
"learning_rate": 0.00018735266355199618,
"loss": 0.6544,
"step": 1410
},
{
"epoch": 2.463011314186249,
"grad_norm": 0.74609375,
"learning_rate": 0.00018720424864514913,
"loss": 0.6663,
"step": 1415
},
{
"epoch": 2.47171453437772,
"grad_norm": 3.359375,
"learning_rate": 0.0001870550274910261,
"loss": 0.6654,
"step": 1420
},
{
"epoch": 2.480417754569191,
"grad_norm": 0.74609375,
"learning_rate": 0.00018690500146925193,
"loss": 0.6456,
"step": 1425
},
{
"epoch": 2.4891209747606613,
"grad_norm": 0.88671875,
"learning_rate": 0.00018675417196689292,
"loss": 0.6495,
"step": 1430
},
{
"epoch": 2.4978241949521323,
"grad_norm": 0.79296875,
"learning_rate": 0.00018660254037844388,
"loss": 0.6551,
"step": 1435
},
{
"epoch": 2.506527415143603,
"grad_norm": 1.0703125,
"learning_rate": 0.00018645010810581535,
"loss": 0.6432,
"step": 1440
},
{
"epoch": 2.515230635335074,
"grad_norm": 0.6953125,
"learning_rate": 0.00018629687655832063,
"loss": 0.6521,
"step": 1445
},
{
"epoch": 2.5239338555265447,
"grad_norm": 0.703125,
"learning_rate": 0.00018614284715266264,
"loss": 0.6626,
"step": 1450
},
{
"epoch": 2.5326370757180157,
"grad_norm": 0.9296875,
"learning_rate": 0.00018598802131292093,
"loss": 0.6451,
"step": 1455
},
{
"epoch": 2.5413402959094866,
"grad_norm": 0.94140625,
"learning_rate": 0.00018583240047053863,
"loss": 0.6627,
"step": 1460
},
{
"epoch": 2.550043516100957,
"grad_norm": 0.8828125,
"learning_rate": 0.00018567598606430882,
"loss": 0.6756,
"step": 1465
},
{
"epoch": 2.558746736292428,
"grad_norm": 1.0234375,
"learning_rate": 0.00018551877954036162,
"loss": 0.6734,
"step": 1470
},
{
"epoch": 2.567449956483899,
"grad_norm": 0.73046875,
"learning_rate": 0.0001853607823521507,
"loss": 0.6495,
"step": 1475
},
{
"epoch": 2.57615317667537,
"grad_norm": 0.703125,
"learning_rate": 0.00018520199596043976,
"loss": 0.6459,
"step": 1480
},
{
"epoch": 2.584856396866841,
"grad_norm": 1.0859375,
"learning_rate": 0.0001850424218332891,
"loss": 0.6665,
"step": 1485
},
{
"epoch": 2.5935596170583115,
"grad_norm": 0.7734375,
"learning_rate": 0.00018488206144604203,
"loss": 0.6637,
"step": 1490
},
{
"epoch": 2.6022628372497825,
"grad_norm": 2.015625,
"learning_rate": 0.00018472091628131125,
"loss": 0.6705,
"step": 1495
},
{
"epoch": 2.6109660574412534,
"grad_norm": 0.76953125,
"learning_rate": 0.00018455898782896511,
"loss": 0.6601,
"step": 1500
},
{
"epoch": 2.619669277632724,
"grad_norm": 0.8203125,
"learning_rate": 0.00018439627758611385,
"loss": 0.6591,
"step": 1505
},
{
"epoch": 2.628372497824195,
"grad_norm": 0.671875,
"learning_rate": 0.00018423278705709573,
"loss": 0.6574,
"step": 1510
},
{
"epoch": 2.637075718015666,
"grad_norm": 0.9609375,
"learning_rate": 0.00018406851775346322,
"loss": 0.6665,
"step": 1515
},
{
"epoch": 2.645778938207137,
"grad_norm": 0.85546875,
"learning_rate": 0.0001839034711939689,
"loss": 0.6591,
"step": 1520
},
{
"epoch": 2.6544821583986073,
"grad_norm": 0.65625,
"learning_rate": 0.00018373764890455146,
"loss": 0.6505,
"step": 1525
},
{
"epoch": 2.6631853785900783,
"grad_norm": 0.79296875,
"learning_rate": 0.00018357105241832163,
"loss": 0.6654,
"step": 1530
},
{
"epoch": 2.6718885987815493,
"grad_norm": 0.69921875,
"learning_rate": 0.000183403683275548,
"loss": 0.6551,
"step": 1535
},
{
"epoch": 2.68059181897302,
"grad_norm": 0.75,
"learning_rate": 0.00018323554302364272,
"loss": 0.6647,
"step": 1540
},
{
"epoch": 2.6892950391644908,
"grad_norm": 0.9921875,
"learning_rate": 0.0001830666332171473,
"loss": 0.6658,
"step": 1545
},
{
"epoch": 2.6979982593559617,
"grad_norm": 1.890625,
"learning_rate": 0.00018289695541771802,
"loss": 0.6584,
"step": 1550
},
{
"epoch": 2.7067014795474327,
"grad_norm": 0.72265625,
"learning_rate": 0.00018272651119411186,
"loss": 0.6661,
"step": 1555
},
{
"epoch": 2.7154046997389036,
"grad_norm": 1.9296875,
"learning_rate": 0.0001825553021221716,
"loss": 0.6695,
"step": 1560
},
{
"epoch": 2.724107919930374,
"grad_norm": 1.453125,
"learning_rate": 0.00018238332978481148,
"loss": 0.6592,
"step": 1565
},
{
"epoch": 2.732811140121845,
"grad_norm": 1.0859375,
"learning_rate": 0.0001822105957720025,
"loss": 0.6587,
"step": 1570
},
{
"epoch": 2.741514360313316,
"grad_norm": 0.76171875,
"learning_rate": 0.00018203710168075788,
"loss": 0.6635,
"step": 1575
},
{
"epoch": 2.7502175805047866,
"grad_norm": 0.91796875,
"learning_rate": 0.00018186284911511787,
"loss": 0.6567,
"step": 1580
},
{
"epoch": 2.7589208006962576,
"grad_norm": 0.8125,
"learning_rate": 0.0001816878396861355,
"loss": 0.6543,
"step": 1585
},
{
"epoch": 2.7676240208877285,
"grad_norm": 1.2421875,
"learning_rate": 0.0001815120750118611,
"loss": 0.6662,
"step": 1590
},
{
"epoch": 2.7763272410791995,
"grad_norm": 0.875,
"learning_rate": 0.0001813355567173279,
"loss": 0.6637,
"step": 1595
},
{
"epoch": 2.78503046127067,
"grad_norm": 1.4296875,
"learning_rate": 0.00018115828643453647,
"loss": 0.6598,
"step": 1600
},
{
"epoch": 2.793733681462141,
"grad_norm": 0.76953125,
"learning_rate": 0.0001809802658024401,
"loss": 0.6734,
"step": 1605
},
{
"epoch": 2.802436901653612,
"grad_norm": 1.734375,
"learning_rate": 0.0001808014964669293,
"loss": 0.6547,
"step": 1610
},
{
"epoch": 2.8111401218450824,
"grad_norm": 1.0859375,
"learning_rate": 0.0001806219800808168,
"loss": 0.6662,
"step": 1615
},
{
"epoch": 2.8198433420365534,
"grad_norm": 1.015625,
"learning_rate": 0.00018044171830382215,
"loss": 0.658,
"step": 1620
},
{
"epoch": 2.8285465622280244,
"grad_norm": 1.0078125,
"learning_rate": 0.0001802607128025564,
"loss": 0.6574,
"step": 1625
},
{
"epoch": 2.8372497824194953,
"grad_norm": 1.09375,
"learning_rate": 0.0001800789652505068,
"loss": 0.6631,
"step": 1630
},
{
"epoch": 2.8459530026109663,
"grad_norm": 0.62109375,
"learning_rate": 0.00017989647732802113,
"loss": 0.6606,
"step": 1635
},
{
"epoch": 2.854656222802437,
"grad_norm": 0.875,
"learning_rate": 0.00017971325072229226,
"loss": 0.6759,
"step": 1640
},
{
"epoch": 2.8633594429939078,
"grad_norm": 1.1015625,
"learning_rate": 0.00017952928712734268,
"loss": 0.6751,
"step": 1645
},
{
"epoch": 2.8720626631853787,
"grad_norm": 0.9765625,
"learning_rate": 0.00017934458824400858,
"loss": 0.6604,
"step": 1650
},
{
"epoch": 2.8807658833768492,
"grad_norm": 0.90625,
"learning_rate": 0.00017915915577992433,
"loss": 0.6528,
"step": 1655
},
{
"epoch": 2.88946910356832,
"grad_norm": 1.109375,
"learning_rate": 0.00017897299144950662,
"loss": 0.653,
"step": 1660
},
{
"epoch": 2.898172323759791,
"grad_norm": 0.78515625,
"learning_rate": 0.00017878609697393868,
"loss": 0.6757,
"step": 1665
},
{
"epoch": 2.906875543951262,
"grad_norm": 0.7265625,
"learning_rate": 0.00017859847408115414,
"loss": 0.6608,
"step": 1670
},
{
"epoch": 2.9155787641427326,
"grad_norm": 3.5625,
"learning_rate": 0.00017841012450582134,
"loss": 0.6624,
"step": 1675
},
{
"epoch": 2.9242819843342036,
"grad_norm": 0.8203125,
"learning_rate": 0.00017822104998932713,
"loss": 0.671,
"step": 1680
},
{
"epoch": 2.9329852045256746,
"grad_norm": 2.0,
"learning_rate": 0.00017803125227976082,
"loss": 0.6495,
"step": 1685
},
{
"epoch": 2.941688424717145,
"grad_norm": 1.203125,
"learning_rate": 0.00017784073313189795,
"loss": 0.6729,
"step": 1690
},
{
"epoch": 2.950391644908616,
"grad_norm": 0.69140625,
"learning_rate": 0.00017764949430718426,
"loss": 0.6656,
"step": 1695
},
{
"epoch": 2.959094865100087,
"grad_norm": 0.6796875,
"learning_rate": 0.00017745753757371905,
"loss": 0.6674,
"step": 1700
},
{
"epoch": 2.967798085291558,
"grad_norm": 1.375,
"learning_rate": 0.00017726486470623926,
"loss": 0.6585,
"step": 1705
},
{
"epoch": 2.976501305483029,
"grad_norm": 1.5234375,
"learning_rate": 0.00017707147748610274,
"loss": 0.6659,
"step": 1710
},
{
"epoch": 2.9852045256744995,
"grad_norm": 0.9296875,
"learning_rate": 0.00017687737770127185,
"loss": 0.67,
"step": 1715
},
{
"epoch": 2.9939077458659704,
"grad_norm": 0.80078125,
"learning_rate": 0.00017668256714629713,
"loss": 0.6545,
"step": 1720
},
{
"epoch": 2.9991296779808527,
"eval_loss": 2.432891607284546,
"eval_runtime": 1.0987,
"eval_samples_per_second": 5.461,
"eval_steps_per_second": 0.91,
"step": 1723
},
{
"epoch": 3.0026109660574414,
"grad_norm": 0.9609375,
"learning_rate": 0.00017648704762230036,
"loss": 0.6195,
"step": 1725
},
{
"epoch": 3.011314186248912,
"grad_norm": 1.46875,
"learning_rate": 0.00017629082093695823,
"loss": 0.5228,
"step": 1730
},
{
"epoch": 3.020017406440383,
"grad_norm": 1.71875,
"learning_rate": 0.00017609388890448547,
"loss": 0.5116,
"step": 1735
},
{
"epoch": 3.028720626631854,
"grad_norm": 0.8515625,
"learning_rate": 0.00017589625334561801,
"loss": 0.5045,
"step": 1740
},
{
"epoch": 3.037423846823325,
"grad_norm": 0.68359375,
"learning_rate": 0.00017569791608759635,
"loss": 0.51,
"step": 1745
},
{
"epoch": 3.0461270670147953,
"grad_norm": 0.734375,
"learning_rate": 0.00017549887896414851,
"loss": 0.5144,
"step": 1750
},
{
"epoch": 3.0548302872062663,
"grad_norm": 0.6953125,
"learning_rate": 0.0001752991438154731,
"loss": 0.5033,
"step": 1755
},
{
"epoch": 3.063533507397737,
"grad_norm": 1.0,
"learning_rate": 0.00017509871248822236,
"loss": 0.5268,
"step": 1760
},
{
"epoch": 3.072236727589208,
"grad_norm": 0.8046875,
"learning_rate": 0.00017489758683548502,
"loss": 0.5163,
"step": 1765
},
{
"epoch": 3.0809399477806787,
"grad_norm": 0.75,
"learning_rate": 0.00017469576871676922,
"loss": 0.5165,
"step": 1770
},
{
"epoch": 3.0896431679721497,
"grad_norm": 0.875,
"learning_rate": 0.00017449325999798528,
"loss": 0.5237,
"step": 1775
},
{
"epoch": 3.0983463881636206,
"grad_norm": 0.8203125,
"learning_rate": 0.00017429006255142851,
"loss": 0.5108,
"step": 1780
},
{
"epoch": 3.1070496083550916,
"grad_norm": 0.828125,
"learning_rate": 0.0001740861782557618,
"loss": 0.5086,
"step": 1785
},
{
"epoch": 3.115752828546562,
"grad_norm": 0.97265625,
"learning_rate": 0.0001738816089959983,
"loss": 0.523,
"step": 1790
},
{
"epoch": 3.124456048738033,
"grad_norm": 1.7109375,
"learning_rate": 0.00017367635666348406,
"loss": 0.5265,
"step": 1795
},
{
"epoch": 3.133159268929504,
"grad_norm": 1.4453125,
"learning_rate": 0.00017347042315588046,
"loss": 0.5328,
"step": 1800
},
{
"epoch": 3.1418624891209745,
"grad_norm": 0.828125,
"learning_rate": 0.00017326381037714668,
"loss": 0.5294,
"step": 1805
},
{
"epoch": 3.1505657093124455,
"grad_norm": 0.78125,
"learning_rate": 0.00017305652023752205,
"loss": 0.5264,
"step": 1810
},
{
"epoch": 3.1592689295039165,
"grad_norm": 0.88671875,
"learning_rate": 0.00017284855465350856,
"loss": 0.5164,
"step": 1815
},
{
"epoch": 3.1679721496953874,
"grad_norm": 1.0078125,
"learning_rate": 0.0001726399155478529,
"loss": 0.5269,
"step": 1820
},
{
"epoch": 3.176675369886858,
"grad_norm": 0.74609375,
"learning_rate": 0.00017243060484952894,
"loss": 0.5237,
"step": 1825
},
{
"epoch": 3.185378590078329,
"grad_norm": 0.7109375,
"learning_rate": 0.00017222062449371962,
"loss": 0.5189,
"step": 1830
},
{
"epoch": 3.1940818102698,
"grad_norm": 0.69921875,
"learning_rate": 0.0001720099764217993,
"loss": 0.5306,
"step": 1835
},
{
"epoch": 3.202785030461271,
"grad_norm": 0.76171875,
"learning_rate": 0.00017179866258131568,
"loss": 0.5401,
"step": 1840
},
{
"epoch": 3.2114882506527413,
"grad_norm": 0.734375,
"learning_rate": 0.00017158668492597186,
"loss": 0.5254,
"step": 1845
},
{
"epoch": 3.2201914708442123,
"grad_norm": 0.70703125,
"learning_rate": 0.00017137404541560817,
"loss": 0.5306,
"step": 1850
},
{
"epoch": 3.2288946910356833,
"grad_norm": 0.91015625,
"learning_rate": 0.00017116074601618417,
"loss": 0.5299,
"step": 1855
},
{
"epoch": 3.2375979112271542,
"grad_norm": 0.828125,
"learning_rate": 0.00017094678869976045,
"loss": 0.53,
"step": 1860
},
{
"epoch": 3.2463011314186248,
"grad_norm": 0.75,
"learning_rate": 0.0001707321754444803,
"loss": 0.5422,
"step": 1865
},
{
"epoch": 3.2550043516100957,
"grad_norm": 0.734375,
"learning_rate": 0.00017051690823455162,
"loss": 0.5357,
"step": 1870
},
{
"epoch": 3.2637075718015667,
"grad_norm": 0.70703125,
"learning_rate": 0.00017030098906022832,
"loss": 0.5355,
"step": 1875
},
{
"epoch": 3.272410791993037,
"grad_norm": 0.73828125,
"learning_rate": 0.0001700844199177921,
"loss": 0.5439,
"step": 1880
},
{
"epoch": 3.281114012184508,
"grad_norm": 0.73828125,
"learning_rate": 0.00016986720280953396,
"loss": 0.5294,
"step": 1885
},
{
"epoch": 3.289817232375979,
"grad_norm": 1.015625,
"learning_rate": 0.0001696493397437357,
"loss": 0.5485,
"step": 1890
},
{
"epoch": 3.29852045256745,
"grad_norm": 1.6484375,
"learning_rate": 0.0001694308327346512,
"loss": 0.5429,
"step": 1895
},
{
"epoch": 3.307223672758921,
"grad_norm": 0.84765625,
"learning_rate": 0.0001692116838024881,
"loss": 0.5518,
"step": 1900
},
{
"epoch": 3.3159268929503916,
"grad_norm": 0.95703125,
"learning_rate": 0.00016899189497338876,
"loss": 0.5429,
"step": 1905
},
{
"epoch": 3.3246301131418625,
"grad_norm": 0.7734375,
"learning_rate": 0.00016877146827941187,
"loss": 0.5392,
"step": 1910
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.87109375,
"learning_rate": 0.00016855040575851335,
"loss": 0.5338,
"step": 1915
},
{
"epoch": 3.342036553524804,
"grad_norm": 0.9609375,
"learning_rate": 0.00016832870945452776,
"loss": 0.545,
"step": 1920
},
{
"epoch": 3.350739773716275,
"grad_norm": 0.828125,
"learning_rate": 0.00016810638141714934,
"loss": 0.56,
"step": 1925
},
{
"epoch": 3.359442993907746,
"grad_norm": 2.671875,
"learning_rate": 0.0001678834237019129,
"loss": 0.5483,
"step": 1930
},
{
"epoch": 3.368146214099217,
"grad_norm": 0.8125,
"learning_rate": 0.00016765983837017503,
"loss": 0.5448,
"step": 1935
},
{
"epoch": 3.3768494342906874,
"grad_norm": 0.89453125,
"learning_rate": 0.00016743562748909493,
"loss": 0.5463,
"step": 1940
},
{
"epoch": 3.3855526544821584,
"grad_norm": 1.1015625,
"learning_rate": 0.00016721079313161534,
"loss": 0.5518,
"step": 1945
},
{
"epoch": 3.3942558746736293,
"grad_norm": 0.78515625,
"learning_rate": 0.00016698533737644327,
"loss": 0.551,
"step": 1950
},
{
"epoch": 3.4029590948651,
"grad_norm": 0.74609375,
"learning_rate": 0.000166759262308031,
"loss": 0.5452,
"step": 1955
},
{
"epoch": 3.411662315056571,
"grad_norm": 0.75390625,
"learning_rate": 0.00016653257001655652,
"loss": 0.5371,
"step": 1960
},
{
"epoch": 3.4203655352480418,
"grad_norm": 0.7265625,
"learning_rate": 0.00016630526259790455,
"loss": 0.5615,
"step": 1965
},
{
"epoch": 3.4290687554395127,
"grad_norm": 0.953125,
"learning_rate": 0.00016607734215364674,
"loss": 0.5466,
"step": 1970
},
{
"epoch": 3.4377719756309837,
"grad_norm": 0.93359375,
"learning_rate": 0.00016584881079102263,
"loss": 0.554,
"step": 1975
},
{
"epoch": 3.446475195822454,
"grad_norm": 0.91015625,
"learning_rate": 0.00016561967062292,
"loss": 0.5541,
"step": 1980
},
{
"epoch": 3.455178416013925,
"grad_norm": 0.765625,
"learning_rate": 0.00016538992376785529,
"loss": 0.5476,
"step": 1985
},
{
"epoch": 3.463881636205396,
"grad_norm": 0.875,
"learning_rate": 0.0001651595723499541,
"loss": 0.5543,
"step": 1990
},
{
"epoch": 3.4725848563968666,
"grad_norm": 1.0234375,
"learning_rate": 0.0001649286184989315,
"loss": 0.5547,
"step": 1995
},
{
"epoch": 3.4812880765883376,
"grad_norm": 0.93359375,
"learning_rate": 0.00016469706435007236,
"loss": 0.5467,
"step": 2000
},
{
"epoch": 3.4899912967798086,
"grad_norm": 1.5546875,
"learning_rate": 0.0001644649120442116,
"loss": 0.539,
"step": 2005
},
{
"epoch": 3.4986945169712795,
"grad_norm": 0.9609375,
"learning_rate": 0.00016423216372771443,
"loss": 0.5448,
"step": 2010
},
{
"epoch": 3.5073977371627505,
"grad_norm": 0.71875,
"learning_rate": 0.0001639988215524565,
"loss": 0.5639,
"step": 2015
},
{
"epoch": 3.516100957354221,
"grad_norm": 0.7734375,
"learning_rate": 0.0001637648876758039,
"loss": 0.5511,
"step": 2020
},
{
"epoch": 3.524804177545692,
"grad_norm": 0.94921875,
"learning_rate": 0.00016353036426059334,
"loss": 0.5438,
"step": 2025
},
{
"epoch": 3.5335073977371625,
"grad_norm": 0.88671875,
"learning_rate": 0.0001632952534751122,
"loss": 0.548,
"step": 2030
},
{
"epoch": 3.5422106179286335,
"grad_norm": 0.8515625,
"learning_rate": 0.00016305955749307816,
"loss": 0.5532,
"step": 2035
},
{
"epoch": 3.5509138381201044,
"grad_norm": 0.796875,
"learning_rate": 0.00016282327849361967,
"loss": 0.5432,
"step": 2040
},
{
"epoch": 3.5596170583115754,
"grad_norm": 0.78515625,
"learning_rate": 0.00016258641866125518,
"loss": 0.551,
"step": 2045
},
{
"epoch": 3.5683202785030463,
"grad_norm": 0.75,
"learning_rate": 0.00016234898018587337,
"loss": 0.5454,
"step": 2050
},
{
"epoch": 3.577023498694517,
"grad_norm": 0.69140625,
"learning_rate": 0.00016211096526271273,
"loss": 0.5555,
"step": 2055
},
{
"epoch": 3.585726718885988,
"grad_norm": 0.8359375,
"learning_rate": 0.00016187237609234132,
"loss": 0.5503,
"step": 2060
},
{
"epoch": 3.594429939077459,
"grad_norm": 0.9765625,
"learning_rate": 0.00016163321488063637,
"loss": 0.5432,
"step": 2065
},
{
"epoch": 3.6031331592689293,
"grad_norm": 0.84375,
"learning_rate": 0.000161393483838764,
"loss": 0.5531,
"step": 2070
},
{
"epoch": 3.6118363794604003,
"grad_norm": 0.7734375,
"learning_rate": 0.0001611531851831586,
"loss": 0.5479,
"step": 2075
},
{
"epoch": 3.620539599651871,
"grad_norm": 0.75390625,
"learning_rate": 0.0001609123211355025,
"loss": 0.553,
"step": 2080
},
{
"epoch": 3.629242819843342,
"grad_norm": 1.8359375,
"learning_rate": 0.00016067089392270533,
"loss": 0.5554,
"step": 2085
},
{
"epoch": 3.637946040034813,
"grad_norm": 0.75,
"learning_rate": 0.00016042890577688349,
"loss": 0.5501,
"step": 2090
},
{
"epoch": 3.6466492602262837,
"grad_norm": 0.78125,
"learning_rate": 0.0001601863589353395,
"loss": 0.5488,
"step": 2095
},
{
"epoch": 3.6553524804177546,
"grad_norm": 0.71484375,
"learning_rate": 0.00015994325564054122,
"loss": 0.5618,
"step": 2100
},
{
"epoch": 3.664055700609225,
"grad_norm": 0.765625,
"learning_rate": 0.00015969959814010132,
"loss": 0.5526,
"step": 2105
},
{
"epoch": 3.672758920800696,
"grad_norm": 0.890625,
"learning_rate": 0.00015945538868675628,
"loss": 0.5492,
"step": 2110
},
{
"epoch": 3.681462140992167,
"grad_norm": 0.7578125,
"learning_rate": 0.0001592106295383458,
"loss": 0.5558,
"step": 2115
},
{
"epoch": 3.690165361183638,
"grad_norm": 0.765625,
"learning_rate": 0.00015896532295779157,
"loss": 0.5576,
"step": 2120
},
{
"epoch": 3.698868581375109,
"grad_norm": 0.88671875,
"learning_rate": 0.00015871947121307676,
"loss": 0.5514,
"step": 2125
},
{
"epoch": 3.7075718015665795,
"grad_norm": 0.875,
"learning_rate": 0.0001584730765772248,
"loss": 0.5615,
"step": 2130
},
{
"epoch": 3.7162750217580505,
"grad_norm": 0.796875,
"learning_rate": 0.00015822614132827837,
"loss": 0.5489,
"step": 2135
},
{
"epoch": 3.7249782419495214,
"grad_norm": 0.91015625,
"learning_rate": 0.00015797866774927848,
"loss": 0.5507,
"step": 2140
},
{
"epoch": 3.733681462140992,
"grad_norm": 0.67578125,
"learning_rate": 0.0001577306581282432,
"loss": 0.5574,
"step": 2145
},
{
"epoch": 3.742384682332463,
"grad_norm": 0.91015625,
"learning_rate": 0.00015748211475814658,
"loss": 0.5579,
"step": 2150
},
{
"epoch": 3.751087902523934,
"grad_norm": 0.89453125,
"learning_rate": 0.00015723303993689754,
"loss": 0.5736,
"step": 2155
},
{
"epoch": 3.759791122715405,
"grad_norm": 0.9296875,
"learning_rate": 0.0001569834359673184,
"loss": 0.553,
"step": 2160
},
{
"epoch": 3.768494342906876,
"grad_norm": 0.828125,
"learning_rate": 0.00015673330515712382,
"loss": 0.5617,
"step": 2165
},
{
"epoch": 3.7771975630983463,
"grad_norm": 1.0625,
"learning_rate": 0.00015648264981889934,
"loss": 0.5583,
"step": 2170
},
{
"epoch": 3.7859007832898173,
"grad_norm": 0.73828125,
"learning_rate": 0.00015623147227008006,
"loss": 0.5584,
"step": 2175
},
{
"epoch": 3.7946040034812882,
"grad_norm": 0.72265625,
"learning_rate": 0.00015597977483292907,
"loss": 0.5559,
"step": 2180
},
{
"epoch": 3.8033072236727588,
"grad_norm": 1.046875,
"learning_rate": 0.00015572755983451626,
"loss": 0.5543,
"step": 2185
},
{
"epoch": 3.8120104438642297,
"grad_norm": 0.70703125,
"learning_rate": 0.00015547482960669645,
"loss": 0.5554,
"step": 2190
},
{
"epoch": 3.8207136640557007,
"grad_norm": 0.7890625,
"learning_rate": 0.00015522158648608817,
"loss": 0.5665,
"step": 2195
},
{
"epoch": 3.8294168842471716,
"grad_norm": 0.78125,
"learning_rate": 0.00015496783281405177,
"loss": 0.5614,
"step": 2200
},
{
"epoch": 3.838120104438642,
"grad_norm": 0.69921875,
"learning_rate": 0.00015471357093666804,
"loss": 0.5596,
"step": 2205
},
{
"epoch": 3.846823324630113,
"grad_norm": 0.90625,
"learning_rate": 0.0001544588032047163,
"loss": 0.553,
"step": 2210
},
{
"epoch": 3.855526544821584,
"grad_norm": 0.81640625,
"learning_rate": 0.0001542035319736528,
"loss": 0.549,
"step": 2215
},
{
"epoch": 3.8642297650130546,
"grad_norm": 0.75,
"learning_rate": 0.0001539477596035888,
"loss": 0.5562,
"step": 2220
},
{
"epoch": 3.8729329852045256,
"grad_norm": 1.046875,
"learning_rate": 0.00015369148845926893,
"loss": 0.5658,
"step": 2225
},
{
"epoch": 3.8816362053959965,
"grad_norm": 0.73046875,
"learning_rate": 0.00015343472091004925,
"loss": 0.5625,
"step": 2230
},
{
"epoch": 3.8903394255874675,
"grad_norm": 0.7890625,
"learning_rate": 0.00015317745932987524,
"loss": 0.5613,
"step": 2235
},
{
"epoch": 3.8990426457789384,
"grad_norm": 1.078125,
"learning_rate": 0.00015291970609726007,
"loss": 0.567,
"step": 2240
},
{
"epoch": 3.907745865970409,
"grad_norm": 0.796875,
"learning_rate": 0.0001526614635952624,
"loss": 0.568,
"step": 2245
},
{
"epoch": 3.91644908616188,
"grad_norm": 0.90625,
"learning_rate": 0.0001524027342114644,
"loss": 0.5671,
"step": 2250
},
{
"epoch": 3.925152306353351,
"grad_norm": 0.890625,
"learning_rate": 0.0001521435203379498,
"loss": 0.5538,
"step": 2255
},
{
"epoch": 3.9338555265448214,
"grad_norm": 0.73046875,
"learning_rate": 0.00015188382437128167,
"loss": 0.5624,
"step": 2260
},
{
"epoch": 3.9425587467362924,
"grad_norm": 0.99609375,
"learning_rate": 0.00015162364871248023,
"loss": 0.5491,
"step": 2265
},
{
"epoch": 3.9512619669277633,
"grad_norm": 0.7421875,
"learning_rate": 0.0001513629957670007,
"loss": 0.5575,
"step": 2270
},
{
"epoch": 3.9599651871192343,
"grad_norm": 0.76171875,
"learning_rate": 0.00015110186794471103,
"loss": 0.5639,
"step": 2275
},
{
"epoch": 3.968668407310705,
"grad_norm": 0.85546875,
"learning_rate": 0.00015084026765986979,
"loss": 0.564,
"step": 2280
},
{
"epoch": 3.9773716275021758,
"grad_norm": 0.78125,
"learning_rate": 0.00015057819733110348,
"loss": 0.569,
"step": 2285
},
{
"epoch": 3.9860748476936467,
"grad_norm": 0.8671875,
"learning_rate": 0.00015031565938138458,
"loss": 0.5676,
"step": 2290
},
{
"epoch": 3.9947780678851172,
"grad_norm": 0.82421875,
"learning_rate": 0.0001500526562380089,
"loss": 0.5693,
"step": 2295
},
{
"epoch": 4.0,
"eval_loss": 2.70300030708313,
"eval_runtime": 0.778,
"eval_samples_per_second": 7.712,
"eval_steps_per_second": 1.285,
"step": 2298
},
{
"epoch": 4.003481288076588,
"grad_norm": 0.671875,
"learning_rate": 0.00014978919033257316,
"loss": 0.5013,
"step": 2300
},
{
"epoch": 4.012184508268059,
"grad_norm": 0.9453125,
"learning_rate": 0.00014952526410095258,
"loss": 0.412,
"step": 2305
},
{
"epoch": 4.02088772845953,
"grad_norm": 0.75390625,
"learning_rate": 0.00014926087998327837,
"loss": 0.4225,
"step": 2310
},
{
"epoch": 4.029590948651001,
"grad_norm": 0.85546875,
"learning_rate": 0.00014899604042391506,
"loss": 0.4255,
"step": 2315
},
{
"epoch": 4.038294168842472,
"grad_norm": 0.859375,
"learning_rate": 0.000148730747871438,
"loss": 0.4108,
"step": 2320
},
{
"epoch": 4.046997389033942,
"grad_norm": 0.765625,
"learning_rate": 0.0001484650047786107,
"loss": 0.4152,
"step": 2325
},
{
"epoch": 4.055700609225413,
"grad_norm": 0.76171875,
"learning_rate": 0.00014819881360236207,
"loss": 0.4197,
"step": 2330
},
{
"epoch": 4.064403829416884,
"grad_norm": 0.91015625,
"learning_rate": 0.00014793217680376394,
"loss": 0.4203,
"step": 2335
},
{
"epoch": 4.073107049608355,
"grad_norm": 0.89453125,
"learning_rate": 0.00014766509684800794,
"loss": 0.4138,
"step": 2340
},
{
"epoch": 4.081810269799826,
"grad_norm": 0.796875,
"learning_rate": 0.00014739757620438307,
"loss": 0.4167,
"step": 2345
},
{
"epoch": 4.090513489991297,
"grad_norm": 0.7578125,
"learning_rate": 0.00014712961734625264,
"loss": 0.4183,
"step": 2350
},
{
"epoch": 4.099216710182768,
"grad_norm": 1.0,
"learning_rate": 0.0001468612227510315,
"loss": 0.4302,
"step": 2355
},
{
"epoch": 4.107919930374239,
"grad_norm": 0.8046875,
"learning_rate": 0.00014659239490016302,
"loss": 0.4329,
"step": 2360
},
{
"epoch": 4.116623150565709,
"grad_norm": 1.1328125,
"learning_rate": 0.00014632313627909642,
"loss": 0.4304,
"step": 2365
},
{
"epoch": 4.12532637075718,
"grad_norm": 1.3125,
"learning_rate": 0.00014605344937726345,
"loss": 0.4194,
"step": 2370
},
{
"epoch": 4.134029590948651,
"grad_norm": 0.8828125,
"learning_rate": 0.00014578333668805558,
"loss": 0.4195,
"step": 2375
},
{
"epoch": 4.142732811140122,
"grad_norm": 0.7578125,
"learning_rate": 0.0001455128007088009,
"loss": 0.4354,
"step": 2380
},
{
"epoch": 4.151436031331593,
"grad_norm": 0.96484375,
"learning_rate": 0.00014524184394074102,
"loss": 0.442,
"step": 2385
},
{
"epoch": 4.160139251523064,
"grad_norm": 0.86328125,
"learning_rate": 0.00014497046888900801,
"loss": 0.433,
"step": 2390
},
{
"epoch": 4.168842471714535,
"grad_norm": 1.2734375,
"learning_rate": 0.00014469867806260115,
"loss": 0.4325,
"step": 2395
},
{
"epoch": 4.177545691906005,
"grad_norm": 0.7578125,
"learning_rate": 0.00014442647397436365,
"loss": 0.4255,
"step": 2400
},
{
"epoch": 4.186248912097476,
"grad_norm": 0.80859375,
"learning_rate": 0.0001441538591409598,
"loss": 0.4419,
"step": 2405
},
{
"epoch": 4.194952132288947,
"grad_norm": 1.0625,
"learning_rate": 0.00014388083608285113,
"loss": 0.4354,
"step": 2410
},
{
"epoch": 4.203655352480418,
"grad_norm": 0.80078125,
"learning_rate": 0.00014360740732427367,
"loss": 0.4308,
"step": 2415
},
{
"epoch": 4.212358572671889,
"grad_norm": 1.015625,
"learning_rate": 0.00014333357539321416,
"loss": 0.434,
"step": 2420
},
{
"epoch": 4.22106179286336,
"grad_norm": 0.8359375,
"learning_rate": 0.00014305934282138701,
"loss": 0.4402,
"step": 2425
},
{
"epoch": 4.2297650130548305,
"grad_norm": 0.78125,
"learning_rate": 0.00014278471214421073,
"loss": 0.4298,
"step": 2430
},
{
"epoch": 4.2384682332463015,
"grad_norm": 0.765625,
"learning_rate": 0.0001425096859007844,
"loss": 0.4332,
"step": 2435
},
{
"epoch": 4.247171453437772,
"grad_norm": 1.3515625,
"learning_rate": 0.0001422342666338645,
"loss": 0.4441,
"step": 2440
},
{
"epoch": 4.2558746736292425,
"grad_norm": 0.953125,
"learning_rate": 0.00014195845688984104,
"loss": 0.435,
"step": 2445
},
{
"epoch": 4.2645778938207135,
"grad_norm": 0.81640625,
"learning_rate": 0.00014168225921871433,
"loss": 0.4355,
"step": 2450
},
{
"epoch": 4.2732811140121845,
"grad_norm": 0.8046875,
"learning_rate": 0.00014140567617407105,
"loss": 0.4422,
"step": 2455
},
{
"epoch": 4.281984334203655,
"grad_norm": 0.8984375,
"learning_rate": 0.00014112871031306119,
"loss": 0.4347,
"step": 2460
},
{
"epoch": 4.290687554395126,
"grad_norm": 0.74609375,
"learning_rate": 0.00014085136419637369,
"loss": 0.4353,
"step": 2465
},
{
"epoch": 4.299390774586597,
"grad_norm": 0.78125,
"learning_rate": 0.00014057364038821347,
"loss": 0.4425,
"step": 2470
},
{
"epoch": 4.308093994778067,
"grad_norm": 0.87890625,
"learning_rate": 0.00014029554145627714,
"loss": 0.4419,
"step": 2475
},
{
"epoch": 4.316797214969538,
"grad_norm": 0.796875,
"learning_rate": 0.00014001706997172973,
"loss": 0.4403,
"step": 2480
},
{
"epoch": 4.325500435161009,
"grad_norm": 0.83984375,
"learning_rate": 0.00013973822850918055,
"loss": 0.4427,
"step": 2485
},
{
"epoch": 4.33420365535248,
"grad_norm": 0.83203125,
"learning_rate": 0.0001394590196466596,
"loss": 0.4351,
"step": 2490
},
{
"epoch": 4.342906875543951,
"grad_norm": 0.74609375,
"learning_rate": 0.00013917944596559376,
"loss": 0.437,
"step": 2495
},
{
"epoch": 4.351610095735422,
"grad_norm": 0.9375,
"learning_rate": 0.0001388995100507827,
"loss": 0.4383,
"step": 2500
},
{
"epoch": 4.360313315926893,
"grad_norm": 0.75390625,
"learning_rate": 0.0001386192144903752,
"loss": 0.4403,
"step": 2505
},
{
"epoch": 4.369016536118364,
"grad_norm": 0.83984375,
"learning_rate": 0.00013833856187584514,
"loss": 0.4474,
"step": 2510
},
{
"epoch": 4.377719756309834,
"grad_norm": 1.046875,
"learning_rate": 0.00013805755480196755,
"loss": 0.4424,
"step": 2515
},
{
"epoch": 4.386422976501305,
"grad_norm": 0.84375,
"learning_rate": 0.0001377761958667946,
"loss": 0.4495,
"step": 2520
},
{
"epoch": 4.395126196692776,
"grad_norm": 1.140625,
"learning_rate": 0.00013749448767163156,
"loss": 0.4468,
"step": 2525
},
{
"epoch": 4.403829416884247,
"grad_norm": 1.1015625,
"learning_rate": 0.0001372124328210129,
"loss": 0.4472,
"step": 2530
},
{
"epoch": 4.412532637075718,
"grad_norm": 0.90625,
"learning_rate": 0.0001369300339226779,
"loss": 0.4459,
"step": 2535
},
{
"epoch": 4.421235857267189,
"grad_norm": 1.0546875,
"learning_rate": 0.000136647293587547,
"loss": 0.4462,
"step": 2540
},
{
"epoch": 4.42993907745866,
"grad_norm": 0.97265625,
"learning_rate": 0.00013636421442969718,
"loss": 0.4439,
"step": 2545
},
{
"epoch": 4.438642297650131,
"grad_norm": 0.921875,
"learning_rate": 0.00013608079906633807,
"loss": 0.4468,
"step": 2550
},
{
"epoch": 4.447345517841601,
"grad_norm": 1.0234375,
"learning_rate": 0.00013579705011778766,
"loss": 0.4528,
"step": 2555
},
{
"epoch": 4.456048738033072,
"grad_norm": 0.93359375,
"learning_rate": 0.00013551297020744825,
"loss": 0.4449,
"step": 2560
},
{
"epoch": 4.464751958224543,
"grad_norm": 0.796875,
"learning_rate": 0.0001352285619617818,
"loss": 0.4475,
"step": 2565
},
{
"epoch": 4.473455178416014,
"grad_norm": 0.7265625,
"learning_rate": 0.00013494382801028615,
"loss": 0.4431,
"step": 2570
},
{
"epoch": 4.482158398607485,
"grad_norm": 0.98046875,
"learning_rate": 0.00013465877098547033,
"loss": 0.4472,
"step": 2575
},
{
"epoch": 4.490861618798956,
"grad_norm": 0.80078125,
"learning_rate": 0.00013437339352283026,
"loss": 0.4492,
"step": 2580
},
{
"epoch": 4.499564838990427,
"grad_norm": 0.80859375,
"learning_rate": 0.00013408769826082467,
"loss": 0.46,
"step": 2585
},
{
"epoch": 4.508268059181898,
"grad_norm": 0.77734375,
"learning_rate": 0.00013380168784085027,
"loss": 0.449,
"step": 2590
},
{
"epoch": 4.516971279373368,
"grad_norm": 0.8515625,
"learning_rate": 0.00013351536490721784,
"loss": 0.4548,
"step": 2595
},
{
"epoch": 4.525674499564839,
"grad_norm": 0.8125,
"learning_rate": 0.00013322873210712727,
"loss": 0.4428,
"step": 2600
},
{
"epoch": 4.53437771975631,
"grad_norm": 0.98828125,
"learning_rate": 0.00013294179209064348,
"loss": 0.4523,
"step": 2605
},
{
"epoch": 4.543080939947781,
"grad_norm": 0.8984375,
"learning_rate": 0.0001326545475106716,
"loss": 0.4523,
"step": 2610
},
{
"epoch": 4.551784160139252,
"grad_norm": 0.88671875,
"learning_rate": 0.0001323670010229328,
"loss": 0.4463,
"step": 2615
},
{
"epoch": 4.560487380330723,
"grad_norm": 0.87109375,
"learning_rate": 0.00013207915528593933,
"loss": 0.4485,
"step": 2620
},
{
"epoch": 4.569190600522193,
"grad_norm": 0.80859375,
"learning_rate": 0.00013179101296097035,
"loss": 0.4508,
"step": 2625
},
{
"epoch": 4.577893820713664,
"grad_norm": 0.79296875,
"learning_rate": 0.00013150257671204696,
"loss": 0.446,
"step": 2630
},
{
"epoch": 4.586597040905135,
"grad_norm": 0.80078125,
"learning_rate": 0.00013121384920590786,
"loss": 0.448,
"step": 2635
},
{
"epoch": 4.595300261096606,
"grad_norm": 0.8046875,
"learning_rate": 0.00013092483311198444,
"loss": 0.4522,
"step": 2640
},
{
"epoch": 4.604003481288077,
"grad_norm": 0.80859375,
"learning_rate": 0.00013063553110237642,
"loss": 0.4565,
"step": 2645
},
{
"epoch": 4.6127067014795475,
"grad_norm": 0.82421875,
"learning_rate": 0.00013034594585182677,
"loss": 0.4575,
"step": 2650
},
{
"epoch": 4.6214099216710185,
"grad_norm": 0.9140625,
"learning_rate": 0.00013005608003769718,
"loss": 0.4544,
"step": 2655
},
{
"epoch": 4.6301131418624895,
"grad_norm": 1.015625,
"learning_rate": 0.00012976593633994346,
"loss": 0.457,
"step": 2660
},
{
"epoch": 4.63881636205396,
"grad_norm": 0.7734375,
"learning_rate": 0.00012947551744109043,
"loss": 0.4478,
"step": 2665
},
{
"epoch": 4.6475195822454305,
"grad_norm": 0.80078125,
"learning_rate": 0.00012918482602620733,
"loss": 0.4591,
"step": 2670
},
{
"epoch": 4.6562228024369015,
"grad_norm": 0.98046875,
"learning_rate": 0.00012889386478288299,
"loss": 0.4549,
"step": 2675
},
{
"epoch": 4.664926022628372,
"grad_norm": 0.8125,
"learning_rate": 0.00012860263640120085,
"loss": 0.4468,
"step": 2680
},
{
"epoch": 4.673629242819843,
"grad_norm": 0.92578125,
"learning_rate": 0.00012831114357371426,
"loss": 0.444,
"step": 2685
},
{
"epoch": 4.682332463011314,
"grad_norm": 0.90625,
"learning_rate": 0.0001280193889954215,
"loss": 0.4649,
"step": 2690
},
{
"epoch": 4.691035683202785,
"grad_norm": 1.125,
"learning_rate": 0.0001277273753637408,
"loss": 0.4608,
"step": 2695
},
{
"epoch": 4.699738903394256,
"grad_norm": 0.84765625,
"learning_rate": 0.00012743510537848555,
"loss": 0.4522,
"step": 2700
},
{
"epoch": 4.708442123585726,
"grad_norm": 0.77734375,
"learning_rate": 0.0001271425817418392,
"loss": 0.4637,
"step": 2705
},
{
"epoch": 4.717145343777197,
"grad_norm": 0.79296875,
"learning_rate": 0.00012684980715833039,
"loss": 0.4589,
"step": 2710
},
{
"epoch": 4.725848563968668,
"grad_norm": 0.796875,
"learning_rate": 0.0001265567843348078,
"loss": 0.4552,
"step": 2715
},
{
"epoch": 4.734551784160139,
"grad_norm": 0.80859375,
"learning_rate": 0.00012626351598041532,
"loss": 0.4555,
"step": 2720
},
{
"epoch": 4.74325500435161,
"grad_norm": 0.8203125,
"learning_rate": 0.00012597000480656684,
"loss": 0.463,
"step": 2725
},
{
"epoch": 4.751958224543081,
"grad_norm": 0.83984375,
"learning_rate": 0.00012567625352692127,
"loss": 0.462,
"step": 2730
},
{
"epoch": 4.760661444734552,
"grad_norm": 0.76171875,
"learning_rate": 0.00012538226485735735,
"loss": 0.4553,
"step": 2735
},
{
"epoch": 4.769364664926023,
"grad_norm": 0.7890625,
"learning_rate": 0.00012508804151594867,
"loss": 0.4525,
"step": 2740
},
{
"epoch": 4.778067885117493,
"grad_norm": 0.86328125,
"learning_rate": 0.0001247935862229385,
"loss": 0.4609,
"step": 2745
},
{
"epoch": 4.786771105308964,
"grad_norm": 0.77734375,
"learning_rate": 0.00012449890170071454,
"loss": 0.4491,
"step": 2750
},
{
"epoch": 4.795474325500435,
"grad_norm": 0.82421875,
"learning_rate": 0.00012420399067378392,
"loss": 0.4502,
"step": 2755
},
{
"epoch": 4.804177545691906,
"grad_norm": 0.78515625,
"learning_rate": 0.00012390885586874783,
"loss": 0.4527,
"step": 2760
},
{
"epoch": 4.812880765883377,
"grad_norm": 0.73828125,
"learning_rate": 0.0001236135000142765,
"loss": 0.4531,
"step": 2765
},
{
"epoch": 4.821583986074848,
"grad_norm": 0.79296875,
"learning_rate": 0.00012331792584108374,
"loss": 0.4511,
"step": 2770
},
{
"epoch": 4.830287206266319,
"grad_norm": 0.86328125,
"learning_rate": 0.00012302213608190202,
"loss": 0.4504,
"step": 2775
},
{
"epoch": 4.838990426457789,
"grad_norm": 0.796875,
"learning_rate": 0.0001227261334714568,
"loss": 0.4538,
"step": 2780
},
{
"epoch": 4.84769364664926,
"grad_norm": 0.82421875,
"learning_rate": 0.00012242992074644162,
"loss": 0.4585,
"step": 2785
},
{
"epoch": 4.856396866840731,
"grad_norm": 0.83984375,
"learning_rate": 0.0001221335006454925,
"loss": 0.4518,
"step": 2790
},
{
"epoch": 4.865100087032202,
"grad_norm": 0.85546875,
"learning_rate": 0.00012183687590916291,
"loss": 0.4534,
"step": 2795
},
{
"epoch": 4.873803307223673,
"grad_norm": 0.84765625,
"learning_rate": 0.00012154004927989815,
"loss": 0.4543,
"step": 2800
},
{
"epoch": 4.882506527415144,
"grad_norm": 0.8359375,
"learning_rate": 0.00012124302350201016,
"loss": 0.4549,
"step": 2805
},
{
"epoch": 4.891209747606615,
"grad_norm": 1.078125,
"learning_rate": 0.00012094580132165211,
"loss": 0.4405,
"step": 2810
},
{
"epoch": 4.899912967798086,
"grad_norm": 0.86328125,
"learning_rate": 0.00012064838548679307,
"loss": 0.4501,
"step": 2815
},
{
"epoch": 4.908616187989556,
"grad_norm": 0.85546875,
"learning_rate": 0.00012035077874719242,
"loss": 0.4574,
"step": 2820
},
{
"epoch": 4.917319408181027,
"grad_norm": 0.90234375,
"learning_rate": 0.00012005298385437467,
"loss": 0.4515,
"step": 2825
},
{
"epoch": 4.926022628372498,
"grad_norm": 0.8359375,
"learning_rate": 0.00011975500356160383,
"loss": 0.4532,
"step": 2830
},
{
"epoch": 4.934725848563969,
"grad_norm": 0.78125,
"learning_rate": 0.00011945684062385803,
"loss": 0.4533,
"step": 2835
},
{
"epoch": 4.94342906875544,
"grad_norm": 0.84375,
"learning_rate": 0.00011915849779780408,
"loss": 0.4633,
"step": 2840
},
{
"epoch": 4.952132288946911,
"grad_norm": 0.984375,
"learning_rate": 0.00011885997784177196,
"loss": 0.4568,
"step": 2845
},
{
"epoch": 4.960835509138382,
"grad_norm": 0.80859375,
"learning_rate": 0.00011856128351572921,
"loss": 0.4543,
"step": 2850
},
{
"epoch": 4.969538729329852,
"grad_norm": 0.8125,
"learning_rate": 0.00011826241758125565,
"loss": 0.4576,
"step": 2855
},
{
"epoch": 4.978241949521323,
"grad_norm": 0.796875,
"learning_rate": 0.00011796338280151756,
"loss": 0.4595,
"step": 2860
},
{
"epoch": 4.986945169712794,
"grad_norm": 0.75,
"learning_rate": 0.0001176641819412424,
"loss": 0.4549,
"step": 2865
},
{
"epoch": 4.9956483899042645,
"grad_norm": 0.83984375,
"learning_rate": 0.00011736481776669306,
"loss": 0.4555,
"step": 2870
},
{
"epoch": 4.999129677980853,
"eval_loss": 3.144505500793457,
"eval_runtime": 1.1115,
"eval_samples_per_second": 5.398,
"eval_steps_per_second": 0.9,
"step": 2872
},
{
"epoch": 5.0043516100957355,
"grad_norm": 0.66796875,
"learning_rate": 0.00011706529304564235,
"loss": 0.4042,
"step": 2875
},
{
"epoch": 5.013054830287206,
"grad_norm": 0.890625,
"learning_rate": 0.00011676561054734749,
"loss": 0.3352,
"step": 2880
},
{
"epoch": 5.021758050478677,
"grad_norm": 0.79296875,
"learning_rate": 0.00011646577304252433,
"loss": 0.3304,
"step": 2885
},
{
"epoch": 5.030461270670148,
"grad_norm": 0.82421875,
"learning_rate": 0.0001161657833033219,
"loss": 0.3354,
"step": 2890
},
{
"epoch": 5.039164490861618,
"grad_norm": 0.8203125,
"learning_rate": 0.0001158656441032967,
"loss": 0.3342,
"step": 2895
},
{
"epoch": 5.047867711053089,
"grad_norm": 0.75,
"learning_rate": 0.00011556535821738705,
"loss": 0.3344,
"step": 2900
},
{
"epoch": 5.05657093124456,
"grad_norm": 0.7890625,
"learning_rate": 0.00011526492842188745,
"loss": 0.3339,
"step": 2905
},
{
"epoch": 5.065274151436031,
"grad_norm": 0.8125,
"learning_rate": 0.000114964357494423,
"loss": 0.3343,
"step": 2910
},
{
"epoch": 5.073977371627502,
"grad_norm": 0.765625,
"learning_rate": 0.00011466364821392348,
"loss": 0.3391,
"step": 2915
},
{
"epoch": 5.082680591818973,
"grad_norm": 0.88671875,
"learning_rate": 0.00011436280336059799,
"loss": 0.34,
"step": 2920
},
{
"epoch": 5.091383812010444,
"grad_norm": 0.8046875,
"learning_rate": 0.00011406182571590893,
"loss": 0.3388,
"step": 2925
},
{
"epoch": 5.100087032201914,
"grad_norm": 0.75,
"learning_rate": 0.00011376071806254651,
"loss": 0.3371,
"step": 2930
},
{
"epoch": 5.108790252393385,
"grad_norm": 0.80859375,
"learning_rate": 0.00011345948318440289,
"loss": 0.3496,
"step": 2935
},
{
"epoch": 5.117493472584856,
"grad_norm": 0.8203125,
"learning_rate": 0.0001131581238665465,
"loss": 0.3433,
"step": 2940
},
{
"epoch": 5.126196692776327,
"grad_norm": 0.828125,
"learning_rate": 0.00011285664289519626,
"loss": 0.3426,
"step": 2945
},
{
"epoch": 5.134899912967798,
"grad_norm": 0.8515625,
"learning_rate": 0.00011255504305769589,
"loss": 0.3352,
"step": 2950
},
{
"epoch": 5.143603133159269,
"grad_norm": 0.84375,
"learning_rate": 0.00011225332714248804,
"loss": 0.3492,
"step": 2955
},
{
"epoch": 5.15230635335074,
"grad_norm": 0.82421875,
"learning_rate": 0.00011195149793908856,
"loss": 0.338,
"step": 2960
},
{
"epoch": 5.161009573542211,
"grad_norm": 0.80078125,
"learning_rate": 0.00011164955823806079,
"loss": 0.343,
"step": 2965
},
{
"epoch": 5.169712793733681,
"grad_norm": 0.7890625,
"learning_rate": 0.00011134751083098946,
"loss": 0.3407,
"step": 2970
},
{
"epoch": 5.178416013925152,
"grad_norm": 0.8203125,
"learning_rate": 0.00011104535851045539,
"loss": 0.3391,
"step": 2975
},
{
"epoch": 5.187119234116623,
"grad_norm": 0.82421875,
"learning_rate": 0.00011074310407000914,
"loss": 0.3438,
"step": 2980
},
{
"epoch": 5.195822454308094,
"grad_norm": 0.84765625,
"learning_rate": 0.00011044075030414553,
"loss": 0.3394,
"step": 2985
},
{
"epoch": 5.204525674499565,
"grad_norm": 0.8046875,
"learning_rate": 0.00011013830000827767,
"loss": 0.3471,
"step": 2990
},
{
"epoch": 5.213228894691036,
"grad_norm": 0.8203125,
"learning_rate": 0.00010983575597871114,
"loss": 0.3392,
"step": 2995
},
{
"epoch": 5.221932114882507,
"grad_norm": 0.81640625,
"learning_rate": 0.00010953312101261815,
"loss": 0.3436,
"step": 3000
},
{
"epoch": 5.230635335073977,
"grad_norm": 0.8125,
"learning_rate": 0.00010923039790801164,
"loss": 0.3398,
"step": 3005
},
{
"epoch": 5.239338555265448,
"grad_norm": 0.81640625,
"learning_rate": 0.00010892758946371944,
"loss": 0.3469,
"step": 3010
},
{
"epoch": 5.248041775456919,
"grad_norm": 0.86328125,
"learning_rate": 0.00010862469847935841,
"loss": 0.3444,
"step": 3015
},
{
"epoch": 5.25674499564839,
"grad_norm": 0.77734375,
"learning_rate": 0.00010832172775530851,
"loss": 0.3431,
"step": 3020
},
{
"epoch": 5.265448215839861,
"grad_norm": 0.83984375,
"learning_rate": 0.00010801868009268691,
"loss": 0.3513,
"step": 3025
},
{
"epoch": 5.274151436031332,
"grad_norm": 0.8125,
"learning_rate": 0.00010771555829332223,
"loss": 0.3476,
"step": 3030
},
{
"epoch": 5.282854656222803,
"grad_norm": 0.8203125,
"learning_rate": 0.00010741236515972839,
"loss": 0.3471,
"step": 3035
},
{
"epoch": 5.291557876414274,
"grad_norm": 0.95703125,
"learning_rate": 0.0001071091034950788,
"loss": 0.3416,
"step": 3040
},
{
"epoch": 5.300261096605744,
"grad_norm": 0.80859375,
"learning_rate": 0.00010680577610318072,
"loss": 0.3454,
"step": 3045
},
{
"epoch": 5.308964316797215,
"grad_norm": 0.77734375,
"learning_rate": 0.0001065023857884488,
"loss": 0.3486,
"step": 3050
},
{
"epoch": 5.317667536988686,
"grad_norm": 0.984375,
"learning_rate": 0.00010619893535587964,
"loss": 0.3386,
"step": 3055
},
{
"epoch": 5.326370757180157,
"grad_norm": 0.9609375,
"learning_rate": 0.00010589542761102553,
"loss": 0.3418,
"step": 3060
},
{
"epoch": 5.335073977371628,
"grad_norm": 0.87109375,
"learning_rate": 0.00010559186535996873,
"loss": 0.3522,
"step": 3065
},
{
"epoch": 5.3437771975630985,
"grad_norm": 1.0234375,
"learning_rate": 0.00010528825140929541,
"loss": 0.3449,
"step": 3070
},
{
"epoch": 5.3524804177545695,
"grad_norm": 0.85546875,
"learning_rate": 0.00010498458856606972,
"loss": 0.3473,
"step": 3075
},
{
"epoch": 5.36118363794604,
"grad_norm": 0.828125,
"learning_rate": 0.00010468087963780789,
"loss": 0.353,
"step": 3080
},
{
"epoch": 5.3698868581375105,
"grad_norm": 0.9921875,
"learning_rate": 0.00010437712743245209,
"loss": 0.352,
"step": 3085
},
{
"epoch": 5.3785900783289815,
"grad_norm": 0.8828125,
"learning_rate": 0.00010407333475834487,
"loss": 0.354,
"step": 3090
},
{
"epoch": 5.3872932985204525,
"grad_norm": 0.89453125,
"learning_rate": 0.00010376950442420259,
"loss": 0.3436,
"step": 3095
},
{
"epoch": 5.395996518711923,
"grad_norm": 0.8359375,
"learning_rate": 0.00010346563923909014,
"loss": 0.3511,
"step": 3100
},
{
"epoch": 5.404699738903394,
"grad_norm": 0.90234375,
"learning_rate": 0.00010316174201239437,
"loss": 0.3472,
"step": 3105
},
{
"epoch": 5.413402959094865,
"grad_norm": 0.8046875,
"learning_rate": 0.00010285781555379852,
"loss": 0.3449,
"step": 3110
},
{
"epoch": 5.422106179286336,
"grad_norm": 1.015625,
"learning_rate": 0.00010255386267325602,
"loss": 0.3471,
"step": 3115
},
{
"epoch": 5.430809399477806,
"grad_norm": 0.80078125,
"learning_rate": 0.00010224988618096458,
"loss": 0.3523,
"step": 3120
},
{
"epoch": 5.439512619669277,
"grad_norm": 0.86328125,
"learning_rate": 0.00010194588888734027,
"loss": 0.3492,
"step": 3125
},
{
"epoch": 5.448215839860748,
"grad_norm": 0.8828125,
"learning_rate": 0.00010164187360299142,
"loss": 0.3465,
"step": 3130
},
{
"epoch": 5.456919060052219,
"grad_norm": 0.828125,
"learning_rate": 0.00010133784313869277,
"loss": 0.3472,
"step": 3135
},
{
"epoch": 5.46562228024369,
"grad_norm": 0.84375,
"learning_rate": 0.00010103380030535929,
"loss": 0.3558,
"step": 3140
},
{
"epoch": 5.474325500435161,
"grad_norm": 0.8828125,
"learning_rate": 0.0001007297479140204,
"loss": 0.3539,
"step": 3145
},
{
"epoch": 5.483028720626632,
"grad_norm": 0.91796875,
"learning_rate": 0.00010042568877579388,
"loss": 0.3486,
"step": 3150
},
{
"epoch": 5.491731940818102,
"grad_norm": 1.078125,
"learning_rate": 0.00010012162570185983,
"loss": 0.3573,
"step": 3155
},
{
"epoch": 5.500435161009573,
"grad_norm": 0.83984375,
"learning_rate": 9.981756150343485e-05,
"loss": 0.3473,
"step": 3160
},
{
"epoch": 5.509138381201044,
"grad_norm": 0.8359375,
"learning_rate": 9.951349899174577e-05,
"loss": 0.3558,
"step": 3165
},
{
"epoch": 5.517841601392515,
"grad_norm": 0.796875,
"learning_rate": 9.920944097800398e-05,
"loss": 0.3542,
"step": 3170
},
{
"epoch": 5.526544821583986,
"grad_norm": 0.85546875,
"learning_rate": 9.890539027337924e-05,
"loss": 0.3471,
"step": 3175
},
{
"epoch": 5.535248041775457,
"grad_norm": 0.9375,
"learning_rate": 9.860134968897366e-05,
"loss": 0.3553,
"step": 3180
},
{
"epoch": 5.543951261966928,
"grad_norm": 0.81640625,
"learning_rate": 9.829732203579584e-05,
"loss": 0.3558,
"step": 3185
},
{
"epoch": 5.552654482158399,
"grad_norm": 0.78125,
"learning_rate": 9.799331012473493e-05,
"loss": 0.3526,
"step": 3190
},
{
"epoch": 5.56135770234987,
"grad_norm": 0.7890625,
"learning_rate": 9.768931676653427e-05,
"loss": 0.3499,
"step": 3195
},
{
"epoch": 5.57006092254134,
"grad_norm": 0.81640625,
"learning_rate": 9.738534477176596e-05,
"loss": 0.3447,
"step": 3200
},
{
"epoch": 5.578764142732811,
"grad_norm": 0.88671875,
"learning_rate": 9.708139695080441e-05,
"loss": 0.3568,
"step": 3205
},
{
"epoch": 5.587467362924282,
"grad_norm": 0.8046875,
"learning_rate": 9.677747611380058e-05,
"loss": 0.3575,
"step": 3210
},
{
"epoch": 5.596170583115753,
"grad_norm": 0.83984375,
"learning_rate": 9.647358507065594e-05,
"loss": 0.3536,
"step": 3215
},
{
"epoch": 5.604873803307224,
"grad_norm": 0.85546875,
"learning_rate": 9.616972663099647e-05,
"loss": 0.3524,
"step": 3220
},
{
"epoch": 5.613577023498695,
"grad_norm": 0.8046875,
"learning_rate": 9.58659036041468e-05,
"loss": 0.3541,
"step": 3225
},
{
"epoch": 5.622280243690165,
"grad_norm": 0.84765625,
"learning_rate": 9.556211879910414e-05,
"loss": 0.3519,
"step": 3230
},
{
"epoch": 5.630983463881636,
"grad_norm": 0.84375,
"learning_rate": 9.52583750245122e-05,
"loss": 0.3514,
"step": 3235
},
{
"epoch": 5.639686684073107,
"grad_norm": 0.80078125,
"learning_rate": 9.495467508863542e-05,
"loss": 0.3485,
"step": 3240
},
{
"epoch": 5.648389904264578,
"grad_norm": 0.859375,
"learning_rate": 9.465102179933302e-05,
"loss": 0.3547,
"step": 3245
},
{
"epoch": 5.657093124456049,
"grad_norm": 0.80859375,
"learning_rate": 9.434741796403282e-05,
"loss": 0.3549,
"step": 3250
},
{
"epoch": 5.66579634464752,
"grad_norm": 0.79296875,
"learning_rate": 9.404386638970542e-05,
"loss": 0.3502,
"step": 3255
},
{
"epoch": 5.674499564838991,
"grad_norm": 0.87890625,
"learning_rate": 9.37403698828383e-05,
"loss": 0.354,
"step": 3260
},
{
"epoch": 5.683202785030462,
"grad_norm": 0.84765625,
"learning_rate": 9.343693124940977e-05,
"loss": 0.3499,
"step": 3265
},
{
"epoch": 5.691906005221933,
"grad_norm": 0.8359375,
"learning_rate": 9.313355329486318e-05,
"loss": 0.3535,
"step": 3270
},
{
"epoch": 5.700609225413403,
"grad_norm": 0.89453125,
"learning_rate": 9.283023882408065e-05,
"loss": 0.3487,
"step": 3275
},
{
"epoch": 5.709312445604874,
"grad_norm": 0.80078125,
"learning_rate": 9.252699064135758e-05,
"loss": 0.3458,
"step": 3280
},
{
"epoch": 5.718015665796345,
"grad_norm": 0.84375,
"learning_rate": 9.22238115503764e-05,
"loss": 0.3518,
"step": 3285
},
{
"epoch": 5.7267188859878155,
"grad_norm": 0.84765625,
"learning_rate": 9.192070435418079e-05,
"loss": 0.3488,
"step": 3290
},
{
"epoch": 5.7354221061792865,
"grad_norm": 0.74609375,
"learning_rate": 9.161767185514964e-05,
"loss": 0.3529,
"step": 3295
},
{
"epoch": 5.7441253263707575,
"grad_norm": 0.83203125,
"learning_rate": 9.131471685497134e-05,
"loss": 0.3553,
"step": 3300
},
{
"epoch": 5.7528285465622275,
"grad_norm": 0.83203125,
"learning_rate": 9.101184215461774e-05,
"loss": 0.3494,
"step": 3305
},
{
"epoch": 5.7615317667536985,
"grad_norm": 0.8984375,
"learning_rate": 9.070905055431822e-05,
"loss": 0.357,
"step": 3310
},
{
"epoch": 5.7702349869451695,
"grad_norm": 0.796875,
"learning_rate": 9.040634485353389e-05,
"loss": 0.3592,
"step": 3315
},
{
"epoch": 5.77893820713664,
"grad_norm": 0.81640625,
"learning_rate": 9.010372785093167e-05,
"loss": 0.3521,
"step": 3320
},
{
"epoch": 5.787641427328111,
"grad_norm": 0.85546875,
"learning_rate": 8.980120234435849e-05,
"loss": 0.3605,
"step": 3325
},
{
"epoch": 5.796344647519582,
"grad_norm": 0.85546875,
"learning_rate": 8.949877113081521e-05,
"loss": 0.35,
"step": 3330
},
{
"epoch": 5.805047867711053,
"grad_norm": 0.859375,
"learning_rate": 8.919643700643103e-05,
"loss": 0.3483,
"step": 3335
},
{
"epoch": 5.813751087902524,
"grad_norm": 0.80078125,
"learning_rate": 8.889420276643746e-05,
"loss": 0.3505,
"step": 3340
},
{
"epoch": 5.822454308093995,
"grad_norm": 0.8515625,
"learning_rate": 8.859207120514255e-05,
"loss": 0.3468,
"step": 3345
},
{
"epoch": 5.831157528285465,
"grad_norm": 0.88671875,
"learning_rate": 8.829004511590501e-05,
"loss": 0.3539,
"step": 3350
},
{
"epoch": 5.839860748476936,
"grad_norm": 0.953125,
"learning_rate": 8.798812729110837e-05,
"loss": 0.3481,
"step": 3355
},
{
"epoch": 5.848563968668407,
"grad_norm": 0.87109375,
"learning_rate": 8.768632052213531e-05,
"loss": 0.3551,
"step": 3360
},
{
"epoch": 5.857267188859878,
"grad_norm": 0.828125,
"learning_rate": 8.738462759934168e-05,
"loss": 0.3509,
"step": 3365
},
{
"epoch": 5.865970409051349,
"grad_norm": 0.91796875,
"learning_rate": 8.708305131203072e-05,
"loss": 0.3551,
"step": 3370
},
{
"epoch": 5.87467362924282,
"grad_norm": 0.8671875,
"learning_rate": 8.678159444842737e-05,
"loss": 0.3469,
"step": 3375
},
{
"epoch": 5.883376849434291,
"grad_norm": 0.91015625,
"learning_rate": 8.648025979565245e-05,
"loss": 0.3544,
"step": 3380
},
{
"epoch": 5.892080069625761,
"grad_norm": 0.8046875,
"learning_rate": 8.617905013969688e-05,
"loss": 0.3476,
"step": 3385
},
{
"epoch": 5.900783289817232,
"grad_norm": 0.86328125,
"learning_rate": 8.587796826539585e-05,
"loss": 0.3531,
"step": 3390
},
{
"epoch": 5.909486510008703,
"grad_norm": 0.80859375,
"learning_rate": 8.557701695640321e-05,
"loss": 0.3401,
"step": 3395
},
{
"epoch": 5.918189730200174,
"grad_norm": 0.83984375,
"learning_rate": 8.527619899516567e-05,
"loss": 0.35,
"step": 3400
},
{
"epoch": 5.926892950391645,
"grad_norm": 0.84765625,
"learning_rate": 8.497551716289703e-05,
"loss": 0.3474,
"step": 3405
},
{
"epoch": 5.935596170583116,
"grad_norm": 0.8125,
"learning_rate": 8.467497423955249e-05,
"loss": 0.35,
"step": 3410
},
{
"epoch": 5.944299390774587,
"grad_norm": 0.82421875,
"learning_rate": 8.437457300380309e-05,
"loss": 0.3564,
"step": 3415
},
{
"epoch": 5.953002610966058,
"grad_norm": 0.8671875,
"learning_rate": 8.407431623300983e-05,
"loss": 0.3516,
"step": 3420
},
{
"epoch": 5.961705831157528,
"grad_norm": 0.9140625,
"learning_rate": 8.377420670319795e-05,
"loss": 0.356,
"step": 3425
},
{
"epoch": 5.970409051348999,
"grad_norm": 0.87109375,
"learning_rate": 8.347424718903151e-05,
"loss": 0.3538,
"step": 3430
},
{
"epoch": 5.97911227154047,
"grad_norm": 0.82421875,
"learning_rate": 8.317444046378757e-05,
"loss": 0.3491,
"step": 3435
},
{
"epoch": 5.987815491731941,
"grad_norm": 0.9296875,
"learning_rate": 8.28747892993306e-05,
"loss": 0.3559,
"step": 3440
},
{
"epoch": 5.996518711923412,
"grad_norm": 0.83984375,
"learning_rate": 8.257529646608672e-05,
"loss": 0.3504,
"step": 3445
},
{
"epoch": 6.0,
"eval_loss": 3.7196741104125977,
"eval_runtime": 0.7785,
"eval_samples_per_second": 7.707,
"eval_steps_per_second": 1.285,
"step": 3447
},
{
"epoch": 6.005221932114883,
"grad_norm": 0.62109375,
"learning_rate": 8.227596473301835e-05,
"loss": 0.2993,
"step": 3450
},
{
"epoch": 6.013925152306354,
"grad_norm": 0.859375,
"learning_rate": 8.19767968675983e-05,
"loss": 0.2552,
"step": 3455
},
{
"epoch": 6.022628372497824,
"grad_norm": 0.70703125,
"learning_rate": 8.167779563578456e-05,
"loss": 0.2635,
"step": 3460
},
{
"epoch": 6.031331592689295,
"grad_norm": 0.69921875,
"learning_rate": 8.13789638019942e-05,
"loss": 0.2613,
"step": 3465
},
{
"epoch": 6.040034812880766,
"grad_norm": 0.73046875,
"learning_rate": 8.108030412907844e-05,
"loss": 0.2631,
"step": 3470
},
{
"epoch": 6.048738033072237,
"grad_norm": 0.86328125,
"learning_rate": 8.078181937829656e-05,
"loss": 0.2646,
"step": 3475
},
{
"epoch": 6.057441253263708,
"grad_norm": 0.73046875,
"learning_rate": 8.048351230929074e-05,
"loss": 0.2621,
"step": 3480
},
{
"epoch": 6.066144473455179,
"grad_norm": 0.8125,
"learning_rate": 8.018538568006027e-05,
"loss": 0.267,
"step": 3485
},
{
"epoch": 6.07484769364665,
"grad_norm": 0.80859375,
"learning_rate": 7.988744224693625e-05,
"loss": 0.2599,
"step": 3490
},
{
"epoch": 6.0835509138381205,
"grad_norm": 0.796875,
"learning_rate": 7.958968476455608e-05,
"loss": 0.2643,
"step": 3495
},
{
"epoch": 6.092254134029591,
"grad_norm": 0.77734375,
"learning_rate": 7.929211598583794e-05,
"loss": 0.269,
"step": 3500
},
{
"epoch": 6.100957354221062,
"grad_norm": 0.828125,
"learning_rate": 7.899473866195526e-05,
"loss": 0.2622,
"step": 3505
},
{
"epoch": 6.1096605744125325,
"grad_norm": 0.859375,
"learning_rate": 7.869755554231145e-05,
"loss": 0.2633,
"step": 3510
},
{
"epoch": 6.1183637946040035,
"grad_norm": 0.76953125,
"learning_rate": 7.840056937451444e-05,
"loss": 0.2687,
"step": 3515
},
{
"epoch": 6.127067014795474,
"grad_norm": 0.83984375,
"learning_rate": 7.810378290435108e-05,
"loss": 0.2622,
"step": 3520
},
{
"epoch": 6.135770234986945,
"grad_norm": 0.87109375,
"learning_rate": 7.780719887576213e-05,
"loss": 0.2652,
"step": 3525
},
{
"epoch": 6.144473455178416,
"grad_norm": 2.03125,
"learning_rate": 7.751082003081653e-05,
"loss": 0.267,
"step": 3530
},
{
"epoch": 6.153176675369886,
"grad_norm": 0.78515625,
"learning_rate": 7.721464910968627e-05,
"loss": 0.2621,
"step": 3535
},
{
"epoch": 6.161879895561357,
"grad_norm": 0.85546875,
"learning_rate": 7.691868885062088e-05,
"loss": 0.2614,
"step": 3540
},
{
"epoch": 6.170583115752828,
"grad_norm": 0.79296875,
"learning_rate": 7.662294198992228e-05,
"loss": 0.264,
"step": 3545
},
{
"epoch": 6.179286335944299,
"grad_norm": 0.75390625,
"learning_rate": 7.632741126191947e-05,
"loss": 0.267,
"step": 3550
},
{
"epoch": 6.18798955613577,
"grad_norm": 0.74609375,
"learning_rate": 7.603209939894312e-05,
"loss": 0.2638,
"step": 3555
},
{
"epoch": 6.196692776327241,
"grad_norm": 0.828125,
"learning_rate": 7.573700913130035e-05,
"loss": 0.2614,
"step": 3560
},
{
"epoch": 6.205395996518712,
"grad_norm": 0.78125,
"learning_rate": 7.544214318724961e-05,
"loss": 0.2659,
"step": 3565
},
{
"epoch": 6.214099216710183,
"grad_norm": 0.7890625,
"learning_rate": 7.514750429297528e-05,
"loss": 0.2686,
"step": 3570
},
{
"epoch": 6.222802436901653,
"grad_norm": 0.80078125,
"learning_rate": 7.485309517256267e-05,
"loss": 0.268,
"step": 3575
},
{
"epoch": 6.231505657093124,
"grad_norm": 0.79296875,
"learning_rate": 7.455891854797256e-05,
"loss": 0.2652,
"step": 3580
},
{
"epoch": 6.240208877284595,
"grad_norm": 0.76953125,
"learning_rate": 7.426497713901629e-05,
"loss": 0.2638,
"step": 3585
},
{
"epoch": 6.248912097476066,
"grad_norm": 0.78515625,
"learning_rate": 7.397127366333048e-05,
"loss": 0.2649,
"step": 3590
},
{
"epoch": 6.257615317667537,
"grad_norm": 0.7890625,
"learning_rate": 7.3677810836352e-05,
"loss": 0.271,
"step": 3595
},
{
"epoch": 6.266318537859008,
"grad_norm": 0.80078125,
"learning_rate": 7.338459137129266e-05,
"loss": 0.2661,
"step": 3600
},
{
"epoch": 6.275021758050479,
"grad_norm": 0.7265625,
"learning_rate": 7.309161797911441e-05,
"loss": 0.2693,
"step": 3605
},
{
"epoch": 6.283724978241949,
"grad_norm": 0.8046875,
"learning_rate": 7.279889336850408e-05,
"loss": 0.2668,
"step": 3610
},
{
"epoch": 6.29242819843342,
"grad_norm": 0.81640625,
"learning_rate": 7.250642024584835e-05,
"loss": 0.2709,
"step": 3615
},
{
"epoch": 6.301131418624891,
"grad_norm": 0.72265625,
"learning_rate": 7.22142013152088e-05,
"loss": 0.2682,
"step": 3620
},
{
"epoch": 6.309834638816362,
"grad_norm": 0.7890625,
"learning_rate": 7.192223927829689e-05,
"loss": 0.264,
"step": 3625
},
{
"epoch": 6.318537859007833,
"grad_norm": 0.796875,
"learning_rate": 7.163053683444901e-05,
"loss": 0.2719,
"step": 3630
},
{
"epoch": 6.327241079199304,
"grad_norm": 0.87890625,
"learning_rate": 7.133909668060131e-05,
"loss": 0.2715,
"step": 3635
},
{
"epoch": 6.335944299390775,
"grad_norm": 0.8203125,
"learning_rate": 7.104792151126515e-05,
"loss": 0.263,
"step": 3640
},
{
"epoch": 6.344647519582246,
"grad_norm": 0.79296875,
"learning_rate": 7.075701401850183e-05,
"loss": 0.2629,
"step": 3645
},
{
"epoch": 6.353350739773716,
"grad_norm": 0.82421875,
"learning_rate": 7.046637689189794e-05,
"loss": 0.2674,
"step": 3650
},
{
"epoch": 6.362053959965187,
"grad_norm": 0.7890625,
"learning_rate": 7.017601281854027e-05,
"loss": 0.2684,
"step": 3655
},
{
"epoch": 6.370757180156658,
"grad_norm": 0.796875,
"learning_rate": 6.988592448299124e-05,
"loss": 0.2652,
"step": 3660
},
{
"epoch": 6.379460400348129,
"grad_norm": 0.828125,
"learning_rate": 6.959611456726387e-05,
"loss": 0.2642,
"step": 3665
},
{
"epoch": 6.3881636205396,
"grad_norm": 0.78515625,
"learning_rate": 6.930658575079705e-05,
"loss": 0.2696,
"step": 3670
},
{
"epoch": 6.396866840731071,
"grad_norm": 0.8125,
"learning_rate": 6.901734071043071e-05,
"loss": 0.27,
"step": 3675
},
{
"epoch": 6.405570060922542,
"grad_norm": 0.7734375,
"learning_rate": 6.872838212038122e-05,
"loss": 0.2699,
"step": 3680
},
{
"epoch": 6.414273281114012,
"grad_norm": 0.77734375,
"learning_rate": 6.843971265221655e-05,
"loss": 0.2687,
"step": 3685
},
{
"epoch": 6.422976501305483,
"grad_norm": 0.84375,
"learning_rate": 6.815133497483157e-05,
"loss": 0.2681,
"step": 3690
},
{
"epoch": 6.431679721496954,
"grad_norm": 0.8828125,
"learning_rate": 6.786325175442339e-05,
"loss": 0.2631,
"step": 3695
},
{
"epoch": 6.440382941688425,
"grad_norm": 0.77734375,
"learning_rate": 6.75754656544667e-05,
"loss": 0.2619,
"step": 3700
},
{
"epoch": 6.449086161879896,
"grad_norm": 0.83203125,
"learning_rate": 6.728797933568924e-05,
"loss": 0.2658,
"step": 3705
},
{
"epoch": 6.4577893820713665,
"grad_norm": 0.81640625,
"learning_rate": 6.700079545604708e-05,
"loss": 0.2696,
"step": 3710
},
{
"epoch": 6.4664926022628375,
"grad_norm": 0.7734375,
"learning_rate": 6.671391667070002e-05,
"loss": 0.2707,
"step": 3715
},
{
"epoch": 6.4751958224543085,
"grad_norm": 0.7734375,
"learning_rate": 6.642734563198723e-05,
"loss": 0.2653,
"step": 3720
},
{
"epoch": 6.4838990426457785,
"grad_norm": 0.8984375,
"learning_rate": 6.614108498940252e-05,
"loss": 0.2721,
"step": 3725
},
{
"epoch": 6.4926022628372495,
"grad_norm": 0.8046875,
"learning_rate": 6.585513738956996e-05,
"loss": 0.2674,
"step": 3730
},
{
"epoch": 6.5013054830287205,
"grad_norm": 0.75,
"learning_rate": 6.556950547621936e-05,
"loss": 0.2689,
"step": 3735
},
{
"epoch": 6.510008703220191,
"grad_norm": 0.796875,
"learning_rate": 6.52841918901619e-05,
"loss": 0.2695,
"step": 3740
},
{
"epoch": 6.518711923411662,
"grad_norm": 0.859375,
"learning_rate": 6.499919926926566e-05,
"loss": 0.269,
"step": 3745
},
{
"epoch": 6.527415143603133,
"grad_norm": 0.78515625,
"learning_rate": 6.471453024843113e-05,
"loss": 0.2655,
"step": 3750
},
{
"epoch": 6.536118363794604,
"grad_norm": 0.78515625,
"learning_rate": 6.44301874595671e-05,
"loss": 0.265,
"step": 3755
},
{
"epoch": 6.544821583986074,
"grad_norm": 0.7734375,
"learning_rate": 6.414617353156605e-05,
"loss": 0.2627,
"step": 3760
},
{
"epoch": 6.553524804177545,
"grad_norm": 0.79296875,
"learning_rate": 6.386249109028013e-05,
"loss": 0.2724,
"step": 3765
},
{
"epoch": 6.562228024369016,
"grad_norm": 0.796875,
"learning_rate": 6.357914275849652e-05,
"loss": 0.2693,
"step": 3770
},
{
"epoch": 6.570931244560487,
"grad_norm": 0.80859375,
"learning_rate": 6.329613115591359e-05,
"loss": 0.273,
"step": 3775
},
{
"epoch": 6.579634464751958,
"grad_norm": 0.73828125,
"learning_rate": 6.301345889911637e-05,
"loss": 0.2665,
"step": 3780
},
{
"epoch": 6.588337684943429,
"grad_norm": 0.76953125,
"learning_rate": 6.273112860155251e-05,
"loss": 0.2676,
"step": 3785
},
{
"epoch": 6.5970409051349,
"grad_norm": 0.796875,
"learning_rate": 6.2449142873508e-05,
"loss": 0.2659,
"step": 3790
},
{
"epoch": 6.605744125326371,
"grad_norm": 0.83984375,
"learning_rate": 6.21675043220832e-05,
"loss": 0.2691,
"step": 3795
},
{
"epoch": 6.614447345517842,
"grad_norm": 0.89453125,
"learning_rate": 6.188621555116865e-05,
"loss": 0.273,
"step": 3800
},
{
"epoch": 6.623150565709312,
"grad_norm": 0.8203125,
"learning_rate": 6.160527916142093e-05,
"loss": 0.2637,
"step": 3805
},
{
"epoch": 6.631853785900783,
"grad_norm": 0.80078125,
"learning_rate": 6.132469775023867e-05,
"loss": 0.2665,
"step": 3810
},
{
"epoch": 6.640557006092254,
"grad_norm": 0.78515625,
"learning_rate": 6.104447391173858e-05,
"loss": 0.2675,
"step": 3815
},
{
"epoch": 6.649260226283725,
"grad_norm": 0.80078125,
"learning_rate": 6.0764610236731524e-05,
"loss": 0.2696,
"step": 3820
},
{
"epoch": 6.657963446475196,
"grad_norm": 0.81640625,
"learning_rate": 6.048510931269824e-05,
"loss": 0.2654,
"step": 3825
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.83203125,
"learning_rate": 6.020597372376589e-05,
"loss": 0.2746,
"step": 3830
},
{
"epoch": 6.675369886858137,
"grad_norm": 0.84765625,
"learning_rate": 5.992720605068378e-05,
"loss": 0.2731,
"step": 3835
},
{
"epoch": 6.684073107049608,
"grad_norm": 0.9140625,
"learning_rate": 5.964880887079972e-05,
"loss": 0.2694,
"step": 3840
},
{
"epoch": 6.692776327241079,
"grad_norm": 0.8984375,
"learning_rate": 5.937078475803607e-05,
"loss": 0.2718,
"step": 3845
},
{
"epoch": 6.70147954743255,
"grad_norm": 0.80078125,
"learning_rate": 5.909313628286601e-05,
"loss": 0.2679,
"step": 3850
},
{
"epoch": 6.710182767624021,
"grad_norm": 0.7578125,
"learning_rate": 5.881586601228983e-05,
"loss": 0.2644,
"step": 3855
},
{
"epoch": 6.718885987815492,
"grad_norm": 0.81640625,
"learning_rate": 5.853897650981107e-05,
"loss": 0.2712,
"step": 3860
},
{
"epoch": 6.727589208006963,
"grad_norm": 0.8203125,
"learning_rate": 5.8262470335412834e-05,
"loss": 0.2645,
"step": 3865
},
{
"epoch": 6.736292428198434,
"grad_norm": 0.7890625,
"learning_rate": 5.798635004553421e-05,
"loss": 0.2668,
"step": 3870
},
{
"epoch": 6.744995648389905,
"grad_norm": 0.82421875,
"learning_rate": 5.771061819304664e-05,
"loss": 0.2735,
"step": 3875
},
{
"epoch": 6.753698868581375,
"grad_norm": 0.8046875,
"learning_rate": 5.7435277327230206e-05,
"loss": 0.2721,
"step": 3880
},
{
"epoch": 6.762402088772846,
"grad_norm": 0.84375,
"learning_rate": 5.716032999375006e-05,
"loss": 0.2654,
"step": 3885
},
{
"epoch": 6.771105308964317,
"grad_norm": 0.84765625,
"learning_rate": 5.6885778734633074e-05,
"loss": 0.2701,
"step": 3890
},
{
"epoch": 6.779808529155788,
"grad_norm": 0.77734375,
"learning_rate": 5.6611626088244194e-05,
"loss": 0.2684,
"step": 3895
},
{
"epoch": 6.788511749347259,
"grad_norm": 0.80859375,
"learning_rate": 5.6337874589262915e-05,
"loss": 0.2686,
"step": 3900
},
{
"epoch": 6.79721496953873,
"grad_norm": 0.83984375,
"learning_rate": 5.606452676865993e-05,
"loss": 0.2666,
"step": 3905
},
{
"epoch": 6.8059181897302,
"grad_norm": 0.79296875,
"learning_rate": 5.5791585153673774e-05,
"loss": 0.2687,
"step": 3910
},
{
"epoch": 6.814621409921671,
"grad_norm": 0.73828125,
"learning_rate": 5.5519052267787444e-05,
"loss": 0.2667,
"step": 3915
},
{
"epoch": 6.823324630113142,
"grad_norm": 0.78515625,
"learning_rate": 5.524693063070492e-05,
"loss": 0.2689,
"step": 3920
},
{
"epoch": 6.832027850304613,
"grad_norm": 0.80078125,
"learning_rate": 5.497522275832799e-05,
"loss": 0.2666,
"step": 3925
},
{
"epoch": 6.8407310704960835,
"grad_norm": 0.79296875,
"learning_rate": 5.4703931162733116e-05,
"loss": 0.265,
"step": 3930
},
{
"epoch": 6.8494342906875545,
"grad_norm": 0.7734375,
"learning_rate": 5.4433058352147914e-05,
"loss": 0.2667,
"step": 3935
},
{
"epoch": 6.8581375108790255,
"grad_norm": 0.80078125,
"learning_rate": 5.416260683092814e-05,
"loss": 0.2629,
"step": 3940
},
{
"epoch": 6.866840731070496,
"grad_norm": 0.8203125,
"learning_rate": 5.389257909953462e-05,
"loss": 0.2712,
"step": 3945
},
{
"epoch": 6.875543951261967,
"grad_norm": 0.765625,
"learning_rate": 5.362297765450999e-05,
"loss": 0.2671,
"step": 3950
},
{
"epoch": 6.8842471714534375,
"grad_norm": 0.78125,
"learning_rate": 5.335380498845559e-05,
"loss": 0.261,
"step": 3955
},
{
"epoch": 6.892950391644908,
"grad_norm": 0.83203125,
"learning_rate": 5.308506359000851e-05,
"loss": 0.2663,
"step": 3960
},
{
"epoch": 6.901653611836379,
"grad_norm": 0.75,
"learning_rate": 5.281675594381859e-05,
"loss": 0.2673,
"step": 3965
},
{
"epoch": 6.91035683202785,
"grad_norm": 0.8125,
"learning_rate": 5.25488845305254e-05,
"loss": 0.2691,
"step": 3970
},
{
"epoch": 6.919060052219321,
"grad_norm": 0.80859375,
"learning_rate": 5.228145182673532e-05,
"loss": 0.2725,
"step": 3975
},
{
"epoch": 6.927763272410792,
"grad_norm": 0.84765625,
"learning_rate": 5.2014460304998545e-05,
"loss": 0.2653,
"step": 3980
},
{
"epoch": 6.936466492602263,
"grad_norm": 0.796875,
"learning_rate": 5.1747912433786497e-05,
"loss": 0.2661,
"step": 3985
},
{
"epoch": 6.945169712793733,
"grad_norm": 0.8203125,
"learning_rate": 5.148181067746862e-05,
"loss": 0.2707,
"step": 3990
},
{
"epoch": 6.953872932985204,
"grad_norm": 0.8125,
"learning_rate": 5.121615749629003e-05,
"loss": 0.267,
"step": 3995
},
{
"epoch": 6.962576153176675,
"grad_norm": 0.796875,
"learning_rate": 5.0950955346348314e-05,
"loss": 0.2662,
"step": 4000
},
{
"epoch": 6.971279373368146,
"grad_norm": 0.8359375,
"learning_rate": 5.068620667957123e-05,
"loss": 0.2695,
"step": 4005
},
{
"epoch": 6.979982593559617,
"grad_norm": 0.859375,
"learning_rate": 5.042191394369371e-05,
"loss": 0.266,
"step": 4010
},
{
"epoch": 6.988685813751088,
"grad_norm": 0.83203125,
"learning_rate": 5.01580795822355e-05,
"loss": 0.2737,
"step": 4015
},
{
"epoch": 6.997389033942559,
"grad_norm": 0.7734375,
"learning_rate": 4.989470603447835e-05,
"loss": 0.2672,
"step": 4020
},
{
"epoch": 6.999129677980853,
"eval_loss": 4.307767391204834,
"eval_runtime": 1.1109,
"eval_samples_per_second": 5.401,
"eval_steps_per_second": 0.9,
"step": 4021
},
{
"epoch": 7.00609225413403,
"grad_norm": 0.57421875,
"learning_rate": 4.963179573544357e-05,
"loss": 0.2314,
"step": 4025
},
{
"epoch": 7.0147954743255,
"grad_norm": 0.6796875,
"learning_rate": 4.9369351115869535e-05,
"loss": 0.2146,
"step": 4030
},
{
"epoch": 7.023498694516971,
"grad_norm": 0.75390625,
"learning_rate": 4.9107374602189216e-05,
"loss": 0.2171,
"step": 4035
},
{
"epoch": 7.032201914708442,
"grad_norm": 0.73828125,
"learning_rate": 4.8845868616507617e-05,
"loss": 0.2179,
"step": 4040
},
{
"epoch": 7.040905134899913,
"grad_norm": 0.69921875,
"learning_rate": 4.8584835576579466e-05,
"loss": 0.2184,
"step": 4045
},
{
"epoch": 7.049608355091384,
"grad_norm": 0.734375,
"learning_rate": 4.832427789578701e-05,
"loss": 0.2178,
"step": 4050
},
{
"epoch": 7.058311575282855,
"grad_norm": 0.73828125,
"learning_rate": 4.806419798311739e-05,
"loss": 0.214,
"step": 4055
},
{
"epoch": 7.067014795474326,
"grad_norm": 0.703125,
"learning_rate": 4.7804598243140666e-05,
"loss": 0.2176,
"step": 4060
},
{
"epoch": 7.075718015665796,
"grad_norm": 0.73046875,
"learning_rate": 4.754548107598736e-05,
"loss": 0.2158,
"step": 4065
},
{
"epoch": 7.084421235857267,
"grad_norm": 0.71484375,
"learning_rate": 4.728684887732649e-05,
"loss": 0.2175,
"step": 4070
},
{
"epoch": 7.093124456048738,
"grad_norm": 0.70703125,
"learning_rate": 4.702870403834317e-05,
"loss": 0.2162,
"step": 4075
},
{
"epoch": 7.101827676240209,
"grad_norm": 0.75,
"learning_rate": 4.6771048945716664e-05,
"loss": 0.2189,
"step": 4080
},
{
"epoch": 7.11053089643168,
"grad_norm": 0.7421875,
"learning_rate": 4.65138859815983e-05,
"loss": 0.2187,
"step": 4085
},
{
"epoch": 7.119234116623151,
"grad_norm": 0.734375,
"learning_rate": 4.62572175235895e-05,
"loss": 0.2207,
"step": 4090
},
{
"epoch": 7.127937336814622,
"grad_norm": 0.7109375,
"learning_rate": 4.60010459447196e-05,
"loss": 0.2111,
"step": 4095
},
{
"epoch": 7.136640557006093,
"grad_norm": 0.7265625,
"learning_rate": 4.574537361342407e-05,
"loss": 0.2194,
"step": 4100
},
{
"epoch": 7.145343777197563,
"grad_norm": 0.6796875,
"learning_rate": 4.5490202893522614e-05,
"loss": 0.2172,
"step": 4105
},
{
"epoch": 7.154046997389034,
"grad_norm": 0.765625,
"learning_rate": 4.5235536144197353e-05,
"loss": 0.2194,
"step": 4110
},
{
"epoch": 7.162750217580505,
"grad_norm": 0.74609375,
"learning_rate": 4.498137571997081e-05,
"loss": 0.2166,
"step": 4115
},
{
"epoch": 7.171453437771976,
"grad_norm": 0.80859375,
"learning_rate": 4.472772397068431e-05,
"loss": 0.2176,
"step": 4120
},
{
"epoch": 7.180156657963447,
"grad_norm": 0.71484375,
"learning_rate": 4.447458324147629e-05,
"loss": 0.225,
"step": 4125
},
{
"epoch": 7.188859878154918,
"grad_norm": 0.73046875,
"learning_rate": 4.422195587276058e-05,
"loss": 0.217,
"step": 4130
},
{
"epoch": 7.1975630983463885,
"grad_norm": 0.75,
"learning_rate": 4.396984420020451e-05,
"loss": 0.2182,
"step": 4135
},
{
"epoch": 7.206266318537859,
"grad_norm": 0.7265625,
"learning_rate": 4.3718250554707784e-05,
"loss": 0.2171,
"step": 4140
},
{
"epoch": 7.21496953872933,
"grad_norm": 0.71875,
"learning_rate": 4.34671772623806e-05,
"loss": 0.2155,
"step": 4145
},
{
"epoch": 7.2236727589208005,
"grad_norm": 0.71484375,
"learning_rate": 4.321662664452221e-05,
"loss": 0.217,
"step": 4150
},
{
"epoch": 7.2323759791122715,
"grad_norm": 0.734375,
"learning_rate": 4.296660101759942e-05,
"loss": 0.2158,
"step": 4155
},
{
"epoch": 7.241079199303742,
"grad_norm": 0.74609375,
"learning_rate": 4.271710269322536e-05,
"loss": 0.2191,
"step": 4160
},
{
"epoch": 7.249782419495213,
"grad_norm": 0.71484375,
"learning_rate": 4.2468133978137945e-05,
"loss": 0.2119,
"step": 4165
},
{
"epoch": 7.258485639686684,
"grad_norm": 0.73046875,
"learning_rate": 4.221969717417852e-05,
"loss": 0.2125,
"step": 4170
},
{
"epoch": 7.267188859878155,
"grad_norm": 0.734375,
"learning_rate": 4.1971794578270654e-05,
"loss": 0.2176,
"step": 4175
},
{
"epoch": 7.275892080069625,
"grad_norm": 0.73828125,
"learning_rate": 4.1724428482398945e-05,
"loss": 0.2171,
"step": 4180
},
{
"epoch": 7.284595300261096,
"grad_norm": 0.71875,
"learning_rate": 4.1477601173587836e-05,
"loss": 0.2168,
"step": 4185
},
{
"epoch": 7.293298520452567,
"grad_norm": 0.73046875,
"learning_rate": 4.1231314933880175e-05,
"loss": 0.2171,
"step": 4190
},
{
"epoch": 7.302001740644038,
"grad_norm": 0.734375,
"learning_rate": 4.098557204031658e-05,
"loss": 0.217,
"step": 4195
},
{
"epoch": 7.310704960835509,
"grad_norm": 0.72265625,
"learning_rate": 4.0740374764914136e-05,
"loss": 0.2184,
"step": 4200
},
{
"epoch": 7.31940818102698,
"grad_norm": 0.7734375,
"learning_rate": 4.049572537464531e-05,
"loss": 0.2126,
"step": 4205
},
{
"epoch": 7.328111401218451,
"grad_norm": 0.7734375,
"learning_rate": 4.025162613141713e-05,
"loss": 0.2173,
"step": 4210
},
{
"epoch": 7.336814621409921,
"grad_norm": 0.78125,
"learning_rate": 4.000807929205027e-05,
"loss": 0.2113,
"step": 4215
},
{
"epoch": 7.345517841601392,
"grad_norm": 0.73046875,
"learning_rate": 3.9765087108258204e-05,
"loss": 0.2215,
"step": 4220
},
{
"epoch": 7.354221061792863,
"grad_norm": 0.75,
"learning_rate": 3.95226518266262e-05,
"loss": 0.2204,
"step": 4225
},
{
"epoch": 7.362924281984334,
"grad_norm": 0.73828125,
"learning_rate": 3.9280775688590735e-05,
"loss": 0.2169,
"step": 4230
},
{
"epoch": 7.371627502175805,
"grad_norm": 0.71875,
"learning_rate": 3.903946093041877e-05,
"loss": 0.2188,
"step": 4235
},
{
"epoch": 7.380330722367276,
"grad_norm": 0.7421875,
"learning_rate": 3.8798709783187036e-05,
"loss": 0.2162,
"step": 4240
},
{
"epoch": 7.389033942558747,
"grad_norm": 0.69921875,
"learning_rate": 3.85585244727613e-05,
"loss": 0.2163,
"step": 4245
},
{
"epoch": 7.397737162750218,
"grad_norm": 0.7109375,
"learning_rate": 3.8318907219775935e-05,
"loss": 0.2179,
"step": 4250
},
{
"epoch": 7.406440382941688,
"grad_norm": 0.76171875,
"learning_rate": 3.8079860239613395e-05,
"loss": 0.2197,
"step": 4255
},
{
"epoch": 7.415143603133159,
"grad_norm": 0.703125,
"learning_rate": 3.784138574238357e-05,
"loss": 0.2177,
"step": 4260
},
{
"epoch": 7.42384682332463,
"grad_norm": 0.71875,
"learning_rate": 3.760348593290348e-05,
"loss": 0.2188,
"step": 4265
},
{
"epoch": 7.432550043516101,
"grad_norm": 0.69921875,
"learning_rate": 3.736616301067694e-05,
"loss": 0.2187,
"step": 4270
},
{
"epoch": 7.441253263707572,
"grad_norm": 0.73046875,
"learning_rate": 3.7129419169874114e-05,
"loss": 0.221,
"step": 4275
},
{
"epoch": 7.449956483899043,
"grad_norm": 0.80078125,
"learning_rate": 3.689325659931123e-05,
"loss": 0.2236,
"step": 4280
},
{
"epoch": 7.458659704090514,
"grad_norm": 0.73828125,
"learning_rate": 3.6657677482430377e-05,
"loss": 0.2188,
"step": 4285
},
{
"epoch": 7.467362924281984,
"grad_norm": 0.76171875,
"learning_rate": 3.642268399727941e-05,
"loss": 0.2165,
"step": 4290
},
{
"epoch": 7.476066144473455,
"grad_norm": 0.74609375,
"learning_rate": 3.618827831649158e-05,
"loss": 0.2183,
"step": 4295
},
{
"epoch": 7.484769364664926,
"grad_norm": 0.71875,
"learning_rate": 3.595446260726576e-05,
"loss": 0.2117,
"step": 4300
},
{
"epoch": 7.493472584856397,
"grad_norm": 0.73046875,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.2167,
"step": 4305
},
{
"epoch": 7.502175805047868,
"grad_norm": 0.69140625,
"learning_rate": 3.5488609745002214e-05,
"loss": 0.219,
"step": 4310
},
{
"epoch": 7.510879025239339,
"grad_norm": 0.7109375,
"learning_rate": 3.525657689900923e-05,
"loss": 0.2145,
"step": 4315
},
{
"epoch": 7.51958224543081,
"grad_norm": 0.76171875,
"learning_rate": 3.502514263862793e-05,
"loss": 0.2159,
"step": 4320
},
{
"epoch": 7.528285465622281,
"grad_norm": 0.76953125,
"learning_rate": 3.479430910358474e-05,
"loss": 0.2177,
"step": 4325
},
{
"epoch": 7.536988685813752,
"grad_norm": 0.734375,
"learning_rate": 3.456407842805223e-05,
"loss": 0.2154,
"step": 4330
},
{
"epoch": 7.545691906005222,
"grad_norm": 0.73828125,
"learning_rate": 3.433445274062907e-05,
"loss": 0.2157,
"step": 4335
},
{
"epoch": 7.554395126196693,
"grad_norm": 0.71875,
"learning_rate": 3.410543416432069e-05,
"loss": 0.2122,
"step": 4340
},
{
"epoch": 7.563098346388164,
"grad_norm": 0.71875,
"learning_rate": 3.387702481651931e-05,
"loss": 0.2215,
"step": 4345
},
{
"epoch": 7.5718015665796345,
"grad_norm": 0.75,
"learning_rate": 3.364922680898458e-05,
"loss": 0.2192,
"step": 4350
},
{
"epoch": 7.5805047867711055,
"grad_norm": 0.7265625,
"learning_rate": 3.342204224782406e-05,
"loss": 0.2168,
"step": 4355
},
{
"epoch": 7.5892080069625765,
"grad_norm": 0.72265625,
"learning_rate": 3.3195473233473584e-05,
"loss": 0.2163,
"step": 4360
},
{
"epoch": 7.5979112271540465,
"grad_norm": 0.69921875,
"learning_rate": 3.2969521860678066e-05,
"loss": 0.2162,
"step": 4365
},
{
"epoch": 7.6066144473455175,
"grad_norm": 0.73046875,
"learning_rate": 3.2744190218471884e-05,
"loss": 0.2178,
"step": 4370
},
{
"epoch": 7.6153176675369885,
"grad_norm": 0.765625,
"learning_rate": 3.2519480390159806e-05,
"loss": 0.218,
"step": 4375
},
{
"epoch": 7.624020887728459,
"grad_norm": 0.8203125,
"learning_rate": 3.229539445329752e-05,
"loss": 0.216,
"step": 4380
},
{
"epoch": 7.63272410791993,
"grad_norm": 0.7265625,
"learning_rate": 3.207193447967264e-05,
"loss": 0.2207,
"step": 4385
},
{
"epoch": 7.641427328111401,
"grad_norm": 0.75390625,
"learning_rate": 3.184910253528528e-05,
"loss": 0.217,
"step": 4390
},
{
"epoch": 7.650130548302872,
"grad_norm": 0.74609375,
"learning_rate": 3.162690068032926e-05,
"loss": 0.2183,
"step": 4395
},
{
"epoch": 7.658833768494343,
"grad_norm": 0.7578125,
"learning_rate": 3.140533096917282e-05,
"loss": 0.2197,
"step": 4400
},
{
"epoch": 7.667536988685814,
"grad_norm": 0.75,
"learning_rate": 3.118439545033969e-05,
"loss": 0.2204,
"step": 4405
},
{
"epoch": 7.676240208877284,
"grad_norm": 0.73828125,
"learning_rate": 3.096409616649023e-05,
"loss": 0.2194,
"step": 4410
},
{
"epoch": 7.684943429068755,
"grad_norm": 0.7421875,
"learning_rate": 3.074443515440252e-05,
"loss": 0.2211,
"step": 4415
},
{
"epoch": 7.693646649260226,
"grad_norm": 0.8046875,
"learning_rate": 3.0525414444953396e-05,
"loss": 0.219,
"step": 4420
},
{
"epoch": 7.702349869451697,
"grad_norm": 0.75390625,
"learning_rate": 3.0307036063099782e-05,
"loss": 0.2131,
"step": 4425
},
{
"epoch": 7.711053089643168,
"grad_norm": 0.76171875,
"learning_rate": 3.0089302027860044e-05,
"loss": 0.2141,
"step": 4430
},
{
"epoch": 7.719756309834639,
"grad_norm": 0.7890625,
"learning_rate": 2.9872214352295213e-05,
"loss": 0.2192,
"step": 4435
},
{
"epoch": 7.728459530026109,
"grad_norm": 0.73828125,
"learning_rate": 2.965577504349035e-05,
"loss": 0.2214,
"step": 4440
},
{
"epoch": 7.73716275021758,
"grad_norm": 0.6953125,
"learning_rate": 2.9439986102536043e-05,
"loss": 0.2188,
"step": 4445
},
{
"epoch": 7.745865970409051,
"grad_norm": 0.71875,
"learning_rate": 2.9224849524509936e-05,
"loss": 0.2155,
"step": 4450
},
{
"epoch": 7.754569190600522,
"grad_norm": 0.74609375,
"learning_rate": 2.901036729845831e-05,
"loss": 0.2156,
"step": 4455
},
{
"epoch": 7.763272410791993,
"grad_norm": 0.75,
"learning_rate": 2.879654140737743e-05,
"loss": 0.2161,
"step": 4460
},
{
"epoch": 7.771975630983464,
"grad_norm": 0.80859375,
"learning_rate": 2.8583373828195603e-05,
"loss": 0.2185,
"step": 4465
},
{
"epoch": 7.780678851174935,
"grad_norm": 0.69921875,
"learning_rate": 2.837086653175468e-05,
"loss": 0.2226,
"step": 4470
},
{
"epoch": 7.789382071366406,
"grad_norm": 0.72265625,
"learning_rate": 2.8159021482791802e-05,
"loss": 0.2171,
"step": 4475
},
{
"epoch": 7.798085291557877,
"grad_norm": 0.73828125,
"learning_rate": 2.794784063992131e-05,
"loss": 0.2204,
"step": 4480
},
{
"epoch": 7.806788511749347,
"grad_norm": 0.69921875,
"learning_rate": 2.7737325955616643e-05,
"loss": 0.215,
"step": 4485
},
{
"epoch": 7.815491731940818,
"grad_norm": 0.765625,
"learning_rate": 2.7527479376192366e-05,
"loss": 0.2161,
"step": 4490
},
{
"epoch": 7.824194952132289,
"grad_norm": 0.75,
"learning_rate": 2.7318302841785827e-05,
"loss": 0.2187,
"step": 4495
},
{
"epoch": 7.83289817232376,
"grad_norm": 0.76171875,
"learning_rate": 2.7109798286339705e-05,
"loss": 0.2214,
"step": 4500
},
{
"epoch": 7.841601392515231,
"grad_norm": 0.80078125,
"learning_rate": 2.6901967637583835e-05,
"loss": 0.2142,
"step": 4505
},
{
"epoch": 7.850304612706702,
"grad_norm": 0.69921875,
"learning_rate": 2.669481281701739e-05,
"loss": 0.2194,
"step": 4510
},
{
"epoch": 7.859007832898172,
"grad_norm": 0.71875,
"learning_rate": 2.6488335739891178e-05,
"loss": 0.2228,
"step": 4515
},
{
"epoch": 7.867711053089643,
"grad_norm": 0.75390625,
"learning_rate": 2.6282538315189974e-05,
"loss": 0.2196,
"step": 4520
},
{
"epoch": 7.876414273281114,
"grad_norm": 0.7734375,
"learning_rate": 2.607742244561484e-05,
"loss": 0.2225,
"step": 4525
},
{
"epoch": 7.885117493472585,
"grad_norm": 0.75390625,
"learning_rate": 2.5872990027565434e-05,
"loss": 0.2163,
"step": 4530
},
{
"epoch": 7.893820713664056,
"grad_norm": 0.6796875,
"learning_rate": 2.5669242951122586e-05,
"loss": 0.2155,
"step": 4535
},
{
"epoch": 7.902523933855527,
"grad_norm": 0.7578125,
"learning_rate": 2.5466183100030837e-05,
"loss": 0.2167,
"step": 4540
},
{
"epoch": 7.911227154046998,
"grad_norm": 0.75390625,
"learning_rate": 2.5263812351680995e-05,
"loss": 0.2184,
"step": 4545
},
{
"epoch": 7.919930374238469,
"grad_norm": 0.70703125,
"learning_rate": 2.50621325770927e-05,
"loss": 0.2132,
"step": 4550
},
{
"epoch": 7.9286335944299395,
"grad_norm": 0.75,
"learning_rate": 2.4861145640897188e-05,
"loss": 0.2144,
"step": 4555
},
{
"epoch": 7.93733681462141,
"grad_norm": 0.7109375,
"learning_rate": 2.466085340132014e-05,
"loss": 0.2171,
"step": 4560
},
{
"epoch": 7.946040034812881,
"grad_norm": 0.73046875,
"learning_rate": 2.446125771016433e-05,
"loss": 0.2167,
"step": 4565
},
{
"epoch": 7.9547432550043515,
"grad_norm": 0.71484375,
"learning_rate": 2.426236041279266e-05,
"loss": 0.2196,
"step": 4570
},
{
"epoch": 7.9634464751958225,
"grad_norm": 0.7265625,
"learning_rate": 2.4064163348110956e-05,
"loss": 0.2196,
"step": 4575
},
{
"epoch": 7.9721496953872935,
"grad_norm": 0.734375,
"learning_rate": 2.3866668348551112e-05,
"loss": 0.212,
"step": 4580
},
{
"epoch": 7.980852915578764,
"grad_norm": 0.7109375,
"learning_rate": 2.366987724005404e-05,
"loss": 0.2119,
"step": 4585
},
{
"epoch": 7.9895561357702345,
"grad_norm": 0.7109375,
"learning_rate": 2.3473791842052774e-05,
"loss": 0.2194,
"step": 4590
},
{
"epoch": 7.9982593559617055,
"grad_norm": 0.7578125,
"learning_rate": 2.327841396745578e-05,
"loss": 0.2167,
"step": 4595
},
{
"epoch": 8.0,
"eval_loss": 4.845585823059082,
"eval_runtime": 0.7795,
"eval_samples_per_second": 7.697,
"eval_steps_per_second": 1.283,
"step": 4596
},
{
"epoch": 8.006962576153176,
"grad_norm": 0.62109375,
"learning_rate": 2.3083745422630122e-05,
"loss": 0.2056,
"step": 4600
},
{
"epoch": 8.015665796344647,
"grad_norm": 0.6328125,
"learning_rate": 2.2889788007384683e-05,
"loss": 0.1978,
"step": 4605
},
{
"epoch": 8.024369016536118,
"grad_norm": 0.63671875,
"learning_rate": 2.2696543514953595e-05,
"loss": 0.2014,
"step": 4610
},
{
"epoch": 8.03307223672759,
"grad_norm": 0.62890625,
"learning_rate": 2.2504013731979732e-05,
"loss": 0.1991,
"step": 4615
},
{
"epoch": 8.04177545691906,
"grad_norm": 0.6640625,
"learning_rate": 2.2312200438498043e-05,
"loss": 0.2006,
"step": 4620
},
{
"epoch": 8.050478677110531,
"grad_norm": 0.66796875,
"learning_rate": 2.212110540791924e-05,
"loss": 0.2018,
"step": 4625
},
{
"epoch": 8.059181897302002,
"grad_norm": 0.66796875,
"learning_rate": 2.1930730407013245e-05,
"loss": 0.1963,
"step": 4630
},
{
"epoch": 8.067885117493473,
"grad_norm": 0.68359375,
"learning_rate": 2.1741077195893043e-05,
"loss": 0.1995,
"step": 4635
},
{
"epoch": 8.076588337684944,
"grad_norm": 0.66796875,
"learning_rate": 2.1552147527998213e-05,
"loss": 0.1984,
"step": 4640
},
{
"epoch": 8.085291557876415,
"grad_norm": 0.69921875,
"learning_rate": 2.136394315007889e-05,
"loss": 0.2005,
"step": 4645
},
{
"epoch": 8.093994778067884,
"grad_norm": 0.69140625,
"learning_rate": 2.1176465802179467e-05,
"loss": 0.203,
"step": 4650
},
{
"epoch": 8.102697998259355,
"grad_norm": 0.69140625,
"learning_rate": 2.0989717217622652e-05,
"loss": 0.1967,
"step": 4655
},
{
"epoch": 8.111401218450826,
"grad_norm": 0.8125,
"learning_rate": 2.0803699122993293e-05,
"loss": 0.2029,
"step": 4660
},
{
"epoch": 8.120104438642297,
"grad_norm": 0.69140625,
"learning_rate": 2.061841323812257e-05,
"loss": 0.2005,
"step": 4665
},
{
"epoch": 8.128807658833768,
"grad_norm": 0.6484375,
"learning_rate": 2.0433861276071942e-05,
"loss": 0.1966,
"step": 4670
},
{
"epoch": 8.137510879025239,
"grad_norm": 0.71484375,
"learning_rate": 2.0250044943117385e-05,
"loss": 0.2023,
"step": 4675
},
{
"epoch": 8.14621409921671,
"grad_norm": 0.66796875,
"learning_rate": 2.0066965938733707e-05,
"loss": 0.198,
"step": 4680
},
{
"epoch": 8.154917319408181,
"grad_norm": 0.8125,
"learning_rate": 1.9884625955578594e-05,
"loss": 0.196,
"step": 4685
},
{
"epoch": 8.163620539599652,
"grad_norm": 0.66796875,
"learning_rate": 1.9703026679477256e-05,
"loss": 0.1954,
"step": 4690
},
{
"epoch": 8.172323759791123,
"grad_norm": 0.640625,
"learning_rate": 1.9522169789406575e-05,
"loss": 0.196,
"step": 4695
},
{
"epoch": 8.181026979982594,
"grad_norm": 0.71875,
"learning_rate": 1.934205695747978e-05,
"loss": 0.2014,
"step": 4700
},
{
"epoch": 8.189730200174065,
"grad_norm": 0.71484375,
"learning_rate": 1.916268984893086e-05,
"loss": 0.1984,
"step": 4705
},
{
"epoch": 8.198433420365536,
"grad_norm": 0.6953125,
"learning_rate": 1.8984070122099218e-05,
"loss": 0.1994,
"step": 4710
},
{
"epoch": 8.207136640557007,
"grad_norm": 0.71484375,
"learning_rate": 1.880619942841435e-05,
"loss": 0.2002,
"step": 4715
},
{
"epoch": 8.215839860748478,
"grad_norm": 0.7578125,
"learning_rate": 1.862907941238059e-05,
"loss": 0.197,
"step": 4720
},
{
"epoch": 8.224543080939949,
"grad_norm": 0.8125,
"learning_rate": 1.8452711711561842e-05,
"loss": 0.2023,
"step": 4725
},
{
"epoch": 8.233246301131418,
"grad_norm": 0.6796875,
"learning_rate": 1.8277097956566437e-05,
"loss": 0.201,
"step": 4730
},
{
"epoch": 8.241949521322889,
"grad_norm": 0.7265625,
"learning_rate": 1.810223977103217e-05,
"loss": 0.1982,
"step": 4735
},
{
"epoch": 8.25065274151436,
"grad_norm": 0.7109375,
"learning_rate": 1.7928138771611225e-05,
"loss": 0.1983,
"step": 4740
},
{
"epoch": 8.25935596170583,
"grad_norm": 0.671875,
"learning_rate": 1.7754796567955155e-05,
"loss": 0.2005,
"step": 4745
},
{
"epoch": 8.268059181897302,
"grad_norm": 0.734375,
"learning_rate": 1.7582214762700054e-05,
"loss": 0.1974,
"step": 4750
},
{
"epoch": 8.276762402088773,
"grad_norm": 0.71484375,
"learning_rate": 1.7410394951451814e-05,
"loss": 0.1993,
"step": 4755
},
{
"epoch": 8.285465622280244,
"grad_norm": 0.73046875,
"learning_rate": 1.7239338722771327e-05,
"loss": 0.2046,
"step": 4760
},
{
"epoch": 8.294168842471715,
"grad_norm": 0.69140625,
"learning_rate": 1.706904765815963e-05,
"loss": 0.2007,
"step": 4765
},
{
"epoch": 8.302872062663186,
"grad_norm": 0.6796875,
"learning_rate": 1.6899523332043586e-05,
"loss": 0.2041,
"step": 4770
},
{
"epoch": 8.311575282854657,
"grad_norm": 0.703125,
"learning_rate": 1.673076731176114e-05,
"loss": 0.2024,
"step": 4775
},
{
"epoch": 8.320278503046127,
"grad_norm": 0.671875,
"learning_rate": 1.6562781157546835e-05,
"loss": 0.2025,
"step": 4780
},
{
"epoch": 8.328981723237598,
"grad_norm": 0.68359375,
"learning_rate": 1.639556642251737e-05,
"loss": 0.1961,
"step": 4785
},
{
"epoch": 8.33768494342907,
"grad_norm": 0.75,
"learning_rate": 1.622912465265738e-05,
"loss": 0.1966,
"step": 4790
},
{
"epoch": 8.34638816362054,
"grad_norm": 0.703125,
"learning_rate": 1.6063457386805004e-05,
"loss": 0.1987,
"step": 4795
},
{
"epoch": 8.35509138381201,
"grad_norm": 0.67578125,
"learning_rate": 1.5898566156637708e-05,
"loss": 0.2005,
"step": 4800
},
{
"epoch": 8.36379460400348,
"grad_norm": 0.734375,
"learning_rate": 1.573445248665806e-05,
"loss": 0.1993,
"step": 4805
},
{
"epoch": 8.372497824194951,
"grad_norm": 0.68359375,
"learning_rate": 1.5571117894179754e-05,
"loss": 0.2004,
"step": 4810
},
{
"epoch": 8.381201044386422,
"grad_norm": 0.74609375,
"learning_rate": 1.540856388931359e-05,
"loss": 0.1989,
"step": 4815
},
{
"epoch": 8.389904264577893,
"grad_norm": 0.703125,
"learning_rate": 1.5246791974953223e-05,
"loss": 0.1935,
"step": 4820
},
{
"epoch": 8.398607484769364,
"grad_norm": 0.625,
"learning_rate": 1.5085803646761687e-05,
"loss": 0.1989,
"step": 4825
},
{
"epoch": 8.407310704960835,
"grad_norm": 0.7421875,
"learning_rate": 1.4925600393157324e-05,
"loss": 0.1976,
"step": 4830
},
{
"epoch": 8.416013925152306,
"grad_norm": 0.7578125,
"learning_rate": 1.4766183695300006e-05,
"loss": 0.2008,
"step": 4835
},
{
"epoch": 8.424717145343777,
"grad_norm": 0.73828125,
"learning_rate": 1.4607555027077525e-05,
"loss": 0.2007,
"step": 4840
},
{
"epoch": 8.433420365535248,
"grad_norm": 0.73046875,
"learning_rate": 1.4449715855091972e-05,
"loss": 0.1992,
"step": 4845
},
{
"epoch": 8.44212358572672,
"grad_norm": 0.69140625,
"learning_rate": 1.429266763864614e-05,
"loss": 0.1959,
"step": 4850
},
{
"epoch": 8.45082680591819,
"grad_norm": 0.6875,
"learning_rate": 1.4136411829730023e-05,
"loss": 0.1981,
"step": 4855
},
{
"epoch": 8.459530026109661,
"grad_norm": 0.7109375,
"learning_rate": 1.3980949873007364e-05,
"loss": 0.2006,
"step": 4860
},
{
"epoch": 8.468233246301132,
"grad_norm": 0.69140625,
"learning_rate": 1.3826283205802427e-05,
"loss": 0.1991,
"step": 4865
},
{
"epoch": 8.476936466492603,
"grad_norm": 0.69140625,
"learning_rate": 1.3672413258086592e-05,
"loss": 0.1991,
"step": 4870
},
{
"epoch": 8.485639686684074,
"grad_norm": 0.70703125,
"learning_rate": 1.3519341452465151e-05,
"loss": 0.2025,
"step": 4875
},
{
"epoch": 8.494342906875543,
"grad_norm": 0.70703125,
"learning_rate": 1.336706920416415e-05,
"loss": 0.2,
"step": 4880
},
{
"epoch": 8.503046127067014,
"grad_norm": 0.6953125,
"learning_rate": 1.3215597921017387e-05,
"loss": 0.2004,
"step": 4885
},
{
"epoch": 8.511749347258485,
"grad_norm": 0.6484375,
"learning_rate": 1.3064929003453286e-05,
"loss": 0.1985,
"step": 4890
},
{
"epoch": 8.520452567449956,
"grad_norm": 0.6875,
"learning_rate": 1.2915063844481989e-05,
"loss": 0.1978,
"step": 4895
},
{
"epoch": 8.529155787641427,
"grad_norm": 0.71484375,
"learning_rate": 1.2766003829682505e-05,
"loss": 0.1972,
"step": 4900
},
{
"epoch": 8.537859007832898,
"grad_norm": 0.734375,
"learning_rate": 1.2617750337189904e-05,
"loss": 0.1993,
"step": 4905
},
{
"epoch": 8.546562228024369,
"grad_norm": 0.6796875,
"learning_rate": 1.2470304737682514e-05,
"loss": 0.1956,
"step": 4910
},
{
"epoch": 8.55526544821584,
"grad_norm": 0.7109375,
"learning_rate": 1.232366839436926e-05,
"loss": 0.1976,
"step": 4915
},
{
"epoch": 8.56396866840731,
"grad_norm": 0.71875,
"learning_rate": 1.2177842662977135e-05,
"loss": 0.192,
"step": 4920
},
{
"epoch": 8.572671888598782,
"grad_norm": 0.78515625,
"learning_rate": 1.2032828891738646e-05,
"loss": 0.2021,
"step": 4925
},
{
"epoch": 8.581375108790253,
"grad_norm": 0.734375,
"learning_rate": 1.1888628421379221e-05,
"loss": 0.1987,
"step": 4930
},
{
"epoch": 8.590078328981724,
"grad_norm": 0.69140625,
"learning_rate": 1.1745242585104955e-05,
"loss": 0.2024,
"step": 4935
},
{
"epoch": 8.598781549173195,
"grad_norm": 0.69921875,
"learning_rate": 1.160267270859029e-05,
"loss": 0.2027,
"step": 4940
},
{
"epoch": 8.607484769364666,
"grad_norm": 0.7421875,
"learning_rate": 1.1460920109965612e-05,
"loss": 0.2012,
"step": 4945
},
{
"epoch": 8.616187989556135,
"grad_norm": 0.69140625,
"learning_rate": 1.1319986099805279e-05,
"loss": 0.2001,
"step": 4950
},
{
"epoch": 8.624891209747606,
"grad_norm": 0.7109375,
"learning_rate": 1.1179871981115253e-05,
"loss": 0.2014,
"step": 4955
},
{
"epoch": 8.633594429939077,
"grad_norm": 0.74609375,
"learning_rate": 1.1040579049321309e-05,
"loss": 0.2014,
"step": 4960
},
{
"epoch": 8.642297650130548,
"grad_norm": 0.7109375,
"learning_rate": 1.0902108592256831e-05,
"loss": 0.2002,
"step": 4965
},
{
"epoch": 8.651000870322019,
"grad_norm": 0.7421875,
"learning_rate": 1.0764461890151112e-05,
"loss": 0.1967,
"step": 4970
},
{
"epoch": 8.65970409051349,
"grad_norm": 0.73046875,
"learning_rate": 1.062764021561733e-05,
"loss": 0.2005,
"step": 4975
},
{
"epoch": 8.66840731070496,
"grad_norm": 0.71875,
"learning_rate": 1.0491644833640868e-05,
"loss": 0.2013,
"step": 4980
},
{
"epoch": 8.677110530896432,
"grad_norm": 0.69921875,
"learning_rate": 1.0356477001567677e-05,
"loss": 0.197,
"step": 4985
},
{
"epoch": 8.685813751087903,
"grad_norm": 0.69140625,
"learning_rate": 1.0222137969092581e-05,
"loss": 0.2012,
"step": 4990
},
{
"epoch": 8.694516971279374,
"grad_norm": 0.6875,
"learning_rate": 1.0088628978247694e-05,
"loss": 0.2006,
"step": 4995
},
{
"epoch": 8.703220191470844,
"grad_norm": 0.68359375,
"learning_rate": 9.955951263390972e-06,
"loss": 0.1987,
"step": 5000
},
{
"epoch": 8.711923411662315,
"grad_norm": 0.62890625,
"learning_rate": 9.824106051194859e-06,
"loss": 0.1977,
"step": 5005
},
{
"epoch": 8.720626631853786,
"grad_norm": 0.70703125,
"learning_rate": 9.69309456063484e-06,
"loss": 0.1986,
"step": 5010
},
{
"epoch": 8.729329852045257,
"grad_norm": 0.71484375,
"learning_rate": 9.562918002978283e-06,
"loss": 0.2016,
"step": 5015
},
{
"epoch": 8.738033072236728,
"grad_norm": 0.66015625,
"learning_rate": 9.43357758177309e-06,
"loss": 0.1969,
"step": 5020
},
{
"epoch": 8.7467362924282,
"grad_norm": 0.72265625,
"learning_rate": 9.305074492836763e-06,
"loss": 0.197,
"step": 5025
},
{
"epoch": 8.755439512619668,
"grad_norm": 0.73046875,
"learning_rate": 9.177409924245161e-06,
"loss": 0.1953,
"step": 5030
},
{
"epoch": 8.76414273281114,
"grad_norm": 0.71484375,
"learning_rate": 9.050585056321626e-06,
"loss": 0.1979,
"step": 5035
},
{
"epoch": 8.77284595300261,
"grad_norm": 0.72265625,
"learning_rate": 8.924601061626048e-06,
"loss": 0.1969,
"step": 5040
},
{
"epoch": 8.781549173194081,
"grad_norm": 0.66796875,
"learning_rate": 8.799459104944064e-06,
"loss": 0.1983,
"step": 5045
},
{
"epoch": 8.790252393385552,
"grad_norm": 0.7421875,
"learning_rate": 8.675160343276167e-06,
"loss": 0.1982,
"step": 5050
},
{
"epoch": 8.798955613577023,
"grad_norm": 0.7421875,
"learning_rate": 8.551705925827103e-06,
"loss": 0.1989,
"step": 5055
},
{
"epoch": 8.807658833768494,
"grad_norm": 0.6875,
"learning_rate": 8.429096993995277e-06,
"loss": 0.1958,
"step": 5060
},
{
"epoch": 8.816362053959965,
"grad_norm": 0.68359375,
"learning_rate": 8.307334681362133e-06,
"loss": 0.1996,
"step": 5065
},
{
"epoch": 8.825065274151436,
"grad_norm": 0.71484375,
"learning_rate": 8.18642011368167e-06,
"loss": 0.2031,
"step": 5070
},
{
"epoch": 8.833768494342907,
"grad_norm": 0.734375,
"learning_rate": 8.066354408870048e-06,
"loss": 0.201,
"step": 5075
},
{
"epoch": 8.842471714534378,
"grad_norm": 0.67578125,
"learning_rate": 7.947138676995302e-06,
"loss": 0.2003,
"step": 5080
},
{
"epoch": 8.851174934725849,
"grad_norm": 0.6953125,
"learning_rate": 7.828774020267072e-06,
"loss": 0.1989,
"step": 5085
},
{
"epoch": 8.85987815491732,
"grad_norm": 0.7265625,
"learning_rate": 7.711261533026238e-06,
"loss": 0.2007,
"step": 5090
},
{
"epoch": 8.868581375108791,
"grad_norm": 0.72265625,
"learning_rate": 7.594602301735087e-06,
"loss": 0.204,
"step": 5095
},
{
"epoch": 8.877284595300262,
"grad_norm": 0.6796875,
"learning_rate": 7.478797404967075e-06,
"loss": 0.1964,
"step": 5100
},
{
"epoch": 8.885987815491731,
"grad_norm": 0.6953125,
"learning_rate": 7.363847913396882e-06,
"loss": 0.1953,
"step": 5105
},
{
"epoch": 8.894691035683202,
"grad_norm": 0.74609375,
"learning_rate": 7.249754889790539e-06,
"loss": 0.2054,
"step": 5110
},
{
"epoch": 8.903394255874673,
"grad_norm": 0.734375,
"learning_rate": 7.136519388995633e-06,
"loss": 0.1996,
"step": 5115
},
{
"epoch": 8.912097476066144,
"grad_norm": 0.6796875,
"learning_rate": 7.024142457931504e-06,
"loss": 0.198,
"step": 5120
},
{
"epoch": 8.920800696257615,
"grad_norm": 0.67578125,
"learning_rate": 6.9126251355795864e-06,
"loss": 0.1938,
"step": 5125
},
{
"epoch": 8.929503916449086,
"grad_norm": 0.6875,
"learning_rate": 6.8019684529737505e-06,
"loss": 0.2041,
"step": 5130
},
{
"epoch": 8.938207136640557,
"grad_norm": 0.75,
"learning_rate": 6.6921734331908735e-06,
"loss": 0.199,
"step": 5135
},
{
"epoch": 8.946910356832028,
"grad_norm": 0.671875,
"learning_rate": 6.583241091341353e-06,
"loss": 0.1971,
"step": 5140
},
{
"epoch": 8.955613577023499,
"grad_norm": 0.67578125,
"learning_rate": 6.475172434559573e-06,
"loss": 0.1962,
"step": 5145
},
{
"epoch": 8.96431679721497,
"grad_norm": 0.69140625,
"learning_rate": 6.367968461994833e-06,
"loss": 0.1993,
"step": 5150
},
{
"epoch": 8.97302001740644,
"grad_norm": 0.6953125,
"learning_rate": 6.261630164801957e-06,
"loss": 0.2026,
"step": 5155
},
{
"epoch": 8.981723237597912,
"grad_norm": 0.71875,
"learning_rate": 6.156158526132139e-06,
"loss": 0.1999,
"step": 5160
},
{
"epoch": 8.990426457789383,
"grad_norm": 0.76953125,
"learning_rate": 6.05155452112387e-06,
"loss": 0.1983,
"step": 5165
},
{
"epoch": 8.999129677980854,
"grad_norm": 0.73828125,
"learning_rate": 5.947819116893971e-06,
"loss": 0.2037,
"step": 5170
},
{
"epoch": 8.999129677980854,
"eval_loss": 5.056090831756592,
"eval_runtime": 1.1157,
"eval_samples_per_second": 5.378,
"eval_steps_per_second": 0.896,
"step": 5170
},
{
"epoch": 9.007832898172325,
"grad_norm": 0.67578125,
"learning_rate": 5.8449532725286196e-06,
"loss": 0.1957,
"step": 5175
},
{
"epoch": 9.016536118363794,
"grad_norm": 0.71484375,
"learning_rate": 5.742957939074412e-06,
"loss": 0.1967,
"step": 5180
},
{
"epoch": 9.025239338555265,
"grad_norm": 0.671875,
"learning_rate": 5.641834059529661e-06,
"loss": 0.1998,
"step": 5185
},
{
"epoch": 9.033942558746736,
"grad_norm": 0.66796875,
"learning_rate": 5.541582568835679e-06,
"loss": 0.2032,
"step": 5190
},
{
"epoch": 9.042645778938207,
"grad_norm": 0.671875,
"learning_rate": 5.442204393868056e-06,
"loss": 0.1979,
"step": 5195
},
{
"epoch": 9.051348999129678,
"grad_norm": 0.6484375,
"learning_rate": 5.343700453428168e-06,
"loss": 0.1942,
"step": 5200
},
{
"epoch": 9.060052219321149,
"grad_norm": 0.703125,
"learning_rate": 5.246071658234642e-06,
"loss": 0.2022,
"step": 5205
},
{
"epoch": 9.06875543951262,
"grad_norm": 0.69140625,
"learning_rate": 5.1493189109149575e-06,
"loss": 0.2016,
"step": 5210
},
{
"epoch": 9.07745865970409,
"grad_norm": 0.6875,
"learning_rate": 5.0534431059970685e-06,
"loss": 0.1946,
"step": 5215
},
{
"epoch": 9.086161879895561,
"grad_norm": 0.6796875,
"learning_rate": 4.958445129901146e-06,
"loss": 0.2002,
"step": 5220
},
{
"epoch": 9.094865100087032,
"grad_norm": 0.6484375,
"learning_rate": 4.864325860931429e-06,
"loss": 0.1978,
"step": 5225
},
{
"epoch": 9.103568320278503,
"grad_norm": 0.6953125,
"learning_rate": 4.771086169268057e-06,
"loss": 0.1992,
"step": 5230
},
{
"epoch": 9.112271540469974,
"grad_norm": 0.68359375,
"learning_rate": 4.678726916958998e-06,
"loss": 0.1997,
"step": 5235
},
{
"epoch": 9.120974760661445,
"grad_norm": 0.70703125,
"learning_rate": 4.587248957912138e-06,
"loss": 0.1998,
"step": 5240
},
{
"epoch": 9.129677980852916,
"grad_norm": 0.66015625,
"learning_rate": 4.496653137887386e-06,
"loss": 0.1923,
"step": 5245
},
{
"epoch": 9.138381201044387,
"grad_norm": 0.76171875,
"learning_rate": 4.40694029448877e-06,
"loss": 0.1998,
"step": 5250
},
{
"epoch": 9.147084421235856,
"grad_norm": 0.6328125,
"learning_rate": 4.318111257156831e-06,
"loss": 0.1911,
"step": 5255
},
{
"epoch": 9.155787641427327,
"grad_norm": 0.73828125,
"learning_rate": 4.230166847160799e-06,
"loss": 0.1949,
"step": 5260
},
{
"epoch": 9.164490861618798,
"grad_norm": 0.66796875,
"learning_rate": 4.143107877591135e-06,
"loss": 0.1974,
"step": 5265
},
{
"epoch": 9.17319408181027,
"grad_norm": 0.69140625,
"learning_rate": 4.056935153351937e-06,
"loss": 0.1964,
"step": 5270
},
{
"epoch": 9.18189730200174,
"grad_norm": 0.73046875,
"learning_rate": 3.971649471153516e-06,
"loss": 0.1956,
"step": 5275
},
{
"epoch": 9.190600522193211,
"grad_norm": 0.6484375,
"learning_rate": 3.887251619505028e-06,
"loss": 0.1969,
"step": 5280
},
{
"epoch": 9.199303742384682,
"grad_norm": 0.65234375,
"learning_rate": 3.803742378707198e-06,
"loss": 0.1992,
"step": 5285
},
{
"epoch": 9.208006962576153,
"grad_norm": 0.64453125,
"learning_rate": 3.7211225208450774e-06,
"loss": 0.1945,
"step": 5290
},
{
"epoch": 9.216710182767624,
"grad_norm": 0.71484375,
"learning_rate": 3.6393928097809617e-06,
"loss": 0.199,
"step": 5295
},
{
"epoch": 9.225413402959095,
"grad_norm": 0.65625,
"learning_rate": 3.5585540011472516e-06,
"loss": 0.1956,
"step": 5300
},
{
"epoch": 9.234116623150566,
"grad_norm": 0.6953125,
"learning_rate": 3.4786068423395044e-06,
"loss": 0.1991,
"step": 5305
},
{
"epoch": 9.242819843342037,
"grad_norm": 0.6875,
"learning_rate": 3.3995520725095486e-06,
"loss": 0.1943,
"step": 5310
},
{
"epoch": 9.251523063533508,
"grad_norm": 0.7109375,
"learning_rate": 3.3213904225586346e-06,
"loss": 0.1973,
"step": 5315
},
{
"epoch": 9.260226283724979,
"grad_norm": 0.65234375,
"learning_rate": 3.2441226151306404e-06,
"loss": 0.1907,
"step": 5320
},
{
"epoch": 9.26892950391645,
"grad_norm": 0.66015625,
"learning_rate": 3.16774936460541e-06,
"loss": 0.1968,
"step": 5325
},
{
"epoch": 9.27763272410792,
"grad_norm": 0.6484375,
"learning_rate": 3.092271377092215e-06,
"loss": 0.1968,
"step": 5330
},
{
"epoch": 9.28633594429939,
"grad_norm": 0.6484375,
"learning_rate": 3.0176893504230807e-06,
"loss": 0.1955,
"step": 5335
},
{
"epoch": 9.295039164490861,
"grad_norm": 0.66796875,
"learning_rate": 2.944003974146525e-06,
"loss": 0.1939,
"step": 5340
},
{
"epoch": 9.303742384682332,
"grad_norm": 0.703125,
"learning_rate": 2.8712159295209873e-06,
"loss": 0.1955,
"step": 5345
},
{
"epoch": 9.312445604873803,
"grad_norm": 0.65234375,
"learning_rate": 2.7993258895086973e-06,
"loss": 0.1925,
"step": 5350
},
{
"epoch": 9.321148825065274,
"grad_norm": 0.6875,
"learning_rate": 2.7283345187693264e-06,
"loss": 0.196,
"step": 5355
},
{
"epoch": 9.329852045256745,
"grad_norm": 0.63671875,
"learning_rate": 2.658242473653905e-06,
"loss": 0.1929,
"step": 5360
},
{
"epoch": 9.338555265448216,
"grad_norm": 0.65625,
"learning_rate": 2.589050402198767e-06,
"loss": 0.1958,
"step": 5365
},
{
"epoch": 9.347258485639687,
"grad_norm": 0.671875,
"learning_rate": 2.520758944119539e-06,
"loss": 0.1939,
"step": 5370
},
{
"epoch": 9.355961705831158,
"grad_norm": 0.6640625,
"learning_rate": 2.4533687308051835e-06,
"loss": 0.1917,
"step": 5375
},
{
"epoch": 9.364664926022629,
"grad_norm": 0.6953125,
"learning_rate": 2.386880385312218e-06,
"loss": 0.1937,
"step": 5380
},
{
"epoch": 9.3733681462141,
"grad_norm": 0.6484375,
"learning_rate": 2.321294522358952e-06,
"loss": 0.1988,
"step": 5385
},
{
"epoch": 9.38207136640557,
"grad_norm": 0.65625,
"learning_rate": 2.256611748319792e-06,
"loss": 0.1943,
"step": 5390
},
{
"epoch": 9.390774586597042,
"grad_norm": 0.65625,
"learning_rate": 2.1928326612196015e-06,
"loss": 0.1964,
"step": 5395
},
{
"epoch": 9.399477806788513,
"grad_norm": 0.640625,
"learning_rate": 2.1299578507282147e-06,
"loss": 0.196,
"step": 5400
},
{
"epoch": 9.408181026979982,
"grad_norm": 0.7265625,
"learning_rate": 2.0679878981549993e-06,
"loss": 0.1921,
"step": 5405
},
{
"epoch": 9.416884247171453,
"grad_norm": 0.71875,
"learning_rate": 2.006923376443415e-06,
"loss": 0.1983,
"step": 5410
},
{
"epoch": 9.425587467362924,
"grad_norm": 0.67578125,
"learning_rate": 1.946764850165772e-06,
"loss": 0.1984,
"step": 5415
},
{
"epoch": 9.434290687554395,
"grad_norm": 0.6640625,
"learning_rate": 1.8875128755179938e-06,
"loss": 0.198,
"step": 5420
},
{
"epoch": 9.442993907745866,
"grad_norm": 0.71875,
"learning_rate": 1.8291680003145073e-06,
"loss": 0.1977,
"step": 5425
},
{
"epoch": 9.451697127937337,
"grad_norm": 0.671875,
"learning_rate": 1.7717307639831037e-06,
"loss": 0.1966,
"step": 5430
},
{
"epoch": 9.460400348128807,
"grad_norm": 0.6875,
"learning_rate": 1.7152016975599983e-06,
"loss": 0.1959,
"step": 5435
},
{
"epoch": 9.469103568320278,
"grad_norm": 0.68359375,
"learning_rate": 1.6595813236849556e-06,
"loss": 0.1946,
"step": 5440
},
{
"epoch": 9.47780678851175,
"grad_norm": 0.71875,
"learning_rate": 1.604870156596383e-06,
"loss": 0.194,
"step": 5445
},
{
"epoch": 9.48651000870322,
"grad_norm": 0.71875,
"learning_rate": 1.5510687021266234e-06,
"loss": 0.1926,
"step": 5450
},
{
"epoch": 9.495213228894691,
"grad_norm": 0.73046875,
"learning_rate": 1.4981774576972584e-06,
"loss": 0.1963,
"step": 5455
},
{
"epoch": 9.503916449086162,
"grad_norm": 0.69140625,
"learning_rate": 1.4461969123145457e-06,
"loss": 0.1973,
"step": 5460
},
{
"epoch": 9.512619669277633,
"grad_norm": 0.703125,
"learning_rate": 1.395127546564845e-06,
"loss": 0.1963,
"step": 5465
},
{
"epoch": 9.521322889469104,
"grad_norm": 0.73046875,
"learning_rate": 1.344969832610199e-06,
"loss": 0.1932,
"step": 5470
},
{
"epoch": 9.530026109660575,
"grad_norm": 0.71875,
"learning_rate": 1.2957242341839927e-06,
"loss": 0.197,
"step": 5475
},
{
"epoch": 9.538729329852046,
"grad_norm": 0.828125,
"learning_rate": 1.2473912065866345e-06,
"loss": 0.1921,
"step": 5480
},
{
"epoch": 9.547432550043515,
"grad_norm": 0.65234375,
"learning_rate": 1.1999711966813377e-06,
"loss": 0.1969,
"step": 5485
},
{
"epoch": 9.556135770234986,
"grad_norm": 0.6875,
"learning_rate": 1.1534646428900232e-06,
"loss": 0.1981,
"step": 5490
},
{
"epoch": 9.564838990426457,
"grad_norm": 0.69140625,
"learning_rate": 1.107871975189234e-06,
"loss": 0.2015,
"step": 5495
},
{
"epoch": 9.573542210617928,
"grad_norm": 0.7265625,
"learning_rate": 1.0631936151062172e-06,
"loss": 0.1953,
"step": 5500
},
{
"epoch": 9.5822454308094,
"grad_norm": 0.67578125,
"learning_rate": 1.019429975714914e-06,
"loss": 0.1969,
"step": 5505
},
{
"epoch": 9.59094865100087,
"grad_norm": 0.65234375,
"learning_rate": 9.765814616322755e-07,
"loss": 0.1956,
"step": 5510
},
{
"epoch": 9.599651871192341,
"grad_norm": 0.6796875,
"learning_rate": 9.346484690144319e-07,
"loss": 0.1987,
"step": 5515
},
{
"epoch": 9.608355091383812,
"grad_norm": 0.65234375,
"learning_rate": 8.936313855530398e-07,
"loss": 0.1944,
"step": 5520
},
{
"epoch": 9.617058311575283,
"grad_norm": 0.6796875,
"learning_rate": 8.535305904717517e-07,
"loss": 0.1932,
"step": 5525
},
{
"epoch": 9.625761531766754,
"grad_norm": 0.6875,
"learning_rate": 8.143464545226298e-07,
"loss": 0.196,
"step": 5530
},
{
"epoch": 9.634464751958225,
"grad_norm": 0.6875,
"learning_rate": 7.760793399827937e-07,
"loss": 0.1967,
"step": 5535
},
{
"epoch": 9.643167972149696,
"grad_norm": 0.703125,
"learning_rate": 7.387296006510225e-07,
"loss": 0.1958,
"step": 5540
},
{
"epoch": 9.651871192341167,
"grad_norm": 0.69140625,
"learning_rate": 7.022975818445022e-07,
"loss": 0.1933,
"step": 5545
},
{
"epoch": 9.660574412532638,
"grad_norm": 0.73046875,
"learning_rate": 6.667836203956168e-07,
"loss": 0.1972,
"step": 5550
},
{
"epoch": 9.669277632724107,
"grad_norm": 0.65234375,
"learning_rate": 6.321880446488737e-07,
"loss": 0.1932,
"step": 5555
},
{
"epoch": 9.677980852915578,
"grad_norm": 0.69921875,
"learning_rate": 5.985111744578165e-07,
"loss": 0.1977,
"step": 5560
},
{
"epoch": 9.686684073107049,
"grad_norm": 0.67578125,
"learning_rate": 5.657533211820942e-07,
"loss": 0.1979,
"step": 5565
},
{
"epoch": 9.69538729329852,
"grad_norm": 0.66796875,
"learning_rate": 5.339147876845974e-07,
"loss": 0.1961,
"step": 5570
},
{
"epoch": 9.70409051348999,
"grad_norm": 0.7265625,
"learning_rate": 5.029958683286263e-07,
"loss": 0.197,
"step": 5575
},
{
"epoch": 9.712793733681462,
"grad_norm": 0.6875,
"learning_rate": 4.7299684897520456e-07,
"loss": 0.193,
"step": 5580
},
{
"epoch": 9.721496953872933,
"grad_norm": 0.69140625,
"learning_rate": 4.4391800698038165e-07,
"loss": 0.1961,
"step": 5585
},
{
"epoch": 9.730200174064404,
"grad_norm": 0.63671875,
"learning_rate": 4.157596111927342e-07,
"loss": 0.1903,
"step": 5590
},
{
"epoch": 9.738903394255875,
"grad_norm": 0.71875,
"learning_rate": 3.8852192195083516e-07,
"loss": 0.1948,
"step": 5595
},
{
"epoch": 9.747606614447346,
"grad_norm": 0.70703125,
"learning_rate": 3.622051910808666e-07,
"loss": 0.1969,
"step": 5600
},
{
"epoch": 9.756309834638817,
"grad_norm": 0.734375,
"learning_rate": 3.368096618942773e-07,
"loss": 0.1948,
"step": 5605
},
{
"epoch": 9.765013054830288,
"grad_norm": 0.69140625,
"learning_rate": 3.1233556918555117e-07,
"loss": 0.1982,
"step": 5610
},
{
"epoch": 9.773716275021759,
"grad_norm": 0.625,
"learning_rate": 2.8878313923002e-07,
"loss": 0.1929,
"step": 5615
},
{
"epoch": 9.78241949521323,
"grad_norm": 0.66796875,
"learning_rate": 2.661525897817874e-07,
"loss": 0.1987,
"step": 5620
},
{
"epoch": 9.7911227154047,
"grad_norm": 0.7265625,
"learning_rate": 2.444441300717082e-07,
"loss": 0.1953,
"step": 5625
},
{
"epoch": 9.799825935596171,
"grad_norm": 0.6875,
"learning_rate": 2.2365796080542345e-07,
"loss": 0.2007,
"step": 5630
},
{
"epoch": 9.80852915578764,
"grad_norm": 0.85546875,
"learning_rate": 2.037942741615617e-07,
"loss": 0.2001,
"step": 5635
},
{
"epoch": 9.817232375979112,
"grad_norm": 0.69140625,
"learning_rate": 1.8485325378994056e-07,
"loss": 0.198,
"step": 5640
},
{
"epoch": 9.825935596170583,
"grad_norm": 0.7109375,
"learning_rate": 1.6683507480983462e-07,
"loss": 0.1958,
"step": 5645
},
{
"epoch": 9.834638816362054,
"grad_norm": 0.671875,
"learning_rate": 1.4973990380841019e-07,
"loss": 0.1938,
"step": 5650
},
{
"epoch": 9.843342036553524,
"grad_norm": 0.7109375,
"learning_rate": 1.3356789883914865e-07,
"loss": 0.1938,
"step": 5655
},
{
"epoch": 9.852045256744995,
"grad_norm": 0.703125,
"learning_rate": 1.1831920942039221e-07,
"loss": 0.1973,
"step": 5660
},
{
"epoch": 9.860748476936466,
"grad_norm": 0.69140625,
"learning_rate": 1.0399397653395593e-07,
"loss": 0.2024,
"step": 5665
},
{
"epoch": 9.869451697127937,
"grad_norm": 0.66796875,
"learning_rate": 9.059233262386225e-08,
"loss": 0.1995,
"step": 5670
},
{
"epoch": 9.878154917319408,
"grad_norm": 0.71484375,
"learning_rate": 7.811440159507522e-08,
"loss": 0.1972,
"step": 5675
},
{
"epoch": 9.88685813751088,
"grad_norm": 0.65625,
"learning_rate": 6.656029881233483e-08,
"loss": 0.1938,
"step": 5680
},
{
"epoch": 9.89556135770235,
"grad_norm": 0.66796875,
"learning_rate": 5.593013109917999e-08,
"loss": 0.1974,
"step": 5685
},
{
"epoch": 9.904264577893821,
"grad_norm": 0.734375,
"learning_rate": 4.6223996736860506e-08,
"loss": 0.1957,
"step": 5690
},
{
"epoch": 9.912967798085292,
"grad_norm": 0.65625,
"learning_rate": 3.744198546348221e-08,
"loss": 0.1971,
"step": 5695
},
{
"epoch": 9.921671018276763,
"grad_norm": 0.66796875,
"learning_rate": 2.9584178473174296e-08,
"loss": 0.1977,
"step": 5700
},
{
"epoch": 9.930374238468232,
"grad_norm": 0.7421875,
"learning_rate": 2.2650648415334376e-08,
"loss": 0.1934,
"step": 5705
},
{
"epoch": 9.939077458659703,
"grad_norm": 0.69921875,
"learning_rate": 1.664145939394013e-08,
"loss": 0.1937,
"step": 5710
},
{
"epoch": 9.947780678851174,
"grad_norm": 0.6796875,
"learning_rate": 1.1556666966971997e-08,
"loss": 0.1996,
"step": 5715
},
{
"epoch": 9.956483899042645,
"grad_norm": 0.7265625,
"learning_rate": 7.39631814590247e-09,
"loss": 0.1984,
"step": 5720
},
{
"epoch": 9.965187119234116,
"grad_norm": 0.66015625,
"learning_rate": 4.160451395263109e-09,
"loss": 0.1927,
"step": 5725
},
{
"epoch": 9.973890339425587,
"grad_norm": 0.734375,
"learning_rate": 1.8490966322670666e-09,
"loss": 0.2,
"step": 5730
},
{
"epoch": 9.982593559617058,
"grad_norm": 0.75390625,
"learning_rate": 4.6227522655373223e-10,
"loss": 0.1946,
"step": 5735
},
{
"epoch": 9.991296779808529,
"grad_norm": 0.671875,
"learning_rate": 0.0,
"loss": 0.1899,
"step": 5740
},
{
"epoch": 9.991296779808529,
"eval_loss": 5.076513767242432,
"eval_runtime": 0.7783,
"eval_samples_per_second": 7.709,
"eval_steps_per_second": 1.285,
"step": 5740
},
{
"epoch": 9.991296779808529,
"step": 5740,
"total_flos": 6.645284010274587e+18,
"train_loss": 0.6571785643956387,
"train_runtime": 32584.0572,
"train_samples_per_second": 4.229,
"train_steps_per_second": 0.176
}
],
"logging_steps": 5,
"max_steps": 5740,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.645284010274587e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}