{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 310860,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06433764395547835,
"grad_norm": 0.9469536542892456,
"learning_rate": 9.375e-06,
"loss": 7.0597,
"step": 1000
},
{
"epoch": 0.1286752879109567,
"grad_norm": 0.9877486824989319,
"learning_rate": 1.875e-05,
"loss": 4.8574,
"step": 2000
},
{
"epoch": 0.19301293186643506,
"grad_norm": 1.1811100244522095,
"learning_rate": 2.8125e-05,
"loss": 4.5464,
"step": 3000
},
{
"epoch": 0.2573505758219134,
"grad_norm": 1.155553936958313,
"learning_rate": 3.75e-05,
"loss": 4.3086,
"step": 4000
},
{
"epoch": 0.32168821977739176,
"grad_norm": 1.0062898397445679,
"learning_rate": 4.6874999999999994e-05,
"loss": 4.1307,
"step": 5000
},
{
"epoch": 0.3860258637328701,
"grad_norm": 0.9749443531036377,
"learning_rate": 5.625e-05,
"loss": 3.986,
"step": 6000
},
{
"epoch": 0.45036350768834843,
"grad_norm": 0.9870838522911072,
"learning_rate": 6.5625e-05,
"loss": 3.8708,
"step": 7000
},
{
"epoch": 0.5147011516438268,
"grad_norm": 1.0740858316421509,
"learning_rate": 7.5e-05,
"loss": 3.781,
"step": 8000
},
{
"epoch": 0.5790387955993052,
"grad_norm": 0.969571053981781,
"learning_rate": 8.437499999999999e-05,
"loss": 3.6942,
"step": 9000
},
{
"epoch": 0.6433764395547835,
"grad_norm": 0.923062801361084,
"learning_rate": 9.374999999999999e-05,
"loss": 3.6225,
"step": 10000
},
{
"epoch": 0.7077140835102619,
"grad_norm": 0.87486732006073,
"learning_rate": 0.00010312499999999999,
"loss": 3.5667,
"step": 11000
},
{
"epoch": 0.7720517274657402,
"grad_norm": 0.8343172073364258,
"learning_rate": 0.000112490625,
"loss": 3.5107,
"step": 12000
},
{
"epoch": 0.8363893714212186,
"grad_norm": 0.8089198470115662,
"learning_rate": 0.000121865625,
"loss": 3.4681,
"step": 13000
},
{
"epoch": 0.9007270153766969,
"grad_norm": 0.8141182661056519,
"learning_rate": 0.00013123125,
"loss": 3.4337,
"step": 14000
},
{
"epoch": 0.9650646593321752,
"grad_norm": 0.7596079707145691,
"learning_rate": 0.00014060625,
"loss": 3.3944,
"step": 15000
},
{
"epoch": 1.0,
"eval_accuracy": 0.37339323372369543,
"eval_loss": 3.4211647510528564,
"eval_runtime": 111.2734,
"eval_samples_per_second": 471.271,
"eval_steps_per_second": 7.369,
"step": 15543
},
{
"epoch": 1.0294023032876536,
"grad_norm": 0.7583508491516113,
"learning_rate": 0.000149971875,
"loss": 3.345,
"step": 16000
},
{
"epoch": 1.093739947243132,
"grad_norm": 0.7395954728126526,
"learning_rate": 0.00015933749999999996,
"loss": 3.3182,
"step": 17000
},
{
"epoch": 1.1580775911986103,
"grad_norm": 0.7119142413139343,
"learning_rate": 0.00016871249999999996,
"loss": 3.304,
"step": 18000
},
{
"epoch": 1.2224152351540887,
"grad_norm": 0.7133814692497253,
"learning_rate": 0.00017808749999999999,
"loss": 3.2808,
"step": 19000
},
{
"epoch": 1.286752879109567,
"grad_norm": 0.6662284731864929,
"learning_rate": 0.00018745312499999998,
"loss": 3.2624,
"step": 20000
},
{
"epoch": 1.3510905230650454,
"grad_norm": 0.6821054816246033,
"learning_rate": 0.00019682812499999998,
"loss": 3.2468,
"step": 21000
},
{
"epoch": 1.4154281670205238,
"grad_norm": 0.6423399448394775,
"learning_rate": 0.00020619374999999998,
"loss": 3.2323,
"step": 22000
},
{
"epoch": 1.4797658109760021,
"grad_norm": 0.6489351987838745,
"learning_rate": 0.00021556874999999998,
"loss": 3.218,
"step": 23000
},
{
"epoch": 1.5441034549314803,
"grad_norm": 0.6388360261917114,
"learning_rate": 0.00022493437499999998,
"loss": 3.2063,
"step": 24000
},
{
"epoch": 1.6084410988869586,
"grad_norm": 0.6035541296005249,
"learning_rate": 0.00023430937499999997,
"loss": 3.1971,
"step": 25000
},
{
"epoch": 1.672778742842437,
"grad_norm": 0.5949345231056213,
"learning_rate": 0.00024367499999999997,
"loss": 3.1683,
"step": 26000
},
{
"epoch": 1.7371163867979154,
"grad_norm": 0.5953760147094727,
"learning_rate": 0.00025305,
"loss": 3.1728,
"step": 27000
},
{
"epoch": 1.8014540307533937,
"grad_norm": 0.5276063680648804,
"learning_rate": 0.000262415625,
"loss": 3.1607,
"step": 28000
},
{
"epoch": 1.865791674708872,
"grad_norm": 0.5257272124290466,
"learning_rate": 0.000271790625,
"loss": 3.1472,
"step": 29000
},
{
"epoch": 1.9301293186643504,
"grad_norm": 0.49043259024620056,
"learning_rate": 0.000281165625,
"loss": 3.1367,
"step": 30000
},
{
"epoch": 1.9944669626198288,
"grad_norm": 0.5030378699302673,
"learning_rate": 0.000290521875,
"loss": 3.1245,
"step": 31000
},
{
"epoch": 2.0,
"eval_accuracy": 0.3939620256950988,
"eval_loss": 3.2037432193756104,
"eval_runtime": 111.7392,
"eval_samples_per_second": 469.307,
"eval_steps_per_second": 7.339,
"step": 31086
},
{
"epoch": 2.058804606575307,
"grad_norm": 0.5003546476364136,
"learning_rate": 0.000299896875,
"loss": 3.0828,
"step": 32000
},
{
"epoch": 2.1231422505307855,
"grad_norm": 0.48286330699920654,
"learning_rate": 0.00029893602524564295,
"loss": 3.08,
"step": 33000
},
{
"epoch": 2.187479894486264,
"grad_norm": 0.4852472245693207,
"learning_rate": 0.0002978602165961414,
"loss": 3.0633,
"step": 34000
},
{
"epoch": 2.2518175384417423,
"grad_norm": 0.4629572927951813,
"learning_rate": 0.00029678548375528934,
"loss": 3.063,
"step": 35000
},
{
"epoch": 2.3161551823972206,
"grad_norm": 0.4571368992328644,
"learning_rate": 0.0002957096751057878,
"loss": 3.0453,
"step": 36000
},
{
"epoch": 2.380492826352699,
"grad_norm": 0.44331055879592896,
"learning_rate": 0.0002946349422649358,
"loss": 3.0408,
"step": 37000
},
{
"epoch": 2.4448304703081774,
"grad_norm": 0.4230923354625702,
"learning_rate": 0.00029355913361543424,
"loss": 3.0359,
"step": 38000
},
{
"epoch": 2.5091681142636557,
"grad_norm": 0.4260108768939972,
"learning_rate": 0.0002924833249659327,
"loss": 3.0316,
"step": 39000
},
{
"epoch": 2.573505758219134,
"grad_norm": 0.41887935996055603,
"learning_rate": 0.0002914085921250807,
"loss": 3.0299,
"step": 40000
},
{
"epoch": 2.6378434021746124,
"grad_norm": 0.41068920493125916,
"learning_rate": 0.00029033278347557914,
"loss": 3.0138,
"step": 41000
},
{
"epoch": 2.702181046130091,
"grad_norm": 0.39430394768714905,
"learning_rate": 0.0002892591264433766,
"loss": 3.0038,
"step": 42000
},
{
"epoch": 2.766518690085569,
"grad_norm": 0.4100017547607422,
"learning_rate": 0.00028818331779387505,
"loss": 3.0088,
"step": 43000
},
{
"epoch": 2.8308563340410475,
"grad_norm": 0.4101816415786743,
"learning_rate": 0.0002871075091443735,
"loss": 2.9937,
"step": 44000
},
{
"epoch": 2.895193977996526,
"grad_norm": 0.38294607400894165,
"learning_rate": 0.000286031700494872,
"loss": 2.9898,
"step": 45000
},
{
"epoch": 2.9595316219520043,
"grad_norm": 0.37260037660598755,
"learning_rate": 0.00028495589184537043,
"loss": 2.9807,
"step": 46000
},
{
"epoch": 3.0,
"eval_accuracy": 0.40728960081362825,
"eval_loss": 3.079435110092163,
"eval_runtime": 111.6836,
"eval_samples_per_second": 469.541,
"eval_steps_per_second": 7.342,
"step": 46629
},
{
"epoch": 3.0238692659074826,
"grad_norm": 0.40783312916755676,
"learning_rate": 0.00028388115900451836,
"loss": 2.9534,
"step": 47000
},
{
"epoch": 3.088206909862961,
"grad_norm": 0.38361623883247375,
"learning_rate": 0.0002828053503550168,
"loss": 2.9122,
"step": 48000
},
{
"epoch": 3.1525445538184393,
"grad_norm": 0.3766690790653229,
"learning_rate": 0.0002817306175141648,
"loss": 2.9119,
"step": 49000
},
{
"epoch": 3.2168821977739177,
"grad_norm": 0.38536399602890015,
"learning_rate": 0.00028065480886466326,
"loss": 2.9135,
"step": 50000
},
{
"epoch": 3.2812198417293956,
"grad_norm": 0.38374361395835876,
"learning_rate": 0.0002795800760238112,
"loss": 2.9094,
"step": 51000
},
{
"epoch": 3.345557485684874,
"grad_norm": 0.3872029185295105,
"learning_rate": 0.00027850426737430965,
"loss": 2.9069,
"step": 52000
},
{
"epoch": 3.4098951296403524,
"grad_norm": 0.37401601672172546,
"learning_rate": 0.0002774284587248081,
"loss": 2.9092,
"step": 53000
},
{
"epoch": 3.4742327735958307,
"grad_norm": 0.34951257705688477,
"learning_rate": 0.0002763537258839561,
"loss": 2.9124,
"step": 54000
},
{
"epoch": 3.538570417551309,
"grad_norm": 0.36252182722091675,
"learning_rate": 0.000275278993043104,
"loss": 2.9086,
"step": 55000
},
{
"epoch": 3.6029080615067874,
"grad_norm": 0.3610841631889343,
"learning_rate": 0.0002742031843936025,
"loss": 2.9044,
"step": 56000
},
{
"epoch": 3.667245705462266,
"grad_norm": 0.356315016746521,
"learning_rate": 0.00027312737574410094,
"loss": 2.8984,
"step": 57000
},
{
"epoch": 3.731583349417744,
"grad_norm": 0.3501368761062622,
"learning_rate": 0.0002720515670945994,
"loss": 2.8991,
"step": 58000
},
{
"epoch": 3.7959209933732225,
"grad_norm": 0.3654986619949341,
"learning_rate": 0.00027097575844509786,
"loss": 2.8946,
"step": 59000
},
{
"epoch": 3.860258637328701,
"grad_norm": 0.34233444929122925,
"learning_rate": 0.00026990102560424585,
"loss": 2.8981,
"step": 60000
},
{
"epoch": 3.9245962812841793,
"grad_norm": 0.35118335485458374,
"learning_rate": 0.0002688252169547443,
"loss": 2.8932,
"step": 61000
},
{
"epoch": 3.9889339252396576,
"grad_norm": 0.3542274236679077,
"learning_rate": 0.00026775048411389223,
"loss": 2.8872,
"step": 62000
},
{
"epoch": 4.0,
"eval_accuracy": 0.4139775803532702,
"eval_loss": 3.0204551219940186,
"eval_runtime": 111.8979,
"eval_samples_per_second": 468.642,
"eval_steps_per_second": 7.328,
"step": 62172
},
{
"epoch": 4.053271569195136,
"grad_norm": 0.359250545501709,
"learning_rate": 0.0002666746754643907,
"loss": 2.8242,
"step": 63000
},
{
"epoch": 4.117609213150614,
"grad_norm": 0.34917619824409485,
"learning_rate": 0.00026559886681488915,
"loss": 2.8144,
"step": 64000
},
{
"epoch": 4.181946857106093,
"grad_norm": 0.351457417011261,
"learning_rate": 0.00026452413397403714,
"loss": 2.8132,
"step": 65000
},
{
"epoch": 4.246284501061571,
"grad_norm": 0.35231146216392517,
"learning_rate": 0.0002634483253245356,
"loss": 2.8203,
"step": 66000
},
{
"epoch": 4.310622145017049,
"grad_norm": 0.354030579328537,
"learning_rate": 0.0002623735924836836,
"loss": 2.8273,
"step": 67000
},
{
"epoch": 4.374959788972528,
"grad_norm": 0.3434860408306122,
"learning_rate": 0.00026129778383418204,
"loss": 2.8221,
"step": 68000
},
{
"epoch": 4.439297432928006,
"grad_norm": 0.35598379373550415,
"learning_rate": 0.0002602219751846805,
"loss": 2.8283,
"step": 69000
},
{
"epoch": 4.5036350768834845,
"grad_norm": 0.350340873003006,
"learning_rate": 0.00025914616653517896,
"loss": 2.8242,
"step": 70000
},
{
"epoch": 4.567972720838963,
"grad_norm": 0.34078752994537354,
"learning_rate": 0.0002580714336943269,
"loss": 2.8309,
"step": 71000
},
{
"epoch": 4.632310364794441,
"grad_norm": 0.3571733832359314,
"learning_rate": 0.00025699670085347487,
"loss": 2.8248,
"step": 72000
},
{
"epoch": 4.69664800874992,
"grad_norm": 0.35940021276474,
"learning_rate": 0.00025592089220397333,
"loss": 2.8334,
"step": 73000
},
{
"epoch": 4.760985652705398,
"grad_norm": 0.3354775607585907,
"learning_rate": 0.0002548450835544718,
"loss": 2.8263,
"step": 74000
},
{
"epoch": 4.825323296660876,
"grad_norm": 0.330805242061615,
"learning_rate": 0.0002537703507136197,
"loss": 2.8296,
"step": 75000
},
{
"epoch": 4.889660940616355,
"grad_norm": 0.32566189765930176,
"learning_rate": 0.0002526945420641182,
"loss": 2.8208,
"step": 76000
},
{
"epoch": 4.953998584571833,
"grad_norm": 0.32299116253852844,
"learning_rate": 0.00025161980922326616,
"loss": 2.8286,
"step": 77000
},
{
"epoch": 5.0,
"eval_accuracy": 0.417990981289541,
"eval_loss": 2.988518238067627,
"eval_runtime": 111.9091,
"eval_samples_per_second": 468.595,
"eval_steps_per_second": 7.327,
"step": 77715
},
{
"epoch": 5.018336228527311,
"grad_norm": 0.332711786031723,
"learning_rate": 0.00025054400057376457,
"loss": 2.797,
"step": 78000
},
{
"epoch": 5.08267387248279,
"grad_norm": 0.3597155809402466,
"learning_rate": 0.000249468191924263,
"loss": 2.7461,
"step": 79000
},
{
"epoch": 5.147011516438268,
"grad_norm": 0.3411096930503845,
"learning_rate": 0.000248393459083411,
"loss": 2.7493,
"step": 80000
},
{
"epoch": 5.2113491603937465,
"grad_norm": 0.35248109698295593,
"learning_rate": 0.00024731765043390947,
"loss": 2.7584,
"step": 81000
},
{
"epoch": 5.275686804349225,
"grad_norm": 0.3520190417766571,
"learning_rate": 0.00024624184178440793,
"loss": 2.755,
"step": 82000
},
{
"epoch": 5.340024448304703,
"grad_norm": 0.34867680072784424,
"learning_rate": 0.00024516710894355586,
"loss": 2.7649,
"step": 83000
},
{
"epoch": 5.404362092260182,
"grad_norm": 0.3400154709815979,
"learning_rate": 0.00024409130029405434,
"loss": 2.7586,
"step": 84000
},
{
"epoch": 5.46869973621566,
"grad_norm": 0.3640024662017822,
"learning_rate": 0.0002430154916445528,
"loss": 2.7606,
"step": 85000
},
{
"epoch": 5.533037380171138,
"grad_norm": 0.3456322252750397,
"learning_rate": 0.00024193968299505126,
"loss": 2.767,
"step": 86000
},
{
"epoch": 5.597375024126617,
"grad_norm": 0.3284786343574524,
"learning_rate": 0.00024086495015419922,
"loss": 2.7687,
"step": 87000
},
{
"epoch": 5.661712668082095,
"grad_norm": 0.3351786732673645,
"learning_rate": 0.00023978914150469768,
"loss": 2.7705,
"step": 88000
},
{
"epoch": 5.726050312037573,
"grad_norm": 0.3189627528190613,
"learning_rate": 0.00023871440866384563,
"loss": 2.7743,
"step": 89000
},
{
"epoch": 5.790387955993052,
"grad_norm": 0.3447468876838684,
"learning_rate": 0.0002376386000143441,
"loss": 2.7712,
"step": 90000
},
{
"epoch": 5.85472559994853,
"grad_norm": 0.3212040364742279,
"learning_rate": 0.00023656386717349205,
"loss": 2.7741,
"step": 91000
},
{
"epoch": 5.9190632439040085,
"grad_norm": 0.3384701609611511,
"learning_rate": 0.0002354880585239905,
"loss": 2.7747,
"step": 92000
},
{
"epoch": 5.983400887859487,
"grad_norm": 0.33266380429267883,
"learning_rate": 0.00023441224987448894,
"loss": 2.779,
"step": 93000
},
{
"epoch": 6.0,
"eval_accuracy": 0.4206485843765424,
"eval_loss": 2.969926595687866,
"eval_runtime": 111.9683,
"eval_samples_per_second": 468.347,
"eval_steps_per_second": 7.324,
"step": 93258
},
{
"epoch": 6.047738531814965,
"grad_norm": 0.35645657777786255,
"learning_rate": 0.0002333364412249874,
"loss": 2.7059,
"step": 94000
},
{
"epoch": 6.112076175770444,
"grad_norm": 0.35733386874198914,
"learning_rate": 0.0002322617083841354,
"loss": 2.6966,
"step": 95000
},
{
"epoch": 6.176413819725922,
"grad_norm": 0.3435540199279785,
"learning_rate": 0.00023118589973463387,
"loss": 2.6986,
"step": 96000
},
{
"epoch": 6.2407514636814,
"grad_norm": 0.3479596972465515,
"learning_rate": 0.0002301100910851323,
"loss": 2.7068,
"step": 97000
},
{
"epoch": 6.305089107636879,
"grad_norm": 0.3150424659252167,
"learning_rate": 0.00022903428243563077,
"loss": 2.7074,
"step": 98000
},
{
"epoch": 6.369426751592357,
"grad_norm": 0.34055858850479126,
"learning_rate": 0.00022795954959477872,
"loss": 2.7065,
"step": 99000
},
{
"epoch": 6.433764395547835,
"grad_norm": 0.3491341769695282,
"learning_rate": 0.0002268848167539267,
"loss": 2.7156,
"step": 100000
},
{
"epoch": 6.498102039503314,
"grad_norm": 0.3347100019454956,
"learning_rate": 0.00022580900810442514,
"loss": 2.714,
"step": 101000
},
{
"epoch": 6.562439683458791,
"grad_norm": 0.35210439562797546,
"learning_rate": 0.00022473427526357312,
"loss": 2.7194,
"step": 102000
},
{
"epoch": 6.6267773274142705,
"grad_norm": 0.3326897919178009,
"learning_rate": 0.00022365846661407155,
"loss": 2.727,
"step": 103000
},
{
"epoch": 6.691114971369748,
"grad_norm": 0.3269229531288147,
"learning_rate": 0.00022258265796457,
"loss": 2.7203,
"step": 104000
},
{
"epoch": 6.755452615325227,
"grad_norm": 0.34183254837989807,
"learning_rate": 0.00022150684931506847,
"loss": 2.7328,
"step": 105000
},
{
"epoch": 6.819790259280705,
"grad_norm": 0.33449244499206543,
"learning_rate": 0.00022043211647421643,
"loss": 2.7291,
"step": 106000
},
{
"epoch": 6.884127903236184,
"grad_norm": 0.33734798431396484,
"learning_rate": 0.0002193563078247149,
"loss": 2.7277,
"step": 107000
},
{
"epoch": 6.948465547191661,
"grad_norm": 0.34088870882987976,
"learning_rate": 0.00021828157498386284,
"loss": 2.7316,
"step": 108000
},
{
"epoch": 7.0,
"eval_accuracy": 0.42224443247932275,
"eval_loss": 2.958820104598999,
"eval_runtime": 111.7467,
"eval_samples_per_second": 469.276,
"eval_steps_per_second": 7.338,
"step": 108801
},
{
"epoch": 7.01280319114714,
"grad_norm": 0.3517482876777649,
"learning_rate": 0.0002172057663343613,
"loss": 2.7116,
"step": 109000
},
{
"epoch": 7.077140835102618,
"grad_norm": 0.3411085903644562,
"learning_rate": 0.00021613103349350926,
"loss": 2.6469,
"step": 110000
},
{
"epoch": 7.1414784790580965,
"grad_norm": 0.3486618399620056,
"learning_rate": 0.00021505522484400772,
"loss": 2.6546,
"step": 111000
},
{
"epoch": 7.205816123013575,
"grad_norm": 0.35618531703948975,
"learning_rate": 0.00021397941619450618,
"loss": 2.6603,
"step": 112000
},
{
"epoch": 7.270153766969053,
"grad_norm": 0.34740447998046875,
"learning_rate": 0.00021290468335365413,
"loss": 2.6632,
"step": 113000
},
{
"epoch": 7.334491410924532,
"grad_norm": 0.339108407497406,
"learning_rate": 0.0002118288747041526,
"loss": 2.6682,
"step": 114000
},
{
"epoch": 7.39882905488001,
"grad_norm": 0.36686399579048157,
"learning_rate": 0.00021075306605465105,
"loss": 2.6718,
"step": 115000
},
{
"epoch": 7.463166698835488,
"grad_norm": 0.3336213529109955,
"learning_rate": 0.000209678333213799,
"loss": 2.6806,
"step": 116000
},
{
"epoch": 7.527504342790967,
"grad_norm": 0.34256553649902344,
"learning_rate": 0.00020860252456429747,
"loss": 2.6772,
"step": 117000
},
{
"epoch": 7.591841986746445,
"grad_norm": 0.3527204096317291,
"learning_rate": 0.00020752671591479593,
"loss": 2.6786,
"step": 118000
},
{
"epoch": 7.656179630701923,
"grad_norm": 0.34285178780555725,
"learning_rate": 0.0002064509072652944,
"loss": 2.6816,
"step": 119000
},
{
"epoch": 7.720517274657402,
"grad_norm": 0.3418208658695221,
"learning_rate": 0.00020537617442444234,
"loss": 2.6893,
"step": 120000
},
{
"epoch": 7.78485491861288,
"grad_norm": 0.34486138820648193,
"learning_rate": 0.0002043003657749408,
"loss": 2.6847,
"step": 121000
},
{
"epoch": 7.8491925625683585,
"grad_norm": 0.348530650138855,
"learning_rate": 0.00020322563293408876,
"loss": 2.6826,
"step": 122000
},
{
"epoch": 7.913530206523837,
"grad_norm": 0.33808425068855286,
"learning_rate": 0.00020215090009323674,
"loss": 2.6905,
"step": 123000
},
{
"epoch": 7.977867850479315,
"grad_norm": 0.3486366868019104,
"learning_rate": 0.0002010750914437352,
"loss": 2.6909,
"step": 124000
},
{
"epoch": 8.0,
"eval_accuracy": 0.4232837528604119,
"eval_loss": 2.9554243087768555,
"eval_runtime": 112.1041,
"eval_samples_per_second": 467.779,
"eval_steps_per_second": 7.315,
"step": 124344
},
{
"epoch": 8.042205494434794,
"grad_norm": 0.35380104184150696,
"learning_rate": 0.00020000035860288316,
"loss": 2.6303,
"step": 125000
},
{
"epoch": 8.106543138390272,
"grad_norm": 0.3654320240020752,
"learning_rate": 0.00019892454995338162,
"loss": 2.6128,
"step": 126000
},
{
"epoch": 8.170880782345751,
"grad_norm": 0.3670574724674225,
"learning_rate": 0.00019784874130388008,
"loss": 2.617,
"step": 127000
},
{
"epoch": 8.235218426301229,
"grad_norm": 0.38059455156326294,
"learning_rate": 0.00019677400846302803,
"loss": 2.6274,
"step": 128000
},
{
"epoch": 8.299556070256708,
"grad_norm": 0.3698261082172394,
"learning_rate": 0.00019569927562217599,
"loss": 2.6309,
"step": 129000
},
{
"epoch": 8.363893714212185,
"grad_norm": 0.3583601117134094,
"learning_rate": 0.00019462346697267445,
"loss": 2.6312,
"step": 130000
},
{
"epoch": 8.428231358167665,
"grad_norm": 0.3602234721183777,
"learning_rate": 0.0001935476583231729,
"loss": 2.6368,
"step": 131000
},
{
"epoch": 8.492569002123142,
"grad_norm": 0.3441711664199829,
"learning_rate": 0.00019247184967367137,
"loss": 2.6372,
"step": 132000
},
{
"epoch": 8.556906646078621,
"grad_norm": 0.3533187508583069,
"learning_rate": 0.00019139604102416983,
"loss": 2.6443,
"step": 133000
},
{
"epoch": 8.621244290034099,
"grad_norm": 0.3579193651676178,
"learning_rate": 0.00019032130818331778,
"loss": 2.6481,
"step": 134000
},
{
"epoch": 8.685581933989578,
"grad_norm": 0.3524502217769623,
"learning_rate": 0.00018924549953381624,
"loss": 2.6509,
"step": 135000
},
{
"epoch": 8.749919577945056,
"grad_norm": 0.36159747838974,
"learning_rate": 0.0001881707666929642,
"loss": 2.6456,
"step": 136000
},
{
"epoch": 8.814257221900533,
"grad_norm": 0.34249147772789,
"learning_rate": 0.00018709495804346266,
"loss": 2.6538,
"step": 137000
},
{
"epoch": 8.878594865856012,
"grad_norm": 0.34867429733276367,
"learning_rate": 0.0001860202252026106,
"loss": 2.6558,
"step": 138000
},
{
"epoch": 8.942932509811492,
"grad_norm": 0.3351230025291443,
"learning_rate": 0.00018494441655310907,
"loss": 2.6504,
"step": 139000
},
{
"epoch": 9.0,
"eval_accuracy": 0.4238085730096768,
"eval_loss": 2.9544410705566406,
"eval_runtime": 111.7789,
"eval_samples_per_second": 469.14,
"eval_steps_per_second": 7.336,
"step": 139887
},
{
"epoch": 9.007270153766969,
"grad_norm": 0.36276528239250183,
"learning_rate": 0.00018386968371225703,
"loss": 2.6469,
"step": 140000
},
{
"epoch": 9.071607797722447,
"grad_norm": 0.36368831992149353,
"learning_rate": 0.0001827938750627555,
"loss": 2.5666,
"step": 141000
},
{
"epoch": 9.135945441677926,
"grad_norm": 0.36417004466056824,
"learning_rate": 0.00018171806641325395,
"loss": 2.5832,
"step": 142000
},
{
"epoch": 9.200283085633403,
"grad_norm": 0.3550620973110199,
"learning_rate": 0.0001806422577637524,
"loss": 2.5888,
"step": 143000
},
{
"epoch": 9.264620729588882,
"grad_norm": 0.3513035178184509,
"learning_rate": 0.00017956644911425084,
"loss": 2.5872,
"step": 144000
},
{
"epoch": 9.32895837354436,
"grad_norm": 0.3576969802379608,
"learning_rate": 0.00017849279208204832,
"loss": 2.599,
"step": 145000
},
{
"epoch": 9.39329601749984,
"grad_norm": 0.3496710956096649,
"learning_rate": 0.00017741698343254678,
"loss": 2.6042,
"step": 146000
},
{
"epoch": 9.457633661455317,
"grad_norm": 0.3502206802368164,
"learning_rate": 0.00017634225059169476,
"loss": 2.6069,
"step": 147000
},
{
"epoch": 9.521971305410796,
"grad_norm": 0.3516786992549896,
"learning_rate": 0.00017526644194219322,
"loss": 2.606,
"step": 148000
},
{
"epoch": 9.586308949366273,
"grad_norm": 0.3671824336051941,
"learning_rate": 0.00017419063329269168,
"loss": 2.6151,
"step": 149000
},
{
"epoch": 9.650646593321753,
"grad_norm": 0.36615684628486633,
"learning_rate": 0.00017311590045183964,
"loss": 2.6174,
"step": 150000
},
{
"epoch": 9.71498423727723,
"grad_norm": 0.369759202003479,
"learning_rate": 0.0001720400918023381,
"loss": 2.6162,
"step": 151000
},
{
"epoch": 9.77932188123271,
"grad_norm": 0.3495037257671356,
"learning_rate": 0.00017096428315283656,
"loss": 2.6186,
"step": 152000
},
{
"epoch": 9.843659525188187,
"grad_norm": 0.3635868728160858,
"learning_rate": 0.0001698895503119845,
"loss": 2.616,
"step": 153000
},
{
"epoch": 9.907997169143666,
"grad_norm": 0.352250337600708,
"learning_rate": 0.00016881374166248297,
"loss": 2.626,
"step": 154000
},
{
"epoch": 9.972334813099144,
"grad_norm": 0.3688776195049286,
"learning_rate": 0.00016773900882163093,
"loss": 2.6246,
"step": 155000
},
{
"epoch": 10.0,
"eval_accuracy": 0.424411016885778,
"eval_loss": 2.9523308277130127,
"eval_runtime": 111.8379,
"eval_samples_per_second": 468.893,
"eval_steps_per_second": 7.332,
"step": 155430
},
{
"epoch": 10.036672457054623,
"grad_norm": 0.3961314558982849,
"learning_rate": 0.0001666632001721294,
"loss": 2.5827,
"step": 156000
},
{
"epoch": 10.1010101010101,
"grad_norm": 0.3705954849720001,
"learning_rate": 0.00016558739152262782,
"loss": 2.5413,
"step": 157000
},
{
"epoch": 10.16534774496558,
"grad_norm": 0.37091416120529175,
"learning_rate": 0.00016451158287312628,
"loss": 2.5502,
"step": 158000
},
{
"epoch": 10.229685388921057,
"grad_norm": 0.38428565859794617,
"learning_rate": 0.00016343685003227424,
"loss": 2.5592,
"step": 159000
},
{
"epoch": 10.294023032876536,
"grad_norm": 0.3688577115535736,
"learning_rate": 0.0001623610413827727,
"loss": 2.5673,
"step": 160000
},
{
"epoch": 10.358360676832014,
"grad_norm": 0.38183775544166565,
"learning_rate": 0.00016128630854192065,
"loss": 2.5697,
"step": 161000
},
{
"epoch": 10.422698320787493,
"grad_norm": 0.37677517533302307,
"learning_rate": 0.0001602104998924191,
"loss": 2.5713,
"step": 162000
},
{
"epoch": 10.48703596474297,
"grad_norm": 0.3694332540035248,
"learning_rate": 0.00015913576705156707,
"loss": 2.5751,
"step": 163000
},
{
"epoch": 10.55137360869845,
"grad_norm": 0.3814958333969116,
"learning_rate": 0.00015806103421071502,
"loss": 2.5792,
"step": 164000
},
{
"epoch": 10.615711252653927,
"grad_norm": 0.38280004262924194,
"learning_rate": 0.00015698522556121348,
"loss": 2.5782,
"step": 165000
},
{
"epoch": 10.680048896609406,
"grad_norm": 0.3659280240535736,
"learning_rate": 0.00015590941691171194,
"loss": 2.5862,
"step": 166000
},
{
"epoch": 10.744386540564884,
"grad_norm": 0.34562841057777405,
"learning_rate": 0.0001548336082622104,
"loss": 2.5869,
"step": 167000
},
{
"epoch": 10.808724184520363,
"grad_norm": 0.3570345938205719,
"learning_rate": 0.00015375887542135836,
"loss": 2.59,
"step": 168000
},
{
"epoch": 10.87306182847584,
"grad_norm": 0.360215961933136,
"learning_rate": 0.00015268306677185682,
"loss": 2.5979,
"step": 169000
},
{
"epoch": 10.93739947243132,
"grad_norm": 0.370670884847641,
"learning_rate": 0.00015160725812235528,
"loss": 2.5988,
"step": 170000
},
{
"epoch": 11.0,
"eval_accuracy": 0.4248191770987571,
"eval_loss": 2.9567785263061523,
"eval_runtime": 112.2402,
"eval_samples_per_second": 467.212,
"eval_steps_per_second": 7.306,
"step": 170973
},
{
"epoch": 11.001737116386797,
"grad_norm": 0.38218948245048523,
"learning_rate": 0.00015053252528150323,
"loss": 2.5933,
"step": 171000
},
{
"epoch": 11.066074760342277,
"grad_norm": 0.396331787109375,
"learning_rate": 0.00014945671663200172,
"loss": 2.5023,
"step": 172000
},
{
"epoch": 11.130412404297754,
"grad_norm": 0.3751789927482605,
"learning_rate": 0.00014838090798250018,
"loss": 2.5227,
"step": 173000
},
{
"epoch": 11.194750048253233,
"grad_norm": 0.37265828251838684,
"learning_rate": 0.00014730509933299864,
"loss": 2.5299,
"step": 174000
},
{
"epoch": 11.25908769220871,
"grad_norm": 0.37080228328704834,
"learning_rate": 0.0001462303664921466,
"loss": 2.5333,
"step": 175000
},
{
"epoch": 11.32342533616419,
"grad_norm": 0.3808966875076294,
"learning_rate": 0.00014515563365129455,
"loss": 2.5376,
"step": 176000
},
{
"epoch": 11.387762980119668,
"grad_norm": 0.38901346921920776,
"learning_rate": 0.000144079825001793,
"loss": 2.5422,
"step": 177000
},
{
"epoch": 11.452100624075147,
"grad_norm": 0.380100816488266,
"learning_rate": 0.00014300401635229144,
"loss": 2.5533,
"step": 178000
},
{
"epoch": 11.516438268030624,
"grad_norm": 0.39306920766830444,
"learning_rate": 0.0001419282077027899,
"loss": 2.5507,
"step": 179000
},
{
"epoch": 11.580775911986104,
"grad_norm": 0.3917422890663147,
"learning_rate": 0.00014085239905328836,
"loss": 2.5579,
"step": 180000
},
{
"epoch": 11.645113555941581,
"grad_norm": 0.38742849230766296,
"learning_rate": 0.00013977766621243632,
"loss": 2.5531,
"step": 181000
},
{
"epoch": 11.70945119989706,
"grad_norm": 0.3767852187156677,
"learning_rate": 0.00013870185756293478,
"loss": 2.5633,
"step": 182000
},
{
"epoch": 11.773788843852538,
"grad_norm": 0.39576900005340576,
"learning_rate": 0.00013762604891343324,
"loss": 2.5648,
"step": 183000
},
{
"epoch": 11.838126487808017,
"grad_norm": 0.37659791111946106,
"learning_rate": 0.00013655131607258122,
"loss": 2.5631,
"step": 184000
},
{
"epoch": 11.902464131763494,
"grad_norm": 0.38377416133880615,
"learning_rate": 0.00013547658323172918,
"loss": 2.5631,
"step": 185000
},
{
"epoch": 11.966801775718974,
"grad_norm": 0.37857234477996826,
"learning_rate": 0.00013440077458222764,
"loss": 2.5639,
"step": 186000
},
{
"epoch": 12.0,
"eval_accuracy": 0.4247610714766456,
"eval_loss": 2.9595353603363037,
"eval_runtime": 111.9131,
"eval_samples_per_second": 468.578,
"eval_steps_per_second": 7.327,
"step": 186516
},
{
"epoch": 12.031139419674451,
"grad_norm": 0.4024442136287689,
"learning_rate": 0.0001333249659327261,
"loss": 2.5273,
"step": 187000
},
{
"epoch": 12.09547706362993,
"grad_norm": 0.4137458801269531,
"learning_rate": 0.00013225023309187405,
"loss": 2.4933,
"step": 188000
},
{
"epoch": 12.159814707585408,
"grad_norm": 0.409184992313385,
"learning_rate": 0.0001311744244423725,
"loss": 2.4967,
"step": 189000
},
{
"epoch": 12.224152351540887,
"grad_norm": 0.41316309571266174,
"learning_rate": 0.00013009861579287097,
"loss": 2.5063,
"step": 190000
},
{
"epoch": 12.288489995496365,
"grad_norm": 0.3909110724925995,
"learning_rate": 0.00012902280714336943,
"loss": 2.5153,
"step": 191000
},
{
"epoch": 12.352827639451844,
"grad_norm": 0.39046111702919006,
"learning_rate": 0.0001279469984938679,
"loss": 2.5115,
"step": 192000
},
{
"epoch": 12.417165283407321,
"grad_norm": 0.40070855617523193,
"learning_rate": 0.00012687226565301585,
"loss": 2.5157,
"step": 193000
},
{
"epoch": 12.4815029273628,
"grad_norm": 0.3970703184604645,
"learning_rate": 0.00012579645700351428,
"loss": 2.5198,
"step": 194000
},
{
"epoch": 12.545840571318278,
"grad_norm": 0.40202242136001587,
"learning_rate": 0.00012472064835401274,
"loss": 2.526,
"step": 195000
},
{
"epoch": 12.610178215273757,
"grad_norm": 0.3841732144355774,
"learning_rate": 0.0001236459155131607,
"loss": 2.5295,
"step": 196000
},
{
"epoch": 12.674515859229235,
"grad_norm": 0.40759024024009705,
"learning_rate": 0.00012257010686365916,
"loss": 2.5307,
"step": 197000
},
{
"epoch": 12.738853503184714,
"grad_norm": 0.3963831663131714,
"learning_rate": 0.00012149429821415763,
"loss": 2.534,
"step": 198000
},
{
"epoch": 12.803191147140192,
"grad_norm": 0.37255486845970154,
"learning_rate": 0.0001204195653733056,
"loss": 2.5354,
"step": 199000
},
{
"epoch": 12.86752879109567,
"grad_norm": 0.397368460893631,
"learning_rate": 0.00011934375672380406,
"loss": 2.5352,
"step": 200000
},
{
"epoch": 12.931866435051148,
"grad_norm": 0.379574716091156,
"learning_rate": 0.00011826902388295201,
"loss": 2.5397,
"step": 201000
},
{
"epoch": 12.996204079006628,
"grad_norm": 0.3803842067718506,
"learning_rate": 0.00011719321523345048,
"loss": 2.5361,
"step": 202000
},
{
"epoch": 13.0,
"eval_accuracy": 0.42475613586395655,
"eval_loss": 2.9698119163513184,
"eval_runtime": 112.0848,
"eval_samples_per_second": 467.86,
"eval_steps_per_second": 7.316,
"step": 202059
},
{
"epoch": 13.060541722962105,
"grad_norm": 0.40816885232925415,
"learning_rate": 0.00011611740658394894,
"loss": 2.4669,
"step": 203000
},
{
"epoch": 13.124879366917584,
"grad_norm": 0.42818671464920044,
"learning_rate": 0.00011504159793444738,
"loss": 2.467,
"step": 204000
},
{
"epoch": 13.189217010873062,
"grad_norm": 0.40255987644195557,
"learning_rate": 0.00011396686509359535,
"loss": 2.4753,
"step": 205000
},
{
"epoch": 13.253554654828541,
"grad_norm": 0.4254453778266907,
"learning_rate": 0.0001128921322527433,
"loss": 2.4808,
"step": 206000
},
{
"epoch": 13.317892298784018,
"grad_norm": 0.4060657322406769,
"learning_rate": 0.00011181632360324175,
"loss": 2.4932,
"step": 207000
},
{
"epoch": 13.382229942739498,
"grad_norm": 0.4138365387916565,
"learning_rate": 0.00011074051495374021,
"loss": 2.4922,
"step": 208000
},
{
"epoch": 13.446567586694975,
"grad_norm": 0.4098254442214966,
"learning_rate": 0.00010966578211288817,
"loss": 2.4948,
"step": 209000
},
{
"epoch": 13.510905230650454,
"grad_norm": 0.4242159128189087,
"learning_rate": 0.00010858997346338663,
"loss": 2.5012,
"step": 210000
},
{
"epoch": 13.575242874605932,
"grad_norm": 0.42177829146385193,
"learning_rate": 0.00010751416481388509,
"loss": 2.4998,
"step": 211000
},
{
"epoch": 13.63958051856141,
"grad_norm": 0.4196189045906067,
"learning_rate": 0.00010643943197303304,
"loss": 2.5048,
"step": 212000
},
{
"epoch": 13.703918162516889,
"grad_norm": 0.3965640366077423,
"learning_rate": 0.0001053636233235315,
"loss": 2.5092,
"step": 213000
},
{
"epoch": 13.768255806472368,
"grad_norm": 0.39778339862823486,
"learning_rate": 0.00010428781467402996,
"loss": 2.5121,
"step": 214000
},
{
"epoch": 13.832593450427845,
"grad_norm": 0.40292391180992126,
"learning_rate": 0.00010321308183317793,
"loss": 2.5119,
"step": 215000
},
{
"epoch": 13.896931094383323,
"grad_norm": 0.41673198342323303,
"learning_rate": 0.00010213727318367639,
"loss": 2.5112,
"step": 216000
},
{
"epoch": 13.961268738338802,
"grad_norm": 0.40400612354278564,
"learning_rate": 0.00010106254034282435,
"loss": 2.5098,
"step": 217000
},
{
"epoch": 14.0,
"eval_accuracy": 0.424743796832234,
"eval_loss": 2.9747180938720703,
"eval_runtime": 111.9808,
"eval_samples_per_second": 468.295,
"eval_steps_per_second": 7.323,
"step": 217602
},
{
"epoch": 14.02560638229428,
"grad_norm": 0.40745100378990173,
"learning_rate": 9.998673169332281e-05,
"loss": 2.4894,
"step": 218000
},
{
"epoch": 14.089944026249759,
"grad_norm": 0.42399463057518005,
"learning_rate": 9.891092304382127e-05,
"loss": 2.449,
"step": 219000
},
{
"epoch": 14.154281670205236,
"grad_norm": 0.4149724841117859,
"learning_rate": 9.783511439431973e-05,
"loss": 2.4534,
"step": 220000
},
{
"epoch": 14.218619314160716,
"grad_norm": 0.40756285190582275,
"learning_rate": 9.676145736211718e-05,
"loss": 2.4576,
"step": 221000
},
{
"epoch": 14.282956958116193,
"grad_norm": 0.4224795997142792,
"learning_rate": 9.568564871261564e-05,
"loss": 2.4584,
"step": 222000
},
{
"epoch": 14.347294602071672,
"grad_norm": 0.41213053464889526,
"learning_rate": 9.461091587176359e-05,
"loss": 2.4707,
"step": 223000
},
{
"epoch": 14.41163224602715,
"grad_norm": 0.4161031246185303,
"learning_rate": 9.353510722226205e-05,
"loss": 2.4701,
"step": 224000
},
{
"epoch": 14.475969889982629,
"grad_norm": 0.42417025566101074,
"learning_rate": 9.245929857276051e-05,
"loss": 2.4706,
"step": 225000
},
{
"epoch": 14.540307533938106,
"grad_norm": 0.4227360785007477,
"learning_rate": 9.138348992325897e-05,
"loss": 2.4678,
"step": 226000
},
{
"epoch": 14.604645177893586,
"grad_norm": 0.3956305682659149,
"learning_rate": 9.030768127375742e-05,
"loss": 2.4816,
"step": 227000
},
{
"epoch": 14.668982821849063,
"grad_norm": 0.42013561725616455,
"learning_rate": 8.92329484329054e-05,
"loss": 2.4791,
"step": 228000
},
{
"epoch": 14.733320465804542,
"grad_norm": 0.41232335567474365,
"learning_rate": 8.815713978340386e-05,
"loss": 2.4861,
"step": 229000
},
{
"epoch": 14.79765810976002,
"grad_norm": 0.398253858089447,
"learning_rate": 8.708240694255182e-05,
"loss": 2.4857,
"step": 230000
},
{
"epoch": 14.8619957537155,
"grad_norm": 0.41056615114212036,
"learning_rate": 8.600659829305028e-05,
"loss": 2.4826,
"step": 231000
},
{
"epoch": 14.926333397670977,
"grad_norm": 0.4065124988555908,
"learning_rate": 8.493186545219823e-05,
"loss": 2.4791,
"step": 232000
},
{
"epoch": 14.990671041626456,
"grad_norm": 0.42194780707359314,
"learning_rate": 8.385605680269669e-05,
"loss": 2.4899,
"step": 233000
},
{
"epoch": 15.0,
"eval_accuracy": 0.4246625087868862,
"eval_loss": 2.9792003631591797,
"eval_runtime": 111.7676,
"eval_samples_per_second": 469.188,
"eval_steps_per_second": 7.337,
"step": 233145
},
{
"epoch": 15.055008685581933,
"grad_norm": 0.444181889295578,
"learning_rate": 8.278024815319514e-05,
"loss": 2.4309,
"step": 234000
},
{
"epoch": 15.119346329537413,
"grad_norm": 0.4177301526069641,
"learning_rate": 8.17044395036936e-05,
"loss": 2.4254,
"step": 235000
},
{
"epoch": 15.18368397349289,
"grad_norm": 0.43864157795906067,
"learning_rate": 8.062970666284155e-05,
"loss": 2.432,
"step": 236000
},
{
"epoch": 15.24802161744837,
"grad_norm": 0.43071264028549194,
"learning_rate": 7.955497382198951e-05,
"loss": 2.4372,
"step": 237000
},
{
"epoch": 15.312359261403847,
"grad_norm": 0.44551989436149597,
"learning_rate": 7.847916517248797e-05,
"loss": 2.4441,
"step": 238000
},
{
"epoch": 15.376696905359326,
"grad_norm": 0.42598387598991394,
"learning_rate": 7.740335652298643e-05,
"loss": 2.4448,
"step": 239000
},
{
"epoch": 15.441034549314804,
"grad_norm": 0.4412069618701935,
"learning_rate": 7.632754787348489e-05,
"loss": 2.4481,
"step": 240000
},
{
"epoch": 15.505372193270283,
"grad_norm": 0.4257245361804962,
"learning_rate": 7.525173922398335e-05,
"loss": 2.4496,
"step": 241000
},
{
"epoch": 15.56970983722576,
"grad_norm": 0.4463740885257721,
"learning_rate": 7.417593057448181e-05,
"loss": 2.4583,
"step": 242000
},
{
"epoch": 15.63404748118124,
"grad_norm": 0.40843266248703003,
"learning_rate": 7.310119773362977e-05,
"loss": 2.4549,
"step": 243000
},
{
"epoch": 15.698385125136717,
"grad_norm": 0.43823161721229553,
"learning_rate": 7.202538908412823e-05,
"loss": 2.4565,
"step": 244000
},
{
"epoch": 15.762722769092196,
"grad_norm": 0.4224304258823395,
"learning_rate": 7.09506562432762e-05,
"loss": 2.4664,
"step": 245000
},
{
"epoch": 15.827060413047674,
"grad_norm": 0.42779698967933655,
"learning_rate": 6.987484759377464e-05,
"loss": 2.4607,
"step": 246000
},
{
"epoch": 15.891398057003153,
"grad_norm": 0.41904374957084656,
"learning_rate": 6.880011475292261e-05,
"loss": 2.463,
"step": 247000
},
{
"epoch": 15.95573570095863,
"grad_norm": 0.4636126458644867,
"learning_rate": 6.772430610342107e-05,
"loss": 2.4626,
"step": 248000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4244173733566653,
"eval_loss": 2.9882283210754395,
"eval_runtime": 112.1651,
"eval_samples_per_second": 467.525,
"eval_steps_per_second": 7.311,
"step": 248688
},
{
"epoch": 16.02007334491411,
"grad_norm": 0.44689562916755676,
"learning_rate": 6.664849745391953e-05,
"loss": 2.4432,
"step": 249000
},
{
"epoch": 16.08441098886959,
"grad_norm": 0.45889049768447876,
"learning_rate": 6.557376461306749e-05,
"loss": 2.4048,
"step": 250000
},
{
"epoch": 16.148748632825065,
"grad_norm": 0.4538269639015198,
"learning_rate": 6.449795596356593e-05,
"loss": 2.4123,
"step": 251000
},
{
"epoch": 16.213086276780544,
"grad_norm": 0.44775742292404175,
"learning_rate": 6.342214731406439e-05,
"loss": 2.4137,
"step": 252000
},
{
"epoch": 16.277423920736023,
"grad_norm": 0.4506843090057373,
"learning_rate": 6.234741447321236e-05,
"loss": 2.4152,
"step": 253000
},
{
"epoch": 16.341761564691502,
"grad_norm": 0.4564642310142517,
"learning_rate": 6.127160582371082e-05,
"loss": 2.4236,
"step": 254000
},
{
"epoch": 16.406099208646978,
"grad_norm": 0.4492376744747162,
"learning_rate": 6.0195797174209275e-05,
"loss": 2.4222,
"step": 255000
},
{
"epoch": 16.470436852602457,
"grad_norm": 0.44002753496170044,
"learning_rate": 5.9119988524707736e-05,
"loss": 2.4277,
"step": 256000
},
{
"epoch": 16.534774496557937,
"grad_norm": 0.437580406665802,
"learning_rate": 5.8044179875206196e-05,
"loss": 2.4303,
"step": 257000
},
{
"epoch": 16.599112140513416,
"grad_norm": 0.42502424120903015,
"learning_rate": 5.697052284300365e-05,
"loss": 2.4359,
"step": 258000
},
{
"epoch": 16.66344978446889,
"grad_norm": 0.44441190361976624,
"learning_rate": 5.5894714193502106e-05,
"loss": 2.4306,
"step": 259000
},
{
"epoch": 16.72778742842437,
"grad_norm": 0.4539526700973511,
"learning_rate": 5.4818905544000566e-05,
"loss": 2.4342,
"step": 260000
},
{
"epoch": 16.79212507237985,
"grad_norm": 0.4554595947265625,
"learning_rate": 5.374417270314853e-05,
"loss": 2.4388,
"step": 261000
},
{
"epoch": 16.85646271633533,
"grad_norm": 0.4573330283164978,
"learning_rate": 5.266836405364699e-05,
"loss": 2.441,
"step": 262000
},
{
"epoch": 16.920800360290805,
"grad_norm": 0.449770450592041,
"learning_rate": 5.159255540414545e-05,
"loss": 2.4411,
"step": 263000
},
{
"epoch": 16.985138004246284,
"grad_norm": 0.48139625787734985,
"learning_rate": 5.05178225632934e-05,
"loss": 2.4399,
"step": 264000
},
{
"epoch": 17.0,
"eval_accuracy": 0.4242649676193895,
"eval_loss": 2.9961202144622803,
"eval_runtime": 111.9399,
"eval_samples_per_second": 468.466,
"eval_steps_per_second": 7.325,
"step": 264231
},
{
"epoch": 17.049475648201764,
"grad_norm": 0.4543195366859436,
"learning_rate": 4.9442013913791863e-05,
"loss": 2.4013,
"step": 265000
},
{
"epoch": 17.113813292157243,
"grad_norm": 0.4699794054031372,
"learning_rate": 4.836620526429032e-05,
"loss": 2.3928,
"step": 266000
},
{
"epoch": 17.17815093611272,
"grad_norm": 0.4636929929256439,
"learning_rate": 4.7291472423438285e-05,
"loss": 2.3989,
"step": 267000
},
{
"epoch": 17.242488580068198,
"grad_norm": 0.4614698886871338,
"learning_rate": 4.6215663773936746e-05,
"loss": 2.4004,
"step": 268000
},
{
"epoch": 17.306826224023677,
"grad_norm": 0.46002906560897827,
"learning_rate": 4.513985512443519e-05,
"loss": 2.3982,
"step": 269000
},
{
"epoch": 17.371163867979156,
"grad_norm": 0.42619064450263977,
"learning_rate": 4.4065122283583154e-05,
"loss": 2.4053,
"step": 270000
},
{
"epoch": 17.435501511934632,
"grad_norm": 0.45975300669670105,
"learning_rate": 4.2989313634081614e-05,
"loss": 2.4041,
"step": 271000
},
{
"epoch": 17.49983915589011,
"grad_norm": 0.4545740485191345,
"learning_rate": 4.1913504984580075e-05,
"loss": 2.406,
"step": 272000
},
{
"epoch": 17.56417679984559,
"grad_norm": 0.458011269569397,
"learning_rate": 4.083769633507853e-05,
"loss": 2.4168,
"step": 273000
},
{
"epoch": 17.62851444380107,
"grad_norm": 0.4604107439517975,
"learning_rate": 3.976296349422649e-05,
"loss": 2.411,
"step": 274000
},
{
"epoch": 17.692852087756545,
"grad_norm": 0.4420773684978485,
"learning_rate": 3.8687154844724944e-05,
"loss": 2.4144,
"step": 275000
},
{
"epoch": 17.757189731712025,
"grad_norm": 0.45774900913238525,
"learning_rate": 3.7611346195223404e-05,
"loss": 2.412,
"step": 276000
},
{
"epoch": 17.821527375667504,
"grad_norm": 0.4509606659412384,
"learning_rate": 3.6536613354371366e-05,
"loss": 2.4086,
"step": 277000
},
{
"epoch": 17.885865019622983,
"grad_norm": 0.4442935883998871,
"learning_rate": 3.5460804704869826e-05,
"loss": 2.4134,
"step": 278000
},
{
"epoch": 17.95020266357846,
"grad_norm": 0.42292436957359314,
"learning_rate": 3.438607186401778e-05,
"loss": 2.4186,
"step": 279000
},
{
"epoch": 18.0,
"eval_accuracy": 0.42388941236296196,
"eval_loss": 3.0051016807556152,
"eval_runtime": 111.9834,
"eval_samples_per_second": 468.284,
"eval_steps_per_second": 7.323,
"step": 279774
},
{
"epoch": 18.014540307533938,
"grad_norm": 0.48824623227119446,
"learning_rate": 3.331026321451624e-05,
"loss": 2.4055,
"step": 280000
},
{
"epoch": 18.078877951489417,
"grad_norm": 0.46934977173805237,
"learning_rate": 3.22355303736642e-05,
"loss": 2.3736,
"step": 281000
},
{
"epoch": 18.143215595444893,
"grad_norm": 0.5045217275619507,
"learning_rate": 3.115972172416266e-05,
"loss": 2.382,
"step": 282000
},
{
"epoch": 18.207553239400372,
"grad_norm": 0.46461954712867737,
"learning_rate": 3.008391307466112e-05,
"loss": 2.3806,
"step": 283000
},
{
"epoch": 18.27189088335585,
"grad_norm": 0.4565331041812897,
"learning_rate": 2.9009180233809078e-05,
"loss": 2.3813,
"step": 284000
},
{
"epoch": 18.33622852731133,
"grad_norm": 0.4561784863471985,
"learning_rate": 2.793337158430754e-05,
"loss": 2.3863,
"step": 285000
},
{
"epoch": 18.400566171266806,
"grad_norm": 0.4438989758491516,
"learning_rate": 2.6858638743455493e-05,
"loss": 2.3845,
"step": 286000
},
{
"epoch": 18.464903815222286,
"grad_norm": 0.461086630821228,
"learning_rate": 2.578283009395395e-05,
"loss": 2.3833,
"step": 287000
},
{
"epoch": 18.529241459177765,
"grad_norm": 0.4639764726161957,
"learning_rate": 2.470702144445241e-05,
"loss": 2.3918,
"step": 288000
},
{
"epoch": 18.593579103133244,
"grad_norm": 0.4645422697067261,
"learning_rate": 2.3631212794950868e-05,
"loss": 2.3953,
"step": 289000
},
{
"epoch": 18.65791674708872,
"grad_norm": 0.47392553091049194,
"learning_rate": 2.2555404145449328e-05,
"loss": 2.3829,
"step": 290000
},
{
"epoch": 18.7222543910442,
"grad_norm": 0.4530762732028961,
"learning_rate": 2.148174711324679e-05,
"loss": 2.3904,
"step": 291000
},
{
"epoch": 18.78659203499968,
"grad_norm": 0.47473639249801636,
"learning_rate": 2.0405938463745248e-05,
"loss": 2.3966,
"step": 292000
},
{
"epoch": 18.850929678955158,
"grad_norm": 0.43500351905822754,
"learning_rate": 1.9330129814243705e-05,
"loss": 2.396,
"step": 293000
},
{
"epoch": 18.915267322910633,
"grad_norm": 0.45157596468925476,
"learning_rate": 1.8254321164742165e-05,
"loss": 2.3959,
"step": 294000
},
{
"epoch": 18.979604966866113,
"grad_norm": 0.4546051621437073,
"learning_rate": 1.7180664132539624e-05,
"loss": 2.3869,
"step": 295000
},
{
"epoch": 19.0,
"eval_accuracy": 0.42373925008599933,
"eval_loss": 3.011887311935425,
"eval_runtime": 112.1189,
"eval_samples_per_second": 467.718,
"eval_steps_per_second": 7.314,
"step": 295317
},
{
"epoch": 19.043942610821592,
"grad_norm": 0.46901893615722656,
"learning_rate": 1.610485548303808e-05,
"loss": 2.3726,
"step": 296000
},
{
"epoch": 19.10828025477707,
"grad_norm": 0.43862438201904297,
"learning_rate": 1.502904683353654e-05,
"loss": 2.3688,
"step": 297000
},
{
"epoch": 19.172617898732547,
"grad_norm": 0.4580424427986145,
"learning_rate": 1.3954313992684501e-05,
"loss": 2.3682,
"step": 298000
},
{
"epoch": 19.236955542688026,
"grad_norm": 0.47557470202445984,
"learning_rate": 1.2878505343182957e-05,
"loss": 2.3687,
"step": 299000
},
{
"epoch": 19.301293186643505,
"grad_norm": 0.48615992069244385,
"learning_rate": 1.1802696693681415e-05,
"loss": 2.3636,
"step": 300000
},
{
"epoch": 19.365630830598985,
"grad_norm": 0.5019800662994385,
"learning_rate": 1.0726888044179874e-05,
"loss": 2.3668,
"step": 301000
},
{
"epoch": 19.42996847455446,
"grad_norm": 0.4481401741504669,
"learning_rate": 9.652155203327834e-06,
"loss": 2.3721,
"step": 302000
},
{
"epoch": 19.49430611850994,
"grad_norm": 0.4632056653499603,
"learning_rate": 8.577422362475793e-06,
"loss": 2.372,
"step": 303000
},
{
"epoch": 19.55864376246542,
"grad_norm": 0.4590476453304291,
"learning_rate": 7.5016137129742514e-06,
"loss": 2.3725,
"step": 304000
},
{
"epoch": 19.622981406420898,
"grad_norm": 0.4774569272994995,
"learning_rate": 6.42580506347271e-06,
"loss": 2.37,
"step": 305000
},
{
"epoch": 19.687319050376374,
"grad_norm": 0.47048863768577576,
"learning_rate": 5.351072222620669e-06,
"loss": 2.371,
"step": 306000
},
{
"epoch": 19.751656694331853,
"grad_norm": 0.4567144215106964,
"learning_rate": 4.275263573119128e-06,
"loss": 2.3706,
"step": 307000
},
{
"epoch": 19.815994338287332,
"grad_norm": 0.4492277503013611,
"learning_rate": 3.200530732267087e-06,
"loss": 2.3714,
"step": 308000
},
{
"epoch": 19.88033198224281,
"grad_norm": 0.44562822580337524,
"learning_rate": 2.1247220827655454e-06,
"loss": 2.3711,
"step": 309000
},
{
"epoch": 19.944669626198287,
"grad_norm": 0.4758046269416809,
"learning_rate": 1.0499892419135049e-06,
"loss": 2.3686,
"step": 310000
},
{
"epoch": 20.0,
"eval_accuracy": 0.4234014597448438,
"eval_loss": 3.0190186500549316,
"eval_runtime": 111.9505,
"eval_samples_per_second": 468.421,
"eval_steps_per_second": 7.325,
"step": 310860
},
{
"epoch": 20.0,
"step": 310860,
"total_flos": 1.29957250203648e+18,
"train_loss": 2.7038125842232534,
"train_runtime": 43992.4165,
"train_samples_per_second": 226.114,
"train_steps_per_second": 7.066
}
],
"logging_steps": 1000,
"max_steps": 310860,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.29957250203648e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}