{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 371920,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05377500537750054,
"grad_norm": 0.8324036598205566,
"learning_rate": 3.125e-05,
"loss": 6.2314,
"step": 1000
},
{
"epoch": 0.10755001075500108,
"grad_norm": 0.9016917943954468,
"learning_rate": 6.25e-05,
"loss": 5.01,
"step": 2000
},
{
"epoch": 0.1613250161325016,
"grad_norm": 0.8393586874008179,
"learning_rate": 9.375e-05,
"loss": 4.6792,
"step": 3000
},
{
"epoch": 0.21510002151000215,
"grad_norm": 0.8001790642738342,
"learning_rate": 0.000125,
"loss": 4.4675,
"step": 4000
},
{
"epoch": 0.2688750268875027,
"grad_norm": 0.7500863671302795,
"learning_rate": 0.00015625,
"loss": 4.3004,
"step": 5000
},
{
"epoch": 0.3226500322650032,
"grad_norm": 0.6959784626960754,
"learning_rate": 0.0001875,
"loss": 4.1762,
"step": 6000
},
{
"epoch": 0.3764250376425038,
"grad_norm": 0.7082997560501099,
"learning_rate": 0.00021875,
"loss": 4.0795,
"step": 7000
},
{
"epoch": 0.4302000430200043,
"grad_norm": 0.7400528788566589,
"learning_rate": 0.00025,
"loss": 3.9794,
"step": 8000
},
{
"epoch": 0.4839750483975048,
"grad_norm": 0.6886024475097656,
"learning_rate": 0.00028121875,
"loss": 3.9062,
"step": 9000
},
{
"epoch": 0.5377500537750054,
"grad_norm": 0.6196364760398865,
"learning_rate": 0.0003124375,
"loss": 3.8427,
"step": 10000
},
{
"epoch": 0.5915250591525059,
"grad_norm": 0.5815768241882324,
"learning_rate": 0.00034368749999999997,
"loss": 3.7992,
"step": 11000
},
{
"epoch": 0.6453000645300064,
"grad_norm": 0.5629006624221802,
"learning_rate": 0.0003749375,
"loss": 3.7502,
"step": 12000
},
{
"epoch": 0.699075069907507,
"grad_norm": 0.5031692981719971,
"learning_rate": 0.00040615625,
"loss": 3.7233,
"step": 13000
},
{
"epoch": 0.7528500752850076,
"grad_norm": 0.4921340048313141,
"learning_rate": 0.00043737500000000005,
"loss": 3.6917,
"step": 14000
},
{
"epoch": 0.806625080662508,
"grad_norm": 0.45878851413726807,
"learning_rate": 0.000468625,
"loss": 3.6641,
"step": 15000
},
{
"epoch": 0.8604000860400086,
"grad_norm": 0.4047335684299469,
"learning_rate": 0.000499875,
"loss": 3.6404,
"step": 16000
},
{
"epoch": 0.9141750914175092,
"grad_norm": 0.4339119493961334,
"learning_rate": 0.000531125,
"loss": 3.6129,
"step": 17000
},
{
"epoch": 0.9679500967950097,
"grad_norm": 0.3588213324546814,
"learning_rate": 0.00056234375,
"loss": 3.5932,
"step": 18000
},
{
"epoch": 1.0,
"eval_accuracy": 0.3588065680197524,
"eval_loss": 3.770080804824829,
"eval_runtime": 152.9859,
"eval_samples_per_second": 378.577,
"eval_steps_per_second": 5.916,
"step": 18596
},
{
"epoch": 1.0217251021725102,
"grad_norm": 0.3476862907409668,
"learning_rate": 0.00059359375,
"loss": 3.5726,
"step": 19000
},
{
"epoch": 1.0755001075500108,
"grad_norm": 0.3370579481124878,
"learning_rate": 0.0006248437500000001,
"loss": 3.5453,
"step": 20000
},
{
"epoch": 1.1292751129275114,
"grad_norm": 0.3253530263900757,
"learning_rate": 0.00065609375,
"loss": 3.5364,
"step": 21000
},
{
"epoch": 1.1830501183050117,
"grad_norm": 0.3063829839229584,
"learning_rate": 0.00068728125,
"loss": 3.5214,
"step": 22000
},
{
"epoch": 1.2368251236825123,
"grad_norm": 0.28737059235572815,
"learning_rate": 0.00071853125,
"loss": 3.5158,
"step": 23000
},
{
"epoch": 1.2906001290600129,
"grad_norm": 0.29937857389450073,
"learning_rate": 0.00074978125,
"loss": 3.5014,
"step": 24000
},
{
"epoch": 1.3443751344375134,
"grad_norm": 0.2835935056209564,
"learning_rate": 0.0007810312499999999,
"loss": 3.4946,
"step": 25000
},
{
"epoch": 1.398150139815014,
"grad_norm": 0.2764816880226135,
"learning_rate": 0.00081225,
"loss": 3.4832,
"step": 26000
},
{
"epoch": 1.4519251451925146,
"grad_norm": 0.2620868384838104,
"learning_rate": 0.0008435000000000001,
"loss": 3.4761,
"step": 27000
},
{
"epoch": 1.5057001505700152,
"grad_norm": 0.2731957733631134,
"learning_rate": 0.00087471875,
"loss": 3.4653,
"step": 28000
},
{
"epoch": 1.5594751559475157,
"grad_norm": 0.26957619190216064,
"learning_rate": 0.00090596875,
"loss": 3.4552,
"step": 29000
},
{
"epoch": 1.613250161325016,
"grad_norm": 0.24591492116451263,
"learning_rate": 0.00093721875,
"loss": 3.4474,
"step": 30000
},
{
"epoch": 1.6670251667025167,
"grad_norm": 0.23927152156829834,
"learning_rate": 0.00096846875,
"loss": 3.4443,
"step": 31000
},
{
"epoch": 1.7208001720800172,
"grad_norm": 0.2176426500082016,
"learning_rate": 0.0009996875,
"loss": 3.4401,
"step": 32000
},
{
"epoch": 1.7745751774575176,
"grad_norm": 0.20793931186199188,
"learning_rate": 0.0009970875500117675,
"loss": 3.4261,
"step": 33000
},
{
"epoch": 1.8283501828350182,
"grad_norm": 0.2189057469367981,
"learning_rate": 0.0009941486232054601,
"loss": 3.419,
"step": 34000
},
{
"epoch": 1.8821251882125187,
"grad_norm": 0.2241194099187851,
"learning_rate": 0.0009912096963991528,
"loss": 3.4088,
"step": 35000
},
{
"epoch": 1.9359001935900193,
"grad_norm": 0.23365530371665955,
"learning_rate": 0.0009882678277241704,
"loss": 3.3934,
"step": 36000
},
{
"epoch": 1.9896751989675199,
"grad_norm": 0.2019016444683075,
"learning_rate": 0.000985328900917863,
"loss": 3.3833,
"step": 37000
},
{
"epoch": 2.0,
"eval_accuracy": 0.38185108449506,
"eval_loss": 3.5596837997436523,
"eval_runtime": 154.0793,
"eval_samples_per_second": 375.891,
"eval_steps_per_second": 5.874,
"step": 37192
},
{
"epoch": 2.0434502043450204,
"grad_norm": 0.20308424532413483,
"learning_rate": 0.0009823870322428808,
"loss": 3.3404,
"step": 38000
},
{
"epoch": 2.097225209722521,
"grad_norm": 0.21531735360622406,
"learning_rate": 0.0009794451635678984,
"loss": 3.3266,
"step": 39000
},
{
"epoch": 2.1510002151000216,
"grad_norm": 0.26003631949424744,
"learning_rate": 0.000976503294892916,
"loss": 3.3242,
"step": 40000
},
{
"epoch": 2.204775220477522,
"grad_norm": 0.24143873155117035,
"learning_rate": 0.0009735643680866086,
"loss": 3.3166,
"step": 41000
},
{
"epoch": 2.2585502258550227,
"grad_norm": 0.19083160161972046,
"learning_rate": 0.0009706224994116263,
"loss": 3.3187,
"step": 42000
},
{
"epoch": 2.3123252312325233,
"grad_norm": 0.22694003582000732,
"learning_rate": 0.000967680630736644,
"loss": 3.3112,
"step": 43000
},
{
"epoch": 2.3661002366100234,
"grad_norm": 0.21774055063724518,
"learning_rate": 0.0009647417039303365,
"loss": 3.3075,
"step": 44000
},
{
"epoch": 2.419875241987524,
"grad_norm": 0.2047697901725769,
"learning_rate": 0.0009617998352553542,
"loss": 3.2992,
"step": 45000
},
{
"epoch": 2.4736502473650246,
"grad_norm": 0.21876117587089539,
"learning_rate": 0.0009588579665803719,
"loss": 3.2983,
"step": 46000
},
{
"epoch": 2.527425252742525,
"grad_norm": 0.21647591888904572,
"learning_rate": 0.0009559190397740644,
"loss": 3.2876,
"step": 47000
},
{
"epoch": 2.5812002581200257,
"grad_norm": 0.20933736860752106,
"learning_rate": 0.0009529771710990821,
"loss": 3.2814,
"step": 48000
},
{
"epoch": 2.6349752634975263,
"grad_norm": 0.1911548376083374,
"learning_rate": 0.0009500382442927748,
"loss": 3.2797,
"step": 49000
},
{
"epoch": 2.688750268875027,
"grad_norm": 0.22081832587718964,
"learning_rate": 0.0009470963756177925,
"loss": 3.2783,
"step": 50000
},
{
"epoch": 2.7425252742525275,
"grad_norm": 0.21164289116859436,
"learning_rate": 0.0009441545069428101,
"loss": 3.2752,
"step": 51000
},
{
"epoch": 2.796300279630028,
"grad_norm": 0.21225039660930634,
"learning_rate": 0.0009412126382678278,
"loss": 3.2681,
"step": 52000
},
{
"epoch": 2.8500752850075286,
"grad_norm": 0.1924898326396942,
"learning_rate": 0.0009382707695928455,
"loss": 3.2629,
"step": 53000
},
{
"epoch": 2.903850290385029,
"grad_norm": 0.19862565398216248,
"learning_rate": 0.000935331842786538,
"loss": 3.2634,
"step": 54000
},
{
"epoch": 2.9576252957625293,
"grad_norm": 0.19020138680934906,
"learning_rate": 0.0009323899741115557,
"loss": 3.2597,
"step": 55000
},
{
"epoch": 3.0,
"eval_accuracy": 0.39273080241152825,
"eval_loss": 3.4648334980010986,
"eval_runtime": 154.5571,
"eval_samples_per_second": 374.729,
"eval_steps_per_second": 5.855,
"step": 55788
},
{
"epoch": 3.0114003011400303,
"grad_norm": 0.19856859743595123,
"learning_rate": 0.0009294481054365734,
"loss": 3.239,
"step": 56000
},
{
"epoch": 3.0651753065175305,
"grad_norm": 0.22371411323547363,
"learning_rate": 0.0009265091786302659,
"loss": 3.1886,
"step": 57000
},
{
"epoch": 3.118950311895031,
"grad_norm": 0.21350081264972687,
"learning_rate": 0.0009235673099552836,
"loss": 3.1941,
"step": 58000
},
{
"epoch": 3.1727253172725316,
"grad_norm": 0.219674214720726,
"learning_rate": 0.0009206254412803013,
"loss": 3.1942,
"step": 59000
},
{
"epoch": 3.226500322650032,
"grad_norm": 0.19072189927101135,
"learning_rate": 0.0009176865144739939,
"loss": 3.1973,
"step": 60000
},
{
"epoch": 3.2802753280275327,
"grad_norm": 0.205557718873024,
"learning_rate": 0.0009147475876676865,
"loss": 3.1932,
"step": 61000
},
{
"epoch": 3.3340503334050333,
"grad_norm": 0.2098790556192398,
"learning_rate": 0.0009118057189927041,
"loss": 3.1935,
"step": 62000
},
{
"epoch": 3.387825338782534,
"grad_norm": 0.196111798286438,
"learning_rate": 0.0009088638503177218,
"loss": 3.1954,
"step": 63000
},
{
"epoch": 3.4416003441600345,
"grad_norm": 0.19440898299217224,
"learning_rate": 0.0009059219816427395,
"loss": 3.1924,
"step": 64000
},
{
"epoch": 3.495375349537535,
"grad_norm": 0.21081770956516266,
"learning_rate": 0.0009029801129677572,
"loss": 3.1952,
"step": 65000
},
{
"epoch": 3.5491503549150356,
"grad_norm": 0.21867215633392334,
"learning_rate": 0.0009000411861614498,
"loss": 3.195,
"step": 66000
},
{
"epoch": 3.602925360292536,
"grad_norm": 0.22000326216220856,
"learning_rate": 0.0008970993174864674,
"loss": 3.1911,
"step": 67000
},
{
"epoch": 3.6567003656700363,
"grad_norm": 0.1891467422246933,
"learning_rate": 0.0008941574488114851,
"loss": 3.1934,
"step": 68000
},
{
"epoch": 3.7104753710475373,
"grad_norm": 0.18787287175655365,
"learning_rate": 0.0008912185220051777,
"loss": 3.191,
"step": 69000
},
{
"epoch": 3.7642503764250375,
"grad_norm": 0.23694172501564026,
"learning_rate": 0.0008882766533301954,
"loss": 3.1831,
"step": 70000
},
{
"epoch": 3.818025381802538,
"grad_norm": 0.19812917709350586,
"learning_rate": 0.000885334784655213,
"loss": 3.1815,
"step": 71000
},
{
"epoch": 3.8718003871800386,
"grad_norm": 0.2005423903465271,
"learning_rate": 0.0008823958578489056,
"loss": 3.1801,
"step": 72000
},
{
"epoch": 3.925575392557539,
"grad_norm": 0.21525584161281586,
"learning_rate": 0.0008794539891739233,
"loss": 3.1795,
"step": 73000
},
{
"epoch": 3.9793503979350398,
"grad_norm": 0.19802774488925934,
"learning_rate": 0.0008765150623676159,
"loss": 3.1741,
"step": 74000
},
{
"epoch": 4.0,
"eval_accuracy": 0.3976694409529698,
"eval_loss": 3.419067859649658,
"eval_runtime": 155.1328,
"eval_samples_per_second": 373.338,
"eval_steps_per_second": 5.834,
"step": 74384
},
{
"epoch": 4.033125403312541,
"grad_norm": 0.2355957329273224,
"learning_rate": 0.0008735731936926335,
"loss": 3.1356,
"step": 75000
},
{
"epoch": 4.086900408690041,
"grad_norm": 0.20892471075057983,
"learning_rate": 0.0008706313250176512,
"loss": 3.1124,
"step": 76000
},
{
"epoch": 4.140675414067541,
"grad_norm": 0.24330681562423706,
"learning_rate": 0.0008676923982113439,
"loss": 3.1228,
"step": 77000
},
{
"epoch": 4.194450419445042,
"grad_norm": 0.30532753467559814,
"learning_rate": 0.0008647505295363614,
"loss": 3.1191,
"step": 78000
},
{
"epoch": 4.248225424822542,
"grad_norm": 0.2023121416568756,
"learning_rate": 0.0008618116027300541,
"loss": 3.1185,
"step": 79000
},
{
"epoch": 4.302000430200043,
"grad_norm": 0.20023038983345032,
"learning_rate": 0.0008588697340550719,
"loss": 3.1248,
"step": 80000
},
{
"epoch": 4.355775435577543,
"grad_norm": 0.20664258301258087,
"learning_rate": 0.0008559278653800895,
"loss": 3.1305,
"step": 81000
},
{
"epoch": 4.409550440955044,
"grad_norm": 0.21807469427585602,
"learning_rate": 0.0008529889385737821,
"loss": 3.1265,
"step": 82000
},
{
"epoch": 4.4633254463325445,
"grad_norm": 0.20922619104385376,
"learning_rate": 0.0008500470698987998,
"loss": 3.1287,
"step": 83000
},
{
"epoch": 4.5171004517100455,
"grad_norm": 0.22318531572818756,
"learning_rate": 0.0008471052012238174,
"loss": 3.1265,
"step": 84000
},
{
"epoch": 4.570875457087546,
"grad_norm": 0.20071184635162354,
"learning_rate": 0.000844163332548835,
"loss": 3.1244,
"step": 85000
},
{
"epoch": 4.624650462465047,
"grad_norm": 0.23887498676776886,
"learning_rate": 0.0008412244057425277,
"loss": 3.1309,
"step": 86000
},
{
"epoch": 4.678425467842547,
"grad_norm": 0.21280068159103394,
"learning_rate": 0.0008382825370675454,
"loss": 3.1261,
"step": 87000
},
{
"epoch": 4.732200473220047,
"grad_norm": 0.20855990052223206,
"learning_rate": 0.0008353406683925629,
"loss": 3.1227,
"step": 88000
},
{
"epoch": 4.785975478597548,
"grad_norm": 0.23701632022857666,
"learning_rate": 0.0008324017415862556,
"loss": 3.1274,
"step": 89000
},
{
"epoch": 4.839750483975048,
"grad_norm": 0.22062337398529053,
"learning_rate": 0.0008294598729112733,
"loss": 3.1259,
"step": 90000
},
{
"epoch": 4.893525489352549,
"grad_norm": 0.21007812023162842,
"learning_rate": 0.0008265209461049658,
"loss": 3.1241,
"step": 91000
},
{
"epoch": 4.947300494730049,
"grad_norm": 0.3277081847190857,
"learning_rate": 0.0008235790774299835,
"loss": 3.1213,
"step": 92000
},
{
"epoch": 5.0,
"eval_accuracy": 0.4008789177643117,
"eval_loss": 3.396653652191162,
"eval_runtime": 155.4385,
"eval_samples_per_second": 372.604,
"eval_steps_per_second": 5.822,
"step": 92980
},
{
"epoch": 5.00107550010755,
"grad_norm": 0.20310255885124207,
"learning_rate": 0.0008206401506236762,
"loss": 3.1191,
"step": 93000
},
{
"epoch": 5.05485050548505,
"grad_norm": 0.20080606639385223,
"learning_rate": 0.0008176982819486937,
"loss": 3.0521,
"step": 94000
},
{
"epoch": 5.108625510862551,
"grad_norm": 0.21395424008369446,
"learning_rate": 0.0008147593551423864,
"loss": 3.0656,
"step": 95000
},
{
"epoch": 5.1624005162400515,
"grad_norm": 0.22563432157039642,
"learning_rate": 0.0008118174864674042,
"loss": 3.0641,
"step": 96000
},
{
"epoch": 5.2161755216175525,
"grad_norm": 0.21297597885131836,
"learning_rate": 0.0008088785596610967,
"loss": 3.07,
"step": 97000
},
{
"epoch": 5.269950526995053,
"grad_norm": 0.20302899181842804,
"learning_rate": 0.0008059366909861144,
"loss": 3.0717,
"step": 98000
},
{
"epoch": 5.323725532372554,
"grad_norm": 0.2152853012084961,
"learning_rate": 0.0008029948223111321,
"loss": 3.0759,
"step": 99000
},
{
"epoch": 5.377500537750054,
"grad_norm": 0.2148328423500061,
"learning_rate": 0.0008000529536361497,
"loss": 3.0728,
"step": 100000
},
{
"epoch": 5.431275543127555,
"grad_norm": 0.20232610404491425,
"learning_rate": 0.0007971140268298423,
"loss": 3.0798,
"step": 101000
},
{
"epoch": 5.485050548505055,
"grad_norm": 0.22732730209827423,
"learning_rate": 0.000794175100023535,
"loss": 3.0756,
"step": 102000
},
{
"epoch": 5.538825553882555,
"grad_norm": 0.2203952670097351,
"learning_rate": 0.0007912332313485526,
"loss": 3.0776,
"step": 103000
},
{
"epoch": 5.592600559260056,
"grad_norm": 0.21848390996456146,
"learning_rate": 0.0007882943045422453,
"loss": 3.0763,
"step": 104000
},
{
"epoch": 5.646375564637556,
"grad_norm": 0.22204072773456573,
"learning_rate": 0.0007853524358672629,
"loss": 3.0797,
"step": 105000
},
{
"epoch": 5.700150570015057,
"grad_norm": 0.20933043956756592,
"learning_rate": 0.0007824135090609555,
"loss": 3.0763,
"step": 106000
},
{
"epoch": 5.753925575392557,
"grad_norm": 0.19925065338611603,
"learning_rate": 0.0007794716403859732,
"loss": 3.0802,
"step": 107000
},
{
"epoch": 5.807700580770058,
"grad_norm": 0.20748205482959747,
"learning_rate": 0.0007765297717109908,
"loss": 3.081,
"step": 108000
},
{
"epoch": 5.8614755861475585,
"grad_norm": 0.2089342474937439,
"learning_rate": 0.0007735908449046834,
"loss": 3.0787,
"step": 109000
},
{
"epoch": 5.9152505915250595,
"grad_norm": 0.20147345960140228,
"learning_rate": 0.0007706489762297011,
"loss": 3.0829,
"step": 110000
},
{
"epoch": 5.96902559690256,
"grad_norm": 0.2211214154958725,
"learning_rate": 0.0007677071075547188,
"loss": 3.0783,
"step": 111000
},
{
"epoch": 6.0,
"eval_accuracy": 0.405020628943781,
"eval_loss": 3.3772811889648438,
"eval_runtime": 156.2536,
"eval_samples_per_second": 370.66,
"eval_steps_per_second": 5.792,
"step": 111576
},
{
"epoch": 6.022800602280061,
"grad_norm": 0.21148885786533356,
"learning_rate": 0.0007647681807484113,
"loss": 3.0451,
"step": 112000
},
{
"epoch": 6.076575607657561,
"grad_norm": 0.2195775806903839,
"learning_rate": 0.000761826312073429,
"loss": 3.0147,
"step": 113000
},
{
"epoch": 6.130350613035061,
"grad_norm": 0.20522399246692657,
"learning_rate": 0.0007588873852671217,
"loss": 3.0194,
"step": 114000
},
{
"epoch": 6.184125618412562,
"grad_norm": 0.20723003149032593,
"learning_rate": 0.0007559455165921393,
"loss": 3.026,
"step": 115000
},
{
"epoch": 6.237900623790062,
"grad_norm": 0.23514005541801453,
"learning_rate": 0.000753003647917157,
"loss": 3.0231,
"step": 116000
},
{
"epoch": 6.291675629167563,
"grad_norm": 0.20580914616584778,
"learning_rate": 0.0007500647211108497,
"loss": 3.0321,
"step": 117000
},
{
"epoch": 6.345450634545063,
"grad_norm": 0.2240120768547058,
"learning_rate": 0.0007471228524358674,
"loss": 3.0332,
"step": 118000
},
{
"epoch": 6.399225639922564,
"grad_norm": 0.23184897005558014,
"learning_rate": 0.0007441839256295599,
"loss": 3.0369,
"step": 119000
},
{
"epoch": 6.453000645300064,
"grad_norm": 0.22646069526672363,
"learning_rate": 0.0007412449988232526,
"loss": 3.0357,
"step": 120000
},
{
"epoch": 6.506775650677565,
"grad_norm": 0.21927151083946228,
"learning_rate": 0.0007383031301482702,
"loss": 3.0398,
"step": 121000
},
{
"epoch": 6.5605506560550655,
"grad_norm": 0.24726586043834686,
"learning_rate": 0.0007353612614732878,
"loss": 3.0373,
"step": 122000
},
{
"epoch": 6.6143256614325665,
"grad_norm": 0.21686062216758728,
"learning_rate": 0.0007324193927983055,
"loss": 3.0399,
"step": 123000
},
{
"epoch": 6.668100666810067,
"grad_norm": 0.21142247319221497,
"learning_rate": 0.0007294804659919982,
"loss": 3.0446,
"step": 124000
},
{
"epoch": 6.721875672187567,
"grad_norm": 0.21460475027561188,
"learning_rate": 0.0007265385973170157,
"loss": 3.0403,
"step": 125000
},
{
"epoch": 6.775650677565068,
"grad_norm": 0.22398121654987335,
"learning_rate": 0.0007235967286420334,
"loss": 3.0426,
"step": 126000
},
{
"epoch": 6.829425682942568,
"grad_norm": 0.23123160004615784,
"learning_rate": 0.0007206548599670511,
"loss": 3.0443,
"step": 127000
},
{
"epoch": 6.883200688320069,
"grad_norm": 0.21254226565361023,
"learning_rate": 0.0007177159331607437,
"loss": 3.0424,
"step": 128000
},
{
"epoch": 6.936975693697569,
"grad_norm": 0.21302445232868195,
"learning_rate": 0.0007147740644857613,
"loss": 3.0472,
"step": 129000
},
{
"epoch": 6.99075069907507,
"grad_norm": 0.2217877358198166,
"learning_rate": 0.0007118321958107791,
"loss": 3.0456,
"step": 130000
},
{
"epoch": 7.0,
"eval_accuracy": 0.4055193299898036,
"eval_loss": 3.3825550079345703,
"eval_runtime": 155.2208,
"eval_samples_per_second": 373.126,
"eval_steps_per_second": 5.83,
"step": 130172
},
{
"epoch": 7.04452570445257,
"grad_norm": 0.23532521724700928,
"learning_rate": 0.0007088903271357967,
"loss": 2.9839,
"step": 131000
},
{
"epoch": 7.098300709830071,
"grad_norm": 0.22214515507221222,
"learning_rate": 0.0007059514003294893,
"loss": 2.9765,
"step": 132000
},
{
"epoch": 7.152075715207571,
"grad_norm": 0.2383100390434265,
"learning_rate": 0.0007030154153918568,
"loss": 2.9843,
"step": 133000
},
{
"epoch": 7.205850720585072,
"grad_norm": 0.22472046315670013,
"learning_rate": 0.0007000735467168746,
"loss": 2.9925,
"step": 134000
},
{
"epoch": 7.2596257259625725,
"grad_norm": 0.26296138763427734,
"learning_rate": 0.0006971316780418923,
"loss": 2.997,
"step": 135000
},
{
"epoch": 7.3134007313400735,
"grad_norm": 0.2494724839925766,
"learning_rate": 0.0006941898093669099,
"loss": 2.997,
"step": 136000
},
{
"epoch": 7.367175736717574,
"grad_norm": 0.22137367725372314,
"learning_rate": 0.0006912508825606025,
"loss": 2.9973,
"step": 137000
},
{
"epoch": 7.420950742095075,
"grad_norm": 0.22704289853572845,
"learning_rate": 0.0006883090138856202,
"loss": 3.0066,
"step": 138000
},
{
"epoch": 7.474725747472575,
"grad_norm": 0.2145918905735016,
"learning_rate": 0.0006853700870793128,
"loss": 3.0054,
"step": 139000
},
{
"epoch": 7.528500752850075,
"grad_norm": 0.21607990562915802,
"learning_rate": 0.0006824282184043304,
"loss": 3.0018,
"step": 140000
},
{
"epoch": 7.582275758227576,
"grad_norm": 0.2057826817035675,
"learning_rate": 0.0006794863497293481,
"loss": 3.0101,
"step": 141000
},
{
"epoch": 7.636050763605076,
"grad_norm": 0.23032937943935394,
"learning_rate": 0.0006765474229230408,
"loss": 3.0099,
"step": 142000
},
{
"epoch": 7.689825768982577,
"grad_norm": 0.22495923936367035,
"learning_rate": 0.0006736055542480583,
"loss": 3.008,
"step": 143000
},
{
"epoch": 7.743600774360077,
"grad_norm": 0.2345353364944458,
"learning_rate": 0.000670666627441751,
"loss": 3.0099,
"step": 144000
},
{
"epoch": 7.797375779737578,
"grad_norm": 0.23005186021327972,
"learning_rate": 0.0006677277006354437,
"loss": 3.0125,
"step": 145000
},
{
"epoch": 7.851150785115078,
"grad_norm": 0.29431313276290894,
"learning_rate": 0.0006647858319604612,
"loss": 3.0107,
"step": 146000
},
{
"epoch": 7.904925790492579,
"grad_norm": 0.2245541363954544,
"learning_rate": 0.0006618439632854789,
"loss": 3.0133,
"step": 147000
},
{
"epoch": 7.9587007958700795,
"grad_norm": 0.22786079347133636,
"learning_rate": 0.0006589020946104966,
"loss": 3.0126,
"step": 148000
},
{
"epoch": 8.0,
"eval_accuracy": 0.40771172002548395,
"eval_loss": 3.3547439575195312,
"eval_runtime": 154.7201,
"eval_samples_per_second": 374.334,
"eval_steps_per_second": 5.849,
"step": 148768
},
{
"epoch": 8.01247580124758,
"grad_norm": 0.2346237152814865,
"learning_rate": 0.0006559602259355142,
"loss": 2.9965,
"step": 149000
},
{
"epoch": 8.066250806625082,
"grad_norm": 0.23346728086471558,
"learning_rate": 0.0006530212991292069,
"loss": 2.9479,
"step": 150000
},
{
"epoch": 8.12002581200258,
"grad_norm": 0.24378512799739838,
"learning_rate": 0.0006500794304542246,
"loss": 2.9513,
"step": 151000
},
{
"epoch": 8.173800817380082,
"grad_norm": 0.24439002573490143,
"learning_rate": 0.0006471405036479172,
"loss": 2.9601,
"step": 152000
},
{
"epoch": 8.227575822757583,
"grad_norm": 0.27081623673439026,
"learning_rate": 0.0006441986349729348,
"loss": 2.9631,
"step": 153000
},
{
"epoch": 8.281350828135082,
"grad_norm": 0.2521245777606964,
"learning_rate": 0.0006412626500353025,
"loss": 2.966,
"step": 154000
},
{
"epoch": 8.335125833512583,
"grad_norm": 0.21975190937519073,
"learning_rate": 0.0006383207813603201,
"loss": 2.9678,
"step": 155000
},
{
"epoch": 8.388900838890084,
"grad_norm": 0.2267887145280838,
"learning_rate": 0.0006353789126853378,
"loss": 2.9696,
"step": 156000
},
{
"epoch": 8.442675844267585,
"grad_norm": 0.218279168009758,
"learning_rate": 0.0006324370440103554,
"loss": 2.9713,
"step": 157000
},
{
"epoch": 8.496450849645084,
"grad_norm": 0.23300865292549133,
"learning_rate": 0.0006294951753353731,
"loss": 2.9772,
"step": 158000
},
{
"epoch": 8.550225855022585,
"grad_norm": 0.21749693155288696,
"learning_rate": 0.0006265562485290657,
"loss": 2.9773,
"step": 159000
},
{
"epoch": 8.604000860400086,
"grad_norm": 0.26928380131721497,
"learning_rate": 0.0006236143798540833,
"loss": 2.978,
"step": 160000
},
{
"epoch": 8.657775865777587,
"grad_norm": 0.22122180461883545,
"learning_rate": 0.000620672511179101,
"loss": 2.9794,
"step": 161000
},
{
"epoch": 8.711550871155087,
"grad_norm": 0.22700442373752594,
"learning_rate": 0.0006177306425041186,
"loss": 2.9824,
"step": 162000
},
{
"epoch": 8.765325876532588,
"grad_norm": 0.2541004419326782,
"learning_rate": 0.0006147917156978112,
"loss": 2.9841,
"step": 163000
},
{
"epoch": 8.819100881910089,
"grad_norm": 0.2551893889904022,
"learning_rate": 0.0006118498470228289,
"loss": 2.9837,
"step": 164000
},
{
"epoch": 8.872875887287588,
"grad_norm": 0.25604966282844543,
"learning_rate": 0.0006089079783478466,
"loss": 2.984,
"step": 165000
},
{
"epoch": 8.926650892665089,
"grad_norm": 0.24571265280246735,
"learning_rate": 0.0006059719934102142,
"loss": 2.9849,
"step": 166000
},
{
"epoch": 8.98042589804259,
"grad_norm": 0.24128392338752747,
"learning_rate": 0.0006030301247352318,
"loss": 2.9843,
"step": 167000
},
{
"epoch": 9.0,
"eval_accuracy": 0.4083424360998555,
"eval_loss": 3.3613698482513428,
"eval_runtime": 155.0364,
"eval_samples_per_second": 373.57,
"eval_steps_per_second": 5.837,
"step": 167364
},
{
"epoch": 9.034200903420091,
"grad_norm": 0.24964158236980438,
"learning_rate": 0.0006000882560602495,
"loss": 2.9432,
"step": 168000
},
{
"epoch": 9.08797590879759,
"grad_norm": 0.2405262142419815,
"learning_rate": 0.0005971463873852672,
"loss": 2.922,
"step": 169000
},
{
"epoch": 9.141750914175091,
"grad_norm": 0.22288870811462402,
"learning_rate": 0.0005942045187102848,
"loss": 2.9269,
"step": 170000
},
{
"epoch": 9.195525919552592,
"grad_norm": 0.3041359484195709,
"learning_rate": 0.0005912626500353024,
"loss": 2.9342,
"step": 171000
},
{
"epoch": 9.249300924930093,
"grad_norm": 0.22550632059574127,
"learning_rate": 0.0005883237232289951,
"loss": 2.9392,
"step": 172000
},
{
"epoch": 9.303075930307593,
"grad_norm": 0.23584921658039093,
"learning_rate": 0.0005853818545540127,
"loss": 2.9422,
"step": 173000
},
{
"epoch": 9.356850935685094,
"grad_norm": 0.2634640634059906,
"learning_rate": 0.0005824458696163803,
"loss": 2.945,
"step": 174000
},
{
"epoch": 9.410625941062595,
"grad_norm": 0.2354883849620819,
"learning_rate": 0.000579504000941398,
"loss": 2.9485,
"step": 175000
},
{
"epoch": 9.464400946440094,
"grad_norm": 0.26491352915763855,
"learning_rate": 0.0005765621322664157,
"loss": 2.9475,
"step": 176000
},
{
"epoch": 9.518175951817595,
"grad_norm": 0.2462054342031479,
"learning_rate": 0.0005736202635914332,
"loss": 2.9482,
"step": 177000
},
{
"epoch": 9.571950957195096,
"grad_norm": 0.2643495202064514,
"learning_rate": 0.0005706783949164509,
"loss": 2.9502,
"step": 178000
},
{
"epoch": 9.625725962572597,
"grad_norm": 0.24029669165611267,
"learning_rate": 0.0005677365262414686,
"loss": 2.9518,
"step": 179000
},
{
"epoch": 9.679500967950096,
"grad_norm": 0.2550260126590729,
"learning_rate": 0.0005648005413038361,
"loss": 2.9571,
"step": 180000
},
{
"epoch": 9.733275973327597,
"grad_norm": 0.23675589263439178,
"learning_rate": 0.0005618586726288538,
"loss": 2.9543,
"step": 181000
},
{
"epoch": 9.787050978705098,
"grad_norm": 0.2279191017150879,
"learning_rate": 0.0005589197458225465,
"loss": 2.9593,
"step": 182000
},
{
"epoch": 9.8408259840826,
"grad_norm": 0.27392587065696716,
"learning_rate": 0.0005559778771475641,
"loss": 2.9561,
"step": 183000
},
{
"epoch": 9.894600989460098,
"grad_norm": 0.2388741672039032,
"learning_rate": 0.0005530360084725818,
"loss": 2.9565,
"step": 184000
},
{
"epoch": 9.9483759948376,
"grad_norm": 0.2503463327884674,
"learning_rate": 0.0005500970816662745,
"loss": 2.9592,
"step": 185000
},
{
"epoch": 10.0,
"eval_accuracy": 0.40854156716551776,
"eval_loss": 3.377901077270508,
"eval_runtime": 154.6842,
"eval_samples_per_second": 374.421,
"eval_steps_per_second": 5.851,
"step": 185960
},
{
"epoch": 10.0021510002151,
"grad_norm": 0.2553974688053131,
"learning_rate": 0.0005471552129912922,
"loss": 2.9593,
"step": 186000
},
{
"epoch": 10.055926005592601,
"grad_norm": 0.25415316224098206,
"learning_rate": 0.0005442133443163097,
"loss": 2.8931,
"step": 187000
},
{
"epoch": 10.1097010109701,
"grad_norm": 0.2477007806301117,
"learning_rate": 0.0005412744175100024,
"loss": 2.8999,
"step": 188000
},
{
"epoch": 10.163476016347602,
"grad_norm": 0.23852670192718506,
"learning_rate": 0.0005383325488350201,
"loss": 2.9028,
"step": 189000
},
{
"epoch": 10.217251021725103,
"grad_norm": 0.2484176605939865,
"learning_rate": 0.0005353906801600376,
"loss": 2.9117,
"step": 190000
},
{
"epoch": 10.271026027102602,
"grad_norm": 0.27494022250175476,
"learning_rate": 0.0005324517533537303,
"loss": 2.9133,
"step": 191000
},
{
"epoch": 10.324801032480103,
"grad_norm": 0.2577020823955536,
"learning_rate": 0.000529509884678748,
"loss": 2.9214,
"step": 192000
},
{
"epoch": 10.378576037857604,
"grad_norm": 0.2292626053094864,
"learning_rate": 0.0005265680160037656,
"loss": 2.9208,
"step": 193000
},
{
"epoch": 10.432351043235105,
"grad_norm": 0.267873615026474,
"learning_rate": 0.0005236290891974582,
"loss": 2.9213,
"step": 194000
},
{
"epoch": 10.486126048612604,
"grad_norm": 0.24749253690242767,
"learning_rate": 0.0005206872205224759,
"loss": 2.9239,
"step": 195000
},
{
"epoch": 10.539901053990105,
"grad_norm": 0.26779085397720337,
"learning_rate": 0.0005177453518474935,
"loss": 2.929,
"step": 196000
},
{
"epoch": 10.593676059367606,
"grad_norm": 0.2552465796470642,
"learning_rate": 0.0005148064250411861,
"loss": 2.9309,
"step": 197000
},
{
"epoch": 10.647451064745107,
"grad_norm": 0.2551726996898651,
"learning_rate": 0.0005118645563662038,
"loss": 2.9287,
"step": 198000
},
{
"epoch": 10.701226070122607,
"grad_norm": 0.24481531977653503,
"learning_rate": 0.0005089256295598964,
"loss": 2.9326,
"step": 199000
},
{
"epoch": 10.755001075500108,
"grad_norm": 0.2609283924102783,
"learning_rate": 0.0005059837608849142,
"loss": 2.9332,
"step": 200000
},
{
"epoch": 10.808776080877609,
"grad_norm": 0.22893798351287842,
"learning_rate": 0.0005030418922099318,
"loss": 2.9361,
"step": 201000
},
{
"epoch": 10.86255108625511,
"grad_norm": 0.24516218900680542,
"learning_rate": 0.0005001029654036244,
"loss": 2.9358,
"step": 202000
},
{
"epoch": 10.916326091632609,
"grad_norm": 0.23790410161018372,
"learning_rate": 0.0004971610967286421,
"loss": 2.9344,
"step": 203000
},
{
"epoch": 10.97010109701011,
"grad_norm": 0.2549736797809601,
"learning_rate": 0.0004942221699223347,
"loss": 2.9367,
"step": 204000
},
{
"epoch": 11.0,
"eval_accuracy": 0.40993629082379995,
"eval_loss": 3.3604304790496826,
"eval_runtime": 154.4315,
"eval_samples_per_second": 375.034,
"eval_steps_per_second": 5.86,
"step": 204556
},
{
"epoch": 11.02387610238761,
"grad_norm": 0.2493738979101181,
"learning_rate": 0.0004912803012473523,
"loss": 2.9057,
"step": 205000
},
{
"epoch": 11.07765110776511,
"grad_norm": 0.245314821600914,
"learning_rate": 0.00048833843257237,
"loss": 2.8775,
"step": 206000
},
{
"epoch": 11.131426113142611,
"grad_norm": 0.24536246061325073,
"learning_rate": 0.0004853995057660626,
"loss": 2.8783,
"step": 207000
},
{
"epoch": 11.185201118520112,
"grad_norm": 0.26116108894348145,
"learning_rate": 0.00048245763709108024,
"loss": 2.8851,
"step": 208000
},
{
"epoch": 11.238976123897613,
"grad_norm": 0.24136824905872345,
"learning_rate": 0.0004795187102847729,
"loss": 2.8918,
"step": 209000
},
{
"epoch": 11.292751129275112,
"grad_norm": 0.275006502866745,
"learning_rate": 0.0004765768416097905,
"loss": 2.8951,
"step": 210000
},
{
"epoch": 11.346526134652613,
"grad_norm": 0.27772292494773865,
"learning_rate": 0.0004736349729348082,
"loss": 2.8973,
"step": 211000
},
{
"epoch": 11.400301140030114,
"grad_norm": 0.2865879237651825,
"learning_rate": 0.0004706960461285008,
"loss": 2.8977,
"step": 212000
},
{
"epoch": 11.454076145407615,
"grad_norm": 0.24032117426395416,
"learning_rate": 0.00046775711932219347,
"loss": 2.9002,
"step": 213000
},
{
"epoch": 11.507851150785115,
"grad_norm": 0.24863946437835693,
"learning_rate": 0.00046481525064721114,
"loss": 2.9052,
"step": 214000
},
{
"epoch": 11.561626156162616,
"grad_norm": 0.25115910172462463,
"learning_rate": 0.00046187338197222876,
"loss": 2.9081,
"step": 215000
},
{
"epoch": 11.615401161540117,
"grad_norm": 0.25204330682754517,
"learning_rate": 0.00045893151329724644,
"loss": 2.906,
"step": 216000
},
{
"epoch": 11.669176166917616,
"grad_norm": 0.25553619861602783,
"learning_rate": 0.00045599258649093905,
"loss": 2.9074,
"step": 217000
},
{
"epoch": 11.722951172295117,
"grad_norm": 0.24004510045051575,
"learning_rate": 0.00045305071781595667,
"loss": 2.9105,
"step": 218000
},
{
"epoch": 11.776726177672618,
"grad_norm": 0.28220391273498535,
"learning_rate": 0.00045010884914097434,
"loss": 2.9075,
"step": 219000
},
{
"epoch": 11.830501183050119,
"grad_norm": 0.2582526206970215,
"learning_rate": 0.00044716992233466695,
"loss": 2.9129,
"step": 220000
},
{
"epoch": 11.884276188427618,
"grad_norm": 0.27006328105926514,
"learning_rate": 0.0004442309955283596,
"loss": 2.9146,
"step": 221000
},
{
"epoch": 11.93805119380512,
"grad_norm": 0.3025253415107727,
"learning_rate": 0.0004412891268533773,
"loss": 2.9161,
"step": 222000
},
{
"epoch": 11.99182619918262,
"grad_norm": 0.2534899115562439,
"learning_rate": 0.00043834725817839496,
"loss": 2.9145,
"step": 223000
},
{
"epoch": 12.0,
"eval_accuracy": 0.40973608482660917,
"eval_loss": 3.3759312629699707,
"eval_runtime": 154.6971,
"eval_samples_per_second": 374.39,
"eval_steps_per_second": 5.85,
"step": 223152
},
{
"epoch": 12.045601204560121,
"grad_norm": 0.27566835284233093,
"learning_rate": 0.0004354053895034126,
"loss": 2.8626,
"step": 224000
},
{
"epoch": 12.09937620993762,
"grad_norm": 0.243771031498909,
"learning_rate": 0.0004324664626971052,
"loss": 2.8583,
"step": 225000
},
{
"epoch": 12.153151215315122,
"grad_norm": 0.26204991340637207,
"learning_rate": 0.00042952459402212286,
"loss": 2.8654,
"step": 226000
},
{
"epoch": 12.206926220692623,
"grad_norm": 0.2582913637161255,
"learning_rate": 0.0004265827253471405,
"loss": 2.8673,
"step": 227000
},
{
"epoch": 12.260701226070122,
"grad_norm": 0.27284786105155945,
"learning_rate": 0.00042364674040950813,
"loss": 2.8735,
"step": 228000
},
{
"epoch": 12.314476231447623,
"grad_norm": 0.25654593110084534,
"learning_rate": 0.00042070487173452575,
"loss": 2.8751,
"step": 229000
},
{
"epoch": 12.368251236825124,
"grad_norm": 0.23449215292930603,
"learning_rate": 0.0004177630030595435,
"loss": 2.8769,
"step": 230000
},
{
"epoch": 12.422026242202625,
"grad_norm": 0.24525216221809387,
"learning_rate": 0.00041482407625323604,
"loss": 2.8763,
"step": 231000
},
{
"epoch": 12.475801247580124,
"grad_norm": 0.26022928953170776,
"learning_rate": 0.0004118822075782537,
"loss": 2.8811,
"step": 232000
},
{
"epoch": 12.529576252957625,
"grad_norm": 0.24519123136997223,
"learning_rate": 0.0004089403389032714,
"loss": 2.8844,
"step": 233000
},
{
"epoch": 12.583351258335126,
"grad_norm": 0.3118698298931122,
"learning_rate": 0.000405998470228289,
"loss": 2.8837,
"step": 234000
},
{
"epoch": 12.637126263712627,
"grad_norm": 0.28287139534950256,
"learning_rate": 0.0004030566015533066,
"loss": 2.8843,
"step": 235000
},
{
"epoch": 12.690901269090126,
"grad_norm": 0.24490605294704437,
"learning_rate": 0.0004001176747469993,
"loss": 2.8868,
"step": 236000
},
{
"epoch": 12.744676274467627,
"grad_norm": 0.2810141444206238,
"learning_rate": 0.00039717580607201696,
"loss": 2.8905,
"step": 237000
},
{
"epoch": 12.798451279845128,
"grad_norm": 0.2660733759403229,
"learning_rate": 0.0003942368792657096,
"loss": 2.8917,
"step": 238000
},
{
"epoch": 12.852226285222628,
"grad_norm": 0.2440771758556366,
"learning_rate": 0.00039129501059072724,
"loss": 2.888,
"step": 239000
},
{
"epoch": 12.906001290600129,
"grad_norm": 0.25520211458206177,
"learning_rate": 0.0003883560837844199,
"loss": 2.8938,
"step": 240000
},
{
"epoch": 12.95977629597763,
"grad_norm": 0.2661096751689911,
"learning_rate": 0.0003854142151094375,
"loss": 2.8924,
"step": 241000
},
{
"epoch": 13.0,
"eval_accuracy": 0.4095556978794759,
"eval_loss": 3.3856160640716553,
"eval_runtime": 154.4788,
"eval_samples_per_second": 374.919,
"eval_steps_per_second": 5.858,
"step": 241748
},
{
"epoch": 13.01355130135513,
"grad_norm": 0.29137110710144043,
"learning_rate": 0.00038247234643445514,
"loss": 2.8796,
"step": 242000
},
{
"epoch": 13.06732630673263,
"grad_norm": 0.26854678988456726,
"learning_rate": 0.0003795304777594728,
"loss": 2.837,
"step": 243000
},
{
"epoch": 13.121101312110131,
"grad_norm": 0.2614552974700928,
"learning_rate": 0.00037659155095316543,
"loss": 2.8409,
"step": 244000
},
{
"epoch": 13.174876317487632,
"grad_norm": 0.3031397759914398,
"learning_rate": 0.00037364968227818316,
"loss": 2.8458,
"step": 245000
},
{
"epoch": 13.228651322865133,
"grad_norm": 0.2507542073726654,
"learning_rate": 0.0003707107554718757,
"loss": 2.8537,
"step": 246000
},
{
"epoch": 13.282426328242632,
"grad_norm": 0.2524189054965973,
"learning_rate": 0.0003677688867968934,
"loss": 2.852,
"step": 247000
},
{
"epoch": 13.336201333620133,
"grad_norm": 0.27850231528282166,
"learning_rate": 0.00036482701812191106,
"loss": 2.8574,
"step": 248000
},
{
"epoch": 13.389976338997634,
"grad_norm": 0.2408652901649475,
"learning_rate": 0.00036188809131560367,
"loss": 2.8563,
"step": 249000
},
{
"epoch": 13.443751344375134,
"grad_norm": 0.28609830141067505,
"learning_rate": 0.00035894622264062134,
"loss": 2.8627,
"step": 250000
},
{
"epoch": 13.497526349752635,
"grad_norm": 0.2690850496292114,
"learning_rate": 0.00035600435396563896,
"loss": 2.8619,
"step": 251000
},
{
"epoch": 13.551301355130136,
"grad_norm": 0.2862522304058075,
"learning_rate": 0.0003530654271593316,
"loss": 2.8646,
"step": 252000
},
{
"epoch": 13.605076360507637,
"grad_norm": 0.2629512548446655,
"learning_rate": 0.0003501235584843493,
"loss": 2.865,
"step": 253000
},
{
"epoch": 13.658851365885136,
"grad_norm": 0.2542857825756073,
"learning_rate": 0.00034718463167804185,
"loss": 2.865,
"step": 254000
},
{
"epoch": 13.712626371262637,
"grad_norm": 0.258798211812973,
"learning_rate": 0.0003442427630030596,
"loss": 2.8686,
"step": 255000
},
{
"epoch": 13.766401376640138,
"grad_norm": 0.2492339164018631,
"learning_rate": 0.0003413008943280772,
"loss": 2.8689,
"step": 256000
},
{
"epoch": 13.820176382017639,
"grad_norm": 0.29779887199401855,
"learning_rate": 0.0003383590256530948,
"loss": 2.8691,
"step": 257000
},
{
"epoch": 13.873951387395138,
"grad_norm": 0.2670515179634094,
"learning_rate": 0.0003354200988467875,
"loss": 2.8711,
"step": 258000
},
{
"epoch": 13.92772639277264,
"grad_norm": 0.26957201957702637,
"learning_rate": 0.0003324782301718051,
"loss": 2.8711,
"step": 259000
},
{
"epoch": 13.98150139815014,
"grad_norm": 0.2824675142765045,
"learning_rate": 0.00032953636149682283,
"loss": 2.8757,
"step": 260000
},
{
"epoch": 14.0,
"eval_accuracy": 0.410457699798363,
"eval_loss": 3.384411573410034,
"eval_runtime": 154.9443,
"eval_samples_per_second": 373.792,
"eval_steps_per_second": 5.841,
"step": 260344
},
{
"epoch": 14.035276403527641,
"grad_norm": 0.29713505506515503,
"learning_rate": 0.00032659449282184045,
"loss": 2.8354,
"step": 261000
},
{
"epoch": 14.08905140890514,
"grad_norm": 0.27730679512023926,
"learning_rate": 0.00032365850788420805,
"loss": 2.8198,
"step": 262000
},
{
"epoch": 14.142826414282641,
"grad_norm": 0.29409125447273254,
"learning_rate": 0.0003207166392092257,
"loss": 2.824,
"step": 263000
},
{
"epoch": 14.196601419660142,
"grad_norm": 0.280676931142807,
"learning_rate": 0.00031777477053424334,
"loss": 2.8301,
"step": 264000
},
{
"epoch": 14.250376425037642,
"grad_norm": 0.3202875554561615,
"learning_rate": 0.000314835843727936,
"loss": 2.8271,
"step": 265000
},
{
"epoch": 14.304151430415143,
"grad_norm": 0.2673070728778839,
"learning_rate": 0.0003118939750529536,
"loss": 2.8376,
"step": 266000
},
{
"epoch": 14.357926435792644,
"grad_norm": 0.2868790030479431,
"learning_rate": 0.0003089550482466463,
"loss": 2.8416,
"step": 267000
},
{
"epoch": 14.411701441170145,
"grad_norm": 0.2689562737941742,
"learning_rate": 0.0003060131795716639,
"loss": 2.8408,
"step": 268000
},
{
"epoch": 14.465476446547644,
"grad_norm": 0.2907434403896332,
"learning_rate": 0.0003030713108966815,
"loss": 2.8429,
"step": 269000
},
{
"epoch": 14.519251451925145,
"grad_norm": 0.26338914036750793,
"learning_rate": 0.00030012944222169925,
"loss": 2.8465,
"step": 270000
},
{
"epoch": 14.573026457302646,
"grad_norm": 0.2925278842449188,
"learning_rate": 0.00029719051541539186,
"loss": 2.8444,
"step": 271000
},
{
"epoch": 14.626801462680147,
"grad_norm": 0.2883965075016022,
"learning_rate": 0.00029424864674040954,
"loss": 2.8466,
"step": 272000
},
{
"epoch": 14.680576468057646,
"grad_norm": 0.28422123193740845,
"learning_rate": 0.00029130971993410215,
"loss": 2.8458,
"step": 273000
},
{
"epoch": 14.734351473435147,
"grad_norm": 0.28057223558425903,
"learning_rate": 0.0002883678512591198,
"loss": 2.8512,
"step": 274000
},
{
"epoch": 14.788126478812648,
"grad_norm": 0.2739641070365906,
"learning_rate": 0.00028542892445281243,
"loss": 2.8508,
"step": 275000
},
{
"epoch": 14.84190148419015,
"grad_norm": 0.26283150911331177,
"learning_rate": 0.00028248705577783005,
"loss": 2.8491,
"step": 276000
},
{
"epoch": 14.895676489567649,
"grad_norm": 0.25209805369377136,
"learning_rate": 0.0002795451871028477,
"loss": 2.8528,
"step": 277000
},
{
"epoch": 14.94945149494515,
"grad_norm": 0.29996606707572937,
"learning_rate": 0.0002766033184278654,
"loss": 2.8545,
"step": 278000
},
{
"epoch": 15.0,
"eval_accuracy": 0.4106898178253074,
"eval_loss": 3.383195400238037,
"eval_runtime": 154.9853,
"eval_samples_per_second": 373.693,
"eval_steps_per_second": 5.839,
"step": 278940
},
{
"epoch": 15.00322650032265,
"grad_norm": 0.30987992882728577,
"learning_rate": 0.00027366439162155806,
"loss": 2.8502,
"step": 279000
},
{
"epoch": 15.05700150570015,
"grad_norm": 0.2791779041290283,
"learning_rate": 0.0002707225229465757,
"loss": 2.8054,
"step": 280000
},
{
"epoch": 15.11077651107765,
"grad_norm": 0.29352006316185,
"learning_rate": 0.00026778359614026834,
"loss": 2.8087,
"step": 281000
},
{
"epoch": 15.164551516455152,
"grad_norm": 0.2739843428134918,
"learning_rate": 0.00026484466933396095,
"loss": 2.8134,
"step": 282000
},
{
"epoch": 15.218326521832653,
"grad_norm": 0.29397326707839966,
"learning_rate": 0.00026190280065897857,
"loss": 2.814,
"step": 283000
},
{
"epoch": 15.272101527210152,
"grad_norm": 0.2891228199005127,
"learning_rate": 0.00025896387385267123,
"loss": 2.8154,
"step": 284000
},
{
"epoch": 15.325876532587653,
"grad_norm": 0.2697419226169586,
"learning_rate": 0.00025602200517768885,
"loss": 2.815,
"step": 285000
},
{
"epoch": 15.379651537965154,
"grad_norm": 0.27169767022132874,
"learning_rate": 0.00025308013650270653,
"loss": 2.8179,
"step": 286000
},
{
"epoch": 15.433426543342655,
"grad_norm": 0.28420206904411316,
"learning_rate": 0.0002501382678277242,
"loss": 2.8242,
"step": 287000
},
{
"epoch": 15.487201548720154,
"grad_norm": 0.29990944266319275,
"learning_rate": 0.0002471993410214168,
"loss": 2.8284,
"step": 288000
},
{
"epoch": 15.540976554097655,
"grad_norm": 0.29358208179473877,
"learning_rate": 0.0002442574723464345,
"loss": 2.8242,
"step": 289000
},
{
"epoch": 15.594751559475156,
"grad_norm": 0.2709376811981201,
"learning_rate": 0.0002413156036714521,
"loss": 2.8323,
"step": 290000
},
{
"epoch": 15.648526564852656,
"grad_norm": 0.27639514207839966,
"learning_rate": 0.00023837961873381973,
"loss": 2.8295,
"step": 291000
},
{
"epoch": 15.702301570230157,
"grad_norm": 0.27499568462371826,
"learning_rate": 0.00023543775005883738,
"loss": 2.8273,
"step": 292000
},
{
"epoch": 15.756076575607658,
"grad_norm": 0.2614821493625641,
"learning_rate": 0.00023249588138385502,
"loss": 2.8302,
"step": 293000
},
{
"epoch": 15.809851580985159,
"grad_norm": 0.30566710233688354,
"learning_rate": 0.00022955695457754766,
"loss": 2.8315,
"step": 294000
},
{
"epoch": 15.863626586362658,
"grad_norm": 0.28071853518486023,
"learning_rate": 0.00022661508590256533,
"loss": 2.8282,
"step": 295000
},
{
"epoch": 15.917401591740159,
"grad_norm": 0.26871344447135925,
"learning_rate": 0.00022367321722758295,
"loss": 2.8318,
"step": 296000
},
{
"epoch": 15.97117659711766,
"grad_norm": 0.27507010102272034,
"learning_rate": 0.0002207313485526006,
"loss": 2.8339,
"step": 297000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4098433764298017,
"eval_loss": 3.4079394340515137,
"eval_runtime": 155.0447,
"eval_samples_per_second": 373.55,
"eval_steps_per_second": 5.837,
"step": 297536
},
{
"epoch": 16.02495160249516,
"grad_norm": 0.2846646010875702,
"learning_rate": 0.00021778947987761827,
"loss": 2.8126,
"step": 298000
},
{
"epoch": 16.07872660787266,
"grad_norm": 0.30357855558395386,
"learning_rate": 0.0002148505530713109,
"loss": 2.7901,
"step": 299000
},
{
"epoch": 16.132501613250163,
"grad_norm": 0.27245578169822693,
"learning_rate": 0.00021191162626500354,
"loss": 2.791,
"step": 300000
},
{
"epoch": 16.186276618627662,
"grad_norm": 0.27511173486709595,
"learning_rate": 0.0002089697575900212,
"loss": 2.7991,
"step": 301000
},
{
"epoch": 16.24005162400516,
"grad_norm": 0.2818892300128937,
"learning_rate": 0.00020602788891503884,
"loss": 2.8017,
"step": 302000
},
{
"epoch": 16.293826629382664,
"grad_norm": 0.29610157012939453,
"learning_rate": 0.00020308896210873147,
"loss": 2.7992,
"step": 303000
},
{
"epoch": 16.347601634760164,
"grad_norm": 0.28958022594451904,
"learning_rate": 0.00020014709343374912,
"loss": 2.8032,
"step": 304000
},
{
"epoch": 16.401376640137663,
"grad_norm": 0.32356253266334534,
"learning_rate": 0.00019720522475876677,
"loss": 2.8018,
"step": 305000
},
{
"epoch": 16.455151645515166,
"grad_norm": 0.2847062051296234,
"learning_rate": 0.0001942633560837844,
"loss": 2.8077,
"step": 306000
},
{
"epoch": 16.508926650892665,
"grad_norm": 0.2739544212818146,
"learning_rate": 0.00019132442927747705,
"loss": 2.81,
"step": 307000
},
{
"epoch": 16.562701656270164,
"grad_norm": 0.28697946667671204,
"learning_rate": 0.0001883825606024947,
"loss": 2.807,
"step": 308000
},
{
"epoch": 16.616476661647667,
"grad_norm": 0.29771292209625244,
"learning_rate": 0.00018544363379618733,
"loss": 2.8096,
"step": 309000
},
{
"epoch": 16.670251667025166,
"grad_norm": 0.29724204540252686,
"learning_rate": 0.000182501765121205,
"loss": 2.8081,
"step": 310000
},
{
"epoch": 16.72402667240267,
"grad_norm": 0.2964895963668823,
"learning_rate": 0.00017955989644622265,
"loss": 2.8133,
"step": 311000
},
{
"epoch": 16.777801677780168,
"grad_norm": 0.28841668367385864,
"learning_rate": 0.00017662391150859025,
"loss": 2.8086,
"step": 312000
},
{
"epoch": 16.831576683157667,
"grad_norm": 0.2766316831111908,
"learning_rate": 0.00017368204283360793,
"loss": 2.8152,
"step": 313000
},
{
"epoch": 16.88535168853517,
"grad_norm": 0.32704171538352966,
"learning_rate": 0.00017074017415862554,
"loss": 2.8162,
"step": 314000
},
{
"epoch": 16.93912669391267,
"grad_norm": 0.3360968828201294,
"learning_rate": 0.00016779830548364322,
"loss": 2.8133,
"step": 315000
},
{
"epoch": 16.99290169929017,
"grad_norm": 0.31055524945259094,
"learning_rate": 0.00016485643680866087,
"loss": 2.8157,
"step": 316000
},
{
"epoch": 17.0,
"eval_accuracy": 0.4103879636154489,
"eval_loss": 3.3883779048919678,
"eval_runtime": 154.6539,
"eval_samples_per_second": 374.494,
"eval_steps_per_second": 5.852,
"step": 316132
},
{
"epoch": 17.04667670466767,
"grad_norm": 0.30388155579566956,
"learning_rate": 0.0001619175100023535,
"loss": 2.7807,
"step": 317000
},
{
"epoch": 17.10045171004517,
"grad_norm": 0.2881651818752289,
"learning_rate": 0.00015897564132737118,
"loss": 2.7798,
"step": 318000
},
{
"epoch": 17.15422671542267,
"grad_norm": 0.2695824205875397,
"learning_rate": 0.0001560337726523888,
"loss": 2.7817,
"step": 319000
},
{
"epoch": 17.208001720800173,
"grad_norm": 0.30308765172958374,
"learning_rate": 0.00015309484584608143,
"loss": 2.7808,
"step": 320000
},
{
"epoch": 17.261776726177672,
"grad_norm": 0.29839852452278137,
"learning_rate": 0.00015015297717109908,
"loss": 2.7878,
"step": 321000
},
{
"epoch": 17.315551731555175,
"grad_norm": 0.2739504873752594,
"learning_rate": 0.00014721110849611675,
"loss": 2.7849,
"step": 322000
},
{
"epoch": 17.369326736932674,
"grad_norm": 0.33289453387260437,
"learning_rate": 0.00014427512355848435,
"loss": 2.7878,
"step": 323000
},
{
"epoch": 17.423101742310173,
"grad_norm": 0.30872535705566406,
"learning_rate": 0.00014133325488350202,
"loss": 2.7862,
"step": 324000
},
{
"epoch": 17.476876747687676,
"grad_norm": 0.30065229535102844,
"learning_rate": 0.00013839138620851964,
"loss": 2.7903,
"step": 325000
},
{
"epoch": 17.530651753065175,
"grad_norm": 0.294783353805542,
"learning_rate": 0.0001354495175335373,
"loss": 2.7901,
"step": 326000
},
{
"epoch": 17.584426758442675,
"grad_norm": 0.3017074167728424,
"learning_rate": 0.00013251059072722993,
"loss": 2.789,
"step": 327000
},
{
"epoch": 17.638201763820177,
"grad_norm": 0.28931453824043274,
"learning_rate": 0.0001295687220522476,
"loss": 2.7962,
"step": 328000
},
{
"epoch": 17.691976769197677,
"grad_norm": 0.30777105689048767,
"learning_rate": 0.00012662685337726525,
"loss": 2.7905,
"step": 329000
},
{
"epoch": 17.745751774575176,
"grad_norm": 0.30437570810317993,
"learning_rate": 0.00012368792657095788,
"loss": 2.7922,
"step": 330000
},
{
"epoch": 17.79952677995268,
"grad_norm": 0.279985249042511,
"learning_rate": 0.00012074605789597553,
"loss": 2.7958,
"step": 331000
},
{
"epoch": 17.853301785330178,
"grad_norm": 0.2660251557826996,
"learning_rate": 0.00011780418922099318,
"loss": 2.7954,
"step": 332000
},
{
"epoch": 17.90707679070768,
"grad_norm": 0.2697618901729584,
"learning_rate": 0.00011486526241468581,
"loss": 2.7969,
"step": 333000
},
{
"epoch": 17.96085179608518,
"grad_norm": 0.2890709340572357,
"learning_rate": 0.00011192633560837845,
"loss": 2.7966,
"step": 334000
},
{
"epoch": 18.0,
"eval_accuracy": 0.4105063404500295,
"eval_loss": 3.4080612659454346,
"eval_runtime": 154.9923,
"eval_samples_per_second": 373.677,
"eval_steps_per_second": 5.839,
"step": 334728
},
{
"epoch": 18.01462680146268,
"grad_norm": 0.2767917215824127,
"learning_rate": 0.0001089844669333961,
"loss": 2.7867,
"step": 335000
},
{
"epoch": 18.068401806840182,
"grad_norm": 0.29336050152778625,
"learning_rate": 0.00010604259825841375,
"loss": 2.7639,
"step": 336000
},
{
"epoch": 18.12217681221768,
"grad_norm": 0.3294292688369751,
"learning_rate": 0.0001031007295834314,
"loss": 2.7658,
"step": 337000
},
{
"epoch": 18.17595181759518,
"grad_norm": 0.30230215191841125,
"learning_rate": 0.00010015886090844905,
"loss": 2.766,
"step": 338000
},
{
"epoch": 18.229726822972683,
"grad_norm": 0.28469741344451904,
"learning_rate": 9.721993410214168e-05,
"loss": 2.7682,
"step": 339000
},
{
"epoch": 18.283501828350182,
"grad_norm": 0.2829948365688324,
"learning_rate": 9.427806542715933e-05,
"loss": 2.7747,
"step": 340000
},
{
"epoch": 18.33727683372768,
"grad_norm": 0.27874138951301575,
"learning_rate": 9.133913862085197e-05,
"loss": 2.7719,
"step": 341000
},
{
"epoch": 18.391051839105184,
"grad_norm": 0.3442845642566681,
"learning_rate": 8.839726994586963e-05,
"loss": 2.7753,
"step": 342000
},
{
"epoch": 18.444826844482684,
"grad_norm": 0.36807698011398315,
"learning_rate": 8.545540127088727e-05,
"loss": 2.7734,
"step": 343000
},
{
"epoch": 18.498601849860187,
"grad_norm": 0.3008968234062195,
"learning_rate": 8.25164744645799e-05,
"loss": 2.7742,
"step": 344000
},
{
"epoch": 18.552376855237686,
"grad_norm": 0.3300212621688843,
"learning_rate": 7.957460578959754e-05,
"loss": 2.7729,
"step": 345000
},
{
"epoch": 18.606151860615185,
"grad_norm": 0.2936519682407379,
"learning_rate": 7.66327371146152e-05,
"loss": 2.7756,
"step": 346000
},
{
"epoch": 18.659926865992688,
"grad_norm": 0.2997318208217621,
"learning_rate": 7.369381030830784e-05,
"loss": 2.7787,
"step": 347000
},
{
"epoch": 18.713701871370187,
"grad_norm": 0.27832144498825073,
"learning_rate": 7.075194163332549e-05,
"loss": 2.7736,
"step": 348000
},
{
"epoch": 18.767476876747686,
"grad_norm": 0.28568559885025024,
"learning_rate": 6.781301482701812e-05,
"loss": 2.7752,
"step": 349000
},
{
"epoch": 18.82125188212519,
"grad_norm": 0.2887098789215088,
"learning_rate": 6.487408802071074e-05,
"loss": 2.7764,
"step": 350000
},
{
"epoch": 18.87502688750269,
"grad_norm": 0.31327852606773376,
"learning_rate": 6.19322193457284e-05,
"loss": 2.7776,
"step": 351000
},
{
"epoch": 18.928801892880188,
"grad_norm": 0.2947293519973755,
"learning_rate": 5.899035067074606e-05,
"loss": 2.7778,
"step": 352000
},
{
"epoch": 18.98257689825769,
"grad_norm": 0.27630892395973206,
"learning_rate": 5.604848199576371e-05,
"loss": 2.7807,
"step": 353000
},
{
"epoch": 19.0,
"eval_accuracy": 0.41043096087659053,
"eval_loss": 3.4180543422698975,
"eval_runtime": 155.0182,
"eval_samples_per_second": 373.614,
"eval_steps_per_second": 5.838,
"step": 353324
},
{
"epoch": 19.03635190363519,
"grad_norm": 0.29157590866088867,
"learning_rate": 5.310955518945634e-05,
"loss": 2.7661,
"step": 354000
},
{
"epoch": 19.090126909012692,
"grad_norm": 0.26800793409347534,
"learning_rate": 5.0167686514473995e-05,
"loss": 2.7558,
"step": 355000
},
{
"epoch": 19.14390191439019,
"grad_norm": 0.31518709659576416,
"learning_rate": 4.722581783949165e-05,
"loss": 2.7579,
"step": 356000
},
{
"epoch": 19.19767691976769,
"grad_norm": 0.2998282313346863,
"learning_rate": 4.428689103318428e-05,
"loss": 2.7574,
"step": 357000
},
{
"epoch": 19.251451925145194,
"grad_norm": 0.3034313917160034,
"learning_rate": 4.134502235820193e-05,
"loss": 2.7606,
"step": 358000
},
{
"epoch": 19.305226930522693,
"grad_norm": 0.2875937819480896,
"learning_rate": 3.8403153683219584e-05,
"loss": 2.7586,
"step": 359000
},
{
"epoch": 19.359001935900192,
"grad_norm": 0.30761197209358215,
"learning_rate": 3.546422687691222e-05,
"loss": 2.7577,
"step": 360000
},
{
"epoch": 19.412776941277695,
"grad_norm": 0.29801440238952637,
"learning_rate": 3.252530007060485e-05,
"loss": 2.7557,
"step": 361000
},
{
"epoch": 19.466551946655194,
"grad_norm": 0.2939643859863281,
"learning_rate": 2.95834313956225e-05,
"loss": 2.7571,
"step": 362000
},
{
"epoch": 19.520326952032697,
"grad_norm": 0.29968592524528503,
"learning_rate": 2.6641562720640153e-05,
"loss": 2.761,
"step": 363000
},
{
"epoch": 19.574101957410196,
"grad_norm": 0.28651562333106995,
"learning_rate": 2.36996940456578e-05,
"loss": 2.7579,
"step": 364000
},
{
"epoch": 19.627876962787695,
"grad_norm": 0.30474939942359924,
"learning_rate": 2.0760767239350436e-05,
"loss": 2.7598,
"step": 365000
},
{
"epoch": 19.6816519681652,
"grad_norm": 0.2941524088382721,
"learning_rate": 1.7818898564368086e-05,
"loss": 2.7644,
"step": 366000
},
{
"epoch": 19.735426973542697,
"grad_norm": 0.2570919990539551,
"learning_rate": 1.487997175806072e-05,
"loss": 2.7549,
"step": 367000
},
{
"epoch": 19.789201978920197,
"grad_norm": 0.31410789489746094,
"learning_rate": 1.193810308307837e-05,
"loss": 2.7624,
"step": 368000
},
{
"epoch": 19.8429769842977,
"grad_norm": 0.29923874139785767,
"learning_rate": 8.999176276771005e-06,
"loss": 2.7621,
"step": 369000
},
{
"epoch": 19.8967519896752,
"grad_norm": 0.29635271430015564,
"learning_rate": 6.057307601788656e-06,
"loss": 2.7601,
"step": 370000
},
{
"epoch": 19.950526995052698,
"grad_norm": 0.29568880796432495,
"learning_rate": 3.1154389268063074e-06,
"loss": 2.7595,
"step": 371000
},
{
"epoch": 20.0,
"eval_accuracy": 0.41021489963935376,
"eval_loss": 3.423628807067871,
"eval_runtime": 154.8423,
"eval_samples_per_second": 374.039,
"eval_steps_per_second": 5.845,
"step": 371920
},
{
"epoch": 20.0,
"step": 371920,
"total_flos": 1.5670047538944e+18,
"train_loss": 3.0279207580395733,
"train_runtime": 82395.3872,
"train_samples_per_second": 144.441,
"train_steps_per_second": 4.514
}
],
"logging_steps": 1000,
"max_steps": 371920,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5670047538944e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}