{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05377500537750054, "grad_norm": 0.8324036598205566, "learning_rate": 3.125e-05, "loss": 6.2314, "step": 1000 }, { "epoch": 0.10755001075500108, "grad_norm": 0.9016917943954468, "learning_rate": 6.25e-05, "loss": 5.01, "step": 2000 }, { "epoch": 0.1613250161325016, "grad_norm": 0.8393586874008179, "learning_rate": 9.375e-05, "loss": 4.6792, "step": 3000 }, { "epoch": 0.21510002151000215, "grad_norm": 0.8001790642738342, "learning_rate": 0.000125, "loss": 4.4675, "step": 4000 }, { "epoch": 0.2688750268875027, "grad_norm": 0.7500863671302795, "learning_rate": 0.00015625, "loss": 4.3004, "step": 5000 }, { "epoch": 0.3226500322650032, "grad_norm": 0.6959784626960754, "learning_rate": 0.0001875, "loss": 4.1762, "step": 6000 }, { "epoch": 0.3764250376425038, "grad_norm": 0.7082997560501099, "learning_rate": 0.00021875, "loss": 4.0795, "step": 7000 }, { "epoch": 0.4302000430200043, "grad_norm": 0.7400528788566589, "learning_rate": 0.00025, "loss": 3.9794, "step": 8000 }, { "epoch": 0.4839750483975048, "grad_norm": 0.6886024475097656, "learning_rate": 0.00028121875, "loss": 3.9062, "step": 9000 }, { "epoch": 0.5377500537750054, "grad_norm": 0.6196364760398865, "learning_rate": 0.0003124375, "loss": 3.8427, "step": 10000 }, { "epoch": 0.5915250591525059, "grad_norm": 0.5815768241882324, "learning_rate": 0.00034368749999999997, "loss": 3.7992, "step": 11000 }, { "epoch": 0.6453000645300064, "grad_norm": 0.5629006624221802, "learning_rate": 0.0003749375, "loss": 3.7502, "step": 12000 }, { "epoch": 0.699075069907507, "grad_norm": 0.5031692981719971, "learning_rate": 0.00040615625, "loss": 3.7233, "step": 13000 }, { "epoch": 0.7528500752850076, "grad_norm": 0.4921340048313141, "learning_rate": 0.00043737500000000005, "loss": 3.6917, "step": 14000 }, { "epoch": 0.806625080662508, "grad_norm": 0.45878851413726807, "learning_rate": 0.000468625, "loss": 3.6641, "step": 15000 }, { "epoch": 0.8604000860400086, "grad_norm": 0.4047335684299469, "learning_rate": 0.000499875, "loss": 3.6404, "step": 16000 }, { "epoch": 0.9141750914175092, "grad_norm": 0.4339119493961334, "learning_rate": 0.000531125, "loss": 3.6129, "step": 17000 }, { "epoch": 0.9679500967950097, "grad_norm": 0.3588213324546814, "learning_rate": 0.00056234375, "loss": 3.5932, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.3588065680197524, "eval_loss": 3.770080804824829, "eval_runtime": 152.9859, "eval_samples_per_second": 378.577, "eval_steps_per_second": 5.916, "step": 18596 }, { "epoch": 1.0217251021725102, "grad_norm": 0.3476862907409668, "learning_rate": 0.00059359375, "loss": 3.5726, "step": 19000 }, { "epoch": 1.0755001075500108, "grad_norm": 0.3370579481124878, "learning_rate": 0.0006248437500000001, "loss": 3.5453, "step": 20000 }, { "epoch": 1.1292751129275114, "grad_norm": 0.3253530263900757, "learning_rate": 0.00065609375, "loss": 3.5364, "step": 21000 }, { "epoch": 1.1830501183050117, "grad_norm": 0.3063829839229584, "learning_rate": 0.00068728125, "loss": 3.5214, "step": 22000 }, { "epoch": 1.2368251236825123, "grad_norm": 0.28737059235572815, "learning_rate": 0.00071853125, "loss": 3.5158, "step": 23000 }, { "epoch": 1.2906001290600129, "grad_norm": 0.29937857389450073, "learning_rate": 0.00074978125, "loss": 3.5014, "step": 24000 }, { "epoch": 1.3443751344375134, "grad_norm": 0.2835935056209564, 
"learning_rate": 0.0007810312499999999, "loss": 3.4946, "step": 25000 }, { "epoch": 1.398150139815014, "grad_norm": 0.2764816880226135, "learning_rate": 0.00081225, "loss": 3.4832, "step": 26000 }, { "epoch": 1.4519251451925146, "grad_norm": 0.2620868384838104, "learning_rate": 0.0008435000000000001, "loss": 3.4761, "step": 27000 }, { "epoch": 1.5057001505700152, "grad_norm": 0.2731957733631134, "learning_rate": 0.00087471875, "loss": 3.4653, "step": 28000 }, { "epoch": 1.5594751559475157, "grad_norm": 0.26957619190216064, "learning_rate": 0.00090596875, "loss": 3.4552, "step": 29000 }, { "epoch": 1.613250161325016, "grad_norm": 0.24591492116451263, "learning_rate": 0.00093721875, "loss": 3.4474, "step": 30000 }, { "epoch": 1.6670251667025167, "grad_norm": 0.23927152156829834, "learning_rate": 0.00096846875, "loss": 3.4443, "step": 31000 }, { "epoch": 1.7208001720800172, "grad_norm": 0.2176426500082016, "learning_rate": 0.0009996875, "loss": 3.4401, "step": 32000 }, { "epoch": 1.7745751774575176, "grad_norm": 0.20793931186199188, "learning_rate": 0.0009970875500117675, "loss": 3.4261, "step": 33000 }, { "epoch": 1.8283501828350182, "grad_norm": 0.2189057469367981, "learning_rate": 0.0009941486232054601, "loss": 3.419, "step": 34000 }, { "epoch": 1.8821251882125187, "grad_norm": 0.2241194099187851, "learning_rate": 0.0009912096963991528, "loss": 3.4088, "step": 35000 }, { "epoch": 1.9359001935900193, "grad_norm": 0.23365530371665955, "learning_rate": 0.0009882678277241704, "loss": 3.3934, "step": 36000 }, { "epoch": 1.9896751989675199, "grad_norm": 0.2019016444683075, "learning_rate": 0.000985328900917863, "loss": 3.3833, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.38185108449506, "eval_loss": 3.5596837997436523, "eval_runtime": 154.0793, "eval_samples_per_second": 375.891, "eval_steps_per_second": 5.874, "step": 37192 }, { "epoch": 2.0434502043450204, "grad_norm": 0.20308424532413483, "learning_rate": 0.0009823870322428808, "loss": 3.3404, "step": 38000 }, { "epoch": 2.097225209722521, "grad_norm": 0.21531735360622406, "learning_rate": 0.0009794451635678984, "loss": 3.3266, "step": 39000 }, { "epoch": 2.1510002151000216, "grad_norm": 0.26003631949424744, "learning_rate": 0.000976503294892916, "loss": 3.3242, "step": 40000 }, { "epoch": 2.204775220477522, "grad_norm": 0.24143873155117035, "learning_rate": 0.0009735643680866086, "loss": 3.3166, "step": 41000 }, { "epoch": 2.2585502258550227, "grad_norm": 0.19083160161972046, "learning_rate": 0.0009706224994116263, "loss": 3.3187, "step": 42000 }, { "epoch": 2.3123252312325233, "grad_norm": 0.22694003582000732, "learning_rate": 0.000967680630736644, "loss": 3.3112, "step": 43000 }, { "epoch": 2.3661002366100234, "grad_norm": 0.21774055063724518, "learning_rate": 0.0009647417039303365, "loss": 3.3075, "step": 44000 }, { "epoch": 2.419875241987524, "grad_norm": 0.2047697901725769, "learning_rate": 0.0009617998352553542, "loss": 3.2992, "step": 45000 }, { "epoch": 2.4736502473650246, "grad_norm": 0.21876117587089539, "learning_rate": 0.0009588579665803719, "loss": 3.2983, "step": 46000 }, { "epoch": 2.527425252742525, "grad_norm": 0.21647591888904572, "learning_rate": 0.0009559190397740644, "loss": 3.2876, "step": 47000 }, { "epoch": 2.5812002581200257, "grad_norm": 0.20933736860752106, "learning_rate": 0.0009529771710990821, "loss": 3.2814, "step": 48000 }, { "epoch": 2.6349752634975263, "grad_norm": 0.1911548376083374, "learning_rate": 0.0009500382442927748, "loss": 3.2797, "step": 49000 }, { "epoch": 2.688750268875027, "grad_norm": 
0.22081832587718964, "learning_rate": 0.0009470963756177925, "loss": 3.2783, "step": 50000 }, { "epoch": 2.7425252742525275, "grad_norm": 0.21164289116859436, "learning_rate": 0.0009441545069428101, "loss": 3.2752, "step": 51000 }, { "epoch": 2.796300279630028, "grad_norm": 0.21225039660930634, "learning_rate": 0.0009412126382678278, "loss": 3.2681, "step": 52000 }, { "epoch": 2.8500752850075286, "grad_norm": 0.1924898326396942, "learning_rate": 0.0009382707695928455, "loss": 3.2629, "step": 53000 }, { "epoch": 2.903850290385029, "grad_norm": 0.19862565398216248, "learning_rate": 0.000935331842786538, "loss": 3.2634, "step": 54000 }, { "epoch": 2.9576252957625293, "grad_norm": 0.19020138680934906, "learning_rate": 0.0009323899741115557, "loss": 3.2597, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.39273080241152825, "eval_loss": 3.4648334980010986, "eval_runtime": 154.5571, "eval_samples_per_second": 374.729, "eval_steps_per_second": 5.855, "step": 55788 }, { "epoch": 3.0114003011400303, "grad_norm": 0.19856859743595123, "learning_rate": 0.0009294481054365734, "loss": 3.239, "step": 56000 }, { "epoch": 3.0651753065175305, "grad_norm": 0.22371411323547363, "learning_rate": 0.0009265091786302659, "loss": 3.1886, "step": 57000 }, { "epoch": 3.118950311895031, "grad_norm": 0.21350081264972687, "learning_rate": 0.0009235673099552836, "loss": 3.1941, "step": 58000 }, { "epoch": 3.1727253172725316, "grad_norm": 0.219674214720726, "learning_rate": 0.0009206254412803013, "loss": 3.1942, "step": 59000 }, { "epoch": 3.226500322650032, "grad_norm": 0.19072189927101135, "learning_rate": 0.0009176865144739939, "loss": 3.1973, "step": 60000 }, { "epoch": 3.2802753280275327, "grad_norm": 0.205557718873024, "learning_rate": 0.0009147475876676865, "loss": 3.1932, "step": 61000 }, { "epoch": 3.3340503334050333, "grad_norm": 0.2098790556192398, "learning_rate": 0.0009118057189927041, "loss": 3.1935, "step": 62000 }, { "epoch": 3.387825338782534, "grad_norm": 0.196111798286438, "learning_rate": 0.0009088638503177218, "loss": 3.1954, "step": 63000 }, { "epoch": 3.4416003441600345, "grad_norm": 0.19440898299217224, "learning_rate": 0.0009059219816427395, "loss": 3.1924, "step": 64000 }, { "epoch": 3.495375349537535, "grad_norm": 0.21081770956516266, "learning_rate": 0.0009029801129677572, "loss": 3.1952, "step": 65000 }, { "epoch": 3.5491503549150356, "grad_norm": 0.21867215633392334, "learning_rate": 0.0009000411861614498, "loss": 3.195, "step": 66000 }, { "epoch": 3.602925360292536, "grad_norm": 0.22000326216220856, "learning_rate": 0.0008970993174864674, "loss": 3.1911, "step": 67000 }, { "epoch": 3.6567003656700363, "grad_norm": 0.1891467422246933, "learning_rate": 0.0008941574488114851, "loss": 3.1934, "step": 68000 }, { "epoch": 3.7104753710475373, "grad_norm": 0.18787287175655365, "learning_rate": 0.0008912185220051777, "loss": 3.191, "step": 69000 }, { "epoch": 3.7642503764250375, "grad_norm": 0.23694172501564026, "learning_rate": 0.0008882766533301954, "loss": 3.1831, "step": 70000 }, { "epoch": 3.818025381802538, "grad_norm": 0.19812917709350586, "learning_rate": 0.000885334784655213, "loss": 3.1815, "step": 71000 }, { "epoch": 3.8718003871800386, "grad_norm": 0.2005423903465271, "learning_rate": 0.0008823958578489056, "loss": 3.1801, "step": 72000 }, { "epoch": 3.925575392557539, "grad_norm": 0.21525584161281586, "learning_rate": 0.0008794539891739233, "loss": 3.1795, "step": 73000 }, { "epoch": 3.9793503979350398, "grad_norm": 0.19802774488925934, "learning_rate": 0.0008765150623676159, "loss": 
3.1741, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3976694409529698, "eval_loss": 3.419067859649658, "eval_runtime": 155.1328, "eval_samples_per_second": 373.338, "eval_steps_per_second": 5.834, "step": 74384 }, { "epoch": 4.033125403312541, "grad_norm": 0.2355957329273224, "learning_rate": 0.0008735731936926335, "loss": 3.1356, "step": 75000 }, { "epoch": 4.086900408690041, "grad_norm": 0.20892471075057983, "learning_rate": 0.0008706313250176512, "loss": 3.1124, "step": 76000 }, { "epoch": 4.140675414067541, "grad_norm": 0.24330681562423706, "learning_rate": 0.0008676923982113439, "loss": 3.1228, "step": 77000 }, { "epoch": 4.194450419445042, "grad_norm": 0.30532753467559814, "learning_rate": 0.0008647505295363614, "loss": 3.1191, "step": 78000 }, { "epoch": 4.248225424822542, "grad_norm": 0.2023121416568756, "learning_rate": 0.0008618116027300541, "loss": 3.1185, "step": 79000 }, { "epoch": 4.302000430200043, "grad_norm": 0.20023038983345032, "learning_rate": 0.0008588697340550719, "loss": 3.1248, "step": 80000 }, { "epoch": 4.355775435577543, "grad_norm": 0.20664258301258087, "learning_rate": 0.0008559278653800895, "loss": 3.1305, "step": 81000 }, { "epoch": 4.409550440955044, "grad_norm": 0.21807469427585602, "learning_rate": 0.0008529889385737821, "loss": 3.1265, "step": 82000 }, { "epoch": 4.4633254463325445, "grad_norm": 0.20922619104385376, "learning_rate": 0.0008500470698987998, "loss": 3.1287, "step": 83000 }, { "epoch": 4.5171004517100455, "grad_norm": 0.22318531572818756, "learning_rate": 0.0008471052012238174, "loss": 3.1265, "step": 84000 }, { "epoch": 4.570875457087546, "grad_norm": 0.20071184635162354, "learning_rate": 0.000844163332548835, "loss": 3.1244, "step": 85000 }, { "epoch": 4.624650462465047, "grad_norm": 0.23887498676776886, "learning_rate": 0.0008412244057425277, "loss": 3.1309, "step": 86000 }, { "epoch": 4.678425467842547, "grad_norm": 0.21280068159103394, "learning_rate": 0.0008382825370675454, "loss": 3.1261, "step": 87000 }, { "epoch": 4.732200473220047, "grad_norm": 0.20855990052223206, "learning_rate": 0.0008353406683925629, "loss": 3.1227, "step": 88000 }, { "epoch": 4.785975478597548, "grad_norm": 0.23701632022857666, "learning_rate": 0.0008324017415862556, "loss": 3.1274, "step": 89000 }, { "epoch": 4.839750483975048, "grad_norm": 0.22062337398529053, "learning_rate": 0.0008294598729112733, "loss": 3.1259, "step": 90000 }, { "epoch": 4.893525489352549, "grad_norm": 0.21007812023162842, "learning_rate": 0.0008265209461049658, "loss": 3.1241, "step": 91000 }, { "epoch": 4.947300494730049, "grad_norm": 0.3277081847190857, "learning_rate": 0.0008235790774299835, "loss": 3.1213, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.4008789177643117, "eval_loss": 3.396653652191162, "eval_runtime": 155.4385, "eval_samples_per_second": 372.604, "eval_steps_per_second": 5.822, "step": 92980 }, { "epoch": 5.00107550010755, "grad_norm": 0.20310255885124207, "learning_rate": 0.0008206401506236762, "loss": 3.1191, "step": 93000 }, { "epoch": 5.05485050548505, "grad_norm": 0.20080606639385223, "learning_rate": 0.0008176982819486937, "loss": 3.0521, "step": 94000 }, { "epoch": 5.108625510862551, "grad_norm": 0.21395424008369446, "learning_rate": 0.0008147593551423864, "loss": 3.0656, "step": 95000 }, { "epoch": 5.1624005162400515, "grad_norm": 0.22563432157039642, "learning_rate": 0.0008118174864674042, "loss": 3.0641, "step": 96000 }, { "epoch": 5.2161755216175525, "grad_norm": 0.21297597885131836, "learning_rate": 0.0008088785596610967, "loss": 3.07, "step": 
97000 }, { "epoch": 5.269950526995053, "grad_norm": 0.20302899181842804, "learning_rate": 0.0008059366909861144, "loss": 3.0717, "step": 98000 }, { "epoch": 5.323725532372554, "grad_norm": 0.2152853012084961, "learning_rate": 0.0008029948223111321, "loss": 3.0759, "step": 99000 }, { "epoch": 5.377500537750054, "grad_norm": 0.2148328423500061, "learning_rate": 0.0008000529536361497, "loss": 3.0728, "step": 100000 }, { "epoch": 5.431275543127555, "grad_norm": 0.20232610404491425, "learning_rate": 0.0007971140268298423, "loss": 3.0798, "step": 101000 }, { "epoch": 5.485050548505055, "grad_norm": 0.22732730209827423, "learning_rate": 0.000794175100023535, "loss": 3.0756, "step": 102000 }, { "epoch": 5.538825553882555, "grad_norm": 0.2203952670097351, "learning_rate": 0.0007912332313485526, "loss": 3.0776, "step": 103000 }, { "epoch": 5.592600559260056, "grad_norm": 0.21848390996456146, "learning_rate": 0.0007882943045422453, "loss": 3.0763, "step": 104000 }, { "epoch": 5.646375564637556, "grad_norm": 0.22204072773456573, "learning_rate": 0.0007853524358672629, "loss": 3.0797, "step": 105000 }, { "epoch": 5.700150570015057, "grad_norm": 0.20933043956756592, "learning_rate": 0.0007824135090609555, "loss": 3.0763, "step": 106000 }, { "epoch": 5.753925575392557, "grad_norm": 0.19925065338611603, "learning_rate": 0.0007794716403859732, "loss": 3.0802, "step": 107000 }, { "epoch": 5.807700580770058, "grad_norm": 0.20748205482959747, "learning_rate": 0.0007765297717109908, "loss": 3.081, "step": 108000 }, { "epoch": 5.8614755861475585, "grad_norm": 0.2089342474937439, "learning_rate": 0.0007735908449046834, "loss": 3.0787, "step": 109000 }, { "epoch": 5.9152505915250595, "grad_norm": 0.20147345960140228, "learning_rate": 0.0007706489762297011, "loss": 3.0829, "step": 110000 }, { "epoch": 5.96902559690256, "grad_norm": 0.2211214154958725, "learning_rate": 0.0007677071075547188, "loss": 3.0783, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.405020628943781, "eval_loss": 3.3772811889648438, "eval_runtime": 156.2536, "eval_samples_per_second": 370.66, "eval_steps_per_second": 5.792, "step": 111576 }, { "epoch": 6.022800602280061, "grad_norm": 0.21148885786533356, "learning_rate": 0.0007647681807484113, "loss": 3.0451, "step": 112000 }, { "epoch": 6.076575607657561, "grad_norm": 0.2195775806903839, "learning_rate": 0.000761826312073429, "loss": 3.0147, "step": 113000 }, { "epoch": 6.130350613035061, "grad_norm": 0.20522399246692657, "learning_rate": 0.0007588873852671217, "loss": 3.0194, "step": 114000 }, { "epoch": 6.184125618412562, "grad_norm": 0.20723003149032593, "learning_rate": 0.0007559455165921393, "loss": 3.026, "step": 115000 }, { "epoch": 6.237900623790062, "grad_norm": 0.23514005541801453, "learning_rate": 0.000753003647917157, "loss": 3.0231, "step": 116000 }, { "epoch": 6.291675629167563, "grad_norm": 0.20580914616584778, "learning_rate": 0.0007500647211108497, "loss": 3.0321, "step": 117000 }, { "epoch": 6.345450634545063, "grad_norm": 0.2240120768547058, "learning_rate": 0.0007471228524358674, "loss": 3.0332, "step": 118000 }, { "epoch": 6.399225639922564, "grad_norm": 0.23184897005558014, "learning_rate": 0.0007441839256295599, "loss": 3.0369, "step": 119000 }, { "epoch": 6.453000645300064, "grad_norm": 0.22646069526672363, "learning_rate": 0.0007412449988232526, "loss": 3.0357, "step": 120000 }, { "epoch": 6.506775650677565, "grad_norm": 0.21927151083946228, "learning_rate": 0.0007383031301482702, "loss": 3.0398, "step": 121000 }, { "epoch": 6.5605506560550655, "grad_norm": 
0.24726586043834686, "learning_rate": 0.0007353612614732878, "loss": 3.0373, "step": 122000 }, { "epoch": 6.6143256614325665, "grad_norm": 0.21686062216758728, "learning_rate": 0.0007324193927983055, "loss": 3.0399, "step": 123000 }, { "epoch": 6.668100666810067, "grad_norm": 0.21142247319221497, "learning_rate": 0.0007294804659919982, "loss": 3.0446, "step": 124000 }, { "epoch": 6.721875672187567, "grad_norm": 0.21460475027561188, "learning_rate": 0.0007265385973170157, "loss": 3.0403, "step": 125000 }, { "epoch": 6.775650677565068, "grad_norm": 0.22398121654987335, "learning_rate": 0.0007235967286420334, "loss": 3.0426, "step": 126000 }, { "epoch": 6.829425682942568, "grad_norm": 0.23123160004615784, "learning_rate": 0.0007206548599670511, "loss": 3.0443, "step": 127000 }, { "epoch": 6.883200688320069, "grad_norm": 0.21254226565361023, "learning_rate": 0.0007177159331607437, "loss": 3.0424, "step": 128000 }, { "epoch": 6.936975693697569, "grad_norm": 0.21302445232868195, "learning_rate": 0.0007147740644857613, "loss": 3.0472, "step": 129000 }, { "epoch": 6.99075069907507, "grad_norm": 0.2217877358198166, "learning_rate": 0.0007118321958107791, "loss": 3.0456, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.4055193299898036, "eval_loss": 3.3825550079345703, "eval_runtime": 155.2208, "eval_samples_per_second": 373.126, "eval_steps_per_second": 5.83, "step": 130172 }, { "epoch": 7.04452570445257, "grad_norm": 0.23532521724700928, "learning_rate": 0.0007088903271357967, "loss": 2.9839, "step": 131000 }, { "epoch": 7.098300709830071, "grad_norm": 0.22214515507221222, "learning_rate": 0.0007059514003294893, "loss": 2.9765, "step": 132000 }, { "epoch": 7.152075715207571, "grad_norm": 0.2383100390434265, "learning_rate": 0.0007030154153918568, "loss": 2.9843, "step": 133000 }, { "epoch": 7.205850720585072, "grad_norm": 0.22472046315670013, "learning_rate": 0.0007000735467168746, "loss": 2.9925, "step": 134000 }, { "epoch": 7.2596257259625725, "grad_norm": 0.26296138763427734, "learning_rate": 0.0006971316780418923, "loss": 2.997, "step": 135000 }, { "epoch": 7.3134007313400735, "grad_norm": 0.2494724839925766, "learning_rate": 0.0006941898093669099, "loss": 2.997, "step": 136000 }, { "epoch": 7.367175736717574, "grad_norm": 0.22137367725372314, "learning_rate": 0.0006912508825606025, "loss": 2.9973, "step": 137000 }, { "epoch": 7.420950742095075, "grad_norm": 0.22704289853572845, "learning_rate": 0.0006883090138856202, "loss": 3.0066, "step": 138000 }, { "epoch": 7.474725747472575, "grad_norm": 0.2145918905735016, "learning_rate": 0.0006853700870793128, "loss": 3.0054, "step": 139000 }, { "epoch": 7.528500752850075, "grad_norm": 0.21607990562915802, "learning_rate": 0.0006824282184043304, "loss": 3.0018, "step": 140000 }, { "epoch": 7.582275758227576, "grad_norm": 0.2057826817035675, "learning_rate": 0.0006794863497293481, "loss": 3.0101, "step": 141000 }, { "epoch": 7.636050763605076, "grad_norm": 0.23032937943935394, "learning_rate": 0.0006765474229230408, "loss": 3.0099, "step": 142000 }, { "epoch": 7.689825768982577, "grad_norm": 0.22495923936367035, "learning_rate": 0.0006736055542480583, "loss": 3.008, "step": 143000 }, { "epoch": 7.743600774360077, "grad_norm": 0.2345353364944458, "learning_rate": 0.000670666627441751, "loss": 3.0099, "step": 144000 }, { "epoch": 7.797375779737578, "grad_norm": 0.23005186021327972, "learning_rate": 0.0006677277006354437, "loss": 3.0125, "step": 145000 }, { "epoch": 7.851150785115078, "grad_norm": 0.29431313276290894, "learning_rate": 
0.0006647858319604612, "loss": 3.0107, "step": 146000 }, { "epoch": 7.904925790492579, "grad_norm": 0.2245541363954544, "learning_rate": 0.0006618439632854789, "loss": 3.0133, "step": 147000 }, { "epoch": 7.9587007958700795, "grad_norm": 0.22786079347133636, "learning_rate": 0.0006589020946104966, "loss": 3.0126, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.40771172002548395, "eval_loss": 3.3547439575195312, "eval_runtime": 154.7201, "eval_samples_per_second": 374.334, "eval_steps_per_second": 5.849, "step": 148768 }, { "epoch": 8.01247580124758, "grad_norm": 0.2346237152814865, "learning_rate": 0.0006559602259355142, "loss": 2.9965, "step": 149000 }, { "epoch": 8.066250806625082, "grad_norm": 0.23346728086471558, "learning_rate": 0.0006530212991292069, "loss": 2.9479, "step": 150000 }, { "epoch": 8.12002581200258, "grad_norm": 0.24378512799739838, "learning_rate": 0.0006500794304542246, "loss": 2.9513, "step": 151000 }, { "epoch": 8.173800817380082, "grad_norm": 0.24439002573490143, "learning_rate": 0.0006471405036479172, "loss": 2.9601, "step": 152000 }, { "epoch": 8.227575822757583, "grad_norm": 0.27081623673439026, "learning_rate": 0.0006441986349729348, "loss": 2.9631, "step": 153000 }, { "epoch": 8.281350828135082, "grad_norm": 0.2521245777606964, "learning_rate": 0.0006412626500353025, "loss": 2.966, "step": 154000 }, { "epoch": 8.335125833512583, "grad_norm": 0.21975190937519073, "learning_rate": 0.0006383207813603201, "loss": 2.9678, "step": 155000 }, { "epoch": 8.388900838890084, "grad_norm": 0.2267887145280838, "learning_rate": 0.0006353789126853378, "loss": 2.9696, "step": 156000 }, { "epoch": 8.442675844267585, "grad_norm": 0.218279168009758, "learning_rate": 0.0006324370440103554, "loss": 2.9713, "step": 157000 }, { "epoch": 8.496450849645084, "grad_norm": 0.23300865292549133, "learning_rate": 0.0006294951753353731, "loss": 2.9772, "step": 158000 }, { "epoch": 8.550225855022585, "grad_norm": 0.21749693155288696, "learning_rate": 0.0006265562485290657, "loss": 2.9773, "step": 159000 }, { "epoch": 8.604000860400086, "grad_norm": 0.26928380131721497, "learning_rate": 0.0006236143798540833, "loss": 2.978, "step": 160000 }, { "epoch": 8.657775865777587, "grad_norm": 0.22122180461883545, "learning_rate": 0.000620672511179101, "loss": 2.9794, "step": 161000 }, { "epoch": 8.711550871155087, "grad_norm": 0.22700442373752594, "learning_rate": 0.0006177306425041186, "loss": 2.9824, "step": 162000 }, { "epoch": 8.765325876532588, "grad_norm": 0.2541004419326782, "learning_rate": 0.0006147917156978112, "loss": 2.9841, "step": 163000 }, { "epoch": 8.819100881910089, "grad_norm": 0.2551893889904022, "learning_rate": 0.0006118498470228289, "loss": 2.9837, "step": 164000 }, { "epoch": 8.872875887287588, "grad_norm": 0.25604966282844543, "learning_rate": 0.0006089079783478466, "loss": 2.984, "step": 165000 }, { "epoch": 8.926650892665089, "grad_norm": 0.24571265280246735, "learning_rate": 0.0006059719934102142, "loss": 2.9849, "step": 166000 }, { "epoch": 8.98042589804259, "grad_norm": 0.24128392338752747, "learning_rate": 0.0006030301247352318, "loss": 2.9843, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.4083424360998555, "eval_loss": 3.3613698482513428, "eval_runtime": 155.0364, "eval_samples_per_second": 373.57, "eval_steps_per_second": 5.837, "step": 167364 }, { "epoch": 9.034200903420091, "grad_norm": 0.24964158236980438, "learning_rate": 0.0006000882560602495, "loss": 2.9432, "step": 168000 }, { "epoch": 9.08797590879759, "grad_norm": 0.2405262142419815, "learning_rate": 
0.0005971463873852672, "loss": 2.922, "step": 169000 }, { "epoch": 9.141750914175091, "grad_norm": 0.22288870811462402, "learning_rate": 0.0005942045187102848, "loss": 2.9269, "step": 170000 }, { "epoch": 9.195525919552592, "grad_norm": 0.3041359484195709, "learning_rate": 0.0005912626500353024, "loss": 2.9342, "step": 171000 }, { "epoch": 9.249300924930093, "grad_norm": 0.22550632059574127, "learning_rate": 0.0005883237232289951, "loss": 2.9392, "step": 172000 }, { "epoch": 9.303075930307593, "grad_norm": 0.23584921658039093, "learning_rate": 0.0005853818545540127, "loss": 2.9422, "step": 173000 }, { "epoch": 9.356850935685094, "grad_norm": 0.2634640634059906, "learning_rate": 0.0005824458696163803, "loss": 2.945, "step": 174000 }, { "epoch": 9.410625941062595, "grad_norm": 0.2354883849620819, "learning_rate": 0.000579504000941398, "loss": 2.9485, "step": 175000 }, { "epoch": 9.464400946440094, "grad_norm": 0.26491352915763855, "learning_rate": 0.0005765621322664157, "loss": 2.9475, "step": 176000 }, { "epoch": 9.518175951817595, "grad_norm": 0.2462054342031479, "learning_rate": 0.0005736202635914332, "loss": 2.9482, "step": 177000 }, { "epoch": 9.571950957195096, "grad_norm": 0.2643495202064514, "learning_rate": 0.0005706783949164509, "loss": 2.9502, "step": 178000 }, { "epoch": 9.625725962572597, "grad_norm": 0.24029669165611267, "learning_rate": 0.0005677365262414686, "loss": 2.9518, "step": 179000 }, { "epoch": 9.679500967950096, "grad_norm": 0.2550260126590729, "learning_rate": 0.0005648005413038361, "loss": 2.9571, "step": 180000 }, { "epoch": 9.733275973327597, "grad_norm": 0.23675589263439178, "learning_rate": 0.0005618586726288538, "loss": 2.9543, "step": 181000 }, { "epoch": 9.787050978705098, "grad_norm": 0.2279191017150879, "learning_rate": 0.0005589197458225465, "loss": 2.9593, "step": 182000 }, { "epoch": 9.8408259840826, "grad_norm": 0.27392587065696716, "learning_rate": 0.0005559778771475641, "loss": 2.9561, "step": 183000 }, { "epoch": 9.894600989460098, "grad_norm": 0.2388741672039032, "learning_rate": 0.0005530360084725818, "loss": 2.9565, "step": 184000 }, { "epoch": 9.9483759948376, "grad_norm": 0.2503463327884674, "learning_rate": 0.0005500970816662745, "loss": 2.9592, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.40854156716551776, "eval_loss": 3.377901077270508, "eval_runtime": 154.6842, "eval_samples_per_second": 374.421, "eval_steps_per_second": 5.851, "step": 185960 }, { "epoch": 10.0021510002151, "grad_norm": 0.2553974688053131, "learning_rate": 0.0005471552129912922, "loss": 2.9593, "step": 186000 }, { "epoch": 10.055926005592601, "grad_norm": 0.25415316224098206, "learning_rate": 0.0005442133443163097, "loss": 2.8931, "step": 187000 }, { "epoch": 10.1097010109701, "grad_norm": 0.2477007806301117, "learning_rate": 0.0005412744175100024, "loss": 2.8999, "step": 188000 }, { "epoch": 10.163476016347602, "grad_norm": 0.23852670192718506, "learning_rate": 0.0005383325488350201, "loss": 2.9028, "step": 189000 }, { "epoch": 10.217251021725103, "grad_norm": 0.2484176605939865, "learning_rate": 0.0005353906801600376, "loss": 2.9117, "step": 190000 }, { "epoch": 10.271026027102602, "grad_norm": 0.27494022250175476, "learning_rate": 0.0005324517533537303, "loss": 2.9133, "step": 191000 }, { "epoch": 10.324801032480103, "grad_norm": 0.2577020823955536, "learning_rate": 0.000529509884678748, "loss": 2.9214, "step": 192000 }, { "epoch": 10.378576037857604, "grad_norm": 0.2292626053094864, "learning_rate": 0.0005265680160037656, "loss": 2.9208, "step": 193000 }, { 
"epoch": 10.432351043235105, "grad_norm": 0.267873615026474, "learning_rate": 0.0005236290891974582, "loss": 2.9213, "step": 194000 }, { "epoch": 10.486126048612604, "grad_norm": 0.24749253690242767, "learning_rate": 0.0005206872205224759, "loss": 2.9239, "step": 195000 }, { "epoch": 10.539901053990105, "grad_norm": 0.26779085397720337, "learning_rate": 0.0005177453518474935, "loss": 2.929, "step": 196000 }, { "epoch": 10.593676059367606, "grad_norm": 0.2552465796470642, "learning_rate": 0.0005148064250411861, "loss": 2.9309, "step": 197000 }, { "epoch": 10.647451064745107, "grad_norm": 0.2551726996898651, "learning_rate": 0.0005118645563662038, "loss": 2.9287, "step": 198000 }, { "epoch": 10.701226070122607, "grad_norm": 0.24481531977653503, "learning_rate": 0.0005089256295598964, "loss": 2.9326, "step": 199000 }, { "epoch": 10.755001075500108, "grad_norm": 0.2609283924102783, "learning_rate": 0.0005059837608849142, "loss": 2.9332, "step": 200000 }, { "epoch": 10.808776080877609, "grad_norm": 0.22893798351287842, "learning_rate": 0.0005030418922099318, "loss": 2.9361, "step": 201000 }, { "epoch": 10.86255108625511, "grad_norm": 0.24516218900680542, "learning_rate": 0.0005001029654036244, "loss": 2.9358, "step": 202000 }, { "epoch": 10.916326091632609, "grad_norm": 0.23790410161018372, "learning_rate": 0.0004971610967286421, "loss": 2.9344, "step": 203000 }, { "epoch": 10.97010109701011, "grad_norm": 0.2549736797809601, "learning_rate": 0.0004942221699223347, "loss": 2.9367, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.40993629082379995, "eval_loss": 3.3604304790496826, "eval_runtime": 154.4315, "eval_samples_per_second": 375.034, "eval_steps_per_second": 5.86, "step": 204556 }, { "epoch": 11.02387610238761, "grad_norm": 0.2493738979101181, "learning_rate": 0.0004912803012473523, "loss": 2.9057, "step": 205000 }, { "epoch": 11.07765110776511, "grad_norm": 0.245314821600914, "learning_rate": 0.00048833843257237, "loss": 2.8775, "step": 206000 }, { "epoch": 11.131426113142611, "grad_norm": 0.24536246061325073, "learning_rate": 0.0004853995057660626, "loss": 2.8783, "step": 207000 }, { "epoch": 11.185201118520112, "grad_norm": 0.26116108894348145, "learning_rate": 0.00048245763709108024, "loss": 2.8851, "step": 208000 }, { "epoch": 11.238976123897613, "grad_norm": 0.24136824905872345, "learning_rate": 0.0004795187102847729, "loss": 2.8918, "step": 209000 }, { "epoch": 11.292751129275112, "grad_norm": 0.275006502866745, "learning_rate": 0.0004765768416097905, "loss": 2.8951, "step": 210000 }, { "epoch": 11.346526134652613, "grad_norm": 0.27772292494773865, "learning_rate": 0.0004736349729348082, "loss": 2.8973, "step": 211000 }, { "epoch": 11.400301140030114, "grad_norm": 0.2865879237651825, "learning_rate": 0.0004706960461285008, "loss": 2.8977, "step": 212000 }, { "epoch": 11.454076145407615, "grad_norm": 0.24032117426395416, "learning_rate": 0.00046775711932219347, "loss": 2.9002, "step": 213000 }, { "epoch": 11.507851150785115, "grad_norm": 0.24863946437835693, "learning_rate": 0.00046481525064721114, "loss": 2.9052, "step": 214000 }, { "epoch": 11.561626156162616, "grad_norm": 0.25115910172462463, "learning_rate": 0.00046187338197222876, "loss": 2.9081, "step": 215000 }, { "epoch": 11.615401161540117, "grad_norm": 0.25204330682754517, "learning_rate": 0.00045893151329724644, "loss": 2.906, "step": 216000 }, { "epoch": 11.669176166917616, "grad_norm": 0.25553619861602783, "learning_rate": 0.00045599258649093905, "loss": 2.9074, "step": 217000 }, { "epoch": 11.722951172295117, 
"grad_norm": 0.24004510045051575, "learning_rate": 0.00045305071781595667, "loss": 2.9105, "step": 218000 }, { "epoch": 11.776726177672618, "grad_norm": 0.28220391273498535, "learning_rate": 0.00045010884914097434, "loss": 2.9075, "step": 219000 }, { "epoch": 11.830501183050119, "grad_norm": 0.2582526206970215, "learning_rate": 0.00044716992233466695, "loss": 2.9129, "step": 220000 }, { "epoch": 11.884276188427618, "grad_norm": 0.27006328105926514, "learning_rate": 0.0004442309955283596, "loss": 2.9146, "step": 221000 }, { "epoch": 11.93805119380512, "grad_norm": 0.3025253415107727, "learning_rate": 0.0004412891268533773, "loss": 2.9161, "step": 222000 }, { "epoch": 11.99182619918262, "grad_norm": 0.2534899115562439, "learning_rate": 0.00043834725817839496, "loss": 2.9145, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.40973608482660917, "eval_loss": 3.3759312629699707, "eval_runtime": 154.6971, "eval_samples_per_second": 374.39, "eval_steps_per_second": 5.85, "step": 223152 }, { "epoch": 12.045601204560121, "grad_norm": 0.27566835284233093, "learning_rate": 0.0004354053895034126, "loss": 2.8626, "step": 224000 }, { "epoch": 12.09937620993762, "grad_norm": 0.243771031498909, "learning_rate": 0.0004324664626971052, "loss": 2.8583, "step": 225000 }, { "epoch": 12.153151215315122, "grad_norm": 0.26204991340637207, "learning_rate": 0.00042952459402212286, "loss": 2.8654, "step": 226000 }, { "epoch": 12.206926220692623, "grad_norm": 0.2582913637161255, "learning_rate": 0.0004265827253471405, "loss": 2.8673, "step": 227000 }, { "epoch": 12.260701226070122, "grad_norm": 0.27284786105155945, "learning_rate": 0.00042364674040950813, "loss": 2.8735, "step": 228000 }, { "epoch": 12.314476231447623, "grad_norm": 0.25654593110084534, "learning_rate": 0.00042070487173452575, "loss": 2.8751, "step": 229000 }, { "epoch": 12.368251236825124, "grad_norm": 0.23449215292930603, "learning_rate": 0.0004177630030595435, "loss": 2.8769, "step": 230000 }, { "epoch": 12.422026242202625, "grad_norm": 0.24525216221809387, "learning_rate": 0.00041482407625323604, "loss": 2.8763, "step": 231000 }, { "epoch": 12.475801247580124, "grad_norm": 0.26022928953170776, "learning_rate": 0.0004118822075782537, "loss": 2.8811, "step": 232000 }, { "epoch": 12.529576252957625, "grad_norm": 0.24519123136997223, "learning_rate": 0.0004089403389032714, "loss": 2.8844, "step": 233000 }, { "epoch": 12.583351258335126, "grad_norm": 0.3118698298931122, "learning_rate": 0.000405998470228289, "loss": 2.8837, "step": 234000 }, { "epoch": 12.637126263712627, "grad_norm": 0.28287139534950256, "learning_rate": 0.0004030566015533066, "loss": 2.8843, "step": 235000 }, { "epoch": 12.690901269090126, "grad_norm": 0.24490605294704437, "learning_rate": 0.0004001176747469993, "loss": 2.8868, "step": 236000 }, { "epoch": 12.744676274467627, "grad_norm": 0.2810141444206238, "learning_rate": 0.00039717580607201696, "loss": 2.8905, "step": 237000 }, { "epoch": 12.798451279845128, "grad_norm": 0.2660733759403229, "learning_rate": 0.0003942368792657096, "loss": 2.8917, "step": 238000 }, { "epoch": 12.852226285222628, "grad_norm": 0.2440771758556366, "learning_rate": 0.00039129501059072724, "loss": 2.888, "step": 239000 }, { "epoch": 12.906001290600129, "grad_norm": 0.25520211458206177, "learning_rate": 0.0003883560837844199, "loss": 2.8938, "step": 240000 }, { "epoch": 12.95977629597763, "grad_norm": 0.2661096751689911, "learning_rate": 0.0003854142151094375, "loss": 2.8924, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4095556978794759, 
"eval_loss": 3.3856160640716553, "eval_runtime": 154.4788, "eval_samples_per_second": 374.919, "eval_steps_per_second": 5.858, "step": 241748 }, { "epoch": 13.01355130135513, "grad_norm": 0.29137110710144043, "learning_rate": 0.00038247234643445514, "loss": 2.8796, "step": 242000 }, { "epoch": 13.06732630673263, "grad_norm": 0.26854678988456726, "learning_rate": 0.0003795304777594728, "loss": 2.837, "step": 243000 }, { "epoch": 13.121101312110131, "grad_norm": 0.2614552974700928, "learning_rate": 0.00037659155095316543, "loss": 2.8409, "step": 244000 }, { "epoch": 13.174876317487632, "grad_norm": 0.3031397759914398, "learning_rate": 0.00037364968227818316, "loss": 2.8458, "step": 245000 }, { "epoch": 13.228651322865133, "grad_norm": 0.2507542073726654, "learning_rate": 0.0003707107554718757, "loss": 2.8537, "step": 246000 }, { "epoch": 13.282426328242632, "grad_norm": 0.2524189054965973, "learning_rate": 0.0003677688867968934, "loss": 2.852, "step": 247000 }, { "epoch": 13.336201333620133, "grad_norm": 0.27850231528282166, "learning_rate": 0.00036482701812191106, "loss": 2.8574, "step": 248000 }, { "epoch": 13.389976338997634, "grad_norm": 0.2408652901649475, "learning_rate": 0.00036188809131560367, "loss": 2.8563, "step": 249000 }, { "epoch": 13.443751344375134, "grad_norm": 0.28609830141067505, "learning_rate": 0.00035894622264062134, "loss": 2.8627, "step": 250000 }, { "epoch": 13.497526349752635, "grad_norm": 0.2690850496292114, "learning_rate": 0.00035600435396563896, "loss": 2.8619, "step": 251000 }, { "epoch": 13.551301355130136, "grad_norm": 0.2862522304058075, "learning_rate": 0.0003530654271593316, "loss": 2.8646, "step": 252000 }, { "epoch": 13.605076360507637, "grad_norm": 0.2629512548446655, "learning_rate": 0.0003501235584843493, "loss": 2.865, "step": 253000 }, { "epoch": 13.658851365885136, "grad_norm": 0.2542857825756073, "learning_rate": 0.00034718463167804185, "loss": 2.865, "step": 254000 }, { "epoch": 13.712626371262637, "grad_norm": 0.258798211812973, "learning_rate": 0.0003442427630030596, "loss": 2.8686, "step": 255000 }, { "epoch": 13.766401376640138, "grad_norm": 0.2492339164018631, "learning_rate": 0.0003413008943280772, "loss": 2.8689, "step": 256000 }, { "epoch": 13.820176382017639, "grad_norm": 0.29779887199401855, "learning_rate": 0.0003383590256530948, "loss": 2.8691, "step": 257000 }, { "epoch": 13.873951387395138, "grad_norm": 0.2670515179634094, "learning_rate": 0.0003354200988467875, "loss": 2.8711, "step": 258000 }, { "epoch": 13.92772639277264, "grad_norm": 0.26957201957702637, "learning_rate": 0.0003324782301718051, "loss": 2.8711, "step": 259000 }, { "epoch": 13.98150139815014, "grad_norm": 0.2824675142765045, "learning_rate": 0.00032953636149682283, "loss": 2.8757, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.410457699798363, "eval_loss": 3.384411573410034, "eval_runtime": 154.9443, "eval_samples_per_second": 373.792, "eval_steps_per_second": 5.841, "step": 260344 }, { "epoch": 14.035276403527641, "grad_norm": 0.29713505506515503, "learning_rate": 0.00032659449282184045, "loss": 2.8354, "step": 261000 }, { "epoch": 14.08905140890514, "grad_norm": 0.27730679512023926, "learning_rate": 0.00032365850788420805, "loss": 2.8198, "step": 262000 }, { "epoch": 14.142826414282641, "grad_norm": 0.29409125447273254, "learning_rate": 0.0003207166392092257, "loss": 2.824, "step": 263000 }, { "epoch": 14.196601419660142, "grad_norm": 0.280676931142807, "learning_rate": 0.00031777477053424334, "loss": 2.8301, "step": 264000 }, { "epoch": 
14.250376425037642, "grad_norm": 0.3202875554561615, "learning_rate": 0.000314835843727936, "loss": 2.8271, "step": 265000 }, { "epoch": 14.304151430415143, "grad_norm": 0.2673070728778839, "learning_rate": 0.0003118939750529536, "loss": 2.8376, "step": 266000 }, { "epoch": 14.357926435792644, "grad_norm": 0.2868790030479431, "learning_rate": 0.0003089550482466463, "loss": 2.8416, "step": 267000 }, { "epoch": 14.411701441170145, "grad_norm": 0.2689562737941742, "learning_rate": 0.0003060131795716639, "loss": 2.8408, "step": 268000 }, { "epoch": 14.465476446547644, "grad_norm": 0.2907434403896332, "learning_rate": 0.0003030713108966815, "loss": 2.8429, "step": 269000 }, { "epoch": 14.519251451925145, "grad_norm": 0.26338914036750793, "learning_rate": 0.00030012944222169925, "loss": 2.8465, "step": 270000 }, { "epoch": 14.573026457302646, "grad_norm": 0.2925278842449188, "learning_rate": 0.00029719051541539186, "loss": 2.8444, "step": 271000 }, { "epoch": 14.626801462680147, "grad_norm": 0.2883965075016022, "learning_rate": 0.00029424864674040954, "loss": 2.8466, "step": 272000 }, { "epoch": 14.680576468057646, "grad_norm": 0.28422123193740845, "learning_rate": 0.00029130971993410215, "loss": 2.8458, "step": 273000 }, { "epoch": 14.734351473435147, "grad_norm": 0.28057223558425903, "learning_rate": 0.0002883678512591198, "loss": 2.8512, "step": 274000 }, { "epoch": 14.788126478812648, "grad_norm": 0.2739641070365906, "learning_rate": 0.00028542892445281243, "loss": 2.8508, "step": 275000 }, { "epoch": 14.84190148419015, "grad_norm": 0.26283150911331177, "learning_rate": 0.00028248705577783005, "loss": 2.8491, "step": 276000 }, { "epoch": 14.895676489567649, "grad_norm": 0.25209805369377136, "learning_rate": 0.0002795451871028477, "loss": 2.8528, "step": 277000 }, { "epoch": 14.94945149494515, "grad_norm": 0.29996606707572937, "learning_rate": 0.0002766033184278654, "loss": 2.8545, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.4106898178253074, "eval_loss": 3.383195400238037, "eval_runtime": 154.9853, "eval_samples_per_second": 373.693, "eval_steps_per_second": 5.839, "step": 278940 }, { "epoch": 15.00322650032265, "grad_norm": 0.30987992882728577, "learning_rate": 0.00027366439162155806, "loss": 2.8502, "step": 279000 }, { "epoch": 15.05700150570015, "grad_norm": 0.2791779041290283, "learning_rate": 0.0002707225229465757, "loss": 2.8054, "step": 280000 }, { "epoch": 15.11077651107765, "grad_norm": 0.29352006316185, "learning_rate": 0.00026778359614026834, "loss": 2.8087, "step": 281000 }, { "epoch": 15.164551516455152, "grad_norm": 0.2739843428134918, "learning_rate": 0.00026484466933396095, "loss": 2.8134, "step": 282000 }, { "epoch": 15.218326521832653, "grad_norm": 0.29397326707839966, "learning_rate": 0.00026190280065897857, "loss": 2.814, "step": 283000 }, { "epoch": 15.272101527210152, "grad_norm": 0.2891228199005127, "learning_rate": 0.00025896387385267123, "loss": 2.8154, "step": 284000 }, { "epoch": 15.325876532587653, "grad_norm": 0.2697419226169586, "learning_rate": 0.00025602200517768885, "loss": 2.815, "step": 285000 }, { "epoch": 15.379651537965154, "grad_norm": 0.27169767022132874, "learning_rate": 0.00025308013650270653, "loss": 2.8179, "step": 286000 }, { "epoch": 15.433426543342655, "grad_norm": 0.28420206904411316, "learning_rate": 0.0002501382678277242, "loss": 2.8242, "step": 287000 }, { "epoch": 15.487201548720154, "grad_norm": 0.29990944266319275, "learning_rate": 0.0002471993410214168, "loss": 2.8284, "step": 288000 }, { "epoch": 15.540976554097655, 
"grad_norm": 0.29358208179473877, "learning_rate": 0.0002442574723464345, "loss": 2.8242, "step": 289000 }, { "epoch": 15.594751559475156, "grad_norm": 0.2709376811981201, "learning_rate": 0.0002413156036714521, "loss": 2.8323, "step": 290000 }, { "epoch": 15.648526564852656, "grad_norm": 0.27639514207839966, "learning_rate": 0.00023837961873381973, "loss": 2.8295, "step": 291000 }, { "epoch": 15.702301570230157, "grad_norm": 0.27499568462371826, "learning_rate": 0.00023543775005883738, "loss": 2.8273, "step": 292000 }, { "epoch": 15.756076575607658, "grad_norm": 0.2614821493625641, "learning_rate": 0.00023249588138385502, "loss": 2.8302, "step": 293000 }, { "epoch": 15.809851580985159, "grad_norm": 0.30566710233688354, "learning_rate": 0.00022955695457754766, "loss": 2.8315, "step": 294000 }, { "epoch": 15.863626586362658, "grad_norm": 0.28071853518486023, "learning_rate": 0.00022661508590256533, "loss": 2.8282, "step": 295000 }, { "epoch": 15.917401591740159, "grad_norm": 0.26871344447135925, "learning_rate": 0.00022367321722758295, "loss": 2.8318, "step": 296000 }, { "epoch": 15.97117659711766, "grad_norm": 0.27507010102272034, "learning_rate": 0.0002207313485526006, "loss": 2.8339, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.4098433764298017, "eval_loss": 3.4079394340515137, "eval_runtime": 155.0447, "eval_samples_per_second": 373.55, "eval_steps_per_second": 5.837, "step": 297536 }, { "epoch": 16.02495160249516, "grad_norm": 0.2846646010875702, "learning_rate": 0.00021778947987761827, "loss": 2.8126, "step": 298000 }, { "epoch": 16.07872660787266, "grad_norm": 0.30357855558395386, "learning_rate": 0.0002148505530713109, "loss": 2.7901, "step": 299000 }, { "epoch": 16.132501613250163, "grad_norm": 0.27245578169822693, "learning_rate": 0.00021191162626500354, "loss": 2.791, "step": 300000 }, { "epoch": 16.186276618627662, "grad_norm": 0.27511173486709595, "learning_rate": 0.0002089697575900212, "loss": 2.7991, "step": 301000 }, { "epoch": 16.24005162400516, "grad_norm": 0.2818892300128937, "learning_rate": 0.00020602788891503884, "loss": 2.8017, "step": 302000 }, { "epoch": 16.293826629382664, "grad_norm": 0.29610157012939453, "learning_rate": 0.00020308896210873147, "loss": 2.7992, "step": 303000 }, { "epoch": 16.347601634760164, "grad_norm": 0.28958022594451904, "learning_rate": 0.00020014709343374912, "loss": 2.8032, "step": 304000 }, { "epoch": 16.401376640137663, "grad_norm": 0.32356253266334534, "learning_rate": 0.00019720522475876677, "loss": 2.8018, "step": 305000 }, { "epoch": 16.455151645515166, "grad_norm": 0.2847062051296234, "learning_rate": 0.0001942633560837844, "loss": 2.8077, "step": 306000 }, { "epoch": 16.508926650892665, "grad_norm": 0.2739544212818146, "learning_rate": 0.00019132442927747705, "loss": 2.81, "step": 307000 }, { "epoch": 16.562701656270164, "grad_norm": 0.28697946667671204, "learning_rate": 0.0001883825606024947, "loss": 2.807, "step": 308000 }, { "epoch": 16.616476661647667, "grad_norm": 0.29771292209625244, "learning_rate": 0.00018544363379618733, "loss": 2.8096, "step": 309000 }, { "epoch": 16.670251667025166, "grad_norm": 0.29724204540252686, "learning_rate": 0.000182501765121205, "loss": 2.8081, "step": 310000 }, { "epoch": 16.72402667240267, "grad_norm": 0.2964895963668823, "learning_rate": 0.00017955989644622265, "loss": 2.8133, "step": 311000 }, { "epoch": 16.777801677780168, "grad_norm": 0.28841668367385864, "learning_rate": 0.00017662391150859025, "loss": 2.8086, "step": 312000 }, { "epoch": 16.831576683157667, "grad_norm": 
0.2766316831111908, "learning_rate": 0.00017368204283360793, "loss": 2.8152, "step": 313000 }, { "epoch": 16.88535168853517, "grad_norm": 0.32704171538352966, "learning_rate": 0.00017074017415862554, "loss": 2.8162, "step": 314000 }, { "epoch": 16.93912669391267, "grad_norm": 0.3360968828201294, "learning_rate": 0.00016779830548364322, "loss": 2.8133, "step": 315000 }, { "epoch": 16.99290169929017, "grad_norm": 0.31055524945259094, "learning_rate": 0.00016485643680866087, "loss": 2.8157, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.4103879636154489, "eval_loss": 3.3883779048919678, "eval_runtime": 154.6539, "eval_samples_per_second": 374.494, "eval_steps_per_second": 5.852, "step": 316132 }, { "epoch": 17.04667670466767, "grad_norm": 0.30388155579566956, "learning_rate": 0.0001619175100023535, "loss": 2.7807, "step": 317000 }, { "epoch": 17.10045171004517, "grad_norm": 0.2881651818752289, "learning_rate": 0.00015897564132737118, "loss": 2.7798, "step": 318000 }, { "epoch": 17.15422671542267, "grad_norm": 0.2695824205875397, "learning_rate": 0.0001560337726523888, "loss": 2.7817, "step": 319000 }, { "epoch": 17.208001720800173, "grad_norm": 0.30308765172958374, "learning_rate": 0.00015309484584608143, "loss": 2.7808, "step": 320000 }, { "epoch": 17.261776726177672, "grad_norm": 0.29839852452278137, "learning_rate": 0.00015015297717109908, "loss": 2.7878, "step": 321000 }, { "epoch": 17.315551731555175, "grad_norm": 0.2739504873752594, "learning_rate": 0.00014721110849611675, "loss": 2.7849, "step": 322000 }, { "epoch": 17.369326736932674, "grad_norm": 0.33289453387260437, "learning_rate": 0.00014427512355848435, "loss": 2.7878, "step": 323000 }, { "epoch": 17.423101742310173, "grad_norm": 0.30872535705566406, "learning_rate": 0.00014133325488350202, "loss": 2.7862, "step": 324000 }, { "epoch": 17.476876747687676, "grad_norm": 0.30065229535102844, "learning_rate": 0.00013839138620851964, "loss": 2.7903, "step": 325000 }, { "epoch": 17.530651753065175, "grad_norm": 0.294783353805542, "learning_rate": 0.0001354495175335373, "loss": 2.7901, "step": 326000 }, { "epoch": 17.584426758442675, "grad_norm": 0.3017074167728424, "learning_rate": 0.00013251059072722993, "loss": 2.789, "step": 327000 }, { "epoch": 17.638201763820177, "grad_norm": 0.28931453824043274, "learning_rate": 0.0001295687220522476, "loss": 2.7962, "step": 328000 }, { "epoch": 17.691976769197677, "grad_norm": 0.30777105689048767, "learning_rate": 0.00012662685337726525, "loss": 2.7905, "step": 329000 }, { "epoch": 17.745751774575176, "grad_norm": 0.30437570810317993, "learning_rate": 0.00012368792657095788, "loss": 2.7922, "step": 330000 }, { "epoch": 17.79952677995268, "grad_norm": 0.279985249042511, "learning_rate": 0.00012074605789597553, "loss": 2.7958, "step": 331000 }, { "epoch": 17.853301785330178, "grad_norm": 0.2660251557826996, "learning_rate": 0.00011780418922099318, "loss": 2.7954, "step": 332000 }, { "epoch": 17.90707679070768, "grad_norm": 0.2697618901729584, "learning_rate": 0.00011486526241468581, "loss": 2.7969, "step": 333000 }, { "epoch": 17.96085179608518, "grad_norm": 0.2890709340572357, "learning_rate": 0.00011192633560837845, "loss": 2.7966, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.4105063404500295, "eval_loss": 3.4080612659454346, "eval_runtime": 154.9923, "eval_samples_per_second": 373.677, "eval_steps_per_second": 5.839, "step": 334728 }, { "epoch": 18.01462680146268, "grad_norm": 0.2767917215824127, "learning_rate": 0.0001089844669333961, "loss": 2.7867, "step": 335000 }, { "epoch": 
18.068401806840182, "grad_norm": 0.29336050152778625, "learning_rate": 0.00010604259825841375, "loss": 2.7639, "step": 336000 }, { "epoch": 18.12217681221768, "grad_norm": 0.3294292688369751, "learning_rate": 0.0001031007295834314, "loss": 2.7658, "step": 337000 }, { "epoch": 18.17595181759518, "grad_norm": 0.30230215191841125, "learning_rate": 0.00010015886090844905, "loss": 2.766, "step": 338000 }, { "epoch": 18.229726822972683, "grad_norm": 0.28469741344451904, "learning_rate": 9.721993410214168e-05, "loss": 2.7682, "step": 339000 }, { "epoch": 18.283501828350182, "grad_norm": 0.2829948365688324, "learning_rate": 9.427806542715933e-05, "loss": 2.7747, "step": 340000 }, { "epoch": 18.33727683372768, "grad_norm": 0.27874138951301575, "learning_rate": 9.133913862085197e-05, "loss": 2.7719, "step": 341000 }, { "epoch": 18.391051839105184, "grad_norm": 0.3442845642566681, "learning_rate": 8.839726994586963e-05, "loss": 2.7753, "step": 342000 }, { "epoch": 18.444826844482684, "grad_norm": 0.36807698011398315, "learning_rate": 8.545540127088727e-05, "loss": 2.7734, "step": 343000 }, { "epoch": 18.498601849860187, "grad_norm": 0.3008968234062195, "learning_rate": 8.25164744645799e-05, "loss": 2.7742, "step": 344000 }, { "epoch": 18.552376855237686, "grad_norm": 0.3300212621688843, "learning_rate": 7.957460578959754e-05, "loss": 2.7729, "step": 345000 }, { "epoch": 18.606151860615185, "grad_norm": 0.2936519682407379, "learning_rate": 7.66327371146152e-05, "loss": 2.7756, "step": 346000 }, { "epoch": 18.659926865992688, "grad_norm": 0.2997318208217621, "learning_rate": 7.369381030830784e-05, "loss": 2.7787, "step": 347000 }, { "epoch": 18.713701871370187, "grad_norm": 0.27832144498825073, "learning_rate": 7.075194163332549e-05, "loss": 2.7736, "step": 348000 }, { "epoch": 18.767476876747686, "grad_norm": 0.28568559885025024, "learning_rate": 6.781301482701812e-05, "loss": 2.7752, "step": 349000 }, { "epoch": 18.82125188212519, "grad_norm": 0.2887098789215088, "learning_rate": 6.487408802071074e-05, "loss": 2.7764, "step": 350000 }, { "epoch": 18.87502688750269, "grad_norm": 0.31327852606773376, "learning_rate": 6.19322193457284e-05, "loss": 2.7776, "step": 351000 }, { "epoch": 18.928801892880188, "grad_norm": 0.2947293519973755, "learning_rate": 5.899035067074606e-05, "loss": 2.7778, "step": 352000 }, { "epoch": 18.98257689825769, "grad_norm": 0.27630892395973206, "learning_rate": 5.604848199576371e-05, "loss": 2.7807, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.41043096087659053, "eval_loss": 3.4180543422698975, "eval_runtime": 155.0182, "eval_samples_per_second": 373.614, "eval_steps_per_second": 5.838, "step": 353324 }, { "epoch": 19.03635190363519, "grad_norm": 0.29157590866088867, "learning_rate": 5.310955518945634e-05, "loss": 2.7661, "step": 354000 }, { "epoch": 19.090126909012692, "grad_norm": 0.26800793409347534, "learning_rate": 5.0167686514473995e-05, "loss": 2.7558, "step": 355000 }, { "epoch": 19.14390191439019, "grad_norm": 0.31518709659576416, "learning_rate": 4.722581783949165e-05, "loss": 2.7579, "step": 356000 }, { "epoch": 19.19767691976769, "grad_norm": 0.2998282313346863, "learning_rate": 4.428689103318428e-05, "loss": 2.7574, "step": 357000 }, { "epoch": 19.251451925145194, "grad_norm": 0.3034313917160034, "learning_rate": 4.134502235820193e-05, "loss": 2.7606, "step": 358000 }, { "epoch": 19.305226930522693, "grad_norm": 0.2875937819480896, "learning_rate": 3.8403153683219584e-05, "loss": 2.7586, "step": 359000 }, { "epoch": 19.359001935900192, "grad_norm": 
0.30761197209358215, "learning_rate": 3.546422687691222e-05, "loss": 2.7577, "step": 360000 }, { "epoch": 19.412776941277695, "grad_norm": 0.29801440238952637, "learning_rate": 3.252530007060485e-05, "loss": 2.7557, "step": 361000 }, { "epoch": 19.466551946655194, "grad_norm": 0.2939643859863281, "learning_rate": 2.95834313956225e-05, "loss": 2.7571, "step": 362000 }, { "epoch": 19.520326952032697, "grad_norm": 0.29968592524528503, "learning_rate": 2.6641562720640153e-05, "loss": 2.761, "step": 363000 }, { "epoch": 19.574101957410196, "grad_norm": 0.28651562333106995, "learning_rate": 2.36996940456578e-05, "loss": 2.7579, "step": 364000 }, { "epoch": 19.627876962787695, "grad_norm": 0.30474939942359924, "learning_rate": 2.0760767239350436e-05, "loss": 2.7598, "step": 365000 }, { "epoch": 19.6816519681652, "grad_norm": 0.2941524088382721, "learning_rate": 1.7818898564368086e-05, "loss": 2.7644, "step": 366000 }, { "epoch": 19.735426973542697, "grad_norm": 0.2570919990539551, "learning_rate": 1.487997175806072e-05, "loss": 2.7549, "step": 367000 }, { "epoch": 19.789201978920197, "grad_norm": 0.31410789489746094, "learning_rate": 1.193810308307837e-05, "loss": 2.7624, "step": 368000 }, { "epoch": 19.8429769842977, "grad_norm": 0.29923874139785767, "learning_rate": 8.999176276771005e-06, "loss": 2.7621, "step": 369000 }, { "epoch": 19.8967519896752, "grad_norm": 0.29635271430015564, "learning_rate": 6.057307601788656e-06, "loss": 2.7601, "step": 370000 }, { "epoch": 19.950526995052698, "grad_norm": 0.29568880796432495, "learning_rate": 3.1154389268063074e-06, "loss": 2.7595, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.41021489963935376, "eval_loss": 3.423628807067871, "eval_runtime": 154.8423, "eval_samples_per_second": 374.039, "eval_steps_per_second": 5.845, "step": 371920 }, { "epoch": 20.0, "step": 371920, "total_flos": 1.5670047538944e+18, "train_loss": 3.0279207580395733, "train_runtime": 82395.3872, "train_samples_per_second": 144.441, "train_steps_per_second": 4.514 } ], "logging_steps": 1000, "max_steps": 371920, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5670047538944e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }
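
The JSON above appears to be a Hugging Face Trainer state dump: "log_history" holds per-1000-step training entries ("epoch", "loss", "grad_norm", "learning_rate", "step") interleaved with per-epoch evaluation entries ("eval_loss", "eval_accuracy", etc.), followed by a final training summary. As a minimal sketch of how one might read it, assuming the blob is saved to a file named trainer_state.json (a hypothetical path, not given in the source), the snippet below separates the two entry types by key and prints a per-epoch summary; it uses only the Python standard library.

    # Sketch: summarize the training log above (assumes it is saved as
    # "trainer_state.json" -- that filename is an assumption, not from the source).
    import json

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Training steps carry "loss"; evaluation entries carry "eval_loss"/"eval_accuracy";
    # the closing summary uses "train_loss" and is excluded by both filters.
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

    for e in eval_logs:
        print(f"epoch {e['epoch']:>4.0f}  step {e['step']:>6d}  "
              f"eval_loss {e['eval_loss']:.4f}  eval_acc {e['eval_accuracy']:.4f}")

    # Last logged training loss before the final summary entry.
    print("final train loss:", train_logs[-1]["loss"])

Run against the data shown here, this would list the 20 evaluation rows (eval_loss falling from about 3.77 at epoch 1 to roughly 3.35-3.42 later, eval_accuracy plateauing near 0.41) and report the last logged training loss of 2.7595 at step 371000.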