smolm-autoreg-bpe-counterfactual_babylm_aann_high_variability_numeral-seed_1024-1e-3 / trainer_state.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 371920,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05377500537750054,
      "grad_norm": 0.8324036598205566,
      "learning_rate": 3.125e-05,
      "loss": 6.2314,
      "step": 1000
    },
    {
      "epoch": 0.10755001075500108,
      "grad_norm": 0.9016917943954468,
      "learning_rate": 6.25e-05,
      "loss": 5.01,
      "step": 2000
    },
    {
      "epoch": 0.1613250161325016,
      "grad_norm": 0.8393586874008179,
      "learning_rate": 9.375e-05,
      "loss": 4.6792,
      "step": 3000
    },
    {
      "epoch": 0.21510002151000215,
      "grad_norm": 0.8001790642738342,
      "learning_rate": 0.000125,
      "loss": 4.4675,
      "step": 4000
    },
    {
      "epoch": 0.2688750268875027,
      "grad_norm": 0.7500863671302795,
      "learning_rate": 0.00015625,
      "loss": 4.3004,
      "step": 5000
    },
    {
      "epoch": 0.3226500322650032,
      "grad_norm": 0.6959784626960754,
      "learning_rate": 0.0001875,
      "loss": 4.1762,
      "step": 6000
    },
    {
      "epoch": 0.3764250376425038,
      "grad_norm": 0.7082997560501099,
      "learning_rate": 0.00021875,
      "loss": 4.0795,
      "step": 7000
    },
    {
      "epoch": 0.4302000430200043,
      "grad_norm": 0.7400528788566589,
      "learning_rate": 0.00025,
      "loss": 3.9794,
      "step": 8000
    },
    {
      "epoch": 0.4839750483975048,
      "grad_norm": 0.6886024475097656,
      "learning_rate": 0.00028121875,
      "loss": 3.9062,
      "step": 9000
    },
    {
      "epoch": 0.5377500537750054,
      "grad_norm": 0.6196364760398865,
      "learning_rate": 0.0003124375,
      "loss": 3.8427,
      "step": 10000
    },
    {
      "epoch": 0.5915250591525059,
      "grad_norm": 0.5815768241882324,
      "learning_rate": 0.00034368749999999997,
      "loss": 3.7992,
      "step": 11000
    },
    {
      "epoch": 0.6453000645300064,
      "grad_norm": 0.5629006624221802,
      "learning_rate": 0.0003749375,
      "loss": 3.7502,
      "step": 12000
    },
    {
      "epoch": 0.699075069907507,
      "grad_norm": 0.5031692981719971,
      "learning_rate": 0.00040615625,
      "loss": 3.7233,
      "step": 13000
    },
    {
      "epoch": 0.7528500752850076,
      "grad_norm": 0.4921340048313141,
      "learning_rate": 0.00043737500000000005,
      "loss": 3.6917,
      "step": 14000
    },
    {
      "epoch": 0.806625080662508,
      "grad_norm": 0.45878851413726807,
      "learning_rate": 0.000468625,
      "loss": 3.6641,
      "step": 15000
    },
    {
      "epoch": 0.8604000860400086,
      "grad_norm": 0.4047335684299469,
      "learning_rate": 0.000499875,
      "loss": 3.6404,
      "step": 16000
    },
    {
      "epoch": 0.9141750914175092,
      "grad_norm": 0.4339119493961334,
      "learning_rate": 0.000531125,
      "loss": 3.6129,
      "step": 17000
    },
    {
      "epoch": 0.9679500967950097,
      "grad_norm": 0.3588213324546814,
      "learning_rate": 0.00056234375,
      "loss": 3.5932,
      "step": 18000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.3588065680197524,
      "eval_loss": 3.770080804824829,
      "eval_runtime": 152.9859,
      "eval_samples_per_second": 378.577,
      "eval_steps_per_second": 5.916,
      "step": 18596
    },
    {
      "epoch": 1.0217251021725102,
      "grad_norm": 0.3476862907409668,
      "learning_rate": 0.00059359375,
      "loss": 3.5726,
      "step": 19000
    },
    {
      "epoch": 1.0755001075500108,
      "grad_norm": 0.3370579481124878,
      "learning_rate": 0.0006248437500000001,
      "loss": 3.5453,
      "step": 20000
    },
    {
      "epoch": 1.1292751129275114,
      "grad_norm": 0.3253530263900757,
      "learning_rate": 0.00065609375,
      "loss": 3.5364,
      "step": 21000
    },
    {
      "epoch": 1.1830501183050117,
      "grad_norm": 0.3063829839229584,
      "learning_rate": 0.00068728125,
      "loss": 3.5214,
      "step": 22000
    },
    {
      "epoch": 1.2368251236825123,
      "grad_norm": 0.28737059235572815,
      "learning_rate": 0.00071853125,
      "loss": 3.5158,
      "step": 23000
    },
    {
      "epoch": 1.2906001290600129,
      "grad_norm": 0.29937857389450073,
      "learning_rate": 0.00074978125,
      "loss": 3.5014,
      "step": 24000
    },
    {
      "epoch": 1.3443751344375134,
      "grad_norm": 0.2835935056209564,
      "learning_rate": 0.0007810312499999999,
      "loss": 3.4946,
      "step": 25000
    },
    {
      "epoch": 1.398150139815014,
      "grad_norm": 0.2764816880226135,
      "learning_rate": 0.00081225,
      "loss": 3.4832,
      "step": 26000
    },
    {
      "epoch": 1.4519251451925146,
      "grad_norm": 0.2620868384838104,
      "learning_rate": 0.0008435000000000001,
      "loss": 3.4761,
      "step": 27000
    },
    {
      "epoch": 1.5057001505700152,
      "grad_norm": 0.2731957733631134,
      "learning_rate": 0.00087471875,
      "loss": 3.4653,
      "step": 28000
    },
    {
      "epoch": 1.5594751559475157,
      "grad_norm": 0.26957619190216064,
      "learning_rate": 0.00090596875,
      "loss": 3.4552,
      "step": 29000
    },
    {
      "epoch": 1.613250161325016,
      "grad_norm": 0.24591492116451263,
      "learning_rate": 0.00093721875,
      "loss": 3.4474,
      "step": 30000
    },
    {
      "epoch": 1.6670251667025167,
      "grad_norm": 0.23927152156829834,
      "learning_rate": 0.00096846875,
      "loss": 3.4443,
      "step": 31000
    },
    {
      "epoch": 1.7208001720800172,
      "grad_norm": 0.2176426500082016,
      "learning_rate": 0.0009996875,
      "loss": 3.4401,
      "step": 32000
    },
    {
      "epoch": 1.7745751774575176,
      "grad_norm": 0.20793931186199188,
      "learning_rate": 0.0009970875500117675,
      "loss": 3.4261,
      "step": 33000
    },
    {
      "epoch": 1.8283501828350182,
      "grad_norm": 0.2189057469367981,
      "learning_rate": 0.0009941486232054601,
      "loss": 3.419,
      "step": 34000
    },
    {
      "epoch": 1.8821251882125187,
      "grad_norm": 0.2241194099187851,
      "learning_rate": 0.0009912096963991528,
      "loss": 3.4088,
      "step": 35000
    },
    {
      "epoch": 1.9359001935900193,
      "grad_norm": 0.23365530371665955,
      "learning_rate": 0.0009882678277241704,
      "loss": 3.3934,
      "step": 36000
    },
    {
      "epoch": 1.9896751989675199,
      "grad_norm": 0.2019016444683075,
      "learning_rate": 0.000985328900917863,
      "loss": 3.3833,
      "step": 37000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.38185108449506,
      "eval_loss": 3.5596837997436523,
      "eval_runtime": 154.0793,
      "eval_samples_per_second": 375.891,
      "eval_steps_per_second": 5.874,
      "step": 37192
    },
    {
      "epoch": 2.0434502043450204,
      "grad_norm": 0.20308424532413483,
      "learning_rate": 0.0009823870322428808,
      "loss": 3.3404,
      "step": 38000
    },
    {
      "epoch": 2.097225209722521,
      "grad_norm": 0.21531735360622406,
      "learning_rate": 0.0009794451635678984,
      "loss": 3.3266,
      "step": 39000
    },
    {
      "epoch": 2.1510002151000216,
      "grad_norm": 0.26003631949424744,
      "learning_rate": 0.000976503294892916,
      "loss": 3.3242,
      "step": 40000
    },
    {
      "epoch": 2.204775220477522,
      "grad_norm": 0.24143873155117035,
      "learning_rate": 0.0009735643680866086,
      "loss": 3.3166,
      "step": 41000
    },
    {
      "epoch": 2.2585502258550227,
      "grad_norm": 0.19083160161972046,
      "learning_rate": 0.0009706224994116263,
      "loss": 3.3187,
      "step": 42000
    },
    {
      "epoch": 2.3123252312325233,
      "grad_norm": 0.22694003582000732,
      "learning_rate": 0.000967680630736644,
      "loss": 3.3112,
      "step": 43000
    },
    {
      "epoch": 2.3661002366100234,
      "grad_norm": 0.21774055063724518,
      "learning_rate": 0.0009647417039303365,
      "loss": 3.3075,
      "step": 44000
    },
    {
      "epoch": 2.419875241987524,
      "grad_norm": 0.2047697901725769,
      "learning_rate": 0.0009617998352553542,
      "loss": 3.2992,
      "step": 45000
    },
    {
      "epoch": 2.4736502473650246,
      "grad_norm": 0.21876117587089539,
      "learning_rate": 0.0009588579665803719,
      "loss": 3.2983,
      "step": 46000
    },
    {
      "epoch": 2.527425252742525,
      "grad_norm": 0.21647591888904572,
      "learning_rate": 0.0009559190397740644,
      "loss": 3.2876,
      "step": 47000
    },
    {
      "epoch": 2.5812002581200257,
      "grad_norm": 0.20933736860752106,
      "learning_rate": 0.0009529771710990821,
      "loss": 3.2814,
      "step": 48000
    },
    {
      "epoch": 2.6349752634975263,
      "grad_norm": 0.1911548376083374,
      "learning_rate": 0.0009500382442927748,
      "loss": 3.2797,
      "step": 49000
    },
    {
      "epoch": 2.688750268875027,
      "grad_norm": 0.22081832587718964,
      "learning_rate": 0.0009470963756177925,
      "loss": 3.2783,
      "step": 50000
    },
    {
      "epoch": 2.7425252742525275,
      "grad_norm": 0.21164289116859436,
      "learning_rate": 0.0009441545069428101,
      "loss": 3.2752,
      "step": 51000
    },
    {
      "epoch": 2.796300279630028,
      "grad_norm": 0.21225039660930634,
      "learning_rate": 0.0009412126382678278,
      "loss": 3.2681,
      "step": 52000
    },
    {
      "epoch": 2.8500752850075286,
      "grad_norm": 0.1924898326396942,
      "learning_rate": 0.0009382707695928455,
      "loss": 3.2629,
      "step": 53000
    },
    {
      "epoch": 2.903850290385029,
      "grad_norm": 0.19862565398216248,
      "learning_rate": 0.000935331842786538,
      "loss": 3.2634,
      "step": 54000
    },
    {
      "epoch": 2.9576252957625293,
      "grad_norm": 0.19020138680934906,
      "learning_rate": 0.0009323899741115557,
      "loss": 3.2597,
      "step": 55000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.39273080241152825,
      "eval_loss": 3.4648334980010986,
      "eval_runtime": 154.5571,
      "eval_samples_per_second": 374.729,
      "eval_steps_per_second": 5.855,
      "step": 55788
    },
    {
      "epoch": 3.0114003011400303,
      "grad_norm": 0.19856859743595123,
      "learning_rate": 0.0009294481054365734,
      "loss": 3.239,
      "step": 56000
    },
    {
      "epoch": 3.0651753065175305,
      "grad_norm": 0.22371411323547363,
      "learning_rate": 0.0009265091786302659,
      "loss": 3.1886,
      "step": 57000
    },
    {
      "epoch": 3.118950311895031,
      "grad_norm": 0.21350081264972687,
      "learning_rate": 0.0009235673099552836,
      "loss": 3.1941,
      "step": 58000
    },
    {
      "epoch": 3.1727253172725316,
      "grad_norm": 0.219674214720726,
      "learning_rate": 0.0009206254412803013,
      "loss": 3.1942,
      "step": 59000
    },
    {
      "epoch": 3.226500322650032,
      "grad_norm": 0.19072189927101135,
      "learning_rate": 0.0009176865144739939,
      "loss": 3.1973,
      "step": 60000
    },
    {
      "epoch": 3.2802753280275327,
      "grad_norm": 0.205557718873024,
      "learning_rate": 0.0009147475876676865,
      "loss": 3.1932,
      "step": 61000
    },
    {
      "epoch": 3.3340503334050333,
      "grad_norm": 0.2098790556192398,
      "learning_rate": 0.0009118057189927041,
      "loss": 3.1935,
      "step": 62000
    },
    {
      "epoch": 3.387825338782534,
      "grad_norm": 0.196111798286438,
      "learning_rate": 0.0009088638503177218,
      "loss": 3.1954,
      "step": 63000
    },
    {
      "epoch": 3.4416003441600345,
      "grad_norm": 0.19440898299217224,
      "learning_rate": 0.0009059219816427395,
      "loss": 3.1924,
      "step": 64000
    },
    {
      "epoch": 3.495375349537535,
      "grad_norm": 0.21081770956516266,
      "learning_rate": 0.0009029801129677572,
      "loss": 3.1952,
      "step": 65000
    },
    {
      "epoch": 3.5491503549150356,
      "grad_norm": 0.21867215633392334,
      "learning_rate": 0.0009000411861614498,
      "loss": 3.195,
      "step": 66000
    },
    {
      "epoch": 3.602925360292536,
      "grad_norm": 0.22000326216220856,
      "learning_rate": 0.0008970993174864674,
      "loss": 3.1911,
      "step": 67000
    },
    {
      "epoch": 3.6567003656700363,
      "grad_norm": 0.1891467422246933,
      "learning_rate": 0.0008941574488114851,
      "loss": 3.1934,
      "step": 68000
    },
    {
      "epoch": 3.7104753710475373,
      "grad_norm": 0.18787287175655365,
      "learning_rate": 0.0008912185220051777,
      "loss": 3.191,
      "step": 69000
    },
    {
      "epoch": 3.7642503764250375,
      "grad_norm": 0.23694172501564026,
      "learning_rate": 0.0008882766533301954,
      "loss": 3.1831,
      "step": 70000
    },
    {
      "epoch": 3.818025381802538,
      "grad_norm": 0.19812917709350586,
      "learning_rate": 0.000885334784655213,
      "loss": 3.1815,
      "step": 71000
    },
    {
      "epoch": 3.8718003871800386,
      "grad_norm": 0.2005423903465271,
      "learning_rate": 0.0008823958578489056,
      "loss": 3.1801,
      "step": 72000
    },
    {
      "epoch": 3.925575392557539,
      "grad_norm": 0.21525584161281586,
      "learning_rate": 0.0008794539891739233,
      "loss": 3.1795,
      "step": 73000
    },
    {
      "epoch": 3.9793503979350398,
      "grad_norm": 0.19802774488925934,
      "learning_rate": 0.0008765150623676159,
      "loss": 3.1741,
      "step": 74000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.3976694409529698,
      "eval_loss": 3.419067859649658,
      "eval_runtime": 155.1328,
      "eval_samples_per_second": 373.338,
      "eval_steps_per_second": 5.834,
      "step": 74384
    },
    {
      "epoch": 4.033125403312541,
      "grad_norm": 0.2355957329273224,
      "learning_rate": 0.0008735731936926335,
      "loss": 3.1356,
      "step": 75000
    },
    {
      "epoch": 4.086900408690041,
      "grad_norm": 0.20892471075057983,
      "learning_rate": 0.0008706313250176512,
      "loss": 3.1124,
      "step": 76000
    },
    {
      "epoch": 4.140675414067541,
      "grad_norm": 0.24330681562423706,
      "learning_rate": 0.0008676923982113439,
      "loss": 3.1228,
      "step": 77000
    },
    {
      "epoch": 4.194450419445042,
      "grad_norm": 0.30532753467559814,
      "learning_rate": 0.0008647505295363614,
      "loss": 3.1191,
      "step": 78000
    },
    {
      "epoch": 4.248225424822542,
      "grad_norm": 0.2023121416568756,
      "learning_rate": 0.0008618116027300541,
      "loss": 3.1185,
      "step": 79000
    },
    {
      "epoch": 4.302000430200043,
      "grad_norm": 0.20023038983345032,
      "learning_rate": 0.0008588697340550719,
      "loss": 3.1248,
      "step": 80000
    },
    {
      "epoch": 4.355775435577543,
      "grad_norm": 0.20664258301258087,
      "learning_rate": 0.0008559278653800895,
      "loss": 3.1305,
      "step": 81000
    },
    {
      "epoch": 4.409550440955044,
      "grad_norm": 0.21807469427585602,
      "learning_rate": 0.0008529889385737821,
      "loss": 3.1265,
      "step": 82000
    },
    {
      "epoch": 4.4633254463325445,
      "grad_norm": 0.20922619104385376,
      "learning_rate": 0.0008500470698987998,
      "loss": 3.1287,
      "step": 83000
    },
    {
      "epoch": 4.5171004517100455,
      "grad_norm": 0.22318531572818756,
      "learning_rate": 0.0008471052012238174,
      "loss": 3.1265,
      "step": 84000
    },
    {
      "epoch": 4.570875457087546,
      "grad_norm": 0.20071184635162354,
      "learning_rate": 0.000844163332548835,
      "loss": 3.1244,
      "step": 85000
    },
    {
      "epoch": 4.624650462465047,
      "grad_norm": 0.23887498676776886,
      "learning_rate": 0.0008412244057425277,
      "loss": 3.1309,
      "step": 86000
    },
    {
      "epoch": 4.678425467842547,
      "grad_norm": 0.21280068159103394,
      "learning_rate": 0.0008382825370675454,
      "loss": 3.1261,
      "step": 87000
    },
    {
      "epoch": 4.732200473220047,
      "grad_norm": 0.20855990052223206,
      "learning_rate": 0.0008353406683925629,
      "loss": 3.1227,
      "step": 88000
    },
    {
      "epoch": 4.785975478597548,
      "grad_norm": 0.23701632022857666,
      "learning_rate": 0.0008324017415862556,
      "loss": 3.1274,
      "step": 89000
    },
    {
      "epoch": 4.839750483975048,
      "grad_norm": 0.22062337398529053,
      "learning_rate": 0.0008294598729112733,
      "loss": 3.1259,
      "step": 90000
    },
    {
      "epoch": 4.893525489352549,
      "grad_norm": 0.21007812023162842,
      "learning_rate": 0.0008265209461049658,
      "loss": 3.1241,
      "step": 91000
    },
    {
      "epoch": 4.947300494730049,
      "grad_norm": 0.3277081847190857,
      "learning_rate": 0.0008235790774299835,
      "loss": 3.1213,
      "step": 92000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.4008789177643117,
      "eval_loss": 3.396653652191162,
      "eval_runtime": 155.4385,
      "eval_samples_per_second": 372.604,
      "eval_steps_per_second": 5.822,
      "step": 92980
    },
    {
      "epoch": 5.00107550010755,
      "grad_norm": 0.20310255885124207,
      "learning_rate": 0.0008206401506236762,
      "loss": 3.1191,
      "step": 93000
    },
    {
      "epoch": 5.05485050548505,
      "grad_norm": 0.20080606639385223,
      "learning_rate": 0.0008176982819486937,
      "loss": 3.0521,
      "step": 94000
    },
    {
      "epoch": 5.108625510862551,
      "grad_norm": 0.21395424008369446,
      "learning_rate": 0.0008147593551423864,
      "loss": 3.0656,
      "step": 95000
    },
    {
      "epoch": 5.1624005162400515,
      "grad_norm": 0.22563432157039642,
      "learning_rate": 0.0008118174864674042,
      "loss": 3.0641,
      "step": 96000
    },
    {
      "epoch": 5.2161755216175525,
      "grad_norm": 0.21297597885131836,
      "learning_rate": 0.0008088785596610967,
      "loss": 3.07,
      "step": 97000
    },
    {
      "epoch": 5.269950526995053,
      "grad_norm": 0.20302899181842804,
      "learning_rate": 0.0008059366909861144,
      "loss": 3.0717,
      "step": 98000
    },
    {
      "epoch": 5.323725532372554,
      "grad_norm": 0.2152853012084961,
      "learning_rate": 0.0008029948223111321,
      "loss": 3.0759,
      "step": 99000
    },
    {
      "epoch": 5.377500537750054,
      "grad_norm": 0.2148328423500061,
      "learning_rate": 0.0008000529536361497,
      "loss": 3.0728,
      "step": 100000
    },
    {
      "epoch": 5.431275543127555,
      "grad_norm": 0.20232610404491425,
      "learning_rate": 0.0007971140268298423,
      "loss": 3.0798,
      "step": 101000
    },
    {
      "epoch": 5.485050548505055,
      "grad_norm": 0.22732730209827423,
      "learning_rate": 0.000794175100023535,
      "loss": 3.0756,
      "step": 102000
    },
    {
      "epoch": 5.538825553882555,
      "grad_norm": 0.2203952670097351,
      "learning_rate": 0.0007912332313485526,
      "loss": 3.0776,
      "step": 103000
    },
    {
      "epoch": 5.592600559260056,
      "grad_norm": 0.21848390996456146,
      "learning_rate": 0.0007882943045422453,
      "loss": 3.0763,
      "step": 104000
    },
    {
      "epoch": 5.646375564637556,
      "grad_norm": 0.22204072773456573,
      "learning_rate": 0.0007853524358672629,
      "loss": 3.0797,
      "step": 105000
    },
    {
      "epoch": 5.700150570015057,
      "grad_norm": 0.20933043956756592,
      "learning_rate": 0.0007824135090609555,
      "loss": 3.0763,
      "step": 106000
    },
    {
      "epoch": 5.753925575392557,
      "grad_norm": 0.19925065338611603,
      "learning_rate": 0.0007794716403859732,
      "loss": 3.0802,
      "step": 107000
    },
    {
      "epoch": 5.807700580770058,
      "grad_norm": 0.20748205482959747,
      "learning_rate": 0.0007765297717109908,
      "loss": 3.081,
      "step": 108000
    },
    {
      "epoch": 5.8614755861475585,
      "grad_norm": 0.2089342474937439,
      "learning_rate": 0.0007735908449046834,
      "loss": 3.0787,
      "step": 109000
    },
    {
      "epoch": 5.9152505915250595,
      "grad_norm": 0.20147345960140228,
      "learning_rate": 0.0007706489762297011,
      "loss": 3.0829,
      "step": 110000
    },
    {
      "epoch": 5.96902559690256,
      "grad_norm": 0.2211214154958725,
      "learning_rate": 0.0007677071075547188,
      "loss": 3.0783,
      "step": 111000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.405020628943781,
      "eval_loss": 3.3772811889648438,
      "eval_runtime": 156.2536,
      "eval_samples_per_second": 370.66,
      "eval_steps_per_second": 5.792,
      "step": 111576
    },
    {
      "epoch": 6.022800602280061,
      "grad_norm": 0.21148885786533356,
      "learning_rate": 0.0007647681807484113,
      "loss": 3.0451,
      "step": 112000
    },
    {
      "epoch": 6.076575607657561,
      "grad_norm": 0.2195775806903839,
      "learning_rate": 0.000761826312073429,
      "loss": 3.0147,
      "step": 113000
    },
    {
      "epoch": 6.130350613035061,
      "grad_norm": 0.20522399246692657,
      "learning_rate": 0.0007588873852671217,
      "loss": 3.0194,
      "step": 114000
    },
    {
      "epoch": 6.184125618412562,
      "grad_norm": 0.20723003149032593,
      "learning_rate": 0.0007559455165921393,
      "loss": 3.026,
      "step": 115000
    },
    {
      "epoch": 6.237900623790062,
      "grad_norm": 0.23514005541801453,
      "learning_rate": 0.000753003647917157,
      "loss": 3.0231,
      "step": 116000
    },
    {
      "epoch": 6.291675629167563,
      "grad_norm": 0.20580914616584778,
      "learning_rate": 0.0007500647211108497,
      "loss": 3.0321,
      "step": 117000
    },
    {
      "epoch": 6.345450634545063,
      "grad_norm": 0.2240120768547058,
      "learning_rate": 0.0007471228524358674,
      "loss": 3.0332,
      "step": 118000
    },
    {
      "epoch": 6.399225639922564,
      "grad_norm": 0.23184897005558014,
      "learning_rate": 0.0007441839256295599,
      "loss": 3.0369,
      "step": 119000
    },
    {
      "epoch": 6.453000645300064,
      "grad_norm": 0.22646069526672363,
      "learning_rate": 0.0007412449988232526,
      "loss": 3.0357,
      "step": 120000
    },
    {
      "epoch": 6.506775650677565,
      "grad_norm": 0.21927151083946228,
      "learning_rate": 0.0007383031301482702,
      "loss": 3.0398,
      "step": 121000
    },
    {
      "epoch": 6.5605506560550655,
      "grad_norm": 0.24726586043834686,
      "learning_rate": 0.0007353612614732878,
      "loss": 3.0373,
      "step": 122000
    },
    {
      "epoch": 6.6143256614325665,
      "grad_norm": 0.21686062216758728,
      "learning_rate": 0.0007324193927983055,
      "loss": 3.0399,
      "step": 123000
    },
    {
      "epoch": 6.668100666810067,
      "grad_norm": 0.21142247319221497,
      "learning_rate": 0.0007294804659919982,
      "loss": 3.0446,
      "step": 124000
    },
    {
      "epoch": 6.721875672187567,
      "grad_norm": 0.21460475027561188,
      "learning_rate": 0.0007265385973170157,
      "loss": 3.0403,
      "step": 125000
    },
    {
      "epoch": 6.775650677565068,
      "grad_norm": 0.22398121654987335,
      "learning_rate": 0.0007235967286420334,
      "loss": 3.0426,
      "step": 126000
    },
    {
      "epoch": 6.829425682942568,
      "grad_norm": 0.23123160004615784,
      "learning_rate": 0.0007206548599670511,
      "loss": 3.0443,
      "step": 127000
    },
    {
      "epoch": 6.883200688320069,
      "grad_norm": 0.21254226565361023,
      "learning_rate": 0.0007177159331607437,
      "loss": 3.0424,
      "step": 128000
    },
    {
      "epoch": 6.936975693697569,
      "grad_norm": 0.21302445232868195,
      "learning_rate": 0.0007147740644857613,
      "loss": 3.0472,
      "step": 129000
    },
    {
      "epoch": 6.99075069907507,
      "grad_norm": 0.2217877358198166,
      "learning_rate": 0.0007118321958107791,
      "loss": 3.0456,
      "step": 130000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.4055193299898036,
      "eval_loss": 3.3825550079345703,
      "eval_runtime": 155.2208,
      "eval_samples_per_second": 373.126,
      "eval_steps_per_second": 5.83,
      "step": 130172
    },
    {
      "epoch": 7.04452570445257,
      "grad_norm": 0.23532521724700928,
      "learning_rate": 0.0007088903271357967,
      "loss": 2.9839,
      "step": 131000
    },
    {
      "epoch": 7.098300709830071,
      "grad_norm": 0.22214515507221222,
      "learning_rate": 0.0007059514003294893,
      "loss": 2.9765,
      "step": 132000
    },
    {
      "epoch": 7.152075715207571,
      "grad_norm": 0.2383100390434265,
      "learning_rate": 0.0007030154153918568,
      "loss": 2.9843,
      "step": 133000
    },
    {
      "epoch": 7.205850720585072,
      "grad_norm": 0.22472046315670013,
      "learning_rate": 0.0007000735467168746,
      "loss": 2.9925,
      "step": 134000
    },
    {
      "epoch": 7.2596257259625725,
      "grad_norm": 0.26296138763427734,
      "learning_rate": 0.0006971316780418923,
      "loss": 2.997,
      "step": 135000
    },
    {
      "epoch": 7.3134007313400735,
      "grad_norm": 0.2494724839925766,
      "learning_rate": 0.0006941898093669099,
      "loss": 2.997,
      "step": 136000
    },
    {
      "epoch": 7.367175736717574,
      "grad_norm": 0.22137367725372314,
      "learning_rate": 0.0006912508825606025,
      "loss": 2.9973,
      "step": 137000
    },
    {
      "epoch": 7.420950742095075,
      "grad_norm": 0.22704289853572845,
      "learning_rate": 0.0006883090138856202,
      "loss": 3.0066,
      "step": 138000
    },
    {
      "epoch": 7.474725747472575,
      "grad_norm": 0.2145918905735016,
      "learning_rate": 0.0006853700870793128,
      "loss": 3.0054,
      "step": 139000
    },
    {
      "epoch": 7.528500752850075,
      "grad_norm": 0.21607990562915802,
      "learning_rate": 0.0006824282184043304,
      "loss": 3.0018,
      "step": 140000
    },
    {
      "epoch": 7.582275758227576,
      "grad_norm": 0.2057826817035675,
      "learning_rate": 0.0006794863497293481,
      "loss": 3.0101,
      "step": 141000
    },
    {
      "epoch": 7.636050763605076,
      "grad_norm": 0.23032937943935394,
      "learning_rate": 0.0006765474229230408,
      "loss": 3.0099,
      "step": 142000
    },
    {
      "epoch": 7.689825768982577,
      "grad_norm": 0.22495923936367035,
      "learning_rate": 0.0006736055542480583,
      "loss": 3.008,
      "step": 143000
    },
    {
      "epoch": 7.743600774360077,
      "grad_norm": 0.2345353364944458,
      "learning_rate": 0.000670666627441751,
      "loss": 3.0099,
      "step": 144000
    },
    {
      "epoch": 7.797375779737578,
      "grad_norm": 0.23005186021327972,
      "learning_rate": 0.0006677277006354437,
      "loss": 3.0125,
      "step": 145000
    },
    {
      "epoch": 7.851150785115078,
      "grad_norm": 0.29431313276290894,
      "learning_rate": 0.0006647858319604612,
      "loss": 3.0107,
      "step": 146000
    },
    {
      "epoch": 7.904925790492579,
      "grad_norm": 0.2245541363954544,
      "learning_rate": 0.0006618439632854789,
      "loss": 3.0133,
      "step": 147000
    },
    {
      "epoch": 7.9587007958700795,
      "grad_norm": 0.22786079347133636,
      "learning_rate": 0.0006589020946104966,
      "loss": 3.0126,
      "step": 148000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.40771172002548395,
      "eval_loss": 3.3547439575195312,
      "eval_runtime": 154.7201,
      "eval_samples_per_second": 374.334,
      "eval_steps_per_second": 5.849,
      "step": 148768
    },
    {
      "epoch": 8.01247580124758,
      "grad_norm": 0.2346237152814865,
      "learning_rate": 0.0006559602259355142,
      "loss": 2.9965,
      "step": 149000
    },
    {
      "epoch": 8.066250806625082,
      "grad_norm": 0.23346728086471558,
      "learning_rate": 0.0006530212991292069,
      "loss": 2.9479,
      "step": 150000
    },
    {
      "epoch": 8.12002581200258,
      "grad_norm": 0.24378512799739838,
      "learning_rate": 0.0006500794304542246,
      "loss": 2.9513,
      "step": 151000
    },
    {
      "epoch": 8.173800817380082,
      "grad_norm": 0.24439002573490143,
      "learning_rate": 0.0006471405036479172,
      "loss": 2.9601,
      "step": 152000
    },
    {
      "epoch": 8.227575822757583,
      "grad_norm": 0.27081623673439026,
      "learning_rate": 0.0006441986349729348,
      "loss": 2.9631,
      "step": 153000
    },
    {
      "epoch": 8.281350828135082,
      "grad_norm": 0.2521245777606964,
      "learning_rate": 0.0006412626500353025,
      "loss": 2.966,
      "step": 154000
    },
    {
      "epoch": 8.335125833512583,
      "grad_norm": 0.21975190937519073,
      "learning_rate": 0.0006383207813603201,
      "loss": 2.9678,
      "step": 155000
    },
    {
      "epoch": 8.388900838890084,
      "grad_norm": 0.2267887145280838,
      "learning_rate": 0.0006353789126853378,
      "loss": 2.9696,
      "step": 156000
    },
    {
      "epoch": 8.442675844267585,
      "grad_norm": 0.218279168009758,
      "learning_rate": 0.0006324370440103554,
      "loss": 2.9713,
      "step": 157000
    },
    {
      "epoch": 8.496450849645084,
      "grad_norm": 0.23300865292549133,
      "learning_rate": 0.0006294951753353731,
      "loss": 2.9772,
      "step": 158000
    },
    {
      "epoch": 8.550225855022585,
      "grad_norm": 0.21749693155288696,
      "learning_rate": 0.0006265562485290657,
      "loss": 2.9773,
      "step": 159000
    },
    {
      "epoch": 8.604000860400086,
      "grad_norm": 0.26928380131721497,
      "learning_rate": 0.0006236143798540833,
      "loss": 2.978,
      "step": 160000
    },
    {
      "epoch": 8.657775865777587,
      "grad_norm": 0.22122180461883545,
      "learning_rate": 0.000620672511179101,
      "loss": 2.9794,
      "step": 161000
    },
    {
      "epoch": 8.711550871155087,
      "grad_norm": 0.22700442373752594,
      "learning_rate": 0.0006177306425041186,
      "loss": 2.9824,
      "step": 162000
    },
    {
      "epoch": 8.765325876532588,
      "grad_norm": 0.2541004419326782,
      "learning_rate": 0.0006147917156978112,
      "loss": 2.9841,
      "step": 163000
    },
    {
      "epoch": 8.819100881910089,
      "grad_norm": 0.2551893889904022,
      "learning_rate": 0.0006118498470228289,
      "loss": 2.9837,
      "step": 164000
    },
    {
      "epoch": 8.872875887287588,
      "grad_norm": 0.25604966282844543,
      "learning_rate": 0.0006089079783478466,
      "loss": 2.984,
      "step": 165000
    },
    {
      "epoch": 8.926650892665089,
      "grad_norm": 0.24571265280246735,
      "learning_rate": 0.0006059719934102142,
      "loss": 2.9849,
      "step": 166000
    },
    {
      "epoch": 8.98042589804259,
      "grad_norm": 0.24128392338752747,
      "learning_rate": 0.0006030301247352318,
      "loss": 2.9843,
      "step": 167000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.4083424360998555,
      "eval_loss": 3.3613698482513428,
      "eval_runtime": 155.0364,
      "eval_samples_per_second": 373.57,
      "eval_steps_per_second": 5.837,
      "step": 167364
    },
    {
      "epoch": 9.034200903420091,
      "grad_norm": 0.24964158236980438,
      "learning_rate": 0.0006000882560602495,
      "loss": 2.9432,
      "step": 168000
    },
    {
      "epoch": 9.08797590879759,
      "grad_norm": 0.2405262142419815,
      "learning_rate": 0.0005971463873852672,
      "loss": 2.922,
      "step": 169000
    },
    {
      "epoch": 9.141750914175091,
      "grad_norm": 0.22288870811462402,
      "learning_rate": 0.0005942045187102848,
      "loss": 2.9269,
      "step": 170000
    },
    {
      "epoch": 9.195525919552592,
      "grad_norm": 0.3041359484195709,
      "learning_rate": 0.0005912626500353024,
      "loss": 2.9342,
      "step": 171000
    },
    {
      "epoch": 9.249300924930093,
      "grad_norm": 0.22550632059574127,
      "learning_rate": 0.0005883237232289951,
      "loss": 2.9392,
      "step": 172000
    },
    {
      "epoch": 9.303075930307593,
      "grad_norm": 0.23584921658039093,
      "learning_rate": 0.0005853818545540127,
      "loss": 2.9422,
      "step": 173000
    },
    {
      "epoch": 9.356850935685094,
      "grad_norm": 0.2634640634059906,
      "learning_rate": 0.0005824458696163803,
      "loss": 2.945,
      "step": 174000
    },
    {
      "epoch": 9.410625941062595,
      "grad_norm": 0.2354883849620819,
      "learning_rate": 0.000579504000941398,
      "loss": 2.9485,
      "step": 175000
    },
    {
      "epoch": 9.464400946440094,
      "grad_norm": 0.26491352915763855,
      "learning_rate": 0.0005765621322664157,
      "loss": 2.9475,
      "step": 176000
    },
    {
      "epoch": 9.518175951817595,
      "grad_norm": 0.2462054342031479,
      "learning_rate": 0.0005736202635914332,
      "loss": 2.9482,
      "step": 177000
    },
    {
      "epoch": 9.571950957195096,
      "grad_norm": 0.2643495202064514,
      "learning_rate": 0.0005706783949164509,
      "loss": 2.9502,
      "step": 178000
    },
    {
      "epoch": 9.625725962572597,
      "grad_norm": 0.24029669165611267,
      "learning_rate": 0.0005677365262414686,
      "loss": 2.9518,
      "step": 179000
    },
    {
      "epoch": 9.679500967950096,
      "grad_norm": 0.2550260126590729,
      "learning_rate": 0.0005648005413038361,
      "loss": 2.9571,
      "step": 180000
    },
    {
      "epoch": 9.733275973327597,
      "grad_norm": 0.23675589263439178,
      "learning_rate": 0.0005618586726288538,
      "loss": 2.9543,
      "step": 181000
    },
    {
      "epoch": 9.787050978705098,
      "grad_norm": 0.2279191017150879,
      "learning_rate": 0.0005589197458225465,
      "loss": 2.9593,
      "step": 182000
    },
    {
      "epoch": 9.8408259840826,
      "grad_norm": 0.27392587065696716,
      "learning_rate": 0.0005559778771475641,
      "loss": 2.9561,
      "step": 183000
    },
    {
      "epoch": 9.894600989460098,
      "grad_norm": 0.2388741672039032,
      "learning_rate": 0.0005530360084725818,
      "loss": 2.9565,
      "step": 184000
    },
    {
      "epoch": 9.9483759948376,
      "grad_norm": 0.2503463327884674,
      "learning_rate": 0.0005500970816662745,
      "loss": 2.9592,
      "step": 185000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.40854156716551776,
      "eval_loss": 3.377901077270508,
      "eval_runtime": 154.6842,
      "eval_samples_per_second": 374.421,
      "eval_steps_per_second": 5.851,
      "step": 185960
    },
    {
      "epoch": 10.0021510002151,
      "grad_norm": 0.2553974688053131,
      "learning_rate": 0.0005471552129912922,
      "loss": 2.9593,
      "step": 186000
    },
    {
      "epoch": 10.055926005592601,
      "grad_norm": 0.25415316224098206,
      "learning_rate": 0.0005442133443163097,
      "loss": 2.8931,
      "step": 187000
    },
    {
      "epoch": 10.1097010109701,
      "grad_norm": 0.2477007806301117,
      "learning_rate": 0.0005412744175100024,
      "loss": 2.8999,
      "step": 188000
    },
    {
      "epoch": 10.163476016347602,
      "grad_norm": 0.23852670192718506,
      "learning_rate": 0.0005383325488350201,
      "loss": 2.9028,
      "step": 189000
    },
    {
      "epoch": 10.217251021725103,
      "grad_norm": 0.2484176605939865,
      "learning_rate": 0.0005353906801600376,
      "loss": 2.9117,
      "step": 190000
    },
    {
      "epoch": 10.271026027102602,
      "grad_norm": 0.27494022250175476,
      "learning_rate": 0.0005324517533537303,
      "loss": 2.9133,
      "step": 191000
    },
    {
      "epoch": 10.324801032480103,
      "grad_norm": 0.2577020823955536,
      "learning_rate": 0.000529509884678748,
      "loss": 2.9214,
      "step": 192000
    },
    {
      "epoch": 10.378576037857604,
      "grad_norm": 0.2292626053094864,
      "learning_rate": 0.0005265680160037656,
      "loss": 2.9208,
      "step": 193000
    },
    {
      "epoch": 10.432351043235105,
      "grad_norm": 0.267873615026474,
      "learning_rate": 0.0005236290891974582,
      "loss": 2.9213,
      "step": 194000
    },
    {
      "epoch": 10.486126048612604,
      "grad_norm": 0.24749253690242767,
      "learning_rate": 0.0005206872205224759,
      "loss": 2.9239,
      "step": 195000
    },
    {
      "epoch": 10.539901053990105,
      "grad_norm": 0.26779085397720337,
      "learning_rate": 0.0005177453518474935,
      "loss": 2.929,
      "step": 196000
    },
    {
      "epoch": 10.593676059367606,
      "grad_norm": 0.2552465796470642,
      "learning_rate": 0.0005148064250411861,
      "loss": 2.9309,
      "step": 197000
    },
    {
      "epoch": 10.647451064745107,
      "grad_norm": 0.2551726996898651,
      "learning_rate": 0.0005118645563662038,
      "loss": 2.9287,
      "step": 198000
    },
    {
      "epoch": 10.701226070122607,
      "grad_norm": 0.24481531977653503,
      "learning_rate": 0.0005089256295598964,
      "loss": 2.9326,
      "step": 199000
    },
    {
      "epoch": 10.755001075500108,
      "grad_norm": 0.2609283924102783,
      "learning_rate": 0.0005059837608849142,
      "loss": 2.9332,
      "step": 200000
    },
    {
      "epoch": 10.808776080877609,
      "grad_norm": 0.22893798351287842,
      "learning_rate": 0.0005030418922099318,
      "loss": 2.9361,
      "step": 201000
    },
    {
      "epoch": 10.86255108625511,
      "grad_norm": 0.24516218900680542,
      "learning_rate": 0.0005001029654036244,
      "loss": 2.9358,
      "step": 202000
    },
    {
      "epoch": 10.916326091632609,
      "grad_norm": 0.23790410161018372,
      "learning_rate": 0.0004971610967286421,
      "loss": 2.9344,
      "step": 203000
    },
    {
      "epoch": 10.97010109701011,
      "grad_norm": 0.2549736797809601,
      "learning_rate": 0.0004942221699223347,
      "loss": 2.9367,
      "step": 204000
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.40993629082379995,
      "eval_loss": 3.3604304790496826,
      "eval_runtime": 154.4315,
      "eval_samples_per_second": 375.034,
      "eval_steps_per_second": 5.86,
      "step": 204556
    },
    {
      "epoch": 11.02387610238761,
      "grad_norm": 0.2493738979101181,
      "learning_rate": 0.0004912803012473523,
      "loss": 2.9057,
      "step": 205000
    },
    {
      "epoch": 11.07765110776511,
      "grad_norm": 0.245314821600914,
      "learning_rate": 0.00048833843257237,
      "loss": 2.8775,
      "step": 206000
    },
    {
      "epoch": 11.131426113142611,
      "grad_norm": 0.24536246061325073,
      "learning_rate": 0.0004853995057660626,
      "loss": 2.8783,
      "step": 207000
    },
    {
      "epoch": 11.185201118520112,
      "grad_norm": 0.26116108894348145,
      "learning_rate": 0.00048245763709108024,
      "loss": 2.8851,
      "step": 208000
    },
    {
      "epoch": 11.238976123897613,
      "grad_norm": 0.24136824905872345,
      "learning_rate": 0.0004795187102847729,
      "loss": 2.8918,
      "step": 209000
    },
    {
      "epoch": 11.292751129275112,
      "grad_norm": 0.275006502866745,
      "learning_rate": 0.0004765768416097905,
      "loss": 2.8951,
      "step": 210000
    },
    {
      "epoch": 11.346526134652613,
      "grad_norm": 0.27772292494773865,
      "learning_rate": 0.0004736349729348082,
      "loss": 2.8973,
      "step": 211000
    },
    {
      "epoch": 11.400301140030114,
      "grad_norm": 0.2865879237651825,
      "learning_rate": 0.0004706960461285008,
      "loss": 2.8977,
      "step": 212000
    },
    {
      "epoch": 11.454076145407615,
      "grad_norm": 0.24032117426395416,
      "learning_rate": 0.00046775711932219347,
      "loss": 2.9002,
      "step": 213000
    },
    {
      "epoch": 11.507851150785115,
      "grad_norm": 0.24863946437835693,
      "learning_rate": 0.00046481525064721114,
      "loss": 2.9052,
      "step": 214000
    },
    {
      "epoch": 11.561626156162616,
      "grad_norm": 0.25115910172462463,
      "learning_rate": 0.00046187338197222876,
      "loss": 2.9081,
      "step": 215000
    },
    {
      "epoch": 11.615401161540117,
      "grad_norm": 0.25204330682754517,
      "learning_rate": 0.00045893151329724644,
      "loss": 2.906,
      "step": 216000
    },
    {
      "epoch": 11.669176166917616,
      "grad_norm": 0.25553619861602783,
      "learning_rate": 0.00045599258649093905,
      "loss": 2.9074,
      "step": 217000
    },
    {
      "epoch": 11.722951172295117,
      "grad_norm": 0.24004510045051575,
      "learning_rate": 0.00045305071781595667,
      "loss": 2.9105,
      "step": 218000
    },
    {
      "epoch": 11.776726177672618,
      "grad_norm": 0.28220391273498535,
      "learning_rate": 0.00045010884914097434,
      "loss": 2.9075,
      "step": 219000
    },
    {
      "epoch": 11.830501183050119,
      "grad_norm": 0.2582526206970215,
      "learning_rate": 0.00044716992233466695,
      "loss": 2.9129,
      "step": 220000
    },
    {
      "epoch": 11.884276188427618,
      "grad_norm": 0.27006328105926514,
      "learning_rate": 0.0004442309955283596,
      "loss": 2.9146,
      "step": 221000
    },
    {
      "epoch": 11.93805119380512,
      "grad_norm": 0.3025253415107727,
      "learning_rate": 0.0004412891268533773,
      "loss": 2.9161,
      "step": 222000
    },
    {
      "epoch": 11.99182619918262,
      "grad_norm": 0.2534899115562439,
      "learning_rate": 0.00043834725817839496,
      "loss": 2.9145,
      "step": 223000
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.40973608482660917,
      "eval_loss": 3.3759312629699707,
      "eval_runtime": 154.6971,
      "eval_samples_per_second": 374.39,
      "eval_steps_per_second": 5.85,
      "step": 223152
    },
    {
      "epoch": 12.045601204560121,
      "grad_norm": 0.27566835284233093,
      "learning_rate": 0.0004354053895034126,
      "loss": 2.8626,
      "step": 224000
    },
    {
      "epoch": 12.09937620993762,
      "grad_norm": 0.243771031498909,
      "learning_rate": 0.0004324664626971052,
      "loss": 2.8583,
      "step": 225000
    },
    {
      "epoch": 12.153151215315122,
      "grad_norm": 0.26204991340637207,
      "learning_rate": 0.00042952459402212286,
      "loss": 2.8654,
      "step": 226000
    },
    {
      "epoch": 12.206926220692623,
      "grad_norm": 0.2582913637161255,
      "learning_rate": 0.0004265827253471405,
      "loss": 2.8673,
      "step": 227000
    },
    {
      "epoch": 12.260701226070122,
      "grad_norm": 0.27284786105155945,
      "learning_rate": 0.00042364674040950813,
      "loss": 2.8735,
      "step": 228000
    },
    {
      "epoch": 12.314476231447623,
      "grad_norm": 0.25654593110084534,
      "learning_rate": 0.00042070487173452575,
      "loss": 2.8751,
      "step": 229000
    },
    {
      "epoch": 12.368251236825124,
      "grad_norm": 0.23449215292930603,
      "learning_rate": 0.0004177630030595435,
      "loss": 2.8769,
      "step": 230000
    },
    {
      "epoch": 12.422026242202625,
      "grad_norm": 0.24525216221809387,
      "learning_rate": 0.00041482407625323604,
      "loss": 2.8763,
      "step": 231000
    },
    {
      "epoch": 12.475801247580124,
      "grad_norm": 0.26022928953170776,
      "learning_rate": 0.0004118822075782537,
      "loss": 2.8811,
      "step": 232000
    },
    {
      "epoch": 12.529576252957625,
      "grad_norm": 0.24519123136997223,
      "learning_rate": 0.0004089403389032714,
      "loss": 2.8844,
      "step": 233000
    },
    {
      "epoch": 12.583351258335126,
      "grad_norm": 0.3118698298931122,
      "learning_rate": 0.000405998470228289,
      "loss": 2.8837,
      "step": 234000
    },
    {
      "epoch": 12.637126263712627,
      "grad_norm": 0.28287139534950256,
      "learning_rate": 0.0004030566015533066,
      "loss": 2.8843,
      "step": 235000
    },
    {
      "epoch": 12.690901269090126,
      "grad_norm": 0.24490605294704437,
      "learning_rate": 0.0004001176747469993,
      "loss": 2.8868,
      "step": 236000
    },
    {
      "epoch": 12.744676274467627,
      "grad_norm": 0.2810141444206238,
      "learning_rate": 0.00039717580607201696,
      "loss": 2.8905,
      "step": 237000
    },
    {
      "epoch": 12.798451279845128,
      "grad_norm": 0.2660733759403229,
      "learning_rate": 0.0003942368792657096,
      "loss": 2.8917,
      "step": 238000
    },
    {
      "epoch": 12.852226285222628,
      "grad_norm": 0.2440771758556366,
      "learning_rate": 0.00039129501059072724,
      "loss": 2.888,
      "step": 239000
    },
    {
      "epoch": 12.906001290600129,
      "grad_norm": 0.25520211458206177,
      "learning_rate": 0.0003883560837844199,
      "loss": 2.8938,
      "step": 240000
    },
    {
      "epoch": 12.95977629597763,
      "grad_norm": 0.2661096751689911,
      "learning_rate": 0.0003854142151094375,
      "loss": 2.8924,
      "step": 241000
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.4095556978794759,
      "eval_loss": 3.3856160640716553,
      "eval_runtime": 154.4788,
      "eval_samples_per_second": 374.919,
      "eval_steps_per_second": 5.858,
      "step": 241748
    },
    {
      "epoch": 13.01355130135513,
      "grad_norm": 0.29137110710144043,
      "learning_rate": 0.00038247234643445514,
      "loss": 2.8796,
      "step": 242000
    },
    {
      "epoch": 13.06732630673263,
      "grad_norm": 0.26854678988456726,
      "learning_rate": 0.0003795304777594728,
      "loss": 2.837,
      "step": 243000
    },
    {
      "epoch": 13.121101312110131,
      "grad_norm": 0.2614552974700928,
      "learning_rate": 0.00037659155095316543,
      "loss": 2.8409,
      "step": 244000
    },
    {
      "epoch": 13.174876317487632,
      "grad_norm": 0.3031397759914398,
      "learning_rate": 0.00037364968227818316,
      "loss": 2.8458,
      "step": 245000
    },
    {
      "epoch": 13.228651322865133,
      "grad_norm": 0.2507542073726654,
      "learning_rate": 0.0003707107554718757,
      "loss": 2.8537,
      "step": 246000
    },
    {
      "epoch": 13.282426328242632,
      "grad_norm": 0.2524189054965973,
      "learning_rate": 0.0003677688867968934,
      "loss": 2.852,
      "step": 247000
    },
    {
      "epoch": 13.336201333620133,
      "grad_norm": 0.27850231528282166,
      "learning_rate": 0.00036482701812191106,
      "loss": 2.8574,
      "step": 248000
    },
    {
      "epoch": 13.389976338997634,
      "grad_norm": 0.2408652901649475,
      "learning_rate": 0.00036188809131560367,
      "loss": 2.8563,
      "step": 249000
    },
    {
      "epoch": 13.443751344375134,
      "grad_norm": 0.28609830141067505,
      "learning_rate": 0.00035894622264062134,
      "loss": 2.8627,
      "step": 250000
    },
    {
      "epoch": 13.497526349752635,
      "grad_norm": 0.2690850496292114,
      "learning_rate": 0.00035600435396563896,
      "loss": 2.8619,
      "step": 251000
    },
    {
      "epoch": 13.551301355130136,
      "grad_norm": 0.2862522304058075,
      "learning_rate": 0.0003530654271593316,
      "loss": 2.8646,
      "step": 252000
    },
    {
      "epoch": 13.605076360507637,
      "grad_norm": 0.2629512548446655,
      "learning_rate": 0.0003501235584843493,
      "loss": 2.865,
      "step": 253000
    },
    {
      "epoch": 13.658851365885136,
      "grad_norm": 0.2542857825756073,
      "learning_rate": 0.00034718463167804185,
      "loss": 2.865,
      "step": 254000
    },
    {
      "epoch": 13.712626371262637,
      "grad_norm": 0.258798211812973,
      "learning_rate": 0.0003442427630030596,
      "loss": 2.8686,
      "step": 255000
    },
    {
      "epoch": 13.766401376640138,
      "grad_norm": 0.2492339164018631,
      "learning_rate": 0.0003413008943280772,
      "loss": 2.8689,
      "step": 256000
    },
    {
      "epoch": 13.820176382017639,
      "grad_norm": 0.29779887199401855,
      "learning_rate": 0.0003383590256530948,
      "loss": 2.8691,
      "step": 257000
    },
    {
      "epoch": 13.873951387395138,
      "grad_norm": 0.2670515179634094,
      "learning_rate": 0.0003354200988467875,
      "loss": 2.8711,
      "step": 258000
    },
    {
      "epoch": 13.92772639277264,
      "grad_norm": 0.26957201957702637,
      "learning_rate": 0.0003324782301718051,
      "loss": 2.8711,
      "step": 259000
    },
    {
      "epoch": 13.98150139815014,
      "grad_norm": 0.2824675142765045,
      "learning_rate": 0.00032953636149682283,
      "loss": 2.8757,
      "step": 260000
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.410457699798363,
      "eval_loss": 3.384411573410034,
      "eval_runtime": 154.9443,
      "eval_samples_per_second": 373.792,
      "eval_steps_per_second": 5.841,
      "step": 260344
    },
    {
      "epoch": 14.035276403527641,
      "grad_norm": 0.29713505506515503,
      "learning_rate": 0.00032659449282184045,
      "loss": 2.8354,
      "step": 261000
    },
    {
      "epoch": 14.08905140890514,
      "grad_norm": 0.27730679512023926,
      "learning_rate": 0.00032365850788420805,
      "loss": 2.8198,
      "step": 262000
    },
    {
      "epoch": 14.142826414282641,
      "grad_norm": 0.29409125447273254,
      "learning_rate": 0.0003207166392092257,
      "loss": 2.824,
      "step": 263000
    },
    {
      "epoch": 14.196601419660142,
      "grad_norm": 0.280676931142807,
      "learning_rate": 0.00031777477053424334,
      "loss": 2.8301,
      "step": 264000
    },
    {
      "epoch": 14.250376425037642,
      "grad_norm": 0.3202875554561615,
      "learning_rate": 0.000314835843727936,
      "loss": 2.8271,
      "step": 265000
    },
    {
      "epoch": 14.304151430415143,
      "grad_norm": 0.2673070728778839,
      "learning_rate": 0.0003118939750529536,
      "loss": 2.8376,
      "step": 266000
    },
    {
      "epoch": 14.357926435792644,
      "grad_norm": 0.2868790030479431,
      "learning_rate": 0.0003089550482466463,
      "loss": 2.8416,
      "step": 267000
    },
    {
      "epoch": 14.411701441170145,
      "grad_norm": 0.2689562737941742,
      "learning_rate": 0.0003060131795716639,
      "loss": 2.8408,
      "step": 268000
    },
    {
      "epoch": 14.465476446547644,
      "grad_norm": 0.2907434403896332,
      "learning_rate": 0.0003030713108966815,
      "loss": 2.8429,
      "step": 269000
    },
    {
      "epoch": 14.519251451925145,
      "grad_norm": 0.26338914036750793,
      "learning_rate": 0.00030012944222169925,
      "loss": 2.8465,
      "step": 270000
    },
    {
      "epoch": 14.573026457302646,
      "grad_norm": 0.2925278842449188,
      "learning_rate": 0.00029719051541539186,
      "loss": 2.8444,
      "step": 271000
    },
    {
      "epoch": 14.626801462680147,
      "grad_norm": 0.2883965075016022,
      "learning_rate": 0.00029424864674040954,
      "loss": 2.8466,
      "step": 272000
    },
    {
      "epoch": 14.680576468057646,
      "grad_norm": 0.28422123193740845,
      "learning_rate": 0.00029130971993410215,
      "loss": 2.8458,
      "step": 273000
    },
    {
      "epoch": 14.734351473435147,
      "grad_norm": 0.28057223558425903,
      "learning_rate": 0.0002883678512591198,
      "loss": 2.8512,
      "step": 274000
    },
    {
      "epoch": 14.788126478812648,
      "grad_norm": 0.2739641070365906,
      "learning_rate": 0.00028542892445281243,
      "loss": 2.8508,
      "step": 275000
    },
    {
      "epoch": 14.84190148419015,
      "grad_norm": 0.26283150911331177,
      "learning_rate": 0.00028248705577783005,
      "loss": 2.8491,
      "step": 276000
    },
    {
      "epoch": 14.895676489567649,
      "grad_norm": 0.25209805369377136,
      "learning_rate": 0.0002795451871028477,
      "loss": 2.8528,
      "step": 277000
    },
    {
      "epoch": 14.94945149494515,
      "grad_norm": 0.29996606707572937,
      "learning_rate": 0.0002766033184278654,
      "loss": 2.8545,
      "step": 278000
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.4106898178253074,
      "eval_loss": 3.383195400238037,
      "eval_runtime": 154.9853,
      "eval_samples_per_second": 373.693,
      "eval_steps_per_second": 5.839,
      "step": 278940
    },
    {
      "epoch": 15.00322650032265,
      "grad_norm": 0.30987992882728577,
      "learning_rate": 0.00027366439162155806,
      "loss": 2.8502,
      "step": 279000
    },
    {
      "epoch": 15.05700150570015,
      "grad_norm": 0.2791779041290283,
      "learning_rate": 0.0002707225229465757,
      "loss": 2.8054,
      "step": 280000
    },
    {
      "epoch": 15.11077651107765,
      "grad_norm": 0.29352006316185,
      "learning_rate": 0.00026778359614026834,
      "loss": 2.8087,
      "step": 281000
    },
    {
      "epoch": 15.164551516455152,
      "grad_norm": 0.2739843428134918,
      "learning_rate": 0.00026484466933396095,
      "loss": 2.8134,
      "step": 282000
    },
    {
      "epoch": 15.218326521832653,
      "grad_norm": 0.29397326707839966,
      "learning_rate": 0.00026190280065897857,
      "loss": 2.814,
      "step": 283000
    },
    {
      "epoch": 15.272101527210152,
      "grad_norm": 0.2891228199005127,
      "learning_rate": 0.00025896387385267123,
      "loss": 2.8154,
      "step": 284000
    },
    {
      "epoch": 15.325876532587653,
      "grad_norm": 0.2697419226169586,
      "learning_rate": 0.00025602200517768885,
      "loss": 2.815,
      "step": 285000
    },
    {
      "epoch": 15.379651537965154,
      "grad_norm": 0.27169767022132874,
      "learning_rate": 0.00025308013650270653,
      "loss": 2.8179,
      "step": 286000
    },
    {
      "epoch": 15.433426543342655,
      "grad_norm": 0.28420206904411316,
      "learning_rate": 0.0002501382678277242,
      "loss": 2.8242,
      "step": 287000
    },
    {
      "epoch": 15.487201548720154,
      "grad_norm": 0.29990944266319275,
      "learning_rate": 0.0002471993410214168,
      "loss": 2.8284,
      "step": 288000
    },
    {
      "epoch": 15.540976554097655,
      "grad_norm": 0.29358208179473877,
      "learning_rate": 0.0002442574723464345,
      "loss": 2.8242,
      "step": 289000
    },
    {
      "epoch": 15.594751559475156,
      "grad_norm": 0.2709376811981201,
      "learning_rate": 0.0002413156036714521,
      "loss": 2.8323,
      "step": 290000
    },
    {
      "epoch": 15.648526564852656,
      "grad_norm": 0.27639514207839966,
      "learning_rate": 0.00023837961873381973,
      "loss": 2.8295,
      "step": 291000
    },
    {
      "epoch": 15.702301570230157,
      "grad_norm": 0.27499568462371826,
      "learning_rate": 0.00023543775005883738,
      "loss": 2.8273,
      "step": 292000
    },
    {
      "epoch": 15.756076575607658,
      "grad_norm": 0.2614821493625641,
      "learning_rate": 0.00023249588138385502,
      "loss": 2.8302,
      "step": 293000
    },
    {
      "epoch": 15.809851580985159,
      "grad_norm": 0.30566710233688354,
      "learning_rate": 0.00022955695457754766,
      "loss": 2.8315,
      "step": 294000
    },
    {
      "epoch": 15.863626586362658,
      "grad_norm": 0.28071853518486023,
      "learning_rate": 0.00022661508590256533,
      "loss": 2.8282,
      "step": 295000
    },
    {
      "epoch": 15.917401591740159,
      "grad_norm": 0.26871344447135925,
      "learning_rate": 0.00022367321722758295,
      "loss": 2.8318,
      "step": 296000
    },
    {
      "epoch": 15.97117659711766,
      "grad_norm": 0.27507010102272034,
      "learning_rate": 0.0002207313485526006,
      "loss": 2.8339,
      "step": 297000
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.4098433764298017,
      "eval_loss": 3.4079394340515137,
      "eval_runtime": 155.0447,
      "eval_samples_per_second": 373.55,
      "eval_steps_per_second": 5.837,
      "step": 297536
    },
    {
      "epoch": 16.02495160249516,
      "grad_norm": 0.2846646010875702,
      "learning_rate": 0.00021778947987761827,
      "loss": 2.8126,
      "step": 298000
    },
    {
      "epoch": 16.07872660787266,
      "grad_norm": 0.30357855558395386,
      "learning_rate": 0.0002148505530713109,
      "loss": 2.7901,
      "step": 299000
    },
    {
      "epoch": 16.132501613250163,
      "grad_norm": 0.27245578169822693,
      "learning_rate": 0.00021191162626500354,
      "loss": 2.791,
      "step": 300000
    },
    {
      "epoch": 16.186276618627662,
      "grad_norm": 0.27511173486709595,
      "learning_rate": 0.0002089697575900212,
      "loss": 2.7991,
      "step": 301000
    },
    {
      "epoch": 16.24005162400516,
      "grad_norm": 0.2818892300128937,
      "learning_rate": 0.00020602788891503884,
      "loss": 2.8017,
      "step": 302000
    },
    {
      "epoch": 16.293826629382664,
      "grad_norm": 0.29610157012939453,
      "learning_rate": 0.00020308896210873147,
      "loss": 2.7992,
      "step": 303000
    },
    {
      "epoch": 16.347601634760164,
      "grad_norm": 0.28958022594451904,
      "learning_rate": 0.00020014709343374912,
      "loss": 2.8032,
      "step": 304000
    },
    {
      "epoch": 16.401376640137663,
      "grad_norm": 0.32356253266334534,
      "learning_rate": 0.00019720522475876677,
      "loss": 2.8018,
      "step": 305000
    },
    {
      "epoch": 16.455151645515166,
      "grad_norm": 0.2847062051296234,
      "learning_rate": 0.0001942633560837844,
      "loss": 2.8077,
      "step": 306000
    },
    {
      "epoch": 16.508926650892665,
      "grad_norm": 0.2739544212818146,
      "learning_rate": 0.00019132442927747705,
      "loss": 2.81,
      "step": 307000
    },
    {
      "epoch": 16.562701656270164,
      "grad_norm": 0.28697946667671204,
      "learning_rate": 0.0001883825606024947,
      "loss": 2.807,
      "step": 308000
    },
    {
      "epoch": 16.616476661647667,
      "grad_norm": 0.29771292209625244,
      "learning_rate": 0.00018544363379618733,
      "loss": 2.8096,
      "step": 309000
    },
    {
      "epoch": 16.670251667025166,
      "grad_norm": 0.29724204540252686,
      "learning_rate": 0.000182501765121205,
      "loss": 2.8081,
      "step": 310000
    },
    {
      "epoch": 16.72402667240267,
      "grad_norm": 0.2964895963668823,
      "learning_rate": 0.00017955989644622265,
      "loss": 2.8133,
      "step": 311000
    },
    {
      "epoch": 16.777801677780168,
      "grad_norm": 0.28841668367385864,
      "learning_rate": 0.00017662391150859025,
      "loss": 2.8086,
      "step": 312000
    },
    {
      "epoch": 16.831576683157667,
      "grad_norm": 0.2766316831111908,
      "learning_rate": 0.00017368204283360793,
      "loss": 2.8152,
      "step": 313000
    },
    {
      "epoch": 16.88535168853517,
      "grad_norm": 0.32704171538352966,
      "learning_rate": 0.00017074017415862554,
      "loss": 2.8162,
      "step": 314000
    },
    {
      "epoch": 16.93912669391267,
      "grad_norm": 0.3360968828201294,
      "learning_rate": 0.00016779830548364322,
      "loss": 2.8133,
      "step": 315000
    },
    {
      "epoch": 16.99290169929017,
      "grad_norm": 0.31055524945259094,
      "learning_rate": 0.00016485643680866087,
      "loss": 2.8157,
      "step": 316000
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.4103879636154489,
      "eval_loss": 3.3883779048919678,
      "eval_runtime": 154.6539,
      "eval_samples_per_second": 374.494,
      "eval_steps_per_second": 5.852,
      "step": 316132
    },
    {
      "epoch": 17.04667670466767,
      "grad_norm": 0.30388155579566956,
      "learning_rate": 0.0001619175100023535,
      "loss": 2.7807,
      "step": 317000
    },
    {
      "epoch": 17.10045171004517,
      "grad_norm": 0.2881651818752289,
      "learning_rate": 0.00015897564132737118,
      "loss": 2.7798,
      "step": 318000
    },
    {
      "epoch": 17.15422671542267,
      "grad_norm": 0.2695824205875397,
      "learning_rate": 0.0001560337726523888,
      "loss": 2.7817,
      "step": 319000
    },
    {
      "epoch": 17.208001720800173,
      "grad_norm": 0.30308765172958374,
      "learning_rate": 0.00015309484584608143,
      "loss": 2.7808,
      "step": 320000
    },
    {
      "epoch": 17.261776726177672,
      "grad_norm": 0.29839852452278137,
      "learning_rate": 0.00015015297717109908,
      "loss": 2.7878,
      "step": 321000
    },
    {
      "epoch": 17.315551731555175,
      "grad_norm": 0.2739504873752594,
      "learning_rate": 0.00014721110849611675,
      "loss": 2.7849,
      "step": 322000
    },
    {
      "epoch": 17.369326736932674,
      "grad_norm": 0.33289453387260437,
      "learning_rate": 0.00014427512355848435,
      "loss": 2.7878,
      "step": 323000
    },
    {
      "epoch": 17.423101742310173,
      "grad_norm": 0.30872535705566406,
      "learning_rate": 0.00014133325488350202,
      "loss": 2.7862,
      "step": 324000
    },
    {
      "epoch": 17.476876747687676,
      "grad_norm": 0.30065229535102844,
      "learning_rate": 0.00013839138620851964,
      "loss": 2.7903,
      "step": 325000
    },
    {
      "epoch": 17.530651753065175,
      "grad_norm": 0.294783353805542,
      "learning_rate": 0.0001354495175335373,
      "loss": 2.7901,
      "step": 326000
    },
    {
      "epoch": 17.584426758442675,
      "grad_norm": 0.3017074167728424,
      "learning_rate": 0.00013251059072722993,
      "loss": 2.789,
      "step": 327000
    },
    {
      "epoch": 17.638201763820177,
      "grad_norm": 0.28931453824043274,
      "learning_rate": 0.0001295687220522476,
      "loss": 2.7962,
      "step": 328000
    },
    {
      "epoch": 17.691976769197677,
      "grad_norm": 0.30777105689048767,
      "learning_rate": 0.00012662685337726525,
      "loss": 2.7905,
      "step": 329000
    },
    {
      "epoch": 17.745751774575176,
      "grad_norm": 0.30437570810317993,
      "learning_rate": 0.00012368792657095788,
      "loss": 2.7922,
      "step": 330000
    },
    {
      "epoch": 17.79952677995268,
      "grad_norm": 0.279985249042511,
      "learning_rate": 0.00012074605789597553,
      "loss": 2.7958,
      "step": 331000
    },
    {
      "epoch": 17.853301785330178,
      "grad_norm": 0.2660251557826996,
      "learning_rate": 0.00011780418922099318,
      "loss": 2.7954,
      "step": 332000
    },
    {
      "epoch": 17.90707679070768,
      "grad_norm": 0.2697618901729584,
      "learning_rate": 0.00011486526241468581,
      "loss": 2.7969,
      "step": 333000
    },
    {
      "epoch": 17.96085179608518,
      "grad_norm": 0.2890709340572357,
      "learning_rate": 0.00011192633560837845,
      "loss": 2.7966,
      "step": 334000
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.4105063404500295,
      "eval_loss": 3.4080612659454346,
      "eval_runtime": 154.9923,
      "eval_samples_per_second": 373.677,
      "eval_steps_per_second": 5.839,
      "step": 334728
    },
    {
      "epoch": 18.01462680146268,
      "grad_norm": 0.2767917215824127,
      "learning_rate": 0.0001089844669333961,
      "loss": 2.7867,
      "step": 335000
    },
    {
      "epoch": 18.068401806840182,
      "grad_norm": 0.29336050152778625,
      "learning_rate": 0.00010604259825841375,
      "loss": 2.7639,
      "step": 336000
    },
    {
      "epoch": 18.12217681221768,
      "grad_norm": 0.3294292688369751,
      "learning_rate": 0.0001031007295834314,
      "loss": 2.7658,
      "step": 337000
    },
    {
      "epoch": 18.17595181759518,
      "grad_norm": 0.30230215191841125,
      "learning_rate": 0.00010015886090844905,
      "loss": 2.766,
      "step": 338000
    },
    {
      "epoch": 18.229726822972683,
      "grad_norm": 0.28469741344451904,
      "learning_rate": 9.721993410214168e-05,
"loss": 2.7682, | |
"step": 339000 | |
}, | |
{ | |
"epoch": 18.283501828350182, | |
"grad_norm": 0.2829948365688324, | |
"learning_rate": 9.427806542715933e-05, | |
"loss": 2.7747, | |
"step": 340000 | |
}, | |
{ | |
"epoch": 18.33727683372768, | |
"grad_norm": 0.27874138951301575, | |
"learning_rate": 9.133913862085197e-05, | |
"loss": 2.7719, | |
"step": 341000 | |
}, | |
{ | |
"epoch": 18.391051839105184, | |
"grad_norm": 0.3442845642566681, | |
"learning_rate": 8.839726994586963e-05, | |
"loss": 2.7753, | |
"step": 342000 | |
}, | |
{ | |
"epoch": 18.444826844482684, | |
"grad_norm": 0.36807698011398315, | |
"learning_rate": 8.545540127088727e-05, | |
"loss": 2.7734, | |
"step": 343000 | |
}, | |
{ | |
"epoch": 18.498601849860187, | |
"grad_norm": 0.3008968234062195, | |
"learning_rate": 8.25164744645799e-05, | |
"loss": 2.7742, | |
"step": 344000 | |
}, | |
{ | |
"epoch": 18.552376855237686, | |
"grad_norm": 0.3300212621688843, | |
"learning_rate": 7.957460578959754e-05, | |
"loss": 2.7729, | |
"step": 345000 | |
}, | |
{ | |
"epoch": 18.606151860615185, | |
"grad_norm": 0.2936519682407379, | |
"learning_rate": 7.66327371146152e-05, | |
"loss": 2.7756, | |
"step": 346000 | |
}, | |
{ | |
"epoch": 18.659926865992688, | |
"grad_norm": 0.2997318208217621, | |
"learning_rate": 7.369381030830784e-05, | |
"loss": 2.7787, | |
"step": 347000 | |
}, | |
{ | |
"epoch": 18.713701871370187, | |
"grad_norm": 0.27832144498825073, | |
"learning_rate": 7.075194163332549e-05, | |
"loss": 2.7736, | |
"step": 348000 | |
}, | |
{ | |
"epoch": 18.767476876747686, | |
"grad_norm": 0.28568559885025024, | |
"learning_rate": 6.781301482701812e-05, | |
"loss": 2.7752, | |
"step": 349000 | |
}, | |
{ | |
"epoch": 18.82125188212519, | |
"grad_norm": 0.2887098789215088, | |
"learning_rate": 6.487408802071074e-05, | |
"loss": 2.7764, | |
"step": 350000 | |
}, | |
{ | |
"epoch": 18.87502688750269, | |
"grad_norm": 0.31327852606773376, | |
"learning_rate": 6.19322193457284e-05, | |
"loss": 2.7776, | |
"step": 351000 | |
}, | |
{ | |
"epoch": 18.928801892880188, | |
"grad_norm": 0.2947293519973755, | |
"learning_rate": 5.899035067074606e-05, | |
"loss": 2.7778, | |
"step": 352000 | |
}, | |
{ | |
"epoch": 18.98257689825769, | |
"grad_norm": 0.27630892395973206, | |
"learning_rate": 5.604848199576371e-05, | |
"loss": 2.7807, | |
"step": 353000 | |
}, | |
{ | |
"epoch": 19.0, | |
"eval_accuracy": 0.41043096087659053, | |
"eval_loss": 3.4180543422698975, | |
"eval_runtime": 155.0182, | |
"eval_samples_per_second": 373.614, | |
"eval_steps_per_second": 5.838, | |
"step": 353324 | |
}, | |
{ | |
"epoch": 19.03635190363519, | |
"grad_norm": 0.29157590866088867, | |
"learning_rate": 5.310955518945634e-05, | |
"loss": 2.7661, | |
"step": 354000 | |
}, | |
{ | |
"epoch": 19.090126909012692, | |
"grad_norm": 0.26800793409347534, | |
"learning_rate": 5.0167686514473995e-05, | |
"loss": 2.7558, | |
"step": 355000 | |
}, | |
{ | |
"epoch": 19.14390191439019, | |
"grad_norm": 0.31518709659576416, | |
"learning_rate": 4.722581783949165e-05, | |
"loss": 2.7579, | |
"step": 356000 | |
}, | |
{ | |
"epoch": 19.19767691976769, | |
"grad_norm": 0.2998282313346863, | |
"learning_rate": 4.428689103318428e-05, | |
"loss": 2.7574, | |
"step": 357000 | |
}, | |
{ | |
"epoch": 19.251451925145194, | |
"grad_norm": 0.3034313917160034, | |
"learning_rate": 4.134502235820193e-05, | |
"loss": 2.7606, | |
"step": 358000 | |
}, | |
{ | |
"epoch": 19.305226930522693, | |
"grad_norm": 0.2875937819480896, | |
"learning_rate": 3.8403153683219584e-05, | |
"loss": 2.7586, | |
"step": 359000 | |
}, | |
{ | |
"epoch": 19.359001935900192, | |
"grad_norm": 0.30761197209358215, | |
"learning_rate": 3.546422687691222e-05, | |
"loss": 2.7577, | |
"step": 360000 | |
}, | |
{ | |
"epoch": 19.412776941277695, | |
"grad_norm": 0.29801440238952637, | |
"learning_rate": 3.252530007060485e-05, | |
"loss": 2.7557, | |
"step": 361000 | |
}, | |
{ | |
"epoch": 19.466551946655194, | |
"grad_norm": 0.2939643859863281, | |
"learning_rate": 2.95834313956225e-05, | |
"loss": 2.7571, | |
"step": 362000 | |
}, | |
{ | |
"epoch": 19.520326952032697, | |
"grad_norm": 0.29968592524528503, | |
"learning_rate": 2.6641562720640153e-05, | |
"loss": 2.761, | |
"step": 363000 | |
}, | |
{ | |
"epoch": 19.574101957410196, | |
"grad_norm": 0.28651562333106995, | |
"learning_rate": 2.36996940456578e-05, | |
"loss": 2.7579, | |
"step": 364000 | |
}, | |
{ | |
"epoch": 19.627876962787695, | |
"grad_norm": 0.30474939942359924, | |
"learning_rate": 2.0760767239350436e-05, | |
"loss": 2.7598, | |
"step": 365000 | |
}, | |
{ | |
"epoch": 19.6816519681652, | |
"grad_norm": 0.2941524088382721, | |
"learning_rate": 1.7818898564368086e-05, | |
"loss": 2.7644, | |
"step": 366000 | |
}, | |
{ | |
"epoch": 19.735426973542697, | |
"grad_norm": 0.2570919990539551, | |
"learning_rate": 1.487997175806072e-05, | |
"loss": 2.7549, | |
"step": 367000 | |
}, | |
{ | |
"epoch": 19.789201978920197, | |
"grad_norm": 0.31410789489746094, | |
"learning_rate": 1.193810308307837e-05, | |
"loss": 2.7624, | |
"step": 368000 | |
}, | |
{ | |
"epoch": 19.8429769842977, | |
"grad_norm": 0.29923874139785767, | |
"learning_rate": 8.999176276771005e-06, | |
"loss": 2.7621, | |
"step": 369000 | |
}, | |
{ | |
"epoch": 19.8967519896752, | |
"grad_norm": 0.29635271430015564, | |
"learning_rate": 6.057307601788656e-06, | |
"loss": 2.7601, | |
"step": 370000 | |
}, | |
{ | |
"epoch": 19.950526995052698, | |
"grad_norm": 0.29568880796432495, | |
"learning_rate": 3.1154389268063074e-06, | |
"loss": 2.7595, | |
"step": 371000 | |
}, | |
{ | |
"epoch": 20.0, | |
"eval_accuracy": 0.41021489963935376, | |
"eval_loss": 3.423628807067871, | |
"eval_runtime": 154.8423, | |
"eval_samples_per_second": 374.039, | |
"eval_steps_per_second": 5.845, | |
"step": 371920 | |
}, | |
{ | |
"epoch": 20.0, | |
"step": 371920, | |
"total_flos": 1.5670047538944e+18, | |
"train_loss": 3.0279207580395733, | |
"train_runtime": 82395.3872, | |
"train_samples_per_second": 144.441, | |
"train_steps_per_second": 4.514 | |
} | |
], | |
"logging_steps": 1000, | |
"max_steps": 371920, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 20, | |
"save_steps": 5000, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": true, | |
"should_training_stop": false | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 1.5670047538944e+18, | |
"train_batch_size": 32, | |
"trial_name": null, | |
"trial_params": null | |
} | |