{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9984038308060654,
  "eval_steps": 250,
  "global_step": 1878,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010641127959563713,
      "grad_norm": 0.8616393804550171,
      "learning_rate": 3.1914893617021277e-06,
      "loss": 0.6847,
      "num_input_tokens_seen": 327680,
      "step": 10
    },
    {
      "epoch": 0.021282255919127427,
      "grad_norm": 1.3775863647460938,
      "learning_rate": 6.3829787234042555e-06,
      "loss": 0.6579,
      "num_input_tokens_seen": 655360,
      "step": 20
    },
    {
      "epoch": 0.03192338387869114,
      "grad_norm": 2.395984411239624,
      "learning_rate": 9.574468085106385e-06,
      "loss": 0.6053,
      "num_input_tokens_seen": 983040,
      "step": 30
    },
    {
      "epoch": 0.042564511838254854,
      "grad_norm": 1.8644745349884033,
      "learning_rate": 1.2765957446808511e-05,
      "loss": 0.53,
      "num_input_tokens_seen": 1310720,
      "step": 40
    },
    {
      "epoch": 0.05320563979781857,
      "grad_norm": 2.1690289974212646,
      "learning_rate": 1.5957446808510637e-05,
      "loss": 0.4419,
      "num_input_tokens_seen": 1638400,
      "step": 50
    },
    {
      "epoch": 0.06384676775738228,
      "grad_norm": 1.3926266431808472,
      "learning_rate": 1.914893617021277e-05,
      "loss": 0.3329,
      "num_input_tokens_seen": 1966080,
      "step": 60
    },
    {
      "epoch": 0.074487895716946,
      "grad_norm": 1.0763431787490845,
      "learning_rate": 2.2340425531914894e-05,
      "loss": 0.2703,
      "num_input_tokens_seen": 2293760,
      "step": 70
    },
    {
      "epoch": 0.08512902367650971,
      "grad_norm": 12.503619194030762,
      "learning_rate": 2.5531914893617022e-05,
      "loss": 0.1906,
      "num_input_tokens_seen": 2621440,
      "step": 80
    },
    {
      "epoch": 0.09577015163607343,
      "grad_norm": 0.6052917838096619,
      "learning_rate": 2.872340425531915e-05,
      "loss": 0.1476,
      "num_input_tokens_seen": 2949120,
      "step": 90
    },
    {
      "epoch": 0.10641127959563713,
      "grad_norm": 5.584522247314453,
      "learning_rate": 2.9899103139013456e-05,
      "loss": 0.1279,
      "num_input_tokens_seen": 3276800,
      "step": 100
    },
    {
      "epoch": 0.11705240755520085,
      "grad_norm": 1.0587092638015747,
      "learning_rate": 2.9730941704035875e-05,
      "loss": 0.112,
      "num_input_tokens_seen": 3604480,
      "step": 110
    },
    {
      "epoch": 0.12769353551476456,
      "grad_norm": 2.5089759826660156,
      "learning_rate": 2.9562780269058297e-05,
      "loss": 0.1119,
      "num_input_tokens_seen": 3932160,
      "step": 120
    },
    {
      "epoch": 0.13833466347432827,
      "grad_norm": 4.025810241699219,
      "learning_rate": 2.939461883408072e-05,
      "loss": 0.1155,
      "num_input_tokens_seen": 4259840,
      "step": 130
    },
    {
      "epoch": 0.148975791433892,
      "grad_norm": 0.6721552014350891,
      "learning_rate": 2.922645739910314e-05,
      "loss": 0.0937,
      "num_input_tokens_seen": 4587520,
      "step": 140
    },
    {
      "epoch": 0.1596169193934557,
      "grad_norm": 4.8363542556762695,
      "learning_rate": 2.905829596412556e-05,
      "loss": 0.089,
      "num_input_tokens_seen": 4915200,
      "step": 150
    },
    {
      "epoch": 0.17025804735301941,
      "grad_norm": 13.355521202087402,
      "learning_rate": 2.889013452914798e-05,
      "loss": 0.0525,
      "num_input_tokens_seen": 5242880,
      "step": 160
    },
    {
      "epoch": 0.18089917531258312,
      "grad_norm": 17.72276496887207,
      "learning_rate": 2.8721973094170402e-05,
      "loss": 0.0699,
      "num_input_tokens_seen": 5570560,
      "step": 170
    },
    {
      "epoch": 0.19154030327214686,
      "grad_norm": 3.537041187286377,
      "learning_rate": 2.8553811659192828e-05,
      "loss": 0.0811,
      "num_input_tokens_seen": 5898240,
      "step": 180
    },
    {
      "epoch": 0.20218143123171056,
      "grad_norm": 0.13461732864379883,
      "learning_rate": 2.8385650224215247e-05,
      "loss": 0.0763,
      "num_input_tokens_seen": 6225920,
      "step": 190
    },
    {
      "epoch": 0.21282255919127427,
      "grad_norm": 9.155119895935059,
      "learning_rate": 2.821748878923767e-05,
      "loss": 0.1048,
      "num_input_tokens_seen": 6553600,
      "step": 200
    },
    {
      "epoch": 0.22346368715083798,
      "grad_norm": 0.7209023833274841,
      "learning_rate": 2.804932735426009e-05,
      "loss": 0.1231,
      "num_input_tokens_seen": 6881280,
      "step": 210
    },
    {
      "epoch": 0.2341048151104017,
      "grad_norm": 0.5195837020874023,
      "learning_rate": 2.788116591928251e-05,
      "loss": 0.0537,
      "num_input_tokens_seen": 7208960,
      "step": 220
    },
    {
      "epoch": 0.24474594306996542,
      "grad_norm": 3.8807427883148193,
      "learning_rate": 2.7713004484304933e-05,
      "loss": 0.0579,
      "num_input_tokens_seen": 7536640,
      "step": 230
    },
    {
      "epoch": 0.2553870710295291,
      "grad_norm": 3.4100818634033203,
      "learning_rate": 2.7544843049327355e-05,
      "loss": 0.062,
      "num_input_tokens_seen": 7864320,
      "step": 240
    },
    {
      "epoch": 0.26602819898909286,
      "grad_norm": 0.3366034924983978,
      "learning_rate": 2.7376681614349774e-05,
      "loss": 0.0298,
      "num_input_tokens_seen": 8192000,
      "step": 250
    },
    {
      "epoch": 0.26602819898909286,
      "eval_accuracy": 0.99,
      "eval_loss": 0.044801026582717896,
      "eval_runtime": 1.1309,
      "eval_samples_per_second": 442.123,
      "eval_steps_per_second": 55.707,
      "num_input_tokens_seen": 8192000,
      "step": 250
    },
    {
      "epoch": 0.27666932694865654,
      "grad_norm": 1.2192944288253784,
      "learning_rate": 2.72085201793722e-05,
      "loss": 0.0562,
      "num_input_tokens_seen": 8519680,
      "step": 260
    },
    {
      "epoch": 0.28731045490822027,
      "grad_norm": 0.7389326691627502,
      "learning_rate": 2.7040358744394622e-05,
      "loss": 0.0435,
      "num_input_tokens_seen": 8847360,
      "step": 270
    },
    {
      "epoch": 0.297951582867784,
      "grad_norm": 1.691129446029663,
      "learning_rate": 2.687219730941704e-05,
      "loss": 0.0256,
      "num_input_tokens_seen": 9175040,
      "step": 280
    },
    {
      "epoch": 0.3085927108273477,
      "grad_norm": 0.20158784091472626,
      "learning_rate": 2.6704035874439464e-05,
      "loss": 0.08,
      "num_input_tokens_seen": 9502720,
      "step": 290
    },
    {
      "epoch": 0.3192338387869114,
      "grad_norm": 0.4045298099517822,
      "learning_rate": 2.6535874439461886e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 9830400,
      "step": 300
    },
    {
      "epoch": 0.32987496674647515,
      "grad_norm": 5.865575313568115,
      "learning_rate": 2.6367713004484305e-05,
      "loss": 0.0701,
      "num_input_tokens_seen": 10158080,
      "step": 310
    },
    {
      "epoch": 0.34051609470603883,
      "grad_norm": 12.122817993164062,
      "learning_rate": 2.6199551569506727e-05,
      "loss": 0.1398,
      "num_input_tokens_seen": 10485760,
      "step": 320
    },
    {
      "epoch": 0.35115722266560256,
      "grad_norm": 0.43689683079719543,
      "learning_rate": 2.6031390134529146e-05,
      "loss": 0.0645,
      "num_input_tokens_seen": 10813440,
      "step": 330
    },
    {
      "epoch": 0.36179835062516624,
      "grad_norm": 0.1345166265964508,
      "learning_rate": 2.586322869955157e-05,
      "loss": 0.0394,
      "num_input_tokens_seen": 11141120,
      "step": 340
    },
    {
      "epoch": 0.37243947858473,
      "grad_norm": 0.5597580075263977,
      "learning_rate": 2.5695067264573994e-05,
      "loss": 0.0534,
      "num_input_tokens_seen": 11468800,
      "step": 350
    },
    {
      "epoch": 0.3830806065442937,
      "grad_norm": 1.6686193943023682,
      "learning_rate": 2.5526905829596413e-05,
      "loss": 0.0499,
      "num_input_tokens_seen": 11796480,
      "step": 360
    },
    {
      "epoch": 0.3937217345038574,
      "grad_norm": 0.08618992567062378,
      "learning_rate": 2.5358744394618835e-05,
      "loss": 0.0312,
      "num_input_tokens_seen": 12124160,
      "step": 370
    },
    {
      "epoch": 0.4043628624634211,
      "grad_norm": 0.07978615164756775,
      "learning_rate": 2.5190582959641258e-05,
      "loss": 0.0488,
      "num_input_tokens_seen": 12451840,
      "step": 380
    },
    {
      "epoch": 0.41500399042298486,
      "grad_norm": 2.9216437339782715,
      "learning_rate": 2.5022421524663677e-05,
      "loss": 0.0281,
      "num_input_tokens_seen": 12779520,
      "step": 390
    },
    {
      "epoch": 0.42564511838254854,
      "grad_norm": 2.1254470348358154,
      "learning_rate": 2.48542600896861e-05,
      "loss": 0.044,
      "num_input_tokens_seen": 13107200,
      "step": 400
    },
    {
      "epoch": 0.43628624634211227,
      "grad_norm": 0.1027815118432045,
      "learning_rate": 2.468609865470852e-05,
      "loss": 0.0278,
      "num_input_tokens_seen": 13434880,
      "step": 410
    },
    {
      "epoch": 0.44692737430167595,
      "grad_norm": 0.15135648846626282,
      "learning_rate": 2.451793721973094e-05,
      "loss": 0.0448,
      "num_input_tokens_seen": 13762560,
      "step": 420
    },
    {
      "epoch": 0.4575685022612397,
      "grad_norm": 0.09930180758237839,
      "learning_rate": 2.4349775784753363e-05,
      "loss": 0.0294,
      "num_input_tokens_seen": 14090240,
      "step": 430
    },
    {
      "epoch": 0.4682096302208034,
      "grad_norm": 0.37529394030570984,
      "learning_rate": 2.4181614349775788e-05,
      "loss": 0.0437,
      "num_input_tokens_seen": 14417920,
      "step": 440
    },
    {
      "epoch": 0.4788507581803671,
      "grad_norm": 0.0906977429986,
      "learning_rate": 2.4013452914798207e-05,
      "loss": 0.0276,
      "num_input_tokens_seen": 14745600,
      "step": 450
    },
    {
      "epoch": 0.48949188613993083,
      "grad_norm": 2.0479931831359863,
      "learning_rate": 2.384529147982063e-05,
      "loss": 0.0638,
      "num_input_tokens_seen": 15073280,
      "step": 460
    },
    {
      "epoch": 0.5001330140994945,
      "grad_norm": 0.427298903465271,
      "learning_rate": 2.367713004484305e-05,
      "loss": 0.0333,
      "num_input_tokens_seen": 15400960,
      "step": 470
    },
    {
      "epoch": 0.5107741420590582,
      "grad_norm": 0.6889400482177734,
      "learning_rate": 2.350896860986547e-05,
      "loss": 0.0225,
      "num_input_tokens_seen": 15728640,
      "step": 480
    },
    {
      "epoch": 0.521415270018622,
      "grad_norm": 0.06804540008306503,
      "learning_rate": 2.3340807174887893e-05,
      "loss": 0.0285,
      "num_input_tokens_seen": 16056320,
      "step": 490
    },
    {
      "epoch": 0.5320563979781857,
      "grad_norm": 0.20838595926761627,
      "learning_rate": 2.3172645739910312e-05,
      "loss": 0.0141,
      "num_input_tokens_seen": 16384000,
      "step": 500
    },
    {
      "epoch": 0.5320563979781857,
      "eval_accuracy": 0.99,
      "eval_loss": 0.033007875084877014,
      "eval_runtime": 1.1242,
      "eval_samples_per_second": 444.771,
      "eval_steps_per_second": 56.041,
      "num_input_tokens_seen": 16384000,
      "step": 500
    },
    {
      "epoch": 0.5426975259377494,
      "grad_norm": 0.09140049666166306,
      "learning_rate": 2.3004484304932734e-05,
      "loss": 0.019,
      "num_input_tokens_seen": 16711680,
      "step": 510
    },
    {
      "epoch": 0.5533386538973131,
      "grad_norm": 0.06261716037988663,
      "learning_rate": 2.283632286995516e-05,
      "loss": 0.0355,
      "num_input_tokens_seen": 17039360,
      "step": 520
    },
    {
      "epoch": 0.5639797818568768,
      "grad_norm": 2.4450674057006836,
      "learning_rate": 2.266816143497758e-05,
      "loss": 0.031,
      "num_input_tokens_seen": 17367040,
      "step": 530
    },
    {
      "epoch": 0.5746209098164405,
      "grad_norm": 1.1212217807769775,
      "learning_rate": 2.25e-05,
      "loss": 0.0265,
      "num_input_tokens_seen": 17694720,
      "step": 540
    },
    {
      "epoch": 0.5852620377760043,
      "grad_norm": 0.638861358165741,
      "learning_rate": 2.2331838565022424e-05,
      "loss": 0.041,
      "num_input_tokens_seen": 18022400,
      "step": 550
    },
    {
      "epoch": 0.595903165735568,
      "grad_norm": 0.8384909629821777,
      "learning_rate": 2.2163677130044843e-05,
      "loss": 0.0377,
      "num_input_tokens_seen": 18350080,
      "step": 560
    },
    {
      "epoch": 0.6065442936951316,
      "grad_norm": 2.6054413318634033,
      "learning_rate": 2.1995515695067265e-05,
      "loss": 0.0621,
      "num_input_tokens_seen": 18677760,
      "step": 570
    },
    {
      "epoch": 0.6171854216546954,
      "grad_norm": 0.05188291519880295,
      "learning_rate": 2.1827354260089687e-05,
      "loss": 0.0089,
      "num_input_tokens_seen": 19005440,
      "step": 580
    },
    {
      "epoch": 0.6278265496142591,
      "grad_norm": 6.18527889251709,
      "learning_rate": 2.1659192825112106e-05,
      "loss": 0.0623,
      "num_input_tokens_seen": 19333120,
      "step": 590
    },
    {
      "epoch": 0.6384676775738228,
      "grad_norm": 4.499662399291992,
      "learning_rate": 2.149103139013453e-05,
      "loss": 0.0413,
      "num_input_tokens_seen": 19660800,
      "step": 600
    },
    {
      "epoch": 0.6491088055333866,
      "grad_norm": 0.06525593250989914,
      "learning_rate": 2.1322869955156954e-05,
      "loss": 0.0268,
      "num_input_tokens_seen": 19988480,
      "step": 610
    },
    {
      "epoch": 0.6597499334929503,
      "grad_norm": 0.7937769889831543,
      "learning_rate": 2.1154708520179373e-05,
      "loss": 0.0294,
      "num_input_tokens_seen": 20316160,
      "step": 620
    },
    {
      "epoch": 0.6703910614525139,
      "grad_norm": 0.42232292890548706,
      "learning_rate": 2.0986547085201796e-05,
      "loss": 0.0086,
      "num_input_tokens_seen": 20643840,
      "step": 630
    },
    {
      "epoch": 0.6810321894120777,
      "grad_norm": 0.23680944740772247,
      "learning_rate": 2.0818385650224215e-05,
      "loss": 0.0182,
      "num_input_tokens_seen": 20971520,
      "step": 640
    },
    {
      "epoch": 0.6916733173716414,
      "grad_norm": 0.8892483115196228,
      "learning_rate": 2.0650224215246637e-05,
      "loss": 0.0158,
      "num_input_tokens_seen": 21299200,
      "step": 650
    },
    {
      "epoch": 0.7023144453312051,
      "grad_norm": 9.271723747253418,
      "learning_rate": 2.048206278026906e-05,
      "loss": 0.0332,
      "num_input_tokens_seen": 21626880,
      "step": 660
    },
    {
      "epoch": 0.7129555732907689,
      "grad_norm": 0.681903600692749,
      "learning_rate": 2.0313901345291478e-05,
      "loss": 0.0402,
      "num_input_tokens_seen": 21954560,
      "step": 670
    },
    {
      "epoch": 0.7235967012503325,
      "grad_norm": 2.4827804565429688,
      "learning_rate": 2.01457399103139e-05,
      "loss": 0.0297,
      "num_input_tokens_seen": 22282240,
      "step": 680
    },
    {
      "epoch": 0.7342378292098962,
      "grad_norm": 2.727994203567505,
      "learning_rate": 1.9977578475336323e-05,
      "loss": 0.027,
      "num_input_tokens_seen": 22609920,
      "step": 690
    },
    {
      "epoch": 0.74487895716946,
      "grad_norm": 1.978765845298767,
      "learning_rate": 1.9809417040358745e-05,
      "loss": 0.0279,
      "num_input_tokens_seen": 22937600,
      "step": 700
    },
    {
      "epoch": 0.7555200851290237,
      "grad_norm": 2.512544870376587,
      "learning_rate": 1.9641255605381167e-05,
      "loss": 0.0323,
      "num_input_tokens_seen": 23265280,
      "step": 710
    },
    {
      "epoch": 0.7661612130885874,
      "grad_norm": 5.157982349395752,
      "learning_rate": 1.947309417040359e-05,
      "loss": 0.0514,
      "num_input_tokens_seen": 23592960,
      "step": 720
    },
    {
      "epoch": 0.7768023410481512,
      "grad_norm": 0.037381790578365326,
      "learning_rate": 1.930493273542601e-05,
      "loss": 0.0077,
      "num_input_tokens_seen": 23920640,
      "step": 730
    },
    {
      "epoch": 0.7874434690077148,
      "grad_norm": 1.0004149675369263,
      "learning_rate": 1.913677130044843e-05,
      "loss": 0.0315,
      "num_input_tokens_seen": 24248320,
      "step": 740
    },
    {
      "epoch": 0.7980845969672785,
      "grad_norm": 0.046527761965990067,
      "learning_rate": 1.8968609865470853e-05,
      "loss": 0.02,
      "num_input_tokens_seen": 24576000,
      "step": 750
    },
    {
      "epoch": 0.7980845969672785,
      "eval_accuracy": 0.99,
      "eval_loss": 0.02980552613735199,
      "eval_runtime": 1.1295,
      "eval_samples_per_second": 442.672,
      "eval_steps_per_second": 55.777,
      "num_input_tokens_seen": 24576000,
      "step": 750
    },
    {
      "epoch": 0.8087257249268422,
      "grad_norm": 0.3098304867744446,
      "learning_rate": 1.8800448430493272e-05,
      "loss": 0.02,
      "num_input_tokens_seen": 24903680,
      "step": 760
    },
    {
      "epoch": 0.819366852886406,
      "grad_norm": 1.8411376476287842,
      "learning_rate": 1.8632286995515695e-05,
      "loss": 0.0219,
      "num_input_tokens_seen": 25231360,
      "step": 770
    },
    {
      "epoch": 0.8300079808459697,
      "grad_norm": 0.6672658920288086,
      "learning_rate": 1.8464125560538117e-05,
      "loss": 0.0236,
      "num_input_tokens_seen": 25559040,
      "step": 780
    },
    {
      "epoch": 0.8406491088055333,
      "grad_norm": 0.15667960047721863,
      "learning_rate": 1.829596412556054e-05,
      "loss": 0.0373,
      "num_input_tokens_seen": 25886720,
      "step": 790
    },
    {
      "epoch": 0.8512902367650971,
      "grad_norm": 0.039243053644895554,
      "learning_rate": 1.812780269058296e-05,
      "loss": 0.0118,
      "num_input_tokens_seen": 26214400,
      "step": 800
    },
    {
      "epoch": 0.8619313647246608,
      "grad_norm": 0.9345981478691101,
      "learning_rate": 1.795964125560538e-05,
      "loss": 0.0322,
      "num_input_tokens_seen": 26542080,
      "step": 810
    },
    {
      "epoch": 0.8725724926842245,
      "grad_norm": 0.06790352612733841,
      "learning_rate": 1.7791479820627803e-05,
      "loss": 0.0097,
      "num_input_tokens_seen": 26869760,
      "step": 820
    },
    {
      "epoch": 0.8832136206437883,
      "grad_norm": 0.065700002014637,
      "learning_rate": 1.7623318385650225e-05,
      "loss": 0.0188,
      "num_input_tokens_seen": 27197440,
      "step": 830
    },
    {
      "epoch": 0.8938547486033519,
      "grad_norm": 3.7558648586273193,
      "learning_rate": 1.7455156950672644e-05,
      "loss": 0.0253,
      "num_input_tokens_seen": 27525120,
      "step": 840
    },
    {
      "epoch": 0.9044958765629156,
      "grad_norm": 4.746110916137695,
      "learning_rate": 1.7286995515695067e-05,
      "loss": 0.0171,
      "num_input_tokens_seen": 27852800,
      "step": 850
    },
    {
      "epoch": 0.9151370045224794,
      "grad_norm": 0.26326820254325867,
      "learning_rate": 1.711883408071749e-05,
      "loss": 0.0236,
      "num_input_tokens_seen": 28180480,
      "step": 860
    },
    {
      "epoch": 0.9257781324820431,
      "grad_norm": 0.10672000050544739,
      "learning_rate": 1.695067264573991e-05,
      "loss": 0.0085,
      "num_input_tokens_seen": 28508160,
      "step": 870
    },
    {
      "epoch": 0.9364192604416068,
      "grad_norm": 0.16295024752616882,
      "learning_rate": 1.6782511210762334e-05,
      "loss": 0.0137,
      "num_input_tokens_seen": 28835840,
      "step": 880
    },
    {
      "epoch": 0.9470603884011706,
      "grad_norm": 4.8795857429504395,
      "learning_rate": 1.6614349775784756e-05,
      "loss": 0.0305,
      "num_input_tokens_seen": 29163520,
      "step": 890
    },
    {
      "epoch": 0.9577015163607342,
      "grad_norm": 0.06518769264221191,
      "learning_rate": 1.6446188340807175e-05,
      "loss": 0.0117,
      "num_input_tokens_seen": 29491200,
      "step": 900
    },
    {
      "epoch": 0.9683426443202979,
      "grad_norm": 1.4961518049240112,
      "learning_rate": 1.6278026905829597e-05,
      "loss": 0.0359,
      "num_input_tokens_seen": 29818880,
      "step": 910
    },
    {
      "epoch": 0.9789837722798617,
      "grad_norm": 1.2783812284469604,
      "learning_rate": 1.610986547085202e-05,
      "loss": 0.0405,
      "num_input_tokens_seen": 30146560,
      "step": 920
    },
    {
      "epoch": 0.9896249002394254,
      "grad_norm": 0.15925170481204987,
      "learning_rate": 1.594170403587444e-05,
      "loss": 0.0356,
      "num_input_tokens_seen": 30474240,
      "step": 930
    },
    {
      "epoch": 1.000266028198989,
      "grad_norm": 1.536391019821167,
      "learning_rate": 1.577354260089686e-05,
      "loss": 0.0159,
      "num_input_tokens_seen": 30799872,
      "step": 940
    },
    {
      "epoch": 1.0109071561585528,
      "grad_norm": 0.04294372722506523,
      "learning_rate": 1.560538116591928e-05,
      "loss": 0.0437,
      "num_input_tokens_seen": 31127552,
      "step": 950
    },
    {
      "epoch": 1.0215482841181165,
      "grad_norm": 0.13462825119495392,
      "learning_rate": 1.5437219730941705e-05,
      "loss": 0.0129,
      "num_input_tokens_seen": 31455232,
      "step": 960
    },
    {
      "epoch": 1.0321894120776802,
      "grad_norm": 0.03951927274465561,
      "learning_rate": 1.5269058295964128e-05,
      "loss": 0.017,
      "num_input_tokens_seen": 31782912,
      "step": 970
    },
    {
      "epoch": 1.042830540037244,
      "grad_norm": 0.12142454832792282,
      "learning_rate": 1.5100896860986547e-05,
      "loss": 0.0207,
      "num_input_tokens_seen": 32110592,
      "step": 980
    },
    {
      "epoch": 1.0534716679968077,
      "grad_norm": 0.11652370542287827,
      "learning_rate": 1.4932735426008969e-05,
      "loss": 0.0176,
      "num_input_tokens_seen": 32438272,
      "step": 990
    },
    {
      "epoch": 1.0641127959563714,
      "grad_norm": 4.033369064331055,
      "learning_rate": 1.476457399103139e-05,
      "loss": 0.0085,
      "num_input_tokens_seen": 32765952,
      "step": 1000
    },
    {
      "epoch": 1.0641127959563714,
      "eval_accuracy": 0.994,
      "eval_loss": 0.022239448502659798,
      "eval_runtime": 1.1241,
      "eval_samples_per_second": 444.814,
      "eval_steps_per_second": 56.047,
      "num_input_tokens_seen": 32765952,
      "step": 1000
    },
    {
      "epoch": 1.0747539239159352,
      "grad_norm": 0.10022466629743576,
      "learning_rate": 1.4596412556053812e-05,
      "loss": 0.0196,
      "num_input_tokens_seen": 33093632,
      "step": 1010
    },
    {
      "epoch": 1.085395051875499,
      "grad_norm": 0.0608280785381794,
      "learning_rate": 1.4428251121076234e-05,
      "loss": 0.0244,
      "num_input_tokens_seen": 33421312,
      "step": 1020
    },
    {
      "epoch": 1.0960361798350626,
      "grad_norm": 0.6638007164001465,
      "learning_rate": 1.4260089686098655e-05,
      "loss": 0.0049,
      "num_input_tokens_seen": 33748992,
      "step": 1030
    },
    {
      "epoch": 1.1066773077946261,
      "grad_norm": 0.17382824420928955,
      "learning_rate": 1.4091928251121077e-05,
      "loss": 0.0106,
      "num_input_tokens_seen": 34076672,
      "step": 1040
    },
    {
      "epoch": 1.1173184357541899,
      "grad_norm": 0.10657654702663422,
      "learning_rate": 1.3923766816143498e-05,
      "loss": 0.0381,
      "num_input_tokens_seen": 34404352,
      "step": 1050
    },
    {
      "epoch": 1.1279595637137536,
      "grad_norm": 0.7529979348182678,
      "learning_rate": 1.375560538116592e-05,
      "loss": 0.0235,
      "num_input_tokens_seen": 34732032,
      "step": 1060
    },
    {
      "epoch": 1.1386006916733173,
      "grad_norm": 0.07195574790239334,
      "learning_rate": 1.358744394618834e-05,
      "loss": 0.0173,
      "num_input_tokens_seen": 35059712,
      "step": 1070
    },
    {
      "epoch": 1.149241819632881,
      "grad_norm": 0.8922456502914429,
      "learning_rate": 1.3419282511210763e-05,
      "loss": 0.0201,
      "num_input_tokens_seen": 35387392,
      "step": 1080
    },
    {
      "epoch": 1.1598829475924448,
      "grad_norm": 0.2780587375164032,
      "learning_rate": 1.3251121076233184e-05,
      "loss": 0.0071,
      "num_input_tokens_seen": 35715072,
      "step": 1090
    },
    {
      "epoch": 1.1705240755520085,
      "grad_norm": 0.014401647262275219,
      "learning_rate": 1.3082959641255604e-05,
      "loss": 0.0025,
      "num_input_tokens_seen": 36042752,
      "step": 1100
    },
    {
      "epoch": 1.1811652035115723,
      "grad_norm": 0.07402833551168442,
      "learning_rate": 1.2914798206278028e-05,
      "loss": 0.0038,
      "num_input_tokens_seen": 36370432,
      "step": 1110
    },
    {
      "epoch": 1.191806331471136,
      "grad_norm": 0.035160522907972336,
      "learning_rate": 1.2746636771300449e-05,
      "loss": 0.0221,
      "num_input_tokens_seen": 36698112,
      "step": 1120
    },
    {
      "epoch": 1.2024474594306997,
      "grad_norm": 0.23754417896270752,
      "learning_rate": 1.257847533632287e-05,
      "loss": 0.0044,
      "num_input_tokens_seen": 37025792,
      "step": 1130
    },
    {
      "epoch": 1.2130885873902635,
      "grad_norm": 0.07629762589931488,
      "learning_rate": 1.241031390134529e-05,
      "loss": 0.0119,
      "num_input_tokens_seen": 37353472,
      "step": 1140
    },
    {
      "epoch": 1.223729715349827,
      "grad_norm": 0.23725423216819763,
      "learning_rate": 1.2242152466367714e-05,
      "loss": 0.0279,
      "num_input_tokens_seen": 37681152,
      "step": 1150
    },
    {
      "epoch": 1.2343708433093907,
      "grad_norm": 1.0171340703964233,
      "learning_rate": 1.2073991031390135e-05,
      "loss": 0.0531,
      "num_input_tokens_seen": 38008832,
      "step": 1160
    },
    {
      "epoch": 1.2450119712689545,
      "grad_norm": 0.016075875610113144,
      "learning_rate": 1.1905829596412556e-05,
      "loss": 0.0261,
      "num_input_tokens_seen": 38336512,
      "step": 1170
    },
    {
      "epoch": 1.2556530992285182,
      "grad_norm": 0.8257108330726624,
      "learning_rate": 1.1737668161434978e-05,
      "loss": 0.0166,
      "num_input_tokens_seen": 38664192,
      "step": 1180
    },
    {
      "epoch": 1.266294227188082,
      "grad_norm": 0.0884622186422348,
      "learning_rate": 1.15695067264574e-05,
      "loss": 0.0077,
      "num_input_tokens_seen": 38991872,
      "step": 1190
    },
    {
      "epoch": 1.2769353551476457,
      "grad_norm": 0.101267971098423,
      "learning_rate": 1.1401345291479821e-05,
      "loss": 0.019,
      "num_input_tokens_seen": 39319552,
      "step": 1200
    },
    {
      "epoch": 1.2875764831072094,
      "grad_norm": 2.194119691848755,
      "learning_rate": 1.1233183856502243e-05,
      "loss": 0.0131,
      "num_input_tokens_seen": 39647232,
      "step": 1210
    },
    {
      "epoch": 1.2982176110667731,
      "grad_norm": 2.7684483528137207,
      "learning_rate": 1.1065022421524664e-05,
      "loss": 0.0076,
      "num_input_tokens_seen": 39974912,
      "step": 1220
    },
    {
      "epoch": 1.3088587390263369,
      "grad_norm": 2.1547205448150635,
      "learning_rate": 1.0896860986547085e-05,
      "loss": 0.0242,
      "num_input_tokens_seen": 40302592,
      "step": 1230
    },
    {
      "epoch": 1.3194998669859004,
      "grad_norm": 0.39225855469703674,
      "learning_rate": 1.0728699551569507e-05,
      "loss": 0.013,
      "num_input_tokens_seen": 40630272,
      "step": 1240
    },
    {
      "epoch": 1.3301409949454643,
      "grad_norm": 0.12444789707660675,
      "learning_rate": 1.056053811659193e-05,
      "loss": 0.0174,
      "num_input_tokens_seen": 40957952,
      "step": 1250
    },
    {
      "epoch": 1.3301409949454643,
      "eval_accuracy": 0.994,
      "eval_loss": 0.020717209205031395,
      "eval_runtime": 1.1258,
      "eval_samples_per_second": 444.121,
      "eval_steps_per_second": 55.959,
      "num_input_tokens_seen": 40957952,
      "step": 1250
    },
    {
      "epoch": 1.3407821229050279,
      "grad_norm": 0.224708691239357,
      "learning_rate": 1.039237668161435e-05,
      "loss": 0.0087,
      "num_input_tokens_seen": 41285632,
      "step": 1260
    },
    {
      "epoch": 1.3514232508645916,
      "grad_norm": 0.08499462902545929,
      "learning_rate": 1.022421524663677e-05,
      "loss": 0.0182,
      "num_input_tokens_seen": 41613312,
      "step": 1270
    },
    {
      "epoch": 1.3620643788241553,
      "grad_norm": 0.05140333250164986,
      "learning_rate": 1.0056053811659195e-05,
      "loss": 0.0034,
      "num_input_tokens_seen": 41940992,
      "step": 1280
    },
    {
      "epoch": 1.372705506783719,
      "grad_norm": 0.05546234920620918,
      "learning_rate": 9.887892376681615e-06,
      "loss": 0.0117,
      "num_input_tokens_seen": 42268672,
      "step": 1290
    },
    {
      "epoch": 1.3833466347432828,
      "grad_norm": 0.029206566512584686,
      "learning_rate": 9.719730941704036e-06,
      "loss": 0.0179,
      "num_input_tokens_seen": 42596352,
      "step": 1300
    },
    {
      "epoch": 1.3939877627028465,
      "grad_norm": 0.3235812485218048,
      "learning_rate": 9.551569506726456e-06,
      "loss": 0.0333,
      "num_input_tokens_seen": 42924032,
      "step": 1310
    },
    {
      "epoch": 1.4046288906624103,
      "grad_norm": 4.916908264160156,
      "learning_rate": 9.38340807174888e-06,
      "loss": 0.0167,
      "num_input_tokens_seen": 43251712,
      "step": 1320
    },
    {
      "epoch": 1.415270018621974,
      "grad_norm": 0.10124430060386658,
      "learning_rate": 9.215246636771301e-06,
      "loss": 0.0299,
      "num_input_tokens_seen": 43579392,
      "step": 1330
    },
    {
      "epoch": 1.4259111465815377,
      "grad_norm": 0.09930448234081268,
      "learning_rate": 9.047085201793722e-06,
      "loss": 0.0112,
      "num_input_tokens_seen": 43907072,
      "step": 1340
    },
    {
      "epoch": 1.4365522745411012,
      "grad_norm": 0.1370278298854828,
      "learning_rate": 8.878923766816144e-06,
      "loss": 0.0105,
      "num_input_tokens_seen": 44234752,
      "step": 1350
    },
    {
      "epoch": 1.4471934025006652,
      "grad_norm": 1.9884629249572754,
      "learning_rate": 8.710762331838565e-06,
      "loss": 0.0093,
      "num_input_tokens_seen": 44562432,
      "step": 1360
    },
    {
      "epoch": 1.4578345304602287,
      "grad_norm": 0.768826961517334,
      "learning_rate": 8.542600896860987e-06,
      "loss": 0.0297,
      "num_input_tokens_seen": 44890112,
      "step": 1370
    },
    {
      "epoch": 1.4684756584197924,
      "grad_norm": 0.08758696168661118,
      "learning_rate": 8.374439461883408e-06,
      "loss": 0.0234,
      "num_input_tokens_seen": 45217792,
      "step": 1380
    },
    {
      "epoch": 1.4791167863793562,
      "grad_norm": 0.1405934989452362,
      "learning_rate": 8.20627802690583e-06,
      "loss": 0.0072,
      "num_input_tokens_seen": 45545472,
      "step": 1390
    },
    {
      "epoch": 1.48975791433892,
      "grad_norm": 0.32703763246536255,
      "learning_rate": 8.03811659192825e-06,
      "loss": 0.0023,
      "num_input_tokens_seen": 45873152,
      "step": 1400
    },
    {
      "epoch": 1.5003990422984836,
      "grad_norm": 0.8952039480209351,
      "learning_rate": 7.869955156950673e-06,
      "loss": 0.0183,
      "num_input_tokens_seen": 46200832,
      "step": 1410
    },
    {
      "epoch": 1.5110401702580474,
      "grad_norm": 0.2962280213832855,
      "learning_rate": 7.701793721973095e-06,
      "loss": 0.0013,
      "num_input_tokens_seen": 46528512,
      "step": 1420
    },
    {
      "epoch": 1.5216812982176111,
      "grad_norm": 2.0377979278564453,
      "learning_rate": 7.533632286995516e-06,
      "loss": 0.0195,
      "num_input_tokens_seen": 46856192,
      "step": 1430
    },
    {
      "epoch": 1.5323224261771746,
      "grad_norm": 0.08011902123689651,
      "learning_rate": 7.365470852017937e-06,
      "loss": 0.0065,
      "num_input_tokens_seen": 47183872,
      "step": 1440
    },
    {
      "epoch": 1.5429635541367386,
      "grad_norm": 0.07826100289821625,
      "learning_rate": 7.197309417040359e-06,
      "loss": 0.0203,
      "num_input_tokens_seen": 47511552,
      "step": 1450
    },
    {
      "epoch": 1.553604682096302,
      "grad_norm": 0.08626201748847961,
      "learning_rate": 7.02914798206278e-06,
      "loss": 0.0123,
      "num_input_tokens_seen": 47839232,
      "step": 1460
    },
    {
      "epoch": 1.564245810055866,
      "grad_norm": 1.227737545967102,
      "learning_rate": 6.860986547085202e-06,
      "loss": 0.0159,
      "num_input_tokens_seen": 48166912,
      "step": 1470
    },
    {
      "epoch": 1.5748869380154296,
      "grad_norm": 0.45808491110801697,
      "learning_rate": 6.692825112107623e-06,
      "loss": 0.0182,
      "num_input_tokens_seen": 48494592,
      "step": 1480
    },
    {
      "epoch": 1.5855280659749933,
      "grad_norm": 0.19725441932678223,
      "learning_rate": 6.524663677130045e-06,
      "loss": 0.011,
      "num_input_tokens_seen": 48822272,
      "step": 1490
    },
    {
      "epoch": 1.596169193934557,
      "grad_norm": 0.11997473984956741,
      "learning_rate": 6.356502242152466e-06,
      "loss": 0.0104,
      "num_input_tokens_seen": 49149952,
      "step": 1500
    },
    {
      "epoch": 1.596169193934557,
      "eval_accuracy": 0.996,
      "eval_loss": 0.02015475556254387,
      "eval_runtime": 1.1247,
      "eval_samples_per_second": 444.581,
      "eval_steps_per_second": 56.017,
      "num_input_tokens_seen": 49149952,
      "step": 1500
    },
    {
      "epoch": 1.6068103218941208,
      "grad_norm": 0.08161328732967377,
      "learning_rate": 6.188340807174889e-06,
      "loss": 0.011,
      "num_input_tokens_seen": 49477632,
      "step": 1510
    },
    {
      "epoch": 1.6174514498536845,
      "grad_norm": 0.04879956319928169,
      "learning_rate": 6.020179372197309e-06,
      "loss": 0.0034,
      "num_input_tokens_seen": 49805312,
      "step": 1520
    },
    {
      "epoch": 1.6280925778132482,
      "grad_norm": 0.2356010526418686,
      "learning_rate": 5.8520179372197316e-06,
      "loss": 0.0305,
      "num_input_tokens_seen": 50132992,
      "step": 1530
    },
    {
      "epoch": 1.638733705772812,
      "grad_norm": 0.08499031513929367,
      "learning_rate": 5.683856502242152e-06,
      "loss": 0.0106,
      "num_input_tokens_seen": 50460672,
      "step": 1540
    },
    {
      "epoch": 1.6493748337323755,
      "grad_norm": 0.10495586693286896,
      "learning_rate": 5.5156950672645745e-06,
      "loss": 0.012,
      "num_input_tokens_seen": 50788352,
      "step": 1550
    },
    {
      "epoch": 1.6600159616919394,
      "grad_norm": 0.09235712140798569,
      "learning_rate": 5.347533632286995e-06,
      "loss": 0.0017,
      "num_input_tokens_seen": 51116032,
      "step": 1560
    },
    {
      "epoch": 1.670657089651503,
      "grad_norm": 0.04202970489859581,
      "learning_rate": 5.1793721973094175e-06,
      "loss": 0.0172,
      "num_input_tokens_seen": 51443712,
      "step": 1570
    },
    {
      "epoch": 1.681298217611067,
      "grad_norm": 3.6560862064361572,
      "learning_rate": 5.011210762331839e-06,
      "loss": 0.0259,
      "num_input_tokens_seen": 51771392,
      "step": 1580
    },
    {
      "epoch": 1.6919393455706304,
      "grad_norm": 0.20075471699237823,
      "learning_rate": 4.8430493273542605e-06,
      "loss": 0.0144,
      "num_input_tokens_seen": 52099072,
      "step": 1590
    },
    {
      "epoch": 1.7025804735301941,
      "grad_norm": 0.14858105778694153,
      "learning_rate": 4.674887892376682e-06,
      "loss": 0.0099,
      "num_input_tokens_seen": 52426752,
      "step": 1600
    },
    {
      "epoch": 1.7132216014897579,
      "grad_norm": 0.08154450356960297,
      "learning_rate": 4.506726457399103e-06,
      "loss": 0.0155,
      "num_input_tokens_seen": 52754432,
      "step": 1610
    },
    {
      "epoch": 1.7238627294493216,
      "grad_norm": 0.030162209644913673,
      "learning_rate": 4.338565022421525e-06,
      "loss": 0.0087,
      "num_input_tokens_seen": 53082112,
      "step": 1620
    },
    {
      "epoch": 1.7345038574088854,
      "grad_norm": 0.058421239256858826,
      "learning_rate": 4.170403587443946e-06,
      "loss": 0.0205,
      "num_input_tokens_seen": 53409792,
      "step": 1630
    },
    {
      "epoch": 1.745144985368449,
      "grad_norm": 0.9610540270805359,
      "learning_rate": 4.002242152466368e-06,
      "loss": 0.0084,
      "num_input_tokens_seen": 53737472,
      "step": 1640
    },
    {
      "epoch": 1.7557861133280128,
      "grad_norm": 0.3001765310764313,
      "learning_rate": 3.834080717488789e-06,
      "loss": 0.0154,
      "num_input_tokens_seen": 54065152,
      "step": 1650
    },
    {
      "epoch": 1.7664272412875763,
      "grad_norm": 0.07005713880062103,
      "learning_rate": 3.665919282511211e-06,
      "loss": 0.0166,
      "num_input_tokens_seen": 54392832,
      "step": 1660
    },
    {
      "epoch": 1.7770683692471403,
      "grad_norm": 0.044125888496637344,
      "learning_rate": 3.4977578475336323e-06,
      "loss": 0.0016,
      "num_input_tokens_seen": 54720512,
      "step": 1670
    },
    {
      "epoch": 1.7877094972067038,
      "grad_norm": 1.5570340156555176,
      "learning_rate": 3.329596412556054e-06,
      "loss": 0.0208,
      "num_input_tokens_seen": 55048192,
      "step": 1680
    },
    {
      "epoch": 1.7983506251662678,
      "grad_norm": 0.12797504663467407,
      "learning_rate": 3.1614349775784753e-06,
      "loss": 0.0127,
      "num_input_tokens_seen": 55375872,
      "step": 1690
    },
    {
      "epoch": 1.8089917531258313,
      "grad_norm": 0.12429122626781464,
      "learning_rate": 2.9932735426008968e-06,
      "loss": 0.0015,
      "num_input_tokens_seen": 55703552,
      "step": 1700
    },
    {
      "epoch": 1.819632881085395,
      "grad_norm": 0.15149074792861938,
      "learning_rate": 2.8251121076233182e-06,
      "loss": 0.0083,
      "num_input_tokens_seen": 56031232,
      "step": 1710
    },
    {
      "epoch": 1.8302740090449587,
      "grad_norm": 0.10725903511047363,
      "learning_rate": 2.65695067264574e-06,
      "loss": 0.0071,
      "num_input_tokens_seen": 56358912,
      "step": 1720
    },
    {
      "epoch": 1.8409151370045225,
      "grad_norm": 0.1267658919095993,
      "learning_rate": 2.4887892376681616e-06,
      "loss": 0.0087,
      "num_input_tokens_seen": 56686592,
      "step": 1730
    },
    {
      "epoch": 1.8515562649640862,
      "grad_norm": 0.35703355073928833,
      "learning_rate": 2.320627802690583e-06,
      "loss": 0.0068,
      "num_input_tokens_seen": 57014272,
      "step": 1740
    },
    {
      "epoch": 1.86219739292365,
      "grad_norm": 0.7102775573730469,
      "learning_rate": 2.1524663677130046e-06,
      "loss": 0.0237,
      "num_input_tokens_seen": 57341952,
      "step": 1750
    },
    {
      "epoch": 1.86219739292365,
      "eval_accuracy": 0.996,
      "eval_loss": 0.018471572548151016,
      "eval_runtime": 1.1252,
      "eval_samples_per_second": 444.377,
      "eval_steps_per_second": 55.992,
      "num_input_tokens_seen": 57341952,
      "step": 1750
    },
    {
      "epoch": 1.8728385208832137,
      "grad_norm": 0.04301352798938751,
      "learning_rate": 1.984304932735426e-06,
      "loss": 0.0102,
      "num_input_tokens_seen": 57669632,
      "step": 1760
    },
    {
      "epoch": 1.8834796488427772,
      "grad_norm": 0.12998220324516296,
      "learning_rate": 1.8161434977578476e-06,
      "loss": 0.034,
      "num_input_tokens_seen": 57997312,
      "step": 1770
    },
    {
      "epoch": 1.8941207768023411,
      "grad_norm": 0.05428827181458473,
      "learning_rate": 1.647982062780269e-06,
      "loss": 0.0034,
      "num_input_tokens_seen": 58324992,
      "step": 1780
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.031001785770058632,
      "learning_rate": 1.4798206278026905e-06,
      "loss": 0.0201,
      "num_input_tokens_seen": 58652672,
      "step": 1790
    },
    {
      "epoch": 1.9154030327214686,
      "grad_norm": 0.06974712759256363,
      "learning_rate": 1.3116591928251122e-06,
      "loss": 0.0073,
      "num_input_tokens_seen": 58980352,
      "step": 1800
    },
    {
      "epoch": 1.9260441606810321,
      "grad_norm": 0.028872903436422348,
      "learning_rate": 1.1434977578475337e-06,
      "loss": 0.0201,
      "num_input_tokens_seen": 59308032,
      "step": 1810
    },
    {
      "epoch": 1.9366852886405959,
      "grad_norm": 0.04791630432009697,
      "learning_rate": 9.75336322869955e-07,
      "loss": 0.0365,
      "num_input_tokens_seen": 59635712,
      "step": 1820
    },
    {
      "epoch": 1.9473264166001596,
      "grad_norm": 0.9329636096954346,
      "learning_rate": 8.071748878923768e-07,
      "loss": 0.0041,
      "num_input_tokens_seen": 59963392,
      "step": 1830
    },
    {
      "epoch": 1.9579675445597233,
      "grad_norm": 0.2609878182411194,
      "learning_rate": 6.390134529147982e-07,
      "loss": 0.0191,
      "num_input_tokens_seen": 60291072,
      "step": 1840
    },
    {
      "epoch": 1.968608672519287,
      "grad_norm": 1.2760034799575806,
      "learning_rate": 4.7085201793721974e-07,
      "loss": 0.006,
      "num_input_tokens_seen": 60618752,
      "step": 1850
    },
    {
      "epoch": 1.9792498004788508,
      "grad_norm": 0.5698215961456299,
      "learning_rate": 3.026905829596413e-07,
      "loss": 0.0106,
      "num_input_tokens_seen": 60946432,
      "step": 1860
    },
    {
      "epoch": 1.9898909284384145,
      "grad_norm": 0.08324664831161499,
      "learning_rate": 1.345291479820628e-07,
      "loss": 0.0096,
      "num_input_tokens_seen": 61274112,
      "step": 1870
    },
    {
      "epoch": 1.9984038308060654,
      "num_input_tokens_seen": 61536256,
      "step": 1878,
      "total_flos": 3986132331896832.0,
      "train_loss": 0.0478181641492338,
      "train_runtime": 541.4691,
      "train_samples_per_second": 222.136,
      "train_steps_per_second": 3.468
    }
  ],
  "logging_steps": 10,
  "max_steps": 1878,
  "num_input_tokens_seen": 61536256,
  "num_train_epochs": 2,
  "save_steps": 400,
  "total_flos": 3986132331896832.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}