{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 40, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "eval_loss": 0.9384576082229614, "eval_runtime": 0.4222, "eval_samples_per_second": 85.258, "eval_steps_per_second": 11.841, "step": 1 }, { "epoch": 0.25, "grad_norm": 0.06372307986021042, "learning_rate": 8.333333333333334e-05, "loss": 0.2623, "step": 10 }, { "epoch": 0.5, "grad_norm": 0.029395487159490585, "learning_rate": 0.0001666666666666667, "loss": 0.0007, "step": 20 }, { "epoch": 0.75, "grad_norm": 2.7581191062927246, "learning_rate": 0.00019999887622676146, "loss": 0.026, "step": 30 }, { "epoch": 1.0, "grad_norm": 0.48524343967437744, "learning_rate": 0.00019999200881510367, "loss": 0.0292, "step": 40 }, { "epoch": 1.0, "eval_loss": 0.004332332406193018, "eval_runtime": 0.3245, "eval_samples_per_second": 110.942, "eval_steps_per_second": 15.409, "step": 40 }, { "epoch": 1.25, "grad_norm": 43.160125732421875, "learning_rate": 0.00019997889873847797, "loss": 0.1101, "step": 50 }, { "epoch": 1.5, "grad_norm": 71.57295989990234, "learning_rate": 0.00019995954681536798, "loss": 0.0241, "step": 60 }, { "epoch": 1.75, "grad_norm": 0.3996742069721222, "learning_rate": 0.00019993395425394592, "loss": 0.0163, "step": 70 }, { "epoch": 2.0, "grad_norm": 0.061198778450489044, "learning_rate": 0.00019990212265199738, "loss": 0.0148, "step": 80 }, { "epoch": 2.0, "eval_loss": 0.033167850226163864, "eval_runtime": 0.3415, "eval_samples_per_second": 105.418, "eval_steps_per_second": 14.641, "step": 80 }, { "epoch": 2.25, "grad_norm": 2.403183698654175, "learning_rate": 0.0001998640539968214, "loss": 0.01, "step": 90 }, { "epoch": 2.5, "grad_norm": 2.304408550262451, "learning_rate": 0.00019981975066510655, "loss": 0.0435, "step": 100 }, { "epoch": 2.75, "grad_norm": 0.02899610437452793, "learning_rate": 0.00019976921542278237, "loss": 0.0296, "step": 110 }, { "epoch": 3.0, "grad_norm": 3.8328208923339844, "learning_rate": 0.0001997124514248469, "loss": 0.1015, "step": 120 }, { "epoch": 3.0, "eval_loss": 0.00442217942327261, "eval_runtime": 0.3282, "eval_samples_per_second": 109.685, "eval_steps_per_second": 15.234, "step": 120 }, { "epoch": 3.25, "grad_norm": 0.07601974904537201, "learning_rate": 0.00019964946221516953, "loss": 0.0273, "step": 130 }, { "epoch": 3.5, "grad_norm": 0.02951742894947529, "learning_rate": 0.00019958025172626986, "loss": 0.0316, "step": 140 }, { "epoch": 3.75, "grad_norm": 0.09413129091262817, "learning_rate": 0.00019950482427907211, "loss": 0.0071, "step": 150 }, { "epoch": 4.0, "grad_norm": 0.0033842374105006456, "learning_rate": 0.0001994231845826354, "loss": 0.0002, "step": 160 }, { "epoch": 4.0, "eval_loss": 0.00014786835527047515, "eval_runtime": 0.3249, "eval_samples_per_second": 110.813, "eval_steps_per_second": 15.391, "step": 160 }, { "epoch": 4.25, "grad_norm": 0.12055602669715881, "learning_rate": 0.00019933533773385976, "loss": 0.0001, "step": 170 }, { "epoch": 4.5, "grad_norm": 0.007594508584588766, "learning_rate": 0.00019924128921716797, "loss": 0.0001, "step": 180 }, { "epoch": 4.75, "grad_norm": 0.000780335278250277, "learning_rate": 0.000199141044904163, "loss": 0.0, "step": 190 }, { "epoch": 5.0, "grad_norm": 0.0012850259663537145, "learning_rate": 0.00019903461105326154, "loss": 0.0, "step": 200 }, { "epoch": 5.0, "eval_loss": 2.7732399757951498e-05, "eval_runtime": 0.3542, "eval_samples_per_second": 101.634, "eval_steps_per_second": 14.116, "step": 200 }, { "epoch": 5.25, "grad_norm": 0.0005795760662294924, "learning_rate": 0.0001989219943093034, "loss": 0.0, "step": 210 }, { "epoch": 5.5, "grad_norm": 0.0004225255688652396, "learning_rate": 0.0001988032017031364, "loss": 0.0, "step": 220 }, { "epoch": 5.75, "grad_norm": 0.0006476517883129418, "learning_rate": 0.00019867824065117765, "loss": 0.0, "step": 230 }, { "epoch": 6.0, "grad_norm": 0.0004615155339706689, "learning_rate": 0.00019854711895495036, "loss": 0.0, "step": 240 }, { "epoch": 6.0, "eval_loss": 1.7349062545690686e-05, "eval_runtime": 0.328, "eval_samples_per_second": 109.765, "eval_steps_per_second": 15.245, "step": 240 }, { "epoch": 6.25, "grad_norm": 0.0003548109089024365, "learning_rate": 0.00019840984480059689, "loss": 0.0, "step": 250 }, { "epoch": 6.5, "grad_norm": 0.0010875174775719643, "learning_rate": 0.0001982664267583677, "loss": 0.0, "step": 260 }, { "epoch": 6.75, "grad_norm": 0.0003341367410030216, "learning_rate": 0.00019811687378208613, "loss": 0.0, "step": 270 }, { "epoch": 7.0, "grad_norm": 0.00045011454494670033, "learning_rate": 0.00019796119520858955, "loss": 0.0, "step": 280 }, { "epoch": 7.0, "eval_loss": 1.3329447938303929e-05, "eval_runtime": 0.3393, "eval_samples_per_second": 106.103, "eval_steps_per_second": 14.737, "step": 280 }, { "epoch": 7.25, "grad_norm": 0.00023316974693443626, "learning_rate": 0.00019779940075714648, "loss": 0.0, "step": 290 }, { "epoch": 7.5, "grad_norm": 0.0002178147406084463, "learning_rate": 0.00019763150052884966, "loss": 0.0, "step": 300 }, { "epoch": 7.75, "grad_norm": 0.00018833605281542987, "learning_rate": 0.00019745750500598538, "loss": 0.0, "step": 310 }, { "epoch": 8.0, "grad_norm": 0.0005992311052978039, "learning_rate": 0.00019727742505137936, "loss": 0.0, "step": 320 }, { "epoch": 8.0, "eval_loss": 1.119767694035545e-05, "eval_runtime": 0.3439, "eval_samples_per_second": 104.694, "eval_steps_per_second": 14.541, "step": 320 }, { "epoch": 8.25, "grad_norm": 0.00013200360990595073, "learning_rate": 0.00019709127190771825, "loss": 0.0, "step": 330 }, { "epoch": 8.5, "grad_norm": 0.0002380541991442442, "learning_rate": 0.00019689905719684782, "loss": 0.0, "step": 340 }, { "epoch": 8.75, "grad_norm": 0.00014980848936829716, "learning_rate": 0.00019670079291904752, "loss": 0.0, "step": 350 }, { "epoch": 9.0, "grad_norm": 0.0002899216488003731, "learning_rate": 0.00019649649145228102, "loss": 0.0, "step": 360 }, { "epoch": 9.0, "eval_loss": 9.466394658375066e-06, "eval_runtime": 0.3404, "eval_samples_per_second": 105.754, "eval_steps_per_second": 14.688, "step": 360 }, { "epoch": 9.25, "grad_norm": 0.00021961786842439324, "learning_rate": 0.00019628616555142372, "loss": 0.0, "step": 370 }, { "epoch": 9.5, "grad_norm": 0.00020691509416792542, "learning_rate": 0.00019606982834746627, "loss": 0.0, "step": 380 }, { "epoch": 9.75, "grad_norm": 0.00023161708668339998, "learning_rate": 0.00019584749334669487, "loss": 0.0, "step": 390 }, { "epoch": 10.0, "grad_norm": 0.00017992363427765667, "learning_rate": 0.00019561917442984788, "loss": 0.0, "step": 400 }, { "epoch": 10.0, "eval_loss": 8.257883564510848e-06, "eval_runtime": 0.3275, "eval_samples_per_second": 109.923, "eval_steps_per_second": 15.267, "step": 400 }, { "epoch": 10.25, "grad_norm": 0.00013070827117189765, "learning_rate": 0.00019538488585124953, "loss": 0.0, "step": 410 }, { "epoch": 10.5, "grad_norm": 0.00018156137957703322, "learning_rate": 0.00019514464223791965, "loss": 0.0, "step": 420 }, { "epoch": 10.75, "grad_norm": 0.0001987970608752221, "learning_rate": 0.00019489845858866066, "loss": 0.0, "step": 430 }, { "epoch": 11.0, "grad_norm": 0.00012906281335745007, "learning_rate": 0.00019464635027312128, "loss": 0.0, "step": 440 }, { "epoch": 11.0, "eval_loss": 7.331655979214702e-06, "eval_runtime": 0.3356, "eval_samples_per_second": 107.279, "eval_steps_per_second": 14.9, "step": 440 }, { "epoch": 11.25, "grad_norm": 0.00031813167151995003, "learning_rate": 0.00019438833303083678, "loss": 0.0, "step": 450 }, { "epoch": 11.5, "grad_norm": 0.00016680177941452712, "learning_rate": 0.00019412442297024637, "loss": 0.0, "step": 460 }, { "epoch": 11.75, "grad_norm": 0.00013162715185899287, "learning_rate": 0.00019385463656768762, "loss": 0.0, "step": 470 }, { "epoch": 12.0, "grad_norm": 0.00015330589667428285, "learning_rate": 0.00019357899066636773, "loss": 0.0, "step": 480 }, { "epoch": 12.0, "eval_loss": 6.5182412072317675e-06, "eval_runtime": 0.3246, "eval_samples_per_second": 110.889, "eval_steps_per_second": 15.401, "step": 480 }, { "epoch": 12.25, "grad_norm": 0.00018356599321123213, "learning_rate": 0.00019329750247531205, "loss": 0.0, "step": 490 }, { "epoch": 12.5, "grad_norm": 0.00015767107834108174, "learning_rate": 0.00019301018956828964, "loss": 0.0, "step": 500 }, { "epoch": 12.75, "grad_norm": 0.00029690677183680236, "learning_rate": 0.00019271706988271606, "loss": 0.0, "step": 510 }, { "epoch": 13.0, "grad_norm": 9.481079177930951e-05, "learning_rate": 0.0001924181617185336, "loss": 0.0, "step": 520 }, { "epoch": 13.0, "eval_loss": 5.88237071497133e-06, "eval_runtime": 0.3263, "eval_samples_per_second": 110.333, "eval_steps_per_second": 15.324, "step": 520 }, { "epoch": 13.25, "grad_norm": 0.00016097365005407482, "learning_rate": 0.00019211348373706884, "loss": 0.0, "step": 530 }, { "epoch": 13.5, "grad_norm": 0.0001369424571748823, "learning_rate": 0.0001918030549598674, "loss": 0.0, "step": 540 }, { "epoch": 13.75, "grad_norm": 0.00018055856344290078, "learning_rate": 0.00019148689476750658, "loss": 0.0, "step": 550 }, { "epoch": 14.0, "grad_norm": 0.00010365981870563701, "learning_rate": 0.00019116502289838523, "loss": 0.0, "step": 560 }, { "epoch": 14.0, "eval_loss": 5.300180873746285e-06, "eval_runtime": 0.3471, "eval_samples_per_second": 103.705, "eval_steps_per_second": 14.404, "step": 560 }, { "epoch": 14.25, "grad_norm": 7.365662168012932e-05, "learning_rate": 0.00019083745944749162, "loss": 0.0, "step": 570 }, { "epoch": 14.5, "grad_norm": 0.00015878217527642846, "learning_rate": 0.00019050422486514878, "loss": 0.0, "step": 580 }, { "epoch": 14.75, "grad_norm": 0.00016406863869633526, "learning_rate": 0.00019016533995573772, "loss": 0.0, "step": 590 }, { "epoch": 15.0, "grad_norm": 0.0001134676203946583, "learning_rate": 0.0001898208258763987, "loss": 0.0, "step": 600 }, { "epoch": 15.0, "eval_loss": 4.918438207823783e-06, "eval_runtime": 0.3237, "eval_samples_per_second": 111.198, "eval_steps_per_second": 15.444, "step": 600 }, { "epoch": 15.25, "grad_norm": 0.00010196594666922465, "learning_rate": 0.00018947070413571026, "loss": 0.0, "step": 610 }, { "epoch": 15.5, "grad_norm": 0.00013735589163843542, "learning_rate": 0.0001891149965923464, "loss": 0.0, "step": 620 }, { "epoch": 15.75, "grad_norm": 8.303586218971759e-05, "learning_rate": 0.00018875372545371194, "loss": 0.0, "step": 630 }, { "epoch": 16.0, "grad_norm": 8.282584167318419e-05, "learning_rate": 0.0001883869132745561, "loss": 0.0, "step": 640 }, { "epoch": 16.0, "eval_loss": 4.482135864236625e-06, "eval_runtime": 0.3334, "eval_samples_per_second": 107.988, "eval_steps_per_second": 14.998, "step": 640 }, { "epoch": 16.25, "grad_norm": 7.883716170908883e-05, "learning_rate": 0.00018801458295556435, "loss": 0.0, "step": 650 }, { "epoch": 16.5, "grad_norm": 0.00016775316908024251, "learning_rate": 0.0001876367577419286, "loss": 0.0, "step": 660 }, { "epoch": 16.75, "grad_norm": 8.655583224026486e-05, "learning_rate": 0.00018725346122189606, "loss": 0.0, "step": 670 }, { "epoch": 17.0, "grad_norm": 6.624006346100941e-05, "learning_rate": 0.00018686471732529665, "loss": 0.0, "step": 680 }, { "epoch": 17.0, "eval_loss": 4.2038600440719165e-06, "eval_runtime": 0.331, "eval_samples_per_second": 108.76, "eval_steps_per_second": 15.106, "step": 680 }, { "epoch": 17.25, "grad_norm": 0.00013512188161257654, "learning_rate": 0.00018647055032204883, "loss": 0.0, "step": 690 }, { "epoch": 17.5, "grad_norm": 8.678815356688574e-05, "learning_rate": 0.0001860709848206446, "loss": 0.0, "step": 700 }, { "epoch": 17.75, "grad_norm": 6.948116788407788e-05, "learning_rate": 0.00018566604576661288, "loss": 0.0, "step": 710 }, { "epoch": 18.0, "grad_norm": 7.376579014817253e-05, "learning_rate": 0.00018525575844096243, "loss": 0.0, "step": 720 }, { "epoch": 18.0, "eval_loss": 3.883159479300957e-06, "eval_runtime": 0.3356, "eval_samples_per_second": 107.284, "eval_steps_per_second": 14.901, "step": 720 }, { "epoch": 18.25, "grad_norm": 9.459959983360022e-05, "learning_rate": 0.0001848401484586034, "loss": 0.0, "step": 730 }, { "epoch": 18.5, "grad_norm": 9.205293463310227e-05, "learning_rate": 0.00018441924176674794, "loss": 0.0, "step": 740 }, { "epoch": 18.75, "grad_norm": 0.00011255600111326203, "learning_rate": 0.00018399306464329066, "loss": 0.0, "step": 750 }, { "epoch": 19.0, "grad_norm": 6.045972986612469e-05, "learning_rate": 0.0001835616436951677, "loss": 0.0, "step": 760 }, { "epoch": 19.0, "eval_loss": 3.604589437600225e-06, "eval_runtime": 0.3263, "eval_samples_per_second": 110.323, "eval_steps_per_second": 15.323, "step": 760 }, { "epoch": 19.25, "grad_norm": 5.718848478863947e-05, "learning_rate": 0.00018312500585669584, "loss": 0.0, "step": 770 }, { "epoch": 19.5, "grad_norm": 0.00010984807158820331, "learning_rate": 0.00018268317838789088, "loss": 0.0, "step": 780 }, { "epoch": 19.75, "grad_norm": 4.868064570473507e-05, "learning_rate": 0.0001822361888727657, "loss": 0.0, "step": 790 }, { "epoch": 20.0, "grad_norm": 7.550454029114917e-05, "learning_rate": 0.0001817840652176082, "loss": 0.0, "step": 800 }, { "epoch": 20.0, "eval_loss": 3.3906435419339687e-06, "eval_runtime": 0.3395, "eval_samples_per_second": 106.05, "eval_steps_per_second": 14.729, "step": 800 }, { "epoch": 20.25, "grad_norm": 6.606967508560047e-05, "learning_rate": 0.00018132683564923906, "loss": 0.0, "step": 810 }, { "epoch": 20.5, "grad_norm": 0.0001721412845654413, "learning_rate": 0.00018086452871324954, "loss": 0.0, "step": 820 }, { "epoch": 20.75, "grad_norm": 4.9960210162680596e-05, "learning_rate": 0.00018039717327221925, "loss": 0.0, "step": 830 }, { "epoch": 21.0, "grad_norm": 5.9810005041072145e-05, "learning_rate": 0.00017992479850391417, "loss": 0.0, "step": 840 }, { "epoch": 21.0, "eval_loss": 3.1668755582359154e-06, "eval_runtime": 0.3326, "eval_samples_per_second": 108.232, "eval_steps_per_second": 15.032, "step": 840 }, { "epoch": 21.25, "grad_norm": 5.891801993129775e-05, "learning_rate": 0.00017944743389946524, "loss": 0.0, "step": 850 }, { "epoch": 21.5, "grad_norm": 8.631425589555874e-05, "learning_rate": 0.0001789651092615269, "loss": 0.0, "step": 860 }, { "epoch": 21.75, "grad_norm": 6.0596958064706996e-05, "learning_rate": 0.00017847785470241677, "loss": 0.0, "step": 870 }, { "epoch": 22.0, "grad_norm": 7.751138764433563e-05, "learning_rate": 0.00017798570064223533, "loss": 0.0, "step": 880 }, { "epoch": 22.0, "eval_loss": 2.9938312309241155e-06, "eval_runtime": 0.3268, "eval_samples_per_second": 110.167, "eval_steps_per_second": 15.301, "step": 880 }, { "epoch": 22.25, "grad_norm": 6.764694990124553e-05, "learning_rate": 0.00017748867780696716, "loss": 0.0, "step": 890 }, { "epoch": 22.5, "grad_norm": 7.38737580832094e-05, "learning_rate": 0.0001769868172265623, "loss": 0.0, "step": 900 }, { "epoch": 22.75, "grad_norm": 0.00010331822704756632, "learning_rate": 0.00017648015023299918, "loss": 0.0, "step": 910 }, { "epoch": 23.0, "grad_norm": 0.00010948543786071241, "learning_rate": 0.0001759687084583285, "loss": 0.0, "step": 920 }, { "epoch": 23.0, "eval_loss": 2.7970015707978746e-06, "eval_runtime": 0.3433, "eval_samples_per_second": 104.875, "eval_steps_per_second": 14.566, "step": 920 }, { "epoch": 23.25, "grad_norm": 4.273112062946893e-05, "learning_rate": 0.00017545252383269837, "loss": 0.0, "step": 930 }, { "epoch": 23.5, "grad_norm": 0.0001338142465101555, "learning_rate": 0.00017493162858236077, "loss": 0.0, "step": 940 }, { "epoch": 23.75, "grad_norm": 5.875607530470006e-05, "learning_rate": 0.00017440605522765984, "loss": 0.0, "step": 950 }, { "epoch": 24.0, "grad_norm": 7.345333142438903e-05, "learning_rate": 0.00017387583658100142, "loss": 0.0, "step": 960 }, { "epoch": 24.0, "eval_loss": 2.6630891625245567e-06, "eval_runtime": 0.3317, "eval_samples_per_second": 108.524, "eval_steps_per_second": 15.073, "step": 960 }, { "epoch": 24.25, "grad_norm": 6.94195696269162e-05, "learning_rate": 0.00017334100574480435, "loss": 0.0, "step": 970 }, { "epoch": 24.5, "grad_norm": 4.8001227696659043e-05, "learning_rate": 0.0001728015961094343, "loss": 0.0, "step": 980 }, { "epoch": 24.75, "grad_norm": 4.3018935684813187e-05, "learning_rate": 0.00017225764135111868, "loss": 0.0, "step": 990 }, { "epoch": 25.0, "grad_norm": 7.503097003791481e-05, "learning_rate": 0.00017170917542984443, "loss": 0.0, "step": 1000 }, { "epoch": 25.0, "eval_loss": 2.498412186469068e-06, "eval_runtime": 0.3252, "eval_samples_per_second": 110.685, "eval_steps_per_second": 15.373, "step": 1000 }, { "epoch": 25.25, "grad_norm": 2.499126276234165e-05, "learning_rate": 0.00017115623258723783, "loss": 0.0, "step": 1010 }, { "epoch": 25.5, "grad_norm": 8.122723374981433e-05, "learning_rate": 0.00017059884734442658, "loss": 0.0, "step": 1020 }, { "epoch": 25.75, "grad_norm": 5.7621167798060924e-05, "learning_rate": 0.00017003705449988486, "loss": 0.0, "step": 1030 }, { "epoch": 26.0, "grad_norm": 6.584699440281838e-05, "learning_rate": 0.00016947088912726052, "loss": 0.0, "step": 1040 }, { "epoch": 26.0, "eval_loss": 2.384617800998967e-06, "eval_runtime": 0.3289, "eval_samples_per_second": 109.466, "eval_steps_per_second": 15.204, "step": 1040 }, { "epoch": 26.25, "grad_norm": 3.284347985754721e-05, "learning_rate": 0.00016890038657318556, "loss": 0.0, "step": 1050 }, { "epoch": 26.5, "grad_norm": 6.672390009043738e-05, "learning_rate": 0.00016832558245506935, "loss": 0.0, "step": 1060 }, { "epoch": 26.75, "grad_norm": 3.635583561845124e-05, "learning_rate": 0.0001677465126588749, "loss": 0.0, "step": 1070 }, { "epoch": 27.0, "grad_norm": 5.236966899246909e-05, "learning_rate": 0.00016716321333687848, "loss": 0.0, "step": 1080 }, { "epoch": 27.0, "eval_loss": 2.2538335997523973e-06, "eval_runtime": 0.327, "eval_samples_per_second": 110.094, "eval_steps_per_second": 15.291, "step": 1080 }, { "epoch": 27.25, "grad_norm": 5.55117912881542e-05, "learning_rate": 0.00016657572090541262, "loss": 0.0, "step": 1090 }, { "epoch": 27.5, "grad_norm": 0.00013249287439975888, "learning_rate": 0.0001659840720425926, "loss": 0.0, "step": 1100 }, { "epoch": 27.75, "grad_norm": 5.55339029233437e-05, "learning_rate": 0.00016538830368602648, "loss": 0.0, "step": 1110 }, { "epoch": 28.0, "grad_norm": 5.33119855390396e-05, "learning_rate": 0.0001647884530305089, "loss": 0.0, "step": 1120 }, { "epoch": 28.0, "eval_loss": 2.159326413675444e-06, "eval_runtime": 0.3173, "eval_samples_per_second": 113.452, "eval_steps_per_second": 15.757, "step": 1120 }, { "epoch": 28.25, "grad_norm": 6.674770702375099e-05, "learning_rate": 0.00016418455752569943, "loss": 0.0, "step": 1130 }, { "epoch": 28.5, "grad_norm": 5.4036871006246656e-05, "learning_rate": 0.00016357665487378397, "loss": 0.0, "step": 1140 }, { "epoch": 28.75, "grad_norm": 9.294509800383821e-05, "learning_rate": 0.00016296478302712126, "loss": 0.0, "step": 1150 }, { "epoch": 29.0, "grad_norm": 6.301044049905613e-05, "learning_rate": 0.00016234898018587337, "loss": 0.0, "step": 1160 }, { "epoch": 29.0, "eval_loss": 2.0828572360187536e-06, "eval_runtime": 0.3199, "eval_samples_per_second": 112.52, "eval_steps_per_second": 15.628, "step": 1160 }, { "epoch": 29.25, "grad_norm": 6.311033212114125e-05, "learning_rate": 0.00016172928479562078, "loss": 0.0, "step": 1170 }, { "epoch": 29.5, "grad_norm": 3.820831625489518e-05, "learning_rate": 0.00016110573554496224, "loss": 0.0, "step": 1180 }, { "epoch": 29.75, "grad_norm": 4.628980968846008e-05, "learning_rate": 0.00016047837136309924, "loss": 0.0, "step": 1190 }, { "epoch": 30.0, "grad_norm": 3.80598139599897e-05, "learning_rate": 0.00015984723141740576, "loss": 0.0, "step": 1200 }, { "epoch": 30.0, "eval_loss": 1.9744732071558246e-06, "eval_runtime": 0.3173, "eval_samples_per_second": 113.449, "eval_steps_per_second": 15.757, "step": 1200 }, { "epoch": 30.25, "grad_norm": 3.0195853469194844e-05, "learning_rate": 0.00015921235511098282, "loss": 0.0, "step": 1210 }, { "epoch": 30.5, "grad_norm": 5.462007538881153e-05, "learning_rate": 0.00015857378208019863, "loss": 0.0, "step": 1220 }, { "epoch": 30.75, "grad_norm": 2.7883037546416745e-05, "learning_rate": 0.00015793155219221395, "loss": 0.0, "step": 1230 }, { "epoch": 31.0, "grad_norm": 4.7888908738968894e-05, "learning_rate": 0.00015728570554249312, "loss": 0.0, "step": 1240 }, { "epoch": 31.0, "eval_loss": 1.8858928569898126e-06, "eval_runtime": 0.3223, "eval_samples_per_second": 111.705, "eval_steps_per_second": 15.515, "step": 1240 }, { "epoch": 31.25, "grad_norm": 4.82973555335775e-05, "learning_rate": 0.0001566362824523008, "loss": 0.0, "step": 1250 }, { "epoch": 31.5, "grad_norm": 3.9442336856154725e-05, "learning_rate": 0.00015598332346618472, "loss": 0.0, "step": 1260 }, { "epoch": 31.75, "grad_norm": 3.770321563933976e-05, "learning_rate": 0.00015532686934944438, "loss": 0.0, "step": 1270 }, { "epoch": 32.0, "grad_norm": 4.669040936278179e-05, "learning_rate": 0.00015466696108558611, "loss": 0.0, "step": 1280 }, { "epoch": 32.0, "eval_loss": 1.8240966710436624e-06, "eval_runtime": 0.3185, "eval_samples_per_second": 113.013, "eval_steps_per_second": 15.696, "step": 1280 }, { "epoch": 32.25, "grad_norm": 2.80893400486093e-05, "learning_rate": 0.00015400363987376413, "loss": 0.0, "step": 1290 }, { "epoch": 32.5, "grad_norm": 4.817240915144794e-05, "learning_rate": 0.00015333694712620877, "loss": 0.0, "step": 1300 }, { "epoch": 32.75, "grad_norm": 4.6051696699578315e-05, "learning_rate": 0.00015266692446564063, "loss": 0.0, "step": 1310 }, { "epoch": 33.0, "grad_norm": 3.602392098400742e-05, "learning_rate": 0.00015199361372267252, "loss": 0.0, "step": 1320 }, { "epoch": 33.0, "eval_loss": 1.7236499161299434e-06, "eval_runtime": 0.3163, "eval_samples_per_second": 113.807, "eval_steps_per_second": 15.806, "step": 1320 }, { "epoch": 33.25, "grad_norm": 2.2813776013208553e-05, "learning_rate": 0.00015131705693319743, "loss": 0.0, "step": 1330 }, { "epoch": 33.5, "grad_norm": 7.926914986455813e-05, "learning_rate": 0.0001506372963357644, "loss": 0.0, "step": 1340 }, { "epoch": 33.75, "grad_norm": 6.877528358018026e-05, "learning_rate": 0.00014995437436894147, "loss": 0.0, "step": 1350 }, { "epoch": 34.0, "grad_norm": 2.7551081075216644e-05, "learning_rate": 0.0001492683336686661, "loss": 0.0, "step": 1360 }, { "epoch": 34.0, "eval_loss": 1.67099869941012e-06, "eval_runtime": 0.325, "eval_samples_per_second": 110.775, "eval_steps_per_second": 15.385, "step": 1360 }, { "epoch": 34.25, "grad_norm": 3.4323111322009936e-05, "learning_rate": 0.0001485792170655835, "loss": 0.0, "step": 1370 }, { "epoch": 34.5, "grad_norm": 3.862389348796569e-05, "learning_rate": 0.00014788706758237237, "loss": 0.0, "step": 1380 }, { "epoch": 34.75, "grad_norm": 3.117803134955466e-05, "learning_rate": 0.00014719192843105924, "loss": 0.0, "step": 1390 }, { "epoch": 35.0, "grad_norm": 3.452876626397483e-05, "learning_rate": 0.00014649384301032044, "loss": 0.0, "step": 1400 }, { "epoch": 35.0, "eval_loss": 1.6147401993293897e-06, "eval_runtime": 0.319, "eval_samples_per_second": 112.868, "eval_steps_per_second": 15.676, "step": 1400 }, { "epoch": 35.25, "grad_norm": 2.5607059797039255e-05, "learning_rate": 0.00014579285490277274, "loss": 0.0, "step": 1410 }, { "epoch": 35.5, "grad_norm": 7.004107465036213e-05, "learning_rate": 0.0001450890078722524, "loss": 0.0, "step": 1420 }, { "epoch": 35.75, "grad_norm": 5.070870975032449e-05, "learning_rate": 0.00014438234586108297, "loss": 0.0, "step": 1430 }, { "epoch": 36.0, "grad_norm": 2.5347033442812972e-05, "learning_rate": 0.00014367291298733178, "loss": 0.0, "step": 1440 }, { "epoch": 36.0, "eval_loss": 1.5523125966865337e-06, "eval_runtime": 0.3195, "eval_samples_per_second": 112.683, "eval_steps_per_second": 15.65, "step": 1440 }, { "epoch": 36.25, "grad_norm": 3.3264463127125055e-05, "learning_rate": 0.0001429607535420557, "loss": 0.0, "step": 1450 }, { "epoch": 36.5, "grad_norm": 4.0014037949731573e-05, "learning_rate": 0.00014224591198653595, "loss": 0.0, "step": 1460 }, { "epoch": 36.75, "grad_norm": 4.455630187294446e-05, "learning_rate": 0.00014152843294950218, "loss": 0.0, "step": 1470 }, { "epoch": 37.0, "grad_norm": 3.4259654057677835e-05, "learning_rate": 0.0001408083612243465, "loss": 0.0, "step": 1480 }, { "epoch": 37.0, "eval_loss": 1.506923695160367e-06, "eval_runtime": 0.3136, "eval_samples_per_second": 114.814, "eval_steps_per_second": 15.946, "step": 1480 }, { "epoch": 37.25, "grad_norm": 3.984866998507641e-05, "learning_rate": 0.00014008574176632666, "loss": 0.0, "step": 1490 }, { "epoch": 37.5, "grad_norm": 3.252027090638876e-05, "learning_rate": 0.00013936061968975957, "loss": 0.0, "step": 1500 }, { "epoch": 37.75, "grad_norm": 2.17838187381858e-05, "learning_rate": 0.00013863304026520473, "loss": 0.0, "step": 1510 }, { "epoch": 38.0, "grad_norm": 4.0549610275775194e-05, "learning_rate": 0.00013790304891663792, "loss": 0.0, "step": 1520 }, { "epoch": 38.0, "eval_loss": 1.457518123970658e-06, "eval_runtime": 0.3138, "eval_samples_per_second": 114.708, "eval_steps_per_second": 15.932, "step": 1520 }, { "epoch": 38.25, "grad_norm": 3.441906665102579e-05, "learning_rate": 0.00013717069121861527, "loss": 0.0, "step": 1530 }, { "epoch": 38.5, "grad_norm": 3.80768469767645e-05, "learning_rate": 0.00013643601289342803, "loss": 0.0, "step": 1540 }, { "epoch": 38.75, "grad_norm": 1.9130562577629462e-05, "learning_rate": 0.00013569905980824788, "loss": 0.0, "step": 1550 }, { "epoch": 39.0, "grad_norm": 2.708647480176296e-05, "learning_rate": 0.0001349598779722636, "loss": 0.0, "step": 1560 }, { "epoch": 39.0, "eval_loss": 1.4059390878173872e-06, "eval_runtime": 0.326, "eval_samples_per_second": 110.43, "eval_steps_per_second": 15.337, "step": 1560 }, { "epoch": 39.25, "grad_norm": 2.7261641662335023e-05, "learning_rate": 0.00013421851353380857, "loss": 0.0, "step": 1570 }, { "epoch": 39.5, "grad_norm": 3.74881892639678e-05, "learning_rate": 0.00013347501277747955, "loss": 0.0, "step": 1580 }, { "epoch": 39.75, "grad_norm": 4.151304892729968e-05, "learning_rate": 0.00013272942212124705, "loss": 0.0, "step": 1590 }, { "epoch": 40.0, "grad_norm": 2.8103966542403214e-05, "learning_rate": 0.0001319817881135576, "loss": 0.0, "step": 1600 }, { "epoch": 40.0, "eval_loss": 1.3655937891599024e-06, "eval_runtime": 0.3183, "eval_samples_per_second": 113.09, "eval_steps_per_second": 15.707, "step": 1600 }, { "epoch": 40.25, "grad_norm": 2.1028572518844157e-05, "learning_rate": 0.0001312321574304275, "loss": 0.0, "step": 1610 }, { "epoch": 40.5, "grad_norm": 2.917735582741443e-05, "learning_rate": 0.00013048057687252865, "loss": 0.0, "step": 1620 }, { "epoch": 40.75, "grad_norm": 3.929531158064492e-05, "learning_rate": 0.00012972709336226697, "loss": 0.0, "step": 1630 }, { "epoch": 41.0, "grad_norm": 2.542526817705948e-05, "learning_rate": 0.00012897175394085267, "loss": 0.0, "step": 1640 }, { "epoch": 41.0, "eval_loss": 1.3143367141310591e-06, "eval_runtime": 0.32, "eval_samples_per_second": 112.487, "eval_steps_per_second": 15.623, "step": 1640 }, { "epoch": 41.25, "grad_norm": 2.2972772057983093e-05, "learning_rate": 0.00012821460576536363, "loss": 0.0, "step": 1650 }, { "epoch": 41.5, "grad_norm": 2.710890294110868e-05, "learning_rate": 0.0001274556961058012, "loss": 0.0, "step": 1660 }, { "epoch": 41.75, "grad_norm": 7.863906648708507e-05, "learning_rate": 0.00012669507234213908, "loss": 0.0, "step": 1670 }, { "epoch": 42.0, "grad_norm": 2.5962377549149096e-05, "learning_rate": 0.00012593278196136525, "loss": 0.0, "step": 1680 }, { "epoch": 42.0, "eval_loss": 1.2861806908404105e-06, "eval_runtime": 0.3211, "eval_samples_per_second": 112.131, "eval_steps_per_second": 15.574, "step": 1680 }, { "epoch": 42.25, "grad_norm": 2.938141733466182e-05, "learning_rate": 0.00012516887255451735, "loss": 0.0, "step": 1690 }, { "epoch": 42.5, "grad_norm": 2.2876229195389897e-05, "learning_rate": 0.00012440339181371148, "loss": 0.0, "step": 1700 }, { "epoch": 42.75, "grad_norm": 2.188000871683471e-05, "learning_rate": 0.00012363638752916468, "loss": 0.0, "step": 1710 }, { "epoch": 43.0, "grad_norm": 2.7062182198278606e-05, "learning_rate": 0.00012286790758621132, "loss": 0.0, "step": 1720 }, { "epoch": 43.0, "eval_loss": 1.24422297176352e-06, "eval_runtime": 0.3203, "eval_samples_per_second": 112.377, "eval_steps_per_second": 15.608, "step": 1720 }, { "epoch": 43.25, "grad_norm": 3.9851081965025514e-05, "learning_rate": 0.00012209799996231358, "loss": 0.0, "step": 1730 }, { "epoch": 43.5, "grad_norm": 3.9189981180243194e-05, "learning_rate": 0.00012132671272406604, "loss": 0.0, "step": 1740 }, { "epoch": 43.75, "grad_norm": 2.008090086746961e-05, "learning_rate": 0.00012055409402419494, "loss": 0.0, "step": 1750 }, { "epoch": 44.0, "grad_norm": 2.994649184984155e-05, "learning_rate": 0.00011978019209855174, "loss": 0.0, "step": 1760 }, { "epoch": 44.0, "eval_loss": 1.2121387271690764e-06, "eval_runtime": 0.3206, "eval_samples_per_second": 112.281, "eval_steps_per_second": 15.595, "step": 1760 }, { "epoch": 44.25, "grad_norm": 1.9228473320254125e-05, "learning_rate": 0.0001190050552631019, "loss": 0.0, "step": 1770 }, { "epoch": 44.5, "grad_norm": 2.6020699806394987e-05, "learning_rate": 0.00011822873191090833, "loss": 0.0, "step": 1780 }, { "epoch": 44.75, "grad_norm": 2.0412864614627324e-05, "learning_rate": 0.00011745127050910998, "loss": 0.0, "step": 1790 }, { "epoch": 45.0, "grad_norm": 2.493833380867727e-05, "learning_rate": 0.00011667271959589623, "loss": 0.0, "step": 1800 }, { "epoch": 45.0, "eval_loss": 1.1790700682468014e-06, "eval_runtime": 0.3173, "eval_samples_per_second": 113.472, "eval_steps_per_second": 15.76, "step": 1800 }, { "epoch": 45.25, "grad_norm": 3.828733315458521e-05, "learning_rate": 0.00011589312777747644, "loss": 0.0, "step": 1810 }, { "epoch": 45.5, "grad_norm": 2.1567129806498997e-05, "learning_rate": 0.00011511254372504531, "loss": 0.0, "step": 1820 }, { "epoch": 45.75, "grad_norm": 1.842524579842575e-05, "learning_rate": 0.0001143310161717444, "loss": 0.0, "step": 1830 }, { "epoch": 46.0, "grad_norm": 2.736481292231474e-05, "learning_rate": 0.00011354859390961958, "loss": 0.0, "step": 1840 }, { "epoch": 46.0, "eval_loss": 1.1555836181287304e-06, "eval_runtime": 0.3177, "eval_samples_per_second": 113.308, "eval_steps_per_second": 15.737, "step": 1840 }, { "epoch": 46.25, "grad_norm": 3.4207103453809395e-05, "learning_rate": 0.0001127653257865748, "loss": 0.0, "step": 1850 }, { "epoch": 46.5, "grad_norm": 3.1199837394524366e-05, "learning_rate": 0.00011198126070332253, "loss": 0.0, "step": 1860 }, { "epoch": 46.75, "grad_norm": 1.3810436030325945e-05, "learning_rate": 0.00011119644761033078, "loss": 0.0, "step": 1870 }, { "epoch": 47.0, "grad_norm": 2.9521519536501728e-05, "learning_rate": 0.00011041093550476707, "loss": 0.0, "step": 1880 }, { "epoch": 47.0, "eval_loss": 1.1195420484000351e-06, "eval_runtime": 0.3205, "eval_samples_per_second": 112.332, "eval_steps_per_second": 15.602, "step": 1880 }, { "epoch": 47.25, "grad_norm": 1.7040036254911683e-05, "learning_rate": 0.00010962477342743929, "loss": 0.0, "step": 1890 }, { "epoch": 47.5, "grad_norm": 2.9747276130365208e-05, "learning_rate": 0.00010883801045973425, "loss": 0.0, "step": 1900 }, { "epoch": 47.75, "grad_norm": 2.880042120523285e-05, "learning_rate": 0.00010805069572055334, "loss": 0.0, "step": 1910 }, { "epoch": 48.0, "grad_norm": 2.100724850606639e-05, "learning_rate": 0.00010726287836324582, "loss": 0.0, "step": 1920 }, { "epoch": 48.0, "eval_loss": 1.1032241218345007e-06, "eval_runtime": 0.3192, "eval_samples_per_second": 112.768, "eval_steps_per_second": 15.662, "step": 1920 }, { "epoch": 48.25, "grad_norm": 1.7086620573536493e-05, "learning_rate": 0.0001064746075725404, "loss": 0.0, "step": 1930 }, { "epoch": 48.5, "grad_norm": 2.3707199943601154e-05, "learning_rate": 0.00010568593256147421, "loss": 0.0, "step": 1940 }, { "epoch": 48.75, "grad_norm": 1.4947347153793089e-05, "learning_rate": 0.00010489690256832068, "loss": 0.0, "step": 1950 }, { "epoch": 49.0, "grad_norm": 2.3327078451984562e-05, "learning_rate": 0.00010410756685351517, "loss": 0.0, "step": 1960 }, { "epoch": 49.0, "eval_loss": 1.0602713018670329e-06, "eval_runtime": 0.3334, "eval_samples_per_second": 107.979, "eval_steps_per_second": 14.997, "step": 1960 }, { "epoch": 49.25, "grad_norm": 1.931817314471118e-05, "learning_rate": 0.00010331797469657992, "loss": 0.0, "step": 1970 }, { "epoch": 49.5, "grad_norm": 2.6536048608249985e-05, "learning_rate": 0.00010252817539304718, "loss": 0.0, "step": 1980 }, { "epoch": 49.75, "grad_norm": 2.2126323528937064e-05, "learning_rate": 0.00010173821825138172, "loss": 0.0, "step": 1990 }, { "epoch": 50.0, "grad_norm": 2.2889309548190795e-05, "learning_rate": 0.00010094815258990241, "loss": 0.0, "step": 2000 }, { "epoch": 50.0, "eval_loss": 1.040821643982781e-06, "eval_runtime": 0.3203, "eval_samples_per_second": 112.396, "eval_steps_per_second": 15.611, "step": 2000 }, { "epoch": 50.25, "grad_norm": 2.8334068701951765e-05, "learning_rate": 0.00010015802773370311, "loss": 0.0, "step": 2010 }, { "epoch": 50.5, "grad_norm": 1.9157972928951494e-05, "learning_rate": 9.936789301157347e-05, "loss": 0.0, "step": 2020 }, { "epoch": 50.75, "grad_norm": 2.7853264327859506e-05, "learning_rate": 9.857779775291898e-05, "loss": 0.0, "step": 2030 }, { "epoch": 51.0, "grad_norm": 2.194027547375299e-05, "learning_rate": 9.778779128468132e-05, "loss": 0.0, "step": 2040 }, { "epoch": 51.0, "eval_loss": 1.013436872199236e-06, "eval_runtime": 0.3177, "eval_samples_per_second": 113.312, "eval_steps_per_second": 15.738, "step": 2040 }, { "epoch": 51.25, "grad_norm": 1.2561698895297013e-05, "learning_rate": 9.699792292825892e-05, "loss": 0.0, "step": 2050 }, { "epoch": 51.5, "grad_norm": 2.041015432041604e-05, "learning_rate": 9.620824199642764e-05, "loss": 0.0, "step": 2060 }, { "epoch": 51.75, "grad_norm": 3.463058601482771e-05, "learning_rate": 9.541879779026209e-05, "loss": 0.0, "step": 2070 }, { "epoch": 52.0, "grad_norm": 1.9060191334574483e-05, "learning_rate": 9.462963959605778e-05, "loss": 0.0, "step": 2080 }, { "epoch": 52.0, "eval_loss": 1.0033103308160207e-06, "eval_runtime": 0.3157, "eval_samples_per_second": 114.025, "eval_steps_per_second": 15.837, "step": 2080 }, { "epoch": 52.25, "grad_norm": 1.4129647752270103e-05, "learning_rate": 9.384081668225387e-05, "loss": 0.0, "step": 2090 }, { "epoch": 52.5, "grad_norm": 2.1596322767436504e-05, "learning_rate": 9.30523782963576e-05, "loss": 0.0, "step": 2100 }, { "epoch": 52.75, "grad_norm": 1.7303984350292012e-05, "learning_rate": 9.226437366186941e-05, "loss": 0.0, "step": 2110 }, { "epoch": 53.0, "grad_norm": 2.7551333914743736e-05, "learning_rate": 9.147685197520995e-05, "loss": 0.0, "step": 2120 }, { "epoch": 53.0, "eval_loss": 9.675704859546386e-07, "eval_runtime": 0.3184, "eval_samples_per_second": 113.083, "eval_steps_per_second": 15.706, "step": 2120 }, { "epoch": 53.25, "grad_norm": 2.0771505660377443e-05, "learning_rate": 9.06898624026486e-05, "loss": 0.0, "step": 2130 }, { "epoch": 53.5, "grad_norm": 2.2202431864570826e-05, "learning_rate": 8.990345407723402e-05, "loss": 0.0, "step": 2140 }, { "epoch": 53.75, "grad_norm": 1.3855403267371003e-05, "learning_rate": 8.91176760957267e-05, "loss": 0.0, "step": 2150 }, { "epoch": 54.0, "grad_norm": 2.2561982405022718e-05, "learning_rate": 8.833257751553365e-05, "loss": 0.0, "step": 2160 }, { "epoch": 54.0, "eval_loss": 9.524069923827483e-07, "eval_runtime": 0.3172, "eval_samples_per_second": 113.496, "eval_steps_per_second": 15.763, "step": 2160 }, { "epoch": 54.25, "grad_norm": 1.5506595445913263e-05, "learning_rate": 8.754820735164576e-05, "loss": 0.0, "step": 2170 }, { "epoch": 54.5, "grad_norm": 2.101029167533852e-05, "learning_rate": 8.676461457357776e-05, "loss": 0.0, "step": 2180 }, { "epoch": 54.75, "grad_norm": 1.7293437849730253e-05, "learning_rate": 8.598184810231088e-05, "loss": 0.0, "step": 2190 }, { "epoch": 55.0, "grad_norm": 2.4345905330847017e-05, "learning_rate": 8.519995680723854e-05, "loss": 0.0, "step": 2200 }, { "epoch": 55.0, "eval_loss": 9.304160357714863e-07, "eval_runtime": 0.3151, "eval_samples_per_second": 114.245, "eval_steps_per_second": 15.867, "step": 2200 }, { "epoch": 55.25, "grad_norm": 3.5958666558144614e-05, "learning_rate": 8.44189895031157e-05, "loss": 0.0, "step": 2210 }, { "epoch": 55.5, "grad_norm": 2.3594711819896474e-05, "learning_rate": 8.363899494701086e-05, "loss": 0.0, "step": 2220 }, { "epoch": 55.75, "grad_norm": 1.3870093425794039e-05, "learning_rate": 8.286002183526237e-05, "loss": 0.0, "step": 2230 }, { "epoch": 56.0, "grad_norm": 2.6735531719168648e-05, "learning_rate": 8.208211880043812e-05, "loss": 0.0, "step": 2240 }, { "epoch": 56.0, "eval_loss": 9.174784736387664e-07, "eval_runtime": 0.3129, "eval_samples_per_second": 115.04, "eval_steps_per_second": 15.978, "step": 2240 }, { "epoch": 56.25, "grad_norm": 2.9232525776023977e-05, "learning_rate": 8.130533440829928e-05, "loss": 0.0, "step": 2250 }, { "epoch": 56.5, "grad_norm": 2.4526891138521023e-05, "learning_rate": 8.052971715476842e-05, "loss": 0.0, "step": 2260 }, { "epoch": 56.75, "grad_norm": 2.6106521545443684e-05, "learning_rate": 7.975531546290166e-05, "loss": 0.0, "step": 2270 }, { "epoch": 57.0, "grad_norm": 1.784413143468555e-05, "learning_rate": 7.898217767986562e-05, "loss": 0.0, "step": 2280 }, { "epoch": 57.0, "eval_loss": 9.079113851839793e-07, "eval_runtime": 0.3236, "eval_samples_per_second": 111.239, "eval_steps_per_second": 15.45, "step": 2280 }, { "epoch": 57.25, "grad_norm": 1.9261695342720486e-05, "learning_rate": 7.821035207391912e-05, "loss": 0.0, "step": 2290 }, { "epoch": 57.5, "grad_norm": 3.491761162877083e-05, "learning_rate": 7.743988683139943e-05, "loss": 0.0, "step": 2300 }, { "epoch": 57.75, "grad_norm": 1.3563810171035584e-05, "learning_rate": 7.66708300537143e-05, "loss": 0.0, "step": 2310 }, { "epoch": 58.0, "grad_norm": 1.2282480383873917e-05, "learning_rate": 7.590322975433857e-05, "loss": 0.0, "step": 2320 }, { "epoch": 58.0, "eval_loss": 8.861284754857479e-07, "eval_runtime": 0.3181, "eval_samples_per_second": 113.172, "eval_steps_per_second": 15.718, "step": 2320 }, { "epoch": 58.25, "grad_norm": 2.858146035578102e-05, "learning_rate": 7.51371338558168e-05, "loss": 0.0, "step": 2330 }, { "epoch": 58.5, "grad_norm": 2.0420882719918154e-05, "learning_rate": 7.437259018677136e-05, "loss": 0.0, "step": 2340 }, { "epoch": 58.75, "grad_norm": 9.892805792333093e-06, "learning_rate": 7.360964647891637e-05, "loss": 0.0, "step": 2350 }, { "epoch": 59.0, "grad_norm": 2.6135967345908284e-05, "learning_rate": 7.284835036407776e-05, "loss": 0.0, "step": 2360 }, { "epoch": 59.0, "eval_loss": 8.719437687432219e-07, "eval_runtime": 0.3182, "eval_samples_per_second": 113.153, "eval_steps_per_second": 15.716, "step": 2360 }, { "epoch": 59.25, "grad_norm": 3.855082468362525e-05, "learning_rate": 7.208874937121946e-05, "loss": 0.0, "step": 2370 }, { "epoch": 59.5, "grad_norm": 2.4621716875117272e-05, "learning_rate": 7.133089092347627e-05, "loss": 0.0, "step": 2380 }, { "epoch": 59.75, "grad_norm": 1.3933644368080422e-05, "learning_rate": 7.057482233519302e-05, "loss": 0.0, "step": 2390 }, { "epoch": 60.0, "grad_norm": 1.3702153410122264e-05, "learning_rate": 6.982059080897059e-05, "loss": 0.0, "step": 2400 }, { "epoch": 60.0, "eval_loss": 8.514528531122778e-07, "eval_runtime": 0.317, "eval_samples_per_second": 113.548, "eval_steps_per_second": 15.771, "step": 2400 }, { "epoch": 60.25, "grad_norm": 1.285933922190452e-05, "learning_rate": 6.906824343271916e-05, "loss": 0.0, "step": 2410 }, { "epoch": 60.5, "grad_norm": 1.753455217112787e-05, "learning_rate": 6.831782717671828e-05, "loss": 0.0, "step": 2420 }, { "epoch": 60.75, "grad_norm": 1.9983261154266074e-05, "learning_rate": 6.756938889068454e-05, "loss": 0.0, "step": 2430 }, { "epoch": 61.0, "grad_norm": 1.9891913325409405e-05, "learning_rate": 6.682297530084664e-05, "loss": 0.0, "step": 2440 }, { "epoch": 61.0, "eval_loss": 8.335572942996805e-07, "eval_runtime": 0.3281, "eval_samples_per_second": 109.721, "eval_steps_per_second": 15.239, "step": 2440 }, { "epoch": 61.25, "grad_norm": 1.8422002540319227e-05, "learning_rate": 6.607863300702807e-05, "loss": 0.0, "step": 2450 }, { "epoch": 61.5, "grad_norm": 1.9453251297818497e-05, "learning_rate": 6.533640847973808e-05, "loss": 0.0, "step": 2460 }, { "epoch": 61.75, "grad_norm": 1.48242861541803e-05, "learning_rate": 6.459634805727011e-05, "loss": 0.0, "step": 2470 }, { "epoch": 62.0, "grad_norm": 1.9470420738798566e-05, "learning_rate": 6.385849794280915e-05, "loss": 0.0, "step": 2480 }, { "epoch": 62.0, "eval_loss": 8.260683443950256e-07, "eval_runtime": 0.3297, "eval_samples_per_second": 109.182, "eval_steps_per_second": 15.164, "step": 2480 }, { "epoch": 62.25, "grad_norm": 2.976124051201623e-05, "learning_rate": 6.312290420154694e-05, "loss": 0.0, "step": 2490 }, { "epoch": 62.5, "grad_norm": 4.272747173672542e-05, "learning_rate": 6.238961275780613e-05, "loss": 0.0, "step": 2500 }, { "epoch": 62.75, "grad_norm": 1.2389010407787282e-05, "learning_rate": 6.165866939217328e-05, "loss": 0.0, "step": 2510 }, { "epoch": 63.0, "grad_norm": 1.4621130503655877e-05, "learning_rate": 6.0930119738640445e-05, "loss": 0.0, "step": 2520 }, { "epoch": 63.0, "eval_loss": 8.148024335241644e-07, "eval_runtime": 0.3292, "eval_samples_per_second": 109.354, "eval_steps_per_second": 15.188, "step": 2520 }, { "epoch": 63.25, "grad_norm": 1.0234934961772524e-05, "learning_rate": 6.020400928175637e-05, "loss": 0.0, "step": 2530 }, { "epoch": 63.5, "grad_norm": 1.937254455697257e-05, "learning_rate": 5.948038335378683e-05, "loss": 0.0, "step": 2540 }, { "epoch": 63.75, "grad_norm": 1.764351145538967e-05, "learning_rate": 5.8759287131884246e-05, "loss": 0.0, "step": 2550 }, { "epoch": 64.0, "grad_norm": 2.3509826860390604e-05, "learning_rate": 5.804076563526744e-05, "loss": 0.0, "step": 2560 }, { "epoch": 64.0, "eval_loss": 8.072562422967167e-07, "eval_runtime": 0.3217, "eval_samples_per_second": 111.904, "eval_steps_per_second": 15.542, "step": 2560 }, { "epoch": 64.25, "grad_norm": 1.288153634959599e-05, "learning_rate": 5.732486372241088e-05, "loss": 0.0, "step": 2570 }, { "epoch": 64.5, "grad_norm": 1.7124617443187162e-05, "learning_rate": 5.6611626088244194e-05, "loss": 0.0, "step": 2580 }, { "epoch": 64.75, "grad_norm": 3.4207390854135156e-05, "learning_rate": 5.5901097261361636e-05, "loss": 0.0, "step": 2590 }, { "epoch": 65.0, "grad_norm": 1.607052945473697e-05, "learning_rate": 5.5193321601242156e-05, "loss": 0.0, "step": 2600 }, { "epoch": 65.0, "eval_loss": 7.960065886436496e-07, "eval_runtime": 0.3236, "eval_samples_per_second": 111.263, "eval_steps_per_second": 15.453, "step": 2600 }, { "epoch": 65.25, "grad_norm": 2.7799209419754334e-05, "learning_rate": 5.448834329548016e-05, "loss": 0.0, "step": 2610 }, { "epoch": 65.5, "grad_norm": 1.6963076632237062e-05, "learning_rate": 5.378620635702643e-05, "loss": 0.0, "step": 2620 }, { "epoch": 65.75, "grad_norm": 1.7011914678732865e-05, "learning_rate": 5.308695462144068e-05, "loss": 0.0, "step": 2630 }, { "epoch": 66.0, "grad_norm": 1.719038118608296e-05, "learning_rate": 5.239063174415466e-05, "loss": 0.0, "step": 2640 }, { "epoch": 66.0, "eval_loss": 7.857981927372748e-07, "eval_runtime": 0.3159, "eval_samples_per_second": 113.963, "eval_steps_per_second": 15.828, "step": 2640 }, { "epoch": 66.25, "grad_norm": 1.87909827218391e-05, "learning_rate": 5.1697281197746596e-05, "loss": 0.0, "step": 2650 }, { "epoch": 66.5, "grad_norm": 1.997711297008209e-05, "learning_rate": 5.1006946269227376e-05, "loss": 0.0, "step": 2660 }, { "epoch": 66.75, "grad_norm": 2.0850015062023886e-05, "learning_rate": 5.03196700573378e-05, "loss": 0.0, "step": 2670 }, { "epoch": 67.0, "grad_norm": 2.2285566956270486e-05, "learning_rate": 4.963549546985799e-05, "loss": 0.0, "step": 2680 }, { "epoch": 67.0, "eval_loss": 7.721130259596976e-07, "eval_runtime": 0.3244, "eval_samples_per_second": 110.965, "eval_steps_per_second": 15.412, "step": 2680 }, { "epoch": 67.25, "grad_norm": 1.6444948414573446e-05, "learning_rate": 4.895446522092868e-05, "loss": 0.0, "step": 2690 }, { "epoch": 67.5, "grad_norm": 1.5268993593053892e-05, "learning_rate": 4.8276621828384225e-05, "loss": 0.0, "step": 2700 }, { "epoch": 67.75, "grad_norm": 1.7810820281738415e-05, "learning_rate": 4.760200761109852e-05, "loss": 0.0, "step": 2710 }, { "epoch": 68.0, "grad_norm": 1.7248778021894395e-05, "learning_rate": 4.6930664686342526e-05, "loss": 0.0, "step": 2720 }, { "epoch": 68.0, "eval_loss": 7.603679819112585e-07, "eval_runtime": 0.3117, "eval_samples_per_second": 115.513, "eval_steps_per_second": 16.044, "step": 2720 }, { "epoch": 68.25, "grad_norm": 2.448088525852654e-05, "learning_rate": 4.626263496715525e-05, "loss": 0.0, "step": 2730 }, { "epoch": 68.5, "grad_norm": 1.745475674397312e-05, "learning_rate": 4.559796015972677e-05, "loss": 0.0, "step": 2740 }, { "epoch": 68.75, "grad_norm": 1.6836595023050904e-05, "learning_rate": 4.49366817607945e-05, "loss": 0.0, "step": 2750 }, { "epoch": 69.0, "grad_norm": 2.0379737179609947e-05, "learning_rate": 4.427884105505251e-05, "loss": 0.0, "step": 2760 }, { "epoch": 69.0, "eval_loss": 7.604816119055613e-07, "eval_runtime": 0.3177, "eval_samples_per_second": 113.329, "eval_steps_per_second": 15.74, "step": 2760 }, { "epoch": 69.25, "grad_norm": 2.278652391396463e-05, "learning_rate": 4.362447911257406e-05, "loss": 0.0, "step": 2770 }, { "epoch": 69.5, "grad_norm": 1.2965742826054338e-05, "learning_rate": 4.297363678624753e-05, "loss": 0.0, "step": 2780 }, { "epoch": 69.75, "grad_norm": 1.8777451259666122e-05, "learning_rate": 4.2326354709225955e-05, "loss": 0.0, "step": 2790 }, { "epoch": 70.0, "grad_norm": 2.3537781089544296e-05, "learning_rate": 4.168267329239002e-05, "loss": 0.0, "step": 2800 }, { "epoch": 70.0, "eval_loss": 7.471541039194562e-07, "eval_runtime": 0.3194, "eval_samples_per_second": 112.703, "eval_steps_per_second": 15.653, "step": 2800 }, { "epoch": 70.25, "grad_norm": 1.4215344890544657e-05, "learning_rate": 4.104263272182546e-05, "loss": 0.0, "step": 2810 }, { "epoch": 70.5, "grad_norm": 1.8491147784516215e-05, "learning_rate": 4.0406272956313895e-05, "loss": 0.0, "step": 2820 }, { "epoch": 70.75, "grad_norm": 1.7631069567869417e-05, "learning_rate": 3.9773633724838265e-05, "loss": 0.0, "step": 2830 }, { "epoch": 71.0, "grad_norm": 1.9227232769480906e-05, "learning_rate": 3.914475452410257e-05, "loss": 0.0, "step": 2840 }, { "epoch": 71.0, "eval_loss": 7.375128916464746e-07, "eval_runtime": 0.321, "eval_samples_per_second": 112.152, "eval_steps_per_second": 15.577, "step": 2840 }, { "epoch": 71.25, "grad_norm": 1.6681302440701984e-05, "learning_rate": 3.8519674616065784e-05, "loss": 0.0, "step": 2850 }, { "epoch": 71.5, "grad_norm": 1.8769558664644137e-05, "learning_rate": 3.789843302549096e-05, "loss": 0.0, "step": 2860 }, { "epoch": 71.75, "grad_norm": 8.559236448490992e-06, "learning_rate": 3.7281068537508565e-05, "loss": 0.0, "step": 2870 }, { "epoch": 72.0, "grad_norm": 1.4404205103346612e-05, "learning_rate": 3.6667619695195285e-05, "loss": 0.0, "step": 2880 }, { "epoch": 72.0, "eval_loss": 7.320029453694588e-07, "eval_runtime": 0.3152, "eval_samples_per_second": 114.223, "eval_steps_per_second": 15.864, "step": 2880 }, { "epoch": 72.25, "grad_norm": 1.8397522580926307e-05, "learning_rate": 3.605812479716767e-05, "loss": 0.0, "step": 2890 }, { "epoch": 72.5, "grad_norm": 1.5880750652286224e-05, "learning_rate": 3.545262189519092e-05, "loss": 0.0, "step": 2900 }, { "epoch": 72.75, "grad_norm": 1.8930764781543985e-05, "learning_rate": 3.4851148791803465e-05, "loss": 0.0, "step": 2910 }, { "epoch": 73.0, "grad_norm": 4.1914405301213264e-05, "learning_rate": 3.425374303795675e-05, "loss": 0.0, "step": 2920 }, { "epoch": 73.0, "eval_loss": 7.22367474281782e-07, "eval_runtime": 0.319, "eval_samples_per_second": 112.859, "eval_steps_per_second": 15.675, "step": 2920 }, { "epoch": 73.25, "grad_norm": 1.0584836672933307e-05, "learning_rate": 3.3660441930671006e-05, "loss": 0.0, "step": 2930 }, { "epoch": 73.5, "grad_norm": 1.819963290472515e-05, "learning_rate": 3.3071282510706624e-05, "loss": 0.0, "step": 2940 }, { "epoch": 73.75, "grad_norm": 1.8003340301220305e-05, "learning_rate": 3.248630156025158e-05, "loss": 0.0, "step": 2950 }, { "epoch": 74.0, "grad_norm": 1.5387213352369145e-05, "learning_rate": 3.1905535600625314e-05, "loss": 0.0, "step": 2960 }, { "epoch": 74.0, "eval_loss": 7.147688734221447e-07, "eval_runtime": 0.3171, "eval_samples_per_second": 113.526, "eval_steps_per_second": 15.767, "step": 2960 }, { "epoch": 74.25, "grad_norm": 2.1973037291900255e-05, "learning_rate": 3.1329020889998306e-05, "loss": 0.0, "step": 2970 }, { "epoch": 74.5, "grad_norm": 1.8727620044955984e-05, "learning_rate": 3.075679342112874e-05, "loss": 0.0, "step": 2980 }, { "epoch": 74.75, "grad_norm": 1.0095293873746414e-05, "learning_rate": 3.01888889191152e-05, "loss": 0.0, "step": 2990 }, { "epoch": 75.0, "grad_norm": 1.2027586308249738e-05, "learning_rate": 2.9625342839166316e-05, "loss": 0.0, "step": 3000 }, { "epoch": 75.0, "eval_loss": 7.11524990038015e-07, "eval_runtime": 0.3322, "eval_samples_per_second": 108.367, "eval_steps_per_second": 15.051, "step": 3000 }, { "epoch": 75.25, "grad_norm": 2.197036155848764e-05, "learning_rate": 2.9066190364387437e-05, "loss": 0.0, "step": 3010 }, { "epoch": 75.5, "grad_norm": 1.3477620086632669e-05, "learning_rate": 2.8511466403583766e-05, "loss": 0.0, "step": 3020 }, { "epoch": 75.75, "grad_norm": 1.1739802175725345e-05, "learning_rate": 2.796120558908124e-05, "loss": 0.0, "step": 3030 }, { "epoch": 76.0, "grad_norm": 3.1627434509573504e-05, "learning_rate": 2.7415442274564273e-05, "loss": 0.0, "step": 3040 }, { "epoch": 76.0, "eval_loss": 7.128418815227633e-07, "eval_runtime": 0.315, "eval_samples_per_second": 114.285, "eval_steps_per_second": 15.873, "step": 3040 }, { "epoch": 76.25, "grad_norm": 9.673092790762894e-06, "learning_rate": 2.6874210532930855e-05, "loss": 0.0, "step": 3050 }, { "epoch": 76.5, "grad_norm": 1.989353768294677e-05, "learning_rate": 2.6337544154165604e-05, "loss": 0.0, "step": 3060 }, { "epoch": 76.75, "grad_norm": 1.5490039004362188e-05, "learning_rate": 2.5805476643229952e-05, "loss": 0.0, "step": 3070 }, { "epoch": 77.0, "grad_norm": 1.1932146662729792e-05, "learning_rate": 2.527804121797048e-05, "loss": 0.0, "step": 3080 }, { "epoch": 77.0, "eval_loss": 7.000676305324305e-07, "eval_runtime": 0.3245, "eval_samples_per_second": 110.942, "eval_steps_per_second": 15.409, "step": 3080 }, { "epoch": 77.25, "grad_norm": 1.2189483641122933e-05, "learning_rate": 2.4755270807045174e-05, "loss": 0.0, "step": 3090 }, { "epoch": 77.5, "grad_norm": 2.792781378957443e-05, "learning_rate": 2.423719804786737e-05, "loss": 0.0, "step": 3100 }, { "epoch": 77.75, "grad_norm": 1.3213076272222679e-05, "learning_rate": 2.3723855284568462e-05, "loss": 0.0, "step": 3110 }, { "epoch": 78.0, "grad_norm": 2.2985013856668957e-05, "learning_rate": 2.321527456597833e-05, "loss": 0.0, "step": 3120 }, { "epoch": 78.0, "eval_loss": 6.937642069715366e-07, "eval_runtime": 0.323, "eval_samples_per_second": 111.463, "eval_steps_per_second": 15.481, "step": 3120 }, { "epoch": 78.25, "grad_norm": 1.1034126146114431e-05, "learning_rate": 2.2711487643624675e-05, "loss": 0.0, "step": 3130 }, { "epoch": 78.5, "grad_norm": 1.3156452041584998e-05, "learning_rate": 2.2212525969750643e-05, "loss": 0.0, "step": 3140 }, { "epoch": 78.75, "grad_norm": 1.0150353773497045e-05, "learning_rate": 2.171842069535116e-05, "loss": 0.0, "step": 3150 }, { "epoch": 79.0, "grad_norm": 3.457269485807046e-05, "learning_rate": 2.1229202668228197e-05, "loss": 0.0, "step": 3160 }, { "epoch": 79.0, "eval_loss": 6.983178195696382e-07, "eval_runtime": 0.3211, "eval_samples_per_second": 112.129, "eval_steps_per_second": 15.573, "step": 3160 }, { "epoch": 79.25, "grad_norm": 1.4804916645516641e-05, "learning_rate": 2.074490243106485e-05, "loss": 0.0, "step": 3170 }, { "epoch": 79.5, "grad_norm": 1.8004166122409515e-05, "learning_rate": 2.026555021951858e-05, "loss": 0.0, "step": 3180 }, { "epoch": 79.75, "grad_norm": 2.1705473045585677e-05, "learning_rate": 1.9791175960333487e-05, "loss": 0.0, "step": 3190 }, { "epoch": 80.0, "grad_norm": 1.0873730388993863e-05, "learning_rate": 1.932180926947189e-05, "loss": 0.0, "step": 3200 }, { "epoch": 80.0, "eval_loss": 6.858597316750092e-07, "eval_runtime": 0.3385, "eval_samples_per_second": 106.338, "eval_steps_per_second": 14.769, "step": 3200 }, { "epoch": 80.25, "grad_norm": 1.706531475065276e-05, "learning_rate": 1.8857479450265503e-05, "loss": 0.0, "step": 3210 }, { "epoch": 80.5, "grad_norm": 2.120017961715348e-05, "learning_rate": 1.839821549158579e-05, "loss": 0.0, "step": 3220 }, { "epoch": 80.75, "grad_norm": 1.3771560588793363e-05, "learning_rate": 1.794404606603434e-05, "loss": 0.0, "step": 3230 }, { "epoch": 81.0, "grad_norm": 1.798778430384118e-05, "learning_rate": 1.74949995281526e-05, "loss": 0.0, "step": 3240 }, { "epoch": 81.0, "eval_loss": 6.865074624329282e-07, "eval_runtime": 0.3201, "eval_samples_per_second": 112.473, "eval_steps_per_second": 15.621, "step": 3240 }, { "epoch": 81.25, "grad_norm": 1.246057126991218e-05, "learning_rate": 1.705110391265179e-05, "loss": 0.0, "step": 3250 }, { "epoch": 81.5, "grad_norm": 2.145354483218398e-05, "learning_rate": 1.6612386932662627e-05, "loss": 0.0, "step": 3260 }, { "epoch": 81.75, "grad_norm": 9.91187789622927e-06, "learning_rate": 1.6178875978005058e-05, "loss": 0.0, "step": 3270 }, { "epoch": 82.0, "grad_norm": 2.266502815473359e-05, "learning_rate": 1.57505981134784e-05, "loss": 0.0, "step": 3280 }, { "epoch": 82.0, "eval_loss": 6.804688723605068e-07, "eval_runtime": 0.3188, "eval_samples_per_second": 112.926, "eval_steps_per_second": 15.684, "step": 3280 }, { "epoch": 82.25, "grad_norm": 1.2877572771685664e-05, "learning_rate": 1.5327580077171587e-05, "loss": 0.0, "step": 3290 }, { "epoch": 82.5, "grad_norm": 2.6757013984024525e-05, "learning_rate": 1.4909848278793782e-05, "loss": 0.0, "step": 3300 }, { "epoch": 82.75, "grad_norm": 1.6225705621764064e-05, "learning_rate": 1.4497428798025736e-05, "loss": 0.0, "step": 3310 }, { "epoch": 83.0, "grad_norm": 1.1286027074675076e-05, "learning_rate": 1.4090347382891455e-05, "loss": 0.0, "step": 3320 }, { "epoch": 83.0, "eval_loss": 6.749939984729281e-07, "eval_runtime": 0.3162, "eval_samples_per_second": 113.859, "eval_steps_per_second": 15.814, "step": 3320 }, { "epoch": 83.25, "grad_norm": 1.8980854292749427e-05, "learning_rate": 1.3688629448150747e-05, "loss": 0.0, "step": 3330 }, { "epoch": 83.5, "grad_norm": 1.4416699741559569e-05, "learning_rate": 1.3292300073712615e-05, "loss": 0.0, "step": 3340 }, { "epoch": 83.75, "grad_norm": 2.0767629393958487e-05, "learning_rate": 1.2901384003069328e-05, "loss": 0.0, "step": 3350 }, { "epoch": 84.0, "grad_norm": 1.8946042473544367e-05, "learning_rate": 1.2515905641751824e-05, "loss": 0.0, "step": 3360 }, { "epoch": 84.0, "eval_loss": 6.791258897465013e-07, "eval_runtime": 0.3165, "eval_samples_per_second": 113.733, "eval_steps_per_second": 15.796, "step": 3360 }, { "epoch": 84.25, "grad_norm": 2.060659790004138e-05, "learning_rate": 1.2135889055805837e-05, "loss": 0.0, "step": 3370 }, { "epoch": 84.5, "grad_norm": 2.793761814245954e-05, "learning_rate": 1.1761357970289588e-05, "loss": 0.0, "step": 3380 }, { "epoch": 84.75, "grad_norm": 1.5529620213783346e-05, "learning_rate": 1.1392335767792505e-05, "loss": 0.0, "step": 3390 }, { "epoch": 85.0, "grad_norm": 1.4130602721706964e-05, "learning_rate": 1.1028845486975403e-05, "loss": 0.0, "step": 3400 }, { "epoch": 85.0, "eval_loss": 6.698858783238393e-07, "eval_runtime": 0.3198, "eval_samples_per_second": 112.588, "eval_steps_per_second": 15.637, "step": 3400 }, { "epoch": 85.25, "grad_norm": 1.6358992070308886e-05, "learning_rate": 1.0670909821132136e-05, "loss": 0.0, "step": 3410 }, { "epoch": 85.5, "grad_norm": 3.4115200833184645e-05, "learning_rate": 1.0318551116772923e-05, "loss": 0.0, "step": 3420 }, { "epoch": 85.75, "grad_norm": 1.7895346900331788e-05, "learning_rate": 9.971791372229044e-06, "loss": 0.0, "step": 3430 }, { "epoch": 86.0, "grad_norm": 1.2962746950506698e-05, "learning_rate": 9.630652236279625e-06, "loss": 0.0, "step": 3440 }, { "epoch": 86.0, "eval_loss": 6.754976880074537e-07, "eval_runtime": 0.3233, "eval_samples_per_second": 111.361, "eval_steps_per_second": 15.467, "step": 3440 }, { "epoch": 86.25, "grad_norm": 1.5895795513642952e-05, "learning_rate": 9.295155006799917e-06, "loss": 0.0, "step": 3450 }, { "epoch": 86.5, "grad_norm": 1.6078374756034464e-05, "learning_rate": 8.96532062943175e-06, "loss": 0.0, "step": 3460 }, { "epoch": 86.75, "grad_norm": 1.0541101801209152e-05, "learning_rate": 8.641169696275831e-06, "loss": 0.0, "step": 3470 }, { "epoch": 87.0, "grad_norm": 1.940759102581069e-05, "learning_rate": 8.322722444606079e-06, "loss": 0.0, "step": 3480 }, { "epoch": 87.0, "eval_loss": 6.688477469651843e-07, "eval_runtime": 0.3141, "eval_samples_per_second": 114.612, "eval_steps_per_second": 15.918, "step": 3480 }, { "epoch": 87.25, "grad_norm": 1.1041982361348346e-05, "learning_rate": 8.009998755606263e-06, "loss": 0.0, "step": 3490 }, { "epoch": 87.5, "grad_norm": 2.7860867703566328e-05, "learning_rate": 7.703018153128739e-06, "loss": 0.0, "step": 3500 }, { "epoch": 87.75, "grad_norm": 8.007168617041316e-06, "learning_rate": 7.401799802475573e-06, "loss": 0.0, "step": 3510 }, { "epoch": 88.0, "grad_norm": 1.64666762429988e-05, "learning_rate": 7.106362509202036e-06, "loss": 0.0, "step": 3520 }, { "epoch": 88.0, "eval_loss": 6.721416525579116e-07, "eval_runtime": 0.3256, "eval_samples_per_second": 110.567, "eval_steps_per_second": 15.357, "step": 3520 }, { "epoch": 88.25, "grad_norm": 1.7286309230257757e-05, "learning_rate": 6.816724717942435e-06, "loss": 0.0, "step": 3530 }, { "epoch": 88.5, "grad_norm": 2.7998203222523443e-05, "learning_rate": 6.532904511258753e-06, "loss": 0.0, "step": 3540 }, { "epoch": 88.75, "grad_norm": 1.3463857612805441e-05, "learning_rate": 6.254919608511544e-06, "loss": 0.0, "step": 3550 }, { "epoch": 89.0, "grad_norm": 1.6592677638982423e-05, "learning_rate": 5.982787364753872e-06, "loss": 0.0, "step": 3560 }, { "epoch": 89.0, "eval_loss": 6.658329425590637e-07, "eval_runtime": 0.3184, "eval_samples_per_second": 113.061, "eval_steps_per_second": 15.703, "step": 3560 }, { "epoch": 89.25, "grad_norm": 2.4364608179894276e-05, "learning_rate": 5.716524769647646e-06, "loss": 0.0, "step": 3570 }, { "epoch": 89.5, "grad_norm": 1.557578070787713e-05, "learning_rate": 5.456148446402976e-06, "loss": 0.0, "step": 3580 }, { "epoch": 89.75, "grad_norm": 9.06794684851775e-06, "learning_rate": 5.2016746507404295e-06, "loss": 0.0, "step": 3590 }, { "epoch": 90.0, "grad_norm": 2.0602865333785303e-05, "learning_rate": 4.953119269876061e-06, "loss": 0.0, "step": 3600 }, { "epoch": 90.0, "eval_loss": 6.674051178379159e-07, "eval_runtime": 0.3268, "eval_samples_per_second": 110.152, "eval_steps_per_second": 15.299, "step": 3600 }, { "epoch": 90.25, "grad_norm": 2.491854138497729e-05, "learning_rate": 4.710497821529625e-06, "loss": 0.0, "step": 3610 }, { "epoch": 90.5, "grad_norm": 1.2203651749587152e-05, "learning_rate": 4.473825452955716e-06, "loss": 0.0, "step": 3620 }, { "epoch": 90.75, "grad_norm": 2.5209032173734158e-05, "learning_rate": 4.2431169399981485e-06, "loss": 0.0, "step": 3630 }, { "epoch": 91.0, "grad_norm": 1.514551604486769e-05, "learning_rate": 4.018386686167452e-06, "loss": 0.0, "step": 3640 }, { "epoch": 91.0, "eval_loss": 6.590207135559467e-07, "eval_runtime": 0.3159, "eval_samples_per_second": 113.952, "eval_steps_per_second": 15.827, "step": 3640 }, { "epoch": 91.25, "grad_norm": 9.24188134376891e-06, "learning_rate": 3.7996487217416223e-06, "loss": 0.0, "step": 3650 }, { "epoch": 91.5, "grad_norm": 1.9695198716362938e-05, "learning_rate": 3.5869167028902195e-06, "loss": 0.0, "step": 3660 }, { "epoch": 91.75, "grad_norm": 8.883437658369076e-06, "learning_rate": 3.380203910821833e-06, "loss": 0.0, "step": 3670 }, { "epoch": 92.0, "grad_norm": 3.180091880494729e-05, "learning_rate": 3.1795232509547633e-06, "loss": 0.0, "step": 3680 }, { "epoch": 92.0, "eval_loss": 6.601708264497574e-07, "eval_runtime": 0.3151, "eval_samples_per_second": 114.24, "eval_steps_per_second": 15.867, "step": 3680 }, { "epoch": 92.25, "grad_norm": 2.3148995751398616e-05, "learning_rate": 2.98488725211149e-06, "loss": 0.0, "step": 3690 }, { "epoch": 92.5, "grad_norm": 1.9166613128618337e-05, "learning_rate": 2.796308065736364e-06, "loss": 0.0, "step": 3700 }, { "epoch": 92.75, "grad_norm": 1.4863600881653838e-05, "learning_rate": 2.6137974651370134e-06, "loss": 0.0, "step": 3710 }, { "epoch": 93.0, "grad_norm": 1.7678094081929885e-05, "learning_rate": 2.4373668447493224e-06, "loss": 0.0, "step": 3720 }, { "epoch": 93.0, "eval_loss": 6.622615842388768e-07, "eval_runtime": 0.3193, "eval_samples_per_second": 112.738, "eval_steps_per_second": 15.658, "step": 3720 }, { "epoch": 93.25, "grad_norm": 2.3845455871196464e-05, "learning_rate": 2.2670272194260324e-06, "loss": 0.0, "step": 3730 }, { "epoch": 93.5, "grad_norm": 1.4557038412021939e-05, "learning_rate": 2.102789223749102e-06, "loss": 0.0, "step": 3740 }, { "epoch": 93.75, "grad_norm": 2.4488541384926066e-05, "learning_rate": 1.9446631113657187e-06, "loss": 0.0, "step": 3750 }, { "epoch": 94.0, "grad_norm": 1.9359116777195595e-05, "learning_rate": 1.7926587543482088e-06, "loss": 0.0, "step": 3760 }, { "epoch": 94.0, "eval_loss": 6.639800744778768e-07, "eval_runtime": 0.3201, "eval_samples_per_second": 112.453, "eval_steps_per_second": 15.618, "step": 3760 }, { "epoch": 94.25, "grad_norm": 1.9722852812265046e-05, "learning_rate": 1.6467856425776863e-06, "loss": 0.0, "step": 3770 }, { "epoch": 94.5, "grad_norm": 1.831287045206409e-05, "learning_rate": 1.5070528831515384e-06, "loss": 0.0, "step": 3780 }, { "epoch": 94.75, "grad_norm": 2.3000593500910327e-05, "learning_rate": 1.3734691998149474e-06, "loss": 0.0, "step": 3790 }, { "epoch": 95.0, "grad_norm": 1.1854316653625574e-05, "learning_rate": 1.246042932416136e-06, "loss": 0.0, "step": 3800 }, { "epoch": 95.0, "eval_loss": 6.561905934177048e-07, "eval_runtime": 0.318, "eval_samples_per_second": 113.209, "eval_steps_per_second": 15.724, "step": 3800 }, { "epoch": 95.25, "grad_norm": 1.4117299542704131e-05, "learning_rate": 1.1247820363858075e-06, "loss": 0.0, "step": 3810 }, { "epoch": 95.5, "grad_norm": 1.986858478630893e-05, "learning_rate": 1.00969408224042e-06, "loss": 0.0, "step": 3820 }, { "epoch": 95.75, "grad_norm": 2.425446109555196e-05, "learning_rate": 9.007862551095314e-07, "loss": 0.0, "step": 3830 }, { "epoch": 96.0, "grad_norm": 1.7652260794420727e-05, "learning_rate": 7.980653542872584e-07, "loss": 0.0, "step": 3840 }, { "epoch": 96.0, "eval_loss": 6.501233542621776e-07, "eval_runtime": 0.3244, "eval_samples_per_second": 110.979, "eval_steps_per_second": 15.414, "step": 3840 }, { "epoch": 96.25, "grad_norm": 1.0838626621989533e-05, "learning_rate": 7.015377928077827e-07, "loss": 0.0, "step": 3850 }, { "epoch": 96.5, "grad_norm": 1.3126472367730457e-05, "learning_rate": 6.11209597044926e-07, "loss": 0.0, "step": 3860 }, { "epoch": 96.75, "grad_norm": 2.100517667713575e-05, "learning_rate": 5.27086406335997e-07, "loss": 0.0, "step": 3870 }, { "epoch": 97.0, "grad_norm": 1.3467181815940421e-05, "learning_rate": 4.4917347262962705e-07, "loss": 0.0, "step": 3880 }, { "epoch": 97.0, "eval_loss": 6.613539653699263e-07, "eval_runtime": 0.3133, "eval_samples_per_second": 114.905, "eval_steps_per_second": 15.959, "step": 3880 }, { "epoch": 97.25, "grad_norm": 2.3665135813644156e-05, "learning_rate": 3.774756601579443e-07, "loss": 0.0, "step": 3890 }, { "epoch": 97.5, "grad_norm": 1.761297244229354e-05, "learning_rate": 3.119974451328833e-07, "loss": 0.0, "step": 3900 }, { "epoch": 97.75, "grad_norm": 2.0256773495930247e-05, "learning_rate": 2.5274291546669717e-07, "loss": 0.0, "step": 3910 }, { "epoch": 98.0, "grad_norm": 1.0930380994977895e-05, "learning_rate": 1.9971577051678404e-07, "loss": 0.0, "step": 3920 }, { "epoch": 98.0, "eval_loss": 6.56454744785151e-07, "eval_runtime": 0.3159, "eval_samples_per_second": 113.953, "eval_steps_per_second": 15.827, "step": 3920 }, { "epoch": 98.25, "grad_norm": 2.0974179278709926e-05, "learning_rate": 1.5291932085468307e-07, "loss": 0.0, "step": 3930 }, { "epoch": 98.5, "grad_norm": 2.5038380044861697e-05, "learning_rate": 1.1235648805945075e-07, "loss": 0.0, "step": 3940 }, { "epoch": 98.75, "grad_norm": 1.5341527614509687e-05, "learning_rate": 7.802980453519571e-08, "loss": 0.0, "step": 3950 }, { "epoch": 99.0, "grad_norm": 1.3212208614277188e-05, "learning_rate": 4.994141335303848e-08, "loss": 0.0, "step": 3960 }, { "epoch": 99.0, "eval_loss": 6.549934710164962e-07, "eval_runtime": 0.3268, "eval_samples_per_second": 110.154, "eval_steps_per_second": 15.299, "step": 3960 }, { "epoch": 99.25, "grad_norm": 1.4808772903052159e-05, "learning_rate": 2.8093068117240885e-08, "loss": 0.0, "step": 3970 }, { "epoch": 99.5, "grad_norm": 1.7599566490389407e-05, "learning_rate": 1.2486132855826781e-08, "loss": 0.0, "step": 3980 }, { "epoch": 99.75, "grad_norm": 9.601525562175084e-06, "learning_rate": 3.121581935328077e-09, "loss": 0.0, "step": 3990 }, { "epoch": 100.0, "grad_norm": 2.9180186174926348e-05, "learning_rate": 0.0, "loss": 0.0, "step": 4000 }, { "epoch": 100.0, "eval_loss": 6.550197895194287e-07, "eval_runtime": 0.3302, "eval_samples_per_second": 109.012, "eval_steps_per_second": 15.141, "step": 4000 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.684080299081728e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }