|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 100.0, |
|
"eval_steps": 40, |
|
"global_step": 4000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.025, |
|
"eval_loss": 0.9384576082229614, |
|
"eval_runtime": 0.4222, |
|
"eval_samples_per_second": 85.258, |
|
"eval_steps_per_second": 11.841, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.06372307986021042, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.2623, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.029395487159490585, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.0007, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.7581191062927246, |
|
"learning_rate": 0.00019999887622676146, |
|
"loss": 0.026, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.48524343967437744, |
|
"learning_rate": 0.00019999200881510367, |
|
"loss": 0.0292, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.004332332406193018, |
|
"eval_runtime": 0.3245, |
|
"eval_samples_per_second": 110.942, |
|
"eval_steps_per_second": 15.409, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 43.160125732421875, |
|
"learning_rate": 0.00019997889873847797, |
|
"loss": 0.1101, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 71.57295989990234, |
|
"learning_rate": 0.00019995954681536798, |
|
"loss": 0.0241, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.3996742069721222, |
|
"learning_rate": 0.00019993395425394592, |
|
"loss": 0.0163, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.061198778450489044, |
|
"learning_rate": 0.00019990212265199738, |
|
"loss": 0.0148, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.033167850226163864, |
|
"eval_runtime": 0.3415, |
|
"eval_samples_per_second": 105.418, |
|
"eval_steps_per_second": 14.641, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.403183698654175, |
|
"learning_rate": 0.0001998640539968214, |
|
"loss": 0.01, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.304408550262451, |
|
"learning_rate": 0.00019981975066510655, |
|
"loss": 0.0435, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.02899610437452793, |
|
"learning_rate": 0.00019976921542278237, |
|
"loss": 0.0296, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 3.8328208923339844, |
|
"learning_rate": 0.0001997124514248469, |
|
"loss": 0.1015, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.00442217942327261, |
|
"eval_runtime": 0.3282, |
|
"eval_samples_per_second": 109.685, |
|
"eval_steps_per_second": 15.234, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.07601974904537201, |
|
"learning_rate": 0.00019964946221516953, |
|
"loss": 0.0273, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.02951742894947529, |
|
"learning_rate": 0.00019958025172626986, |
|
"loss": 0.0316, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.09413129091262817, |
|
"learning_rate": 0.00019950482427907211, |
|
"loss": 0.0071, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.0033842374105006456, |
|
"learning_rate": 0.0001994231845826354, |
|
"loss": 0.0002, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.00014786835527047515, |
|
"eval_runtime": 0.3249, |
|
"eval_samples_per_second": 110.813, |
|
"eval_steps_per_second": 15.391, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.12055602669715881, |
|
"learning_rate": 0.00019933533773385976, |
|
"loss": 0.0001, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.007594508584588766, |
|
"learning_rate": 0.00019924128921716797, |
|
"loss": 0.0001, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.000780335278250277, |
|
"learning_rate": 0.000199141044904163, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.0012850259663537145, |
|
"learning_rate": 0.00019903461105326154, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.7732399757951498e-05, |
|
"eval_runtime": 0.3542, |
|
"eval_samples_per_second": 101.634, |
|
"eval_steps_per_second": 14.116, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.0005795760662294924, |
|
"learning_rate": 0.0001989219943093034, |
|
"loss": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.0004225255688652396, |
|
"learning_rate": 0.0001988032017031364, |
|
"loss": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.0006476517883129418, |
|
"learning_rate": 0.00019867824065117765, |
|
"loss": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.0004615155339706689, |
|
"learning_rate": 0.00019854711895495036, |
|
"loss": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.7349062545690686e-05, |
|
"eval_runtime": 0.328, |
|
"eval_samples_per_second": 109.765, |
|
"eval_steps_per_second": 15.245, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.0003548109089024365, |
|
"learning_rate": 0.00019840984480059689, |
|
"loss": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.0010875174775719643, |
|
"learning_rate": 0.0001982664267583677, |
|
"loss": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.0003341367410030216, |
|
"learning_rate": 0.00019811687378208613, |
|
"loss": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.00045011454494670033, |
|
"learning_rate": 0.00019796119520858955, |
|
"loss": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 1.3329447938303929e-05, |
|
"eval_runtime": 0.3393, |
|
"eval_samples_per_second": 106.103, |
|
"eval_steps_per_second": 14.737, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.00023316974693443626, |
|
"learning_rate": 0.00019779940075714648, |
|
"loss": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.0002178147406084463, |
|
"learning_rate": 0.00019763150052884966, |
|
"loss": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.00018833605281542987, |
|
"learning_rate": 0.00019745750500598538, |
|
"loss": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.0005992311052978039, |
|
"learning_rate": 0.00019727742505137936, |
|
"loss": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.119767694035545e-05, |
|
"eval_runtime": 0.3439, |
|
"eval_samples_per_second": 104.694, |
|
"eval_steps_per_second": 14.541, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.00013200360990595073, |
|
"learning_rate": 0.00019709127190771825, |
|
"loss": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.0002380541991442442, |
|
"learning_rate": 0.00019689905719684782, |
|
"loss": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.00014980848936829716, |
|
"learning_rate": 0.00019670079291904752, |
|
"loss": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.0002899216488003731, |
|
"learning_rate": 0.00019649649145228102, |
|
"loss": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 9.466394658375066e-06, |
|
"eval_runtime": 0.3404, |
|
"eval_samples_per_second": 105.754, |
|
"eval_steps_per_second": 14.688, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.00021961786842439324, |
|
"learning_rate": 0.00019628616555142372, |
|
"loss": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.00020691509416792542, |
|
"learning_rate": 0.00019606982834746627, |
|
"loss": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 0.00023161708668339998, |
|
"learning_rate": 0.00019584749334669487, |
|
"loss": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.00017992363427765667, |
|
"learning_rate": 0.00019561917442984788, |
|
"loss": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 8.257883564510848e-06, |
|
"eval_runtime": 0.3275, |
|
"eval_samples_per_second": 109.923, |
|
"eval_steps_per_second": 15.267, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"grad_norm": 0.00013070827117189765, |
|
"learning_rate": 0.00019538488585124953, |
|
"loss": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 10.5, |
|
"grad_norm": 0.00018156137957703322, |
|
"learning_rate": 0.00019514464223791965, |
|
"loss": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"grad_norm": 0.0001987970608752221, |
|
"learning_rate": 0.00019489845858866066, |
|
"loss": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.00012906281335745007, |
|
"learning_rate": 0.00019464635027312128, |
|
"loss": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 7.331655979214702e-06, |
|
"eval_runtime": 0.3356, |
|
"eval_samples_per_second": 107.279, |
|
"eval_steps_per_second": 14.9, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 11.25, |
|
"grad_norm": 0.00031813167151995003, |
|
"learning_rate": 0.00019438833303083678, |
|
"loss": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 11.5, |
|
"grad_norm": 0.00016680177941452712, |
|
"learning_rate": 0.00019412442297024637, |
|
"loss": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 11.75, |
|
"grad_norm": 0.00013162715185899287, |
|
"learning_rate": 0.00019385463656768762, |
|
"loss": 0.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.00015330589667428285, |
|
"learning_rate": 0.00019357899066636773, |
|
"loss": 0.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 6.5182412072317675e-06, |
|
"eval_runtime": 0.3246, |
|
"eval_samples_per_second": 110.889, |
|
"eval_steps_per_second": 15.401, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 12.25, |
|
"grad_norm": 0.00018356599321123213, |
|
"learning_rate": 0.00019329750247531205, |
|
"loss": 0.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 0.00015767107834108174, |
|
"learning_rate": 0.00019301018956828964, |
|
"loss": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"grad_norm": 0.00029690677183680236, |
|
"learning_rate": 0.00019271706988271606, |
|
"loss": 0.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 9.481079177930951e-05, |
|
"learning_rate": 0.0001924181617185336, |
|
"loss": 0.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 5.88237071497133e-06, |
|
"eval_runtime": 0.3263, |
|
"eval_samples_per_second": 110.333, |
|
"eval_steps_per_second": 15.324, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 13.25, |
|
"grad_norm": 0.00016097365005407482, |
|
"learning_rate": 0.00019211348373706884, |
|
"loss": 0.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.0001369424571748823, |
|
"learning_rate": 0.0001918030549598674, |
|
"loss": 0.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 13.75, |
|
"grad_norm": 0.00018055856344290078, |
|
"learning_rate": 0.00019148689476750658, |
|
"loss": 0.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.00010365981870563701, |
|
"learning_rate": 0.00019116502289838523, |
|
"loss": 0.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 5.300180873746285e-06, |
|
"eval_runtime": 0.3471, |
|
"eval_samples_per_second": 103.705, |
|
"eval_steps_per_second": 14.404, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 7.365662168012932e-05, |
|
"learning_rate": 0.00019083745944749162, |
|
"loss": 0.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 14.5, |
|
"grad_norm": 0.00015878217527642846, |
|
"learning_rate": 0.00019050422486514878, |
|
"loss": 0.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 14.75, |
|
"grad_norm": 0.00016406863869633526, |
|
"learning_rate": 0.00019016533995573772, |
|
"loss": 0.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.0001134676203946583, |
|
"learning_rate": 0.0001898208258763987, |
|
"loss": 0.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 4.918438207823783e-06, |
|
"eval_runtime": 0.3237, |
|
"eval_samples_per_second": 111.198, |
|
"eval_steps_per_second": 15.444, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 15.25, |
|
"grad_norm": 0.00010196594666922465, |
|
"learning_rate": 0.00018947070413571026, |
|
"loss": 0.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 15.5, |
|
"grad_norm": 0.00013735589163843542, |
|
"learning_rate": 0.0001891149965923464, |
|
"loss": 0.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 15.75, |
|
"grad_norm": 8.303586218971759e-05, |
|
"learning_rate": 0.00018875372545371194, |
|
"loss": 0.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 8.282584167318419e-05, |
|
"learning_rate": 0.0001883869132745561, |
|
"loss": 0.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 4.482135864236625e-06, |
|
"eval_runtime": 0.3334, |
|
"eval_samples_per_second": 107.988, |
|
"eval_steps_per_second": 14.998, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 16.25, |
|
"grad_norm": 7.883716170908883e-05, |
|
"learning_rate": 0.00018801458295556435, |
|
"loss": 0.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 0.00016775316908024251, |
|
"learning_rate": 0.0001876367577419286, |
|
"loss": 0.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 16.75, |
|
"grad_norm": 8.655583224026486e-05, |
|
"learning_rate": 0.00018725346122189606, |
|
"loss": 0.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 6.624006346100941e-05, |
|
"learning_rate": 0.00018686471732529665, |
|
"loss": 0.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 4.2038600440719165e-06, |
|
"eval_runtime": 0.331, |
|
"eval_samples_per_second": 108.76, |
|
"eval_steps_per_second": 15.106, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 17.25, |
|
"grad_norm": 0.00013512188161257654, |
|
"learning_rate": 0.00018647055032204883, |
|
"loss": 0.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"grad_norm": 8.678815356688574e-05, |
|
"learning_rate": 0.0001860709848206446, |
|
"loss": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 6.948116788407788e-05, |
|
"learning_rate": 0.00018566604576661288, |
|
"loss": 0.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 7.376579014817253e-05, |
|
"learning_rate": 0.00018525575844096243, |
|
"loss": 0.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 3.883159479300957e-06, |
|
"eval_runtime": 0.3356, |
|
"eval_samples_per_second": 107.284, |
|
"eval_steps_per_second": 14.901, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 18.25, |
|
"grad_norm": 9.459959983360022e-05, |
|
"learning_rate": 0.0001848401484586034, |
|
"loss": 0.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 9.205293463310227e-05, |
|
"learning_rate": 0.00018441924176674794, |
|
"loss": 0.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 18.75, |
|
"grad_norm": 0.00011255600111326203, |
|
"learning_rate": 0.00018399306464329066, |
|
"loss": 0.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 6.045972986612469e-05, |
|
"learning_rate": 0.0001835616436951677, |
|
"loss": 0.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 3.604589437600225e-06, |
|
"eval_runtime": 0.3263, |
|
"eval_samples_per_second": 110.323, |
|
"eval_steps_per_second": 15.323, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 5.718848478863947e-05, |
|
"learning_rate": 0.00018312500585669584, |
|
"loss": 0.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 19.5, |
|
"grad_norm": 0.00010984807158820331, |
|
"learning_rate": 0.00018268317838789088, |
|
"loss": 0.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 19.75, |
|
"grad_norm": 4.868064570473507e-05, |
|
"learning_rate": 0.0001822361888727657, |
|
"loss": 0.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 7.550454029114917e-05, |
|
"learning_rate": 0.0001817840652176082, |
|
"loss": 0.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 3.3906435419339687e-06, |
|
"eval_runtime": 0.3395, |
|
"eval_samples_per_second": 106.05, |
|
"eval_steps_per_second": 14.729, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 20.25, |
|
"grad_norm": 6.606967508560047e-05, |
|
"learning_rate": 0.00018132683564923906, |
|
"loss": 0.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 20.5, |
|
"grad_norm": 0.0001721412845654413, |
|
"learning_rate": 0.00018086452871324954, |
|
"loss": 0.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 20.75, |
|
"grad_norm": 4.9960210162680596e-05, |
|
"learning_rate": 0.00018039717327221925, |
|
"loss": 0.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 5.9810005041072145e-05, |
|
"learning_rate": 0.00017992479850391417, |
|
"loss": 0.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 3.1668755582359154e-06, |
|
"eval_runtime": 0.3326, |
|
"eval_samples_per_second": 108.232, |
|
"eval_steps_per_second": 15.032, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 21.25, |
|
"grad_norm": 5.891801993129775e-05, |
|
"learning_rate": 0.00017944743389946524, |
|
"loss": 0.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 21.5, |
|
"grad_norm": 8.631425589555874e-05, |
|
"learning_rate": 0.0001789651092615269, |
|
"loss": 0.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 21.75, |
|
"grad_norm": 6.0596958064706996e-05, |
|
"learning_rate": 0.00017847785470241677, |
|
"loss": 0.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 7.751138764433563e-05, |
|
"learning_rate": 0.00017798570064223533, |
|
"loss": 0.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 2.9938312309241155e-06, |
|
"eval_runtime": 0.3268, |
|
"eval_samples_per_second": 110.167, |
|
"eval_steps_per_second": 15.301, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 22.25, |
|
"grad_norm": 6.764694990124553e-05, |
|
"learning_rate": 0.00017748867780696716, |
|
"loss": 0.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 22.5, |
|
"grad_norm": 7.38737580832094e-05, |
|
"learning_rate": 0.0001769868172265623, |
|
"loss": 0.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 22.75, |
|
"grad_norm": 0.00010331822704756632, |
|
"learning_rate": 0.00017648015023299918, |
|
"loss": 0.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.00010948543786071241, |
|
"learning_rate": 0.0001759687084583285, |
|
"loss": 0.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 2.7970015707978746e-06, |
|
"eval_runtime": 0.3433, |
|
"eval_samples_per_second": 104.875, |
|
"eval_steps_per_second": 14.566, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 23.25, |
|
"grad_norm": 4.273112062946893e-05, |
|
"learning_rate": 0.00017545252383269837, |
|
"loss": 0.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 23.5, |
|
"grad_norm": 0.0001338142465101555, |
|
"learning_rate": 0.00017493162858236077, |
|
"loss": 0.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 23.75, |
|
"grad_norm": 5.875607530470006e-05, |
|
"learning_rate": 0.00017440605522765984, |
|
"loss": 0.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 7.345333142438903e-05, |
|
"learning_rate": 0.00017387583658100142, |
|
"loss": 0.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 2.6630891625245567e-06, |
|
"eval_runtime": 0.3317, |
|
"eval_samples_per_second": 108.524, |
|
"eval_steps_per_second": 15.073, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 24.25, |
|
"grad_norm": 6.94195696269162e-05, |
|
"learning_rate": 0.00017334100574480435, |
|
"loss": 0.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 24.5, |
|
"grad_norm": 4.8001227696659043e-05, |
|
"learning_rate": 0.0001728015961094343, |
|
"loss": 0.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 24.75, |
|
"grad_norm": 4.3018935684813187e-05, |
|
"learning_rate": 0.00017225764135111868, |
|
"loss": 0.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 7.503097003791481e-05, |
|
"learning_rate": 0.00017170917542984443, |
|
"loss": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 2.498412186469068e-06, |
|
"eval_runtime": 0.3252, |
|
"eval_samples_per_second": 110.685, |
|
"eval_steps_per_second": 15.373, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 25.25, |
|
"grad_norm": 2.499126276234165e-05, |
|
"learning_rate": 0.00017115623258723783, |
|
"loss": 0.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 25.5, |
|
"grad_norm": 8.122723374981433e-05, |
|
"learning_rate": 0.00017059884734442658, |
|
"loss": 0.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 25.75, |
|
"grad_norm": 5.7621167798060924e-05, |
|
"learning_rate": 0.00017003705449988486, |
|
"loss": 0.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 6.584699440281838e-05, |
|
"learning_rate": 0.00016947088912726052, |
|
"loss": 0.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 2.384617800998967e-06, |
|
"eval_runtime": 0.3289, |
|
"eval_samples_per_second": 109.466, |
|
"eval_steps_per_second": 15.204, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 26.25, |
|
"grad_norm": 3.284347985754721e-05, |
|
"learning_rate": 0.00016890038657318556, |
|
"loss": 0.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 26.5, |
|
"grad_norm": 6.672390009043738e-05, |
|
"learning_rate": 0.00016832558245506935, |
|
"loss": 0.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 26.75, |
|
"grad_norm": 3.635583561845124e-05, |
|
"learning_rate": 0.0001677465126588749, |
|
"loss": 0.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 5.236966899246909e-05, |
|
"learning_rate": 0.00016716321333687848, |
|
"loss": 0.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 2.2538335997523973e-06, |
|
"eval_runtime": 0.327, |
|
"eval_samples_per_second": 110.094, |
|
"eval_steps_per_second": 15.291, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 27.25, |
|
"grad_norm": 5.55117912881542e-05, |
|
"learning_rate": 0.00016657572090541262, |
|
"loss": 0.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 27.5, |
|
"grad_norm": 0.00013249287439975888, |
|
"learning_rate": 0.0001659840720425926, |
|
"loss": 0.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 27.75, |
|
"grad_norm": 5.55339029233437e-05, |
|
"learning_rate": 0.00016538830368602648, |
|
"loss": 0.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 5.33119855390396e-05, |
|
"learning_rate": 0.0001647884530305089, |
|
"loss": 0.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 2.159326413675444e-06, |
|
"eval_runtime": 0.3173, |
|
"eval_samples_per_second": 113.452, |
|
"eval_steps_per_second": 15.757, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 28.25, |
|
"grad_norm": 6.674770702375099e-05, |
|
"learning_rate": 0.00016418455752569943, |
|
"loss": 0.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 28.5, |
|
"grad_norm": 5.4036871006246656e-05, |
|
"learning_rate": 0.00016357665487378397, |
|
"loss": 0.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 28.75, |
|
"grad_norm": 9.294509800383821e-05, |
|
"learning_rate": 0.00016296478302712126, |
|
"loss": 0.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 6.301044049905613e-05, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 2.0828572360187536e-06, |
|
"eval_runtime": 0.3199, |
|
"eval_samples_per_second": 112.52, |
|
"eval_steps_per_second": 15.628, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 29.25, |
|
"grad_norm": 6.311033212114125e-05, |
|
"learning_rate": 0.00016172928479562078, |
|
"loss": 0.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 29.5, |
|
"grad_norm": 3.820831625489518e-05, |
|
"learning_rate": 0.00016110573554496224, |
|
"loss": 0.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 29.75, |
|
"grad_norm": 4.628980968846008e-05, |
|
"learning_rate": 0.00016047837136309924, |
|
"loss": 0.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 3.80598139599897e-05, |
|
"learning_rate": 0.00015984723141740576, |
|
"loss": 0.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 1.9744732071558246e-06, |
|
"eval_runtime": 0.3173, |
|
"eval_samples_per_second": 113.449, |
|
"eval_steps_per_second": 15.757, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 30.25, |
|
"grad_norm": 3.0195853469194844e-05, |
|
"learning_rate": 0.00015921235511098282, |
|
"loss": 0.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 30.5, |
|
"grad_norm": 5.462007538881153e-05, |
|
"learning_rate": 0.00015857378208019863, |
|
"loss": 0.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 30.75, |
|
"grad_norm": 2.7883037546416745e-05, |
|
"learning_rate": 0.00015793155219221395, |
|
"loss": 0.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"grad_norm": 4.7888908738968894e-05, |
|
"learning_rate": 0.00015728570554249312, |
|
"loss": 0.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_loss": 1.8858928569898126e-06, |
|
"eval_runtime": 0.3223, |
|
"eval_samples_per_second": 111.705, |
|
"eval_steps_per_second": 15.515, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 31.25, |
|
"grad_norm": 4.82973555335775e-05, |
|
"learning_rate": 0.0001566362824523008, |
|
"loss": 0.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 31.5, |
|
"grad_norm": 3.9442336856154725e-05, |
|
"learning_rate": 0.00015598332346618472, |
|
"loss": 0.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 31.75, |
|
"grad_norm": 3.770321563933976e-05, |
|
"learning_rate": 0.00015532686934944438, |
|
"loss": 0.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 4.669040936278179e-05, |
|
"learning_rate": 0.00015466696108558611, |
|
"loss": 0.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 1.8240966710436624e-06, |
|
"eval_runtime": 0.3185, |
|
"eval_samples_per_second": 113.013, |
|
"eval_steps_per_second": 15.696, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 32.25, |
|
"grad_norm": 2.80893400486093e-05, |
|
"learning_rate": 0.00015400363987376413, |
|
"loss": 0.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 32.5, |
|
"grad_norm": 4.817240915144794e-05, |
|
"learning_rate": 0.00015333694712620877, |
|
"loss": 0.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 32.75, |
|
"grad_norm": 4.6051696699578315e-05, |
|
"learning_rate": 0.00015266692446564063, |
|
"loss": 0.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"grad_norm": 3.602392098400742e-05, |
|
"learning_rate": 0.00015199361372267252, |
|
"loss": 0.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_loss": 1.7236499161299434e-06, |
|
"eval_runtime": 0.3163, |
|
"eval_samples_per_second": 113.807, |
|
"eval_steps_per_second": 15.806, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 33.25, |
|
"grad_norm": 2.2813776013208553e-05, |
|
"learning_rate": 0.00015131705693319743, |
|
"loss": 0.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 33.5, |
|
"grad_norm": 7.926914986455813e-05, |
|
"learning_rate": 0.0001506372963357644, |
|
"loss": 0.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 33.75, |
|
"grad_norm": 6.877528358018026e-05, |
|
"learning_rate": 0.00014995437436894147, |
|
"loss": 0.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 2.7551081075216644e-05, |
|
"learning_rate": 0.0001492683336686661, |
|
"loss": 0.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 1.67099869941012e-06, |
|
"eval_runtime": 0.325, |
|
"eval_samples_per_second": 110.775, |
|
"eval_steps_per_second": 15.385, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 34.25, |
|
"grad_norm": 3.4323111322009936e-05, |
|
"learning_rate": 0.0001485792170655835, |
|
"loss": 0.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 34.5, |
|
"grad_norm": 3.862389348796569e-05, |
|
"learning_rate": 0.00014788706758237237, |
|
"loss": 0.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 34.75, |
|
"grad_norm": 3.117803134955466e-05, |
|
"learning_rate": 0.00014719192843105924, |
|
"loss": 0.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 3.452876626397483e-05, |
|
"learning_rate": 0.00014649384301032044, |
|
"loss": 0.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_loss": 1.6147401993293897e-06, |
|
"eval_runtime": 0.319, |
|
"eval_samples_per_second": 112.868, |
|
"eval_steps_per_second": 15.676, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 35.25, |
|
"grad_norm": 2.5607059797039255e-05, |
|
"learning_rate": 0.00014579285490277274, |
|
"loss": 0.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 35.5, |
|
"grad_norm": 7.004107465036213e-05, |
|
"learning_rate": 0.0001450890078722524, |
|
"loss": 0.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 35.75, |
|
"grad_norm": 5.070870975032449e-05, |
|
"learning_rate": 0.00014438234586108297, |
|
"loss": 0.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 2.5347033442812972e-05, |
|
"learning_rate": 0.00014367291298733178, |
|
"loss": 0.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_loss": 1.5523125966865337e-06, |
|
"eval_runtime": 0.3195, |
|
"eval_samples_per_second": 112.683, |
|
"eval_steps_per_second": 15.65, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 36.25, |
|
"grad_norm": 3.3264463127125055e-05, |
|
"learning_rate": 0.0001429607535420557, |
|
"loss": 0.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 36.5, |
|
"grad_norm": 4.0014037949731573e-05, |
|
"learning_rate": 0.00014224591198653595, |
|
"loss": 0.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 36.75, |
|
"grad_norm": 4.455630187294446e-05, |
|
"learning_rate": 0.00014152843294950218, |
|
"loss": 0.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"grad_norm": 3.4259654057677835e-05, |
|
"learning_rate": 0.0001408083612243465, |
|
"loss": 0.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_loss": 1.506923695160367e-06, |
|
"eval_runtime": 0.3136, |
|
"eval_samples_per_second": 114.814, |
|
"eval_steps_per_second": 15.946, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 37.25, |
|
"grad_norm": 3.984866998507641e-05, |
|
"learning_rate": 0.00014008574176632666, |
|
"loss": 0.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 37.5, |
|
"grad_norm": 3.252027090638876e-05, |
|
"learning_rate": 0.00013936061968975957, |
|
"loss": 0.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 37.75, |
|
"grad_norm": 2.17838187381858e-05, |
|
"learning_rate": 0.00013863304026520473, |
|
"loss": 0.0, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 4.0549610275775194e-05, |
|
"learning_rate": 0.00013790304891663792, |
|
"loss": 0.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_loss": 1.457518123970658e-06, |
|
"eval_runtime": 0.3138, |
|
"eval_samples_per_second": 114.708, |
|
"eval_steps_per_second": 15.932, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 38.25, |
|
"grad_norm": 3.441906665102579e-05, |
|
"learning_rate": 0.00013717069121861527, |
|
"loss": 0.0, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 38.5, |
|
"grad_norm": 3.80768469767645e-05, |
|
"learning_rate": 0.00013643601289342803, |
|
"loss": 0.0, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 38.75, |
|
"grad_norm": 1.9130562577629462e-05, |
|
"learning_rate": 0.00013569905980824788, |
|
"loss": 0.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"grad_norm": 2.708647480176296e-05, |
|
"learning_rate": 0.0001349598779722636, |
|
"loss": 0.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_loss": 1.4059390878173872e-06, |
|
"eval_runtime": 0.326, |
|
"eval_samples_per_second": 110.43, |
|
"eval_steps_per_second": 15.337, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 39.25, |
|
"grad_norm": 2.7261641662335023e-05, |
|
"learning_rate": 0.00013421851353380857, |
|
"loss": 0.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 39.5, |
|
"grad_norm": 3.74881892639678e-05, |
|
"learning_rate": 0.00013347501277747955, |
|
"loss": 0.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 39.75, |
|
"grad_norm": 4.151304892729968e-05, |
|
"learning_rate": 0.00013272942212124705, |
|
"loss": 0.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 2.8103966542403214e-05, |
|
"learning_rate": 0.0001319817881135576, |
|
"loss": 0.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 1.3655937891599024e-06, |
|
"eval_runtime": 0.3183, |
|
"eval_samples_per_second": 113.09, |
|
"eval_steps_per_second": 15.707, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 40.25, |
|
"grad_norm": 2.1028572518844157e-05, |
|
"learning_rate": 0.0001312321574304275, |
|
"loss": 0.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 40.5, |
|
"grad_norm": 2.917735582741443e-05, |
|
"learning_rate": 0.00013048057687252865, |
|
"loss": 0.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 40.75, |
|
"grad_norm": 3.929531158064492e-05, |
|
"learning_rate": 0.00012972709336226697, |
|
"loss": 0.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"grad_norm": 2.542526817705948e-05, |
|
"learning_rate": 0.00012897175394085267, |
|
"loss": 0.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_loss": 1.3143367141310591e-06, |
|
"eval_runtime": 0.32, |
|
"eval_samples_per_second": 112.487, |
|
"eval_steps_per_second": 15.623, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 41.25, |
|
"grad_norm": 2.2972772057983093e-05, |
|
"learning_rate": 0.00012821460576536363, |
|
"loss": 0.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 41.5, |
|
"grad_norm": 2.710890294110868e-05, |
|
"learning_rate": 0.0001274556961058012, |
|
"loss": 0.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 41.75, |
|
"grad_norm": 7.863906648708507e-05, |
|
"learning_rate": 0.00012669507234213908, |
|
"loss": 0.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 2.5962377549149096e-05, |
|
"learning_rate": 0.00012593278196136525, |
|
"loss": 0.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_loss": 1.2861806908404105e-06, |
|
"eval_runtime": 0.3211, |
|
"eval_samples_per_second": 112.131, |
|
"eval_steps_per_second": 15.574, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 42.25, |
|
"grad_norm": 2.938141733466182e-05, |
|
"learning_rate": 0.00012516887255451735, |
|
"loss": 0.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 42.5, |
|
"grad_norm": 2.2876229195389897e-05, |
|
"learning_rate": 0.00012440339181371148, |
|
"loss": 0.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 42.75, |
|
"grad_norm": 2.188000871683471e-05, |
|
"learning_rate": 0.00012363638752916468, |
|
"loss": 0.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"grad_norm": 2.7062182198278606e-05, |
|
"learning_rate": 0.00012286790758621132, |
|
"loss": 0.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_loss": 1.24422297176352e-06, |
|
"eval_runtime": 0.3203, |
|
"eval_samples_per_second": 112.377, |
|
"eval_steps_per_second": 15.608, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 43.25, |
|
"grad_norm": 3.9851081965025514e-05, |
|
"learning_rate": 0.00012209799996231358, |
|
"loss": 0.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 43.5, |
|
"grad_norm": 3.9189981180243194e-05, |
|
"learning_rate": 0.00012132671272406604, |
|
"loss": 0.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 43.75, |
|
"grad_norm": 2.008090086746961e-05, |
|
"learning_rate": 0.00012055409402419494, |
|
"loss": 0.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 2.994649184984155e-05, |
|
"learning_rate": 0.00011978019209855174, |
|
"loss": 0.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_loss": 1.2121387271690764e-06, |
|
"eval_runtime": 0.3206, |
|
"eval_samples_per_second": 112.281, |
|
"eval_steps_per_second": 15.595, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 44.25, |
|
"grad_norm": 1.9228473320254125e-05, |
|
"learning_rate": 0.0001190050552631019, |
|
"loss": 0.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 44.5, |
|
"grad_norm": 2.6020699806394987e-05, |
|
"learning_rate": 0.00011822873191090833, |
|
"loss": 0.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 44.75, |
|
"grad_norm": 2.0412864614627324e-05, |
|
"learning_rate": 0.00011745127050910998, |
|
"loss": 0.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"grad_norm": 2.493833380867727e-05, |
|
"learning_rate": 0.00011667271959589623, |
|
"loss": 0.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_loss": 1.1790700682468014e-06, |
|
"eval_runtime": 0.3173, |
|
"eval_samples_per_second": 113.472, |
|
"eval_steps_per_second": 15.76, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 45.25, |
|
"grad_norm": 3.828733315458521e-05, |
|
"learning_rate": 0.00011589312777747644, |
|
"loss": 0.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 45.5, |
|
"grad_norm": 2.1567129806498997e-05, |
|
"learning_rate": 0.00011511254372504531, |
|
"loss": 0.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 45.75, |
|
"grad_norm": 1.842524579842575e-05, |
|
"learning_rate": 0.0001143310161717444, |
|
"loss": 0.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 2.736481292231474e-05, |
|
"learning_rate": 0.00011354859390961958, |
|
"loss": 0.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_loss": 1.1555836181287304e-06, |
|
"eval_runtime": 0.3177, |
|
"eval_samples_per_second": 113.308, |
|
"eval_steps_per_second": 15.737, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 46.25, |
|
"grad_norm": 3.4207103453809395e-05, |
|
"learning_rate": 0.0001127653257865748, |
|
"loss": 0.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 46.5, |
|
"grad_norm": 3.1199837394524366e-05, |
|
"learning_rate": 0.00011198126070332253, |
|
"loss": 0.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 46.75, |
|
"grad_norm": 1.3810436030325945e-05, |
|
"learning_rate": 0.00011119644761033078, |
|
"loss": 0.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"grad_norm": 2.9521519536501728e-05, |
|
"learning_rate": 0.00011041093550476707, |
|
"loss": 0.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_loss": 1.1195420484000351e-06, |
|
"eval_runtime": 0.3205, |
|
"eval_samples_per_second": 112.332, |
|
"eval_steps_per_second": 15.602, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 47.25, |
|
"grad_norm": 1.7040036254911683e-05, |
|
"learning_rate": 0.00010962477342743929, |
|
"loss": 0.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 47.5, |
|
"grad_norm": 2.9747276130365208e-05, |
|
"learning_rate": 0.00010883801045973425, |
|
"loss": 0.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 47.75, |
|
"grad_norm": 2.880042120523285e-05, |
|
"learning_rate": 0.00010805069572055334, |
|
"loss": 0.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 2.100724850606639e-05, |
|
"learning_rate": 0.00010726287836324582, |
|
"loss": 0.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_loss": 1.1032241218345007e-06, |
|
"eval_runtime": 0.3192, |
|
"eval_samples_per_second": 112.768, |
|
"eval_steps_per_second": 15.662, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 48.25, |
|
"grad_norm": 1.7086620573536493e-05, |
|
"learning_rate": 0.0001064746075725404, |
|
"loss": 0.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 48.5, |
|
"grad_norm": 2.3707199943601154e-05, |
|
"learning_rate": 0.00010568593256147421, |
|
"loss": 0.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 48.75, |
|
"grad_norm": 1.4947347153793089e-05, |
|
"learning_rate": 0.00010489690256832068, |
|
"loss": 0.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"grad_norm": 2.3327078451984562e-05, |
|
"learning_rate": 0.00010410756685351517, |
|
"loss": 0.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_loss": 1.0602713018670329e-06, |
|
"eval_runtime": 0.3334, |
|
"eval_samples_per_second": 107.979, |
|
"eval_steps_per_second": 14.997, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 49.25, |
|
"grad_norm": 1.931817314471118e-05, |
|
"learning_rate": 0.00010331797469657992, |
|
"loss": 0.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 49.5, |
|
"grad_norm": 2.6536048608249985e-05, |
|
"learning_rate": 0.00010252817539304718, |
|
"loss": 0.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 49.75, |
|
"grad_norm": 2.2126323528937064e-05, |
|
"learning_rate": 0.00010173821825138172, |
|
"loss": 0.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 2.2889309548190795e-05, |
|
"learning_rate": 0.00010094815258990241, |
|
"loss": 0.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_loss": 1.040821643982781e-06, |
|
"eval_runtime": 0.3203, |
|
"eval_samples_per_second": 112.396, |
|
"eval_steps_per_second": 15.611, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 50.25, |
|
"grad_norm": 2.8334068701951765e-05, |
|
"learning_rate": 0.00010015802773370311, |
|
"loss": 0.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 50.5, |
|
"grad_norm": 1.9157972928951494e-05, |
|
"learning_rate": 9.936789301157347e-05, |
|
"loss": 0.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 50.75, |
|
"grad_norm": 2.7853264327859506e-05, |
|
"learning_rate": 9.857779775291898e-05, |
|
"loss": 0.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"grad_norm": 2.194027547375299e-05, |
|
"learning_rate": 9.778779128468132e-05, |
|
"loss": 0.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_loss": 1.013436872199236e-06, |
|
"eval_runtime": 0.3177, |
|
"eval_samples_per_second": 113.312, |
|
"eval_steps_per_second": 15.738, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 51.25, |
|
"grad_norm": 1.2561698895297013e-05, |
|
"learning_rate": 9.699792292825892e-05, |
|
"loss": 0.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 51.5, |
|
"grad_norm": 2.041015432041604e-05, |
|
"learning_rate": 9.620824199642764e-05, |
|
"loss": 0.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 51.75, |
|
"grad_norm": 3.463058601482771e-05, |
|
"learning_rate": 9.541879779026209e-05, |
|
"loss": 0.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"grad_norm": 1.9060191334574483e-05, |
|
"learning_rate": 9.462963959605778e-05, |
|
"loss": 0.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_loss": 1.0033103308160207e-06, |
|
"eval_runtime": 0.3157, |
|
"eval_samples_per_second": 114.025, |
|
"eval_steps_per_second": 15.837, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 52.25, |
|
"grad_norm": 1.4129647752270103e-05, |
|
"learning_rate": 9.384081668225387e-05, |
|
"loss": 0.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 52.5, |
|
"grad_norm": 2.1596322767436504e-05, |
|
"learning_rate": 9.30523782963576e-05, |
|
"loss": 0.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 52.75, |
|
"grad_norm": 1.7303984350292012e-05, |
|
"learning_rate": 9.226437366186941e-05, |
|
"loss": 0.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"grad_norm": 2.7551333914743736e-05, |
|
"learning_rate": 9.147685197520995e-05, |
|
"loss": 0.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_loss": 9.675704859546386e-07, |
|
"eval_runtime": 0.3184, |
|
"eval_samples_per_second": 113.083, |
|
"eval_steps_per_second": 15.706, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 53.25, |
|
"grad_norm": 2.0771505660377443e-05, |
|
"learning_rate": 9.06898624026486e-05, |
|
"loss": 0.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 53.5, |
|
"grad_norm": 2.2202431864570826e-05, |
|
"learning_rate": 8.990345407723402e-05, |
|
"loss": 0.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 53.75, |
|
"grad_norm": 1.3855403267371003e-05, |
|
"learning_rate": 8.91176760957267e-05, |
|
"loss": 0.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"grad_norm": 2.2561982405022718e-05, |
|
"learning_rate": 8.833257751553365e-05, |
|
"loss": 0.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_loss": 9.524069923827483e-07, |
|
"eval_runtime": 0.3172, |
|
"eval_samples_per_second": 113.496, |
|
"eval_steps_per_second": 15.763, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 54.25, |
|
"grad_norm": 1.5506595445913263e-05, |
|
"learning_rate": 8.754820735164576e-05, |
|
"loss": 0.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 54.5, |
|
"grad_norm": 2.101029167533852e-05, |
|
"learning_rate": 8.676461457357776e-05, |
|
"loss": 0.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 54.75, |
|
"grad_norm": 1.7293437849730253e-05, |
|
"learning_rate": 8.598184810231088e-05, |
|
"loss": 0.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"grad_norm": 2.4345905330847017e-05, |
|
"learning_rate": 8.519995680723854e-05, |
|
"loss": 0.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_loss": 9.304160357714863e-07, |
|
"eval_runtime": 0.3151, |
|
"eval_samples_per_second": 114.245, |
|
"eval_steps_per_second": 15.867, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 55.25, |
|
"grad_norm": 3.5958666558144614e-05, |
|
"learning_rate": 8.44189895031157e-05, |
|
"loss": 0.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 55.5, |
|
"grad_norm": 2.3594711819896474e-05, |
|
"learning_rate": 8.363899494701086e-05, |
|
"loss": 0.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 55.75, |
|
"grad_norm": 1.3870093425794039e-05, |
|
"learning_rate": 8.286002183526237e-05, |
|
"loss": 0.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"grad_norm": 2.6735531719168648e-05, |
|
"learning_rate": 8.208211880043812e-05, |
|
"loss": 0.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_loss": 9.174784736387664e-07, |
|
"eval_runtime": 0.3129, |
|
"eval_samples_per_second": 115.04, |
|
"eval_steps_per_second": 15.978, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 56.25, |
|
"grad_norm": 2.9232525776023977e-05, |
|
"learning_rate": 8.130533440829928e-05, |
|
"loss": 0.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 56.5, |
|
"grad_norm": 2.4526891138521023e-05, |
|
"learning_rate": 8.052971715476842e-05, |
|
"loss": 0.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 56.75, |
|
"grad_norm": 2.6106521545443684e-05, |
|
"learning_rate": 7.975531546290166e-05, |
|
"loss": 0.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"grad_norm": 1.784413143468555e-05, |
|
"learning_rate": 7.898217767986562e-05, |
|
"loss": 0.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_loss": 9.079113851839793e-07, |
|
"eval_runtime": 0.3236, |
|
"eval_samples_per_second": 111.239, |
|
"eval_steps_per_second": 15.45, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 57.25, |
|
"grad_norm": 1.9261695342720486e-05, |
|
"learning_rate": 7.821035207391912e-05, |
|
"loss": 0.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 57.5, |
|
"grad_norm": 3.491761162877083e-05, |
|
"learning_rate": 7.743988683139943e-05, |
|
"loss": 0.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 57.75, |
|
"grad_norm": 1.3563810171035584e-05, |
|
"learning_rate": 7.66708300537143e-05, |
|
"loss": 0.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"grad_norm": 1.2282480383873917e-05, |
|
"learning_rate": 7.590322975433857e-05, |
|
"loss": 0.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_loss": 8.861284754857479e-07, |
|
"eval_runtime": 0.3181, |
|
"eval_samples_per_second": 113.172, |
|
"eval_steps_per_second": 15.718, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 58.25, |
|
"grad_norm": 2.858146035578102e-05, |
|
"learning_rate": 7.51371338558168e-05, |
|
"loss": 0.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 58.5, |
|
"grad_norm": 2.0420882719918154e-05, |
|
"learning_rate": 7.437259018677136e-05, |
|
"loss": 0.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 58.75, |
|
"grad_norm": 9.892805792333093e-06, |
|
"learning_rate": 7.360964647891637e-05, |
|
"loss": 0.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"grad_norm": 2.6135967345908284e-05, |
|
"learning_rate": 7.284835036407776e-05, |
|
"loss": 0.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_loss": 8.719437687432219e-07, |
|
"eval_runtime": 0.3182, |
|
"eval_samples_per_second": 113.153, |
|
"eval_steps_per_second": 15.716, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 59.25, |
|
"grad_norm": 3.855082468362525e-05, |
|
"learning_rate": 7.208874937121946e-05, |
|
"loss": 0.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 59.5, |
|
"grad_norm": 2.4621716875117272e-05, |
|
"learning_rate": 7.133089092347627e-05, |
|
"loss": 0.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 59.75, |
|
"grad_norm": 1.3933644368080422e-05, |
|
"learning_rate": 7.057482233519302e-05, |
|
"loss": 0.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 1.3702153410122264e-05, |
|
"learning_rate": 6.982059080897059e-05, |
|
"loss": 0.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_loss": 8.514528531122778e-07, |
|
"eval_runtime": 0.317, |
|
"eval_samples_per_second": 113.548, |
|
"eval_steps_per_second": 15.771, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 60.25, |
|
"grad_norm": 1.285933922190452e-05, |
|
"learning_rate": 6.906824343271916e-05, |
|
"loss": 0.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 60.5, |
|
"grad_norm": 1.753455217112787e-05, |
|
"learning_rate": 6.831782717671828e-05, |
|
"loss": 0.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 60.75, |
|
"grad_norm": 1.9983261154266074e-05, |
|
"learning_rate": 6.756938889068454e-05, |
|
"loss": 0.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"grad_norm": 1.9891913325409405e-05, |
|
"learning_rate": 6.682297530084664e-05, |
|
"loss": 0.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_loss": 8.335572942996805e-07, |
|
"eval_runtime": 0.3281, |
|
"eval_samples_per_second": 109.721, |
|
"eval_steps_per_second": 15.239, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 61.25, |
|
"grad_norm": 1.8422002540319227e-05, |
|
"learning_rate": 6.607863300702807e-05, |
|
"loss": 0.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 61.5, |
|
"grad_norm": 1.9453251297818497e-05, |
|
"learning_rate": 6.533640847973808e-05, |
|
"loss": 0.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 61.75, |
|
"grad_norm": 1.48242861541803e-05, |
|
"learning_rate": 6.459634805727011e-05, |
|
"loss": 0.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"grad_norm": 1.9470420738798566e-05, |
|
"learning_rate": 6.385849794280915e-05, |
|
"loss": 0.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_loss": 8.260683443950256e-07, |
|
"eval_runtime": 0.3297, |
|
"eval_samples_per_second": 109.182, |
|
"eval_steps_per_second": 15.164, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 62.25, |
|
"grad_norm": 2.976124051201623e-05, |
|
"learning_rate": 6.312290420154694e-05, |
|
"loss": 0.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 62.5, |
|
"grad_norm": 4.272747173672542e-05, |
|
"learning_rate": 6.238961275780613e-05, |
|
"loss": 0.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 62.75, |
|
"grad_norm": 1.2389010407787282e-05, |
|
"learning_rate": 6.165866939217328e-05, |
|
"loss": 0.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"grad_norm": 1.4621130503655877e-05, |
|
"learning_rate": 6.0930119738640445e-05, |
|
"loss": 0.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_loss": 8.148024335241644e-07, |
|
"eval_runtime": 0.3292, |
|
"eval_samples_per_second": 109.354, |
|
"eval_steps_per_second": 15.188, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 63.25, |
|
"grad_norm": 1.0234934961772524e-05, |
|
"learning_rate": 6.020400928175637e-05, |
|
"loss": 0.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 63.5, |
|
"grad_norm": 1.937254455697257e-05, |
|
"learning_rate": 5.948038335378683e-05, |
|
"loss": 0.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 63.75, |
|
"grad_norm": 1.764351145538967e-05, |
|
"learning_rate": 5.8759287131884246e-05, |
|
"loss": 0.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"grad_norm": 2.3509826860390604e-05, |
|
"learning_rate": 5.804076563526744e-05, |
|
"loss": 0.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_loss": 8.072562422967167e-07, |
|
"eval_runtime": 0.3217, |
|
"eval_samples_per_second": 111.904, |
|
"eval_steps_per_second": 15.542, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 64.25, |
|
"grad_norm": 1.288153634959599e-05, |
|
"learning_rate": 5.732486372241088e-05, |
|
"loss": 0.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 64.5, |
|
"grad_norm": 1.7124617443187162e-05, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 0.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 64.75, |
|
"grad_norm": 3.4207390854135156e-05, |
|
"learning_rate": 5.5901097261361636e-05, |
|
"loss": 0.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"grad_norm": 1.607052945473697e-05, |
|
"learning_rate": 5.5193321601242156e-05, |
|
"loss": 0.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_loss": 7.960065886436496e-07, |
|
"eval_runtime": 0.3236, |
|
"eval_samples_per_second": 111.263, |
|
"eval_steps_per_second": 15.453, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 65.25, |
|
"grad_norm": 2.7799209419754334e-05, |
|
"learning_rate": 5.448834329548016e-05, |
|
"loss": 0.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 65.5, |
|
"grad_norm": 1.6963076632237062e-05, |
|
"learning_rate": 5.378620635702643e-05, |
|
"loss": 0.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 65.75, |
|
"grad_norm": 1.7011914678732865e-05, |
|
"learning_rate": 5.308695462144068e-05, |
|
"loss": 0.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"grad_norm": 1.719038118608296e-05, |
|
"learning_rate": 5.239063174415466e-05, |
|
"loss": 0.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_loss": 7.857981927372748e-07, |
|
"eval_runtime": 0.3159, |
|
"eval_samples_per_second": 113.963, |
|
"eval_steps_per_second": 15.828, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 66.25, |
|
"grad_norm": 1.87909827218391e-05, |
|
"learning_rate": 5.1697281197746596e-05, |
|
"loss": 0.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 66.5, |
|
"grad_norm": 1.997711297008209e-05, |
|
"learning_rate": 5.1006946269227376e-05, |
|
"loss": 0.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 66.75, |
|
"grad_norm": 2.0850015062023886e-05, |
|
"learning_rate": 5.03196700573378e-05, |
|
"loss": 0.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"grad_norm": 2.2285566956270486e-05, |
|
"learning_rate": 4.963549546985799e-05, |
|
"loss": 0.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_loss": 7.721130259596976e-07, |
|
"eval_runtime": 0.3244, |
|
"eval_samples_per_second": 110.965, |
|
"eval_steps_per_second": 15.412, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 67.25, |
|
"grad_norm": 1.6444948414573446e-05, |
|
"learning_rate": 4.895446522092868e-05, |
|
"loss": 0.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 67.5, |
|
"grad_norm": 1.5268993593053892e-05, |
|
"learning_rate": 4.8276621828384225e-05, |
|
"loss": 0.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 67.75, |
|
"grad_norm": 1.7810820281738415e-05, |
|
"learning_rate": 4.760200761109852e-05, |
|
"loss": 0.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"grad_norm": 1.7248778021894395e-05, |
|
"learning_rate": 4.6930664686342526e-05, |
|
"loss": 0.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_loss": 7.603679819112585e-07, |
|
"eval_runtime": 0.3117, |
|
"eval_samples_per_second": 115.513, |
|
"eval_steps_per_second": 16.044, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 68.25, |
|
"grad_norm": 2.448088525852654e-05, |
|
"learning_rate": 4.626263496715525e-05, |
|
"loss": 0.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 68.5, |
|
"grad_norm": 1.745475674397312e-05, |
|
"learning_rate": 4.559796015972677e-05, |
|
"loss": 0.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 68.75, |
|
"grad_norm": 1.6836595023050904e-05, |
|
"learning_rate": 4.49366817607945e-05, |
|
"loss": 0.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"grad_norm": 2.0379737179609947e-05, |
|
"learning_rate": 4.427884105505251e-05, |
|
"loss": 0.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_loss": 7.604816119055613e-07, |
|
"eval_runtime": 0.3177, |
|
"eval_samples_per_second": 113.329, |
|
"eval_steps_per_second": 15.74, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 69.25, |
|
"grad_norm": 2.278652391396463e-05, |
|
"learning_rate": 4.362447911257406e-05, |
|
"loss": 0.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 69.5, |
|
"grad_norm": 1.2965742826054338e-05, |
|
"learning_rate": 4.297363678624753e-05, |
|
"loss": 0.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 69.75, |
|
"grad_norm": 1.8777451259666122e-05, |
|
"learning_rate": 4.2326354709225955e-05, |
|
"loss": 0.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"grad_norm": 2.3537781089544296e-05, |
|
"learning_rate": 4.168267329239002e-05, |
|
"loss": 0.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_loss": 7.471541039194562e-07, |
|
"eval_runtime": 0.3194, |
|
"eval_samples_per_second": 112.703, |
|
"eval_steps_per_second": 15.653, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 70.25, |
|
"grad_norm": 1.4215344890544657e-05, |
|
"learning_rate": 4.104263272182546e-05, |
|
"loss": 0.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 70.5, |
|
"grad_norm": 1.8491147784516215e-05, |
|
"learning_rate": 4.0406272956313895e-05, |
|
"loss": 0.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 70.75, |
|
"grad_norm": 1.7631069567869417e-05, |
|
"learning_rate": 3.9773633724838265e-05, |
|
"loss": 0.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"grad_norm": 1.9227232769480906e-05, |
|
"learning_rate": 3.914475452410257e-05, |
|
"loss": 0.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_loss": 7.375128916464746e-07, |
|
"eval_runtime": 0.321, |
|
"eval_samples_per_second": 112.152, |
|
"eval_steps_per_second": 15.577, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 71.25, |
|
"grad_norm": 1.6681302440701984e-05, |
|
"learning_rate": 3.8519674616065784e-05, |
|
"loss": 0.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 71.5, |
|
"grad_norm": 1.8769558664644137e-05, |
|
"learning_rate": 3.789843302549096e-05, |
|
"loss": 0.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 71.75, |
|
"grad_norm": 8.559236448490992e-06, |
|
"learning_rate": 3.7281068537508565e-05, |
|
"loss": 0.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"grad_norm": 1.4404205103346612e-05, |
|
"learning_rate": 3.6667619695195285e-05, |
|
"loss": 0.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_loss": 7.320029453694588e-07, |
|
"eval_runtime": 0.3152, |
|
"eval_samples_per_second": 114.223, |
|
"eval_steps_per_second": 15.864, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 72.25, |
|
"grad_norm": 1.8397522580926307e-05, |
|
"learning_rate": 3.605812479716767e-05, |
|
"loss": 0.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 72.5, |
|
"grad_norm": 1.5880750652286224e-05, |
|
"learning_rate": 3.545262189519092e-05, |
|
"loss": 0.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 72.75, |
|
"grad_norm": 1.8930764781543985e-05, |
|
"learning_rate": 3.4851148791803465e-05, |
|
"loss": 0.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"grad_norm": 4.1914405301213264e-05, |
|
"learning_rate": 3.425374303795675e-05, |
|
"loss": 0.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_loss": 7.22367474281782e-07, |
|
"eval_runtime": 0.319, |
|
"eval_samples_per_second": 112.859, |
|
"eval_steps_per_second": 15.675, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 73.25, |
|
"grad_norm": 1.0584836672933307e-05, |
|
"learning_rate": 3.3660441930671006e-05, |
|
"loss": 0.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 73.5, |
|
"grad_norm": 1.819963290472515e-05, |
|
"learning_rate": 3.3071282510706624e-05, |
|
"loss": 0.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 73.75, |
|
"grad_norm": 1.8003340301220305e-05, |
|
"learning_rate": 3.248630156025158e-05, |
|
"loss": 0.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"grad_norm": 1.5387213352369145e-05, |
|
"learning_rate": 3.1905535600625314e-05, |
|
"loss": 0.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_loss": 7.147688734221447e-07, |
|
"eval_runtime": 0.3171, |
|
"eval_samples_per_second": 113.526, |
|
"eval_steps_per_second": 15.767, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 74.25, |
|
"grad_norm": 2.1973037291900255e-05, |
|
"learning_rate": 3.1329020889998306e-05, |
|
"loss": 0.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 74.5, |
|
"grad_norm": 1.8727620044955984e-05, |
|
"learning_rate": 3.075679342112874e-05, |
|
"loss": 0.0, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 74.75, |
|
"grad_norm": 1.0095293873746414e-05, |
|
"learning_rate": 3.01888889191152e-05, |
|
"loss": 0.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"grad_norm": 1.2027586308249738e-05, |
|
"learning_rate": 2.9625342839166316e-05, |
|
"loss": 0.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_loss": 7.11524990038015e-07, |
|
"eval_runtime": 0.3322, |
|
"eval_samples_per_second": 108.367, |
|
"eval_steps_per_second": 15.051, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 75.25, |
|
"grad_norm": 2.197036155848764e-05, |
|
"learning_rate": 2.9066190364387437e-05, |
|
"loss": 0.0, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 75.5, |
|
"grad_norm": 1.3477620086632669e-05, |
|
"learning_rate": 2.8511466403583766e-05, |
|
"loss": 0.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 75.75, |
|
"grad_norm": 1.1739802175725345e-05, |
|
"learning_rate": 2.796120558908124e-05, |
|
"loss": 0.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"grad_norm": 3.1627434509573504e-05, |
|
"learning_rate": 2.7415442274564273e-05, |
|
"loss": 0.0, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_loss": 7.128418815227633e-07, |
|
"eval_runtime": 0.315, |
|
"eval_samples_per_second": 114.285, |
|
"eval_steps_per_second": 15.873, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 76.25, |
|
"grad_norm": 9.673092790762894e-06, |
|
"learning_rate": 2.6874210532930855e-05, |
|
"loss": 0.0, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 76.5, |
|
"grad_norm": 1.989353768294677e-05, |
|
"learning_rate": 2.6337544154165604e-05, |
|
"loss": 0.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 76.75, |
|
"grad_norm": 1.5490039004362188e-05, |
|
"learning_rate": 2.5805476643229952e-05, |
|
"loss": 0.0, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"grad_norm": 1.1932146662729792e-05, |
|
"learning_rate": 2.527804121797048e-05, |
|
"loss": 0.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_loss": 7.000676305324305e-07, |
|
"eval_runtime": 0.3245, |
|
"eval_samples_per_second": 110.942, |
|
"eval_steps_per_second": 15.409, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 77.25, |
|
"grad_norm": 1.2189483641122933e-05, |
|
"learning_rate": 2.4755270807045174e-05, |
|
"loss": 0.0, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 77.5, |
|
"grad_norm": 2.792781378957443e-05, |
|
"learning_rate": 2.423719804786737e-05, |
|
"loss": 0.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 77.75, |
|
"grad_norm": 1.3213076272222679e-05, |
|
"learning_rate": 2.3723855284568462e-05, |
|
"loss": 0.0, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"grad_norm": 2.2985013856668957e-05, |
|
"learning_rate": 2.321527456597833e-05, |
|
"loss": 0.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_loss": 6.937642069715366e-07, |
|
"eval_runtime": 0.323, |
|
"eval_samples_per_second": 111.463, |
|
"eval_steps_per_second": 15.481, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 78.25, |
|
"grad_norm": 1.1034126146114431e-05, |
|
"learning_rate": 2.2711487643624675e-05, |
|
"loss": 0.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 78.5, |
|
"grad_norm": 1.3156452041584998e-05, |
|
"learning_rate": 2.2212525969750643e-05, |
|
"loss": 0.0, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 78.75, |
|
"grad_norm": 1.0150353773497045e-05, |
|
"learning_rate": 2.171842069535116e-05, |
|
"loss": 0.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"grad_norm": 3.457269485807046e-05, |
|
"learning_rate": 2.1229202668228197e-05, |
|
"loss": 0.0, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"eval_loss": 6.983178195696382e-07, |
|
"eval_runtime": 0.3211, |
|
"eval_samples_per_second": 112.129, |
|
"eval_steps_per_second": 15.573, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 79.25, |
|
"grad_norm": 1.4804916645516641e-05, |
|
"learning_rate": 2.074490243106485e-05, |
|
"loss": 0.0, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 79.5, |
|
"grad_norm": 1.8004166122409515e-05, |
|
"learning_rate": 2.026555021951858e-05, |
|
"loss": 0.0, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 79.75, |
|
"grad_norm": 2.1705473045585677e-05, |
|
"learning_rate": 1.9791175960333487e-05, |
|
"loss": 0.0, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 1.0873730388993863e-05, |
|
"learning_rate": 1.932180926947189e-05, |
|
"loss": 0.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_loss": 6.858597316750092e-07, |
|
"eval_runtime": 0.3385, |
|
"eval_samples_per_second": 106.338, |
|
"eval_steps_per_second": 14.769, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 80.25, |
|
"grad_norm": 1.706531475065276e-05, |
|
"learning_rate": 1.8857479450265503e-05, |
|
"loss": 0.0, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 80.5, |
|
"grad_norm": 2.120017961715348e-05, |
|
"learning_rate": 1.839821549158579e-05, |
|
"loss": 0.0, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 80.75, |
|
"grad_norm": 1.3771560588793363e-05, |
|
"learning_rate": 1.794404606603434e-05, |
|
"loss": 0.0, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"grad_norm": 1.798778430384118e-05, |
|
"learning_rate": 1.74949995281526e-05, |
|
"loss": 0.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"eval_loss": 6.865074624329282e-07, |
|
"eval_runtime": 0.3201, |
|
"eval_samples_per_second": 112.473, |
|
"eval_steps_per_second": 15.621, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 81.25, |
|
"grad_norm": 1.246057126991218e-05, |
|
"learning_rate": 1.705110391265179e-05, |
|
"loss": 0.0, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 81.5, |
|
"grad_norm": 2.145354483218398e-05, |
|
"learning_rate": 1.6612386932662627e-05, |
|
"loss": 0.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 81.75, |
|
"grad_norm": 9.91187789622927e-06, |
|
"learning_rate": 1.6178875978005058e-05, |
|
"loss": 0.0, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"grad_norm": 2.266502815473359e-05, |
|
"learning_rate": 1.57505981134784e-05, |
|
"loss": 0.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_loss": 6.804688723605068e-07, |
|
"eval_runtime": 0.3188, |
|
"eval_samples_per_second": 112.926, |
|
"eval_steps_per_second": 15.684, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 82.25, |
|
"grad_norm": 1.2877572771685664e-05, |
|
"learning_rate": 1.5327580077171587e-05, |
|
"loss": 0.0, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 82.5, |
|
"grad_norm": 2.6757013984024525e-05, |
|
"learning_rate": 1.4909848278793782e-05, |
|
"loss": 0.0, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 82.75, |
|
"grad_norm": 1.6225705621764064e-05, |
|
"learning_rate": 1.4497428798025736e-05, |
|
"loss": 0.0, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"grad_norm": 1.1286027074675076e-05, |
|
"learning_rate": 1.4090347382891455e-05, |
|
"loss": 0.0, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"eval_loss": 6.749939984729281e-07, |
|
"eval_runtime": 0.3162, |
|
"eval_samples_per_second": 113.859, |
|
"eval_steps_per_second": 15.814, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 83.25, |
|
"grad_norm": 1.8980854292749427e-05, |
|
"learning_rate": 1.3688629448150747e-05, |
|
"loss": 0.0, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 83.5, |
|
"grad_norm": 1.4416699741559569e-05, |
|
"learning_rate": 1.3292300073712615e-05, |
|
"loss": 0.0, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 83.75, |
|
"grad_norm": 2.0767629393958487e-05, |
|
"learning_rate": 1.2901384003069328e-05, |
|
"loss": 0.0, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"grad_norm": 1.8946042473544367e-05, |
|
"learning_rate": 1.2515905641751824e-05, |
|
"loss": 0.0, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_loss": 6.791258897465013e-07, |
|
"eval_runtime": 0.3165, |
|
"eval_samples_per_second": 113.733, |
|
"eval_steps_per_second": 15.796, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 84.25, |
|
"grad_norm": 2.060659790004138e-05, |
|
"learning_rate": 1.2135889055805837e-05, |
|
"loss": 0.0, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 84.5, |
|
"grad_norm": 2.793761814245954e-05, |
|
"learning_rate": 1.1761357970289588e-05, |
|
"loss": 0.0, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 84.75, |
|
"grad_norm": 1.5529620213783346e-05, |
|
"learning_rate": 1.1392335767792505e-05, |
|
"loss": 0.0, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"grad_norm": 1.4130602721706964e-05, |
|
"learning_rate": 1.1028845486975403e-05, |
|
"loss": 0.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"eval_loss": 6.698858783238393e-07, |
|
"eval_runtime": 0.3198, |
|
"eval_samples_per_second": 112.588, |
|
"eval_steps_per_second": 15.637, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 85.25, |
|
"grad_norm": 1.6358992070308886e-05, |
|
"learning_rate": 1.0670909821132136e-05, |
|
"loss": 0.0, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 85.5, |
|
"grad_norm": 3.4115200833184645e-05, |
|
"learning_rate": 1.0318551116772923e-05, |
|
"loss": 0.0, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 85.75, |
|
"grad_norm": 1.7895346900331788e-05, |
|
"learning_rate": 9.971791372229044e-06, |
|
"loss": 0.0, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"grad_norm": 1.2962746950506698e-05, |
|
"learning_rate": 9.630652236279625e-06, |
|
"loss": 0.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_loss": 6.754976880074537e-07, |
|
"eval_runtime": 0.3233, |
|
"eval_samples_per_second": 111.361, |
|
"eval_steps_per_second": 15.467, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 86.25, |
|
"grad_norm": 1.5895795513642952e-05, |
|
"learning_rate": 9.295155006799917e-06, |
|
"loss": 0.0, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 86.5, |
|
"grad_norm": 1.6078374756034464e-05, |
|
"learning_rate": 8.96532062943175e-06, |
|
"loss": 0.0, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 86.75, |
|
"grad_norm": 1.0541101801209152e-05, |
|
"learning_rate": 8.641169696275831e-06, |
|
"loss": 0.0, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"grad_norm": 1.940759102581069e-05, |
|
"learning_rate": 8.322722444606079e-06, |
|
"loss": 0.0, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"eval_loss": 6.688477469651843e-07, |
|
"eval_runtime": 0.3141, |
|
"eval_samples_per_second": 114.612, |
|
"eval_steps_per_second": 15.918, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 87.25, |
|
"grad_norm": 1.1041982361348346e-05, |
|
"learning_rate": 8.009998755606263e-06, |
|
"loss": 0.0, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 87.5, |
|
"grad_norm": 2.7860867703566328e-05, |
|
"learning_rate": 7.703018153128739e-06, |
|
"loss": 0.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 87.75, |
|
"grad_norm": 8.007168617041316e-06, |
|
"learning_rate": 7.401799802475573e-06, |
|
"loss": 0.0, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"grad_norm": 1.64666762429988e-05, |
|
"learning_rate": 7.106362509202036e-06, |
|
"loss": 0.0, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_loss": 6.721416525579116e-07, |
|
"eval_runtime": 0.3256, |
|
"eval_samples_per_second": 110.567, |
|
"eval_steps_per_second": 15.357, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 88.25, |
|
"grad_norm": 1.7286309230257757e-05, |
|
"learning_rate": 6.816724717942435e-06, |
|
"loss": 0.0, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 88.5, |
|
"grad_norm": 2.7998203222523443e-05, |
|
"learning_rate": 6.532904511258753e-06, |
|
"loss": 0.0, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 88.75, |
|
"grad_norm": 1.3463857612805441e-05, |
|
"learning_rate": 6.254919608511544e-06, |
|
"loss": 0.0, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"grad_norm": 1.6592677638982423e-05, |
|
"learning_rate": 5.982787364753872e-06, |
|
"loss": 0.0, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"eval_loss": 6.658329425590637e-07, |
|
"eval_runtime": 0.3184, |
|
"eval_samples_per_second": 113.061, |
|
"eval_steps_per_second": 15.703, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 89.25, |
|
"grad_norm": 2.4364608179894276e-05, |
|
"learning_rate": 5.716524769647646e-06, |
|
"loss": 0.0, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 89.5, |
|
"grad_norm": 1.557578070787713e-05, |
|
"learning_rate": 5.456148446402976e-06, |
|
"loss": 0.0, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 89.75, |
|
"grad_norm": 9.06794684851775e-06, |
|
"learning_rate": 5.2016746507404295e-06, |
|
"loss": 0.0, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"grad_norm": 2.0602865333785303e-05, |
|
"learning_rate": 4.953119269876061e-06, |
|
"loss": 0.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_loss": 6.674051178379159e-07, |
|
"eval_runtime": 0.3268, |
|
"eval_samples_per_second": 110.152, |
|
"eval_steps_per_second": 15.299, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 90.25, |
|
"grad_norm": 2.491854138497729e-05, |
|
"learning_rate": 4.710497821529625e-06, |
|
"loss": 0.0, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 90.5, |
|
"grad_norm": 1.2203651749587152e-05, |
|
"learning_rate": 4.473825452955716e-06, |
|
"loss": 0.0, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 90.75, |
|
"grad_norm": 2.5209032173734158e-05, |
|
"learning_rate": 4.2431169399981485e-06, |
|
"loss": 0.0, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"grad_norm": 1.514551604486769e-05, |
|
"learning_rate": 4.018386686167452e-06, |
|
"loss": 0.0, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"eval_loss": 6.590207135559467e-07, |
|
"eval_runtime": 0.3159, |
|
"eval_samples_per_second": 113.952, |
|
"eval_steps_per_second": 15.827, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 91.25, |
|
"grad_norm": 9.24188134376891e-06, |
|
"learning_rate": 3.7996487217416223e-06, |
|
"loss": 0.0, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 91.5, |
|
"grad_norm": 1.9695198716362938e-05, |
|
"learning_rate": 3.5869167028902195e-06, |
|
"loss": 0.0, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 91.75, |
|
"grad_norm": 8.883437658369076e-06, |
|
"learning_rate": 3.380203910821833e-06, |
|
"loss": 0.0, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"grad_norm": 3.180091880494729e-05, |
|
"learning_rate": 3.1795232509547633e-06, |
|
"loss": 0.0, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_loss": 6.601708264497574e-07, |
|
"eval_runtime": 0.3151, |
|
"eval_samples_per_second": 114.24, |
|
"eval_steps_per_second": 15.867, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 92.25, |
|
"grad_norm": 2.3148995751398616e-05, |
|
"learning_rate": 2.98488725211149e-06, |
|
"loss": 0.0, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 92.5, |
|
"grad_norm": 1.9166613128618337e-05, |
|
"learning_rate": 2.796308065736364e-06, |
|
"loss": 0.0, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 92.75, |
|
"grad_norm": 1.4863600881653838e-05, |
|
"learning_rate": 2.6137974651370134e-06, |
|
"loss": 0.0, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"grad_norm": 1.7678094081929885e-05, |
|
"learning_rate": 2.4373668447493224e-06, |
|
"loss": 0.0, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"eval_loss": 6.622615842388768e-07, |
|
"eval_runtime": 0.3193, |
|
"eval_samples_per_second": 112.738, |
|
"eval_steps_per_second": 15.658, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 93.25, |
|
"grad_norm": 2.3845455871196464e-05, |
|
"learning_rate": 2.2670272194260324e-06, |
|
"loss": 0.0, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 93.5, |
|
"grad_norm": 1.4557038412021939e-05, |
|
"learning_rate": 2.102789223749102e-06, |
|
"loss": 0.0, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 93.75, |
|
"grad_norm": 2.4488541384926066e-05, |
|
"learning_rate": 1.9446631113657187e-06, |
|
"loss": 0.0, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"grad_norm": 1.9359116777195595e-05, |
|
"learning_rate": 1.7926587543482088e-06, |
|
"loss": 0.0, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_loss": 6.639800744778768e-07, |
|
"eval_runtime": 0.3201, |
|
"eval_samples_per_second": 112.453, |
|
"eval_steps_per_second": 15.618, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 94.25, |
|
"grad_norm": 1.9722852812265046e-05, |
|
"learning_rate": 1.6467856425776863e-06, |
|
"loss": 0.0, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 94.5, |
|
"grad_norm": 1.831287045206409e-05, |
|
"learning_rate": 1.5070528831515384e-06, |
|
"loss": 0.0, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 94.75, |
|
"grad_norm": 2.3000593500910327e-05, |
|
"learning_rate": 1.3734691998149474e-06, |
|
"loss": 0.0, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"grad_norm": 1.1854316653625574e-05, |
|
"learning_rate": 1.246042932416136e-06, |
|
"loss": 0.0, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"eval_loss": 6.561905934177048e-07, |
|
"eval_runtime": 0.318, |
|
"eval_samples_per_second": 113.209, |
|
"eval_steps_per_second": 15.724, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 95.25, |
|
"grad_norm": 1.4117299542704131e-05, |
|
"learning_rate": 1.1247820363858075e-06, |
|
"loss": 0.0, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 95.5, |
|
"grad_norm": 1.986858478630893e-05, |
|
"learning_rate": 1.00969408224042e-06, |
|
"loss": 0.0, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 95.75, |
|
"grad_norm": 2.425446109555196e-05, |
|
"learning_rate": 9.007862551095314e-07, |
|
"loss": 0.0, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"grad_norm": 1.7652260794420727e-05, |
|
"learning_rate": 7.980653542872584e-07, |
|
"loss": 0.0, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_loss": 6.501233542621776e-07, |
|
"eval_runtime": 0.3244, |
|
"eval_samples_per_second": 110.979, |
|
"eval_steps_per_second": 15.414, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 96.25, |
|
"grad_norm": 1.0838626621989533e-05, |
|
"learning_rate": 7.015377928077827e-07, |
|
"loss": 0.0, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 96.5, |
|
"grad_norm": 1.3126472367730457e-05, |
|
"learning_rate": 6.11209597044926e-07, |
|
"loss": 0.0, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 96.75, |
|
"grad_norm": 2.100517667713575e-05, |
|
"learning_rate": 5.27086406335997e-07, |
|
"loss": 0.0, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"grad_norm": 1.3467181815940421e-05, |
|
"learning_rate": 4.4917347262962705e-07, |
|
"loss": 0.0, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"eval_loss": 6.613539653699263e-07, |
|
"eval_runtime": 0.3133, |
|
"eval_samples_per_second": 114.905, |
|
"eval_steps_per_second": 15.959, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 97.25, |
|
"grad_norm": 2.3665135813644156e-05, |
|
"learning_rate": 3.774756601579443e-07, |
|
"loss": 0.0, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 97.5, |
|
"grad_norm": 1.761297244229354e-05, |
|
"learning_rate": 3.119974451328833e-07, |
|
"loss": 0.0, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 97.75, |
|
"grad_norm": 2.0256773495930247e-05, |
|
"learning_rate": 2.5274291546669717e-07, |
|
"loss": 0.0, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"grad_norm": 1.0930380994977895e-05, |
|
"learning_rate": 1.9971577051678404e-07, |
|
"loss": 0.0, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_loss": 6.56454744785151e-07, |
|
"eval_runtime": 0.3159, |
|
"eval_samples_per_second": 113.953, |
|
"eval_steps_per_second": 15.827, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 98.25, |
|
"grad_norm": 2.0974179278709926e-05, |
|
"learning_rate": 1.5291932085468307e-07, |
|
"loss": 0.0, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 98.5, |
|
"grad_norm": 2.5038380044861697e-05, |
|
"learning_rate": 1.1235648805945075e-07, |
|
"loss": 0.0, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 98.75, |
|
"grad_norm": 1.5341527614509687e-05, |
|
"learning_rate": 7.802980453519571e-08, |
|
"loss": 0.0, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"grad_norm": 1.3212208614277188e-05, |
|
"learning_rate": 4.994141335303848e-08, |
|
"loss": 0.0, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"eval_loss": 6.549934710164962e-07, |
|
"eval_runtime": 0.3268, |
|
"eval_samples_per_second": 110.154, |
|
"eval_steps_per_second": 15.299, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 99.25, |
|
"grad_norm": 1.4808772903052159e-05, |
|
"learning_rate": 2.8093068117240885e-08, |
|
"loss": 0.0, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 99.5, |
|
"grad_norm": 1.7599566490389407e-05, |
|
"learning_rate": 1.2486132855826781e-08, |
|
"loss": 0.0, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 99.75, |
|
"grad_norm": 9.601525562175084e-06, |
|
"learning_rate": 3.121581935328077e-09, |
|
"loss": 0.0, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 2.9180186174926348e-05, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_loss": 6.550197895194287e-07, |
|
"eval_runtime": 0.3302, |
|
"eval_samples_per_second": 109.012, |
|
"eval_steps_per_second": 15.141, |
|
"step": 4000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.684080299081728e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|