willtensora's picture
Training in progress, step 4000, checkpoint
99a8444 verified
raw
history blame
86 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 100.0,
"eval_steps": 40,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025,
"eval_loss": 0.9384576082229614,
"eval_runtime": 0.4222,
"eval_samples_per_second": 85.258,
"eval_steps_per_second": 11.841,
"step": 1
},
{
"epoch": 0.25,
"grad_norm": 0.06372307986021042,
"learning_rate": 8.333333333333334e-05,
"loss": 0.2623,
"step": 10
},
{
"epoch": 0.5,
"grad_norm": 0.029395487159490585,
"learning_rate": 0.0001666666666666667,
"loss": 0.0007,
"step": 20
},
{
"epoch": 0.75,
"grad_norm": 2.7581191062927246,
"learning_rate": 0.00019999887622676146,
"loss": 0.026,
"step": 30
},
{
"epoch": 1.0,
"grad_norm": 0.48524343967437744,
"learning_rate": 0.00019999200881510367,
"loss": 0.0292,
"step": 40
},
{
"epoch": 1.0,
"eval_loss": 0.004332332406193018,
"eval_runtime": 0.3245,
"eval_samples_per_second": 110.942,
"eval_steps_per_second": 15.409,
"step": 40
},
{
"epoch": 1.25,
"grad_norm": 43.160125732421875,
"learning_rate": 0.00019997889873847797,
"loss": 0.1101,
"step": 50
},
{
"epoch": 1.5,
"grad_norm": 71.57295989990234,
"learning_rate": 0.00019995954681536798,
"loss": 0.0241,
"step": 60
},
{
"epoch": 1.75,
"grad_norm": 0.3996742069721222,
"learning_rate": 0.00019993395425394592,
"loss": 0.0163,
"step": 70
},
{
"epoch": 2.0,
"grad_norm": 0.061198778450489044,
"learning_rate": 0.00019990212265199738,
"loss": 0.0148,
"step": 80
},
{
"epoch": 2.0,
"eval_loss": 0.033167850226163864,
"eval_runtime": 0.3415,
"eval_samples_per_second": 105.418,
"eval_steps_per_second": 14.641,
"step": 80
},
{
"epoch": 2.25,
"grad_norm": 2.403183698654175,
"learning_rate": 0.0001998640539968214,
"loss": 0.01,
"step": 90
},
{
"epoch": 2.5,
"grad_norm": 2.304408550262451,
"learning_rate": 0.00019981975066510655,
"loss": 0.0435,
"step": 100
},
{
"epoch": 2.75,
"grad_norm": 0.02899610437452793,
"learning_rate": 0.00019976921542278237,
"loss": 0.0296,
"step": 110
},
{
"epoch": 3.0,
"grad_norm": 3.8328208923339844,
"learning_rate": 0.0001997124514248469,
"loss": 0.1015,
"step": 120
},
{
"epoch": 3.0,
"eval_loss": 0.00442217942327261,
"eval_runtime": 0.3282,
"eval_samples_per_second": 109.685,
"eval_steps_per_second": 15.234,
"step": 120
},
{
"epoch": 3.25,
"grad_norm": 0.07601974904537201,
"learning_rate": 0.00019964946221516953,
"loss": 0.0273,
"step": 130
},
{
"epoch": 3.5,
"grad_norm": 0.02951742894947529,
"learning_rate": 0.00019958025172626986,
"loss": 0.0316,
"step": 140
},
{
"epoch": 3.75,
"grad_norm": 0.09413129091262817,
"learning_rate": 0.00019950482427907211,
"loss": 0.0071,
"step": 150
},
{
"epoch": 4.0,
"grad_norm": 0.0033842374105006456,
"learning_rate": 0.0001994231845826354,
"loss": 0.0002,
"step": 160
},
{
"epoch": 4.0,
"eval_loss": 0.00014786835527047515,
"eval_runtime": 0.3249,
"eval_samples_per_second": 110.813,
"eval_steps_per_second": 15.391,
"step": 160
},
{
"epoch": 4.25,
"grad_norm": 0.12055602669715881,
"learning_rate": 0.00019933533773385976,
"loss": 0.0001,
"step": 170
},
{
"epoch": 4.5,
"grad_norm": 0.007594508584588766,
"learning_rate": 0.00019924128921716797,
"loss": 0.0001,
"step": 180
},
{
"epoch": 4.75,
"grad_norm": 0.000780335278250277,
"learning_rate": 0.000199141044904163,
"loss": 0.0,
"step": 190
},
{
"epoch": 5.0,
"grad_norm": 0.0012850259663537145,
"learning_rate": 0.00019903461105326154,
"loss": 0.0,
"step": 200
},
{
"epoch": 5.0,
"eval_loss": 2.7732399757951498e-05,
"eval_runtime": 0.3542,
"eval_samples_per_second": 101.634,
"eval_steps_per_second": 14.116,
"step": 200
},
{
"epoch": 5.25,
"grad_norm": 0.0005795760662294924,
"learning_rate": 0.0001989219943093034,
"loss": 0.0,
"step": 210
},
{
"epoch": 5.5,
"grad_norm": 0.0004225255688652396,
"learning_rate": 0.0001988032017031364,
"loss": 0.0,
"step": 220
},
{
"epoch": 5.75,
"grad_norm": 0.0006476517883129418,
"learning_rate": 0.00019867824065117765,
"loss": 0.0,
"step": 230
},
{
"epoch": 6.0,
"grad_norm": 0.0004615155339706689,
"learning_rate": 0.00019854711895495036,
"loss": 0.0,
"step": 240
},
{
"epoch": 6.0,
"eval_loss": 1.7349062545690686e-05,
"eval_runtime": 0.328,
"eval_samples_per_second": 109.765,
"eval_steps_per_second": 15.245,
"step": 240
},
{
"epoch": 6.25,
"grad_norm": 0.0003548109089024365,
"learning_rate": 0.00019840984480059689,
"loss": 0.0,
"step": 250
},
{
"epoch": 6.5,
"grad_norm": 0.0010875174775719643,
"learning_rate": 0.0001982664267583677,
"loss": 0.0,
"step": 260
},
{
"epoch": 6.75,
"grad_norm": 0.0003341367410030216,
"learning_rate": 0.00019811687378208613,
"loss": 0.0,
"step": 270
},
{
"epoch": 7.0,
"grad_norm": 0.00045011454494670033,
"learning_rate": 0.00019796119520858955,
"loss": 0.0,
"step": 280
},
{
"epoch": 7.0,
"eval_loss": 1.3329447938303929e-05,
"eval_runtime": 0.3393,
"eval_samples_per_second": 106.103,
"eval_steps_per_second": 14.737,
"step": 280
},
{
"epoch": 7.25,
"grad_norm": 0.00023316974693443626,
"learning_rate": 0.00019779940075714648,
"loss": 0.0,
"step": 290
},
{
"epoch": 7.5,
"grad_norm": 0.0002178147406084463,
"learning_rate": 0.00019763150052884966,
"loss": 0.0,
"step": 300
},
{
"epoch": 7.75,
"grad_norm": 0.00018833605281542987,
"learning_rate": 0.00019745750500598538,
"loss": 0.0,
"step": 310
},
{
"epoch": 8.0,
"grad_norm": 0.0005992311052978039,
"learning_rate": 0.00019727742505137936,
"loss": 0.0,
"step": 320
},
{
"epoch": 8.0,
"eval_loss": 1.119767694035545e-05,
"eval_runtime": 0.3439,
"eval_samples_per_second": 104.694,
"eval_steps_per_second": 14.541,
"step": 320
},
{
"epoch": 8.25,
"grad_norm": 0.00013200360990595073,
"learning_rate": 0.00019709127190771825,
"loss": 0.0,
"step": 330
},
{
"epoch": 8.5,
"grad_norm": 0.0002380541991442442,
"learning_rate": 0.00019689905719684782,
"loss": 0.0,
"step": 340
},
{
"epoch": 8.75,
"grad_norm": 0.00014980848936829716,
"learning_rate": 0.00019670079291904752,
"loss": 0.0,
"step": 350
},
{
"epoch": 9.0,
"grad_norm": 0.0002899216488003731,
"learning_rate": 0.00019649649145228102,
"loss": 0.0,
"step": 360
},
{
"epoch": 9.0,
"eval_loss": 9.466394658375066e-06,
"eval_runtime": 0.3404,
"eval_samples_per_second": 105.754,
"eval_steps_per_second": 14.688,
"step": 360
},
{
"epoch": 9.25,
"grad_norm": 0.00021961786842439324,
"learning_rate": 0.00019628616555142372,
"loss": 0.0,
"step": 370
},
{
"epoch": 9.5,
"grad_norm": 0.00020691509416792542,
"learning_rate": 0.00019606982834746627,
"loss": 0.0,
"step": 380
},
{
"epoch": 9.75,
"grad_norm": 0.00023161708668339998,
"learning_rate": 0.00019584749334669487,
"loss": 0.0,
"step": 390
},
{
"epoch": 10.0,
"grad_norm": 0.00017992363427765667,
"learning_rate": 0.00019561917442984788,
"loss": 0.0,
"step": 400
},
{
"epoch": 10.0,
"eval_loss": 8.257883564510848e-06,
"eval_runtime": 0.3275,
"eval_samples_per_second": 109.923,
"eval_steps_per_second": 15.267,
"step": 400
},
{
"epoch": 10.25,
"grad_norm": 0.00013070827117189765,
"learning_rate": 0.00019538488585124953,
"loss": 0.0,
"step": 410
},
{
"epoch": 10.5,
"grad_norm": 0.00018156137957703322,
"learning_rate": 0.00019514464223791965,
"loss": 0.0,
"step": 420
},
{
"epoch": 10.75,
"grad_norm": 0.0001987970608752221,
"learning_rate": 0.00019489845858866066,
"loss": 0.0,
"step": 430
},
{
"epoch": 11.0,
"grad_norm": 0.00012906281335745007,
"learning_rate": 0.00019464635027312128,
"loss": 0.0,
"step": 440
},
{
"epoch": 11.0,
"eval_loss": 7.331655979214702e-06,
"eval_runtime": 0.3356,
"eval_samples_per_second": 107.279,
"eval_steps_per_second": 14.9,
"step": 440
},
{
"epoch": 11.25,
"grad_norm": 0.00031813167151995003,
"learning_rate": 0.00019438833303083678,
"loss": 0.0,
"step": 450
},
{
"epoch": 11.5,
"grad_norm": 0.00016680177941452712,
"learning_rate": 0.00019412442297024637,
"loss": 0.0,
"step": 460
},
{
"epoch": 11.75,
"grad_norm": 0.00013162715185899287,
"learning_rate": 0.00019385463656768762,
"loss": 0.0,
"step": 470
},
{
"epoch": 12.0,
"grad_norm": 0.00015330589667428285,
"learning_rate": 0.00019357899066636773,
"loss": 0.0,
"step": 480
},
{
"epoch": 12.0,
"eval_loss": 6.5182412072317675e-06,
"eval_runtime": 0.3246,
"eval_samples_per_second": 110.889,
"eval_steps_per_second": 15.401,
"step": 480
},
{
"epoch": 12.25,
"grad_norm": 0.00018356599321123213,
"learning_rate": 0.00019329750247531205,
"loss": 0.0,
"step": 490
},
{
"epoch": 12.5,
"grad_norm": 0.00015767107834108174,
"learning_rate": 0.00019301018956828964,
"loss": 0.0,
"step": 500
},
{
"epoch": 12.75,
"grad_norm": 0.00029690677183680236,
"learning_rate": 0.00019271706988271606,
"loss": 0.0,
"step": 510
},
{
"epoch": 13.0,
"grad_norm": 9.481079177930951e-05,
"learning_rate": 0.0001924181617185336,
"loss": 0.0,
"step": 520
},
{
"epoch": 13.0,
"eval_loss": 5.88237071497133e-06,
"eval_runtime": 0.3263,
"eval_samples_per_second": 110.333,
"eval_steps_per_second": 15.324,
"step": 520
},
{
"epoch": 13.25,
"grad_norm": 0.00016097365005407482,
"learning_rate": 0.00019211348373706884,
"loss": 0.0,
"step": 530
},
{
"epoch": 13.5,
"grad_norm": 0.0001369424571748823,
"learning_rate": 0.0001918030549598674,
"loss": 0.0,
"step": 540
},
{
"epoch": 13.75,
"grad_norm": 0.00018055856344290078,
"learning_rate": 0.00019148689476750658,
"loss": 0.0,
"step": 550
},
{
"epoch": 14.0,
"grad_norm": 0.00010365981870563701,
"learning_rate": 0.00019116502289838523,
"loss": 0.0,
"step": 560
},
{
"epoch": 14.0,
"eval_loss": 5.300180873746285e-06,
"eval_runtime": 0.3471,
"eval_samples_per_second": 103.705,
"eval_steps_per_second": 14.404,
"step": 560
},
{
"epoch": 14.25,
"grad_norm": 7.365662168012932e-05,
"learning_rate": 0.00019083745944749162,
"loss": 0.0,
"step": 570
},
{
"epoch": 14.5,
"grad_norm": 0.00015878217527642846,
"learning_rate": 0.00019050422486514878,
"loss": 0.0,
"step": 580
},
{
"epoch": 14.75,
"grad_norm": 0.00016406863869633526,
"learning_rate": 0.00019016533995573772,
"loss": 0.0,
"step": 590
},
{
"epoch": 15.0,
"grad_norm": 0.0001134676203946583,
"learning_rate": 0.0001898208258763987,
"loss": 0.0,
"step": 600
},
{
"epoch": 15.0,
"eval_loss": 4.918438207823783e-06,
"eval_runtime": 0.3237,
"eval_samples_per_second": 111.198,
"eval_steps_per_second": 15.444,
"step": 600
},
{
"epoch": 15.25,
"grad_norm": 0.00010196594666922465,
"learning_rate": 0.00018947070413571026,
"loss": 0.0,
"step": 610
},
{
"epoch": 15.5,
"grad_norm": 0.00013735589163843542,
"learning_rate": 0.0001891149965923464,
"loss": 0.0,
"step": 620
},
{
"epoch": 15.75,
"grad_norm": 8.303586218971759e-05,
"learning_rate": 0.00018875372545371194,
"loss": 0.0,
"step": 630
},
{
"epoch": 16.0,
"grad_norm": 8.282584167318419e-05,
"learning_rate": 0.0001883869132745561,
"loss": 0.0,
"step": 640
},
{
"epoch": 16.0,
"eval_loss": 4.482135864236625e-06,
"eval_runtime": 0.3334,
"eval_samples_per_second": 107.988,
"eval_steps_per_second": 14.998,
"step": 640
},
{
"epoch": 16.25,
"grad_norm": 7.883716170908883e-05,
"learning_rate": 0.00018801458295556435,
"loss": 0.0,
"step": 650
},
{
"epoch": 16.5,
"grad_norm": 0.00016775316908024251,
"learning_rate": 0.0001876367577419286,
"loss": 0.0,
"step": 660
},
{
"epoch": 16.75,
"grad_norm": 8.655583224026486e-05,
"learning_rate": 0.00018725346122189606,
"loss": 0.0,
"step": 670
},
{
"epoch": 17.0,
"grad_norm": 6.624006346100941e-05,
"learning_rate": 0.00018686471732529665,
"loss": 0.0,
"step": 680
},
{
"epoch": 17.0,
"eval_loss": 4.2038600440719165e-06,
"eval_runtime": 0.331,
"eval_samples_per_second": 108.76,
"eval_steps_per_second": 15.106,
"step": 680
},
{
"epoch": 17.25,
"grad_norm": 0.00013512188161257654,
"learning_rate": 0.00018647055032204883,
"loss": 0.0,
"step": 690
},
{
"epoch": 17.5,
"grad_norm": 8.678815356688574e-05,
"learning_rate": 0.0001860709848206446,
"loss": 0.0,
"step": 700
},
{
"epoch": 17.75,
"grad_norm": 6.948116788407788e-05,
"learning_rate": 0.00018566604576661288,
"loss": 0.0,
"step": 710
},
{
"epoch": 18.0,
"grad_norm": 7.376579014817253e-05,
"learning_rate": 0.00018525575844096243,
"loss": 0.0,
"step": 720
},
{
"epoch": 18.0,
"eval_loss": 3.883159479300957e-06,
"eval_runtime": 0.3356,
"eval_samples_per_second": 107.284,
"eval_steps_per_second": 14.901,
"step": 720
},
{
"epoch": 18.25,
"grad_norm": 9.459959983360022e-05,
"learning_rate": 0.0001848401484586034,
"loss": 0.0,
"step": 730
},
{
"epoch": 18.5,
"grad_norm": 9.205293463310227e-05,
"learning_rate": 0.00018441924176674794,
"loss": 0.0,
"step": 740
},
{
"epoch": 18.75,
"grad_norm": 0.00011255600111326203,
"learning_rate": 0.00018399306464329066,
"loss": 0.0,
"step": 750
},
{
"epoch": 19.0,
"grad_norm": 6.045972986612469e-05,
"learning_rate": 0.0001835616436951677,
"loss": 0.0,
"step": 760
},
{
"epoch": 19.0,
"eval_loss": 3.604589437600225e-06,
"eval_runtime": 0.3263,
"eval_samples_per_second": 110.323,
"eval_steps_per_second": 15.323,
"step": 760
},
{
"epoch": 19.25,
"grad_norm": 5.718848478863947e-05,
"learning_rate": 0.00018312500585669584,
"loss": 0.0,
"step": 770
},
{
"epoch": 19.5,
"grad_norm": 0.00010984807158820331,
"learning_rate": 0.00018268317838789088,
"loss": 0.0,
"step": 780
},
{
"epoch": 19.75,
"grad_norm": 4.868064570473507e-05,
"learning_rate": 0.0001822361888727657,
"loss": 0.0,
"step": 790
},
{
"epoch": 20.0,
"grad_norm": 7.550454029114917e-05,
"learning_rate": 0.0001817840652176082,
"loss": 0.0,
"step": 800
},
{
"epoch": 20.0,
"eval_loss": 3.3906435419339687e-06,
"eval_runtime": 0.3395,
"eval_samples_per_second": 106.05,
"eval_steps_per_second": 14.729,
"step": 800
},
{
"epoch": 20.25,
"grad_norm": 6.606967508560047e-05,
"learning_rate": 0.00018132683564923906,
"loss": 0.0,
"step": 810
},
{
"epoch": 20.5,
"grad_norm": 0.0001721412845654413,
"learning_rate": 0.00018086452871324954,
"loss": 0.0,
"step": 820
},
{
"epoch": 20.75,
"grad_norm": 4.9960210162680596e-05,
"learning_rate": 0.00018039717327221925,
"loss": 0.0,
"step": 830
},
{
"epoch": 21.0,
"grad_norm": 5.9810005041072145e-05,
"learning_rate": 0.00017992479850391417,
"loss": 0.0,
"step": 840
},
{
"epoch": 21.0,
"eval_loss": 3.1668755582359154e-06,
"eval_runtime": 0.3326,
"eval_samples_per_second": 108.232,
"eval_steps_per_second": 15.032,
"step": 840
},
{
"epoch": 21.25,
"grad_norm": 5.891801993129775e-05,
"learning_rate": 0.00017944743389946524,
"loss": 0.0,
"step": 850
},
{
"epoch": 21.5,
"grad_norm": 8.631425589555874e-05,
"learning_rate": 0.0001789651092615269,
"loss": 0.0,
"step": 860
},
{
"epoch": 21.75,
"grad_norm": 6.0596958064706996e-05,
"learning_rate": 0.00017847785470241677,
"loss": 0.0,
"step": 870
},
{
"epoch": 22.0,
"grad_norm": 7.751138764433563e-05,
"learning_rate": 0.00017798570064223533,
"loss": 0.0,
"step": 880
},
{
"epoch": 22.0,
"eval_loss": 2.9938312309241155e-06,
"eval_runtime": 0.3268,
"eval_samples_per_second": 110.167,
"eval_steps_per_second": 15.301,
"step": 880
},
{
"epoch": 22.25,
"grad_norm": 6.764694990124553e-05,
"learning_rate": 0.00017748867780696716,
"loss": 0.0,
"step": 890
},
{
"epoch": 22.5,
"grad_norm": 7.38737580832094e-05,
"learning_rate": 0.0001769868172265623,
"loss": 0.0,
"step": 900
},
{
"epoch": 22.75,
"grad_norm": 0.00010331822704756632,
"learning_rate": 0.00017648015023299918,
"loss": 0.0,
"step": 910
},
{
"epoch": 23.0,
"grad_norm": 0.00010948543786071241,
"learning_rate": 0.0001759687084583285,
"loss": 0.0,
"step": 920
},
{
"epoch": 23.0,
"eval_loss": 2.7970015707978746e-06,
"eval_runtime": 0.3433,
"eval_samples_per_second": 104.875,
"eval_steps_per_second": 14.566,
"step": 920
},
{
"epoch": 23.25,
"grad_norm": 4.273112062946893e-05,
"learning_rate": 0.00017545252383269837,
"loss": 0.0,
"step": 930
},
{
"epoch": 23.5,
"grad_norm": 0.0001338142465101555,
"learning_rate": 0.00017493162858236077,
"loss": 0.0,
"step": 940
},
{
"epoch": 23.75,
"grad_norm": 5.875607530470006e-05,
"learning_rate": 0.00017440605522765984,
"loss": 0.0,
"step": 950
},
{
"epoch": 24.0,
"grad_norm": 7.345333142438903e-05,
"learning_rate": 0.00017387583658100142,
"loss": 0.0,
"step": 960
},
{
"epoch": 24.0,
"eval_loss": 2.6630891625245567e-06,
"eval_runtime": 0.3317,
"eval_samples_per_second": 108.524,
"eval_steps_per_second": 15.073,
"step": 960
},
{
"epoch": 24.25,
"grad_norm": 6.94195696269162e-05,
"learning_rate": 0.00017334100574480435,
"loss": 0.0,
"step": 970
},
{
"epoch": 24.5,
"grad_norm": 4.8001227696659043e-05,
"learning_rate": 0.0001728015961094343,
"loss": 0.0,
"step": 980
},
{
"epoch": 24.75,
"grad_norm": 4.3018935684813187e-05,
"learning_rate": 0.00017225764135111868,
"loss": 0.0,
"step": 990
},
{
"epoch": 25.0,
"grad_norm": 7.503097003791481e-05,
"learning_rate": 0.00017170917542984443,
"loss": 0.0,
"step": 1000
},
{
"epoch": 25.0,
"eval_loss": 2.498412186469068e-06,
"eval_runtime": 0.3252,
"eval_samples_per_second": 110.685,
"eval_steps_per_second": 15.373,
"step": 1000
},
{
"epoch": 25.25,
"grad_norm": 2.499126276234165e-05,
"learning_rate": 0.00017115623258723783,
"loss": 0.0,
"step": 1010
},
{
"epoch": 25.5,
"grad_norm": 8.122723374981433e-05,
"learning_rate": 0.00017059884734442658,
"loss": 0.0,
"step": 1020
},
{
"epoch": 25.75,
"grad_norm": 5.7621167798060924e-05,
"learning_rate": 0.00017003705449988486,
"loss": 0.0,
"step": 1030
},
{
"epoch": 26.0,
"grad_norm": 6.584699440281838e-05,
"learning_rate": 0.00016947088912726052,
"loss": 0.0,
"step": 1040
},
{
"epoch": 26.0,
"eval_loss": 2.384617800998967e-06,
"eval_runtime": 0.3289,
"eval_samples_per_second": 109.466,
"eval_steps_per_second": 15.204,
"step": 1040
},
{
"epoch": 26.25,
"grad_norm": 3.284347985754721e-05,
"learning_rate": 0.00016890038657318556,
"loss": 0.0,
"step": 1050
},
{
"epoch": 26.5,
"grad_norm": 6.672390009043738e-05,
"learning_rate": 0.00016832558245506935,
"loss": 0.0,
"step": 1060
},
{
"epoch": 26.75,
"grad_norm": 3.635583561845124e-05,
"learning_rate": 0.0001677465126588749,
"loss": 0.0,
"step": 1070
},
{
"epoch": 27.0,
"grad_norm": 5.236966899246909e-05,
"learning_rate": 0.00016716321333687848,
"loss": 0.0,
"step": 1080
},
{
"epoch": 27.0,
"eval_loss": 2.2538335997523973e-06,
"eval_runtime": 0.327,
"eval_samples_per_second": 110.094,
"eval_steps_per_second": 15.291,
"step": 1080
},
{
"epoch": 27.25,
"grad_norm": 5.55117912881542e-05,
"learning_rate": 0.00016657572090541262,
"loss": 0.0,
"step": 1090
},
{
"epoch": 27.5,
"grad_norm": 0.00013249287439975888,
"learning_rate": 0.0001659840720425926,
"loss": 0.0,
"step": 1100
},
{
"epoch": 27.75,
"grad_norm": 5.55339029233437e-05,
"learning_rate": 0.00016538830368602648,
"loss": 0.0,
"step": 1110
},
{
"epoch": 28.0,
"grad_norm": 5.33119855390396e-05,
"learning_rate": 0.0001647884530305089,
"loss": 0.0,
"step": 1120
},
{
"epoch": 28.0,
"eval_loss": 2.159326413675444e-06,
"eval_runtime": 0.3173,
"eval_samples_per_second": 113.452,
"eval_steps_per_second": 15.757,
"step": 1120
},
{
"epoch": 28.25,
"grad_norm": 6.674770702375099e-05,
"learning_rate": 0.00016418455752569943,
"loss": 0.0,
"step": 1130
},
{
"epoch": 28.5,
"grad_norm": 5.4036871006246656e-05,
"learning_rate": 0.00016357665487378397,
"loss": 0.0,
"step": 1140
},
{
"epoch": 28.75,
"grad_norm": 9.294509800383821e-05,
"learning_rate": 0.00016296478302712126,
"loss": 0.0,
"step": 1150
},
{
"epoch": 29.0,
"grad_norm": 6.301044049905613e-05,
"learning_rate": 0.00016234898018587337,
"loss": 0.0,
"step": 1160
},
{
"epoch": 29.0,
"eval_loss": 2.0828572360187536e-06,
"eval_runtime": 0.3199,
"eval_samples_per_second": 112.52,
"eval_steps_per_second": 15.628,
"step": 1160
},
{
"epoch": 29.25,
"grad_norm": 6.311033212114125e-05,
"learning_rate": 0.00016172928479562078,
"loss": 0.0,
"step": 1170
},
{
"epoch": 29.5,
"grad_norm": 3.820831625489518e-05,
"learning_rate": 0.00016110573554496224,
"loss": 0.0,
"step": 1180
},
{
"epoch": 29.75,
"grad_norm": 4.628980968846008e-05,
"learning_rate": 0.00016047837136309924,
"loss": 0.0,
"step": 1190
},
{
"epoch": 30.0,
"grad_norm": 3.80598139599897e-05,
"learning_rate": 0.00015984723141740576,
"loss": 0.0,
"step": 1200
},
{
"epoch": 30.0,
"eval_loss": 1.9744732071558246e-06,
"eval_runtime": 0.3173,
"eval_samples_per_second": 113.449,
"eval_steps_per_second": 15.757,
"step": 1200
},
{
"epoch": 30.25,
"grad_norm": 3.0195853469194844e-05,
"learning_rate": 0.00015921235511098282,
"loss": 0.0,
"step": 1210
},
{
"epoch": 30.5,
"grad_norm": 5.462007538881153e-05,
"learning_rate": 0.00015857378208019863,
"loss": 0.0,
"step": 1220
},
{
"epoch": 30.75,
"grad_norm": 2.7883037546416745e-05,
"learning_rate": 0.00015793155219221395,
"loss": 0.0,
"step": 1230
},
{
"epoch": 31.0,
"grad_norm": 4.7888908738968894e-05,
"learning_rate": 0.00015728570554249312,
"loss": 0.0,
"step": 1240
},
{
"epoch": 31.0,
"eval_loss": 1.8858928569898126e-06,
"eval_runtime": 0.3223,
"eval_samples_per_second": 111.705,
"eval_steps_per_second": 15.515,
"step": 1240
},
{
"epoch": 31.25,
"grad_norm": 4.82973555335775e-05,
"learning_rate": 0.0001566362824523008,
"loss": 0.0,
"step": 1250
},
{
"epoch": 31.5,
"grad_norm": 3.9442336856154725e-05,
"learning_rate": 0.00015598332346618472,
"loss": 0.0,
"step": 1260
},
{
"epoch": 31.75,
"grad_norm": 3.770321563933976e-05,
"learning_rate": 0.00015532686934944438,
"loss": 0.0,
"step": 1270
},
{
"epoch": 32.0,
"grad_norm": 4.669040936278179e-05,
"learning_rate": 0.00015466696108558611,
"loss": 0.0,
"step": 1280
},
{
"epoch": 32.0,
"eval_loss": 1.8240966710436624e-06,
"eval_runtime": 0.3185,
"eval_samples_per_second": 113.013,
"eval_steps_per_second": 15.696,
"step": 1280
},
{
"epoch": 32.25,
"grad_norm": 2.80893400486093e-05,
"learning_rate": 0.00015400363987376413,
"loss": 0.0,
"step": 1290
},
{
"epoch": 32.5,
"grad_norm": 4.817240915144794e-05,
"learning_rate": 0.00015333694712620877,
"loss": 0.0,
"step": 1300
},
{
"epoch": 32.75,
"grad_norm": 4.6051696699578315e-05,
"learning_rate": 0.00015266692446564063,
"loss": 0.0,
"step": 1310
},
{
"epoch": 33.0,
"grad_norm": 3.602392098400742e-05,
"learning_rate": 0.00015199361372267252,
"loss": 0.0,
"step": 1320
},
{
"epoch": 33.0,
"eval_loss": 1.7236499161299434e-06,
"eval_runtime": 0.3163,
"eval_samples_per_second": 113.807,
"eval_steps_per_second": 15.806,
"step": 1320
},
{
"epoch": 33.25,
"grad_norm": 2.2813776013208553e-05,
"learning_rate": 0.00015131705693319743,
"loss": 0.0,
"step": 1330
},
{
"epoch": 33.5,
"grad_norm": 7.926914986455813e-05,
"learning_rate": 0.0001506372963357644,
"loss": 0.0,
"step": 1340
},
{
"epoch": 33.75,
"grad_norm": 6.877528358018026e-05,
"learning_rate": 0.00014995437436894147,
"loss": 0.0,
"step": 1350
},
{
"epoch": 34.0,
"grad_norm": 2.7551081075216644e-05,
"learning_rate": 0.0001492683336686661,
"loss": 0.0,
"step": 1360
},
{
"epoch": 34.0,
"eval_loss": 1.67099869941012e-06,
"eval_runtime": 0.325,
"eval_samples_per_second": 110.775,
"eval_steps_per_second": 15.385,
"step": 1360
},
{
"epoch": 34.25,
"grad_norm": 3.4323111322009936e-05,
"learning_rate": 0.0001485792170655835,
"loss": 0.0,
"step": 1370
},
{
"epoch": 34.5,
"grad_norm": 3.862389348796569e-05,
"learning_rate": 0.00014788706758237237,
"loss": 0.0,
"step": 1380
},
{
"epoch": 34.75,
"grad_norm": 3.117803134955466e-05,
"learning_rate": 0.00014719192843105924,
"loss": 0.0,
"step": 1390
},
{
"epoch": 35.0,
"grad_norm": 3.452876626397483e-05,
"learning_rate": 0.00014649384301032044,
"loss": 0.0,
"step": 1400
},
{
"epoch": 35.0,
"eval_loss": 1.6147401993293897e-06,
"eval_runtime": 0.319,
"eval_samples_per_second": 112.868,
"eval_steps_per_second": 15.676,
"step": 1400
},
{
"epoch": 35.25,
"grad_norm": 2.5607059797039255e-05,
"learning_rate": 0.00014579285490277274,
"loss": 0.0,
"step": 1410
},
{
"epoch": 35.5,
"grad_norm": 7.004107465036213e-05,
"learning_rate": 0.0001450890078722524,
"loss": 0.0,
"step": 1420
},
{
"epoch": 35.75,
"grad_norm": 5.070870975032449e-05,
"learning_rate": 0.00014438234586108297,
"loss": 0.0,
"step": 1430
},
{
"epoch": 36.0,
"grad_norm": 2.5347033442812972e-05,
"learning_rate": 0.00014367291298733178,
"loss": 0.0,
"step": 1440
},
{
"epoch": 36.0,
"eval_loss": 1.5523125966865337e-06,
"eval_runtime": 0.3195,
"eval_samples_per_second": 112.683,
"eval_steps_per_second": 15.65,
"step": 1440
},
{
"epoch": 36.25,
"grad_norm": 3.3264463127125055e-05,
"learning_rate": 0.0001429607535420557,
"loss": 0.0,
"step": 1450
},
{
"epoch": 36.5,
"grad_norm": 4.0014037949731573e-05,
"learning_rate": 0.00014224591198653595,
"loss": 0.0,
"step": 1460
},
{
"epoch": 36.75,
"grad_norm": 4.455630187294446e-05,
"learning_rate": 0.00014152843294950218,
"loss": 0.0,
"step": 1470
},
{
"epoch": 37.0,
"grad_norm": 3.4259654057677835e-05,
"learning_rate": 0.0001408083612243465,
"loss": 0.0,
"step": 1480
},
{
"epoch": 37.0,
"eval_loss": 1.506923695160367e-06,
"eval_runtime": 0.3136,
"eval_samples_per_second": 114.814,
"eval_steps_per_second": 15.946,
"step": 1480
},
{
"epoch": 37.25,
"grad_norm": 3.984866998507641e-05,
"learning_rate": 0.00014008574176632666,
"loss": 0.0,
"step": 1490
},
{
"epoch": 37.5,
"grad_norm": 3.252027090638876e-05,
"learning_rate": 0.00013936061968975957,
"loss": 0.0,
"step": 1500
},
{
"epoch": 37.75,
"grad_norm": 2.17838187381858e-05,
"learning_rate": 0.00013863304026520473,
"loss": 0.0,
"step": 1510
},
{
"epoch": 38.0,
"grad_norm": 4.0549610275775194e-05,
"learning_rate": 0.00013790304891663792,
"loss": 0.0,
"step": 1520
},
{
"epoch": 38.0,
"eval_loss": 1.457518123970658e-06,
"eval_runtime": 0.3138,
"eval_samples_per_second": 114.708,
"eval_steps_per_second": 15.932,
"step": 1520
},
{
"epoch": 38.25,
"grad_norm": 3.441906665102579e-05,
"learning_rate": 0.00013717069121861527,
"loss": 0.0,
"step": 1530
},
{
"epoch": 38.5,
"grad_norm": 3.80768469767645e-05,
"learning_rate": 0.00013643601289342803,
"loss": 0.0,
"step": 1540
},
{
"epoch": 38.75,
"grad_norm": 1.9130562577629462e-05,
"learning_rate": 0.00013569905980824788,
"loss": 0.0,
"step": 1550
},
{
"epoch": 39.0,
"grad_norm": 2.708647480176296e-05,
"learning_rate": 0.0001349598779722636,
"loss": 0.0,
"step": 1560
},
{
"epoch": 39.0,
"eval_loss": 1.4059390878173872e-06,
"eval_runtime": 0.326,
"eval_samples_per_second": 110.43,
"eval_steps_per_second": 15.337,
"step": 1560
},
{
"epoch": 39.25,
"grad_norm": 2.7261641662335023e-05,
"learning_rate": 0.00013421851353380857,
"loss": 0.0,
"step": 1570
},
{
"epoch": 39.5,
"grad_norm": 3.74881892639678e-05,
"learning_rate": 0.00013347501277747955,
"loss": 0.0,
"step": 1580
},
{
"epoch": 39.75,
"grad_norm": 4.151304892729968e-05,
"learning_rate": 0.00013272942212124705,
"loss": 0.0,
"step": 1590
},
{
"epoch": 40.0,
"grad_norm": 2.8103966542403214e-05,
"learning_rate": 0.0001319817881135576,
"loss": 0.0,
"step": 1600
},
{
"epoch": 40.0,
"eval_loss": 1.3655937891599024e-06,
"eval_runtime": 0.3183,
"eval_samples_per_second": 113.09,
"eval_steps_per_second": 15.707,
"step": 1600
},
{
"epoch": 40.25,
"grad_norm": 2.1028572518844157e-05,
"learning_rate": 0.0001312321574304275,
"loss": 0.0,
"step": 1610
},
{
"epoch": 40.5,
"grad_norm": 2.917735582741443e-05,
"learning_rate": 0.00013048057687252865,
"loss": 0.0,
"step": 1620
},
{
"epoch": 40.75,
"grad_norm": 3.929531158064492e-05,
"learning_rate": 0.00012972709336226697,
"loss": 0.0,
"step": 1630
},
{
"epoch": 41.0,
"grad_norm": 2.542526817705948e-05,
"learning_rate": 0.00012897175394085267,
"loss": 0.0,
"step": 1640
},
{
"epoch": 41.0,
"eval_loss": 1.3143367141310591e-06,
"eval_runtime": 0.32,
"eval_samples_per_second": 112.487,
"eval_steps_per_second": 15.623,
"step": 1640
},
{
"epoch": 41.25,
"grad_norm": 2.2972772057983093e-05,
"learning_rate": 0.00012821460576536363,
"loss": 0.0,
"step": 1650
},
{
"epoch": 41.5,
"grad_norm": 2.710890294110868e-05,
"learning_rate": 0.0001274556961058012,
"loss": 0.0,
"step": 1660
},
{
"epoch": 41.75,
"grad_norm": 7.863906648708507e-05,
"learning_rate": 0.00012669507234213908,
"loss": 0.0,
"step": 1670
},
{
"epoch": 42.0,
"grad_norm": 2.5962377549149096e-05,
"learning_rate": 0.00012593278196136525,
"loss": 0.0,
"step": 1680
},
{
"epoch": 42.0,
"eval_loss": 1.2861806908404105e-06,
"eval_runtime": 0.3211,
"eval_samples_per_second": 112.131,
"eval_steps_per_second": 15.574,
"step": 1680
},
{
"epoch": 42.25,
"grad_norm": 2.938141733466182e-05,
"learning_rate": 0.00012516887255451735,
"loss": 0.0,
"step": 1690
},
{
"epoch": 42.5,
"grad_norm": 2.2876229195389897e-05,
"learning_rate": 0.00012440339181371148,
"loss": 0.0,
"step": 1700
},
{
"epoch": 42.75,
"grad_norm": 2.188000871683471e-05,
"learning_rate": 0.00012363638752916468,
"loss": 0.0,
"step": 1710
},
{
"epoch": 43.0,
"grad_norm": 2.7062182198278606e-05,
"learning_rate": 0.00012286790758621132,
"loss": 0.0,
"step": 1720
},
{
"epoch": 43.0,
"eval_loss": 1.24422297176352e-06,
"eval_runtime": 0.3203,
"eval_samples_per_second": 112.377,
"eval_steps_per_second": 15.608,
"step": 1720
},
{
"epoch": 43.25,
"grad_norm": 3.9851081965025514e-05,
"learning_rate": 0.00012209799996231358,
"loss": 0.0,
"step": 1730
},
{
"epoch": 43.5,
"grad_norm": 3.9189981180243194e-05,
"learning_rate": 0.00012132671272406604,
"loss": 0.0,
"step": 1740
},
{
"epoch": 43.75,
"grad_norm": 2.008090086746961e-05,
"learning_rate": 0.00012055409402419494,
"loss": 0.0,
"step": 1750
},
{
"epoch": 44.0,
"grad_norm": 2.994649184984155e-05,
"learning_rate": 0.00011978019209855174,
"loss": 0.0,
"step": 1760
},
{
"epoch": 44.0,
"eval_loss": 1.2121387271690764e-06,
"eval_runtime": 0.3206,
"eval_samples_per_second": 112.281,
"eval_steps_per_second": 15.595,
"step": 1760
},
{
"epoch": 44.25,
"grad_norm": 1.9228473320254125e-05,
"learning_rate": 0.0001190050552631019,
"loss": 0.0,
"step": 1770
},
{
"epoch": 44.5,
"grad_norm": 2.6020699806394987e-05,
"learning_rate": 0.00011822873191090833,
"loss": 0.0,
"step": 1780
},
{
"epoch": 44.75,
"grad_norm": 2.0412864614627324e-05,
"learning_rate": 0.00011745127050910998,
"loss": 0.0,
"step": 1790
},
{
"epoch": 45.0,
"grad_norm": 2.493833380867727e-05,
"learning_rate": 0.00011667271959589623,
"loss": 0.0,
"step": 1800
},
{
"epoch": 45.0,
"eval_loss": 1.1790700682468014e-06,
"eval_runtime": 0.3173,
"eval_samples_per_second": 113.472,
"eval_steps_per_second": 15.76,
"step": 1800
},
{
"epoch": 45.25,
"grad_norm": 3.828733315458521e-05,
"learning_rate": 0.00011589312777747644,
"loss": 0.0,
"step": 1810
},
{
"epoch": 45.5,
"grad_norm": 2.1567129806498997e-05,
"learning_rate": 0.00011511254372504531,
"loss": 0.0,
"step": 1820
},
{
"epoch": 45.75,
"grad_norm": 1.842524579842575e-05,
"learning_rate": 0.0001143310161717444,
"loss": 0.0,
"step": 1830
},
{
"epoch": 46.0,
"grad_norm": 2.736481292231474e-05,
"learning_rate": 0.00011354859390961958,
"loss": 0.0,
"step": 1840
},
{
"epoch": 46.0,
"eval_loss": 1.1555836181287304e-06,
"eval_runtime": 0.3177,
"eval_samples_per_second": 113.308,
"eval_steps_per_second": 15.737,
"step": 1840
},
{
"epoch": 46.25,
"grad_norm": 3.4207103453809395e-05,
"learning_rate": 0.0001127653257865748,
"loss": 0.0,
"step": 1850
},
{
"epoch": 46.5,
"grad_norm": 3.1199837394524366e-05,
"learning_rate": 0.00011198126070332253,
"loss": 0.0,
"step": 1860
},
{
"epoch": 46.75,
"grad_norm": 1.3810436030325945e-05,
"learning_rate": 0.00011119644761033078,
"loss": 0.0,
"step": 1870
},
{
"epoch": 47.0,
"grad_norm": 2.9521519536501728e-05,
"learning_rate": 0.00011041093550476707,
"loss": 0.0,
"step": 1880
},
{
"epoch": 47.0,
"eval_loss": 1.1195420484000351e-06,
"eval_runtime": 0.3205,
"eval_samples_per_second": 112.332,
"eval_steps_per_second": 15.602,
"step": 1880
},
{
"epoch": 47.25,
"grad_norm": 1.7040036254911683e-05,
"learning_rate": 0.00010962477342743929,
"loss": 0.0,
"step": 1890
},
{
"epoch": 47.5,
"grad_norm": 2.9747276130365208e-05,
"learning_rate": 0.00010883801045973425,
"loss": 0.0,
"step": 1900
},
{
"epoch": 47.75,
"grad_norm": 2.880042120523285e-05,
"learning_rate": 0.00010805069572055334,
"loss": 0.0,
"step": 1910
},
{
"epoch": 48.0,
"grad_norm": 2.100724850606639e-05,
"learning_rate": 0.00010726287836324582,
"loss": 0.0,
"step": 1920
},
{
"epoch": 48.0,
"eval_loss": 1.1032241218345007e-06,
"eval_runtime": 0.3192,
"eval_samples_per_second": 112.768,
"eval_steps_per_second": 15.662,
"step": 1920
},
{
"epoch": 48.25,
"grad_norm": 1.7086620573536493e-05,
"learning_rate": 0.0001064746075725404,
"loss": 0.0,
"step": 1930
},
{
"epoch": 48.5,
"grad_norm": 2.3707199943601154e-05,
"learning_rate": 0.00010568593256147421,
"loss": 0.0,
"step": 1940
},
{
"epoch": 48.75,
"grad_norm": 1.4947347153793089e-05,
"learning_rate": 0.00010489690256832068,
"loss": 0.0,
"step": 1950
},
{
"epoch": 49.0,
"grad_norm": 2.3327078451984562e-05,
"learning_rate": 0.00010410756685351517,
"loss": 0.0,
"step": 1960
},
{
"epoch": 49.0,
"eval_loss": 1.0602713018670329e-06,
"eval_runtime": 0.3334,
"eval_samples_per_second": 107.979,
"eval_steps_per_second": 14.997,
"step": 1960
},
{
"epoch": 49.25,
"grad_norm": 1.931817314471118e-05,
"learning_rate": 0.00010331797469657992,
"loss": 0.0,
"step": 1970
},
{
"epoch": 49.5,
"grad_norm": 2.6536048608249985e-05,
"learning_rate": 0.00010252817539304718,
"loss": 0.0,
"step": 1980
},
{
"epoch": 49.75,
"grad_norm": 2.2126323528937064e-05,
"learning_rate": 0.00010173821825138172,
"loss": 0.0,
"step": 1990
},
{
"epoch": 50.0,
"grad_norm": 2.2889309548190795e-05,
"learning_rate": 0.00010094815258990241,
"loss": 0.0,
"step": 2000
},
{
"epoch": 50.0,
"eval_loss": 1.040821643982781e-06,
"eval_runtime": 0.3203,
"eval_samples_per_second": 112.396,
"eval_steps_per_second": 15.611,
"step": 2000
},
{
"epoch": 50.25,
"grad_norm": 2.8334068701951765e-05,
"learning_rate": 0.00010015802773370311,
"loss": 0.0,
"step": 2010
},
{
"epoch": 50.5,
"grad_norm": 1.9157972928951494e-05,
"learning_rate": 9.936789301157347e-05,
"loss": 0.0,
"step": 2020
},
{
"epoch": 50.75,
"grad_norm": 2.7853264327859506e-05,
"learning_rate": 9.857779775291898e-05,
"loss": 0.0,
"step": 2030
},
{
"epoch": 51.0,
"grad_norm": 2.194027547375299e-05,
"learning_rate": 9.778779128468132e-05,
"loss": 0.0,
"step": 2040
},
{
"epoch": 51.0,
"eval_loss": 1.013436872199236e-06,
"eval_runtime": 0.3177,
"eval_samples_per_second": 113.312,
"eval_steps_per_second": 15.738,
"step": 2040
},
{
"epoch": 51.25,
"grad_norm": 1.2561698895297013e-05,
"learning_rate": 9.699792292825892e-05,
"loss": 0.0,
"step": 2050
},
{
"epoch": 51.5,
"grad_norm": 2.041015432041604e-05,
"learning_rate": 9.620824199642764e-05,
"loss": 0.0,
"step": 2060
},
{
"epoch": 51.75,
"grad_norm": 3.463058601482771e-05,
"learning_rate": 9.541879779026209e-05,
"loss": 0.0,
"step": 2070
},
{
"epoch": 52.0,
"grad_norm": 1.9060191334574483e-05,
"learning_rate": 9.462963959605778e-05,
"loss": 0.0,
"step": 2080
},
{
"epoch": 52.0,
"eval_loss": 1.0033103308160207e-06,
"eval_runtime": 0.3157,
"eval_samples_per_second": 114.025,
"eval_steps_per_second": 15.837,
"step": 2080
},
{
"epoch": 52.25,
"grad_norm": 1.4129647752270103e-05,
"learning_rate": 9.384081668225387e-05,
"loss": 0.0,
"step": 2090
},
{
"epoch": 52.5,
"grad_norm": 2.1596322767436504e-05,
"learning_rate": 9.30523782963576e-05,
"loss": 0.0,
"step": 2100
},
{
"epoch": 52.75,
"grad_norm": 1.7303984350292012e-05,
"learning_rate": 9.226437366186941e-05,
"loss": 0.0,
"step": 2110
},
{
"epoch": 53.0,
"grad_norm": 2.7551333914743736e-05,
"learning_rate": 9.147685197520995e-05,
"loss": 0.0,
"step": 2120
},
{
"epoch": 53.0,
"eval_loss": 9.675704859546386e-07,
"eval_runtime": 0.3184,
"eval_samples_per_second": 113.083,
"eval_steps_per_second": 15.706,
"step": 2120
},
{
"epoch": 53.25,
"grad_norm": 2.0771505660377443e-05,
"learning_rate": 9.06898624026486e-05,
"loss": 0.0,
"step": 2130
},
{
"epoch": 53.5,
"grad_norm": 2.2202431864570826e-05,
"learning_rate": 8.990345407723402e-05,
"loss": 0.0,
"step": 2140
},
{
"epoch": 53.75,
"grad_norm": 1.3855403267371003e-05,
"learning_rate": 8.91176760957267e-05,
"loss": 0.0,
"step": 2150
},
{
"epoch": 54.0,
"grad_norm": 2.2561982405022718e-05,
"learning_rate": 8.833257751553365e-05,
"loss": 0.0,
"step": 2160
},
{
"epoch": 54.0,
"eval_loss": 9.524069923827483e-07,
"eval_runtime": 0.3172,
"eval_samples_per_second": 113.496,
"eval_steps_per_second": 15.763,
"step": 2160
},
{
"epoch": 54.25,
"grad_norm": 1.5506595445913263e-05,
"learning_rate": 8.754820735164576e-05,
"loss": 0.0,
"step": 2170
},
{
"epoch": 54.5,
"grad_norm": 2.101029167533852e-05,
"learning_rate": 8.676461457357776e-05,
"loss": 0.0,
"step": 2180
},
{
"epoch": 54.75,
"grad_norm": 1.7293437849730253e-05,
"learning_rate": 8.598184810231088e-05,
"loss": 0.0,
"step": 2190
},
{
"epoch": 55.0,
"grad_norm": 2.4345905330847017e-05,
"learning_rate": 8.519995680723854e-05,
"loss": 0.0,
"step": 2200
},
{
"epoch": 55.0,
"eval_loss": 9.304160357714863e-07,
"eval_runtime": 0.3151,
"eval_samples_per_second": 114.245,
"eval_steps_per_second": 15.867,
"step": 2200
},
{
"epoch": 55.25,
"grad_norm": 3.5958666558144614e-05,
"learning_rate": 8.44189895031157e-05,
"loss": 0.0,
"step": 2210
},
{
"epoch": 55.5,
"grad_norm": 2.3594711819896474e-05,
"learning_rate": 8.363899494701086e-05,
"loss": 0.0,
"step": 2220
},
{
"epoch": 55.75,
"grad_norm": 1.3870093425794039e-05,
"learning_rate": 8.286002183526237e-05,
"loss": 0.0,
"step": 2230
},
{
"epoch": 56.0,
"grad_norm": 2.6735531719168648e-05,
"learning_rate": 8.208211880043812e-05,
"loss": 0.0,
"step": 2240
},
{
"epoch": 56.0,
"eval_loss": 9.174784736387664e-07,
"eval_runtime": 0.3129,
"eval_samples_per_second": 115.04,
"eval_steps_per_second": 15.978,
"step": 2240
},
{
"epoch": 56.25,
"grad_norm": 2.9232525776023977e-05,
"learning_rate": 8.130533440829928e-05,
"loss": 0.0,
"step": 2250
},
{
"epoch": 56.5,
"grad_norm": 2.4526891138521023e-05,
"learning_rate": 8.052971715476842e-05,
"loss": 0.0,
"step": 2260
},
{
"epoch": 56.75,
"grad_norm": 2.6106521545443684e-05,
"learning_rate": 7.975531546290166e-05,
"loss": 0.0,
"step": 2270
},
{
"epoch": 57.0,
"grad_norm": 1.784413143468555e-05,
"learning_rate": 7.898217767986562e-05,
"loss": 0.0,
"step": 2280
},
{
"epoch": 57.0,
"eval_loss": 9.079113851839793e-07,
"eval_runtime": 0.3236,
"eval_samples_per_second": 111.239,
"eval_steps_per_second": 15.45,
"step": 2280
},
{
"epoch": 57.25,
"grad_norm": 1.9261695342720486e-05,
"learning_rate": 7.821035207391912e-05,
"loss": 0.0,
"step": 2290
},
{
"epoch": 57.5,
"grad_norm": 3.491761162877083e-05,
"learning_rate": 7.743988683139943e-05,
"loss": 0.0,
"step": 2300
},
{
"epoch": 57.75,
"grad_norm": 1.3563810171035584e-05,
"learning_rate": 7.66708300537143e-05,
"loss": 0.0,
"step": 2310
},
{
"epoch": 58.0,
"grad_norm": 1.2282480383873917e-05,
"learning_rate": 7.590322975433857e-05,
"loss": 0.0,
"step": 2320
},
{
"epoch": 58.0,
"eval_loss": 8.861284754857479e-07,
"eval_runtime": 0.3181,
"eval_samples_per_second": 113.172,
"eval_steps_per_second": 15.718,
"step": 2320
},
{
"epoch": 58.25,
"grad_norm": 2.858146035578102e-05,
"learning_rate": 7.51371338558168e-05,
"loss": 0.0,
"step": 2330
},
{
"epoch": 58.5,
"grad_norm": 2.0420882719918154e-05,
"learning_rate": 7.437259018677136e-05,
"loss": 0.0,
"step": 2340
},
{
"epoch": 58.75,
"grad_norm": 9.892805792333093e-06,
"learning_rate": 7.360964647891637e-05,
"loss": 0.0,
"step": 2350
},
{
"epoch": 59.0,
"grad_norm": 2.6135967345908284e-05,
"learning_rate": 7.284835036407776e-05,
"loss": 0.0,
"step": 2360
},
{
"epoch": 59.0,
"eval_loss": 8.719437687432219e-07,
"eval_runtime": 0.3182,
"eval_samples_per_second": 113.153,
"eval_steps_per_second": 15.716,
"step": 2360
},
{
"epoch": 59.25,
"grad_norm": 3.855082468362525e-05,
"learning_rate": 7.208874937121946e-05,
"loss": 0.0,
"step": 2370
},
{
"epoch": 59.5,
"grad_norm": 2.4621716875117272e-05,
"learning_rate": 7.133089092347627e-05,
"loss": 0.0,
"step": 2380
},
{
"epoch": 59.75,
"grad_norm": 1.3933644368080422e-05,
"learning_rate": 7.057482233519302e-05,
"loss": 0.0,
"step": 2390
},
{
"epoch": 60.0,
"grad_norm": 1.3702153410122264e-05,
"learning_rate": 6.982059080897059e-05,
"loss": 0.0,
"step": 2400
},
{
"epoch": 60.0,
"eval_loss": 8.514528531122778e-07,
"eval_runtime": 0.317,
"eval_samples_per_second": 113.548,
"eval_steps_per_second": 15.771,
"step": 2400
},
{
"epoch": 60.25,
"grad_norm": 1.285933922190452e-05,
"learning_rate": 6.906824343271916e-05,
"loss": 0.0,
"step": 2410
},
{
"epoch": 60.5,
"grad_norm": 1.753455217112787e-05,
"learning_rate": 6.831782717671828e-05,
"loss": 0.0,
"step": 2420
},
{
"epoch": 60.75,
"grad_norm": 1.9983261154266074e-05,
"learning_rate": 6.756938889068454e-05,
"loss": 0.0,
"step": 2430
},
{
"epoch": 61.0,
"grad_norm": 1.9891913325409405e-05,
"learning_rate": 6.682297530084664e-05,
"loss": 0.0,
"step": 2440
},
{
"epoch": 61.0,
"eval_loss": 8.335572942996805e-07,
"eval_runtime": 0.3281,
"eval_samples_per_second": 109.721,
"eval_steps_per_second": 15.239,
"step": 2440
},
{
"epoch": 61.25,
"grad_norm": 1.8422002540319227e-05,
"learning_rate": 6.607863300702807e-05,
"loss": 0.0,
"step": 2450
},
{
"epoch": 61.5,
"grad_norm": 1.9453251297818497e-05,
"learning_rate": 6.533640847973808e-05,
"loss": 0.0,
"step": 2460
},
{
"epoch": 61.75,
"grad_norm": 1.48242861541803e-05,
"learning_rate": 6.459634805727011e-05,
"loss": 0.0,
"step": 2470
},
{
"epoch": 62.0,
"grad_norm": 1.9470420738798566e-05,
"learning_rate": 6.385849794280915e-05,
"loss": 0.0,
"step": 2480
},
{
"epoch": 62.0,
"eval_loss": 8.260683443950256e-07,
"eval_runtime": 0.3297,
"eval_samples_per_second": 109.182,
"eval_steps_per_second": 15.164,
"step": 2480
},
{
"epoch": 62.25,
"grad_norm": 2.976124051201623e-05,
"learning_rate": 6.312290420154694e-05,
"loss": 0.0,
"step": 2490
},
{
"epoch": 62.5,
"grad_norm": 4.272747173672542e-05,
"learning_rate": 6.238961275780613e-05,
"loss": 0.0,
"step": 2500
},
{
"epoch": 62.75,
"grad_norm": 1.2389010407787282e-05,
"learning_rate": 6.165866939217328e-05,
"loss": 0.0,
"step": 2510
},
{
"epoch": 63.0,
"grad_norm": 1.4621130503655877e-05,
"learning_rate": 6.0930119738640445e-05,
"loss": 0.0,
"step": 2520
},
{
"epoch": 63.0,
"eval_loss": 8.148024335241644e-07,
"eval_runtime": 0.3292,
"eval_samples_per_second": 109.354,
"eval_steps_per_second": 15.188,
"step": 2520
},
{
"epoch": 63.25,
"grad_norm": 1.0234934961772524e-05,
"learning_rate": 6.020400928175637e-05,
"loss": 0.0,
"step": 2530
},
{
"epoch": 63.5,
"grad_norm": 1.937254455697257e-05,
"learning_rate": 5.948038335378683e-05,
"loss": 0.0,
"step": 2540
},
{
"epoch": 63.75,
"grad_norm": 1.764351145538967e-05,
"learning_rate": 5.8759287131884246e-05,
"loss": 0.0,
"step": 2550
},
{
"epoch": 64.0,
"grad_norm": 2.3509826860390604e-05,
"learning_rate": 5.804076563526744e-05,
"loss": 0.0,
"step": 2560
},
{
"epoch": 64.0,
"eval_loss": 8.072562422967167e-07,
"eval_runtime": 0.3217,
"eval_samples_per_second": 111.904,
"eval_steps_per_second": 15.542,
"step": 2560
},
{
"epoch": 64.25,
"grad_norm": 1.288153634959599e-05,
"learning_rate": 5.732486372241088e-05,
"loss": 0.0,
"step": 2570
},
{
"epoch": 64.5,
"grad_norm": 1.7124617443187162e-05,
"learning_rate": 5.6611626088244194e-05,
"loss": 0.0,
"step": 2580
},
{
"epoch": 64.75,
"grad_norm": 3.4207390854135156e-05,
"learning_rate": 5.5901097261361636e-05,
"loss": 0.0,
"step": 2590
},
{
"epoch": 65.0,
"grad_norm": 1.607052945473697e-05,
"learning_rate": 5.5193321601242156e-05,
"loss": 0.0,
"step": 2600
},
{
"epoch": 65.0,
"eval_loss": 7.960065886436496e-07,
"eval_runtime": 0.3236,
"eval_samples_per_second": 111.263,
"eval_steps_per_second": 15.453,
"step": 2600
},
{
"epoch": 65.25,
"grad_norm": 2.7799209419754334e-05,
"learning_rate": 5.448834329548016e-05,
"loss": 0.0,
"step": 2610
},
{
"epoch": 65.5,
"grad_norm": 1.6963076632237062e-05,
"learning_rate": 5.378620635702643e-05,
"loss": 0.0,
"step": 2620
},
{
"epoch": 65.75,
"grad_norm": 1.7011914678732865e-05,
"learning_rate": 5.308695462144068e-05,
"loss": 0.0,
"step": 2630
},
{
"epoch": 66.0,
"grad_norm": 1.719038118608296e-05,
"learning_rate": 5.239063174415466e-05,
"loss": 0.0,
"step": 2640
},
{
"epoch": 66.0,
"eval_loss": 7.857981927372748e-07,
"eval_runtime": 0.3159,
"eval_samples_per_second": 113.963,
"eval_steps_per_second": 15.828,
"step": 2640
},
{
"epoch": 66.25,
"grad_norm": 1.87909827218391e-05,
"learning_rate": 5.1697281197746596e-05,
"loss": 0.0,
"step": 2650
},
{
"epoch": 66.5,
"grad_norm": 1.997711297008209e-05,
"learning_rate": 5.1006946269227376e-05,
"loss": 0.0,
"step": 2660
},
{
"epoch": 66.75,
"grad_norm": 2.0850015062023886e-05,
"learning_rate": 5.03196700573378e-05,
"loss": 0.0,
"step": 2670
},
{
"epoch": 67.0,
"grad_norm": 2.2285566956270486e-05,
"learning_rate": 4.963549546985799e-05,
"loss": 0.0,
"step": 2680
},
{
"epoch": 67.0,
"eval_loss": 7.721130259596976e-07,
"eval_runtime": 0.3244,
"eval_samples_per_second": 110.965,
"eval_steps_per_second": 15.412,
"step": 2680
},
{
"epoch": 67.25,
"grad_norm": 1.6444948414573446e-05,
"learning_rate": 4.895446522092868e-05,
"loss": 0.0,
"step": 2690
},
{
"epoch": 67.5,
"grad_norm": 1.5268993593053892e-05,
"learning_rate": 4.8276621828384225e-05,
"loss": 0.0,
"step": 2700
},
{
"epoch": 67.75,
"grad_norm": 1.7810820281738415e-05,
"learning_rate": 4.760200761109852e-05,
"loss": 0.0,
"step": 2710
},
{
"epoch": 68.0,
"grad_norm": 1.7248778021894395e-05,
"learning_rate": 4.6930664686342526e-05,
"loss": 0.0,
"step": 2720
},
{
"epoch": 68.0,
"eval_loss": 7.603679819112585e-07,
"eval_runtime": 0.3117,
"eval_samples_per_second": 115.513,
"eval_steps_per_second": 16.044,
"step": 2720
},
{
"epoch": 68.25,
"grad_norm": 2.448088525852654e-05,
"learning_rate": 4.626263496715525e-05,
"loss": 0.0,
"step": 2730
},
{
"epoch": 68.5,
"grad_norm": 1.745475674397312e-05,
"learning_rate": 4.559796015972677e-05,
"loss": 0.0,
"step": 2740
},
{
"epoch": 68.75,
"grad_norm": 1.6836595023050904e-05,
"learning_rate": 4.49366817607945e-05,
"loss": 0.0,
"step": 2750
},
{
"epoch": 69.0,
"grad_norm": 2.0379737179609947e-05,
"learning_rate": 4.427884105505251e-05,
"loss": 0.0,
"step": 2760
},
{
"epoch": 69.0,
"eval_loss": 7.604816119055613e-07,
"eval_runtime": 0.3177,
"eval_samples_per_second": 113.329,
"eval_steps_per_second": 15.74,
"step": 2760
},
{
"epoch": 69.25,
"grad_norm": 2.278652391396463e-05,
"learning_rate": 4.362447911257406e-05,
"loss": 0.0,
"step": 2770
},
{
"epoch": 69.5,
"grad_norm": 1.2965742826054338e-05,
"learning_rate": 4.297363678624753e-05,
"loss": 0.0,
"step": 2780
},
{
"epoch": 69.75,
"grad_norm": 1.8777451259666122e-05,
"learning_rate": 4.2326354709225955e-05,
"loss": 0.0,
"step": 2790
},
{
"epoch": 70.0,
"grad_norm": 2.3537781089544296e-05,
"learning_rate": 4.168267329239002e-05,
"loss": 0.0,
"step": 2800
},
{
"epoch": 70.0,
"eval_loss": 7.471541039194562e-07,
"eval_runtime": 0.3194,
"eval_samples_per_second": 112.703,
"eval_steps_per_second": 15.653,
"step": 2800
},
{
"epoch": 70.25,
"grad_norm": 1.4215344890544657e-05,
"learning_rate": 4.104263272182546e-05,
"loss": 0.0,
"step": 2810
},
{
"epoch": 70.5,
"grad_norm": 1.8491147784516215e-05,
"learning_rate": 4.0406272956313895e-05,
"loss": 0.0,
"step": 2820
},
{
"epoch": 70.75,
"grad_norm": 1.7631069567869417e-05,
"learning_rate": 3.9773633724838265e-05,
"loss": 0.0,
"step": 2830
},
{
"epoch": 71.0,
"grad_norm": 1.9227232769480906e-05,
"learning_rate": 3.914475452410257e-05,
"loss": 0.0,
"step": 2840
},
{
"epoch": 71.0,
"eval_loss": 7.375128916464746e-07,
"eval_runtime": 0.321,
"eval_samples_per_second": 112.152,
"eval_steps_per_second": 15.577,
"step": 2840
},
{
"epoch": 71.25,
"grad_norm": 1.6681302440701984e-05,
"learning_rate": 3.8519674616065784e-05,
"loss": 0.0,
"step": 2850
},
{
"epoch": 71.5,
"grad_norm": 1.8769558664644137e-05,
"learning_rate": 3.789843302549096e-05,
"loss": 0.0,
"step": 2860
},
{
"epoch": 71.75,
"grad_norm": 8.559236448490992e-06,
"learning_rate": 3.7281068537508565e-05,
"loss": 0.0,
"step": 2870
},
{
"epoch": 72.0,
"grad_norm": 1.4404205103346612e-05,
"learning_rate": 3.6667619695195285e-05,
"loss": 0.0,
"step": 2880
},
{
"epoch": 72.0,
"eval_loss": 7.320029453694588e-07,
"eval_runtime": 0.3152,
"eval_samples_per_second": 114.223,
"eval_steps_per_second": 15.864,
"step": 2880
},
{
"epoch": 72.25,
"grad_norm": 1.8397522580926307e-05,
"learning_rate": 3.605812479716767e-05,
"loss": 0.0,
"step": 2890
},
{
"epoch": 72.5,
"grad_norm": 1.5880750652286224e-05,
"learning_rate": 3.545262189519092e-05,
"loss": 0.0,
"step": 2900
},
{
"epoch": 72.75,
"grad_norm": 1.8930764781543985e-05,
"learning_rate": 3.4851148791803465e-05,
"loss": 0.0,
"step": 2910
},
{
"epoch": 73.0,
"grad_norm": 4.1914405301213264e-05,
"learning_rate": 3.425374303795675e-05,
"loss": 0.0,
"step": 2920
},
{
"epoch": 73.0,
"eval_loss": 7.22367474281782e-07,
"eval_runtime": 0.319,
"eval_samples_per_second": 112.859,
"eval_steps_per_second": 15.675,
"step": 2920
},
{
"epoch": 73.25,
"grad_norm": 1.0584836672933307e-05,
"learning_rate": 3.3660441930671006e-05,
"loss": 0.0,
"step": 2930
},
{
"epoch": 73.5,
"grad_norm": 1.819963290472515e-05,
"learning_rate": 3.3071282510706624e-05,
"loss": 0.0,
"step": 2940
},
{
"epoch": 73.75,
"grad_norm": 1.8003340301220305e-05,
"learning_rate": 3.248630156025158e-05,
"loss": 0.0,
"step": 2950
},
{
"epoch": 74.0,
"grad_norm": 1.5387213352369145e-05,
"learning_rate": 3.1905535600625314e-05,
"loss": 0.0,
"step": 2960
},
{
"epoch": 74.0,
"eval_loss": 7.147688734221447e-07,
"eval_runtime": 0.3171,
"eval_samples_per_second": 113.526,
"eval_steps_per_second": 15.767,
"step": 2960
},
{
"epoch": 74.25,
"grad_norm": 2.1973037291900255e-05,
"learning_rate": 3.1329020889998306e-05,
"loss": 0.0,
"step": 2970
},
{
"epoch": 74.5,
"grad_norm": 1.8727620044955984e-05,
"learning_rate": 3.075679342112874e-05,
"loss": 0.0,
"step": 2980
},
{
"epoch": 74.75,
"grad_norm": 1.0095293873746414e-05,
"learning_rate": 3.01888889191152e-05,
"loss": 0.0,
"step": 2990
},
{
"epoch": 75.0,
"grad_norm": 1.2027586308249738e-05,
"learning_rate": 2.9625342839166316e-05,
"loss": 0.0,
"step": 3000
},
{
"epoch": 75.0,
"eval_loss": 7.11524990038015e-07,
"eval_runtime": 0.3322,
"eval_samples_per_second": 108.367,
"eval_steps_per_second": 15.051,
"step": 3000
},
{
"epoch": 75.25,
"grad_norm": 2.197036155848764e-05,
"learning_rate": 2.9066190364387437e-05,
"loss": 0.0,
"step": 3010
},
{
"epoch": 75.5,
"grad_norm": 1.3477620086632669e-05,
"learning_rate": 2.8511466403583766e-05,
"loss": 0.0,
"step": 3020
},
{
"epoch": 75.75,
"grad_norm": 1.1739802175725345e-05,
"learning_rate": 2.796120558908124e-05,
"loss": 0.0,
"step": 3030
},
{
"epoch": 76.0,
"grad_norm": 3.1627434509573504e-05,
"learning_rate": 2.7415442274564273e-05,
"loss": 0.0,
"step": 3040
},
{
"epoch": 76.0,
"eval_loss": 7.128418815227633e-07,
"eval_runtime": 0.315,
"eval_samples_per_second": 114.285,
"eval_steps_per_second": 15.873,
"step": 3040
},
{
"epoch": 76.25,
"grad_norm": 9.673092790762894e-06,
"learning_rate": 2.6874210532930855e-05,
"loss": 0.0,
"step": 3050
},
{
"epoch": 76.5,
"grad_norm": 1.989353768294677e-05,
"learning_rate": 2.6337544154165604e-05,
"loss": 0.0,
"step": 3060
},
{
"epoch": 76.75,
"grad_norm": 1.5490039004362188e-05,
"learning_rate": 2.5805476643229952e-05,
"loss": 0.0,
"step": 3070
},
{
"epoch": 77.0,
"grad_norm": 1.1932146662729792e-05,
"learning_rate": 2.527804121797048e-05,
"loss": 0.0,
"step": 3080
},
{
"epoch": 77.0,
"eval_loss": 7.000676305324305e-07,
"eval_runtime": 0.3245,
"eval_samples_per_second": 110.942,
"eval_steps_per_second": 15.409,
"step": 3080
},
{
"epoch": 77.25,
"grad_norm": 1.2189483641122933e-05,
"learning_rate": 2.4755270807045174e-05,
"loss": 0.0,
"step": 3090
},
{
"epoch": 77.5,
"grad_norm": 2.792781378957443e-05,
"learning_rate": 2.423719804786737e-05,
"loss": 0.0,
"step": 3100
},
{
"epoch": 77.75,
"grad_norm": 1.3213076272222679e-05,
"learning_rate": 2.3723855284568462e-05,
"loss": 0.0,
"step": 3110
},
{
"epoch": 78.0,
"grad_norm": 2.2985013856668957e-05,
"learning_rate": 2.321527456597833e-05,
"loss": 0.0,
"step": 3120
},
{
"epoch": 78.0,
"eval_loss": 6.937642069715366e-07,
"eval_runtime": 0.323,
"eval_samples_per_second": 111.463,
"eval_steps_per_second": 15.481,
"step": 3120
},
{
"epoch": 78.25,
"grad_norm": 1.1034126146114431e-05,
"learning_rate": 2.2711487643624675e-05,
"loss": 0.0,
"step": 3130
},
{
"epoch": 78.5,
"grad_norm": 1.3156452041584998e-05,
"learning_rate": 2.2212525969750643e-05,
"loss": 0.0,
"step": 3140
},
{
"epoch": 78.75,
"grad_norm": 1.0150353773497045e-05,
"learning_rate": 2.171842069535116e-05,
"loss": 0.0,
"step": 3150
},
{
"epoch": 79.0,
"grad_norm": 3.457269485807046e-05,
"learning_rate": 2.1229202668228197e-05,
"loss": 0.0,
"step": 3160
},
{
"epoch": 79.0,
"eval_loss": 6.983178195696382e-07,
"eval_runtime": 0.3211,
"eval_samples_per_second": 112.129,
"eval_steps_per_second": 15.573,
"step": 3160
},
{
"epoch": 79.25,
"grad_norm": 1.4804916645516641e-05,
"learning_rate": 2.074490243106485e-05,
"loss": 0.0,
"step": 3170
},
{
"epoch": 79.5,
"grad_norm": 1.8004166122409515e-05,
"learning_rate": 2.026555021951858e-05,
"loss": 0.0,
"step": 3180
},
{
"epoch": 79.75,
"grad_norm": 2.1705473045585677e-05,
"learning_rate": 1.9791175960333487e-05,
"loss": 0.0,
"step": 3190
},
{
"epoch": 80.0,
"grad_norm": 1.0873730388993863e-05,
"learning_rate": 1.932180926947189e-05,
"loss": 0.0,
"step": 3200
},
{
"epoch": 80.0,
"eval_loss": 6.858597316750092e-07,
"eval_runtime": 0.3385,
"eval_samples_per_second": 106.338,
"eval_steps_per_second": 14.769,
"step": 3200
},
{
"epoch": 80.25,
"grad_norm": 1.706531475065276e-05,
"learning_rate": 1.8857479450265503e-05,
"loss": 0.0,
"step": 3210
},
{
"epoch": 80.5,
"grad_norm": 2.120017961715348e-05,
"learning_rate": 1.839821549158579e-05,
"loss": 0.0,
"step": 3220
},
{
"epoch": 80.75,
"grad_norm": 1.3771560588793363e-05,
"learning_rate": 1.794404606603434e-05,
"loss": 0.0,
"step": 3230
},
{
"epoch": 81.0,
"grad_norm": 1.798778430384118e-05,
"learning_rate": 1.74949995281526e-05,
"loss": 0.0,
"step": 3240
},
{
"epoch": 81.0,
"eval_loss": 6.865074624329282e-07,
"eval_runtime": 0.3201,
"eval_samples_per_second": 112.473,
"eval_steps_per_second": 15.621,
"step": 3240
},
{
"epoch": 81.25,
"grad_norm": 1.246057126991218e-05,
"learning_rate": 1.705110391265179e-05,
"loss": 0.0,
"step": 3250
},
{
"epoch": 81.5,
"grad_norm": 2.145354483218398e-05,
"learning_rate": 1.6612386932662627e-05,
"loss": 0.0,
"step": 3260
},
{
"epoch": 81.75,
"grad_norm": 9.91187789622927e-06,
"learning_rate": 1.6178875978005058e-05,
"loss": 0.0,
"step": 3270
},
{
"epoch": 82.0,
"grad_norm": 2.266502815473359e-05,
"learning_rate": 1.57505981134784e-05,
"loss": 0.0,
"step": 3280
},
{
"epoch": 82.0,
"eval_loss": 6.804688723605068e-07,
"eval_runtime": 0.3188,
"eval_samples_per_second": 112.926,
"eval_steps_per_second": 15.684,
"step": 3280
},
{
"epoch": 82.25,
"grad_norm": 1.2877572771685664e-05,
"learning_rate": 1.5327580077171587e-05,
"loss": 0.0,
"step": 3290
},
{
"epoch": 82.5,
"grad_norm": 2.6757013984024525e-05,
"learning_rate": 1.4909848278793782e-05,
"loss": 0.0,
"step": 3300
},
{
"epoch": 82.75,
"grad_norm": 1.6225705621764064e-05,
"learning_rate": 1.4497428798025736e-05,
"loss": 0.0,
"step": 3310
},
{
"epoch": 83.0,
"grad_norm": 1.1286027074675076e-05,
"learning_rate": 1.4090347382891455e-05,
"loss": 0.0,
"step": 3320
},
{
"epoch": 83.0,
"eval_loss": 6.749939984729281e-07,
"eval_runtime": 0.3162,
"eval_samples_per_second": 113.859,
"eval_steps_per_second": 15.814,
"step": 3320
},
{
"epoch": 83.25,
"grad_norm": 1.8980854292749427e-05,
"learning_rate": 1.3688629448150747e-05,
"loss": 0.0,
"step": 3330
},
{
"epoch": 83.5,
"grad_norm": 1.4416699741559569e-05,
"learning_rate": 1.3292300073712615e-05,
"loss": 0.0,
"step": 3340
},
{
"epoch": 83.75,
"grad_norm": 2.0767629393958487e-05,
"learning_rate": 1.2901384003069328e-05,
"loss": 0.0,
"step": 3350
},
{
"epoch": 84.0,
"grad_norm": 1.8946042473544367e-05,
"learning_rate": 1.2515905641751824e-05,
"loss": 0.0,
"step": 3360
},
{
"epoch": 84.0,
"eval_loss": 6.791258897465013e-07,
"eval_runtime": 0.3165,
"eval_samples_per_second": 113.733,
"eval_steps_per_second": 15.796,
"step": 3360
},
{
"epoch": 84.25,
"grad_norm": 2.060659790004138e-05,
"learning_rate": 1.2135889055805837e-05,
"loss": 0.0,
"step": 3370
},
{
"epoch": 84.5,
"grad_norm": 2.793761814245954e-05,
"learning_rate": 1.1761357970289588e-05,
"loss": 0.0,
"step": 3380
},
{
"epoch": 84.75,
"grad_norm": 1.5529620213783346e-05,
"learning_rate": 1.1392335767792505e-05,
"loss": 0.0,
"step": 3390
},
{
"epoch": 85.0,
"grad_norm": 1.4130602721706964e-05,
"learning_rate": 1.1028845486975403e-05,
"loss": 0.0,
"step": 3400
},
{
"epoch": 85.0,
"eval_loss": 6.698858783238393e-07,
"eval_runtime": 0.3198,
"eval_samples_per_second": 112.588,
"eval_steps_per_second": 15.637,
"step": 3400
},
{
"epoch": 85.25,
"grad_norm": 1.6358992070308886e-05,
"learning_rate": 1.0670909821132136e-05,
"loss": 0.0,
"step": 3410
},
{
"epoch": 85.5,
"grad_norm": 3.4115200833184645e-05,
"learning_rate": 1.0318551116772923e-05,
"loss": 0.0,
"step": 3420
},
{
"epoch": 85.75,
"grad_norm": 1.7895346900331788e-05,
"learning_rate": 9.971791372229044e-06,
"loss": 0.0,
"step": 3430
},
{
"epoch": 86.0,
"grad_norm": 1.2962746950506698e-05,
"learning_rate": 9.630652236279625e-06,
"loss": 0.0,
"step": 3440
},
{
"epoch": 86.0,
"eval_loss": 6.754976880074537e-07,
"eval_runtime": 0.3233,
"eval_samples_per_second": 111.361,
"eval_steps_per_second": 15.467,
"step": 3440
},
{
"epoch": 86.25,
"grad_norm": 1.5895795513642952e-05,
"learning_rate": 9.295155006799917e-06,
"loss": 0.0,
"step": 3450
},
{
"epoch": 86.5,
"grad_norm": 1.6078374756034464e-05,
"learning_rate": 8.96532062943175e-06,
"loss": 0.0,
"step": 3460
},
{
"epoch": 86.75,
"grad_norm": 1.0541101801209152e-05,
"learning_rate": 8.641169696275831e-06,
"loss": 0.0,
"step": 3470
},
{
"epoch": 87.0,
"grad_norm": 1.940759102581069e-05,
"learning_rate": 8.322722444606079e-06,
"loss": 0.0,
"step": 3480
},
{
"epoch": 87.0,
"eval_loss": 6.688477469651843e-07,
"eval_runtime": 0.3141,
"eval_samples_per_second": 114.612,
"eval_steps_per_second": 15.918,
"step": 3480
},
{
"epoch": 87.25,
"grad_norm": 1.1041982361348346e-05,
"learning_rate": 8.009998755606263e-06,
"loss": 0.0,
"step": 3490
},
{
"epoch": 87.5,
"grad_norm": 2.7860867703566328e-05,
"learning_rate": 7.703018153128739e-06,
"loss": 0.0,
"step": 3500
},
{
"epoch": 87.75,
"grad_norm": 8.007168617041316e-06,
"learning_rate": 7.401799802475573e-06,
"loss": 0.0,
"step": 3510
},
{
"epoch": 88.0,
"grad_norm": 1.64666762429988e-05,
"learning_rate": 7.106362509202036e-06,
"loss": 0.0,
"step": 3520
},
{
"epoch": 88.0,
"eval_loss": 6.721416525579116e-07,
"eval_runtime": 0.3256,
"eval_samples_per_second": 110.567,
"eval_steps_per_second": 15.357,
"step": 3520
},
{
"epoch": 88.25,
"grad_norm": 1.7286309230257757e-05,
"learning_rate": 6.816724717942435e-06,
"loss": 0.0,
"step": 3530
},
{
"epoch": 88.5,
"grad_norm": 2.7998203222523443e-05,
"learning_rate": 6.532904511258753e-06,
"loss": 0.0,
"step": 3540
},
{
"epoch": 88.75,
"grad_norm": 1.3463857612805441e-05,
"learning_rate": 6.254919608511544e-06,
"loss": 0.0,
"step": 3550
},
{
"epoch": 89.0,
"grad_norm": 1.6592677638982423e-05,
"learning_rate": 5.982787364753872e-06,
"loss": 0.0,
"step": 3560
},
{
"epoch": 89.0,
"eval_loss": 6.658329425590637e-07,
"eval_runtime": 0.3184,
"eval_samples_per_second": 113.061,
"eval_steps_per_second": 15.703,
"step": 3560
},
{
"epoch": 89.25,
"grad_norm": 2.4364608179894276e-05,
"learning_rate": 5.716524769647646e-06,
"loss": 0.0,
"step": 3570
},
{
"epoch": 89.5,
"grad_norm": 1.557578070787713e-05,
"learning_rate": 5.456148446402976e-06,
"loss": 0.0,
"step": 3580
},
{
"epoch": 89.75,
"grad_norm": 9.06794684851775e-06,
"learning_rate": 5.2016746507404295e-06,
"loss": 0.0,
"step": 3590
},
{
"epoch": 90.0,
"grad_norm": 2.0602865333785303e-05,
"learning_rate": 4.953119269876061e-06,
"loss": 0.0,
"step": 3600
},
{
"epoch": 90.0,
"eval_loss": 6.674051178379159e-07,
"eval_runtime": 0.3268,
"eval_samples_per_second": 110.152,
"eval_steps_per_second": 15.299,
"step": 3600
},
{
"epoch": 90.25,
"grad_norm": 2.491854138497729e-05,
"learning_rate": 4.710497821529625e-06,
"loss": 0.0,
"step": 3610
},
{
"epoch": 90.5,
"grad_norm": 1.2203651749587152e-05,
"learning_rate": 4.473825452955716e-06,
"loss": 0.0,
"step": 3620
},
{
"epoch": 90.75,
"grad_norm": 2.5209032173734158e-05,
"learning_rate": 4.2431169399981485e-06,
"loss": 0.0,
"step": 3630
},
{
"epoch": 91.0,
"grad_norm": 1.514551604486769e-05,
"learning_rate": 4.018386686167452e-06,
"loss": 0.0,
"step": 3640
},
{
"epoch": 91.0,
"eval_loss": 6.590207135559467e-07,
"eval_runtime": 0.3159,
"eval_samples_per_second": 113.952,
"eval_steps_per_second": 15.827,
"step": 3640
},
{
"epoch": 91.25,
"grad_norm": 9.24188134376891e-06,
"learning_rate": 3.7996487217416223e-06,
"loss": 0.0,
"step": 3650
},
{
"epoch": 91.5,
"grad_norm": 1.9695198716362938e-05,
"learning_rate": 3.5869167028902195e-06,
"loss": 0.0,
"step": 3660
},
{
"epoch": 91.75,
"grad_norm": 8.883437658369076e-06,
"learning_rate": 3.380203910821833e-06,
"loss": 0.0,
"step": 3670
},
{
"epoch": 92.0,
"grad_norm": 3.180091880494729e-05,
"learning_rate": 3.1795232509547633e-06,
"loss": 0.0,
"step": 3680
},
{
"epoch": 92.0,
"eval_loss": 6.601708264497574e-07,
"eval_runtime": 0.3151,
"eval_samples_per_second": 114.24,
"eval_steps_per_second": 15.867,
"step": 3680
},
{
"epoch": 92.25,
"grad_norm": 2.3148995751398616e-05,
"learning_rate": 2.98488725211149e-06,
"loss": 0.0,
"step": 3690
},
{
"epoch": 92.5,
"grad_norm": 1.9166613128618337e-05,
"learning_rate": 2.796308065736364e-06,
"loss": 0.0,
"step": 3700
},
{
"epoch": 92.75,
"grad_norm": 1.4863600881653838e-05,
"learning_rate": 2.6137974651370134e-06,
"loss": 0.0,
"step": 3710
},
{
"epoch": 93.0,
"grad_norm": 1.7678094081929885e-05,
"learning_rate": 2.4373668447493224e-06,
"loss": 0.0,
"step": 3720
},
{
"epoch": 93.0,
"eval_loss": 6.622615842388768e-07,
"eval_runtime": 0.3193,
"eval_samples_per_second": 112.738,
"eval_steps_per_second": 15.658,
"step": 3720
},
{
"epoch": 93.25,
"grad_norm": 2.3845455871196464e-05,
"learning_rate": 2.2670272194260324e-06,
"loss": 0.0,
"step": 3730
},
{
"epoch": 93.5,
"grad_norm": 1.4557038412021939e-05,
"learning_rate": 2.102789223749102e-06,
"loss": 0.0,
"step": 3740
},
{
"epoch": 93.75,
"grad_norm": 2.4488541384926066e-05,
"learning_rate": 1.9446631113657187e-06,
"loss": 0.0,
"step": 3750
},
{
"epoch": 94.0,
"grad_norm": 1.9359116777195595e-05,
"learning_rate": 1.7926587543482088e-06,
"loss": 0.0,
"step": 3760
},
{
"epoch": 94.0,
"eval_loss": 6.639800744778768e-07,
"eval_runtime": 0.3201,
"eval_samples_per_second": 112.453,
"eval_steps_per_second": 15.618,
"step": 3760
},
{
"epoch": 94.25,
"grad_norm": 1.9722852812265046e-05,
"learning_rate": 1.6467856425776863e-06,
"loss": 0.0,
"step": 3770
},
{
"epoch": 94.5,
"grad_norm": 1.831287045206409e-05,
"learning_rate": 1.5070528831515384e-06,
"loss": 0.0,
"step": 3780
},
{
"epoch": 94.75,
"grad_norm": 2.3000593500910327e-05,
"learning_rate": 1.3734691998149474e-06,
"loss": 0.0,
"step": 3790
},
{
"epoch": 95.0,
"grad_norm": 1.1854316653625574e-05,
"learning_rate": 1.246042932416136e-06,
"loss": 0.0,
"step": 3800
},
{
"epoch": 95.0,
"eval_loss": 6.561905934177048e-07,
"eval_runtime": 0.318,
"eval_samples_per_second": 113.209,
"eval_steps_per_second": 15.724,
"step": 3800
},
{
"epoch": 95.25,
"grad_norm": 1.4117299542704131e-05,
"learning_rate": 1.1247820363858075e-06,
"loss": 0.0,
"step": 3810
},
{
"epoch": 95.5,
"grad_norm": 1.986858478630893e-05,
"learning_rate": 1.00969408224042e-06,
"loss": 0.0,
"step": 3820
},
{
"epoch": 95.75,
"grad_norm": 2.425446109555196e-05,
"learning_rate": 9.007862551095314e-07,
"loss": 0.0,
"step": 3830
},
{
"epoch": 96.0,
"grad_norm": 1.7652260794420727e-05,
"learning_rate": 7.980653542872584e-07,
"loss": 0.0,
"step": 3840
},
{
"epoch": 96.0,
"eval_loss": 6.501233542621776e-07,
"eval_runtime": 0.3244,
"eval_samples_per_second": 110.979,
"eval_steps_per_second": 15.414,
"step": 3840
},
{
"epoch": 96.25,
"grad_norm": 1.0838626621989533e-05,
"learning_rate": 7.015377928077827e-07,
"loss": 0.0,
"step": 3850
},
{
"epoch": 96.5,
"grad_norm": 1.3126472367730457e-05,
"learning_rate": 6.11209597044926e-07,
"loss": 0.0,
"step": 3860
},
{
"epoch": 96.75,
"grad_norm": 2.100517667713575e-05,
"learning_rate": 5.27086406335997e-07,
"loss": 0.0,
"step": 3870
},
{
"epoch": 97.0,
"grad_norm": 1.3467181815940421e-05,
"learning_rate": 4.4917347262962705e-07,
"loss": 0.0,
"step": 3880
},
{
"epoch": 97.0,
"eval_loss": 6.613539653699263e-07,
"eval_runtime": 0.3133,
"eval_samples_per_second": 114.905,
"eval_steps_per_second": 15.959,
"step": 3880
},
{
"epoch": 97.25,
"grad_norm": 2.3665135813644156e-05,
"learning_rate": 3.774756601579443e-07,
"loss": 0.0,
"step": 3890
},
{
"epoch": 97.5,
"grad_norm": 1.761297244229354e-05,
"learning_rate": 3.119974451328833e-07,
"loss": 0.0,
"step": 3900
},
{
"epoch": 97.75,
"grad_norm": 2.0256773495930247e-05,
"learning_rate": 2.5274291546669717e-07,
"loss": 0.0,
"step": 3910
},
{
"epoch": 98.0,
"grad_norm": 1.0930380994977895e-05,
"learning_rate": 1.9971577051678404e-07,
"loss": 0.0,
"step": 3920
},
{
"epoch": 98.0,
"eval_loss": 6.56454744785151e-07,
"eval_runtime": 0.3159,
"eval_samples_per_second": 113.953,
"eval_steps_per_second": 15.827,
"step": 3920
},
{
"epoch": 98.25,
"grad_norm": 2.0974179278709926e-05,
"learning_rate": 1.5291932085468307e-07,
"loss": 0.0,
"step": 3930
},
{
"epoch": 98.5,
"grad_norm": 2.5038380044861697e-05,
"learning_rate": 1.1235648805945075e-07,
"loss": 0.0,
"step": 3940
},
{
"epoch": 98.75,
"grad_norm": 1.5341527614509687e-05,
"learning_rate": 7.802980453519571e-08,
"loss": 0.0,
"step": 3950
},
{
"epoch": 99.0,
"grad_norm": 1.3212208614277188e-05,
"learning_rate": 4.994141335303848e-08,
"loss": 0.0,
"step": 3960
},
{
"epoch": 99.0,
"eval_loss": 6.549934710164962e-07,
"eval_runtime": 0.3268,
"eval_samples_per_second": 110.154,
"eval_steps_per_second": 15.299,
"step": 3960
},
{
"epoch": 99.25,
"grad_norm": 1.4808772903052159e-05,
"learning_rate": 2.8093068117240885e-08,
"loss": 0.0,
"step": 3970
},
{
"epoch": 99.5,
"grad_norm": 1.7599566490389407e-05,
"learning_rate": 1.2486132855826781e-08,
"loss": 0.0,
"step": 3980
},
{
"epoch": 99.75,
"grad_norm": 9.601525562175084e-06,
"learning_rate": 3.121581935328077e-09,
"loss": 0.0,
"step": 3990
},
{
"epoch": 100.0,
"grad_norm": 2.9180186174926348e-05,
"learning_rate": 0.0,
"loss": 0.0,
"step": 4000
},
{
"epoch": 100.0,
"eval_loss": 6.550197895194287e-07,
"eval_runtime": 0.3302,
"eval_samples_per_second": 109.012,
"eval_steps_per_second": 15.141,
"step": 4000
}
],
"logging_steps": 10,
"max_steps": 4000,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.684080299081728e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}