SusGPT_Test / trainer_state.json
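What follows is the raw trainer state log, apparently written by the Hugging Face transformers Trainer: run-level metadata (best_metric, global_step, epoch, eval_steps) followed by a log_history array of per-step training entries (epoch, grad_norm, learning_rate, loss, step) with periodic eval records mixed in. As a quick way to inspect it, here is a minimal sketch, assuming Python with matplotlib installed and a local copy of this file; the file path and plotting choices are illustrative only, not part of the upload.

import json
import matplotlib.pyplot as plt

# Load the state file; the local path is an assumption for this sketch.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training entries; eval records carry eval_loss instead of loss.
train_log = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_log]
loss = [e["loss"] for e in train_log]
grad_norm = [e["grad_norm"] for e in train_log]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(steps, loss)
ax1.set_ylabel("loss")
ax2.plot(steps, grad_norm)
ax2.set_yscale("log")  # grad_norm spans several orders of magnitude in this run
ax2.set_ylabel("grad_norm")
ax2.set_xlabel("step")
plt.show()
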
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9925611052072263,
"eval_steps": 500,
"global_step": 7500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002656748140276302,
"grad_norm": 207.1781768798828,
"learning_rate": 1.9982288345731494e-05,
"loss": 6.5351,
"step": 10
},
{
"epoch": 0.005313496280552604,
"grad_norm": 265.0539855957031,
"learning_rate": 1.9964576691462986e-05,
"loss": 5.2822,
"step": 20
},
{
"epoch": 0.007970244420828906,
"grad_norm": 1933.7158203125,
"learning_rate": 1.9946865037194475e-05,
"loss": 4.7764,
"step": 30
},
{
"epoch": 0.010626992561105207,
"grad_norm": 896.5211791992188,
"learning_rate": 1.9929153382925967e-05,
"loss": 4.5617,
"step": 40
},
{
"epoch": 0.013283740701381509,
"grad_norm": 2147.634765625,
"learning_rate": 1.991144172865746e-05,
"loss": 4.5559,
"step": 50
},
{
"epoch": 0.015940488841657812,
"grad_norm": 1384.8623046875,
"learning_rate": 1.9893730074388952e-05,
"loss": 4.1671,
"step": 60
},
{
"epoch": 0.018597236981934114,
"grad_norm": 3381.87060546875,
"learning_rate": 1.987601842012044e-05,
"loss": 3.9988,
"step": 70
},
{
"epoch": 0.021253985122210415,
"grad_norm": 398.0505676269531,
"learning_rate": 1.985830676585193e-05,
"loss": 4.0844,
"step": 80
},
{
"epoch": 0.023910733262486716,
"grad_norm": 2040.54736328125,
"learning_rate": 1.9840595111583422e-05,
"loss": 3.9493,
"step": 90
},
{
"epoch": 0.026567481402763018,
"grad_norm": 8612.021484375,
"learning_rate": 1.9822883457314914e-05,
"loss": 3.6944,
"step": 100
},
{
"epoch": 0.02922422954303932,
"grad_norm": 22271.3125,
"learning_rate": 1.9805171803046406e-05,
"loss": 4.1335,
"step": 110
},
{
"epoch": 0.031880977683315624,
"grad_norm": 5334.6806640625,
"learning_rate": 1.97874601487779e-05,
"loss": 3.9284,
"step": 120
},
{
"epoch": 0.03453772582359192,
"grad_norm": 1616.4825439453125,
"learning_rate": 1.9769748494509388e-05,
"loss": 3.9407,
"step": 130
},
{
"epoch": 0.03719447396386823,
"grad_norm": 137.30589294433594,
"learning_rate": 1.975203684024088e-05,
"loss": 3.7372,
"step": 140
},
{
"epoch": 0.039851222104144525,
"grad_norm": 2417.81982421875,
"learning_rate": 1.9734325185972372e-05,
"loss": 3.6944,
"step": 150
},
{
"epoch": 0.04250797024442083,
"grad_norm": 7971.87451171875,
"learning_rate": 1.9716613531703864e-05,
"loss": 3.6615,
"step": 160
},
{
"epoch": 0.04516471838469713,
"grad_norm": 1645.13916015625,
"learning_rate": 1.9698901877435353e-05,
"loss": 3.4582,
"step": 170
},
{
"epoch": 0.04782146652497343,
"grad_norm": 2899.1162109375,
"learning_rate": 1.9681190223166846e-05,
"loss": 3.4193,
"step": 180
},
{
"epoch": 0.05047821466524974,
"grad_norm": 13782.0908203125,
"learning_rate": 1.9663478568898338e-05,
"loss": 3.577,
"step": 190
},
{
"epoch": 0.053134962805526036,
"grad_norm": 7818.07177734375,
"learning_rate": 1.964576691462983e-05,
"loss": 3.2082,
"step": 200
},
{
"epoch": 0.05579171094580234,
"grad_norm": 14882.34375,
"learning_rate": 1.962805526036132e-05,
"loss": 3.1947,
"step": 210
},
{
"epoch": 0.05844845908607864,
"grad_norm": 27526.642578125,
"learning_rate": 1.961034360609281e-05,
"loss": 3.23,
"step": 220
},
{
"epoch": 0.06110520722635494,
"grad_norm": 9511.650390625,
"learning_rate": 1.95926319518243e-05,
"loss": 3.0386,
"step": 230
},
{
"epoch": 0.06376195536663125,
"grad_norm": 2172.15625,
"learning_rate": 1.9574920297555792e-05,
"loss": 3.1756,
"step": 240
},
{
"epoch": 0.06641870350690754,
"grad_norm": 11950.30078125,
"learning_rate": 1.9557208643287285e-05,
"loss": 3.36,
"step": 250
},
{
"epoch": 0.06907545164718384,
"grad_norm": 17726.330078125,
"learning_rate": 1.9539496989018777e-05,
"loss": 3.0231,
"step": 260
},
{
"epoch": 0.07173219978746015,
"grad_norm": 4690.27587890625,
"learning_rate": 1.9521785334750266e-05,
"loss": 3.0029,
"step": 270
},
{
"epoch": 0.07438894792773645,
"grad_norm": 40308.61328125,
"learning_rate": 1.9504073680481758e-05,
"loss": 3.688,
"step": 280
},
{
"epoch": 0.07704569606801276,
"grad_norm": 27147.087890625,
"learning_rate": 1.948636202621325e-05,
"loss": 3.3881,
"step": 290
},
{
"epoch": 0.07970244420828905,
"grad_norm": 59977.046875,
"learning_rate": 1.9468650371944743e-05,
"loss": 3.4571,
"step": 300
},
{
"epoch": 0.08235919234856535,
"grad_norm": 66940.046875,
"learning_rate": 1.9450938717676235e-05,
"loss": 3.3864,
"step": 310
},
{
"epoch": 0.08501594048884166,
"grad_norm": 5094.89013671875,
"learning_rate": 1.9433227063407724e-05,
"loss": 3.3697,
"step": 320
},
{
"epoch": 0.08767268862911796,
"grad_norm": 4367.36474609375,
"learning_rate": 1.9415515409139216e-05,
"loss": 3.3016,
"step": 330
},
{
"epoch": 0.09032943676939426,
"grad_norm": 7941.5458984375,
"learning_rate": 1.9397803754870705e-05,
"loss": 3.0374,
"step": 340
},
{
"epoch": 0.09298618490967056,
"grad_norm": 3960.741943359375,
"learning_rate": 1.9380092100602197e-05,
"loss": 3.3324,
"step": 350
},
{
"epoch": 0.09564293304994687,
"grad_norm": 18565.732421875,
"learning_rate": 1.936238044633369e-05,
"loss": 3.2402,
"step": 360
},
{
"epoch": 0.09829968119022317,
"grad_norm": 66859.0,
"learning_rate": 1.9344668792065178e-05,
"loss": 3.3142,
"step": 370
},
{
"epoch": 0.10095642933049948,
"grad_norm": 1521.879638671875,
"learning_rate": 1.932695713779667e-05,
"loss": 3.0546,
"step": 380
},
{
"epoch": 0.10361317747077577,
"grad_norm": 12662.775390625,
"learning_rate": 1.9309245483528163e-05,
"loss": 3.5396,
"step": 390
},
{
"epoch": 0.10626992561105207,
"grad_norm": 105807.59375,
"learning_rate": 1.9291533829259655e-05,
"loss": 3.5301,
"step": 400
},
{
"epoch": 0.10892667375132838,
"grad_norm": 663547.875,
"learning_rate": 1.9273822174991147e-05,
"loss": 4.28,
"step": 410
},
{
"epoch": 0.11158342189160468,
"grad_norm": 8186676.0,
"learning_rate": 1.9256110520722636e-05,
"loss": 5.9807,
"step": 420
},
{
"epoch": 0.11424017003188097,
"grad_norm": 2142551.25,
"learning_rate": 1.923839886645413e-05,
"loss": 9.4764,
"step": 430
},
{
"epoch": 0.11689691817215728,
"grad_norm": 366486.1875,
"learning_rate": 1.922068721218562e-05,
"loss": 10.9151,
"step": 440
},
{
"epoch": 0.11955366631243358,
"grad_norm": 2276693.0,
"learning_rate": 1.9202975557917113e-05,
"loss": 12.5549,
"step": 450
},
{
"epoch": 0.12221041445270989,
"grad_norm": 2184425.5,
"learning_rate": 1.9185263903648602e-05,
"loss": 13.1915,
"step": 460
},
{
"epoch": 0.12486716259298619,
"grad_norm": 2937578.75,
"learning_rate": 1.9167552249380094e-05,
"loss": 14.2279,
"step": 470
},
{
"epoch": 0.1275239107332625,
"grad_norm": 10091141.0,
"learning_rate": 1.9149840595111583e-05,
"loss": 13.4766,
"step": 480
},
{
"epoch": 0.1301806588735388,
"grad_norm": 5426885.5,
"learning_rate": 1.9132128940843075e-05,
"loss": 14.8065,
"step": 490
},
{
"epoch": 0.13283740701381508,
"grad_norm": 2068535.25,
"learning_rate": 1.9114417286574568e-05,
"loss": 16.3781,
"step": 500
},
{
"epoch": 0.13549415515409138,
"grad_norm": 3599295.0,
"learning_rate": 1.909670563230606e-05,
"loss": 15.1519,
"step": 510
},
{
"epoch": 0.1381509032943677,
"grad_norm": 761431.875,
"learning_rate": 1.907899397803755e-05,
"loss": 15.1124,
"step": 520
},
{
"epoch": 0.140807651434644,
"grad_norm": 933641.375,
"learning_rate": 1.906128232376904e-05,
"loss": 14.1038,
"step": 530
},
{
"epoch": 0.1434643995749203,
"grad_norm": 423861.0625,
"learning_rate": 1.9043570669500533e-05,
"loss": 13.7131,
"step": 540
},
{
"epoch": 0.1461211477151966,
"grad_norm": 5383.50537109375,
"learning_rate": 1.9025859015232026e-05,
"loss": 12.8075,
"step": 550
},
{
"epoch": 0.1487778958554729,
"grad_norm": 3759.12548828125,
"learning_rate": 1.9008147360963514e-05,
"loss": 10.7237,
"step": 560
},
{
"epoch": 0.1514346439957492,
"grad_norm": 2150.089111328125,
"learning_rate": 1.8990435706695007e-05,
"loss": 7.2887,
"step": 570
},
{
"epoch": 0.15409139213602552,
"grad_norm": 3893.645751953125,
"learning_rate": 1.89727240524265e-05,
"loss": 4.8237,
"step": 580
},
{
"epoch": 0.1567481402763018,
"grad_norm": 11881.3046875,
"learning_rate": 1.895501239815799e-05,
"loss": 3.9525,
"step": 590
},
{
"epoch": 0.1594048884165781,
"grad_norm": 14820.740234375,
"learning_rate": 1.8937300743889483e-05,
"loss": 4.6401,
"step": 600
},
{
"epoch": 0.1620616365568544,
"grad_norm": 99031.640625,
"learning_rate": 1.8919589089620972e-05,
"loss": 4.9725,
"step": 610
},
{
"epoch": 0.1647183846971307,
"grad_norm": 47882.5859375,
"learning_rate": 1.890187743535246e-05,
"loss": 4.6917,
"step": 620
},
{
"epoch": 0.16737513283740701,
"grad_norm": 77129.8046875,
"learning_rate": 1.8884165781083953e-05,
"loss": 4.3883,
"step": 630
},
{
"epoch": 0.17003188097768332,
"grad_norm": 85341.125,
"learning_rate": 1.8866454126815446e-05,
"loss": 5.114,
"step": 640
},
{
"epoch": 0.17268862911795962,
"grad_norm": 34883.13671875,
"learning_rate": 1.8848742472546938e-05,
"loss": 4.9715,
"step": 650
},
{
"epoch": 0.17534537725823593,
"grad_norm": 22649.3359375,
"learning_rate": 1.8831030818278427e-05,
"loss": 4.9266,
"step": 660
},
{
"epoch": 0.17800212539851223,
"grad_norm": 59614.453125,
"learning_rate": 1.881331916400992e-05,
"loss": 4.3894,
"step": 670
},
{
"epoch": 0.1806588735387885,
"grad_norm": 13419.771484375,
"learning_rate": 1.879560750974141e-05,
"loss": 4.238,
"step": 680
},
{
"epoch": 0.18331562167906482,
"grad_norm": 26652.462890625,
"learning_rate": 1.8777895855472904e-05,
"loss": 4.5253,
"step": 690
},
{
"epoch": 0.18597236981934112,
"grad_norm": 37440.6015625,
"learning_rate": 1.8760184201204396e-05,
"loss": 4.0546,
"step": 700
},
{
"epoch": 0.18862911795961743,
"grad_norm": 43147.1796875,
"learning_rate": 1.8742472546935885e-05,
"loss": 4.4831,
"step": 710
},
{
"epoch": 0.19128586609989373,
"grad_norm": 143355.296875,
"learning_rate": 1.8724760892667377e-05,
"loss": 4.5257,
"step": 720
},
{
"epoch": 0.19394261424017004,
"grad_norm": 12484.8466796875,
"learning_rate": 1.870704923839887e-05,
"loss": 4.9662,
"step": 730
},
{
"epoch": 0.19659936238044634,
"grad_norm": 10305.0126953125,
"learning_rate": 1.868933758413036e-05,
"loss": 5.3629,
"step": 740
},
{
"epoch": 0.19925611052072265,
"grad_norm": 3247.491943359375,
"learning_rate": 1.867162592986185e-05,
"loss": 5.014,
"step": 750
},
{
"epoch": 0.20191285866099895,
"grad_norm": 2328.57470703125,
"learning_rate": 1.8653914275593343e-05,
"loss": 4.9864,
"step": 760
},
{
"epoch": 0.20456960680127523,
"grad_norm": 16007.7978515625,
"learning_rate": 1.863620262132483e-05,
"loss": 4.5492,
"step": 770
},
{
"epoch": 0.20722635494155153,
"grad_norm": 39521.5078125,
"learning_rate": 1.8618490967056324e-05,
"loss": 4.4608,
"step": 780
},
{
"epoch": 0.20988310308182784,
"grad_norm": 553922.0,
"learning_rate": 1.8600779312787816e-05,
"loss": 4.9998,
"step": 790
},
{
"epoch": 0.21253985122210414,
"grad_norm": 623164.25,
"learning_rate": 1.858306765851931e-05,
"loss": 4.6969,
"step": 800
},
{
"epoch": 0.21519659936238045,
"grad_norm": 849724.3125,
"learning_rate": 1.8565356004250797e-05,
"loss": 5.2992,
"step": 810
},
{
"epoch": 0.21785334750265675,
"grad_norm": 1883489.125,
"learning_rate": 1.854764434998229e-05,
"loss": 5.5446,
"step": 820
},
{
"epoch": 0.22051009564293306,
"grad_norm": 1473608.5,
"learning_rate": 1.8529932695713782e-05,
"loss": 5.6081,
"step": 830
},
{
"epoch": 0.22316684378320936,
"grad_norm": 6046079.5,
"learning_rate": 1.8512221041445274e-05,
"loss": 5.543,
"step": 840
},
{
"epoch": 0.22582359192348567,
"grad_norm": 3414641.75,
"learning_rate": 1.8494509387176763e-05,
"loss": 6.5477,
"step": 850
},
{
"epoch": 0.22848034006376194,
"grad_norm": 3107066.0,
"learning_rate": 1.8476797732908255e-05,
"loss": 6.6238,
"step": 860
},
{
"epoch": 0.23113708820403825,
"grad_norm": 2057658.75,
"learning_rate": 1.8459086078639748e-05,
"loss": 6.6566,
"step": 870
},
{
"epoch": 0.23379383634431455,
"grad_norm": 689954.125,
"learning_rate": 1.8441374424371236e-05,
"loss": 5.6908,
"step": 880
},
{
"epoch": 0.23645058448459086,
"grad_norm": 5757.73388671875,
"learning_rate": 1.842366277010273e-05,
"loss": 4.5477,
"step": 890
},
{
"epoch": 0.23910733262486716,
"grad_norm": 5359.6728515625,
"learning_rate": 1.840595111583422e-05,
"loss": 3.6785,
"step": 900
},
{
"epoch": 0.24176408076514347,
"grad_norm": 2013.8673095703125,
"learning_rate": 1.838823946156571e-05,
"loss": 3.519,
"step": 910
},
{
"epoch": 0.24442082890541977,
"grad_norm": 6289.10888671875,
"learning_rate": 1.8370527807297202e-05,
"loss": 3.6842,
"step": 920
},
{
"epoch": 0.24707757704569608,
"grad_norm": 3089.353759765625,
"learning_rate": 1.8352816153028694e-05,
"loss": 3.6535,
"step": 930
},
{
"epoch": 0.24973432518597238,
"grad_norm": 2002.3780517578125,
"learning_rate": 1.8335104498760187e-05,
"loss": 3.5385,
"step": 940
},
{
"epoch": 0.25239107332624866,
"grad_norm": 5194.0224609375,
"learning_rate": 1.8317392844491676e-05,
"loss": 3.4652,
"step": 950
},
{
"epoch": 0.255047821466525,
"grad_norm": 2200.886962890625,
"learning_rate": 1.8299681190223168e-05,
"loss": 3.6788,
"step": 960
},
{
"epoch": 0.25770456960680127,
"grad_norm": 10148.009765625,
"learning_rate": 1.828196953595466e-05,
"loss": 3.7478,
"step": 970
},
{
"epoch": 0.2603613177470776,
"grad_norm": 2540.3837890625,
"learning_rate": 1.8264257881686152e-05,
"loss": 3.4836,
"step": 980
},
{
"epoch": 0.2630180658873539,
"grad_norm": 2385.15625,
"learning_rate": 1.8246546227417645e-05,
"loss": 3.2733,
"step": 990
},
{
"epoch": 0.26567481402763016,
"grad_norm": 8635.650390625,
"learning_rate": 1.8228834573149134e-05,
"loss": 3.4935,
"step": 1000
},
{
"epoch": 0.2683315621679065,
"grad_norm": 17405.947265625,
"learning_rate": 1.8211122918880626e-05,
"loss": 3.3743,
"step": 1010
},
{
"epoch": 0.27098831030818277,
"grad_norm": 2616.988037109375,
"learning_rate": 1.8193411264612115e-05,
"loss": 4.0444,
"step": 1020
},
{
"epoch": 0.2736450584484591,
"grad_norm": 9487.044921875,
"learning_rate": 1.8175699610343607e-05,
"loss": 3.8644,
"step": 1030
},
{
"epoch": 0.2763018065887354,
"grad_norm": 681.0313110351562,
"learning_rate": 1.81579879560751e-05,
"loss": 3.2198,
"step": 1040
},
{
"epoch": 0.2789585547290117,
"grad_norm": 1654.2945556640625,
"learning_rate": 1.8140276301806588e-05,
"loss": 3.741,
"step": 1050
},
{
"epoch": 0.281615302869288,
"grad_norm": 2555.9970703125,
"learning_rate": 1.812256464753808e-05,
"loss": 3.5377,
"step": 1060
},
{
"epoch": 0.2842720510095643,
"grad_norm": 1187.751220703125,
"learning_rate": 1.8104852993269573e-05,
"loss": 3.6048,
"step": 1070
},
{
"epoch": 0.2869287991498406,
"grad_norm": 2747.8486328125,
"learning_rate": 1.8087141339001065e-05,
"loss": 3.6148,
"step": 1080
},
{
"epoch": 0.2895855472901169,
"grad_norm": 624.16650390625,
"learning_rate": 1.8069429684732557e-05,
"loss": 3.0917,
"step": 1090
},
{
"epoch": 0.2922422954303932,
"grad_norm": 283.41033935546875,
"learning_rate": 1.8051718030464046e-05,
"loss": 3.4423,
"step": 1100
},
{
"epoch": 0.2948990435706695,
"grad_norm": 563.9237670898438,
"learning_rate": 1.8034006376195538e-05,
"loss": 3.1134,
"step": 1110
},
{
"epoch": 0.2975557917109458,
"grad_norm": 419.8347473144531,
"learning_rate": 1.801629472192703e-05,
"loss": 3.3765,
"step": 1120
},
{
"epoch": 0.3002125398512221,
"grad_norm": 328.199462890625,
"learning_rate": 1.7998583067658523e-05,
"loss": 3.1981,
"step": 1130
},
{
"epoch": 0.3028692879914984,
"grad_norm": 1167.4515380859375,
"learning_rate": 1.7980871413390012e-05,
"loss": 2.9826,
"step": 1140
},
{
"epoch": 0.3055260361317747,
"grad_norm": 1590.5523681640625,
"learning_rate": 1.7963159759121504e-05,
"loss": 3.2378,
"step": 1150
},
{
"epoch": 0.30818278427205104,
"grad_norm": 1228.88037109375,
"learning_rate": 1.7945448104852993e-05,
"loss": 3.2167,
"step": 1160
},
{
"epoch": 0.3108395324123273,
"grad_norm": 866.290283203125,
"learning_rate": 1.7927736450584485e-05,
"loss": 2.9749,
"step": 1170
},
{
"epoch": 0.3134962805526036,
"grad_norm": 326.7938537597656,
"learning_rate": 1.7910024796315977e-05,
"loss": 3.111,
"step": 1180
},
{
"epoch": 0.3161530286928799,
"grad_norm": 603.0250854492188,
"learning_rate": 1.789231314204747e-05,
"loss": 3.1647,
"step": 1190
},
{
"epoch": 0.3188097768331562,
"grad_norm": 553.5940551757812,
"learning_rate": 1.787460148777896e-05,
"loss": 3.1094,
"step": 1200
},
{
"epoch": 0.32146652497343253,
"grad_norm": 417.6220703125,
"learning_rate": 1.785688983351045e-05,
"loss": 3.195,
"step": 1210
},
{
"epoch": 0.3241232731137088,
"grad_norm": 745.7908935546875,
"learning_rate": 1.7839178179241943e-05,
"loss": 2.8119,
"step": 1220
},
{
"epoch": 0.32678002125398514,
"grad_norm": 963.697021484375,
"learning_rate": 1.7821466524973435e-05,
"loss": 2.9828,
"step": 1230
},
{
"epoch": 0.3294367693942614,
"grad_norm": 3789.7373046875,
"learning_rate": 1.7803754870704924e-05,
"loss": 2.8971,
"step": 1240
},
{
"epoch": 0.33209351753453775,
"grad_norm": 1777.551025390625,
"learning_rate": 1.7786043216436416e-05,
"loss": 2.8533,
"step": 1250
},
{
"epoch": 0.33475026567481403,
"grad_norm": 725.1536254882812,
"learning_rate": 1.776833156216791e-05,
"loss": 2.6644,
"step": 1260
},
{
"epoch": 0.3374070138150903,
"grad_norm": 2410.62060546875,
"learning_rate": 1.77506199078994e-05,
"loss": 3.058,
"step": 1270
},
{
"epoch": 0.34006376195536664,
"grad_norm": 825.2067260742188,
"learning_rate": 1.7732908253630893e-05,
"loss": 2.7154,
"step": 1280
},
{
"epoch": 0.3427205100956429,
"grad_norm": 835.7099609375,
"learning_rate": 1.7715196599362382e-05,
"loss": 3.5358,
"step": 1290
},
{
"epoch": 0.34537725823591925,
"grad_norm": 2334.035888671875,
"learning_rate": 1.769748494509387e-05,
"loss": 3.2141,
"step": 1300
},
{
"epoch": 0.3480340063761955,
"grad_norm": 1089.702392578125,
"learning_rate": 1.7679773290825363e-05,
"loss": 2.8534,
"step": 1310
},
{
"epoch": 0.35069075451647186,
"grad_norm": 643.6981811523438,
"learning_rate": 1.7662061636556856e-05,
"loss": 3.14,
"step": 1320
},
{
"epoch": 0.35334750265674814,
"grad_norm": 927.3551025390625,
"learning_rate": 1.7644349982288348e-05,
"loss": 3.255,
"step": 1330
},
{
"epoch": 0.35600425079702447,
"grad_norm": 642.1421508789062,
"learning_rate": 1.7626638328019837e-05,
"loss": 2.9875,
"step": 1340
},
{
"epoch": 0.35866099893730075,
"grad_norm": 1514.4876708984375,
"learning_rate": 1.760892667375133e-05,
"loss": 2.7786,
"step": 1350
},
{
"epoch": 0.361317747077577,
"grad_norm": 2913.84912109375,
"learning_rate": 1.759121501948282e-05,
"loss": 2.83,
"step": 1360
},
{
"epoch": 0.36397449521785336,
"grad_norm": 1152.3695068359375,
"learning_rate": 1.7573503365214314e-05,
"loss": 3.316,
"step": 1370
},
{
"epoch": 0.36663124335812963,
"grad_norm": 2364.73876953125,
"learning_rate": 1.7555791710945806e-05,
"loss": 3.1473,
"step": 1380
},
{
"epoch": 0.36928799149840597,
"grad_norm": 1560.827392578125,
"learning_rate": 1.7538080056677295e-05,
"loss": 2.875,
"step": 1390
},
{
"epoch": 0.37194473963868224,
"grad_norm": 672.7749633789062,
"learning_rate": 1.7520368402408787e-05,
"loss": 3.3416,
"step": 1400
},
{
"epoch": 0.3746014877789586,
"grad_norm": 3212.583740234375,
"learning_rate": 1.750265674814028e-05,
"loss": 2.6347,
"step": 1410
},
{
"epoch": 0.37725823591923485,
"grad_norm": 9892.419921875,
"learning_rate": 1.7484945093871768e-05,
"loss": 2.9356,
"step": 1420
},
{
"epoch": 0.3799149840595112,
"grad_norm": 13098.6201171875,
"learning_rate": 1.746723343960326e-05,
"loss": 3.0818,
"step": 1430
},
{
"epoch": 0.38257173219978746,
"grad_norm": 33038.46484375,
"learning_rate": 1.7449521785334753e-05,
"loss": 3.4073,
"step": 1440
},
{
"epoch": 0.38522848034006374,
"grad_norm": 58945.421875,
"learning_rate": 1.743181013106624e-05,
"loss": 3.4505,
"step": 1450
},
{
"epoch": 0.38788522848034007,
"grad_norm": 53823.19921875,
"learning_rate": 1.7414098476797734e-05,
"loss": 3.4398,
"step": 1460
},
{
"epoch": 0.39054197662061635,
"grad_norm": 213358.46875,
"learning_rate": 1.7396386822529226e-05,
"loss": 3.1337,
"step": 1470
},
{
"epoch": 0.3931987247608927,
"grad_norm": 174113.078125,
"learning_rate": 1.7378675168260718e-05,
"loss": 3.6872,
"step": 1480
},
{
"epoch": 0.39585547290116896,
"grad_norm": 110265.9609375,
"learning_rate": 1.7360963513992207e-05,
"loss": 3.5268,
"step": 1490
},
{
"epoch": 0.3985122210414453,
"grad_norm": 125626.78125,
"learning_rate": 1.73432518597237e-05,
"loss": 3.8027,
"step": 1500
},
{
"epoch": 0.40116896918172157,
"grad_norm": 119383.8359375,
"learning_rate": 1.7325540205455192e-05,
"loss": 3.6381,
"step": 1510
},
{
"epoch": 0.4038257173219979,
"grad_norm": 78246.125,
"learning_rate": 1.7307828551186684e-05,
"loss": 3.6688,
"step": 1520
},
{
"epoch": 0.4064824654622742,
"grad_norm": 77016.8671875,
"learning_rate": 1.7290116896918173e-05,
"loss": 3.7796,
"step": 1530
},
{
"epoch": 0.40913921360255046,
"grad_norm": 471759.21875,
"learning_rate": 1.7272405242649665e-05,
"loss": 3.738,
"step": 1540
},
{
"epoch": 0.4117959617428268,
"grad_norm": 108969.1171875,
"learning_rate": 1.7254693588381157e-05,
"loss": 3.4583,
"step": 1550
},
{
"epoch": 0.41445270988310307,
"grad_norm": 44717.91015625,
"learning_rate": 1.7236981934112646e-05,
"loss": 3.0156,
"step": 1560
},
{
"epoch": 0.4171094580233794,
"grad_norm": 56418.765625,
"learning_rate": 1.721927027984414e-05,
"loss": 3.339,
"step": 1570
},
{
"epoch": 0.4197662061636557,
"grad_norm": 82086.234375,
"learning_rate": 1.720155862557563e-05,
"loss": 3.2477,
"step": 1580
},
{
"epoch": 0.422422954303932,
"grad_norm": 38437.12890625,
"learning_rate": 1.718384697130712e-05,
"loss": 3.0923,
"step": 1590
},
{
"epoch": 0.4250797024442083,
"grad_norm": 64070.26953125,
"learning_rate": 1.7166135317038612e-05,
"loss": 3.8784,
"step": 1600
},
{
"epoch": 0.4277364505844846,
"grad_norm": 96363.0078125,
"learning_rate": 1.7148423662770104e-05,
"loss": 3.1945,
"step": 1610
},
{
"epoch": 0.4303931987247609,
"grad_norm": 101021.7578125,
"learning_rate": 1.7130712008501596e-05,
"loss": 2.9785,
"step": 1620
},
{
"epoch": 0.43304994686503717,
"grad_norm": 33741.50390625,
"learning_rate": 1.7113000354233085e-05,
"loss": 3.0544,
"step": 1630
},
{
"epoch": 0.4357066950053135,
"grad_norm": 18486.07421875,
"learning_rate": 1.7095288699964578e-05,
"loss": 3.3951,
"step": 1640
},
{
"epoch": 0.4383634431455898,
"grad_norm": 141817.4375,
"learning_rate": 1.707757704569607e-05,
"loss": 3.8719,
"step": 1650
},
{
"epoch": 0.4410201912858661,
"grad_norm": 18356.125,
"learning_rate": 1.7059865391427562e-05,
"loss": 3.217,
"step": 1660
},
{
"epoch": 0.4436769394261424,
"grad_norm": 75286.890625,
"learning_rate": 1.7042153737159054e-05,
"loss": 3.2279,
"step": 1670
},
{
"epoch": 0.4463336875664187,
"grad_norm": 93692.8671875,
"learning_rate": 1.7024442082890543e-05,
"loss": 3.3421,
"step": 1680
},
{
"epoch": 0.448990435706695,
"grad_norm": 137171.109375,
"learning_rate": 1.7006730428622032e-05,
"loss": 3.4727,
"step": 1690
},
{
"epoch": 0.45164718384697133,
"grad_norm": 143812.296875,
"learning_rate": 1.6989018774353524e-05,
"loss": 3.24,
"step": 1700
},
{
"epoch": 0.4543039319872476,
"grad_norm": 35345.19921875,
"learning_rate": 1.6971307120085017e-05,
"loss": 3.2903,
"step": 1710
},
{
"epoch": 0.4569606801275239,
"grad_norm": 69917.4375,
"learning_rate": 1.695359546581651e-05,
"loss": 3.1309,
"step": 1720
},
{
"epoch": 0.4596174282678002,
"grad_norm": 71451.5859375,
"learning_rate": 1.6935883811547998e-05,
"loss": 3.8151,
"step": 1730
},
{
"epoch": 0.4622741764080765,
"grad_norm": 54897.4375,
"learning_rate": 1.691817215727949e-05,
"loss": 3.7961,
"step": 1740
},
{
"epoch": 0.46493092454835283,
"grad_norm": 42574.12109375,
"learning_rate": 1.6900460503010982e-05,
"loss": 3.3018,
"step": 1750
},
{
"epoch": 0.4675876726886291,
"grad_norm": 118568.609375,
"learning_rate": 1.6882748848742475e-05,
"loss": 3.4044,
"step": 1760
},
{
"epoch": 0.47024442082890544,
"grad_norm": 141536.96875,
"learning_rate": 1.6865037194473967e-05,
"loss": 3.5705,
"step": 1770
},
{
"epoch": 0.4729011689691817,
"grad_norm": 153274.9375,
"learning_rate": 1.6847325540205456e-05,
"loss": 3.7034,
"step": 1780
},
{
"epoch": 0.47555791710945805,
"grad_norm": 121872.7890625,
"learning_rate": 1.6829613885936948e-05,
"loss": 3.6836,
"step": 1790
},
{
"epoch": 0.4782146652497343,
"grad_norm": 101665.6640625,
"learning_rate": 1.681190223166844e-05,
"loss": 3.5983,
"step": 1800
},
{
"epoch": 0.4808714133900106,
"grad_norm": 212873.5,
"learning_rate": 1.6794190577399933e-05,
"loss": 3.3915,
"step": 1810
},
{
"epoch": 0.48352816153028694,
"grad_norm": 19234.345703125,
"learning_rate": 1.677647892313142e-05,
"loss": 3.1403,
"step": 1820
},
{
"epoch": 0.4861849096705632,
"grad_norm": 126968.46875,
"learning_rate": 1.6758767268862914e-05,
"loss": 3.3559,
"step": 1830
},
{
"epoch": 0.48884165781083955,
"grad_norm": 40483.28515625,
"learning_rate": 1.6741055614594403e-05,
"loss": 3.4042,
"step": 1840
},
{
"epoch": 0.4914984059511158,
"grad_norm": 281826.84375,
"learning_rate": 1.6723343960325895e-05,
"loss": 3.5656,
"step": 1850
},
{
"epoch": 0.49415515409139216,
"grad_norm": 112396.421875,
"learning_rate": 1.6705632306057387e-05,
"loss": 3.5217,
"step": 1860
},
{
"epoch": 0.49681190223166843,
"grad_norm": 430567.96875,
"learning_rate": 1.668792065178888e-05,
"loss": 3.784,
"step": 1870
},
{
"epoch": 0.49946865037194477,
"grad_norm": 19857.708984375,
"learning_rate": 1.667020899752037e-05,
"loss": 3.2844,
"step": 1880
},
{
"epoch": 0.502125398512221,
"grad_norm": 153824.828125,
"learning_rate": 1.665249734325186e-05,
"loss": 3.5734,
"step": 1890
},
{
"epoch": 0.5047821466524973,
"grad_norm": 555864.875,
"learning_rate": 1.6634785688983353e-05,
"loss": 3.5042,
"step": 1900
},
{
"epoch": 0.5074388947927736,
"grad_norm": 1425396.625,
"learning_rate": 1.6617074034714845e-05,
"loss": 3.8919,
"step": 1910
},
{
"epoch": 0.51009564293305,
"grad_norm": 1588321.5,
"learning_rate": 1.6599362380446334e-05,
"loss": 3.7013,
"step": 1920
},
{
"epoch": 0.5127523910733263,
"grad_norm": 843313.25,
"learning_rate": 1.6581650726177826e-05,
"loss": 4.0527,
"step": 1930
},
{
"epoch": 0.5154091392136025,
"grad_norm": 121270.0859375,
"learning_rate": 1.656393907190932e-05,
"loss": 3.6732,
"step": 1940
},
{
"epoch": 0.5180658873538788,
"grad_norm": 194603.609375,
"learning_rate": 1.654622741764081e-05,
"loss": 3.5416,
"step": 1950
},
{
"epoch": 0.5207226354941552,
"grad_norm": 103689.84375,
"learning_rate": 1.65285157633723e-05,
"loss": 3.6058,
"step": 1960
},
{
"epoch": 0.5233793836344315,
"grad_norm": 148743.953125,
"learning_rate": 1.6510804109103792e-05,
"loss": 3.4376,
"step": 1970
},
{
"epoch": 0.5260361317747078,
"grad_norm": 23079.94140625,
"learning_rate": 1.649309245483528e-05,
"loss": 3.524,
"step": 1980
},
{
"epoch": 0.528692879914984,
"grad_norm": 12263.953125,
"learning_rate": 1.6475380800566773e-05,
"loss": 3.1242,
"step": 1990
},
{
"epoch": 0.5313496280552603,
"grad_norm": 270958.5625,
"learning_rate": 1.6457669146298265e-05,
"loss": 3.8531,
"step": 2000
},
{
"epoch": 0.5340063761955367,
"grad_norm": 145561.640625,
"learning_rate": 1.6439957492029758e-05,
"loss": 3.104,
"step": 2010
},
{
"epoch": 0.536663124335813,
"grad_norm": 104717.5625,
"learning_rate": 1.6422245837761247e-05,
"loss": 3.3674,
"step": 2020
},
{
"epoch": 0.5393198724760893,
"grad_norm": 112249.3515625,
"learning_rate": 1.640453418349274e-05,
"loss": 3.2119,
"step": 2030
},
{
"epoch": 0.5419766206163655,
"grad_norm": 131700.71875,
"learning_rate": 1.638682252922423e-05,
"loss": 3.6448,
"step": 2040
},
{
"epoch": 0.5446333687566419,
"grad_norm": 119026.4140625,
"learning_rate": 1.6369110874955723e-05,
"loss": 3.0097,
"step": 2050
},
{
"epoch": 0.5472901168969182,
"grad_norm": 103121.09375,
"learning_rate": 1.6351399220687216e-05,
"loss": 3.4205,
"step": 2060
},
{
"epoch": 0.5499468650371945,
"grad_norm": 237787.03125,
"learning_rate": 1.6333687566418704e-05,
"loss": 3.349,
"step": 2070
},
{
"epoch": 0.5526036131774708,
"grad_norm": 49652.95703125,
"learning_rate": 1.6315975912150197e-05,
"loss": 3.1665,
"step": 2080
},
{
"epoch": 0.555260361317747,
"grad_norm": 262178.34375,
"learning_rate": 1.629826425788169e-05,
"loss": 3.4743,
"step": 2090
},
{
"epoch": 0.5579171094580234,
"grad_norm": 130814.703125,
"learning_rate": 1.6280552603613178e-05,
"loss": 3.4995,
"step": 2100
},
{
"epoch": 0.5605738575982997,
"grad_norm": 273671.09375,
"learning_rate": 1.626284094934467e-05,
"loss": 3.3983,
"step": 2110
},
{
"epoch": 0.563230605738576,
"grad_norm": 385060.25,
"learning_rate": 1.6245129295076162e-05,
"loss": 3.5215,
"step": 2120
},
{
"epoch": 0.5658873538788523,
"grad_norm": 165007.71875,
"learning_rate": 1.622741764080765e-05,
"loss": 3.1164,
"step": 2130
},
{
"epoch": 0.5685441020191286,
"grad_norm": 70266.53125,
"learning_rate": 1.6209705986539144e-05,
"loss": 3.1971,
"step": 2140
},
{
"epoch": 0.5712008501594049,
"grad_norm": 271687.3125,
"learning_rate": 1.6191994332270636e-05,
"loss": 3.4932,
"step": 2150
},
{
"epoch": 0.5738575982996812,
"grad_norm": 35143.67578125,
"learning_rate": 1.6174282678002128e-05,
"loss": 3.4214,
"step": 2160
},
{
"epoch": 0.5765143464399575,
"grad_norm": 1173879.625,
"learning_rate": 1.6156571023733617e-05,
"loss": 3.3194,
"step": 2170
},
{
"epoch": 0.5791710945802337,
"grad_norm": 306067.03125,
"learning_rate": 1.613885936946511e-05,
"loss": 3.1417,
"step": 2180
},
{
"epoch": 0.5818278427205101,
"grad_norm": 342329.0625,
"learning_rate": 1.61211477151966e-05,
"loss": 3.355,
"step": 2190
},
{
"epoch": 0.5844845908607864,
"grad_norm": 50600.97265625,
"learning_rate": 1.6103436060928094e-05,
"loss": 3.3974,
"step": 2200
},
{
"epoch": 0.5871413390010627,
"grad_norm": 360589.03125,
"learning_rate": 1.6085724406659583e-05,
"loss": 3.4514,
"step": 2210
},
{
"epoch": 0.589798087141339,
"grad_norm": 94335.3828125,
"learning_rate": 1.6068012752391075e-05,
"loss": 3.2719,
"step": 2220
},
{
"epoch": 0.5924548352816154,
"grad_norm": 53790.76953125,
"learning_rate": 1.6050301098122564e-05,
"loss": 3.3992,
"step": 2230
},
{
"epoch": 0.5951115834218916,
"grad_norm": 107421.421875,
"learning_rate": 1.6032589443854056e-05,
"loss": 3.3512,
"step": 2240
},
{
"epoch": 0.5977683315621679,
"grad_norm": 142487.859375,
"learning_rate": 1.601487778958555e-05,
"loss": 3.826,
"step": 2250
},
{
"epoch": 0.6004250797024442,
"grad_norm": 1261580.75,
"learning_rate": 1.599716613531704e-05,
"loss": 3.7385,
"step": 2260
},
{
"epoch": 0.6030818278427205,
"grad_norm": 648111.0,
"learning_rate": 1.597945448104853e-05,
"loss": 3.2839,
"step": 2270
},
{
"epoch": 0.6057385759829969,
"grad_norm": 326968.125,
"learning_rate": 1.5961742826780022e-05,
"loss": 3.7895,
"step": 2280
},
{
"epoch": 0.6083953241232731,
"grad_norm": 808961.5625,
"learning_rate": 1.5944031172511514e-05,
"loss": 3.7051,
"step": 2290
},
{
"epoch": 0.6110520722635494,
"grad_norm": 2958079.0,
"learning_rate": 1.5926319518243006e-05,
"loss": 3.8099,
"step": 2300
},
{
"epoch": 0.6137088204038257,
"grad_norm": 314874.03125,
"learning_rate": 1.5908607863974495e-05,
"loss": 3.6654,
"step": 2310
},
{
"epoch": 0.6163655685441021,
"grad_norm": 8078548.0,
"learning_rate": 1.5890896209705987e-05,
"loss": 4.018,
"step": 2320
},
{
"epoch": 0.6190223166843783,
"grad_norm": 135695.46875,
"learning_rate": 1.587318455543748e-05,
"loss": 3.6651,
"step": 2330
},
{
"epoch": 0.6216790648246546,
"grad_norm": 18501240.0,
"learning_rate": 1.5855472901168972e-05,
"loss": 4.0069,
"step": 2340
},
{
"epoch": 0.6243358129649309,
"grad_norm": 4980981.5,
"learning_rate": 1.5837761246900464e-05,
"loss": 3.9174,
"step": 2350
},
{
"epoch": 0.6269925611052072,
"grad_norm": 1297274.125,
"learning_rate": 1.5820049592631953e-05,
"loss": 3.3223,
"step": 2360
},
{
"epoch": 0.6296493092454836,
"grad_norm": 1378757.625,
"learning_rate": 1.5802337938363442e-05,
"loss": 3.7712,
"step": 2370
},
{
"epoch": 0.6323060573857598,
"grad_norm": 2027859.875,
"learning_rate": 1.5784626284094934e-05,
"loss": 3.5468,
"step": 2380
},
{
"epoch": 0.6349628055260361,
"grad_norm": 157107.65625,
"learning_rate": 1.5766914629826427e-05,
"loss": 3.5328,
"step": 2390
},
{
"epoch": 0.6376195536663124,
"grad_norm": 1103094.75,
"learning_rate": 1.574920297555792e-05,
"loss": 3.6031,
"step": 2400
},
{
"epoch": 0.6402763018065888,
"grad_norm": 725449.5,
"learning_rate": 1.5731491321289408e-05,
"loss": 3.9276,
"step": 2410
},
{
"epoch": 0.6429330499468651,
"grad_norm": 214425.640625,
"learning_rate": 1.57137796670209e-05,
"loss": 3.517,
"step": 2420
},
{
"epoch": 0.6455897980871413,
"grad_norm": 876419.625,
"learning_rate": 1.5696068012752392e-05,
"loss": 3.4675,
"step": 2430
},
{
"epoch": 0.6482465462274176,
"grad_norm": 1504300.25,
"learning_rate": 1.5678356358483884e-05,
"loss": 3.4772,
"step": 2440
},
{
"epoch": 0.6509032943676939,
"grad_norm": 144657.71875,
"learning_rate": 1.5660644704215377e-05,
"loss": 3.42,
"step": 2450
},
{
"epoch": 0.6535600425079703,
"grad_norm": 371512.40625,
"learning_rate": 1.5642933049946866e-05,
"loss": 3.6802,
"step": 2460
},
{
"epoch": 0.6562167906482466,
"grad_norm": 1322714.5,
"learning_rate": 1.5625221395678358e-05,
"loss": 3.805,
"step": 2470
},
{
"epoch": 0.6588735387885228,
"grad_norm": 218897.765625,
"learning_rate": 1.560750974140985e-05,
"loss": 3.252,
"step": 2480
},
{
"epoch": 0.6615302869287991,
"grad_norm": 1596077.0,
"learning_rate": 1.5589798087141342e-05,
"loss": 3.626,
"step": 2490
},
{
"epoch": 0.6641870350690755,
"grad_norm": 2922875.75,
"learning_rate": 1.557208643287283e-05,
"loss": 3.5045,
"step": 2500
},
{
"epoch": 0.6668437832093518,
"grad_norm": 96812.5859375,
"learning_rate": 1.5554374778604324e-05,
"loss": 3.7078,
"step": 2510
},
{
"epoch": 0.6695005313496281,
"grad_norm": 1580814.125,
"learning_rate": 1.5536663124335812e-05,
"loss": 3.615,
"step": 2520
},
{
"epoch": 0.6721572794899043,
"grad_norm": 235169.53125,
"learning_rate": 1.5518951470067305e-05,
"loss": 3.5076,
"step": 2530
},
{
"epoch": 0.6748140276301806,
"grad_norm": 816632.0,
"learning_rate": 1.5501239815798797e-05,
"loss": 4.0074,
"step": 2540
},
{
"epoch": 0.677470775770457,
"grad_norm": 3783126.5,
"learning_rate": 1.548352816153029e-05,
"loss": 3.7162,
"step": 2550
},
{
"epoch": 0.6801275239107333,
"grad_norm": 1676969.875,
"learning_rate": 1.5465816507261778e-05,
"loss": 3.9383,
"step": 2560
},
{
"epoch": 0.6827842720510096,
"grad_norm": 944205.0,
"learning_rate": 1.544810485299327e-05,
"loss": 3.6335,
"step": 2570
},
{
"epoch": 0.6854410201912858,
"grad_norm": 532299.0,
"learning_rate": 1.5430393198724763e-05,
"loss": 3.776,
"step": 2580
},
{
"epoch": 0.6880977683315622,
"grad_norm": 324683.46875,
"learning_rate": 1.5412681544456255e-05,
"loss": 4.0332,
"step": 2590
},
{
"epoch": 0.6907545164718385,
"grad_norm": 371158.6875,
"learning_rate": 1.5394969890187744e-05,
"loss": 3.2831,
"step": 2600
},
{
"epoch": 0.6934112646121148,
"grad_norm": 626177.8125,
"learning_rate": 1.5377258235919236e-05,
"loss": 3.7419,
"step": 2610
},
{
"epoch": 0.696068012752391,
"grad_norm": 489480.3125,
"learning_rate": 1.535954658165073e-05,
"loss": 3.9135,
"step": 2620
},
{
"epoch": 0.6987247608926673,
"grad_norm": 840057.5625,
"learning_rate": 1.534183492738222e-05,
"loss": 3.6214,
"step": 2630
},
{
"epoch": 0.7013815090329437,
"grad_norm": 641658.4375,
"learning_rate": 1.532412327311371e-05,
"loss": 3.9029,
"step": 2640
},
{
"epoch": 0.70403825717322,
"grad_norm": 1129191.0,
"learning_rate": 1.5306411618845202e-05,
"loss": 3.6271,
"step": 2650
},
{
"epoch": 0.7066950053134963,
"grad_norm": 758676.8125,
"learning_rate": 1.528869996457669e-05,
"loss": 3.8411,
"step": 2660
},
{
"epoch": 0.7093517534537725,
"grad_norm": 946755.25,
"learning_rate": 1.5270988310308183e-05,
"loss": 3.8184,
"step": 2670
},
{
"epoch": 0.7120085015940489,
"grad_norm": 1282365.625,
"learning_rate": 1.5253276656039675e-05,
"loss": 3.8393,
"step": 2680
},
{
"epoch": 0.7146652497343252,
"grad_norm": 1212575.875,
"learning_rate": 1.5235565001771166e-05,
"loss": 3.6106,
"step": 2690
},
{
"epoch": 0.7173219978746015,
"grad_norm": 2197153.75,
"learning_rate": 1.5217853347502658e-05,
"loss": 3.5554,
"step": 2700
},
{
"epoch": 0.7199787460148778,
"grad_norm": 621252.1875,
"learning_rate": 1.520014169323415e-05,
"loss": 3.3832,
"step": 2710
},
{
"epoch": 0.722635494155154,
"grad_norm": 243552.59375,
"learning_rate": 1.5182430038965641e-05,
"loss": 3.4785,
"step": 2720
},
{
"epoch": 0.7252922422954304,
"grad_norm": 3559921.0,
"learning_rate": 1.5164718384697133e-05,
"loss": 3.7972,
"step": 2730
},
{
"epoch": 0.7279489904357067,
"grad_norm": 8816077.0,
"learning_rate": 1.5147006730428624e-05,
"loss": 3.6698,
"step": 2740
},
{
"epoch": 0.730605738575983,
"grad_norm": 2959412.0,
"learning_rate": 1.5129295076160116e-05,
"loss": 3.9389,
"step": 2750
},
{
"epoch": 0.7332624867162593,
"grad_norm": 13276429.0,
"learning_rate": 1.5111583421891607e-05,
"loss": 3.6811,
"step": 2760
},
{
"epoch": 0.7359192348565357,
"grad_norm": 24583468.0,
"learning_rate": 1.5093871767623095e-05,
"loss": 3.9955,
"step": 2770
},
{
"epoch": 0.7385759829968119,
"grad_norm": 11388400.0,
"learning_rate": 1.5076160113354588e-05,
"loss": 3.4851,
"step": 2780
},
{
"epoch": 0.7412327311370882,
"grad_norm": 2901875.5,
"learning_rate": 1.5058448459086078e-05,
"loss": 4.0118,
"step": 2790
},
{
"epoch": 0.7438894792773645,
"grad_norm": 7893670.0,
"learning_rate": 1.504073680481757e-05,
"loss": 4.3674,
"step": 2800
},
{
"epoch": 0.7465462274176408,
"grad_norm": 13170602.0,
"learning_rate": 1.5023025150549063e-05,
"loss": 3.5882,
"step": 2810
},
{
"epoch": 0.7492029755579172,
"grad_norm": 12720932.0,
"learning_rate": 1.5005313496280553e-05,
"loss": 4.7013,
"step": 2820
},
{
"epoch": 0.7518597236981934,
"grad_norm": 7461363.0,
"learning_rate": 1.4987601842012046e-05,
"loss": 3.5194,
"step": 2830
},
{
"epoch": 0.7545164718384697,
"grad_norm": 3747000.25,
"learning_rate": 1.4969890187743536e-05,
"loss": 3.9811,
"step": 2840
},
{
"epoch": 0.757173219978746,
"grad_norm": 2111091.0,
"learning_rate": 1.4952178533475028e-05,
"loss": 3.3212,
"step": 2850
},
{
"epoch": 0.7598299681190224,
"grad_norm": 4919647.5,
"learning_rate": 1.4934466879206519e-05,
"loss": 4.0383,
"step": 2860
},
{
"epoch": 0.7624867162592986,
"grad_norm": 3595169.25,
"learning_rate": 1.4916755224938011e-05,
"loss": 3.7293,
"step": 2870
},
{
"epoch": 0.7651434643995749,
"grad_norm": 1647251.75,
"learning_rate": 1.4899043570669502e-05,
"loss": 4.166,
"step": 2880
},
{
"epoch": 0.7678002125398512,
"grad_norm": 4398145.0,
"learning_rate": 1.4881331916400994e-05,
"loss": 3.4454,
"step": 2890
},
{
"epoch": 0.7704569606801275,
"grad_norm": 3135213.0,
"learning_rate": 1.4863620262132485e-05,
"loss": 4.0135,
"step": 2900
},
{
"epoch": 0.7731137088204039,
"grad_norm": 7072787.0,
"learning_rate": 1.4845908607863975e-05,
"loss": 3.4145,
"step": 2910
},
{
"epoch": 0.7757704569606801,
"grad_norm": 2635511.75,
"learning_rate": 1.4828196953595466e-05,
"loss": 3.8201,
"step": 2920
},
{
"epoch": 0.7784272051009564,
"grad_norm": 4616754.5,
"learning_rate": 1.4810485299326958e-05,
"loss": 4.1764,
"step": 2930
},
{
"epoch": 0.7810839532412327,
"grad_norm": 877153.0,
"learning_rate": 1.4792773645058449e-05,
"loss": 3.9471,
"step": 2940
},
{
"epoch": 0.7837407013815091,
"grad_norm": 569671.3125,
"learning_rate": 1.4775061990789941e-05,
"loss": 3.7697,
"step": 2950
},
{
"epoch": 0.7863974495217854,
"grad_norm": 810236.125,
"learning_rate": 1.4757350336521432e-05,
"loss": 4.4753,
"step": 2960
},
{
"epoch": 0.7890541976620616,
"grad_norm": 877906.875,
"learning_rate": 1.4739638682252924e-05,
"loss": 3.6654,
"step": 2970
},
{
"epoch": 0.7917109458023379,
"grad_norm": 481885.46875,
"learning_rate": 1.4721927027984414e-05,
"loss": 4.1253,
"step": 2980
},
{
"epoch": 0.7943676939426142,
"grad_norm": 1338787.0,
"learning_rate": 1.4704215373715907e-05,
"loss": 4.0294,
"step": 2990
},
{
"epoch": 0.7970244420828906,
"grad_norm": 1250065.875,
"learning_rate": 1.4686503719447397e-05,
"loss": 4.7282,
"step": 3000
},
{
"epoch": 0.7996811902231669,
"grad_norm": 1604171.375,
"learning_rate": 1.466879206517889e-05,
"loss": 4.0439,
"step": 3010
},
{
"epoch": 0.8023379383634431,
"grad_norm": 512070.90625,
"learning_rate": 1.4651080410910382e-05,
"loss": 3.5779,
"step": 3020
},
{
"epoch": 0.8049946865037194,
"grad_norm": 312113.46875,
"learning_rate": 1.4633368756641872e-05,
"loss": 3.6514,
"step": 3030
},
{
"epoch": 0.8076514346439958,
"grad_norm": 23779.923828125,
"learning_rate": 1.4615657102373361e-05,
"loss": 3.8136,
"step": 3040
},
{
"epoch": 0.8103081827842721,
"grad_norm": 8204.794921875,
"learning_rate": 1.4597945448104854e-05,
"loss": 4.1336,
"step": 3050
},
{
"epoch": 0.8129649309245484,
"grad_norm": 76479.1640625,
"learning_rate": 1.4580233793836344e-05,
"loss": 3.4411,
"step": 3060
},
{
"epoch": 0.8156216790648246,
"grad_norm": 66624.71875,
"learning_rate": 1.4562522139567836e-05,
"loss": 3.8493,
"step": 3070
},
{
"epoch": 0.8182784272051009,
"grad_norm": 22607.904296875,
"learning_rate": 1.4544810485299327e-05,
"loss": 3.2428,
"step": 3080
},
{
"epoch": 0.8209351753453773,
"grad_norm": 119469.640625,
"learning_rate": 1.452709883103082e-05,
"loss": 3.4363,
"step": 3090
},
{
"epoch": 0.8235919234856536,
"grad_norm": 108868.203125,
"learning_rate": 1.4509387176762311e-05,
"loss": 3.5903,
"step": 3100
},
{
"epoch": 0.8262486716259299,
"grad_norm": 5543388.0,
"learning_rate": 1.4491675522493802e-05,
"loss": 3.7918,
"step": 3110
},
{
"epoch": 0.8289054197662061,
"grad_norm": 2565445.75,
"learning_rate": 1.4473963868225294e-05,
"loss": 3.8573,
"step": 3120
},
{
"epoch": 0.8315621679064825,
"grad_norm": 702086.4375,
"learning_rate": 1.4456252213956785e-05,
"loss": 3.3944,
"step": 3130
},
{
"epoch": 0.8342189160467588,
"grad_norm": 115243.6484375,
"learning_rate": 1.4438540559688277e-05,
"loss": 3.2222,
"step": 3140
},
{
"epoch": 0.8368756641870351,
"grad_norm": 476268.625,
"learning_rate": 1.4420828905419768e-05,
"loss": 3.6144,
"step": 3150
},
{
"epoch": 0.8395324123273114,
"grad_norm": 65992.0,
"learning_rate": 1.440311725115126e-05,
"loss": 3.1891,
"step": 3160
},
{
"epoch": 0.8421891604675876,
"grad_norm": 1161863.375,
"learning_rate": 1.438540559688275e-05,
"loss": 3.6714,
"step": 3170
},
{
"epoch": 0.844845908607864,
"grad_norm": 185466.84375,
"learning_rate": 1.4367693942614241e-05,
"loss": 3.4372,
"step": 3180
},
{
"epoch": 0.8475026567481403,
"grad_norm": 56940.96875,
"learning_rate": 1.4349982288345732e-05,
"loss": 3.7385,
"step": 3190
},
{
"epoch": 0.8501594048884166,
"grad_norm": 99763.78125,
"learning_rate": 1.4332270634077224e-05,
"loss": 3.5612,
"step": 3200
},
{
"epoch": 0.8528161530286928,
"grad_norm": 91525.1328125,
"learning_rate": 1.4314558979808715e-05,
"loss": 3.6116,
"step": 3210
},
{
"epoch": 0.8554729011689692,
"grad_norm": 23506.251953125,
"learning_rate": 1.4296847325540207e-05,
"loss": 3.4268,
"step": 3220
},
{
"epoch": 0.8581296493092455,
"grad_norm": 36794.52734375,
"learning_rate": 1.4279135671271697e-05,
"loss": 3.7912,
"step": 3230
},
{
"epoch": 0.8607863974495218,
"grad_norm": 14971.548828125,
"learning_rate": 1.426142401700319e-05,
"loss": 3.7623,
"step": 3240
},
{
"epoch": 0.8634431455897981,
"grad_norm": 29957.119140625,
"learning_rate": 1.424371236273468e-05,
"loss": 3.5765,
"step": 3250
},
{
"epoch": 0.8660998937300743,
"grad_norm": 24691.1796875,
"learning_rate": 1.4226000708466172e-05,
"loss": 3.4663,
"step": 3260
},
{
"epoch": 0.8687566418703507,
"grad_norm": 21935.2734375,
"learning_rate": 1.4208289054197663e-05,
"loss": 3.6494,
"step": 3270
},
{
"epoch": 0.871413390010627,
"grad_norm": 26350.591796875,
"learning_rate": 1.4190577399929155e-05,
"loss": 3.5611,
"step": 3280
},
{
"epoch": 0.8740701381509033,
"grad_norm": 30286.142578125,
"learning_rate": 1.4172865745660646e-05,
"loss": 3.7046,
"step": 3290
},
{
"epoch": 0.8767268862911796,
"grad_norm": 6965.02734375,
"learning_rate": 1.4155154091392138e-05,
"loss": 3.9012,
"step": 3300
},
{
"epoch": 0.879383634431456,
"grad_norm": 34496.1171875,
"learning_rate": 1.4137442437123627e-05,
"loss": 3.5102,
"step": 3310
},
{
"epoch": 0.8820403825717322,
"grad_norm": 15867.46875,
"learning_rate": 1.411973078285512e-05,
"loss": 3.9485,
"step": 3320
},
{
"epoch": 0.8846971307120085,
"grad_norm": 8408.2509765625,
"learning_rate": 1.410201912858661e-05,
"loss": 4.0955,
"step": 3330
},
{
"epoch": 0.8873538788522848,
"grad_norm": 12868.8935546875,
"learning_rate": 1.4084307474318102e-05,
"loss": 3.8902,
"step": 3340
},
{
"epoch": 0.8900106269925611,
"grad_norm": 39027.8125,
"learning_rate": 1.4066595820049593e-05,
"loss": 3.7809,
"step": 3350
},
{
"epoch": 0.8926673751328374,
"grad_norm": 30144.494140625,
"learning_rate": 1.4048884165781085e-05,
"loss": 3.8368,
"step": 3360
},
{
"epoch": 0.8953241232731137,
"grad_norm": 14916.984375,
"learning_rate": 1.4031172511512576e-05,
"loss": 3.8361,
"step": 3370
},
{
"epoch": 0.89798087141339,
"grad_norm": 10657.8974609375,
"learning_rate": 1.4013460857244068e-05,
"loss": 3.9388,
"step": 3380
},
{
"epoch": 0.9006376195536663,
"grad_norm": 20504.70703125,
"learning_rate": 1.399574920297556e-05,
"loss": 4.257,
"step": 3390
},
{
"epoch": 0.9032943676939427,
"grad_norm": 32460.078125,
"learning_rate": 1.397803754870705e-05,
"loss": 4.0817,
"step": 3400
},
{
"epoch": 0.905951115834219,
"grad_norm": 6730.14404296875,
"learning_rate": 1.3960325894438543e-05,
"loss": 4.2065,
"step": 3410
},
{
"epoch": 0.9086078639744952,
"grad_norm": 17531.017578125,
"learning_rate": 1.3942614240170034e-05,
"loss": 3.5729,
"step": 3420
},
{
"epoch": 0.9112646121147715,
"grad_norm": 17859.064453125,
"learning_rate": 1.3924902585901526e-05,
"loss": 4.3419,
"step": 3430
},
{
"epoch": 0.9139213602550478,
"grad_norm": 99839.4296875,
"learning_rate": 1.3907190931633016e-05,
"loss": 3.9653,
"step": 3440
},
{
"epoch": 0.9165781083953242,
"grad_norm": 13036.796875,
"learning_rate": 1.3889479277364505e-05,
"loss": 3.8463,
"step": 3450
},
{
"epoch": 0.9192348565356004,
"grad_norm": 54209.05859375,
"learning_rate": 1.3871767623095998e-05,
"loss": 3.9493,
"step": 3460
},
{
"epoch": 0.9218916046758767,
"grad_norm": 227248.34375,
"learning_rate": 1.385405596882749e-05,
"loss": 3.7791,
"step": 3470
},
{
"epoch": 0.924548352816153,
"grad_norm": 856476.3125,
"learning_rate": 1.383634431455898e-05,
"loss": 4.2208,
"step": 3480
},
{
"epoch": 0.9272051009564294,
"grad_norm": 373248.40625,
"learning_rate": 1.3818632660290473e-05,
"loss": 4.6665,
"step": 3490
},
{
"epoch": 0.9298618490967057,
"grad_norm": 476773.1875,
"learning_rate": 1.3800921006021963e-05,
"loss": 4.3373,
"step": 3500
},
{
"epoch": 0.9325185972369819,
"grad_norm": 3948952.0,
"learning_rate": 1.3783209351753455e-05,
"loss": 3.9872,
"step": 3510
},
{
"epoch": 0.9351753453772582,
"grad_norm": 131342.296875,
"learning_rate": 1.3765497697484946e-05,
"loss": 4.0315,
"step": 3520
},
{
"epoch": 0.9378320935175345,
"grad_norm": 1021533.8125,
"learning_rate": 1.3747786043216438e-05,
"loss": 3.898,
"step": 3530
},
{
"epoch": 0.9404888416578109,
"grad_norm": 70664288.0,
"learning_rate": 1.3730074388947929e-05,
"loss": 4.0261,
"step": 3540
},
{
"epoch": 0.9431455897980872,
"grad_norm": 1955257.25,
"learning_rate": 1.3712362734679421e-05,
"loss": 3.9837,
"step": 3550
},
{
"epoch": 0.9458023379383634,
"grad_norm": 10510368.0,
"learning_rate": 1.3694651080410912e-05,
"loss": 4.2089,
"step": 3560
},
{
"epoch": 0.9484590860786397,
"grad_norm": 4540049.0,
"learning_rate": 1.3676939426142404e-05,
"loss": 4.0757,
"step": 3570
},
{
"epoch": 0.9511158342189161,
"grad_norm": 1934832.5,
"learning_rate": 1.3659227771873893e-05,
"loss": 3.8116,
"step": 3580
},
{
"epoch": 0.9537725823591924,
"grad_norm": 721523.875,
"learning_rate": 1.3641516117605385e-05,
"loss": 3.8604,
"step": 3590
},
{
"epoch": 0.9564293304994687,
"grad_norm": 3694456.5,
"learning_rate": 1.3623804463336876e-05,
"loss": 4.3438,
"step": 3600
},
{
"epoch": 0.9590860786397449,
"grad_norm": 4130751.5,
"learning_rate": 1.3606092809068368e-05,
"loss": 3.7722,
"step": 3610
},
{
"epoch": 0.9617428267800212,
"grad_norm": 3232915.5,
"learning_rate": 1.3588381154799859e-05,
"loss": 4.1108,
"step": 3620
},
{
"epoch": 0.9643995749202976,
"grad_norm": 5608699.5,
"learning_rate": 1.357066950053135e-05,
"loss": 4.4695,
"step": 3630
},
{
"epoch": 0.9670563230605739,
"grad_norm": 37526024.0,
"learning_rate": 1.3552957846262841e-05,
"loss": 3.8838,
"step": 3640
},
{
"epoch": 0.9697130712008502,
"grad_norm": 11544401.0,
"learning_rate": 1.3535246191994334e-05,
"loss": 3.7949,
"step": 3650
},
{
"epoch": 0.9723698193411264,
"grad_norm": 1559264.75,
"learning_rate": 1.3517534537725824e-05,
"loss": 3.8236,
"step": 3660
},
{
"epoch": 0.9750265674814028,
"grad_norm": 10817994.0,
"learning_rate": 1.3499822883457316e-05,
"loss": 4.0035,
"step": 3670
},
{
"epoch": 0.9776833156216791,
"grad_norm": 20268342.0,
"learning_rate": 1.3482111229188807e-05,
"loss": 3.6612,
"step": 3680
},
{
"epoch": 0.9803400637619554,
"grad_norm": 51181968.0,
"learning_rate": 1.34643995749203e-05,
"loss": 3.7019,
"step": 3690
},
{
"epoch": 0.9829968119022316,
"grad_norm": 74098400.0,
"learning_rate": 1.3446687920651792e-05,
"loss": 3.779,
"step": 3700
},
{
"epoch": 0.9856535600425079,
"grad_norm": 48340468.0,
"learning_rate": 1.3428976266383282e-05,
"loss": 3.6759,
"step": 3710
},
{
"epoch": 0.9883103081827843,
"grad_norm": 8802756.0,
"learning_rate": 1.3411264612114771e-05,
"loss": 3.6238,
"step": 3720
},
{
"epoch": 0.9909670563230606,
"grad_norm": 3833086.75,
"learning_rate": 1.3393552957846263e-05,
"loss": 3.3759,
"step": 3730
},
{
"epoch": 0.9936238044633369,
"grad_norm": 29499648.0,
"learning_rate": 1.3375841303577754e-05,
"loss": 3.6134,
"step": 3740
},
{
"epoch": 0.9962805526036131,
"grad_norm": 6612167.0,
"learning_rate": 1.3358129649309246e-05,
"loss": 3.5491,
"step": 3750
},
{
"epoch": 0.9989373007438895,
"grad_norm": 21236494.0,
"learning_rate": 1.3340417995040737e-05,
"loss": 3.7831,
"step": 3760
},
{
"epoch": 1.0,
"eval_loss": 3.75178599357605,
"eval_runtime": 744.4128,
"eval_samples_per_second": 20.225,
"eval_steps_per_second": 5.056,
"step": 3764
},
{
"epoch": 1.0015940488841657,
"grad_norm": 40179844.0,
"learning_rate": 1.3322706340772229e-05,
"loss": 3.711,
"step": 3770
},
{
"epoch": 1.004250797024442,
"grad_norm": 17010662.0,
"learning_rate": 1.3304994686503721e-05,
"loss": 3.4946,
"step": 3780
},
{
"epoch": 1.0069075451647185,
"grad_norm": 19932106.0,
"learning_rate": 1.3287283032235212e-05,
"loss": 3.5648,
"step": 3790
},
{
"epoch": 1.0095642933049946,
"grad_norm": 5492312.0,
"learning_rate": 1.3269571377966704e-05,
"loss": 4.0635,
"step": 3800
},
{
"epoch": 1.012221041445271,
"grad_norm": 192937568.0,
"learning_rate": 1.3251859723698195e-05,
"loss": 3.4178,
"step": 3810
},
{
"epoch": 1.0148777895855472,
"grad_norm": 1293443.125,
"learning_rate": 1.3234148069429687e-05,
"loss": 3.9658,
"step": 3820
},
{
"epoch": 1.0175345377258236,
"grad_norm": 158162096.0,
"learning_rate": 1.3216436415161178e-05,
"loss": 3.6695,
"step": 3830
},
{
"epoch": 1.0201912858661,
"grad_norm": 207503072.0,
"learning_rate": 1.319872476089267e-05,
"loss": 4.1104,
"step": 3840
},
{
"epoch": 1.0228480340063761,
"grad_norm": 5859501.0,
"learning_rate": 1.3181013106624159e-05,
"loss": 3.7423,
"step": 3850
},
{
"epoch": 1.0255047821466525,
"grad_norm": 65099376.0,
"learning_rate": 1.3163301452355651e-05,
"loss": 3.8122,
"step": 3860
},
{
"epoch": 1.0281615302869287,
"grad_norm": 13768734.0,
"learning_rate": 1.3145589798087142e-05,
"loss": 3.8062,
"step": 3870
},
{
"epoch": 1.030818278427205,
"grad_norm": 24830612.0,
"learning_rate": 1.3127878143818634e-05,
"loss": 3.5577,
"step": 3880
},
{
"epoch": 1.0334750265674815,
"grad_norm": 109977040.0,
"learning_rate": 1.3110166489550124e-05,
"loss": 3.8904,
"step": 3890
},
{
"epoch": 1.0361317747077576,
"grad_norm": 22621510.0,
"learning_rate": 1.3092454835281617e-05,
"loss": 3.7924,
"step": 3900
},
{
"epoch": 1.038788522848034,
"grad_norm": 15618693.0,
"learning_rate": 1.3074743181013107e-05,
"loss": 3.9009,
"step": 3910
},
{
"epoch": 1.0414452709883104,
"grad_norm": 102296992.0,
"learning_rate": 1.30570315267446e-05,
"loss": 4.0488,
"step": 3920
},
{
"epoch": 1.0441020191285866,
"grad_norm": 180104320.0,
"learning_rate": 1.303931987247609e-05,
"loss": 4.0832,
"step": 3930
},
{
"epoch": 1.046758767268863,
"grad_norm": 8426886.0,
"learning_rate": 1.3021608218207582e-05,
"loss": 3.9811,
"step": 3940
},
{
"epoch": 1.0494155154091391,
"grad_norm": 23817282.0,
"learning_rate": 1.3003896563939073e-05,
"loss": 3.5573,
"step": 3950
},
{
"epoch": 1.0520722635494155,
"grad_norm": 34805012.0,
"learning_rate": 1.2986184909670565e-05,
"loss": 3.6933,
"step": 3960
},
{
"epoch": 1.054729011689692,
"grad_norm": 27546222.0,
"learning_rate": 1.2968473255402056e-05,
"loss": 3.826,
"step": 3970
},
{
"epoch": 1.057385759829968,
"grad_norm": 73101112.0,
"learning_rate": 1.2950761601133548e-05,
"loss": 4.3474,
"step": 3980
},
{
"epoch": 1.0600425079702445,
"grad_norm": 60012056.0,
"learning_rate": 1.2933049946865037e-05,
"loss": 3.4645,
"step": 3990
},
{
"epoch": 1.0626992561105206,
"grad_norm": 10204493.0,
"learning_rate": 1.2915338292596529e-05,
"loss": 3.7942,
"step": 4000
},
{
"epoch": 1.065356004250797,
"grad_norm": 67629928.0,
"learning_rate": 1.289762663832802e-05,
"loss": 3.6377,
"step": 4010
},
{
"epoch": 1.0680127523910734,
"grad_norm": 31746526.0,
"learning_rate": 1.2879914984059512e-05,
"loss": 3.7846,
"step": 4020
},
{
"epoch": 1.0706695005313496,
"grad_norm": 52992448.0,
"learning_rate": 1.2862203329791003e-05,
"loss": 3.2981,
"step": 4030
},
{
"epoch": 1.073326248671626,
"grad_norm": 36022592.0,
"learning_rate": 1.2844491675522495e-05,
"loss": 3.6733,
"step": 4040
},
{
"epoch": 1.0759829968119021,
"grad_norm": 11422725.0,
"learning_rate": 1.2826780021253985e-05,
"loss": 3.5682,
"step": 4050
},
{
"epoch": 1.0786397449521785,
"grad_norm": 77457192.0,
"learning_rate": 1.2809068366985478e-05,
"loss": 3.8538,
"step": 4060
},
{
"epoch": 1.081296493092455,
"grad_norm": 109772792.0,
"learning_rate": 1.279135671271697e-05,
"loss": 4.0151,
"step": 4070
},
{
"epoch": 1.083953241232731,
"grad_norm": 126942304.0,
"learning_rate": 1.277364505844846e-05,
"loss": 4.418,
"step": 4080
},
{
"epoch": 1.0866099893730075,
"grad_norm": 215005632.0,
"learning_rate": 1.2755933404179953e-05,
"loss": 3.6302,
"step": 4090
},
{
"epoch": 1.0892667375132838,
"grad_norm": 18895672.0,
"learning_rate": 1.2738221749911443e-05,
"loss": 4.2548,
"step": 4100
},
{
"epoch": 1.09192348565356,
"grad_norm": 20576284.0,
"learning_rate": 1.2720510095642936e-05,
"loss": 3.9913,
"step": 4110
},
{
"epoch": 1.0945802337938364,
"grad_norm": 90564424.0,
"learning_rate": 1.2702798441374424e-05,
"loss": 3.8335,
"step": 4120
},
{
"epoch": 1.0972369819341126,
"grad_norm": 136458144.0,
"learning_rate": 1.2685086787105915e-05,
"loss": 4.0485,
"step": 4130
},
{
"epoch": 1.099893730074389,
"grad_norm": 175102016.0,
"learning_rate": 1.2667375132837407e-05,
"loss": 4.1181,
"step": 4140
},
{
"epoch": 1.1025504782146653,
"grad_norm": 15060149.0,
"learning_rate": 1.26496634785689e-05,
"loss": 3.753,
"step": 4150
},
{
"epoch": 1.1052072263549415,
"grad_norm": 92020808.0,
"learning_rate": 1.263195182430039e-05,
"loss": 3.9935,
"step": 4160
},
{
"epoch": 1.107863974495218,
"grad_norm": 133574952.0,
"learning_rate": 1.2614240170031882e-05,
"loss": 4.0376,
"step": 4170
},
{
"epoch": 1.110520722635494,
"grad_norm": 69448336.0,
"learning_rate": 1.2596528515763373e-05,
"loss": 3.7264,
"step": 4180
},
{
"epoch": 1.1131774707757705,
"grad_norm": 24695358.0,
"learning_rate": 1.2578816861494865e-05,
"loss": 3.6435,
"step": 4190
},
{
"epoch": 1.1158342189160468,
"grad_norm": 26981000.0,
"learning_rate": 1.2561105207226356e-05,
"loss": 4.1867,
"step": 4200
},
{
"epoch": 1.118490967056323,
"grad_norm": 26429450.0,
"learning_rate": 1.2543393552957848e-05,
"loss": 4.2308,
"step": 4210
},
{
"epoch": 1.1211477151965994,
"grad_norm": 75864056.0,
"learning_rate": 1.2525681898689339e-05,
"loss": 4.1067,
"step": 4220
},
{
"epoch": 1.1238044633368758,
"grad_norm": 53176204.0,
"learning_rate": 1.2507970244420831e-05,
"loss": 4.3122,
"step": 4230
},
{
"epoch": 1.126461211477152,
"grad_norm": 27715404.0,
"learning_rate": 1.2490258590152322e-05,
"loss": 4.0918,
"step": 4240
},
{
"epoch": 1.1291179596174283,
"grad_norm": 6029370.0,
"learning_rate": 1.2472546935883814e-05,
"loss": 4.1725,
"step": 4250
},
{
"epoch": 1.1317747077577045,
"grad_norm": 26051718.0,
"learning_rate": 1.2454835281615303e-05,
"loss": 3.9757,
"step": 4260
},
{
"epoch": 1.134431455897981,
"grad_norm": 77973728.0,
"learning_rate": 1.2437123627346795e-05,
"loss": 3.989,
"step": 4270
},
{
"epoch": 1.1370882040382573,
"grad_norm": 11366385.0,
"learning_rate": 1.2419411973078286e-05,
"loss": 4.3978,
"step": 4280
},
{
"epoch": 1.1397449521785334,
"grad_norm": 19926490.0,
"learning_rate": 1.2401700318809778e-05,
"loss": 3.7446,
"step": 4290
},
{
"epoch": 1.1424017003188098,
"grad_norm": 66211068.0,
"learning_rate": 1.2383988664541268e-05,
"loss": 3.9591,
"step": 4300
},
{
"epoch": 1.145058448459086,
"grad_norm": 7617592.5,
"learning_rate": 1.236627701027276e-05,
"loss": 4.2812,
"step": 4310
},
{
"epoch": 1.1477151965993624,
"grad_norm": 47218612.0,
"learning_rate": 1.2348565356004251e-05,
"loss": 4.137,
"step": 4320
},
{
"epoch": 1.1503719447396388,
"grad_norm": 115950944.0,
"learning_rate": 1.2330853701735743e-05,
"loss": 4.1344,
"step": 4330
},
{
"epoch": 1.153028692879915,
"grad_norm": 27328380.0,
"learning_rate": 1.2313142047467234e-05,
"loss": 4.0865,
"step": 4340
},
{
"epoch": 1.1556854410201913,
"grad_norm": 8267316.5,
"learning_rate": 1.2295430393198726e-05,
"loss": 4.3048,
"step": 4350
},
{
"epoch": 1.1583421891604675,
"grad_norm": 18654644.0,
"learning_rate": 1.2277718738930217e-05,
"loss": 4.4512,
"step": 4360
},
{
"epoch": 1.1609989373007439,
"grad_norm": 123494120.0,
"learning_rate": 1.2260007084661709e-05,
"loss": 4.1863,
"step": 4370
},
{
"epoch": 1.1636556854410203,
"grad_norm": 87930224.0,
"learning_rate": 1.2242295430393201e-05,
"loss": 4.1395,
"step": 4380
},
{
"epoch": 1.1663124335812964,
"grad_norm": 60926568.0,
"learning_rate": 1.222458377612469e-05,
"loss": 3.9975,
"step": 4390
},
{
"epoch": 1.1689691817215728,
"grad_norm": 15561844.0,
"learning_rate": 1.2206872121856181e-05,
"loss": 4.1746,
"step": 4400
},
{
"epoch": 1.171625929861849,
"grad_norm": 14337786.0,
"learning_rate": 1.2189160467587673e-05,
"loss": 4.0762,
"step": 4410
},
{
"epoch": 1.1742826780021254,
"grad_norm": 27260074.0,
"learning_rate": 1.2171448813319164e-05,
"loss": 4.3436,
"step": 4420
},
{
"epoch": 1.1769394261424018,
"grad_norm": 14445331.0,
"learning_rate": 1.2153737159050656e-05,
"loss": 3.9788,
"step": 4430
},
{
"epoch": 1.179596174282678,
"grad_norm": 21041896.0,
"learning_rate": 1.2136025504782147e-05,
"loss": 4.3681,
"step": 4440
},
{
"epoch": 1.1822529224229543,
"grad_norm": 15333385.0,
"learning_rate": 1.2118313850513639e-05,
"loss": 4.1638,
"step": 4450
},
{
"epoch": 1.1849096705632305,
"grad_norm": 18882606.0,
"learning_rate": 1.2100602196245131e-05,
"loss": 3.9175,
"step": 4460
},
{
"epoch": 1.1875664187035069,
"grad_norm": 6002330.5,
"learning_rate": 1.2082890541976622e-05,
"loss": 4.0274,
"step": 4470
},
{
"epoch": 1.1902231668437833,
"grad_norm": 12174502.0,
"learning_rate": 1.2065178887708114e-05,
"loss": 4.0163,
"step": 4480
},
{
"epoch": 1.1928799149840594,
"grad_norm": 3046521.75,
"learning_rate": 1.2047467233439604e-05,
"loss": 4.2218,
"step": 4490
},
{
"epoch": 1.1955366631243358,
"grad_norm": 7046191.0,
"learning_rate": 1.2029755579171097e-05,
"loss": 3.8047,
"step": 4500
},
{
"epoch": 1.1981934112646122,
"grad_norm": 2158310.5,
"learning_rate": 1.2012043924902587e-05,
"loss": 4.0102,
"step": 4510
},
{
"epoch": 1.2008501594048884,
"grad_norm": 1953139.875,
"learning_rate": 1.199433227063408e-05,
"loss": 3.9815,
"step": 4520
},
{
"epoch": 1.2035069075451648,
"grad_norm": 10403948.0,
"learning_rate": 1.1976620616365568e-05,
"loss": 4.2106,
"step": 4530
},
{
"epoch": 1.206163655685441,
"grad_norm": 1701127.5,
"learning_rate": 1.195890896209706e-05,
"loss": 4.1719,
"step": 4540
},
{
"epoch": 1.2088204038257173,
"grad_norm": 1922839.625,
"learning_rate": 1.1941197307828551e-05,
"loss": 4.2,
"step": 4550
},
{
"epoch": 1.2114771519659937,
"grad_norm": 1249251.375,
"learning_rate": 1.1923485653560044e-05,
"loss": 4.3854,
"step": 4560
},
{
"epoch": 1.2141339001062699,
"grad_norm": 3677515.25,
"learning_rate": 1.1905773999291534e-05,
"loss": 4.1928,
"step": 4570
},
{
"epoch": 1.2167906482465463,
"grad_norm": 1778515.5,
"learning_rate": 1.1888062345023026e-05,
"loss": 4.282,
"step": 4580
},
{
"epoch": 1.2194473963868226,
"grad_norm": 2142989.75,
"learning_rate": 1.1870350690754517e-05,
"loss": 4.0862,
"step": 4590
},
{
"epoch": 1.2221041445270988,
"grad_norm": 3376149.5,
"learning_rate": 1.185263903648601e-05,
"loss": 4.9249,
"step": 4600
},
{
"epoch": 1.2247608926673752,
"grad_norm": 918137.0625,
"learning_rate": 1.18349273822175e-05,
"loss": 4.4397,
"step": 4610
},
{
"epoch": 1.2274176408076514,
"grad_norm": 5548887.5,
"learning_rate": 1.1817215727948992e-05,
"loss": 4.186,
"step": 4620
},
{
"epoch": 1.2300743889479278,
"grad_norm": 1206121.0,
"learning_rate": 1.1799504073680483e-05,
"loss": 4.4369,
"step": 4630
},
{
"epoch": 1.2327311370882041,
"grad_norm": 1302905.0,
"learning_rate": 1.1781792419411975e-05,
"loss": 4.2492,
"step": 4640
},
{
"epoch": 1.2353878852284803,
"grad_norm": 1243181.25,
"learning_rate": 1.1764080765143466e-05,
"loss": 4.3557,
"step": 4650
},
{
"epoch": 1.2380446333687567,
"grad_norm": 1636811.25,
"learning_rate": 1.1746369110874956e-05,
"loss": 4.4305,
"step": 4660
},
{
"epoch": 1.2407013815090329,
"grad_norm": 3252745.75,
"learning_rate": 1.1728657456606447e-05,
"loss": 4.4447,
"step": 4670
},
{
"epoch": 1.2433581296493093,
"grad_norm": 3218180.0,
"learning_rate": 1.1710945802337939e-05,
"loss": 4.1695,
"step": 4680
},
{
"epoch": 1.2460148777895856,
"grad_norm": 7251921.5,
"learning_rate": 1.169323414806943e-05,
"loss": 4.0679,
"step": 4690
},
{
"epoch": 1.2486716259298618,
"grad_norm": 3886631.0,
"learning_rate": 1.1675522493800922e-05,
"loss": 3.9159,
"step": 4700
},
{
"epoch": 1.2513283740701382,
"grad_norm": 2420017.75,
"learning_rate": 1.1657810839532412e-05,
"loss": 4.6458,
"step": 4710
},
{
"epoch": 1.2539851222104144,
"grad_norm": 1138159.875,
"learning_rate": 1.1640099185263905e-05,
"loss": 4.078,
"step": 4720
},
{
"epoch": 1.2566418703506907,
"grad_norm": 930125.875,
"learning_rate": 1.1622387530995395e-05,
"loss": 4.0812,
"step": 4730
},
{
"epoch": 1.2592986184909671,
"grad_norm": 3835148.25,
"learning_rate": 1.1604675876726887e-05,
"loss": 4.1012,
"step": 4740
},
{
"epoch": 1.2619553666312433,
"grad_norm": 6243373.5,
"learning_rate": 1.158696422245838e-05,
"loss": 3.8252,
"step": 4750
},
{
"epoch": 1.2646121147715197,
"grad_norm": 3021652.25,
"learning_rate": 1.156925256818987e-05,
"loss": 3.9515,
"step": 4760
},
{
"epoch": 1.2672688629117959,
"grad_norm": 4503118.5,
"learning_rate": 1.1551540913921363e-05,
"loss": 4.0478,
"step": 4770
},
{
"epoch": 1.2699256110520722,
"grad_norm": 5867597.5,
"learning_rate": 1.1533829259652853e-05,
"loss": 4.0726,
"step": 4780
},
{
"epoch": 1.2725823591923486,
"grad_norm": 23690828.0,
"learning_rate": 1.1516117605384345e-05,
"loss": 3.5037,
"step": 4790
},
{
"epoch": 1.2752391073326248,
"grad_norm": 5260964.5,
"learning_rate": 1.1498405951115834e-05,
"loss": 4.0774,
"step": 4800
},
{
"epoch": 1.2778958554729012,
"grad_norm": 4894551.5,
"learning_rate": 1.1480694296847325e-05,
"loss": 3.7113,
"step": 4810
},
{
"epoch": 1.2805526036131774,
"grad_norm": 4784902.0,
"learning_rate": 1.1462982642578817e-05,
"loss": 3.886,
"step": 4820
},
{
"epoch": 1.2832093517534537,
"grad_norm": 22511842.0,
"learning_rate": 1.144527098831031e-05,
"loss": 3.7413,
"step": 4830
},
{
"epoch": 1.2858660998937301,
"grad_norm": 13445524.0,
"learning_rate": 1.14275593340418e-05,
"loss": 4.3171,
"step": 4840
},
{
"epoch": 1.2885228480340063,
"grad_norm": 4879641.0,
"learning_rate": 1.1409847679773292e-05,
"loss": 4.0366,
"step": 4850
},
{
"epoch": 1.2911795961742827,
"grad_norm": 5458451.0,
"learning_rate": 1.1392136025504783e-05,
"loss": 4.0356,
"step": 4860
},
{
"epoch": 1.2938363443145589,
"grad_norm": 1152951.125,
"learning_rate": 1.1374424371236275e-05,
"loss": 3.9322,
"step": 4870
},
{
"epoch": 1.2964930924548352,
"grad_norm": 1573109.875,
"learning_rate": 1.1356712716967766e-05,
"loss": 3.5684,
"step": 4880
},
{
"epoch": 1.2991498405951116,
"grad_norm": 3557934.25,
"learning_rate": 1.1339001062699258e-05,
"loss": 3.8874,
"step": 4890
},
{
"epoch": 1.301806588735388,
"grad_norm": 2637183.5,
"learning_rate": 1.1321289408430748e-05,
"loss": 4.0737,
"step": 4900
},
{
"epoch": 1.3044633368756642,
"grad_norm": 1852644.25,
"learning_rate": 1.130357775416224e-05,
"loss": 4.4462,
"step": 4910
},
{
"epoch": 1.3071200850159406,
"grad_norm": 7577384.5,
"learning_rate": 1.1285866099893731e-05,
"loss": 3.8546,
"step": 4920
},
{
"epoch": 1.3097768331562167,
"grad_norm": 4401453.5,
"learning_rate": 1.1268154445625224e-05,
"loss": 4.0443,
"step": 4930
},
{
"epoch": 1.3124335812964931,
"grad_norm": 3643839.75,
"learning_rate": 1.1250442791356712e-05,
"loss": 3.678,
"step": 4940
},
{
"epoch": 1.3150903294367695,
"grad_norm": 27145024.0,
"learning_rate": 1.1232731137088205e-05,
"loss": 3.8589,
"step": 4950
},
{
"epoch": 1.3177470775770457,
"grad_norm": 1982266.875,
"learning_rate": 1.1215019482819695e-05,
"loss": 3.587,
"step": 4960
},
{
"epoch": 1.320403825717322,
"grad_norm": 2339293.25,
"learning_rate": 1.1197307828551188e-05,
"loss": 3.6116,
"step": 4970
},
{
"epoch": 1.3230605738575982,
"grad_norm": 21441204.0,
"learning_rate": 1.1179596174282678e-05,
"loss": 3.4365,
"step": 4980
},
{
"epoch": 1.3257173219978746,
"grad_norm": 3329228.0,
"learning_rate": 1.116188452001417e-05,
"loss": 4.184,
"step": 4990
},
{
"epoch": 1.328374070138151,
"grad_norm": 2602702.75,
"learning_rate": 1.1144172865745661e-05,
"loss": 3.6095,
"step": 5000
},
{
"epoch": 1.3310308182784272,
"grad_norm": 62917268.0,
"learning_rate": 1.1126461211477153e-05,
"loss": 3.4086,
"step": 5010
},
{
"epoch": 1.3336875664187036,
"grad_norm": 9320738.0,
"learning_rate": 1.1108749557208644e-05,
"loss": 3.8485,
"step": 5020
},
{
"epoch": 1.3363443145589797,
"grad_norm": 11171778.0,
"learning_rate": 1.1091037902940136e-05,
"loss": 3.5241,
"step": 5030
},
{
"epoch": 1.3390010626992561,
"grad_norm": 13504690.0,
"learning_rate": 1.1073326248671628e-05,
"loss": 3.7951,
"step": 5040
},
{
"epoch": 1.3416578108395325,
"grad_norm": 1940023.625,
"learning_rate": 1.1055614594403119e-05,
"loss": 3.938,
"step": 5050
},
{
"epoch": 1.3443145589798087,
"grad_norm": 9250230.0,
"learning_rate": 1.1037902940134611e-05,
"loss": 3.6501,
"step": 5060
},
{
"epoch": 1.346971307120085,
"grad_norm": 8658494.0,
"learning_rate": 1.10201912858661e-05,
"loss": 3.4101,
"step": 5070
},
{
"epoch": 1.3496280552603612,
"grad_norm": 24788584.0,
"learning_rate": 1.100247963159759e-05,
"loss": 3.2665,
"step": 5080
},
{
"epoch": 1.3522848034006376,
"grad_norm": 17288262.0,
"learning_rate": 1.0984767977329083e-05,
"loss": 3.9485,
"step": 5090
},
{
"epoch": 1.354941551540914,
"grad_norm": 1679803.0,
"learning_rate": 1.0967056323060574e-05,
"loss": 3.7726,
"step": 5100
},
{
"epoch": 1.3575982996811902,
"grad_norm": 14593549.0,
"learning_rate": 1.0949344668792066e-05,
"loss": 4.0024,
"step": 5110
},
{
"epoch": 1.3602550478214666,
"grad_norm": 4186409.75,
"learning_rate": 1.0931633014523556e-05,
"loss": 3.6818,
"step": 5120
},
{
"epoch": 1.3629117959617427,
"grad_norm": 747755.5625,
"learning_rate": 1.0913921360255049e-05,
"loss": 3.4717,
"step": 5130
},
{
"epoch": 1.365568544102019,
"grad_norm": 445103.3125,
"learning_rate": 1.0896209705986541e-05,
"loss": 3.4684,
"step": 5140
},
{
"epoch": 1.3682252922422955,
"grad_norm": 1250102.625,
"learning_rate": 1.0878498051718031e-05,
"loss": 3.2248,
"step": 5150
},
{
"epoch": 1.3708820403825717,
"grad_norm": 532045.3125,
"learning_rate": 1.0860786397449524e-05,
"loss": 3.3662,
"step": 5160
},
{
"epoch": 1.373538788522848,
"grad_norm": 454849.5625,
"learning_rate": 1.0843074743181014e-05,
"loss": 3.5507,
"step": 5170
},
{
"epoch": 1.3761955366631242,
"grad_norm": 3551179.5,
"learning_rate": 1.0825363088912507e-05,
"loss": 3.2755,
"step": 5180
},
{
"epoch": 1.3788522848034006,
"grad_norm": 6700418.0,
"learning_rate": 1.0807651434643997e-05,
"loss": 3.2751,
"step": 5190
},
{
"epoch": 1.381509032943677,
"grad_norm": 37462192.0,
"learning_rate": 1.078993978037549e-05,
"loss": 3.5327,
"step": 5200
},
{
"epoch": 1.3841657810839532,
"grad_norm": 9333666.0,
"learning_rate": 1.0772228126106978e-05,
"loss": 3.1278,
"step": 5210
},
{
"epoch": 1.3868225292242295,
"grad_norm": 16026876.0,
"learning_rate": 1.075451647183847e-05,
"loss": 3.5275,
"step": 5220
},
{
"epoch": 1.3894792773645057,
"grad_norm": 24360552.0,
"learning_rate": 1.0736804817569961e-05,
"loss": 3.6815,
"step": 5230
},
{
"epoch": 1.392136025504782,
"grad_norm": 12289483.0,
"learning_rate": 1.0719093163301453e-05,
"loss": 3.1039,
"step": 5240
},
{
"epoch": 1.3947927736450585,
"grad_norm": 1954500.625,
"learning_rate": 1.0701381509032944e-05,
"loss": 3.3327,
"step": 5250
},
{
"epoch": 1.3974495217853349,
"grad_norm": 5957172.5,
"learning_rate": 1.0683669854764436e-05,
"loss": 3.6985,
"step": 5260
},
{
"epoch": 1.400106269925611,
"grad_norm": 136582976.0,
"learning_rate": 1.0665958200495927e-05,
"loss": 3.4845,
"step": 5270
},
{
"epoch": 1.4027630180658874,
"grad_norm": 21799228.0,
"learning_rate": 1.0648246546227419e-05,
"loss": 3.4648,
"step": 5280
},
{
"epoch": 1.4054197662061636,
"grad_norm": 1183856.625,
"learning_rate": 1.063053489195891e-05,
"loss": 3.2929,
"step": 5290
},
{
"epoch": 1.40807651434644,
"grad_norm": 28349394.0,
"learning_rate": 1.0612823237690402e-05,
"loss": 3.611,
"step": 5300
},
{
"epoch": 1.4107332624867164,
"grad_norm": 1230487.75,
"learning_rate": 1.0595111583421892e-05,
"loss": 3.0602,
"step": 5310
},
{
"epoch": 1.4133900106269925,
"grad_norm": 29549574.0,
"learning_rate": 1.0577399929153385e-05,
"loss": 3.6129,
"step": 5320
},
{
"epoch": 1.416046758767269,
"grad_norm": 65607896.0,
"learning_rate": 1.0559688274884875e-05,
"loss": 3.305,
"step": 5330
},
{
"epoch": 1.418703506907545,
"grad_norm": 21593944.0,
"learning_rate": 1.0541976620616366e-05,
"loss": 4.182,
"step": 5340
},
{
"epoch": 1.4213602550478215,
"grad_norm": 9913192.0,
"learning_rate": 1.0524264966347856e-05,
"loss": 3.333,
"step": 5350
},
{
"epoch": 1.4240170031880979,
"grad_norm": 5600408.5,
"learning_rate": 1.0506553312079349e-05,
"loss": 3.2001,
"step": 5360
},
{
"epoch": 1.426673751328374,
"grad_norm": 4921900.0,
"learning_rate": 1.048884165781084e-05,
"loss": 3.8381,
"step": 5370
},
{
"epoch": 1.4293304994686504,
"grad_norm": 22669404.0,
"learning_rate": 1.0471130003542332e-05,
"loss": 3.438,
"step": 5380
},
{
"epoch": 1.4319872476089266,
"grad_norm": 11211402.0,
"learning_rate": 1.0453418349273822e-05,
"loss": 3.3608,
"step": 5390
},
{
"epoch": 1.434643995749203,
"grad_norm": 10033162.0,
"learning_rate": 1.0435706695005314e-05,
"loss": 3.2148,
"step": 5400
},
{
"epoch": 1.4373007438894794,
"grad_norm": 34627448.0,
"learning_rate": 1.0417995040736805e-05,
"loss": 3.3408,
"step": 5410
},
{
"epoch": 1.4399574920297555,
"grad_norm": 19163360.0,
"learning_rate": 1.0400283386468297e-05,
"loss": 3.0767,
"step": 5420
},
{
"epoch": 1.442614240170032,
"grad_norm": 11876396.0,
"learning_rate": 1.038257173219979e-05,
"loss": 3.8624,
"step": 5430
},
{
"epoch": 1.445270988310308,
"grad_norm": 6485251.5,
"learning_rate": 1.036486007793128e-05,
"loss": 3.4212,
"step": 5440
},
{
"epoch": 1.4479277364505845,
"grad_norm": 2855033.5,
"learning_rate": 1.0347148423662772e-05,
"loss": 3.5543,
"step": 5450
},
{
"epoch": 1.4505844845908609,
"grad_norm": 39419356.0,
"learning_rate": 1.0329436769394263e-05,
"loss": 3.6357,
"step": 5460
},
{
"epoch": 1.453241232731137,
"grad_norm": 8782708.0,
"learning_rate": 1.0311725115125755e-05,
"loss": 3.7995,
"step": 5470
},
{
"epoch": 1.4558979808714134,
"grad_norm": 32046924.0,
"learning_rate": 1.0294013460857244e-05,
"loss": 3.2472,
"step": 5480
},
{
"epoch": 1.4585547290116896,
"grad_norm": 30402538.0,
"learning_rate": 1.0276301806588735e-05,
"loss": 3.1715,
"step": 5490
},
{
"epoch": 1.461211477151966,
"grad_norm": 19326186.0,
"learning_rate": 1.0258590152320227e-05,
"loss": 3.9161,
"step": 5500
},
{
"epoch": 1.4638682252922424,
"grad_norm": 9990077.0,
"learning_rate": 1.024087849805172e-05,
"loss": 3.849,
"step": 5510
},
{
"epoch": 1.4665249734325185,
"grad_norm": 29835254.0,
"learning_rate": 1.022316684378321e-05,
"loss": 3.331,
"step": 5520
},
{
"epoch": 1.469181721572795,
"grad_norm": 84350656.0,
"learning_rate": 1.0205455189514702e-05,
"loss": 3.3592,
"step": 5530
},
{
"epoch": 1.471838469713071,
"grad_norm": 5173333.5,
"learning_rate": 1.0187743535246193e-05,
"loss": 3.3015,
"step": 5540
},
{
"epoch": 1.4744952178533475,
"grad_norm": 3443425.5,
"learning_rate": 1.0170031880977685e-05,
"loss": 3.5236,
"step": 5550
},
{
"epoch": 1.4771519659936239,
"grad_norm": 2188022.75,
"learning_rate": 1.0152320226709175e-05,
"loss": 3.5614,
"step": 5560
},
{
"epoch": 1.4798087141339,
"grad_norm": 16931794.0,
"learning_rate": 1.0134608572440668e-05,
"loss": 3.6685,
"step": 5570
},
{
"epoch": 1.4824654622741764,
"grad_norm": 10456564.0,
"learning_rate": 1.0116896918172158e-05,
"loss": 3.4864,
"step": 5580
},
{
"epoch": 1.4851222104144526,
"grad_norm": 27239420.0,
"learning_rate": 1.009918526390365e-05,
"loss": 3.5637,
"step": 5590
},
{
"epoch": 1.487778958554729,
"grad_norm": 16616771.0,
"learning_rate": 1.0081473609635141e-05,
"loss": 3.6085,
"step": 5600
},
{
"epoch": 1.4904357066950054,
"grad_norm": 10221569.0,
"learning_rate": 1.0063761955366632e-05,
"loss": 3.5812,
"step": 5610
},
{
"epoch": 1.4930924548352817,
"grad_norm": 1452260.75,
"learning_rate": 1.0046050301098122e-05,
"loss": 3.9326,
"step": 5620
},
{
"epoch": 1.495749202975558,
"grad_norm": 3546143.0,
"learning_rate": 1.0028338646829615e-05,
"loss": 3.2541,
"step": 5630
},
{
"epoch": 1.4984059511158343,
"grad_norm": 12791246.0,
"learning_rate": 1.0010626992561105e-05,
"loss": 3.4152,
"step": 5640
},
{
"epoch": 1.5010626992561105,
"grad_norm": 12529229.0,
"learning_rate": 9.992915338292597e-06,
"loss": 3.0508,
"step": 5650
},
{
"epoch": 1.5037194473963869,
"grad_norm": 9755405.0,
"learning_rate": 9.975203684024088e-06,
"loss": 3.5064,
"step": 5660
},
{
"epoch": 1.5063761955366632,
"grad_norm": 6901898.0,
"learning_rate": 9.95749202975558e-06,
"loss": 3.6654,
"step": 5670
},
{
"epoch": 1.5090329436769394,
"grad_norm": 9542270.0,
"learning_rate": 9.93978037548707e-06,
"loss": 3.3481,
"step": 5680
},
{
"epoch": 1.5116896918172156,
"grad_norm": 14570059.0,
"learning_rate": 9.922068721218563e-06,
"loss": 3.6342,
"step": 5690
},
{
"epoch": 1.514346439957492,
"grad_norm": 130252984.0,
"learning_rate": 9.904357066950054e-06,
"loss": 3.3275,
"step": 5700
},
{
"epoch": 1.5170031880977684,
"grad_norm": 12491921.0,
"learning_rate": 9.886645412681544e-06,
"loss": 3.1862,
"step": 5710
},
{
"epoch": 1.5196599362380447,
"grad_norm": 171955248.0,
"learning_rate": 9.868933758413036e-06,
"loss": 3.6,
"step": 5720
},
{
"epoch": 1.522316684378321,
"grad_norm": 67972536.0,
"learning_rate": 9.851222104144527e-06,
"loss": 3.5839,
"step": 5730
},
{
"epoch": 1.524973432518597,
"grad_norm": 19312536.0,
"learning_rate": 9.83351044987602e-06,
"loss": 3.3906,
"step": 5740
},
{
"epoch": 1.5276301806588735,
"grad_norm": 39636108.0,
"learning_rate": 9.81579879560751e-06,
"loss": 3.5388,
"step": 5750
},
{
"epoch": 1.5302869287991498,
"grad_norm": 54133548.0,
"learning_rate": 9.798087141339002e-06,
"loss": 3.2938,
"step": 5760
},
{
"epoch": 1.5329436769394262,
"grad_norm": 28021788.0,
"learning_rate": 9.780375487070494e-06,
"loss": 3.565,
"step": 5770
},
{
"epoch": 1.5356004250797024,
"grad_norm": 12500334.0,
"learning_rate": 9.762663832801983e-06,
"loss": 3.4099,
"step": 5780
},
{
"epoch": 1.5382571732199788,
"grad_norm": 20677724.0,
"learning_rate": 9.744952178533476e-06,
"loss": 3.8265,
"step": 5790
},
{
"epoch": 1.540913921360255,
"grad_norm": 25849000.0,
"learning_rate": 9.727240524264968e-06,
"loss": 3.5107,
"step": 5800
},
{
"epoch": 1.5435706695005313,
"grad_norm": 7106916.0,
"learning_rate": 9.709528869996458e-06,
"loss": 3.7538,
"step": 5810
},
{
"epoch": 1.5462274176408077,
"grad_norm": 78143128.0,
"learning_rate": 9.69181721572795e-06,
"loss": 3.8139,
"step": 5820
},
{
"epoch": 1.548884165781084,
"grad_norm": 124880632.0,
"learning_rate": 9.674105561459441e-06,
"loss": 3.4966,
"step": 5830
},
{
"epoch": 1.5515409139213603,
"grad_norm": 16674735.0,
"learning_rate": 9.656393907190934e-06,
"loss": 3.7779,
"step": 5840
},
{
"epoch": 1.5541976620616365,
"grad_norm": 36204444.0,
"learning_rate": 9.638682252922424e-06,
"loss": 3.5086,
"step": 5850
},
{
"epoch": 1.5568544102019128,
"grad_norm": 7019197.5,
"learning_rate": 9.620970598653915e-06,
"loss": 3.3062,
"step": 5860
},
{
"epoch": 1.5595111583421892,
"grad_norm": 14028569.0,
"learning_rate": 9.603258944385407e-06,
"loss": 3.4862,
"step": 5870
},
{
"epoch": 1.5621679064824656,
"grad_norm": 24143218.0,
"learning_rate": 9.585547290116898e-06,
"loss": 3.388,
"step": 5880
},
{
"epoch": 1.5648246546227418,
"grad_norm": 8635328.0,
"learning_rate": 9.56783563584839e-06,
"loss": 3.9959,
"step": 5890
},
{
"epoch": 1.567481402763018,
"grad_norm": 14461347.0,
"learning_rate": 9.55012398157988e-06,
"loss": 3.3619,
"step": 5900
},
{
"epoch": 1.5701381509032943,
"grad_norm": 45164232.0,
"learning_rate": 9.532412327311371e-06,
"loss": 3.7565,
"step": 5910
},
{
"epoch": 1.5727948990435707,
"grad_norm": 43768708.0,
"learning_rate": 9.514700673042863e-06,
"loss": 3.2873,
"step": 5920
},
{
"epoch": 1.5754516471838471,
"grad_norm": 102944216.0,
"learning_rate": 9.496989018774354e-06,
"loss": 3.5849,
"step": 5930
},
{
"epoch": 1.5781083953241233,
"grad_norm": 8864102.0,
"learning_rate": 9.479277364505846e-06,
"loss": 3.3615,
"step": 5940
},
{
"epoch": 1.5807651434643994,
"grad_norm": 17926040.0,
"learning_rate": 9.461565710237337e-06,
"loss": 3.3599,
"step": 5950
},
{
"epoch": 1.5834218916046758,
"grad_norm": 563806208.0,
"learning_rate": 9.443854055968829e-06,
"loss": 3.6726,
"step": 5960
},
{
"epoch": 1.5860786397449522,
"grad_norm": 4375813.5,
"learning_rate": 9.42614240170032e-06,
"loss": 3.5982,
"step": 5970
},
{
"epoch": 1.5887353878852286,
"grad_norm": 23817932.0,
"learning_rate": 9.40843074743181e-06,
"loss": 3.6873,
"step": 5980
},
{
"epoch": 1.5913921360255048,
"grad_norm": 3588041.25,
"learning_rate": 9.390719093163302e-06,
"loss": 3.8219,
"step": 5990
},
{
"epoch": 1.594048884165781,
"grad_norm": 97096224.0,
"learning_rate": 9.373007438894793e-06,
"loss": 3.5905,
"step": 6000
},
{
"epoch": 1.5967056323060573,
"grad_norm": 4066724.0,
"learning_rate": 9.355295784626285e-06,
"loss": 3.5762,
"step": 6010
},
{
"epoch": 1.5993623804463337,
"grad_norm": 44529008.0,
"learning_rate": 9.337584130357776e-06,
"loss": 3.821,
"step": 6020
},
{
"epoch": 1.60201912858661,
"grad_norm": 10141793.0,
"learning_rate": 9.319872476089268e-06,
"loss": 3.4989,
"step": 6030
},
{
"epoch": 1.6046758767268863,
"grad_norm": 22102744.0,
"learning_rate": 9.302160821820759e-06,
"loss": 3.4363,
"step": 6040
},
{
"epoch": 1.6073326248671624,
"grad_norm": 1421525.375,
"learning_rate": 9.284449167552249e-06,
"loss": 3.3543,
"step": 6050
},
{
"epoch": 1.6099893730074388,
"grad_norm": 17624050.0,
"learning_rate": 9.266737513283741e-06,
"loss": 3.5835,
"step": 6060
},
{
"epoch": 1.6126461211477152,
"grad_norm": 2787807.5,
"learning_rate": 9.249025859015232e-06,
"loss": 3.7715,
"step": 6070
},
{
"epoch": 1.6153028692879916,
"grad_norm": 36419916.0,
"learning_rate": 9.231314204746724e-06,
"loss": 3.2874,
"step": 6080
},
{
"epoch": 1.6179596174282678,
"grad_norm": 550304.0,
"learning_rate": 9.213602550478215e-06,
"loss": 3.775,
"step": 6090
},
{
"epoch": 1.620616365568544,
"grad_norm": 13110638.0,
"learning_rate": 9.195890896209707e-06,
"loss": 4.0895,
"step": 6100
},
{
"epoch": 1.6232731137088203,
"grad_norm": 153279.40625,
"learning_rate": 9.1781792419412e-06,
"loss": 3.5868,
"step": 6110
},
{
"epoch": 1.6259298618490967,
"grad_norm": 274644.03125,
"learning_rate": 9.160467587672688e-06,
"loss": 3.4759,
"step": 6120
},
{
"epoch": 1.628586609989373,
"grad_norm": 21545.19921875,
"learning_rate": 9.14275593340418e-06,
"loss": 4.0524,
"step": 6130
},
{
"epoch": 1.6312433581296493,
"grad_norm": 27863.1015625,
"learning_rate": 9.125044279135673e-06,
"loss": 3.4133,
"step": 6140
},
{
"epoch": 1.6339001062699257,
"grad_norm": 146765.640625,
"learning_rate": 9.107332624867163e-06,
"loss": 3.6765,
"step": 6150
},
{
"epoch": 1.6365568544102018,
"grad_norm": 60709.375,
"learning_rate": 9.089620970598656e-06,
"loss": 3.8558,
"step": 6160
},
{
"epoch": 1.6392136025504782,
"grad_norm": 290704.21875,
"learning_rate": 9.071909316330146e-06,
"loss": 3.3615,
"step": 6170
},
{
"epoch": 1.6418703506907546,
"grad_norm": 198007.828125,
"learning_rate": 9.054197662061637e-06,
"loss": 3.6759,
"step": 6180
},
{
"epoch": 1.6445270988310308,
"grad_norm": 30211.29296875,
"learning_rate": 9.036486007793129e-06,
"loss": 4.1618,
"step": 6190
},
{
"epoch": 1.6471838469713072,
"grad_norm": 697217.3125,
"learning_rate": 9.01877435352462e-06,
"loss": 3.5873,
"step": 6200
},
{
"epoch": 1.6498405951115833,
"grad_norm": 311260.34375,
"learning_rate": 9.001062699256112e-06,
"loss": 4.0309,
"step": 6210
},
{
"epoch": 1.6524973432518597,
"grad_norm": 7285945.0,
"learning_rate": 8.983351044987602e-06,
"loss": 3.7024,
"step": 6220
},
{
"epoch": 1.655154091392136,
"grad_norm": 238075.265625,
"learning_rate": 8.965639390719095e-06,
"loss": 3.7081,
"step": 6230
},
{
"epoch": 1.6578108395324125,
"grad_norm": 104777.8828125,
"learning_rate": 8.947927736450585e-06,
"loss": 3.6374,
"step": 6240
},
{
"epoch": 1.6604675876726886,
"grad_norm": 45899.98828125,
"learning_rate": 8.930216082182076e-06,
"loss": 3.7753,
"step": 6250
},
{
"epoch": 1.6631243358129648,
"grad_norm": 4903258.0,
"learning_rate": 8.912504427913568e-06,
"loss": 3.7641,
"step": 6260
},
{
"epoch": 1.6657810839532412,
"grad_norm": 691504.875,
"learning_rate": 8.894792773645059e-06,
"loss": 3.012,
"step": 6270
},
{
"epoch": 1.6684378320935176,
"grad_norm": 7211197.0,
"learning_rate": 8.877081119376551e-06,
"loss": 3.278,
"step": 6280
},
{
"epoch": 1.671094580233794,
"grad_norm": 55386.39453125,
"learning_rate": 8.859369465108042e-06,
"loss": 3.5972,
"step": 6290
},
{
"epoch": 1.6737513283740701,
"grad_norm": 4803297.5,
"learning_rate": 8.841657810839534e-06,
"loss": 3.5168,
"step": 6300
},
{
"epoch": 1.6764080765143463,
"grad_norm": 153394.5625,
"learning_rate": 8.823946156571024e-06,
"loss": 3.4884,
"step": 6310
},
{
"epoch": 1.6790648246546227,
"grad_norm": 105014.6796875,
"learning_rate": 8.806234502302515e-06,
"loss": 3.5724,
"step": 6320
},
{
"epoch": 1.681721572794899,
"grad_norm": 425531.6875,
"learning_rate": 8.788522848034007e-06,
"loss": 3.7171,
"step": 6330
},
{
"epoch": 1.6843783209351755,
"grad_norm": 881638.625,
"learning_rate": 8.770811193765498e-06,
"loss": 3.5689,
"step": 6340
},
{
"epoch": 1.6870350690754516,
"grad_norm": 506417.84375,
"learning_rate": 8.75309953949699e-06,
"loss": 3.3471,
"step": 6350
},
{
"epoch": 1.6896918172157278,
"grad_norm": 218658.8125,
"learning_rate": 8.73538788522848e-06,
"loss": 3.0762,
"step": 6360
},
{
"epoch": 1.6923485653560042,
"grad_norm": 3747502.5,
"learning_rate": 8.717676230959973e-06,
"loss": 3.7819,
"step": 6370
},
{
"epoch": 1.6950053134962806,
"grad_norm": 402977.15625,
"learning_rate": 8.699964576691463e-06,
"loss": 3.2238,
"step": 6380
},
{
"epoch": 1.697662061636557,
"grad_norm": 354610.0,
"learning_rate": 8.682252922422954e-06,
"loss": 3.5365,
"step": 6390
},
{
"epoch": 1.7003188097768331,
"grad_norm": 737137.25,
"learning_rate": 8.664541268154446e-06,
"loss": 3.7334,
"step": 6400
},
{
"epoch": 1.7029755579171093,
"grad_norm": 270020.3125,
"learning_rate": 8.646829613885937e-06,
"loss": 3.6183,
"step": 6410
},
{
"epoch": 1.7056323060573857,
"grad_norm": 740626.4375,
"learning_rate": 8.629117959617429e-06,
"loss": 3.7487,
"step": 6420
},
{
"epoch": 1.708289054197662,
"grad_norm": 1305229.75,
"learning_rate": 8.61140630534892e-06,
"loss": 3.7039,
"step": 6430
},
{
"epoch": 1.7109458023379385,
"grad_norm": 172010.875,
"learning_rate": 8.593694651080412e-06,
"loss": 2.9064,
"step": 6440
},
{
"epoch": 1.7136025504782146,
"grad_norm": 36386.55859375,
"learning_rate": 8.575982996811903e-06,
"loss": 3.5462,
"step": 6450
},
{
"epoch": 1.7162592986184908,
"grad_norm": 280424.5,
"learning_rate": 8.558271342543393e-06,
"loss": 3.7119,
"step": 6460
},
{
"epoch": 1.7189160467587672,
"grad_norm": 65134.73828125,
"learning_rate": 8.540559688274885e-06,
"loss": 4.058,
"step": 6470
},
{
"epoch": 1.7215727948990436,
"grad_norm": 66937.53125,
"learning_rate": 8.522848034006378e-06,
"loss": 3.3975,
"step": 6480
},
{
"epoch": 1.72422954303932,
"grad_norm": 131224.421875,
"learning_rate": 8.505136379737868e-06,
"loss": 3.4813,
"step": 6490
},
{
"epoch": 1.7268862911795961,
"grad_norm": 108172.1640625,
"learning_rate": 8.48742472546936e-06,
"loss": 3.1716,
"step": 6500
},
{
"epoch": 1.7295430393198725,
"grad_norm": 25198.029296875,
"learning_rate": 8.469713071200851e-06,
"loss": 3.6849,
"step": 6510
},
{
"epoch": 1.7321997874601487,
"grad_norm": 61498.03515625,
"learning_rate": 8.452001416932342e-06,
"loss": 3.4036,
"step": 6520
},
{
"epoch": 1.734856535600425,
"grad_norm": 442683.875,
"learning_rate": 8.434289762663834e-06,
"loss": 3.3497,
"step": 6530
},
{
"epoch": 1.7375132837407015,
"grad_norm": 27654.84765625,
"learning_rate": 8.416578108395324e-06,
"loss": 3.2324,
"step": 6540
},
{
"epoch": 1.7401700318809776,
"grad_norm": 87875.5546875,
"learning_rate": 8.398866454126817e-06,
"loss": 3.211,
"step": 6550
},
{
"epoch": 1.742826780021254,
"grad_norm": 443493.65625,
"learning_rate": 8.381154799858307e-06,
"loss": 3.5746,
"step": 6560
},
{
"epoch": 1.7454835281615302,
"grad_norm": 112091.3046875,
"learning_rate": 8.3634431455898e-06,
"loss": 3.2604,
"step": 6570
},
{
"epoch": 1.7481402763018066,
"grad_norm": 37516.62109375,
"learning_rate": 8.34573149132129e-06,
"loss": 3.3058,
"step": 6580
},
{
"epoch": 1.750797024442083,
"grad_norm": 98792.796875,
"learning_rate": 8.32801983705278e-06,
"loss": 3.4504,
"step": 6590
},
{
"epoch": 1.7534537725823593,
"grad_norm": 24296.8125,
"learning_rate": 8.310308182784273e-06,
"loss": 3.2476,
"step": 6600
},
{
"epoch": 1.7561105207226355,
"grad_norm": 27490.43359375,
"learning_rate": 8.292596528515764e-06,
"loss": 3.4551,
"step": 6610
},
{
"epoch": 1.7587672688629117,
"grad_norm": 163381.75,
"learning_rate": 8.274884874247256e-06,
"loss": 3.56,
"step": 6620
},
{
"epoch": 1.761424017003188,
"grad_norm": 5022.00244140625,
"learning_rate": 8.257173219978746e-06,
"loss": 3.2829,
"step": 6630
},
{
"epoch": 1.7640807651434645,
"grad_norm": 873426.5,
"learning_rate": 8.239461565710239e-06,
"loss": 3.292,
"step": 6640
},
{
"epoch": 1.7667375132837408,
"grad_norm": 48760.75390625,
"learning_rate": 8.22174991144173e-06,
"loss": 3.3971,
"step": 6650
},
{
"epoch": 1.769394261424017,
"grad_norm": 22562.328125,
"learning_rate": 8.20403825717322e-06,
"loss": 3.4901,
"step": 6660
},
{
"epoch": 1.7720510095642932,
"grad_norm": 110952.984375,
"learning_rate": 8.186326602904712e-06,
"loss": 3.5824,
"step": 6670
},
{
"epoch": 1.7747077577045696,
"grad_norm": 11664.615234375,
"learning_rate": 8.168614948636203e-06,
"loss": 3.6433,
"step": 6680
},
{
"epoch": 1.777364505844846,
"grad_norm": 296820.28125,
"learning_rate": 8.150903294367695e-06,
"loss": 3.4816,
"step": 6690
},
{
"epoch": 1.7800212539851223,
"grad_norm": 28750.556640625,
"learning_rate": 8.133191640099186e-06,
"loss": 3.4851,
"step": 6700
},
{
"epoch": 1.7826780021253985,
"grad_norm": 86309.7890625,
"learning_rate": 8.115479985830678e-06,
"loss": 3.3058,
"step": 6710
},
{
"epoch": 1.7853347502656747,
"grad_norm": 91584.7734375,
"learning_rate": 8.097768331562168e-06,
"loss": 3.7495,
"step": 6720
},
{
"epoch": 1.787991498405951,
"grad_norm": 132450.96875,
"learning_rate": 8.080056677293659e-06,
"loss": 3.4955,
"step": 6730
},
{
"epoch": 1.7906482465462275,
"grad_norm": 134387.046875,
"learning_rate": 8.062345023025151e-06,
"loss": 3.4655,
"step": 6740
},
{
"epoch": 1.7933049946865038,
"grad_norm": 74426.6875,
"learning_rate": 8.044633368756642e-06,
"loss": 3.7594,
"step": 6750
},
{
"epoch": 1.79596174282678,
"grad_norm": 58667.3984375,
"learning_rate": 8.026921714488134e-06,
"loss": 3.7655,
"step": 6760
},
{
"epoch": 1.7986184909670562,
"grad_norm": 130389.9140625,
"learning_rate": 8.009210060219625e-06,
"loss": 3.673,
"step": 6770
},
{
"epoch": 1.8012752391073326,
"grad_norm": 89147.9296875,
"learning_rate": 7.991498405951117e-06,
"loss": 3.2874,
"step": 6780
},
{
"epoch": 1.803931987247609,
"grad_norm": 44793.80859375,
"learning_rate": 7.973786751682607e-06,
"loss": 3.2517,
"step": 6790
},
{
"epoch": 1.8065887353878853,
"grad_norm": 15245.392578125,
"learning_rate": 7.956075097414098e-06,
"loss": 3.5179,
"step": 6800
},
{
"epoch": 1.8092454835281615,
"grad_norm": 15995.4912109375,
"learning_rate": 7.93836344314559e-06,
"loss": 3.6515,
"step": 6810
},
{
"epoch": 1.8119022316684377,
"grad_norm": 16524.787109375,
"learning_rate": 7.920651788877083e-06,
"loss": 3.1618,
"step": 6820
},
{
"epoch": 1.814558979808714,
"grad_norm": 42409.20703125,
"learning_rate": 7.902940134608573e-06,
"loss": 3.58,
"step": 6830
},
{
"epoch": 1.8172157279489904,
"grad_norm": 10542.6796875,
"learning_rate": 7.885228480340065e-06,
"loss": 3.508,
"step": 6840
},
{
"epoch": 1.8198724760892668,
"grad_norm": 25151.1484375,
"learning_rate": 7.867516826071556e-06,
"loss": 3.0635,
"step": 6850
},
{
"epoch": 1.822529224229543,
"grad_norm": 9499.1826171875,
"learning_rate": 7.849805171803047e-06,
"loss": 2.9901,
"step": 6860
},
{
"epoch": 1.8251859723698194,
"grad_norm": 54946.984375,
"learning_rate": 7.832093517534539e-06,
"loss": 2.9531,
"step": 6870
},
{
"epoch": 1.8278427205100956,
"grad_norm": 10790.599609375,
"learning_rate": 7.81438186326603e-06,
"loss": 3.5882,
"step": 6880
},
{
"epoch": 1.830499468650372,
"grad_norm": 13575.8759765625,
"learning_rate": 7.796670208997522e-06,
"loss": 3.3612,
"step": 6890
},
{
"epoch": 1.8331562167906483,
"grad_norm": 20945.48046875,
"learning_rate": 7.778958554729012e-06,
"loss": 3.0764,
"step": 6900
},
{
"epoch": 1.8358129649309245,
"grad_norm": 232869.03125,
"learning_rate": 7.761246900460504e-06,
"loss": 3.1716,
"step": 6910
},
{
"epoch": 1.8384697130712009,
"grad_norm": 43791.59765625,
"learning_rate": 7.743535246191995e-06,
"loss": 3.2311,
"step": 6920
},
{
"epoch": 1.841126461211477,
"grad_norm": 22579.091796875,
"learning_rate": 7.725823591923486e-06,
"loss": 3.4563,
"step": 6930
},
{
"epoch": 1.8437832093517534,
"grad_norm": 28530.806640625,
"learning_rate": 7.708111937654978e-06,
"loss": 3.452,
"step": 6940
},
{
"epoch": 1.8464399574920298,
"grad_norm": 12486.0390625,
"learning_rate": 7.690400283386468e-06,
"loss": 3.2791,
"step": 6950
},
{
"epoch": 1.8490967056323062,
"grad_norm": 17018.11328125,
"learning_rate": 7.67268862911796e-06,
"loss": 3.6792,
"step": 6960
},
{
"epoch": 1.8517534537725824,
"grad_norm": 16199.2470703125,
"learning_rate": 7.654976974849451e-06,
"loss": 3.2561,
"step": 6970
},
{
"epoch": 1.8544102019128585,
"grad_norm": 10388.2470703125,
"learning_rate": 7.637265320580944e-06,
"loss": 3.0233,
"step": 6980
},
{
"epoch": 1.857066950053135,
"grad_norm": 15407.7548828125,
"learning_rate": 7.619553666312433e-06,
"loss": 3.167,
"step": 6990
},
{
"epoch": 1.8597236981934113,
"grad_norm": 26815.095703125,
"learning_rate": 7.601842012043925e-06,
"loss": 3.2972,
"step": 7000
},
{
"epoch": 1.8623804463336877,
"grad_norm": 58698.21875,
"learning_rate": 7.584130357775417e-06,
"loss": 3.2334,
"step": 7010
},
{
"epoch": 1.8650371944739639,
"grad_norm": 27274.71875,
"learning_rate": 7.566418703506908e-06,
"loss": 3.1432,
"step": 7020
},
{
"epoch": 1.86769394261424,
"grad_norm": 83316.0703125,
"learning_rate": 7.5487070492384e-06,
"loss": 3.1284,
"step": 7030
},
{
"epoch": 1.8703506907545164,
"grad_norm": 30122.771484375,
"learning_rate": 7.530995394969891e-06,
"loss": 2.8904,
"step": 7040
},
{
"epoch": 1.8730074388947928,
"grad_norm": 40200.9609375,
"learning_rate": 7.513283740701383e-06,
"loss": 3.3255,
"step": 7050
},
{
"epoch": 1.8756641870350692,
"grad_norm": 16342.447265625,
"learning_rate": 7.495572086432873e-06,
"loss": 3.1073,
"step": 7060
},
{
"epoch": 1.8783209351753454,
"grad_norm": 14423.703125,
"learning_rate": 7.477860432164365e-06,
"loss": 3.4831,
"step": 7070
},
{
"epoch": 1.8809776833156215,
"grad_norm": 34366.14453125,
"learning_rate": 7.460148777895856e-06,
"loss": 3.2063,
"step": 7080
},
{
"epoch": 1.883634431455898,
"grad_norm": 70803.8359375,
"learning_rate": 7.4424371236273475e-06,
"loss": 3.5181,
"step": 7090
},
{
"epoch": 1.8862911795961743,
"grad_norm": 13800.69140625,
"learning_rate": 7.424725469358839e-06,
"loss": 3.4993,
"step": 7100
},
{
"epoch": 1.8889479277364507,
"grad_norm": 48057.68359375,
"learning_rate": 7.40701381509033e-06,
"loss": 3.1418,
"step": 7110
},
{
"epoch": 1.8916046758767269,
"grad_norm": 40145.4921875,
"learning_rate": 7.389302160821822e-06,
"loss": 3.3427,
"step": 7120
},
{
"epoch": 1.894261424017003,
"grad_norm": 13148.1484375,
"learning_rate": 7.371590506553312e-06,
"loss": 3.2584,
"step": 7130
},
{
"epoch": 1.8969181721572794,
"grad_norm": 10740.6826171875,
"learning_rate": 7.353878852284804e-06,
"loss": 2.8656,
"step": 7140
},
{
"epoch": 1.8995749202975558,
"grad_norm": 7270.3818359375,
"learning_rate": 7.336167198016295e-06,
"loss": 3.1929,
"step": 7150
},
{
"epoch": 1.9022316684378322,
"grad_norm": 3250.9072265625,
"learning_rate": 7.318455543747787e-06,
"loss": 3.3218,
"step": 7160
},
{
"epoch": 1.9048884165781084,
"grad_norm": 40904.6484375,
"learning_rate": 7.300743889479278e-06,
"loss": 3.0994,
"step": 7170
},
{
"epoch": 1.9075451647183845,
"grad_norm": 9426.0009765625,
"learning_rate": 7.2830322352107695e-06,
"loss": 3.2861,
"step": 7180
},
{
"epoch": 1.910201912858661,
"grad_norm": 10107.427734375,
"learning_rate": 7.265320580942261e-06,
"loss": 3.3694,
"step": 7190
},
{
"epoch": 1.9128586609989373,
"grad_norm": 25632.7734375,
"learning_rate": 7.2476089266737514e-06,
"loss": 3.1918,
"step": 7200
},
{
"epoch": 1.9155154091392137,
"grad_norm": 10823.2509765625,
"learning_rate": 7.229897272405243e-06,
"loss": 3.1984,
"step": 7210
},
{
"epoch": 1.9181721572794899,
"grad_norm": 8237.4482421875,
"learning_rate": 7.212185618136734e-06,
"loss": 2.7874,
"step": 7220
},
{
"epoch": 1.9208289054197663,
"grad_norm": 4823.09716796875,
"learning_rate": 7.194473963868226e-06,
"loss": 3.2625,
"step": 7230
},
{
"epoch": 1.9234856535600424,
"grad_norm": 6276.54150390625,
"learning_rate": 7.176762309599717e-06,
"loss": 3.1739,
"step": 7240
},
{
"epoch": 1.9261424017003188,
"grad_norm": 9979.935546875,
"learning_rate": 7.1590506553312085e-06,
"loss": 3.34,
"step": 7250
},
{
"epoch": 1.9287991498405952,
"grad_norm": 3373.656982421875,
"learning_rate": 7.141339001062701e-06,
"loss": 3.4366,
"step": 7260
},
{
"epoch": 1.9314558979808714,
"grad_norm": 9178.9404296875,
"learning_rate": 7.1236273467941905e-06,
"loss": 3.245,
"step": 7270
},
{
"epoch": 1.9341126461211477,
"grad_norm": 11173.3037109375,
"learning_rate": 7.105915692525682e-06,
"loss": 3.1306,
"step": 7280
},
{
"epoch": 1.936769394261424,
"grad_norm": 6969.20849609375,
"learning_rate": 7.088204038257173e-06,
"loss": 3.5482,
"step": 7290
},
{
"epoch": 1.9394261424017003,
"grad_norm": 22079.796875,
"learning_rate": 7.070492383988665e-06,
"loss": 3.2338,
"step": 7300
},
{
"epoch": 1.9420828905419767,
"grad_norm": 51803.05078125,
"learning_rate": 7.052780729720157e-06,
"loss": 3.1844,
"step": 7310
},
{
"epoch": 1.944739638682253,
"grad_norm": 17502.84375,
"learning_rate": 7.0350690754516485e-06,
"loss": 3.3796,
"step": 7320
},
{
"epoch": 1.9473963868225292,
"grad_norm": 4275.10009765625,
"learning_rate": 7.017357421183138e-06,
"loss": 3.0306,
"step": 7330
},
{
"epoch": 1.9500531349628054,
"grad_norm": 3620.85400390625,
"learning_rate": 6.99964576691463e-06,
"loss": 3.3635,
"step": 7340
},
{
"epoch": 1.9527098831030818,
"grad_norm": 32547.673828125,
"learning_rate": 6.981934112646122e-06,
"loss": 3.1764,
"step": 7350
},
{
"epoch": 1.9553666312433582,
"grad_norm": 5065.5751953125,
"learning_rate": 6.964222458377613e-06,
"loss": 3.1895,
"step": 7360
},
{
"epoch": 1.9580233793836346,
"grad_norm": 10395.2060546875,
"learning_rate": 6.946510804109105e-06,
"loss": 3.1655,
"step": 7370
},
{
"epoch": 1.9606801275239107,
"grad_norm": 4557.41796875,
"learning_rate": 6.928799149840596e-06,
"loss": 3.0581,
"step": 7380
},
{
"epoch": 1.963336875664187,
"grad_norm": 38417.4765625,
"learning_rate": 6.911087495572088e-06,
"loss": 3.0893,
"step": 7390
},
{
"epoch": 1.9659936238044633,
"grad_norm": 5107.16796875,
"learning_rate": 6.893375841303578e-06,
"loss": 3.4167,
"step": 7400
},
{
"epoch": 1.9686503719447397,
"grad_norm": 5035.6201171875,
"learning_rate": 6.87566418703507e-06,
"loss": 3.0664,
"step": 7410
},
{
"epoch": 1.971307120085016,
"grad_norm": 12651.587890625,
"learning_rate": 6.857952532766561e-06,
"loss": 3.1299,
"step": 7420
},
{
"epoch": 1.9739638682252922,
"grad_norm": 7539.5400390625,
"learning_rate": 6.840240878498052e-06,
"loss": 3.0492,
"step": 7430
},
{
"epoch": 1.9766206163655684,
"grad_norm": 5577.158203125,
"learning_rate": 6.822529224229544e-06,
"loss": 3.098,
"step": 7440
},
{
"epoch": 1.9792773645058448,
"grad_norm": 41558.4921875,
"learning_rate": 6.804817569961035e-06,
"loss": 3.2933,
"step": 7450
},
{
"epoch": 1.9819341126461212,
"grad_norm": 3775.939697265625,
"learning_rate": 6.787105915692527e-06,
"loss": 2.9667,
"step": 7460
},
{
"epoch": 1.9845908607863976,
"grad_norm": 30318.9921875,
"learning_rate": 6.769394261424017e-06,
"loss": 3.0666,
"step": 7470
},
{
"epoch": 1.9872476089266737,
"grad_norm": 21865.806640625,
"learning_rate": 6.751682607155509e-06,
"loss": 2.9213,
"step": 7480
},
{
"epoch": 1.98990435706695,
"grad_norm": 10458.220703125,
"learning_rate": 6.733970952887e-06,
"loss": 3.2567,
"step": 7490
},
{
"epoch": 1.9925611052072263,
"grad_norm": 14638.0439453125,
"learning_rate": 6.7162592986184915e-06,
"loss": 3.2132,
"step": 7500
}
],
"logging_steps": 10,
"max_steps": 11292,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7836212920320000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}