{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9925611052072263, "eval_steps": 500, "global_step": 7500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002656748140276302, "grad_norm": 207.1781768798828, "learning_rate": 1.9982288345731494e-05, "loss": 6.5351, "step": 10 }, { "epoch": 0.005313496280552604, "grad_norm": 265.0539855957031, "learning_rate": 1.9964576691462986e-05, "loss": 5.2822, "step": 20 }, { "epoch": 0.007970244420828906, "grad_norm": 1933.7158203125, "learning_rate": 1.9946865037194475e-05, "loss": 4.7764, "step": 30 }, { "epoch": 0.010626992561105207, "grad_norm": 896.5211791992188, "learning_rate": 1.9929153382925967e-05, "loss": 4.5617, "step": 40 }, { "epoch": 0.013283740701381509, "grad_norm": 2147.634765625, "learning_rate": 1.991144172865746e-05, "loss": 4.5559, "step": 50 }, { "epoch": 0.015940488841657812, "grad_norm": 1384.8623046875, "learning_rate": 1.9893730074388952e-05, "loss": 4.1671, "step": 60 }, { "epoch": 0.018597236981934114, "grad_norm": 3381.87060546875, "learning_rate": 1.987601842012044e-05, "loss": 3.9988, "step": 70 }, { "epoch": 0.021253985122210415, "grad_norm": 398.0505676269531, "learning_rate": 1.985830676585193e-05, "loss": 4.0844, "step": 80 }, { "epoch": 0.023910733262486716, "grad_norm": 2040.54736328125, "learning_rate": 1.9840595111583422e-05, "loss": 3.9493, "step": 90 }, { "epoch": 0.026567481402763018, "grad_norm": 8612.021484375, "learning_rate": 1.9822883457314914e-05, "loss": 3.6944, "step": 100 }, { "epoch": 0.02922422954303932, "grad_norm": 22271.3125, "learning_rate": 1.9805171803046406e-05, "loss": 4.1335, "step": 110 }, { "epoch": 0.031880977683315624, "grad_norm": 5334.6806640625, "learning_rate": 1.97874601487779e-05, "loss": 3.9284, "step": 120 }, { "epoch": 0.03453772582359192, "grad_norm": 1616.4825439453125, "learning_rate": 1.9769748494509388e-05, "loss": 3.9407, "step": 130 }, { "epoch": 0.03719447396386823, "grad_norm": 137.30589294433594, "learning_rate": 1.975203684024088e-05, "loss": 3.7372, "step": 140 }, { "epoch": 0.039851222104144525, "grad_norm": 2417.81982421875, "learning_rate": 1.9734325185972372e-05, "loss": 3.6944, "step": 150 }, { "epoch": 0.04250797024442083, "grad_norm": 7971.87451171875, "learning_rate": 1.9716613531703864e-05, "loss": 3.6615, "step": 160 }, { "epoch": 0.04516471838469713, "grad_norm": 1645.13916015625, "learning_rate": 1.9698901877435353e-05, "loss": 3.4582, "step": 170 }, { "epoch": 0.04782146652497343, "grad_norm": 2899.1162109375, "learning_rate": 1.9681190223166846e-05, "loss": 3.4193, "step": 180 }, { "epoch": 0.05047821466524974, "grad_norm": 13782.0908203125, "learning_rate": 1.9663478568898338e-05, "loss": 3.577, "step": 190 }, { "epoch": 0.053134962805526036, "grad_norm": 7818.07177734375, "learning_rate": 1.964576691462983e-05, "loss": 3.2082, "step": 200 }, { "epoch": 0.05579171094580234, "grad_norm": 14882.34375, "learning_rate": 1.962805526036132e-05, "loss": 3.1947, "step": 210 }, { "epoch": 0.05844845908607864, "grad_norm": 27526.642578125, "learning_rate": 1.961034360609281e-05, "loss": 3.23, "step": 220 }, { "epoch": 0.06110520722635494, "grad_norm": 9511.650390625, "learning_rate": 1.95926319518243e-05, "loss": 3.0386, "step": 230 }, { "epoch": 0.06376195536663125, "grad_norm": 2172.15625, "learning_rate": 1.9574920297555792e-05, "loss": 3.1756, "step": 240 }, { "epoch": 0.06641870350690754, "grad_norm": 11950.30078125, "learning_rate": 1.9557208643287285e-05, "loss": 3.36, "step": 250 }, { "epoch": 0.06907545164718384, "grad_norm": 17726.330078125, "learning_rate": 1.9539496989018777e-05, "loss": 3.0231, "step": 260 }, { "epoch": 0.07173219978746015, "grad_norm": 4690.27587890625, "learning_rate": 1.9521785334750266e-05, "loss": 3.0029, "step": 270 }, { "epoch": 0.07438894792773645, "grad_norm": 40308.61328125, "learning_rate": 1.9504073680481758e-05, "loss": 3.688, "step": 280 }, { "epoch": 0.07704569606801276, "grad_norm": 27147.087890625, "learning_rate": 1.948636202621325e-05, "loss": 3.3881, "step": 290 }, { "epoch": 0.07970244420828905, "grad_norm": 59977.046875, "learning_rate": 1.9468650371944743e-05, "loss": 3.4571, "step": 300 }, { "epoch": 0.08235919234856535, "grad_norm": 66940.046875, "learning_rate": 1.9450938717676235e-05, "loss": 3.3864, "step": 310 }, { "epoch": 0.08501594048884166, "grad_norm": 5094.89013671875, "learning_rate": 1.9433227063407724e-05, "loss": 3.3697, "step": 320 }, { "epoch": 0.08767268862911796, "grad_norm": 4367.36474609375, "learning_rate": 1.9415515409139216e-05, "loss": 3.3016, "step": 330 }, { "epoch": 0.09032943676939426, "grad_norm": 7941.5458984375, "learning_rate": 1.9397803754870705e-05, "loss": 3.0374, "step": 340 }, { "epoch": 0.09298618490967056, "grad_norm": 3960.741943359375, "learning_rate": 1.9380092100602197e-05, "loss": 3.3324, "step": 350 }, { "epoch": 0.09564293304994687, "grad_norm": 18565.732421875, "learning_rate": 1.936238044633369e-05, "loss": 3.2402, "step": 360 }, { "epoch": 0.09829968119022317, "grad_norm": 66859.0, "learning_rate": 1.9344668792065178e-05, "loss": 3.3142, "step": 370 }, { "epoch": 0.10095642933049948, "grad_norm": 1521.879638671875, "learning_rate": 1.932695713779667e-05, "loss": 3.0546, "step": 380 }, { "epoch": 0.10361317747077577, "grad_norm": 12662.775390625, "learning_rate": 1.9309245483528163e-05, "loss": 3.5396, "step": 390 }, { "epoch": 0.10626992561105207, "grad_norm": 105807.59375, "learning_rate": 1.9291533829259655e-05, "loss": 3.5301, "step": 400 }, { "epoch": 0.10892667375132838, "grad_norm": 663547.875, "learning_rate": 1.9273822174991147e-05, "loss": 4.28, "step": 410 }, { "epoch": 0.11158342189160468, "grad_norm": 8186676.0, "learning_rate": 1.9256110520722636e-05, "loss": 5.9807, "step": 420 }, { "epoch": 0.11424017003188097, "grad_norm": 2142551.25, "learning_rate": 1.923839886645413e-05, "loss": 9.4764, "step": 430 }, { "epoch": 0.11689691817215728, "grad_norm": 366486.1875, "learning_rate": 1.922068721218562e-05, "loss": 10.9151, "step": 440 }, { "epoch": 0.11955366631243358, "grad_norm": 2276693.0, "learning_rate": 1.9202975557917113e-05, "loss": 12.5549, "step": 450 }, { "epoch": 0.12221041445270989, "grad_norm": 2184425.5, "learning_rate": 1.9185263903648602e-05, "loss": 13.1915, "step": 460 }, { "epoch": 0.12486716259298619, "grad_norm": 2937578.75, "learning_rate": 1.9167552249380094e-05, "loss": 14.2279, "step": 470 }, { "epoch": 0.1275239107332625, "grad_norm": 10091141.0, "learning_rate": 1.9149840595111583e-05, "loss": 13.4766, "step": 480 }, { "epoch": 0.1301806588735388, "grad_norm": 5426885.5, "learning_rate": 1.9132128940843075e-05, "loss": 14.8065, "step": 490 }, { "epoch": 0.13283740701381508, "grad_norm": 2068535.25, "learning_rate": 1.9114417286574568e-05, "loss": 16.3781, "step": 500 }, { "epoch": 0.13549415515409138, "grad_norm": 3599295.0, "learning_rate": 1.909670563230606e-05, "loss": 15.1519, "step": 510 }, { "epoch": 0.1381509032943677, "grad_norm": 761431.875, "learning_rate": 1.907899397803755e-05, "loss": 15.1124, "step": 520 }, { "epoch": 0.140807651434644, "grad_norm": 933641.375, "learning_rate": 1.906128232376904e-05, "loss": 14.1038, "step": 530 }, { "epoch": 0.1434643995749203, "grad_norm": 423861.0625, "learning_rate": 1.9043570669500533e-05, "loss": 13.7131, "step": 540 }, { "epoch": 0.1461211477151966, "grad_norm": 5383.50537109375, "learning_rate": 1.9025859015232026e-05, "loss": 12.8075, "step": 550 }, { "epoch": 0.1487778958554729, "grad_norm": 3759.12548828125, "learning_rate": 1.9008147360963514e-05, "loss": 10.7237, "step": 560 }, { "epoch": 0.1514346439957492, "grad_norm": 2150.089111328125, "learning_rate": 1.8990435706695007e-05, "loss": 7.2887, "step": 570 }, { "epoch": 0.15409139213602552, "grad_norm": 3893.645751953125, "learning_rate": 1.89727240524265e-05, "loss": 4.8237, "step": 580 }, { "epoch": 0.1567481402763018, "grad_norm": 11881.3046875, "learning_rate": 1.895501239815799e-05, "loss": 3.9525, "step": 590 }, { "epoch": 0.1594048884165781, "grad_norm": 14820.740234375, "learning_rate": 1.8937300743889483e-05, "loss": 4.6401, "step": 600 }, { "epoch": 0.1620616365568544, "grad_norm": 99031.640625, "learning_rate": 1.8919589089620972e-05, "loss": 4.9725, "step": 610 }, { "epoch": 0.1647183846971307, "grad_norm": 47882.5859375, "learning_rate": 1.890187743535246e-05, "loss": 4.6917, "step": 620 }, { "epoch": 0.16737513283740701, "grad_norm": 77129.8046875, "learning_rate": 1.8884165781083953e-05, "loss": 4.3883, "step": 630 }, { "epoch": 0.17003188097768332, "grad_norm": 85341.125, "learning_rate": 1.8866454126815446e-05, "loss": 5.114, "step": 640 }, { "epoch": 0.17268862911795962, "grad_norm": 34883.13671875, "learning_rate": 1.8848742472546938e-05, "loss": 4.9715, "step": 650 }, { "epoch": 0.17534537725823593, "grad_norm": 22649.3359375, "learning_rate": 1.8831030818278427e-05, "loss": 4.9266, "step": 660 }, { "epoch": 0.17800212539851223, "grad_norm": 59614.453125, "learning_rate": 1.881331916400992e-05, "loss": 4.3894, "step": 670 }, { "epoch": 0.1806588735387885, "grad_norm": 13419.771484375, "learning_rate": 1.879560750974141e-05, "loss": 4.238, "step": 680 }, { "epoch": 0.18331562167906482, "grad_norm": 26652.462890625, "learning_rate": 1.8777895855472904e-05, "loss": 4.5253, "step": 690 }, { "epoch": 0.18597236981934112, "grad_norm": 37440.6015625, "learning_rate": 1.8760184201204396e-05, "loss": 4.0546, "step": 700 }, { "epoch": 0.18862911795961743, "grad_norm": 43147.1796875, "learning_rate": 1.8742472546935885e-05, "loss": 4.4831, "step": 710 }, { "epoch": 0.19128586609989373, "grad_norm": 143355.296875, "learning_rate": 1.8724760892667377e-05, "loss": 4.5257, "step": 720 }, { "epoch": 0.19394261424017004, "grad_norm": 12484.8466796875, "learning_rate": 1.870704923839887e-05, "loss": 4.9662, "step": 730 }, { "epoch": 0.19659936238044634, "grad_norm": 10305.0126953125, "learning_rate": 1.868933758413036e-05, "loss": 5.3629, "step": 740 }, { "epoch": 0.19925611052072265, "grad_norm": 3247.491943359375, "learning_rate": 1.867162592986185e-05, "loss": 5.014, "step": 750 }, { "epoch": 0.20191285866099895, "grad_norm": 2328.57470703125, "learning_rate": 1.8653914275593343e-05, "loss": 4.9864, "step": 760 }, { "epoch": 0.20456960680127523, "grad_norm": 16007.7978515625, "learning_rate": 1.863620262132483e-05, "loss": 4.5492, "step": 770 }, { "epoch": 0.20722635494155153, "grad_norm": 39521.5078125, "learning_rate": 1.8618490967056324e-05, "loss": 4.4608, "step": 780 }, { "epoch": 0.20988310308182784, "grad_norm": 553922.0, "learning_rate": 1.8600779312787816e-05, "loss": 4.9998, "step": 790 }, { "epoch": 0.21253985122210414, "grad_norm": 623164.25, "learning_rate": 1.858306765851931e-05, "loss": 4.6969, "step": 800 }, { "epoch": 0.21519659936238045, "grad_norm": 849724.3125, "learning_rate": 1.8565356004250797e-05, "loss": 5.2992, "step": 810 }, { "epoch": 0.21785334750265675, "grad_norm": 1883489.125, "learning_rate": 1.854764434998229e-05, "loss": 5.5446, "step": 820 }, { "epoch": 0.22051009564293306, "grad_norm": 1473608.5, "learning_rate": 1.8529932695713782e-05, "loss": 5.6081, "step": 830 }, { "epoch": 0.22316684378320936, "grad_norm": 6046079.5, "learning_rate": 1.8512221041445274e-05, "loss": 5.543, "step": 840 }, { "epoch": 0.22582359192348567, "grad_norm": 3414641.75, "learning_rate": 1.8494509387176763e-05, "loss": 6.5477, "step": 850 }, { "epoch": 0.22848034006376194, "grad_norm": 3107066.0, "learning_rate": 1.8476797732908255e-05, "loss": 6.6238, "step": 860 }, { "epoch": 0.23113708820403825, "grad_norm": 2057658.75, "learning_rate": 1.8459086078639748e-05, "loss": 6.6566, "step": 870 }, { "epoch": 0.23379383634431455, "grad_norm": 689954.125, "learning_rate": 1.8441374424371236e-05, "loss": 5.6908, "step": 880 }, { "epoch": 0.23645058448459086, "grad_norm": 5757.73388671875, "learning_rate": 1.842366277010273e-05, "loss": 4.5477, "step": 890 }, { "epoch": 0.23910733262486716, "grad_norm": 5359.6728515625, "learning_rate": 1.840595111583422e-05, "loss": 3.6785, "step": 900 }, { "epoch": 0.24176408076514347, "grad_norm": 2013.8673095703125, "learning_rate": 1.838823946156571e-05, "loss": 3.519, "step": 910 }, { "epoch": 0.24442082890541977, "grad_norm": 6289.10888671875, "learning_rate": 1.8370527807297202e-05, "loss": 3.6842, "step": 920 }, { "epoch": 0.24707757704569608, "grad_norm": 3089.353759765625, "learning_rate": 1.8352816153028694e-05, "loss": 3.6535, "step": 930 }, { "epoch": 0.24973432518597238, "grad_norm": 2002.3780517578125, "learning_rate": 1.8335104498760187e-05, "loss": 3.5385, "step": 940 }, { "epoch": 0.25239107332624866, "grad_norm": 5194.0224609375, "learning_rate": 1.8317392844491676e-05, "loss": 3.4652, "step": 950 }, { "epoch": 0.255047821466525, "grad_norm": 2200.886962890625, "learning_rate": 1.8299681190223168e-05, "loss": 3.6788, "step": 960 }, { "epoch": 0.25770456960680127, "grad_norm": 10148.009765625, "learning_rate": 1.828196953595466e-05, "loss": 3.7478, "step": 970 }, { "epoch": 0.2603613177470776, "grad_norm": 2540.3837890625, "learning_rate": 1.8264257881686152e-05, "loss": 3.4836, "step": 980 }, { "epoch": 0.2630180658873539, "grad_norm": 2385.15625, "learning_rate": 1.8246546227417645e-05, "loss": 3.2733, "step": 990 }, { "epoch": 0.26567481402763016, "grad_norm": 8635.650390625, "learning_rate": 1.8228834573149134e-05, "loss": 3.4935, "step": 1000 }, { "epoch": 0.2683315621679065, "grad_norm": 17405.947265625, "learning_rate": 1.8211122918880626e-05, "loss": 3.3743, "step": 1010 }, { "epoch": 0.27098831030818277, "grad_norm": 2616.988037109375, "learning_rate": 1.8193411264612115e-05, "loss": 4.0444, "step": 1020 }, { "epoch": 0.2736450584484591, "grad_norm": 9487.044921875, "learning_rate": 1.8175699610343607e-05, "loss": 3.8644, "step": 1030 }, { "epoch": 0.2763018065887354, "grad_norm": 681.0313110351562, "learning_rate": 1.81579879560751e-05, "loss": 3.2198, "step": 1040 }, { "epoch": 0.2789585547290117, "grad_norm": 1654.2945556640625, "learning_rate": 1.8140276301806588e-05, "loss": 3.741, "step": 1050 }, { "epoch": 0.281615302869288, "grad_norm": 2555.9970703125, "learning_rate": 1.812256464753808e-05, "loss": 3.5377, "step": 1060 }, { "epoch": 0.2842720510095643, "grad_norm": 1187.751220703125, "learning_rate": 1.8104852993269573e-05, "loss": 3.6048, "step": 1070 }, { "epoch": 0.2869287991498406, "grad_norm": 2747.8486328125, "learning_rate": 1.8087141339001065e-05, "loss": 3.6148, "step": 1080 }, { "epoch": 0.2895855472901169, "grad_norm": 624.16650390625, "learning_rate": 1.8069429684732557e-05, "loss": 3.0917, "step": 1090 }, { "epoch": 0.2922422954303932, "grad_norm": 283.41033935546875, "learning_rate": 1.8051718030464046e-05, "loss": 3.4423, "step": 1100 }, { "epoch": 0.2948990435706695, "grad_norm": 563.9237670898438, "learning_rate": 1.8034006376195538e-05, "loss": 3.1134, "step": 1110 }, { "epoch": 0.2975557917109458, "grad_norm": 419.8347473144531, "learning_rate": 1.801629472192703e-05, "loss": 3.3765, "step": 1120 }, { "epoch": 0.3002125398512221, "grad_norm": 328.199462890625, "learning_rate": 1.7998583067658523e-05, "loss": 3.1981, "step": 1130 }, { "epoch": 0.3028692879914984, "grad_norm": 1167.4515380859375, "learning_rate": 1.7980871413390012e-05, "loss": 2.9826, "step": 1140 }, { "epoch": 0.3055260361317747, "grad_norm": 1590.5523681640625, "learning_rate": 1.7963159759121504e-05, "loss": 3.2378, "step": 1150 }, { "epoch": 0.30818278427205104, "grad_norm": 1228.88037109375, "learning_rate": 1.7945448104852993e-05, "loss": 3.2167, "step": 1160 }, { "epoch": 0.3108395324123273, "grad_norm": 866.290283203125, "learning_rate": 1.7927736450584485e-05, "loss": 2.9749, "step": 1170 }, { "epoch": 0.3134962805526036, "grad_norm": 326.7938537597656, "learning_rate": 1.7910024796315977e-05, "loss": 3.111, "step": 1180 }, { "epoch": 0.3161530286928799, "grad_norm": 603.0250854492188, "learning_rate": 1.789231314204747e-05, "loss": 3.1647, "step": 1190 }, { "epoch": 0.3188097768331562, "grad_norm": 553.5940551757812, "learning_rate": 1.787460148777896e-05, "loss": 3.1094, "step": 1200 }, { "epoch": 0.32146652497343253, "grad_norm": 417.6220703125, "learning_rate": 1.785688983351045e-05, "loss": 3.195, "step": 1210 }, { "epoch": 0.3241232731137088, "grad_norm": 745.7908935546875, "learning_rate": 1.7839178179241943e-05, "loss": 2.8119, "step": 1220 }, { "epoch": 0.32678002125398514, "grad_norm": 963.697021484375, "learning_rate": 1.7821466524973435e-05, "loss": 2.9828, "step": 1230 }, { "epoch": 0.3294367693942614, "grad_norm": 3789.7373046875, "learning_rate": 1.7803754870704924e-05, "loss": 2.8971, "step": 1240 }, { "epoch": 0.33209351753453775, "grad_norm": 1777.551025390625, "learning_rate": 1.7786043216436416e-05, "loss": 2.8533, "step": 1250 }, { "epoch": 0.33475026567481403, "grad_norm": 725.1536254882812, "learning_rate": 1.776833156216791e-05, "loss": 2.6644, "step": 1260 }, { "epoch": 0.3374070138150903, "grad_norm": 2410.62060546875, "learning_rate": 1.77506199078994e-05, "loss": 3.058, "step": 1270 }, { "epoch": 0.34006376195536664, "grad_norm": 825.2067260742188, "learning_rate": 1.7732908253630893e-05, "loss": 2.7154, "step": 1280 }, { "epoch": 0.3427205100956429, "grad_norm": 835.7099609375, "learning_rate": 1.7715196599362382e-05, "loss": 3.5358, "step": 1290 }, { "epoch": 0.34537725823591925, "grad_norm": 2334.035888671875, "learning_rate": 1.769748494509387e-05, "loss": 3.2141, "step": 1300 }, { "epoch": 0.3480340063761955, "grad_norm": 1089.702392578125, "learning_rate": 1.7679773290825363e-05, "loss": 2.8534, "step": 1310 }, { "epoch": 0.35069075451647186, "grad_norm": 643.6981811523438, "learning_rate": 1.7662061636556856e-05, "loss": 3.14, "step": 1320 }, { "epoch": 0.35334750265674814, "grad_norm": 927.3551025390625, "learning_rate": 1.7644349982288348e-05, "loss": 3.255, "step": 1330 }, { "epoch": 0.35600425079702447, "grad_norm": 642.1421508789062, "learning_rate": 1.7626638328019837e-05, "loss": 2.9875, "step": 1340 }, { "epoch": 0.35866099893730075, "grad_norm": 1514.4876708984375, "learning_rate": 1.760892667375133e-05, "loss": 2.7786, "step": 1350 }, { "epoch": 0.361317747077577, "grad_norm": 2913.84912109375, "learning_rate": 1.759121501948282e-05, "loss": 2.83, "step": 1360 }, { "epoch": 0.36397449521785336, "grad_norm": 1152.3695068359375, "learning_rate": 1.7573503365214314e-05, "loss": 3.316, "step": 1370 }, { "epoch": 0.36663124335812963, "grad_norm": 2364.73876953125, "learning_rate": 1.7555791710945806e-05, "loss": 3.1473, "step": 1380 }, { "epoch": 0.36928799149840597, "grad_norm": 1560.827392578125, "learning_rate": 1.7538080056677295e-05, "loss": 2.875, "step": 1390 }, { "epoch": 0.37194473963868224, "grad_norm": 672.7749633789062, "learning_rate": 1.7520368402408787e-05, "loss": 3.3416, "step": 1400 }, { "epoch": 0.3746014877789586, "grad_norm": 3212.583740234375, "learning_rate": 1.750265674814028e-05, "loss": 2.6347, "step": 1410 }, { "epoch": 0.37725823591923485, "grad_norm": 9892.419921875, "learning_rate": 1.7484945093871768e-05, "loss": 2.9356, "step": 1420 }, { "epoch": 0.3799149840595112, "grad_norm": 13098.6201171875, "learning_rate": 1.746723343960326e-05, "loss": 3.0818, "step": 1430 }, { "epoch": 0.38257173219978746, "grad_norm": 33038.46484375, "learning_rate": 1.7449521785334753e-05, "loss": 3.4073, "step": 1440 }, { "epoch": 0.38522848034006374, "grad_norm": 58945.421875, "learning_rate": 1.743181013106624e-05, "loss": 3.4505, "step": 1450 }, { "epoch": 0.38788522848034007, "grad_norm": 53823.19921875, "learning_rate": 1.7414098476797734e-05, "loss": 3.4398, "step": 1460 }, { "epoch": 0.39054197662061635, "grad_norm": 213358.46875, "learning_rate": 1.7396386822529226e-05, "loss": 3.1337, "step": 1470 }, { "epoch": 0.3931987247608927, "grad_norm": 174113.078125, "learning_rate": 1.7378675168260718e-05, "loss": 3.6872, "step": 1480 }, { "epoch": 0.39585547290116896, "grad_norm": 110265.9609375, "learning_rate": 1.7360963513992207e-05, "loss": 3.5268, "step": 1490 }, { "epoch": 0.3985122210414453, "grad_norm": 125626.78125, "learning_rate": 1.73432518597237e-05, "loss": 3.8027, "step": 1500 }, { "epoch": 0.40116896918172157, "grad_norm": 119383.8359375, "learning_rate": 1.7325540205455192e-05, "loss": 3.6381, "step": 1510 }, { "epoch": 0.4038257173219979, "grad_norm": 78246.125, "learning_rate": 1.7307828551186684e-05, "loss": 3.6688, "step": 1520 }, { "epoch": 0.4064824654622742, "grad_norm": 77016.8671875, "learning_rate": 1.7290116896918173e-05, "loss": 3.7796, "step": 1530 }, { "epoch": 0.40913921360255046, "grad_norm": 471759.21875, "learning_rate": 1.7272405242649665e-05, "loss": 3.738, "step": 1540 }, { "epoch": 0.4117959617428268, "grad_norm": 108969.1171875, "learning_rate": 1.7254693588381157e-05, "loss": 3.4583, "step": 1550 }, { "epoch": 0.41445270988310307, "grad_norm": 44717.91015625, "learning_rate": 1.7236981934112646e-05, "loss": 3.0156, "step": 1560 }, { "epoch": 0.4171094580233794, "grad_norm": 56418.765625, "learning_rate": 1.721927027984414e-05, "loss": 3.339, "step": 1570 }, { "epoch": 0.4197662061636557, "grad_norm": 82086.234375, "learning_rate": 1.720155862557563e-05, "loss": 3.2477, "step": 1580 }, { "epoch": 0.422422954303932, "grad_norm": 38437.12890625, "learning_rate": 1.718384697130712e-05, "loss": 3.0923, "step": 1590 }, { "epoch": 0.4250797024442083, "grad_norm": 64070.26953125, "learning_rate": 1.7166135317038612e-05, "loss": 3.8784, "step": 1600 }, { "epoch": 0.4277364505844846, "grad_norm": 96363.0078125, "learning_rate": 1.7148423662770104e-05, "loss": 3.1945, "step": 1610 }, { "epoch": 0.4303931987247609, "grad_norm": 101021.7578125, "learning_rate": 1.7130712008501596e-05, "loss": 2.9785, "step": 1620 }, { "epoch": 0.43304994686503717, "grad_norm": 33741.50390625, "learning_rate": 1.7113000354233085e-05, "loss": 3.0544, "step": 1630 }, { "epoch": 0.4357066950053135, "grad_norm": 18486.07421875, "learning_rate": 1.7095288699964578e-05, "loss": 3.3951, "step": 1640 }, { "epoch": 0.4383634431455898, "grad_norm": 141817.4375, "learning_rate": 1.707757704569607e-05, "loss": 3.8719, "step": 1650 }, { "epoch": 0.4410201912858661, "grad_norm": 18356.125, "learning_rate": 1.7059865391427562e-05, "loss": 3.217, "step": 1660 }, { "epoch": 0.4436769394261424, "grad_norm": 75286.890625, "learning_rate": 1.7042153737159054e-05, "loss": 3.2279, "step": 1670 }, { "epoch": 0.4463336875664187, "grad_norm": 93692.8671875, "learning_rate": 1.7024442082890543e-05, "loss": 3.3421, "step": 1680 }, { "epoch": 0.448990435706695, "grad_norm": 137171.109375, "learning_rate": 1.7006730428622032e-05, "loss": 3.4727, "step": 1690 }, { "epoch": 0.45164718384697133, "grad_norm": 143812.296875, "learning_rate": 1.6989018774353524e-05, "loss": 3.24, "step": 1700 }, { "epoch": 0.4543039319872476, "grad_norm": 35345.19921875, "learning_rate": 1.6971307120085017e-05, "loss": 3.2903, "step": 1710 }, { "epoch": 0.4569606801275239, "grad_norm": 69917.4375, "learning_rate": 1.695359546581651e-05, "loss": 3.1309, "step": 1720 }, { "epoch": 0.4596174282678002, "grad_norm": 71451.5859375, "learning_rate": 1.6935883811547998e-05, "loss": 3.8151, "step": 1730 }, { "epoch": 0.4622741764080765, "grad_norm": 54897.4375, "learning_rate": 1.691817215727949e-05, "loss": 3.7961, "step": 1740 }, { "epoch": 0.46493092454835283, "grad_norm": 42574.12109375, "learning_rate": 1.6900460503010982e-05, "loss": 3.3018, "step": 1750 }, { "epoch": 0.4675876726886291, "grad_norm": 118568.609375, "learning_rate": 1.6882748848742475e-05, "loss": 3.4044, "step": 1760 }, { "epoch": 0.47024442082890544, "grad_norm": 141536.96875, "learning_rate": 1.6865037194473967e-05, "loss": 3.5705, "step": 1770 }, { "epoch": 0.4729011689691817, "grad_norm": 153274.9375, "learning_rate": 1.6847325540205456e-05, "loss": 3.7034, "step": 1780 }, { "epoch": 0.47555791710945805, "grad_norm": 121872.7890625, "learning_rate": 1.6829613885936948e-05, "loss": 3.6836, "step": 1790 }, { "epoch": 0.4782146652497343, "grad_norm": 101665.6640625, "learning_rate": 1.681190223166844e-05, "loss": 3.5983, "step": 1800 }, { "epoch": 0.4808714133900106, "grad_norm": 212873.5, "learning_rate": 1.6794190577399933e-05, "loss": 3.3915, "step": 1810 }, { "epoch": 0.48352816153028694, "grad_norm": 19234.345703125, "learning_rate": 1.677647892313142e-05, "loss": 3.1403, "step": 1820 }, { "epoch": 0.4861849096705632, "grad_norm": 126968.46875, "learning_rate": 1.6758767268862914e-05, "loss": 3.3559, "step": 1830 }, { "epoch": 0.48884165781083955, "grad_norm": 40483.28515625, "learning_rate": 1.6741055614594403e-05, "loss": 3.4042, "step": 1840 }, { "epoch": 0.4914984059511158, "grad_norm": 281826.84375, "learning_rate": 1.6723343960325895e-05, "loss": 3.5656, "step": 1850 }, { "epoch": 0.49415515409139216, "grad_norm": 112396.421875, "learning_rate": 1.6705632306057387e-05, "loss": 3.5217, "step": 1860 }, { "epoch": 0.49681190223166843, "grad_norm": 430567.96875, "learning_rate": 1.668792065178888e-05, "loss": 3.784, "step": 1870 }, { "epoch": 0.49946865037194477, "grad_norm": 19857.708984375, "learning_rate": 1.667020899752037e-05, "loss": 3.2844, "step": 1880 }, { "epoch": 0.502125398512221, "grad_norm": 153824.828125, "learning_rate": 1.665249734325186e-05, "loss": 3.5734, "step": 1890 }, { "epoch": 0.5047821466524973, "grad_norm": 555864.875, "learning_rate": 1.6634785688983353e-05, "loss": 3.5042, "step": 1900 }, { "epoch": 0.5074388947927736, "grad_norm": 1425396.625, "learning_rate": 1.6617074034714845e-05, "loss": 3.8919, "step": 1910 }, { "epoch": 0.51009564293305, "grad_norm": 1588321.5, "learning_rate": 1.6599362380446334e-05, "loss": 3.7013, "step": 1920 }, { "epoch": 0.5127523910733263, "grad_norm": 843313.25, "learning_rate": 1.6581650726177826e-05, "loss": 4.0527, "step": 1930 }, { "epoch": 0.5154091392136025, "grad_norm": 121270.0859375, "learning_rate": 1.656393907190932e-05, "loss": 3.6732, "step": 1940 }, { "epoch": 0.5180658873538788, "grad_norm": 194603.609375, "learning_rate": 1.654622741764081e-05, "loss": 3.5416, "step": 1950 }, { "epoch": 0.5207226354941552, "grad_norm": 103689.84375, "learning_rate": 1.65285157633723e-05, "loss": 3.6058, "step": 1960 }, { "epoch": 0.5233793836344315, "grad_norm": 148743.953125, "learning_rate": 1.6510804109103792e-05, "loss": 3.4376, "step": 1970 }, { "epoch": 0.5260361317747078, "grad_norm": 23079.94140625, "learning_rate": 1.649309245483528e-05, "loss": 3.524, "step": 1980 }, { "epoch": 0.528692879914984, "grad_norm": 12263.953125, "learning_rate": 1.6475380800566773e-05, "loss": 3.1242, "step": 1990 }, { "epoch": 0.5313496280552603, "grad_norm": 270958.5625, "learning_rate": 1.6457669146298265e-05, "loss": 3.8531, "step": 2000 }, { "epoch": 0.5340063761955367, "grad_norm": 145561.640625, "learning_rate": 1.6439957492029758e-05, "loss": 3.104, "step": 2010 }, { "epoch": 0.536663124335813, "grad_norm": 104717.5625, "learning_rate": 1.6422245837761247e-05, "loss": 3.3674, "step": 2020 }, { "epoch": 0.5393198724760893, "grad_norm": 112249.3515625, "learning_rate": 1.640453418349274e-05, "loss": 3.2119, "step": 2030 }, { "epoch": 0.5419766206163655, "grad_norm": 131700.71875, "learning_rate": 1.638682252922423e-05, "loss": 3.6448, "step": 2040 }, { "epoch": 0.5446333687566419, "grad_norm": 119026.4140625, "learning_rate": 1.6369110874955723e-05, "loss": 3.0097, "step": 2050 }, { "epoch": 0.5472901168969182, "grad_norm": 103121.09375, "learning_rate": 1.6351399220687216e-05, "loss": 3.4205, "step": 2060 }, { "epoch": 0.5499468650371945, "grad_norm": 237787.03125, "learning_rate": 1.6333687566418704e-05, "loss": 3.349, "step": 2070 }, { "epoch": 0.5526036131774708, "grad_norm": 49652.95703125, "learning_rate": 1.6315975912150197e-05, "loss": 3.1665, "step": 2080 }, { "epoch": 0.555260361317747, "grad_norm": 262178.34375, "learning_rate": 1.629826425788169e-05, "loss": 3.4743, "step": 2090 }, { "epoch": 0.5579171094580234, "grad_norm": 130814.703125, "learning_rate": 1.6280552603613178e-05, "loss": 3.4995, "step": 2100 }, { "epoch": 0.5605738575982997, "grad_norm": 273671.09375, "learning_rate": 1.626284094934467e-05, "loss": 3.3983, "step": 2110 }, { "epoch": 0.563230605738576, "grad_norm": 385060.25, "learning_rate": 1.6245129295076162e-05, "loss": 3.5215, "step": 2120 }, { "epoch": 0.5658873538788523, "grad_norm": 165007.71875, "learning_rate": 1.622741764080765e-05, "loss": 3.1164, "step": 2130 }, { "epoch": 0.5685441020191286, "grad_norm": 70266.53125, "learning_rate": 1.6209705986539144e-05, "loss": 3.1971, "step": 2140 }, { "epoch": 0.5712008501594049, "grad_norm": 271687.3125, "learning_rate": 1.6191994332270636e-05, "loss": 3.4932, "step": 2150 }, { "epoch": 0.5738575982996812, "grad_norm": 35143.67578125, "learning_rate": 1.6174282678002128e-05, "loss": 3.4214, "step": 2160 }, { "epoch": 0.5765143464399575, "grad_norm": 1173879.625, "learning_rate": 1.6156571023733617e-05, "loss": 3.3194, "step": 2170 }, { "epoch": 0.5791710945802337, "grad_norm": 306067.03125, "learning_rate": 1.613885936946511e-05, "loss": 3.1417, "step": 2180 }, { "epoch": 0.5818278427205101, "grad_norm": 342329.0625, "learning_rate": 1.61211477151966e-05, "loss": 3.355, "step": 2190 }, { "epoch": 0.5844845908607864, "grad_norm": 50600.97265625, "learning_rate": 1.6103436060928094e-05, "loss": 3.3974, "step": 2200 }, { "epoch": 0.5871413390010627, "grad_norm": 360589.03125, "learning_rate": 1.6085724406659583e-05, "loss": 3.4514, "step": 2210 }, { "epoch": 0.589798087141339, "grad_norm": 94335.3828125, "learning_rate": 1.6068012752391075e-05, "loss": 3.2719, "step": 2220 }, { "epoch": 0.5924548352816154, "grad_norm": 53790.76953125, "learning_rate": 1.6050301098122564e-05, "loss": 3.3992, "step": 2230 }, { "epoch": 0.5951115834218916, "grad_norm": 107421.421875, "learning_rate": 1.6032589443854056e-05, "loss": 3.3512, "step": 2240 }, { "epoch": 0.5977683315621679, "grad_norm": 142487.859375, "learning_rate": 1.601487778958555e-05, "loss": 3.826, "step": 2250 }, { "epoch": 0.6004250797024442, "grad_norm": 1261580.75, "learning_rate": 1.599716613531704e-05, "loss": 3.7385, "step": 2260 }, { "epoch": 0.6030818278427205, "grad_norm": 648111.0, "learning_rate": 1.597945448104853e-05, "loss": 3.2839, "step": 2270 }, { "epoch": 0.6057385759829969, "grad_norm": 326968.125, "learning_rate": 1.5961742826780022e-05, "loss": 3.7895, "step": 2280 }, { "epoch": 0.6083953241232731, "grad_norm": 808961.5625, "learning_rate": 1.5944031172511514e-05, "loss": 3.7051, "step": 2290 }, { "epoch": 0.6110520722635494, "grad_norm": 2958079.0, "learning_rate": 1.5926319518243006e-05, "loss": 3.8099, "step": 2300 }, { "epoch": 0.6137088204038257, "grad_norm": 314874.03125, "learning_rate": 1.5908607863974495e-05, "loss": 3.6654, "step": 2310 }, { "epoch": 0.6163655685441021, "grad_norm": 8078548.0, "learning_rate": 1.5890896209705987e-05, "loss": 4.018, "step": 2320 }, { "epoch": 0.6190223166843783, "grad_norm": 135695.46875, "learning_rate": 1.587318455543748e-05, "loss": 3.6651, "step": 2330 }, { "epoch": 0.6216790648246546, "grad_norm": 18501240.0, "learning_rate": 1.5855472901168972e-05, "loss": 4.0069, "step": 2340 }, { "epoch": 0.6243358129649309, "grad_norm": 4980981.5, "learning_rate": 1.5837761246900464e-05, "loss": 3.9174, "step": 2350 }, { "epoch": 0.6269925611052072, "grad_norm": 1297274.125, "learning_rate": 1.5820049592631953e-05, "loss": 3.3223, "step": 2360 }, { "epoch": 0.6296493092454836, "grad_norm": 1378757.625, "learning_rate": 1.5802337938363442e-05, "loss": 3.7712, "step": 2370 }, { "epoch": 0.6323060573857598, "grad_norm": 2027859.875, "learning_rate": 1.5784626284094934e-05, "loss": 3.5468, "step": 2380 }, { "epoch": 0.6349628055260361, "grad_norm": 157107.65625, "learning_rate": 1.5766914629826427e-05, "loss": 3.5328, "step": 2390 }, { "epoch": 0.6376195536663124, "grad_norm": 1103094.75, "learning_rate": 1.574920297555792e-05, "loss": 3.6031, "step": 2400 }, { "epoch": 0.6402763018065888, "grad_norm": 725449.5, "learning_rate": 1.5731491321289408e-05, "loss": 3.9276, "step": 2410 }, { "epoch": 0.6429330499468651, "grad_norm": 214425.640625, "learning_rate": 1.57137796670209e-05, "loss": 3.517, "step": 2420 }, { "epoch": 0.6455897980871413, "grad_norm": 876419.625, "learning_rate": 1.5696068012752392e-05, "loss": 3.4675, "step": 2430 }, { "epoch": 0.6482465462274176, "grad_norm": 1504300.25, "learning_rate": 1.5678356358483884e-05, "loss": 3.4772, "step": 2440 }, { "epoch": 0.6509032943676939, "grad_norm": 144657.71875, "learning_rate": 1.5660644704215377e-05, "loss": 3.42, "step": 2450 }, { "epoch": 0.6535600425079703, "grad_norm": 371512.40625, "learning_rate": 1.5642933049946866e-05, "loss": 3.6802, "step": 2460 }, { "epoch": 0.6562167906482466, "grad_norm": 1322714.5, "learning_rate": 1.5625221395678358e-05, "loss": 3.805, "step": 2470 }, { "epoch": 0.6588735387885228, "grad_norm": 218897.765625, "learning_rate": 1.560750974140985e-05, "loss": 3.252, "step": 2480 }, { "epoch": 0.6615302869287991, "grad_norm": 1596077.0, "learning_rate": 1.5589798087141342e-05, "loss": 3.626, "step": 2490 }, { "epoch": 0.6641870350690755, "grad_norm": 2922875.75, "learning_rate": 1.557208643287283e-05, "loss": 3.5045, "step": 2500 }, { "epoch": 0.6668437832093518, "grad_norm": 96812.5859375, "learning_rate": 1.5554374778604324e-05, "loss": 3.7078, "step": 2510 }, { "epoch": 0.6695005313496281, "grad_norm": 1580814.125, "learning_rate": 1.5536663124335812e-05, "loss": 3.615, "step": 2520 }, { "epoch": 0.6721572794899043, "grad_norm": 235169.53125, "learning_rate": 1.5518951470067305e-05, "loss": 3.5076, "step": 2530 }, { "epoch": 0.6748140276301806, "grad_norm": 816632.0, "learning_rate": 1.5501239815798797e-05, "loss": 4.0074, "step": 2540 }, { "epoch": 0.677470775770457, "grad_norm": 3783126.5, "learning_rate": 1.548352816153029e-05, "loss": 3.7162, "step": 2550 }, { "epoch": 0.6801275239107333, "grad_norm": 1676969.875, "learning_rate": 1.5465816507261778e-05, "loss": 3.9383, "step": 2560 }, { "epoch": 0.6827842720510096, "grad_norm": 944205.0, "learning_rate": 1.544810485299327e-05, "loss": 3.6335, "step": 2570 }, { "epoch": 0.6854410201912858, "grad_norm": 532299.0, "learning_rate": 1.5430393198724763e-05, "loss": 3.776, "step": 2580 }, { "epoch": 0.6880977683315622, "grad_norm": 324683.46875, "learning_rate": 1.5412681544456255e-05, "loss": 4.0332, "step": 2590 }, { "epoch": 0.6907545164718385, "grad_norm": 371158.6875, "learning_rate": 1.5394969890187744e-05, "loss": 3.2831, "step": 2600 }, { "epoch": 0.6934112646121148, "grad_norm": 626177.8125, "learning_rate": 1.5377258235919236e-05, "loss": 3.7419, "step": 2610 }, { "epoch": 0.696068012752391, "grad_norm": 489480.3125, "learning_rate": 1.535954658165073e-05, "loss": 3.9135, "step": 2620 }, { "epoch": 0.6987247608926673, "grad_norm": 840057.5625, "learning_rate": 1.534183492738222e-05, "loss": 3.6214, "step": 2630 }, { "epoch": 0.7013815090329437, "grad_norm": 641658.4375, "learning_rate": 1.532412327311371e-05, "loss": 3.9029, "step": 2640 }, { "epoch": 0.70403825717322, "grad_norm": 1129191.0, "learning_rate": 1.5306411618845202e-05, "loss": 3.6271, "step": 2650 }, { "epoch": 0.7066950053134963, "grad_norm": 758676.8125, "learning_rate": 1.528869996457669e-05, "loss": 3.8411, "step": 2660 }, { "epoch": 0.7093517534537725, "grad_norm": 946755.25, "learning_rate": 1.5270988310308183e-05, "loss": 3.8184, "step": 2670 }, { "epoch": 0.7120085015940489, "grad_norm": 1282365.625, "learning_rate": 1.5253276656039675e-05, "loss": 3.8393, "step": 2680 }, { "epoch": 0.7146652497343252, "grad_norm": 1212575.875, "learning_rate": 1.5235565001771166e-05, "loss": 3.6106, "step": 2690 }, { "epoch": 0.7173219978746015, "grad_norm": 2197153.75, "learning_rate": 1.5217853347502658e-05, "loss": 3.5554, "step": 2700 }, { "epoch": 0.7199787460148778, "grad_norm": 621252.1875, "learning_rate": 1.520014169323415e-05, "loss": 3.3832, "step": 2710 }, { "epoch": 0.722635494155154, "grad_norm": 243552.59375, "learning_rate": 1.5182430038965641e-05, "loss": 3.4785, "step": 2720 }, { "epoch": 0.7252922422954304, "grad_norm": 3559921.0, "learning_rate": 1.5164718384697133e-05, "loss": 3.7972, "step": 2730 }, { "epoch": 0.7279489904357067, "grad_norm": 8816077.0, "learning_rate": 1.5147006730428624e-05, "loss": 3.6698, "step": 2740 }, { "epoch": 0.730605738575983, "grad_norm": 2959412.0, "learning_rate": 1.5129295076160116e-05, "loss": 3.9389, "step": 2750 }, { "epoch": 0.7332624867162593, "grad_norm": 13276429.0, "learning_rate": 1.5111583421891607e-05, "loss": 3.6811, "step": 2760 }, { "epoch": 0.7359192348565357, "grad_norm": 24583468.0, "learning_rate": 1.5093871767623095e-05, "loss": 3.9955, "step": 2770 }, { "epoch": 0.7385759829968119, "grad_norm": 11388400.0, "learning_rate": 1.5076160113354588e-05, "loss": 3.4851, "step": 2780 }, { "epoch": 0.7412327311370882, "grad_norm": 2901875.5, "learning_rate": 1.5058448459086078e-05, "loss": 4.0118, "step": 2790 }, { "epoch": 0.7438894792773645, "grad_norm": 7893670.0, "learning_rate": 1.504073680481757e-05, "loss": 4.3674, "step": 2800 }, { "epoch": 0.7465462274176408, "grad_norm": 13170602.0, "learning_rate": 1.5023025150549063e-05, "loss": 3.5882, "step": 2810 }, { "epoch": 0.7492029755579172, "grad_norm": 12720932.0, "learning_rate": 1.5005313496280553e-05, "loss": 4.7013, "step": 2820 }, { "epoch": 0.7518597236981934, "grad_norm": 7461363.0, "learning_rate": 1.4987601842012046e-05, "loss": 3.5194, "step": 2830 }, { "epoch": 0.7545164718384697, "grad_norm": 3747000.25, "learning_rate": 1.4969890187743536e-05, "loss": 3.9811, "step": 2840 }, { "epoch": 0.757173219978746, "grad_norm": 2111091.0, "learning_rate": 1.4952178533475028e-05, "loss": 3.3212, "step": 2850 }, { "epoch": 0.7598299681190224, "grad_norm": 4919647.5, "learning_rate": 1.4934466879206519e-05, "loss": 4.0383, "step": 2860 }, { "epoch": 0.7624867162592986, "grad_norm": 3595169.25, "learning_rate": 1.4916755224938011e-05, "loss": 3.7293, "step": 2870 }, { "epoch": 0.7651434643995749, "grad_norm": 1647251.75, "learning_rate": 1.4899043570669502e-05, "loss": 4.166, "step": 2880 }, { "epoch": 0.7678002125398512, "grad_norm": 4398145.0, "learning_rate": 1.4881331916400994e-05, "loss": 3.4454, "step": 2890 }, { "epoch": 0.7704569606801275, "grad_norm": 3135213.0, "learning_rate": 1.4863620262132485e-05, "loss": 4.0135, "step": 2900 }, { "epoch": 0.7731137088204039, "grad_norm": 7072787.0, "learning_rate": 1.4845908607863975e-05, "loss": 3.4145, "step": 2910 }, { "epoch": 0.7757704569606801, "grad_norm": 2635511.75, "learning_rate": 1.4828196953595466e-05, "loss": 3.8201, "step": 2920 }, { "epoch": 0.7784272051009564, "grad_norm": 4616754.5, "learning_rate": 1.4810485299326958e-05, "loss": 4.1764, "step": 2930 }, { "epoch": 0.7810839532412327, "grad_norm": 877153.0, "learning_rate": 1.4792773645058449e-05, "loss": 3.9471, "step": 2940 }, { "epoch": 0.7837407013815091, "grad_norm": 569671.3125, "learning_rate": 1.4775061990789941e-05, "loss": 3.7697, "step": 2950 }, { "epoch": 0.7863974495217854, "grad_norm": 810236.125, "learning_rate": 1.4757350336521432e-05, "loss": 4.4753, "step": 2960 }, { "epoch": 0.7890541976620616, "grad_norm": 877906.875, "learning_rate": 1.4739638682252924e-05, "loss": 3.6654, "step": 2970 }, { "epoch": 0.7917109458023379, "grad_norm": 481885.46875, "learning_rate": 1.4721927027984414e-05, "loss": 4.1253, "step": 2980 }, { "epoch": 0.7943676939426142, "grad_norm": 1338787.0, "learning_rate": 1.4704215373715907e-05, "loss": 4.0294, "step": 2990 }, { "epoch": 0.7970244420828906, "grad_norm": 1250065.875, "learning_rate": 1.4686503719447397e-05, "loss": 4.7282, "step": 3000 }, { "epoch": 0.7996811902231669, "grad_norm": 1604171.375, "learning_rate": 1.466879206517889e-05, "loss": 4.0439, "step": 3010 }, { "epoch": 0.8023379383634431, "grad_norm": 512070.90625, "learning_rate": 1.4651080410910382e-05, "loss": 3.5779, "step": 3020 }, { "epoch": 0.8049946865037194, "grad_norm": 312113.46875, "learning_rate": 1.4633368756641872e-05, "loss": 3.6514, "step": 3030 }, { "epoch": 0.8076514346439958, "grad_norm": 23779.923828125, "learning_rate": 1.4615657102373361e-05, "loss": 3.8136, "step": 3040 }, { "epoch": 0.8103081827842721, "grad_norm": 8204.794921875, "learning_rate": 1.4597945448104854e-05, "loss": 4.1336, "step": 3050 }, { "epoch": 0.8129649309245484, "grad_norm": 76479.1640625, "learning_rate": 1.4580233793836344e-05, "loss": 3.4411, "step": 3060 }, { "epoch": 0.8156216790648246, "grad_norm": 66624.71875, "learning_rate": 1.4562522139567836e-05, "loss": 3.8493, "step": 3070 }, { "epoch": 0.8182784272051009, "grad_norm": 22607.904296875, "learning_rate": 1.4544810485299327e-05, "loss": 3.2428, "step": 3080 }, { "epoch": 0.8209351753453773, "grad_norm": 119469.640625, "learning_rate": 1.452709883103082e-05, "loss": 3.4363, "step": 3090 }, { "epoch": 0.8235919234856536, "grad_norm": 108868.203125, "learning_rate": 1.4509387176762311e-05, "loss": 3.5903, "step": 3100 }, { "epoch": 0.8262486716259299, "grad_norm": 5543388.0, "learning_rate": 1.4491675522493802e-05, "loss": 3.7918, "step": 3110 }, { "epoch": 0.8289054197662061, "grad_norm": 2565445.75, "learning_rate": 1.4473963868225294e-05, "loss": 3.8573, "step": 3120 }, { "epoch": 0.8315621679064825, "grad_norm": 702086.4375, "learning_rate": 1.4456252213956785e-05, "loss": 3.3944, "step": 3130 }, { "epoch": 0.8342189160467588, "grad_norm": 115243.6484375, "learning_rate": 1.4438540559688277e-05, "loss": 3.2222, "step": 3140 }, { "epoch": 0.8368756641870351, "grad_norm": 476268.625, "learning_rate": 1.4420828905419768e-05, "loss": 3.6144, "step": 3150 }, { "epoch": 0.8395324123273114, "grad_norm": 65992.0, "learning_rate": 1.440311725115126e-05, "loss": 3.1891, "step": 3160 }, { "epoch": 0.8421891604675876, "grad_norm": 1161863.375, "learning_rate": 1.438540559688275e-05, "loss": 3.6714, "step": 3170 }, { "epoch": 0.844845908607864, "grad_norm": 185466.84375, "learning_rate": 1.4367693942614241e-05, "loss": 3.4372, "step": 3180 }, { "epoch": 0.8475026567481403, "grad_norm": 56940.96875, "learning_rate": 1.4349982288345732e-05, "loss": 3.7385, "step": 3190 }, { "epoch": 0.8501594048884166, "grad_norm": 99763.78125, "learning_rate": 1.4332270634077224e-05, "loss": 3.5612, "step": 3200 }, { "epoch": 0.8528161530286928, "grad_norm": 91525.1328125, "learning_rate": 1.4314558979808715e-05, "loss": 3.6116, "step": 3210 }, { "epoch": 0.8554729011689692, "grad_norm": 23506.251953125, "learning_rate": 1.4296847325540207e-05, "loss": 3.4268, "step": 3220 }, { "epoch": 0.8581296493092455, "grad_norm": 36794.52734375, "learning_rate": 1.4279135671271697e-05, "loss": 3.7912, "step": 3230 }, { "epoch": 0.8607863974495218, "grad_norm": 14971.548828125, "learning_rate": 1.426142401700319e-05, "loss": 3.7623, "step": 3240 }, { "epoch": 0.8634431455897981, "grad_norm": 29957.119140625, "learning_rate": 1.424371236273468e-05, "loss": 3.5765, "step": 3250 }, { "epoch": 0.8660998937300743, "grad_norm": 24691.1796875, "learning_rate": 1.4226000708466172e-05, "loss": 3.4663, "step": 3260 }, { "epoch": 0.8687566418703507, "grad_norm": 21935.2734375, "learning_rate": 1.4208289054197663e-05, "loss": 3.6494, "step": 3270 }, { "epoch": 0.871413390010627, "grad_norm": 26350.591796875, "learning_rate": 1.4190577399929155e-05, "loss": 3.5611, "step": 3280 }, { "epoch": 0.8740701381509033, "grad_norm": 30286.142578125, "learning_rate": 1.4172865745660646e-05, "loss": 3.7046, "step": 3290 }, { "epoch": 0.8767268862911796, "grad_norm": 6965.02734375, "learning_rate": 1.4155154091392138e-05, "loss": 3.9012, "step": 3300 }, { "epoch": 0.879383634431456, "grad_norm": 34496.1171875, "learning_rate": 1.4137442437123627e-05, "loss": 3.5102, "step": 3310 }, { "epoch": 0.8820403825717322, "grad_norm": 15867.46875, "learning_rate": 1.411973078285512e-05, "loss": 3.9485, "step": 3320 }, { "epoch": 0.8846971307120085, "grad_norm": 8408.2509765625, "learning_rate": 1.410201912858661e-05, "loss": 4.0955, "step": 3330 }, { "epoch": 0.8873538788522848, "grad_norm": 12868.8935546875, "learning_rate": 1.4084307474318102e-05, "loss": 3.8902, "step": 3340 }, { "epoch": 0.8900106269925611, "grad_norm": 39027.8125, "learning_rate": 1.4066595820049593e-05, "loss": 3.7809, "step": 3350 }, { "epoch": 0.8926673751328374, "grad_norm": 30144.494140625, "learning_rate": 1.4048884165781085e-05, "loss": 3.8368, "step": 3360 }, { "epoch": 0.8953241232731137, "grad_norm": 14916.984375, "learning_rate": 1.4031172511512576e-05, "loss": 3.8361, "step": 3370 }, { "epoch": 0.89798087141339, "grad_norm": 10657.8974609375, "learning_rate": 1.4013460857244068e-05, "loss": 3.9388, "step": 3380 }, { "epoch": 0.9006376195536663, "grad_norm": 20504.70703125, "learning_rate": 1.399574920297556e-05, "loss": 4.257, "step": 3390 }, { "epoch": 0.9032943676939427, "grad_norm": 32460.078125, "learning_rate": 1.397803754870705e-05, "loss": 4.0817, "step": 3400 }, { "epoch": 0.905951115834219, "grad_norm": 6730.14404296875, "learning_rate": 1.3960325894438543e-05, "loss": 4.2065, "step": 3410 }, { "epoch": 0.9086078639744952, "grad_norm": 17531.017578125, "learning_rate": 1.3942614240170034e-05, "loss": 3.5729, "step": 3420 }, { "epoch": 0.9112646121147715, "grad_norm": 17859.064453125, "learning_rate": 1.3924902585901526e-05, "loss": 4.3419, "step": 3430 }, { "epoch": 0.9139213602550478, "grad_norm": 99839.4296875, "learning_rate": 1.3907190931633016e-05, "loss": 3.9653, "step": 3440 }, { "epoch": 0.9165781083953242, "grad_norm": 13036.796875, "learning_rate": 1.3889479277364505e-05, "loss": 3.8463, "step": 3450 }, { "epoch": 0.9192348565356004, "grad_norm": 54209.05859375, "learning_rate": 1.3871767623095998e-05, "loss": 3.9493, "step": 3460 }, { "epoch": 0.9218916046758767, "grad_norm": 227248.34375, "learning_rate": 1.385405596882749e-05, "loss": 3.7791, "step": 3470 }, { "epoch": 0.924548352816153, "grad_norm": 856476.3125, "learning_rate": 1.383634431455898e-05, "loss": 4.2208, "step": 3480 }, { "epoch": 0.9272051009564294, "grad_norm": 373248.40625, "learning_rate": 1.3818632660290473e-05, "loss": 4.6665, "step": 3490 }, { "epoch": 0.9298618490967057, "grad_norm": 476773.1875, "learning_rate": 1.3800921006021963e-05, "loss": 4.3373, "step": 3500 }, { "epoch": 0.9325185972369819, "grad_norm": 3948952.0, "learning_rate": 1.3783209351753455e-05, "loss": 3.9872, "step": 3510 }, { "epoch": 0.9351753453772582, "grad_norm": 131342.296875, "learning_rate": 1.3765497697484946e-05, "loss": 4.0315, "step": 3520 }, { "epoch": 0.9378320935175345, "grad_norm": 1021533.8125, "learning_rate": 1.3747786043216438e-05, "loss": 3.898, "step": 3530 }, { "epoch": 0.9404888416578109, "grad_norm": 70664288.0, "learning_rate": 1.3730074388947929e-05, "loss": 4.0261, "step": 3540 }, { "epoch": 0.9431455897980872, "grad_norm": 1955257.25, "learning_rate": 1.3712362734679421e-05, "loss": 3.9837, "step": 3550 }, { "epoch": 0.9458023379383634, "grad_norm": 10510368.0, "learning_rate": 1.3694651080410912e-05, "loss": 4.2089, "step": 3560 }, { "epoch": 0.9484590860786397, "grad_norm": 4540049.0, "learning_rate": 1.3676939426142404e-05, "loss": 4.0757, "step": 3570 }, { "epoch": 0.9511158342189161, "grad_norm": 1934832.5, "learning_rate": 1.3659227771873893e-05, "loss": 3.8116, "step": 3580 }, { "epoch": 0.9537725823591924, "grad_norm": 721523.875, "learning_rate": 1.3641516117605385e-05, "loss": 3.8604, "step": 3590 }, { "epoch": 0.9564293304994687, "grad_norm": 3694456.5, "learning_rate": 1.3623804463336876e-05, "loss": 4.3438, "step": 3600 }, { "epoch": 0.9590860786397449, "grad_norm": 4130751.5, "learning_rate": 1.3606092809068368e-05, "loss": 3.7722, "step": 3610 }, { "epoch": 0.9617428267800212, "grad_norm": 3232915.5, "learning_rate": 1.3588381154799859e-05, "loss": 4.1108, "step": 3620 }, { "epoch": 0.9643995749202976, "grad_norm": 5608699.5, "learning_rate": 1.357066950053135e-05, "loss": 4.4695, "step": 3630 }, { "epoch": 0.9670563230605739, "grad_norm": 37526024.0, "learning_rate": 1.3552957846262841e-05, "loss": 3.8838, "step": 3640 }, { "epoch": 0.9697130712008502, "grad_norm": 11544401.0, "learning_rate": 1.3535246191994334e-05, "loss": 3.7949, "step": 3650 }, { "epoch": 0.9723698193411264, "grad_norm": 1559264.75, "learning_rate": 1.3517534537725824e-05, "loss": 3.8236, "step": 3660 }, { "epoch": 0.9750265674814028, "grad_norm": 10817994.0, "learning_rate": 1.3499822883457316e-05, "loss": 4.0035, "step": 3670 }, { "epoch": 0.9776833156216791, "grad_norm": 20268342.0, "learning_rate": 1.3482111229188807e-05, "loss": 3.6612, "step": 3680 }, { "epoch": 0.9803400637619554, "grad_norm": 51181968.0, "learning_rate": 1.34643995749203e-05, "loss": 3.7019, "step": 3690 }, { "epoch": 0.9829968119022316, "grad_norm": 74098400.0, "learning_rate": 1.3446687920651792e-05, "loss": 3.779, "step": 3700 }, { "epoch": 0.9856535600425079, "grad_norm": 48340468.0, "learning_rate": 1.3428976266383282e-05, "loss": 3.6759, "step": 3710 }, { "epoch": 0.9883103081827843, "grad_norm": 8802756.0, "learning_rate": 1.3411264612114771e-05, "loss": 3.6238, "step": 3720 }, { "epoch": 0.9909670563230606, "grad_norm": 3833086.75, "learning_rate": 1.3393552957846263e-05, "loss": 3.3759, "step": 3730 }, { "epoch": 0.9936238044633369, "grad_norm": 29499648.0, "learning_rate": 1.3375841303577754e-05, "loss": 3.6134, "step": 3740 }, { "epoch": 0.9962805526036131, "grad_norm": 6612167.0, "learning_rate": 1.3358129649309246e-05, "loss": 3.5491, "step": 3750 }, { "epoch": 0.9989373007438895, "grad_norm": 21236494.0, "learning_rate": 1.3340417995040737e-05, "loss": 3.7831, "step": 3760 }, { "epoch": 1.0, "eval_loss": 3.75178599357605, "eval_runtime": 744.4128, "eval_samples_per_second": 20.225, "eval_steps_per_second": 5.056, "step": 3764 }, { "epoch": 1.0015940488841657, "grad_norm": 40179844.0, "learning_rate": 1.3322706340772229e-05, "loss": 3.711, "step": 3770 }, { "epoch": 1.004250797024442, "grad_norm": 17010662.0, "learning_rate": 1.3304994686503721e-05, "loss": 3.4946, "step": 3780 }, { "epoch": 1.0069075451647185, "grad_norm": 19932106.0, "learning_rate": 1.3287283032235212e-05, "loss": 3.5648, "step": 3790 }, { "epoch": 1.0095642933049946, "grad_norm": 5492312.0, "learning_rate": 1.3269571377966704e-05, "loss": 4.0635, "step": 3800 }, { "epoch": 1.012221041445271, "grad_norm": 192937568.0, "learning_rate": 1.3251859723698195e-05, "loss": 3.4178, "step": 3810 }, { "epoch": 1.0148777895855472, "grad_norm": 1293443.125, "learning_rate": 1.3234148069429687e-05, "loss": 3.9658, "step": 3820 }, { "epoch": 1.0175345377258236, "grad_norm": 158162096.0, "learning_rate": 1.3216436415161178e-05, "loss": 3.6695, "step": 3830 }, { "epoch": 1.0201912858661, "grad_norm": 207503072.0, "learning_rate": 1.319872476089267e-05, "loss": 4.1104, "step": 3840 }, { "epoch": 1.0228480340063761, "grad_norm": 5859501.0, "learning_rate": 1.3181013106624159e-05, "loss": 3.7423, "step": 3850 }, { "epoch": 1.0255047821466525, "grad_norm": 65099376.0, "learning_rate": 1.3163301452355651e-05, "loss": 3.8122, "step": 3860 }, { "epoch": 1.0281615302869287, "grad_norm": 13768734.0, "learning_rate": 1.3145589798087142e-05, "loss": 3.8062, "step": 3870 }, { "epoch": 1.030818278427205, "grad_norm": 24830612.0, "learning_rate": 1.3127878143818634e-05, "loss": 3.5577, "step": 3880 }, { "epoch": 1.0334750265674815, "grad_norm": 109977040.0, "learning_rate": 1.3110166489550124e-05, "loss": 3.8904, "step": 3890 }, { "epoch": 1.0361317747077576, "grad_norm": 22621510.0, "learning_rate": 1.3092454835281617e-05, "loss": 3.7924, "step": 3900 }, { "epoch": 1.038788522848034, "grad_norm": 15618693.0, "learning_rate": 1.3074743181013107e-05, "loss": 3.9009, "step": 3910 }, { "epoch": 1.0414452709883104, "grad_norm": 102296992.0, "learning_rate": 1.30570315267446e-05, "loss": 4.0488, "step": 3920 }, { "epoch": 1.0441020191285866, "grad_norm": 180104320.0, "learning_rate": 1.303931987247609e-05, "loss": 4.0832, "step": 3930 }, { "epoch": 1.046758767268863, "grad_norm": 8426886.0, "learning_rate": 1.3021608218207582e-05, "loss": 3.9811, "step": 3940 }, { "epoch": 1.0494155154091391, "grad_norm": 23817282.0, "learning_rate": 1.3003896563939073e-05, "loss": 3.5573, "step": 3950 }, { "epoch": 1.0520722635494155, "grad_norm": 34805012.0, "learning_rate": 1.2986184909670565e-05, "loss": 3.6933, "step": 3960 }, { "epoch": 1.054729011689692, "grad_norm": 27546222.0, "learning_rate": 1.2968473255402056e-05, "loss": 3.826, "step": 3970 }, { "epoch": 1.057385759829968, "grad_norm": 73101112.0, "learning_rate": 1.2950761601133548e-05, "loss": 4.3474, "step": 3980 }, { "epoch": 1.0600425079702445, "grad_norm": 60012056.0, "learning_rate": 1.2933049946865037e-05, "loss": 3.4645, "step": 3990 }, { "epoch": 1.0626992561105206, "grad_norm": 10204493.0, "learning_rate": 1.2915338292596529e-05, "loss": 3.7942, "step": 4000 }, { "epoch": 1.065356004250797, "grad_norm": 67629928.0, "learning_rate": 1.289762663832802e-05, "loss": 3.6377, "step": 4010 }, { "epoch": 1.0680127523910734, "grad_norm": 31746526.0, "learning_rate": 1.2879914984059512e-05, "loss": 3.7846, "step": 4020 }, { "epoch": 1.0706695005313496, "grad_norm": 52992448.0, "learning_rate": 1.2862203329791003e-05, "loss": 3.2981, "step": 4030 }, { "epoch": 1.073326248671626, "grad_norm": 36022592.0, "learning_rate": 1.2844491675522495e-05, "loss": 3.6733, "step": 4040 }, { "epoch": 1.0759829968119021, "grad_norm": 11422725.0, "learning_rate": 1.2826780021253985e-05, "loss": 3.5682, "step": 4050 }, { "epoch": 1.0786397449521785, "grad_norm": 77457192.0, "learning_rate": 1.2809068366985478e-05, "loss": 3.8538, "step": 4060 }, { "epoch": 1.081296493092455, "grad_norm": 109772792.0, "learning_rate": 1.279135671271697e-05, "loss": 4.0151, "step": 4070 }, { "epoch": 1.083953241232731, "grad_norm": 126942304.0, "learning_rate": 1.277364505844846e-05, "loss": 4.418, "step": 4080 }, { "epoch": 1.0866099893730075, "grad_norm": 215005632.0, "learning_rate": 1.2755933404179953e-05, "loss": 3.6302, "step": 4090 }, { "epoch": 1.0892667375132838, "grad_norm": 18895672.0, "learning_rate": 1.2738221749911443e-05, "loss": 4.2548, "step": 4100 }, { "epoch": 1.09192348565356, "grad_norm": 20576284.0, "learning_rate": 1.2720510095642936e-05, "loss": 3.9913, "step": 4110 }, { "epoch": 1.0945802337938364, "grad_norm": 90564424.0, "learning_rate": 1.2702798441374424e-05, "loss": 3.8335, "step": 4120 }, { "epoch": 1.0972369819341126, "grad_norm": 136458144.0, "learning_rate": 1.2685086787105915e-05, "loss": 4.0485, "step": 4130 }, { "epoch": 1.099893730074389, "grad_norm": 175102016.0, "learning_rate": 1.2667375132837407e-05, "loss": 4.1181, "step": 4140 }, { "epoch": 1.1025504782146653, "grad_norm": 15060149.0, "learning_rate": 1.26496634785689e-05, "loss": 3.753, "step": 4150 }, { "epoch": 1.1052072263549415, "grad_norm": 92020808.0, "learning_rate": 1.263195182430039e-05, "loss": 3.9935, "step": 4160 }, { "epoch": 1.107863974495218, "grad_norm": 133574952.0, "learning_rate": 1.2614240170031882e-05, "loss": 4.0376, "step": 4170 }, { "epoch": 1.110520722635494, "grad_norm": 69448336.0, "learning_rate": 1.2596528515763373e-05, "loss": 3.7264, "step": 4180 }, { "epoch": 1.1131774707757705, "grad_norm": 24695358.0, "learning_rate": 1.2578816861494865e-05, "loss": 3.6435, "step": 4190 }, { "epoch": 1.1158342189160468, "grad_norm": 26981000.0, "learning_rate": 1.2561105207226356e-05, "loss": 4.1867, "step": 4200 }, { "epoch": 1.118490967056323, "grad_norm": 26429450.0, "learning_rate": 1.2543393552957848e-05, "loss": 4.2308, "step": 4210 }, { "epoch": 1.1211477151965994, "grad_norm": 75864056.0, "learning_rate": 1.2525681898689339e-05, "loss": 4.1067, "step": 4220 }, { "epoch": 1.1238044633368758, "grad_norm": 53176204.0, "learning_rate": 1.2507970244420831e-05, "loss": 4.3122, "step": 4230 }, { "epoch": 1.126461211477152, "grad_norm": 27715404.0, "learning_rate": 1.2490258590152322e-05, "loss": 4.0918, "step": 4240 }, { "epoch": 1.1291179596174283, "grad_norm": 6029370.0, "learning_rate": 1.2472546935883814e-05, "loss": 4.1725, "step": 4250 }, { "epoch": 1.1317747077577045, "grad_norm": 26051718.0, "learning_rate": 1.2454835281615303e-05, "loss": 3.9757, "step": 4260 }, { "epoch": 1.134431455897981, "grad_norm": 77973728.0, "learning_rate": 1.2437123627346795e-05, "loss": 3.989, "step": 4270 }, { "epoch": 1.1370882040382573, "grad_norm": 11366385.0, "learning_rate": 1.2419411973078286e-05, "loss": 4.3978, "step": 4280 }, { "epoch": 1.1397449521785334, "grad_norm": 19926490.0, "learning_rate": 1.2401700318809778e-05, "loss": 3.7446, "step": 4290 }, { "epoch": 1.1424017003188098, "grad_norm": 66211068.0, "learning_rate": 1.2383988664541268e-05, "loss": 3.9591, "step": 4300 }, { "epoch": 1.145058448459086, "grad_norm": 7617592.5, "learning_rate": 1.236627701027276e-05, "loss": 4.2812, "step": 4310 }, { "epoch": 1.1477151965993624, "grad_norm": 47218612.0, "learning_rate": 1.2348565356004251e-05, "loss": 4.137, "step": 4320 }, { "epoch": 1.1503719447396388, "grad_norm": 115950944.0, "learning_rate": 1.2330853701735743e-05, "loss": 4.1344, "step": 4330 }, { "epoch": 1.153028692879915, "grad_norm": 27328380.0, "learning_rate": 1.2313142047467234e-05, "loss": 4.0865, "step": 4340 }, { "epoch": 1.1556854410201913, "grad_norm": 8267316.5, "learning_rate": 1.2295430393198726e-05, "loss": 4.3048, "step": 4350 }, { "epoch": 1.1583421891604675, "grad_norm": 18654644.0, "learning_rate": 1.2277718738930217e-05, "loss": 4.4512, "step": 4360 }, { "epoch": 1.1609989373007439, "grad_norm": 123494120.0, "learning_rate": 1.2260007084661709e-05, "loss": 4.1863, "step": 4370 }, { "epoch": 1.1636556854410203, "grad_norm": 87930224.0, "learning_rate": 1.2242295430393201e-05, "loss": 4.1395, "step": 4380 }, { "epoch": 1.1663124335812964, "grad_norm": 60926568.0, "learning_rate": 1.222458377612469e-05, "loss": 3.9975, "step": 4390 }, { "epoch": 1.1689691817215728, "grad_norm": 15561844.0, "learning_rate": 1.2206872121856181e-05, "loss": 4.1746, "step": 4400 }, { "epoch": 1.171625929861849, "grad_norm": 14337786.0, "learning_rate": 1.2189160467587673e-05, "loss": 4.0762, "step": 4410 }, { "epoch": 1.1742826780021254, "grad_norm": 27260074.0, "learning_rate": 1.2171448813319164e-05, "loss": 4.3436, "step": 4420 }, { "epoch": 1.1769394261424018, "grad_norm": 14445331.0, "learning_rate": 1.2153737159050656e-05, "loss": 3.9788, "step": 4430 }, { "epoch": 1.179596174282678, "grad_norm": 21041896.0, "learning_rate": 1.2136025504782147e-05, "loss": 4.3681, "step": 4440 }, { "epoch": 1.1822529224229543, "grad_norm": 15333385.0, "learning_rate": 1.2118313850513639e-05, "loss": 4.1638, "step": 4450 }, { "epoch": 1.1849096705632305, "grad_norm": 18882606.0, "learning_rate": 1.2100602196245131e-05, "loss": 3.9175, "step": 4460 }, { "epoch": 1.1875664187035069, "grad_norm": 6002330.5, "learning_rate": 1.2082890541976622e-05, "loss": 4.0274, "step": 4470 }, { "epoch": 1.1902231668437833, "grad_norm": 12174502.0, "learning_rate": 1.2065178887708114e-05, "loss": 4.0163, "step": 4480 }, { "epoch": 1.1928799149840594, "grad_norm": 3046521.75, "learning_rate": 1.2047467233439604e-05, "loss": 4.2218, "step": 4490 }, { "epoch": 1.1955366631243358, "grad_norm": 7046191.0, "learning_rate": 1.2029755579171097e-05, "loss": 3.8047, "step": 4500 }, { "epoch": 1.1981934112646122, "grad_norm": 2158310.5, "learning_rate": 1.2012043924902587e-05, "loss": 4.0102, "step": 4510 }, { "epoch": 1.2008501594048884, "grad_norm": 1953139.875, "learning_rate": 1.199433227063408e-05, "loss": 3.9815, "step": 4520 }, { "epoch": 1.2035069075451648, "grad_norm": 10403948.0, "learning_rate": 1.1976620616365568e-05, "loss": 4.2106, "step": 4530 }, { "epoch": 1.206163655685441, "grad_norm": 1701127.5, "learning_rate": 1.195890896209706e-05, "loss": 4.1719, "step": 4540 }, { "epoch": 1.2088204038257173, "grad_norm": 1922839.625, "learning_rate": 1.1941197307828551e-05, "loss": 4.2, "step": 4550 }, { "epoch": 1.2114771519659937, "grad_norm": 1249251.375, "learning_rate": 1.1923485653560044e-05, "loss": 4.3854, "step": 4560 }, { "epoch": 1.2141339001062699, "grad_norm": 3677515.25, "learning_rate": 1.1905773999291534e-05, "loss": 4.1928, "step": 4570 }, { "epoch": 1.2167906482465463, "grad_norm": 1778515.5, "learning_rate": 1.1888062345023026e-05, "loss": 4.282, "step": 4580 }, { "epoch": 1.2194473963868226, "grad_norm": 2142989.75, "learning_rate": 1.1870350690754517e-05, "loss": 4.0862, "step": 4590 }, { "epoch": 1.2221041445270988, "grad_norm": 3376149.5, "learning_rate": 1.185263903648601e-05, "loss": 4.9249, "step": 4600 }, { "epoch": 1.2247608926673752, "grad_norm": 918137.0625, "learning_rate": 1.18349273822175e-05, "loss": 4.4397, "step": 4610 }, { "epoch": 1.2274176408076514, "grad_norm": 5548887.5, "learning_rate": 1.1817215727948992e-05, "loss": 4.186, "step": 4620 }, { "epoch": 1.2300743889479278, "grad_norm": 1206121.0, "learning_rate": 1.1799504073680483e-05, "loss": 4.4369, "step": 4630 }, { "epoch": 1.2327311370882041, "grad_norm": 1302905.0, "learning_rate": 1.1781792419411975e-05, "loss": 4.2492, "step": 4640 }, { "epoch": 1.2353878852284803, "grad_norm": 1243181.25, "learning_rate": 1.1764080765143466e-05, "loss": 4.3557, "step": 4650 }, { "epoch": 1.2380446333687567, "grad_norm": 1636811.25, "learning_rate": 1.1746369110874956e-05, "loss": 4.4305, "step": 4660 }, { "epoch": 1.2407013815090329, "grad_norm": 3252745.75, "learning_rate": 1.1728657456606447e-05, "loss": 4.4447, "step": 4670 }, { "epoch": 1.2433581296493093, "grad_norm": 3218180.0, "learning_rate": 1.1710945802337939e-05, "loss": 4.1695, "step": 4680 }, { "epoch": 1.2460148777895856, "grad_norm": 7251921.5, "learning_rate": 1.169323414806943e-05, "loss": 4.0679, "step": 4690 }, { "epoch": 1.2486716259298618, "grad_norm": 3886631.0, "learning_rate": 1.1675522493800922e-05, "loss": 3.9159, "step": 4700 }, { "epoch": 1.2513283740701382, "grad_norm": 2420017.75, "learning_rate": 1.1657810839532412e-05, "loss": 4.6458, "step": 4710 }, { "epoch": 1.2539851222104144, "grad_norm": 1138159.875, "learning_rate": 1.1640099185263905e-05, "loss": 4.078, "step": 4720 }, { "epoch": 1.2566418703506907, "grad_norm": 930125.875, "learning_rate": 1.1622387530995395e-05, "loss": 4.0812, "step": 4730 }, { "epoch": 1.2592986184909671, "grad_norm": 3835148.25, "learning_rate": 1.1604675876726887e-05, "loss": 4.1012, "step": 4740 }, { "epoch": 1.2619553666312433, "grad_norm": 6243373.5, "learning_rate": 1.158696422245838e-05, "loss": 3.8252, "step": 4750 }, { "epoch": 1.2646121147715197, "grad_norm": 3021652.25, "learning_rate": 1.156925256818987e-05, "loss": 3.9515, "step": 4760 }, { "epoch": 1.2672688629117959, "grad_norm": 4503118.5, "learning_rate": 1.1551540913921363e-05, "loss": 4.0478, "step": 4770 }, { "epoch": 1.2699256110520722, "grad_norm": 5867597.5, "learning_rate": 1.1533829259652853e-05, "loss": 4.0726, "step": 4780 }, { "epoch": 1.2725823591923486, "grad_norm": 23690828.0, "learning_rate": 1.1516117605384345e-05, "loss": 3.5037, "step": 4790 }, { "epoch": 1.2752391073326248, "grad_norm": 5260964.5, "learning_rate": 1.1498405951115834e-05, "loss": 4.0774, "step": 4800 }, { "epoch": 1.2778958554729012, "grad_norm": 4894551.5, "learning_rate": 1.1480694296847325e-05, "loss": 3.7113, "step": 4810 }, { "epoch": 1.2805526036131774, "grad_norm": 4784902.0, "learning_rate": 1.1462982642578817e-05, "loss": 3.886, "step": 4820 }, { "epoch": 1.2832093517534537, "grad_norm": 22511842.0, "learning_rate": 1.144527098831031e-05, "loss": 3.7413, "step": 4830 }, { "epoch": 1.2858660998937301, "grad_norm": 13445524.0, "learning_rate": 1.14275593340418e-05, "loss": 4.3171, "step": 4840 }, { "epoch": 1.2885228480340063, "grad_norm": 4879641.0, "learning_rate": 1.1409847679773292e-05, "loss": 4.0366, "step": 4850 }, { "epoch": 1.2911795961742827, "grad_norm": 5458451.0, "learning_rate": 1.1392136025504783e-05, "loss": 4.0356, "step": 4860 }, { "epoch": 1.2938363443145589, "grad_norm": 1152951.125, "learning_rate": 1.1374424371236275e-05, "loss": 3.9322, "step": 4870 }, { "epoch": 1.2964930924548352, "grad_norm": 1573109.875, "learning_rate": 1.1356712716967766e-05, "loss": 3.5684, "step": 4880 }, { "epoch": 1.2991498405951116, "grad_norm": 3557934.25, "learning_rate": 1.1339001062699258e-05, "loss": 3.8874, "step": 4890 }, { "epoch": 1.301806588735388, "grad_norm": 2637183.5, "learning_rate": 1.1321289408430748e-05, "loss": 4.0737, "step": 4900 }, { "epoch": 1.3044633368756642, "grad_norm": 1852644.25, "learning_rate": 1.130357775416224e-05, "loss": 4.4462, "step": 4910 }, { "epoch": 1.3071200850159406, "grad_norm": 7577384.5, "learning_rate": 1.1285866099893731e-05, "loss": 3.8546, "step": 4920 }, { "epoch": 1.3097768331562167, "grad_norm": 4401453.5, "learning_rate": 1.1268154445625224e-05, "loss": 4.0443, "step": 4930 }, { "epoch": 1.3124335812964931, "grad_norm": 3643839.75, "learning_rate": 1.1250442791356712e-05, "loss": 3.678, "step": 4940 }, { "epoch": 1.3150903294367695, "grad_norm": 27145024.0, "learning_rate": 1.1232731137088205e-05, "loss": 3.8589, "step": 4950 }, { "epoch": 1.3177470775770457, "grad_norm": 1982266.875, "learning_rate": 1.1215019482819695e-05, "loss": 3.587, "step": 4960 }, { "epoch": 1.320403825717322, "grad_norm": 2339293.25, "learning_rate": 1.1197307828551188e-05, "loss": 3.6116, "step": 4970 }, { "epoch": 1.3230605738575982, "grad_norm": 21441204.0, "learning_rate": 1.1179596174282678e-05, "loss": 3.4365, "step": 4980 }, { "epoch": 1.3257173219978746, "grad_norm": 3329228.0, "learning_rate": 1.116188452001417e-05, "loss": 4.184, "step": 4990 }, { "epoch": 1.328374070138151, "grad_norm": 2602702.75, "learning_rate": 1.1144172865745661e-05, "loss": 3.6095, "step": 5000 }, { "epoch": 1.3310308182784272, "grad_norm": 62917268.0, "learning_rate": 1.1126461211477153e-05, "loss": 3.4086, "step": 5010 }, { "epoch": 1.3336875664187036, "grad_norm": 9320738.0, "learning_rate": 1.1108749557208644e-05, "loss": 3.8485, "step": 5020 }, { "epoch": 1.3363443145589797, "grad_norm": 11171778.0, "learning_rate": 1.1091037902940136e-05, "loss": 3.5241, "step": 5030 }, { "epoch": 1.3390010626992561, "grad_norm": 13504690.0, "learning_rate": 1.1073326248671628e-05, "loss": 3.7951, "step": 5040 }, { "epoch": 1.3416578108395325, "grad_norm": 1940023.625, "learning_rate": 1.1055614594403119e-05, "loss": 3.938, "step": 5050 }, { "epoch": 1.3443145589798087, "grad_norm": 9250230.0, "learning_rate": 1.1037902940134611e-05, "loss": 3.6501, "step": 5060 }, { "epoch": 1.346971307120085, "grad_norm": 8658494.0, "learning_rate": 1.10201912858661e-05, "loss": 3.4101, "step": 5070 }, { "epoch": 1.3496280552603612, "grad_norm": 24788584.0, "learning_rate": 1.100247963159759e-05, "loss": 3.2665, "step": 5080 }, { "epoch": 1.3522848034006376, "grad_norm": 17288262.0, "learning_rate": 1.0984767977329083e-05, "loss": 3.9485, "step": 5090 }, { "epoch": 1.354941551540914, "grad_norm": 1679803.0, "learning_rate": 1.0967056323060574e-05, "loss": 3.7726, "step": 5100 }, { "epoch": 1.3575982996811902, "grad_norm": 14593549.0, "learning_rate": 1.0949344668792066e-05, "loss": 4.0024, "step": 5110 }, { "epoch": 1.3602550478214666, "grad_norm": 4186409.75, "learning_rate": 1.0931633014523556e-05, "loss": 3.6818, "step": 5120 }, { "epoch": 1.3629117959617427, "grad_norm": 747755.5625, "learning_rate": 1.0913921360255049e-05, "loss": 3.4717, "step": 5130 }, { "epoch": 1.365568544102019, "grad_norm": 445103.3125, "learning_rate": 1.0896209705986541e-05, "loss": 3.4684, "step": 5140 }, { "epoch": 1.3682252922422955, "grad_norm": 1250102.625, "learning_rate": 1.0878498051718031e-05, "loss": 3.2248, "step": 5150 }, { "epoch": 1.3708820403825717, "grad_norm": 532045.3125, "learning_rate": 1.0860786397449524e-05, "loss": 3.3662, "step": 5160 }, { "epoch": 1.373538788522848, "grad_norm": 454849.5625, "learning_rate": 1.0843074743181014e-05, "loss": 3.5507, "step": 5170 }, { "epoch": 1.3761955366631242, "grad_norm": 3551179.5, "learning_rate": 1.0825363088912507e-05, "loss": 3.2755, "step": 5180 }, { "epoch": 1.3788522848034006, "grad_norm": 6700418.0, "learning_rate": 1.0807651434643997e-05, "loss": 3.2751, "step": 5190 }, { "epoch": 1.381509032943677, "grad_norm": 37462192.0, "learning_rate": 1.078993978037549e-05, "loss": 3.5327, "step": 5200 }, { "epoch": 1.3841657810839532, "grad_norm": 9333666.0, "learning_rate": 1.0772228126106978e-05, "loss": 3.1278, "step": 5210 }, { "epoch": 1.3868225292242295, "grad_norm": 16026876.0, "learning_rate": 1.075451647183847e-05, "loss": 3.5275, "step": 5220 }, { "epoch": 1.3894792773645057, "grad_norm": 24360552.0, "learning_rate": 1.0736804817569961e-05, "loss": 3.6815, "step": 5230 }, { "epoch": 1.392136025504782, "grad_norm": 12289483.0, "learning_rate": 1.0719093163301453e-05, "loss": 3.1039, "step": 5240 }, { "epoch": 1.3947927736450585, "grad_norm": 1954500.625, "learning_rate": 1.0701381509032944e-05, "loss": 3.3327, "step": 5250 }, { "epoch": 1.3974495217853349, "grad_norm": 5957172.5, "learning_rate": 1.0683669854764436e-05, "loss": 3.6985, "step": 5260 }, { "epoch": 1.400106269925611, "grad_norm": 136582976.0, "learning_rate": 1.0665958200495927e-05, "loss": 3.4845, "step": 5270 }, { "epoch": 1.4027630180658874, "grad_norm": 21799228.0, "learning_rate": 1.0648246546227419e-05, "loss": 3.4648, "step": 5280 }, { "epoch": 1.4054197662061636, "grad_norm": 1183856.625, "learning_rate": 1.063053489195891e-05, "loss": 3.2929, "step": 5290 }, { "epoch": 1.40807651434644, "grad_norm": 28349394.0, "learning_rate": 1.0612823237690402e-05, "loss": 3.611, "step": 5300 }, { "epoch": 1.4107332624867164, "grad_norm": 1230487.75, "learning_rate": 1.0595111583421892e-05, "loss": 3.0602, "step": 5310 }, { "epoch": 1.4133900106269925, "grad_norm": 29549574.0, "learning_rate": 1.0577399929153385e-05, "loss": 3.6129, "step": 5320 }, { "epoch": 1.416046758767269, "grad_norm": 65607896.0, "learning_rate": 1.0559688274884875e-05, "loss": 3.305, "step": 5330 }, { "epoch": 1.418703506907545, "grad_norm": 21593944.0, "learning_rate": 1.0541976620616366e-05, "loss": 4.182, "step": 5340 }, { "epoch": 1.4213602550478215, "grad_norm": 9913192.0, "learning_rate": 1.0524264966347856e-05, "loss": 3.333, "step": 5350 }, { "epoch": 1.4240170031880979, "grad_norm": 5600408.5, "learning_rate": 1.0506553312079349e-05, "loss": 3.2001, "step": 5360 }, { "epoch": 1.426673751328374, "grad_norm": 4921900.0, "learning_rate": 1.048884165781084e-05, "loss": 3.8381, "step": 5370 }, { "epoch": 1.4293304994686504, "grad_norm": 22669404.0, "learning_rate": 1.0471130003542332e-05, "loss": 3.438, "step": 5380 }, { "epoch": 1.4319872476089266, "grad_norm": 11211402.0, "learning_rate": 1.0453418349273822e-05, "loss": 3.3608, "step": 5390 }, { "epoch": 1.434643995749203, "grad_norm": 10033162.0, "learning_rate": 1.0435706695005314e-05, "loss": 3.2148, "step": 5400 }, { "epoch": 1.4373007438894794, "grad_norm": 34627448.0, "learning_rate": 1.0417995040736805e-05, "loss": 3.3408, "step": 5410 }, { "epoch": 1.4399574920297555, "grad_norm": 19163360.0, "learning_rate": 1.0400283386468297e-05, "loss": 3.0767, "step": 5420 }, { "epoch": 1.442614240170032, "grad_norm": 11876396.0, "learning_rate": 1.038257173219979e-05, "loss": 3.8624, "step": 5430 }, { "epoch": 1.445270988310308, "grad_norm": 6485251.5, "learning_rate": 1.036486007793128e-05, "loss": 3.4212, "step": 5440 }, { "epoch": 1.4479277364505845, "grad_norm": 2855033.5, "learning_rate": 1.0347148423662772e-05, "loss": 3.5543, "step": 5450 }, { "epoch": 1.4505844845908609, "grad_norm": 39419356.0, "learning_rate": 1.0329436769394263e-05, "loss": 3.6357, "step": 5460 }, { "epoch": 1.453241232731137, "grad_norm": 8782708.0, "learning_rate": 1.0311725115125755e-05, "loss": 3.7995, "step": 5470 }, { "epoch": 1.4558979808714134, "grad_norm": 32046924.0, "learning_rate": 1.0294013460857244e-05, "loss": 3.2472, "step": 5480 }, { "epoch": 1.4585547290116896, "grad_norm": 30402538.0, "learning_rate": 1.0276301806588735e-05, "loss": 3.1715, "step": 5490 }, { "epoch": 1.461211477151966, "grad_norm": 19326186.0, "learning_rate": 1.0258590152320227e-05, "loss": 3.9161, "step": 5500 }, { "epoch": 1.4638682252922424, "grad_norm": 9990077.0, "learning_rate": 1.024087849805172e-05, "loss": 3.849, "step": 5510 }, { "epoch": 1.4665249734325185, "grad_norm": 29835254.0, "learning_rate": 1.022316684378321e-05, "loss": 3.331, "step": 5520 }, { "epoch": 1.469181721572795, "grad_norm": 84350656.0, "learning_rate": 1.0205455189514702e-05, "loss": 3.3592, "step": 5530 }, { "epoch": 1.471838469713071, "grad_norm": 5173333.5, "learning_rate": 1.0187743535246193e-05, "loss": 3.3015, "step": 5540 }, { "epoch": 1.4744952178533475, "grad_norm": 3443425.5, "learning_rate": 1.0170031880977685e-05, "loss": 3.5236, "step": 5550 }, { "epoch": 1.4771519659936239, "grad_norm": 2188022.75, "learning_rate": 1.0152320226709175e-05, "loss": 3.5614, "step": 5560 }, { "epoch": 1.4798087141339, "grad_norm": 16931794.0, "learning_rate": 1.0134608572440668e-05, "loss": 3.6685, "step": 5570 }, { "epoch": 1.4824654622741764, "grad_norm": 10456564.0, "learning_rate": 1.0116896918172158e-05, "loss": 3.4864, "step": 5580 }, { "epoch": 1.4851222104144526, "grad_norm": 27239420.0, "learning_rate": 1.009918526390365e-05, "loss": 3.5637, "step": 5590 }, { "epoch": 1.487778958554729, "grad_norm": 16616771.0, "learning_rate": 1.0081473609635141e-05, "loss": 3.6085, "step": 5600 }, { "epoch": 1.4904357066950054, "grad_norm": 10221569.0, "learning_rate": 1.0063761955366632e-05, "loss": 3.5812, "step": 5610 }, { "epoch": 1.4930924548352817, "grad_norm": 1452260.75, "learning_rate": 1.0046050301098122e-05, "loss": 3.9326, "step": 5620 }, { "epoch": 1.495749202975558, "grad_norm": 3546143.0, "learning_rate": 1.0028338646829615e-05, "loss": 3.2541, "step": 5630 }, { "epoch": 1.4984059511158343, "grad_norm": 12791246.0, "learning_rate": 1.0010626992561105e-05, "loss": 3.4152, "step": 5640 }, { "epoch": 1.5010626992561105, "grad_norm": 12529229.0, "learning_rate": 9.992915338292597e-06, "loss": 3.0508, "step": 5650 }, { "epoch": 1.5037194473963869, "grad_norm": 9755405.0, "learning_rate": 9.975203684024088e-06, "loss": 3.5064, "step": 5660 }, { "epoch": 1.5063761955366632, "grad_norm": 6901898.0, "learning_rate": 9.95749202975558e-06, "loss": 3.6654, "step": 5670 }, { "epoch": 1.5090329436769394, "grad_norm": 9542270.0, "learning_rate": 9.93978037548707e-06, "loss": 3.3481, "step": 5680 }, { "epoch": 1.5116896918172156, "grad_norm": 14570059.0, "learning_rate": 9.922068721218563e-06, "loss": 3.6342, "step": 5690 }, { "epoch": 1.514346439957492, "grad_norm": 130252984.0, "learning_rate": 9.904357066950054e-06, "loss": 3.3275, "step": 5700 }, { "epoch": 1.5170031880977684, "grad_norm": 12491921.0, "learning_rate": 9.886645412681544e-06, "loss": 3.1862, "step": 5710 }, { "epoch": 1.5196599362380447, "grad_norm": 171955248.0, "learning_rate": 9.868933758413036e-06, "loss": 3.6, "step": 5720 }, { "epoch": 1.522316684378321, "grad_norm": 67972536.0, "learning_rate": 9.851222104144527e-06, "loss": 3.5839, "step": 5730 }, { "epoch": 1.524973432518597, "grad_norm": 19312536.0, "learning_rate": 9.83351044987602e-06, "loss": 3.3906, "step": 5740 }, { "epoch": 1.5276301806588735, "grad_norm": 39636108.0, "learning_rate": 9.81579879560751e-06, "loss": 3.5388, "step": 5750 }, { "epoch": 1.5302869287991498, "grad_norm": 54133548.0, "learning_rate": 9.798087141339002e-06, "loss": 3.2938, "step": 5760 }, { "epoch": 1.5329436769394262, "grad_norm": 28021788.0, "learning_rate": 9.780375487070494e-06, "loss": 3.565, "step": 5770 }, { "epoch": 1.5356004250797024, "grad_norm": 12500334.0, "learning_rate": 9.762663832801983e-06, "loss": 3.4099, "step": 5780 }, { "epoch": 1.5382571732199788, "grad_norm": 20677724.0, "learning_rate": 9.744952178533476e-06, "loss": 3.8265, "step": 5790 }, { "epoch": 1.540913921360255, "grad_norm": 25849000.0, "learning_rate": 9.727240524264968e-06, "loss": 3.5107, "step": 5800 }, { "epoch": 1.5435706695005313, "grad_norm": 7106916.0, "learning_rate": 9.709528869996458e-06, "loss": 3.7538, "step": 5810 }, { "epoch": 1.5462274176408077, "grad_norm": 78143128.0, "learning_rate": 9.69181721572795e-06, "loss": 3.8139, "step": 5820 }, { "epoch": 1.548884165781084, "grad_norm": 124880632.0, "learning_rate": 9.674105561459441e-06, "loss": 3.4966, "step": 5830 }, { "epoch": 1.5515409139213603, "grad_norm": 16674735.0, "learning_rate": 9.656393907190934e-06, "loss": 3.7779, "step": 5840 }, { "epoch": 1.5541976620616365, "grad_norm": 36204444.0, "learning_rate": 9.638682252922424e-06, "loss": 3.5086, "step": 5850 }, { "epoch": 1.5568544102019128, "grad_norm": 7019197.5, "learning_rate": 9.620970598653915e-06, "loss": 3.3062, "step": 5860 }, { "epoch": 1.5595111583421892, "grad_norm": 14028569.0, "learning_rate": 9.603258944385407e-06, "loss": 3.4862, "step": 5870 }, { "epoch": 1.5621679064824656, "grad_norm": 24143218.0, "learning_rate": 9.585547290116898e-06, "loss": 3.388, "step": 5880 }, { "epoch": 1.5648246546227418, "grad_norm": 8635328.0, "learning_rate": 9.56783563584839e-06, "loss": 3.9959, "step": 5890 }, { "epoch": 1.567481402763018, "grad_norm": 14461347.0, "learning_rate": 9.55012398157988e-06, "loss": 3.3619, "step": 5900 }, { "epoch": 1.5701381509032943, "grad_norm": 45164232.0, "learning_rate": 9.532412327311371e-06, "loss": 3.7565, "step": 5910 }, { "epoch": 1.5727948990435707, "grad_norm": 43768708.0, "learning_rate": 9.514700673042863e-06, "loss": 3.2873, "step": 5920 }, { "epoch": 1.5754516471838471, "grad_norm": 102944216.0, "learning_rate": 9.496989018774354e-06, "loss": 3.5849, "step": 5930 }, { "epoch": 1.5781083953241233, "grad_norm": 8864102.0, "learning_rate": 9.479277364505846e-06, "loss": 3.3615, "step": 5940 }, { "epoch": 1.5807651434643994, "grad_norm": 17926040.0, "learning_rate": 9.461565710237337e-06, "loss": 3.3599, "step": 5950 }, { "epoch": 1.5834218916046758, "grad_norm": 563806208.0, "learning_rate": 9.443854055968829e-06, "loss": 3.6726, "step": 5960 }, { "epoch": 1.5860786397449522, "grad_norm": 4375813.5, "learning_rate": 9.42614240170032e-06, "loss": 3.5982, "step": 5970 }, { "epoch": 1.5887353878852286, "grad_norm": 23817932.0, "learning_rate": 9.40843074743181e-06, "loss": 3.6873, "step": 5980 }, { "epoch": 1.5913921360255048, "grad_norm": 3588041.25, "learning_rate": 9.390719093163302e-06, "loss": 3.8219, "step": 5990 }, { "epoch": 1.594048884165781, "grad_norm": 97096224.0, "learning_rate": 9.373007438894793e-06, "loss": 3.5905, "step": 6000 }, { "epoch": 1.5967056323060573, "grad_norm": 4066724.0, "learning_rate": 9.355295784626285e-06, "loss": 3.5762, "step": 6010 }, { "epoch": 1.5993623804463337, "grad_norm": 44529008.0, "learning_rate": 9.337584130357776e-06, "loss": 3.821, "step": 6020 }, { "epoch": 1.60201912858661, "grad_norm": 10141793.0, "learning_rate": 9.319872476089268e-06, "loss": 3.4989, "step": 6030 }, { "epoch": 1.6046758767268863, "grad_norm": 22102744.0, "learning_rate": 9.302160821820759e-06, "loss": 3.4363, "step": 6040 }, { "epoch": 1.6073326248671624, "grad_norm": 1421525.375, "learning_rate": 9.284449167552249e-06, "loss": 3.3543, "step": 6050 }, { "epoch": 1.6099893730074388, "grad_norm": 17624050.0, "learning_rate": 9.266737513283741e-06, "loss": 3.5835, "step": 6060 }, { "epoch": 1.6126461211477152, "grad_norm": 2787807.5, "learning_rate": 9.249025859015232e-06, "loss": 3.7715, "step": 6070 }, { "epoch": 1.6153028692879916, "grad_norm": 36419916.0, "learning_rate": 9.231314204746724e-06, "loss": 3.2874, "step": 6080 }, { "epoch": 1.6179596174282678, "grad_norm": 550304.0, "learning_rate": 9.213602550478215e-06, "loss": 3.775, "step": 6090 }, { "epoch": 1.620616365568544, "grad_norm": 13110638.0, "learning_rate": 9.195890896209707e-06, "loss": 4.0895, "step": 6100 }, { "epoch": 1.6232731137088203, "grad_norm": 153279.40625, "learning_rate": 9.1781792419412e-06, "loss": 3.5868, "step": 6110 }, { "epoch": 1.6259298618490967, "grad_norm": 274644.03125, "learning_rate": 9.160467587672688e-06, "loss": 3.4759, "step": 6120 }, { "epoch": 1.628586609989373, "grad_norm": 21545.19921875, "learning_rate": 9.14275593340418e-06, "loss": 4.0524, "step": 6130 }, { "epoch": 1.6312433581296493, "grad_norm": 27863.1015625, "learning_rate": 9.125044279135673e-06, "loss": 3.4133, "step": 6140 }, { "epoch": 1.6339001062699257, "grad_norm": 146765.640625, "learning_rate": 9.107332624867163e-06, "loss": 3.6765, "step": 6150 }, { "epoch": 1.6365568544102018, "grad_norm": 60709.375, "learning_rate": 9.089620970598656e-06, "loss": 3.8558, "step": 6160 }, { "epoch": 1.6392136025504782, "grad_norm": 290704.21875, "learning_rate": 9.071909316330146e-06, "loss": 3.3615, "step": 6170 }, { "epoch": 1.6418703506907546, "grad_norm": 198007.828125, "learning_rate": 9.054197662061637e-06, "loss": 3.6759, "step": 6180 }, { "epoch": 1.6445270988310308, "grad_norm": 30211.29296875, "learning_rate": 9.036486007793129e-06, "loss": 4.1618, "step": 6190 }, { "epoch": 1.6471838469713072, "grad_norm": 697217.3125, "learning_rate": 9.01877435352462e-06, "loss": 3.5873, "step": 6200 }, { "epoch": 1.6498405951115833, "grad_norm": 311260.34375, "learning_rate": 9.001062699256112e-06, "loss": 4.0309, "step": 6210 }, { "epoch": 1.6524973432518597, "grad_norm": 7285945.0, "learning_rate": 8.983351044987602e-06, "loss": 3.7024, "step": 6220 }, { "epoch": 1.655154091392136, "grad_norm": 238075.265625, "learning_rate": 8.965639390719095e-06, "loss": 3.7081, "step": 6230 }, { "epoch": 1.6578108395324125, "grad_norm": 104777.8828125, "learning_rate": 8.947927736450585e-06, "loss": 3.6374, "step": 6240 }, { "epoch": 1.6604675876726886, "grad_norm": 45899.98828125, "learning_rate": 8.930216082182076e-06, "loss": 3.7753, "step": 6250 }, { "epoch": 1.6631243358129648, "grad_norm": 4903258.0, "learning_rate": 8.912504427913568e-06, "loss": 3.7641, "step": 6260 }, { "epoch": 1.6657810839532412, "grad_norm": 691504.875, "learning_rate": 8.894792773645059e-06, "loss": 3.012, "step": 6270 }, { "epoch": 1.6684378320935176, "grad_norm": 7211197.0, "learning_rate": 8.877081119376551e-06, "loss": 3.278, "step": 6280 }, { "epoch": 1.671094580233794, "grad_norm": 55386.39453125, "learning_rate": 8.859369465108042e-06, "loss": 3.5972, "step": 6290 }, { "epoch": 1.6737513283740701, "grad_norm": 4803297.5, "learning_rate": 8.841657810839534e-06, "loss": 3.5168, "step": 6300 }, { "epoch": 1.6764080765143463, "grad_norm": 153394.5625, "learning_rate": 8.823946156571024e-06, "loss": 3.4884, "step": 6310 }, { "epoch": 1.6790648246546227, "grad_norm": 105014.6796875, "learning_rate": 8.806234502302515e-06, "loss": 3.5724, "step": 6320 }, { "epoch": 1.681721572794899, "grad_norm": 425531.6875, "learning_rate": 8.788522848034007e-06, "loss": 3.7171, "step": 6330 }, { "epoch": 1.6843783209351755, "grad_norm": 881638.625, "learning_rate": 8.770811193765498e-06, "loss": 3.5689, "step": 6340 }, { "epoch": 1.6870350690754516, "grad_norm": 506417.84375, "learning_rate": 8.75309953949699e-06, "loss": 3.3471, "step": 6350 }, { "epoch": 1.6896918172157278, "grad_norm": 218658.8125, "learning_rate": 8.73538788522848e-06, "loss": 3.0762, "step": 6360 }, { "epoch": 1.6923485653560042, "grad_norm": 3747502.5, "learning_rate": 8.717676230959973e-06, "loss": 3.7819, "step": 6370 }, { "epoch": 1.6950053134962806, "grad_norm": 402977.15625, "learning_rate": 8.699964576691463e-06, "loss": 3.2238, "step": 6380 }, { "epoch": 1.697662061636557, "grad_norm": 354610.0, "learning_rate": 8.682252922422954e-06, "loss": 3.5365, "step": 6390 }, { "epoch": 1.7003188097768331, "grad_norm": 737137.25, "learning_rate": 8.664541268154446e-06, "loss": 3.7334, "step": 6400 }, { "epoch": 1.7029755579171093, "grad_norm": 270020.3125, "learning_rate": 8.646829613885937e-06, "loss": 3.6183, "step": 6410 }, { "epoch": 1.7056323060573857, "grad_norm": 740626.4375, "learning_rate": 8.629117959617429e-06, "loss": 3.7487, "step": 6420 }, { "epoch": 1.708289054197662, "grad_norm": 1305229.75, "learning_rate": 8.61140630534892e-06, "loss": 3.7039, "step": 6430 }, { "epoch": 1.7109458023379385, "grad_norm": 172010.875, "learning_rate": 8.593694651080412e-06, "loss": 2.9064, "step": 6440 }, { "epoch": 1.7136025504782146, "grad_norm": 36386.55859375, "learning_rate": 8.575982996811903e-06, "loss": 3.5462, "step": 6450 }, { "epoch": 1.7162592986184908, "grad_norm": 280424.5, "learning_rate": 8.558271342543393e-06, "loss": 3.7119, "step": 6460 }, { "epoch": 1.7189160467587672, "grad_norm": 65134.73828125, "learning_rate": 8.540559688274885e-06, "loss": 4.058, "step": 6470 }, { "epoch": 1.7215727948990436, "grad_norm": 66937.53125, "learning_rate": 8.522848034006378e-06, "loss": 3.3975, "step": 6480 }, { "epoch": 1.72422954303932, "grad_norm": 131224.421875, "learning_rate": 8.505136379737868e-06, "loss": 3.4813, "step": 6490 }, { "epoch": 1.7268862911795961, "grad_norm": 108172.1640625, "learning_rate": 8.48742472546936e-06, "loss": 3.1716, "step": 6500 }, { "epoch": 1.7295430393198725, "grad_norm": 25198.029296875, "learning_rate": 8.469713071200851e-06, "loss": 3.6849, "step": 6510 }, { "epoch": 1.7321997874601487, "grad_norm": 61498.03515625, "learning_rate": 8.452001416932342e-06, "loss": 3.4036, "step": 6520 }, { "epoch": 1.734856535600425, "grad_norm": 442683.875, "learning_rate": 8.434289762663834e-06, "loss": 3.3497, "step": 6530 }, { "epoch": 1.7375132837407015, "grad_norm": 27654.84765625, "learning_rate": 8.416578108395324e-06, "loss": 3.2324, "step": 6540 }, { "epoch": 1.7401700318809776, "grad_norm": 87875.5546875, "learning_rate": 8.398866454126817e-06, "loss": 3.211, "step": 6550 }, { "epoch": 1.742826780021254, "grad_norm": 443493.65625, "learning_rate": 8.381154799858307e-06, "loss": 3.5746, "step": 6560 }, { "epoch": 1.7454835281615302, "grad_norm": 112091.3046875, "learning_rate": 8.3634431455898e-06, "loss": 3.2604, "step": 6570 }, { "epoch": 1.7481402763018066, "grad_norm": 37516.62109375, "learning_rate": 8.34573149132129e-06, "loss": 3.3058, "step": 6580 }, { "epoch": 1.750797024442083, "grad_norm": 98792.796875, "learning_rate": 8.32801983705278e-06, "loss": 3.4504, "step": 6590 }, { "epoch": 1.7534537725823593, "grad_norm": 24296.8125, "learning_rate": 8.310308182784273e-06, "loss": 3.2476, "step": 6600 }, { "epoch": 1.7561105207226355, "grad_norm": 27490.43359375, "learning_rate": 8.292596528515764e-06, "loss": 3.4551, "step": 6610 }, { "epoch": 1.7587672688629117, "grad_norm": 163381.75, "learning_rate": 8.274884874247256e-06, "loss": 3.56, "step": 6620 }, { "epoch": 1.761424017003188, "grad_norm": 5022.00244140625, "learning_rate": 8.257173219978746e-06, "loss": 3.2829, "step": 6630 }, { "epoch": 1.7640807651434645, "grad_norm": 873426.5, "learning_rate": 8.239461565710239e-06, "loss": 3.292, "step": 6640 }, { "epoch": 1.7667375132837408, "grad_norm": 48760.75390625, "learning_rate": 8.22174991144173e-06, "loss": 3.3971, "step": 6650 }, { "epoch": 1.769394261424017, "grad_norm": 22562.328125, "learning_rate": 8.20403825717322e-06, "loss": 3.4901, "step": 6660 }, { "epoch": 1.7720510095642932, "grad_norm": 110952.984375, "learning_rate": 8.186326602904712e-06, "loss": 3.5824, "step": 6670 }, { "epoch": 1.7747077577045696, "grad_norm": 11664.615234375, "learning_rate": 8.168614948636203e-06, "loss": 3.6433, "step": 6680 }, { "epoch": 1.777364505844846, "grad_norm": 296820.28125, "learning_rate": 8.150903294367695e-06, "loss": 3.4816, "step": 6690 }, { "epoch": 1.7800212539851223, "grad_norm": 28750.556640625, "learning_rate": 8.133191640099186e-06, "loss": 3.4851, "step": 6700 }, { "epoch": 1.7826780021253985, "grad_norm": 86309.7890625, "learning_rate": 8.115479985830678e-06, "loss": 3.3058, "step": 6710 }, { "epoch": 1.7853347502656747, "grad_norm": 91584.7734375, "learning_rate": 8.097768331562168e-06, "loss": 3.7495, "step": 6720 }, { "epoch": 1.787991498405951, "grad_norm": 132450.96875, "learning_rate": 8.080056677293659e-06, "loss": 3.4955, "step": 6730 }, { "epoch": 1.7906482465462275, "grad_norm": 134387.046875, "learning_rate": 8.062345023025151e-06, "loss": 3.4655, "step": 6740 }, { "epoch": 1.7933049946865038, "grad_norm": 74426.6875, "learning_rate": 8.044633368756642e-06, "loss": 3.7594, "step": 6750 }, { "epoch": 1.79596174282678, "grad_norm": 58667.3984375, "learning_rate": 8.026921714488134e-06, "loss": 3.7655, "step": 6760 }, { "epoch": 1.7986184909670562, "grad_norm": 130389.9140625, "learning_rate": 8.009210060219625e-06, "loss": 3.673, "step": 6770 }, { "epoch": 1.8012752391073326, "grad_norm": 89147.9296875, "learning_rate": 7.991498405951117e-06, "loss": 3.2874, "step": 6780 }, { "epoch": 1.803931987247609, "grad_norm": 44793.80859375, "learning_rate": 7.973786751682607e-06, "loss": 3.2517, "step": 6790 }, { "epoch": 1.8065887353878853, "grad_norm": 15245.392578125, "learning_rate": 7.956075097414098e-06, "loss": 3.5179, "step": 6800 }, { "epoch": 1.8092454835281615, "grad_norm": 15995.4912109375, "learning_rate": 7.93836344314559e-06, "loss": 3.6515, "step": 6810 }, { "epoch": 1.8119022316684377, "grad_norm": 16524.787109375, "learning_rate": 7.920651788877083e-06, "loss": 3.1618, "step": 6820 }, { "epoch": 1.814558979808714, "grad_norm": 42409.20703125, "learning_rate": 7.902940134608573e-06, "loss": 3.58, "step": 6830 }, { "epoch": 1.8172157279489904, "grad_norm": 10542.6796875, "learning_rate": 7.885228480340065e-06, "loss": 3.508, "step": 6840 }, { "epoch": 1.8198724760892668, "grad_norm": 25151.1484375, "learning_rate": 7.867516826071556e-06, "loss": 3.0635, "step": 6850 }, { "epoch": 1.822529224229543, "grad_norm": 9499.1826171875, "learning_rate": 7.849805171803047e-06, "loss": 2.9901, "step": 6860 }, { "epoch": 1.8251859723698194, "grad_norm": 54946.984375, "learning_rate": 7.832093517534539e-06, "loss": 2.9531, "step": 6870 }, { "epoch": 1.8278427205100956, "grad_norm": 10790.599609375, "learning_rate": 7.81438186326603e-06, "loss": 3.5882, "step": 6880 }, { "epoch": 1.830499468650372, "grad_norm": 13575.8759765625, "learning_rate": 7.796670208997522e-06, "loss": 3.3612, "step": 6890 }, { "epoch": 1.8331562167906483, "grad_norm": 20945.48046875, "learning_rate": 7.778958554729012e-06, "loss": 3.0764, "step": 6900 }, { "epoch": 1.8358129649309245, "grad_norm": 232869.03125, "learning_rate": 7.761246900460504e-06, "loss": 3.1716, "step": 6910 }, { "epoch": 1.8384697130712009, "grad_norm": 43791.59765625, "learning_rate": 7.743535246191995e-06, "loss": 3.2311, "step": 6920 }, { "epoch": 1.841126461211477, "grad_norm": 22579.091796875, "learning_rate": 7.725823591923486e-06, "loss": 3.4563, "step": 6930 }, { "epoch": 1.8437832093517534, "grad_norm": 28530.806640625, "learning_rate": 7.708111937654978e-06, "loss": 3.452, "step": 6940 }, { "epoch": 1.8464399574920298, "grad_norm": 12486.0390625, "learning_rate": 7.690400283386468e-06, "loss": 3.2791, "step": 6950 }, { "epoch": 1.8490967056323062, "grad_norm": 17018.11328125, "learning_rate": 7.67268862911796e-06, "loss": 3.6792, "step": 6960 }, { "epoch": 1.8517534537725824, "grad_norm": 16199.2470703125, "learning_rate": 7.654976974849451e-06, "loss": 3.2561, "step": 6970 }, { "epoch": 1.8544102019128585, "grad_norm": 10388.2470703125, "learning_rate": 7.637265320580944e-06, "loss": 3.0233, "step": 6980 }, { "epoch": 1.857066950053135, "grad_norm": 15407.7548828125, "learning_rate": 7.619553666312433e-06, "loss": 3.167, "step": 6990 }, { "epoch": 1.8597236981934113, "grad_norm": 26815.095703125, "learning_rate": 7.601842012043925e-06, "loss": 3.2972, "step": 7000 }, { "epoch": 1.8623804463336877, "grad_norm": 58698.21875, "learning_rate": 7.584130357775417e-06, "loss": 3.2334, "step": 7010 }, { "epoch": 1.8650371944739639, "grad_norm": 27274.71875, "learning_rate": 7.566418703506908e-06, "loss": 3.1432, "step": 7020 }, { "epoch": 1.86769394261424, "grad_norm": 83316.0703125, "learning_rate": 7.5487070492384e-06, "loss": 3.1284, "step": 7030 }, { "epoch": 1.8703506907545164, "grad_norm": 30122.771484375, "learning_rate": 7.530995394969891e-06, "loss": 2.8904, "step": 7040 }, { "epoch": 1.8730074388947928, "grad_norm": 40200.9609375, "learning_rate": 7.513283740701383e-06, "loss": 3.3255, "step": 7050 }, { "epoch": 1.8756641870350692, "grad_norm": 16342.447265625, "learning_rate": 7.495572086432873e-06, "loss": 3.1073, "step": 7060 }, { "epoch": 1.8783209351753454, "grad_norm": 14423.703125, "learning_rate": 7.477860432164365e-06, "loss": 3.4831, "step": 7070 }, { "epoch": 1.8809776833156215, "grad_norm": 34366.14453125, "learning_rate": 7.460148777895856e-06, "loss": 3.2063, "step": 7080 }, { "epoch": 1.883634431455898, "grad_norm": 70803.8359375, "learning_rate": 7.4424371236273475e-06, "loss": 3.5181, "step": 7090 }, { "epoch": 1.8862911795961743, "grad_norm": 13800.69140625, "learning_rate": 7.424725469358839e-06, "loss": 3.4993, "step": 7100 }, { "epoch": 1.8889479277364507, "grad_norm": 48057.68359375, "learning_rate": 7.40701381509033e-06, "loss": 3.1418, "step": 7110 }, { "epoch": 1.8916046758767269, "grad_norm": 40145.4921875, "learning_rate": 7.389302160821822e-06, "loss": 3.3427, "step": 7120 }, { "epoch": 1.894261424017003, "grad_norm": 13148.1484375, "learning_rate": 7.371590506553312e-06, "loss": 3.2584, "step": 7130 }, { "epoch": 1.8969181721572794, "grad_norm": 10740.6826171875, "learning_rate": 7.353878852284804e-06, "loss": 2.8656, "step": 7140 }, { "epoch": 1.8995749202975558, "grad_norm": 7270.3818359375, "learning_rate": 7.336167198016295e-06, "loss": 3.1929, "step": 7150 }, { "epoch": 1.9022316684378322, "grad_norm": 3250.9072265625, "learning_rate": 7.318455543747787e-06, "loss": 3.3218, "step": 7160 }, { "epoch": 1.9048884165781084, "grad_norm": 40904.6484375, "learning_rate": 7.300743889479278e-06, "loss": 3.0994, "step": 7170 }, { "epoch": 1.9075451647183845, "grad_norm": 9426.0009765625, "learning_rate": 7.2830322352107695e-06, "loss": 3.2861, "step": 7180 }, { "epoch": 1.910201912858661, "grad_norm": 10107.427734375, "learning_rate": 7.265320580942261e-06, "loss": 3.3694, "step": 7190 }, { "epoch": 1.9128586609989373, "grad_norm": 25632.7734375, "learning_rate": 7.2476089266737514e-06, "loss": 3.1918, "step": 7200 }, { "epoch": 1.9155154091392137, "grad_norm": 10823.2509765625, "learning_rate": 7.229897272405243e-06, "loss": 3.1984, "step": 7210 }, { "epoch": 1.9181721572794899, "grad_norm": 8237.4482421875, "learning_rate": 7.212185618136734e-06, "loss": 2.7874, "step": 7220 }, { "epoch": 1.9208289054197663, "grad_norm": 4823.09716796875, "learning_rate": 7.194473963868226e-06, "loss": 3.2625, "step": 7230 }, { "epoch": 1.9234856535600424, "grad_norm": 6276.54150390625, "learning_rate": 7.176762309599717e-06, "loss": 3.1739, "step": 7240 }, { "epoch": 1.9261424017003188, "grad_norm": 9979.935546875, "learning_rate": 7.1590506553312085e-06, "loss": 3.34, "step": 7250 }, { "epoch": 1.9287991498405952, "grad_norm": 3373.656982421875, "learning_rate": 7.141339001062701e-06, "loss": 3.4366, "step": 7260 }, { "epoch": 1.9314558979808714, "grad_norm": 9178.9404296875, "learning_rate": 7.1236273467941905e-06, "loss": 3.245, "step": 7270 }, { "epoch": 1.9341126461211477, "grad_norm": 11173.3037109375, "learning_rate": 7.105915692525682e-06, "loss": 3.1306, "step": 7280 }, { "epoch": 1.936769394261424, "grad_norm": 6969.20849609375, "learning_rate": 7.088204038257173e-06, "loss": 3.5482, "step": 7290 }, { "epoch": 1.9394261424017003, "grad_norm": 22079.796875, "learning_rate": 7.070492383988665e-06, "loss": 3.2338, "step": 7300 }, { "epoch": 1.9420828905419767, "grad_norm": 51803.05078125, "learning_rate": 7.052780729720157e-06, "loss": 3.1844, "step": 7310 }, { "epoch": 1.944739638682253, "grad_norm": 17502.84375, "learning_rate": 7.0350690754516485e-06, "loss": 3.3796, "step": 7320 }, { "epoch": 1.9473963868225292, "grad_norm": 4275.10009765625, "learning_rate": 7.017357421183138e-06, "loss": 3.0306, "step": 7330 }, { "epoch": 1.9500531349628054, "grad_norm": 3620.85400390625, "learning_rate": 6.99964576691463e-06, "loss": 3.3635, "step": 7340 }, { "epoch": 1.9527098831030818, "grad_norm": 32547.673828125, "learning_rate": 6.981934112646122e-06, "loss": 3.1764, "step": 7350 }, { "epoch": 1.9553666312433582, "grad_norm": 5065.5751953125, "learning_rate": 6.964222458377613e-06, "loss": 3.1895, "step": 7360 }, { "epoch": 1.9580233793836346, "grad_norm": 10395.2060546875, "learning_rate": 6.946510804109105e-06, "loss": 3.1655, "step": 7370 }, { "epoch": 1.9606801275239107, "grad_norm": 4557.41796875, "learning_rate": 6.928799149840596e-06, "loss": 3.0581, "step": 7380 }, { "epoch": 1.963336875664187, "grad_norm": 38417.4765625, "learning_rate": 6.911087495572088e-06, "loss": 3.0893, "step": 7390 }, { "epoch": 1.9659936238044633, "grad_norm": 5107.16796875, "learning_rate": 6.893375841303578e-06, "loss": 3.4167, "step": 7400 }, { "epoch": 1.9686503719447397, "grad_norm": 5035.6201171875, "learning_rate": 6.87566418703507e-06, "loss": 3.0664, "step": 7410 }, { "epoch": 1.971307120085016, "grad_norm": 12651.587890625, "learning_rate": 6.857952532766561e-06, "loss": 3.1299, "step": 7420 }, { "epoch": 1.9739638682252922, "grad_norm": 7539.5400390625, "learning_rate": 6.840240878498052e-06, "loss": 3.0492, "step": 7430 }, { "epoch": 1.9766206163655684, "grad_norm": 5577.158203125, "learning_rate": 6.822529224229544e-06, "loss": 3.098, "step": 7440 }, { "epoch": 1.9792773645058448, "grad_norm": 41558.4921875, "learning_rate": 6.804817569961035e-06, "loss": 3.2933, "step": 7450 }, { "epoch": 1.9819341126461212, "grad_norm": 3775.939697265625, "learning_rate": 6.787105915692527e-06, "loss": 2.9667, "step": 7460 }, { "epoch": 1.9845908607863976, "grad_norm": 30318.9921875, "learning_rate": 6.769394261424017e-06, "loss": 3.0666, "step": 7470 }, { "epoch": 1.9872476089266737, "grad_norm": 21865.806640625, "learning_rate": 6.751682607155509e-06, "loss": 2.9213, "step": 7480 }, { "epoch": 1.98990435706695, "grad_norm": 10458.220703125, "learning_rate": 6.733970952887e-06, "loss": 3.2567, "step": 7490 }, { "epoch": 1.9925611052072263, "grad_norm": 14638.0439453125, "learning_rate": 6.7162592986184915e-06, "loss": 3.2132, "step": 7500 } ], "logging_steps": 10, "max_steps": 11292, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7836212920320000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }