|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9925611052072263, |
|
"eval_steps": 500, |
|
"global_step": 7500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002656748140276302, |
|
"grad_norm": 207.1781768798828, |
|
"learning_rate": 1.9982288345731494e-05, |
|
"loss": 6.5351, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005313496280552604, |
|
"grad_norm": 265.0539855957031, |
|
"learning_rate": 1.9964576691462986e-05, |
|
"loss": 5.2822, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007970244420828906, |
|
"grad_norm": 1933.7158203125, |
|
"learning_rate": 1.9946865037194475e-05, |
|
"loss": 4.7764, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010626992561105207, |
|
"grad_norm": 896.5211791992188, |
|
"learning_rate": 1.9929153382925967e-05, |
|
"loss": 4.5617, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.013283740701381509, |
|
"grad_norm": 2147.634765625, |
|
"learning_rate": 1.991144172865746e-05, |
|
"loss": 4.5559, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015940488841657812, |
|
"grad_norm": 1384.8623046875, |
|
"learning_rate": 1.9893730074388952e-05, |
|
"loss": 4.1671, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.018597236981934114, |
|
"grad_norm": 3381.87060546875, |
|
"learning_rate": 1.987601842012044e-05, |
|
"loss": 3.9988, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.021253985122210415, |
|
"grad_norm": 398.0505676269531, |
|
"learning_rate": 1.985830676585193e-05, |
|
"loss": 4.0844, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.023910733262486716, |
|
"grad_norm": 2040.54736328125, |
|
"learning_rate": 1.9840595111583422e-05, |
|
"loss": 3.9493, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.026567481402763018, |
|
"grad_norm": 8612.021484375, |
|
"learning_rate": 1.9822883457314914e-05, |
|
"loss": 3.6944, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02922422954303932, |
|
"grad_norm": 22271.3125, |
|
"learning_rate": 1.9805171803046406e-05, |
|
"loss": 4.1335, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.031880977683315624, |
|
"grad_norm": 5334.6806640625, |
|
"learning_rate": 1.97874601487779e-05, |
|
"loss": 3.9284, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03453772582359192, |
|
"grad_norm": 1616.4825439453125, |
|
"learning_rate": 1.9769748494509388e-05, |
|
"loss": 3.9407, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03719447396386823, |
|
"grad_norm": 137.30589294433594, |
|
"learning_rate": 1.975203684024088e-05, |
|
"loss": 3.7372, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.039851222104144525, |
|
"grad_norm": 2417.81982421875, |
|
"learning_rate": 1.9734325185972372e-05, |
|
"loss": 3.6944, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04250797024442083, |
|
"grad_norm": 7971.87451171875, |
|
"learning_rate": 1.9716613531703864e-05, |
|
"loss": 3.6615, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04516471838469713, |
|
"grad_norm": 1645.13916015625, |
|
"learning_rate": 1.9698901877435353e-05, |
|
"loss": 3.4582, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04782146652497343, |
|
"grad_norm": 2899.1162109375, |
|
"learning_rate": 1.9681190223166846e-05, |
|
"loss": 3.4193, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05047821466524974, |
|
"grad_norm": 13782.0908203125, |
|
"learning_rate": 1.9663478568898338e-05, |
|
"loss": 3.577, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.053134962805526036, |
|
"grad_norm": 7818.07177734375, |
|
"learning_rate": 1.964576691462983e-05, |
|
"loss": 3.2082, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05579171094580234, |
|
"grad_norm": 14882.34375, |
|
"learning_rate": 1.962805526036132e-05, |
|
"loss": 3.1947, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05844845908607864, |
|
"grad_norm": 27526.642578125, |
|
"learning_rate": 1.961034360609281e-05, |
|
"loss": 3.23, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06110520722635494, |
|
"grad_norm": 9511.650390625, |
|
"learning_rate": 1.95926319518243e-05, |
|
"loss": 3.0386, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06376195536663125, |
|
"grad_norm": 2172.15625, |
|
"learning_rate": 1.9574920297555792e-05, |
|
"loss": 3.1756, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06641870350690754, |
|
"grad_norm": 11950.30078125, |
|
"learning_rate": 1.9557208643287285e-05, |
|
"loss": 3.36, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06907545164718384, |
|
"grad_norm": 17726.330078125, |
|
"learning_rate": 1.9539496989018777e-05, |
|
"loss": 3.0231, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07173219978746015, |
|
"grad_norm": 4690.27587890625, |
|
"learning_rate": 1.9521785334750266e-05, |
|
"loss": 3.0029, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07438894792773645, |
|
"grad_norm": 40308.61328125, |
|
"learning_rate": 1.9504073680481758e-05, |
|
"loss": 3.688, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07704569606801276, |
|
"grad_norm": 27147.087890625, |
|
"learning_rate": 1.948636202621325e-05, |
|
"loss": 3.3881, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07970244420828905, |
|
"grad_norm": 59977.046875, |
|
"learning_rate": 1.9468650371944743e-05, |
|
"loss": 3.4571, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08235919234856535, |
|
"grad_norm": 66940.046875, |
|
"learning_rate": 1.9450938717676235e-05, |
|
"loss": 3.3864, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08501594048884166, |
|
"grad_norm": 5094.89013671875, |
|
"learning_rate": 1.9433227063407724e-05, |
|
"loss": 3.3697, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08767268862911796, |
|
"grad_norm": 4367.36474609375, |
|
"learning_rate": 1.9415515409139216e-05, |
|
"loss": 3.3016, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.09032943676939426, |
|
"grad_norm": 7941.5458984375, |
|
"learning_rate": 1.9397803754870705e-05, |
|
"loss": 3.0374, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.09298618490967056, |
|
"grad_norm": 3960.741943359375, |
|
"learning_rate": 1.9380092100602197e-05, |
|
"loss": 3.3324, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09564293304994687, |
|
"grad_norm": 18565.732421875, |
|
"learning_rate": 1.936238044633369e-05, |
|
"loss": 3.2402, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.09829968119022317, |
|
"grad_norm": 66859.0, |
|
"learning_rate": 1.9344668792065178e-05, |
|
"loss": 3.3142, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.10095642933049948, |
|
"grad_norm": 1521.879638671875, |
|
"learning_rate": 1.932695713779667e-05, |
|
"loss": 3.0546, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.10361317747077577, |
|
"grad_norm": 12662.775390625, |
|
"learning_rate": 1.9309245483528163e-05, |
|
"loss": 3.5396, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.10626992561105207, |
|
"grad_norm": 105807.59375, |
|
"learning_rate": 1.9291533829259655e-05, |
|
"loss": 3.5301, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10892667375132838, |
|
"grad_norm": 663547.875, |
|
"learning_rate": 1.9273822174991147e-05, |
|
"loss": 4.28, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.11158342189160468, |
|
"grad_norm": 8186676.0, |
|
"learning_rate": 1.9256110520722636e-05, |
|
"loss": 5.9807, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.11424017003188097, |
|
"grad_norm": 2142551.25, |
|
"learning_rate": 1.923839886645413e-05, |
|
"loss": 9.4764, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11689691817215728, |
|
"grad_norm": 366486.1875, |
|
"learning_rate": 1.922068721218562e-05, |
|
"loss": 10.9151, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11955366631243358, |
|
"grad_norm": 2276693.0, |
|
"learning_rate": 1.9202975557917113e-05, |
|
"loss": 12.5549, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.12221041445270989, |
|
"grad_norm": 2184425.5, |
|
"learning_rate": 1.9185263903648602e-05, |
|
"loss": 13.1915, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.12486716259298619, |
|
"grad_norm": 2937578.75, |
|
"learning_rate": 1.9167552249380094e-05, |
|
"loss": 14.2279, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1275239107332625, |
|
"grad_norm": 10091141.0, |
|
"learning_rate": 1.9149840595111583e-05, |
|
"loss": 13.4766, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1301806588735388, |
|
"grad_norm": 5426885.5, |
|
"learning_rate": 1.9132128940843075e-05, |
|
"loss": 14.8065, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.13283740701381508, |
|
"grad_norm": 2068535.25, |
|
"learning_rate": 1.9114417286574568e-05, |
|
"loss": 16.3781, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13549415515409138, |
|
"grad_norm": 3599295.0, |
|
"learning_rate": 1.909670563230606e-05, |
|
"loss": 15.1519, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1381509032943677, |
|
"grad_norm": 761431.875, |
|
"learning_rate": 1.907899397803755e-05, |
|
"loss": 15.1124, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.140807651434644, |
|
"grad_norm": 933641.375, |
|
"learning_rate": 1.906128232376904e-05, |
|
"loss": 14.1038, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1434643995749203, |
|
"grad_norm": 423861.0625, |
|
"learning_rate": 1.9043570669500533e-05, |
|
"loss": 13.7131, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.1461211477151966, |
|
"grad_norm": 5383.50537109375, |
|
"learning_rate": 1.9025859015232026e-05, |
|
"loss": 12.8075, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1487778958554729, |
|
"grad_norm": 3759.12548828125, |
|
"learning_rate": 1.9008147360963514e-05, |
|
"loss": 10.7237, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1514346439957492, |
|
"grad_norm": 2150.089111328125, |
|
"learning_rate": 1.8990435706695007e-05, |
|
"loss": 7.2887, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.15409139213602552, |
|
"grad_norm": 3893.645751953125, |
|
"learning_rate": 1.89727240524265e-05, |
|
"loss": 4.8237, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1567481402763018, |
|
"grad_norm": 11881.3046875, |
|
"learning_rate": 1.895501239815799e-05, |
|
"loss": 3.9525, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.1594048884165781, |
|
"grad_norm": 14820.740234375, |
|
"learning_rate": 1.8937300743889483e-05, |
|
"loss": 4.6401, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1620616365568544, |
|
"grad_norm": 99031.640625, |
|
"learning_rate": 1.8919589089620972e-05, |
|
"loss": 4.9725, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1647183846971307, |
|
"grad_norm": 47882.5859375, |
|
"learning_rate": 1.890187743535246e-05, |
|
"loss": 4.6917, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.16737513283740701, |
|
"grad_norm": 77129.8046875, |
|
"learning_rate": 1.8884165781083953e-05, |
|
"loss": 4.3883, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.17003188097768332, |
|
"grad_norm": 85341.125, |
|
"learning_rate": 1.8866454126815446e-05, |
|
"loss": 5.114, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.17268862911795962, |
|
"grad_norm": 34883.13671875, |
|
"learning_rate": 1.8848742472546938e-05, |
|
"loss": 4.9715, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.17534537725823593, |
|
"grad_norm": 22649.3359375, |
|
"learning_rate": 1.8831030818278427e-05, |
|
"loss": 4.9266, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.17800212539851223, |
|
"grad_norm": 59614.453125, |
|
"learning_rate": 1.881331916400992e-05, |
|
"loss": 4.3894, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.1806588735387885, |
|
"grad_norm": 13419.771484375, |
|
"learning_rate": 1.879560750974141e-05, |
|
"loss": 4.238, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.18331562167906482, |
|
"grad_norm": 26652.462890625, |
|
"learning_rate": 1.8777895855472904e-05, |
|
"loss": 4.5253, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.18597236981934112, |
|
"grad_norm": 37440.6015625, |
|
"learning_rate": 1.8760184201204396e-05, |
|
"loss": 4.0546, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.18862911795961743, |
|
"grad_norm": 43147.1796875, |
|
"learning_rate": 1.8742472546935885e-05, |
|
"loss": 4.4831, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.19128586609989373, |
|
"grad_norm": 143355.296875, |
|
"learning_rate": 1.8724760892667377e-05, |
|
"loss": 4.5257, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.19394261424017004, |
|
"grad_norm": 12484.8466796875, |
|
"learning_rate": 1.870704923839887e-05, |
|
"loss": 4.9662, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.19659936238044634, |
|
"grad_norm": 10305.0126953125, |
|
"learning_rate": 1.868933758413036e-05, |
|
"loss": 5.3629, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.19925611052072265, |
|
"grad_norm": 3247.491943359375, |
|
"learning_rate": 1.867162592986185e-05, |
|
"loss": 5.014, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.20191285866099895, |
|
"grad_norm": 2328.57470703125, |
|
"learning_rate": 1.8653914275593343e-05, |
|
"loss": 4.9864, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.20456960680127523, |
|
"grad_norm": 16007.7978515625, |
|
"learning_rate": 1.863620262132483e-05, |
|
"loss": 4.5492, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.20722635494155153, |
|
"grad_norm": 39521.5078125, |
|
"learning_rate": 1.8618490967056324e-05, |
|
"loss": 4.4608, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.20988310308182784, |
|
"grad_norm": 553922.0, |
|
"learning_rate": 1.8600779312787816e-05, |
|
"loss": 4.9998, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.21253985122210414, |
|
"grad_norm": 623164.25, |
|
"learning_rate": 1.858306765851931e-05, |
|
"loss": 4.6969, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.21519659936238045, |
|
"grad_norm": 849724.3125, |
|
"learning_rate": 1.8565356004250797e-05, |
|
"loss": 5.2992, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.21785334750265675, |
|
"grad_norm": 1883489.125, |
|
"learning_rate": 1.854764434998229e-05, |
|
"loss": 5.5446, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.22051009564293306, |
|
"grad_norm": 1473608.5, |
|
"learning_rate": 1.8529932695713782e-05, |
|
"loss": 5.6081, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.22316684378320936, |
|
"grad_norm": 6046079.5, |
|
"learning_rate": 1.8512221041445274e-05, |
|
"loss": 5.543, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.22582359192348567, |
|
"grad_norm": 3414641.75, |
|
"learning_rate": 1.8494509387176763e-05, |
|
"loss": 6.5477, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.22848034006376194, |
|
"grad_norm": 3107066.0, |
|
"learning_rate": 1.8476797732908255e-05, |
|
"loss": 6.6238, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.23113708820403825, |
|
"grad_norm": 2057658.75, |
|
"learning_rate": 1.8459086078639748e-05, |
|
"loss": 6.6566, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.23379383634431455, |
|
"grad_norm": 689954.125, |
|
"learning_rate": 1.8441374424371236e-05, |
|
"loss": 5.6908, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.23645058448459086, |
|
"grad_norm": 5757.73388671875, |
|
"learning_rate": 1.842366277010273e-05, |
|
"loss": 4.5477, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.23910733262486716, |
|
"grad_norm": 5359.6728515625, |
|
"learning_rate": 1.840595111583422e-05, |
|
"loss": 3.6785, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.24176408076514347, |
|
"grad_norm": 2013.8673095703125, |
|
"learning_rate": 1.838823946156571e-05, |
|
"loss": 3.519, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.24442082890541977, |
|
"grad_norm": 6289.10888671875, |
|
"learning_rate": 1.8370527807297202e-05, |
|
"loss": 3.6842, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.24707757704569608, |
|
"grad_norm": 3089.353759765625, |
|
"learning_rate": 1.8352816153028694e-05, |
|
"loss": 3.6535, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.24973432518597238, |
|
"grad_norm": 2002.3780517578125, |
|
"learning_rate": 1.8335104498760187e-05, |
|
"loss": 3.5385, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.25239107332624866, |
|
"grad_norm": 5194.0224609375, |
|
"learning_rate": 1.8317392844491676e-05, |
|
"loss": 3.4652, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.255047821466525, |
|
"grad_norm": 2200.886962890625, |
|
"learning_rate": 1.8299681190223168e-05, |
|
"loss": 3.6788, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.25770456960680127, |
|
"grad_norm": 10148.009765625, |
|
"learning_rate": 1.828196953595466e-05, |
|
"loss": 3.7478, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.2603613177470776, |
|
"grad_norm": 2540.3837890625, |
|
"learning_rate": 1.8264257881686152e-05, |
|
"loss": 3.4836, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2630180658873539, |
|
"grad_norm": 2385.15625, |
|
"learning_rate": 1.8246546227417645e-05, |
|
"loss": 3.2733, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.26567481402763016, |
|
"grad_norm": 8635.650390625, |
|
"learning_rate": 1.8228834573149134e-05, |
|
"loss": 3.4935, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2683315621679065, |
|
"grad_norm": 17405.947265625, |
|
"learning_rate": 1.8211122918880626e-05, |
|
"loss": 3.3743, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.27098831030818277, |
|
"grad_norm": 2616.988037109375, |
|
"learning_rate": 1.8193411264612115e-05, |
|
"loss": 4.0444, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2736450584484591, |
|
"grad_norm": 9487.044921875, |
|
"learning_rate": 1.8175699610343607e-05, |
|
"loss": 3.8644, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.2763018065887354, |
|
"grad_norm": 681.0313110351562, |
|
"learning_rate": 1.81579879560751e-05, |
|
"loss": 3.2198, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2789585547290117, |
|
"grad_norm": 1654.2945556640625, |
|
"learning_rate": 1.8140276301806588e-05, |
|
"loss": 3.741, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.281615302869288, |
|
"grad_norm": 2555.9970703125, |
|
"learning_rate": 1.812256464753808e-05, |
|
"loss": 3.5377, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2842720510095643, |
|
"grad_norm": 1187.751220703125, |
|
"learning_rate": 1.8104852993269573e-05, |
|
"loss": 3.6048, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2869287991498406, |
|
"grad_norm": 2747.8486328125, |
|
"learning_rate": 1.8087141339001065e-05, |
|
"loss": 3.6148, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2895855472901169, |
|
"grad_norm": 624.16650390625, |
|
"learning_rate": 1.8069429684732557e-05, |
|
"loss": 3.0917, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.2922422954303932, |
|
"grad_norm": 283.41033935546875, |
|
"learning_rate": 1.8051718030464046e-05, |
|
"loss": 3.4423, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2948990435706695, |
|
"grad_norm": 563.9237670898438, |
|
"learning_rate": 1.8034006376195538e-05, |
|
"loss": 3.1134, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.2975557917109458, |
|
"grad_norm": 419.8347473144531, |
|
"learning_rate": 1.801629472192703e-05, |
|
"loss": 3.3765, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3002125398512221, |
|
"grad_norm": 328.199462890625, |
|
"learning_rate": 1.7998583067658523e-05, |
|
"loss": 3.1981, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3028692879914984, |
|
"grad_norm": 1167.4515380859375, |
|
"learning_rate": 1.7980871413390012e-05, |
|
"loss": 2.9826, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.3055260361317747, |
|
"grad_norm": 1590.5523681640625, |
|
"learning_rate": 1.7963159759121504e-05, |
|
"loss": 3.2378, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.30818278427205104, |
|
"grad_norm": 1228.88037109375, |
|
"learning_rate": 1.7945448104852993e-05, |
|
"loss": 3.2167, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3108395324123273, |
|
"grad_norm": 866.290283203125, |
|
"learning_rate": 1.7927736450584485e-05, |
|
"loss": 2.9749, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.3134962805526036, |
|
"grad_norm": 326.7938537597656, |
|
"learning_rate": 1.7910024796315977e-05, |
|
"loss": 3.111, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.3161530286928799, |
|
"grad_norm": 603.0250854492188, |
|
"learning_rate": 1.789231314204747e-05, |
|
"loss": 3.1647, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3188097768331562, |
|
"grad_norm": 553.5940551757812, |
|
"learning_rate": 1.787460148777896e-05, |
|
"loss": 3.1094, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.32146652497343253, |
|
"grad_norm": 417.6220703125, |
|
"learning_rate": 1.785688983351045e-05, |
|
"loss": 3.195, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.3241232731137088, |
|
"grad_norm": 745.7908935546875, |
|
"learning_rate": 1.7839178179241943e-05, |
|
"loss": 2.8119, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.32678002125398514, |
|
"grad_norm": 963.697021484375, |
|
"learning_rate": 1.7821466524973435e-05, |
|
"loss": 2.9828, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3294367693942614, |
|
"grad_norm": 3789.7373046875, |
|
"learning_rate": 1.7803754870704924e-05, |
|
"loss": 2.8971, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.33209351753453775, |
|
"grad_norm": 1777.551025390625, |
|
"learning_rate": 1.7786043216436416e-05, |
|
"loss": 2.8533, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.33475026567481403, |
|
"grad_norm": 725.1536254882812, |
|
"learning_rate": 1.776833156216791e-05, |
|
"loss": 2.6644, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3374070138150903, |
|
"grad_norm": 2410.62060546875, |
|
"learning_rate": 1.77506199078994e-05, |
|
"loss": 3.058, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.34006376195536664, |
|
"grad_norm": 825.2067260742188, |
|
"learning_rate": 1.7732908253630893e-05, |
|
"loss": 2.7154, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3427205100956429, |
|
"grad_norm": 835.7099609375, |
|
"learning_rate": 1.7715196599362382e-05, |
|
"loss": 3.5358, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.34537725823591925, |
|
"grad_norm": 2334.035888671875, |
|
"learning_rate": 1.769748494509387e-05, |
|
"loss": 3.2141, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3480340063761955, |
|
"grad_norm": 1089.702392578125, |
|
"learning_rate": 1.7679773290825363e-05, |
|
"loss": 2.8534, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.35069075451647186, |
|
"grad_norm": 643.6981811523438, |
|
"learning_rate": 1.7662061636556856e-05, |
|
"loss": 3.14, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.35334750265674814, |
|
"grad_norm": 927.3551025390625, |
|
"learning_rate": 1.7644349982288348e-05, |
|
"loss": 3.255, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.35600425079702447, |
|
"grad_norm": 642.1421508789062, |
|
"learning_rate": 1.7626638328019837e-05, |
|
"loss": 2.9875, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.35866099893730075, |
|
"grad_norm": 1514.4876708984375, |
|
"learning_rate": 1.760892667375133e-05, |
|
"loss": 2.7786, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.361317747077577, |
|
"grad_norm": 2913.84912109375, |
|
"learning_rate": 1.759121501948282e-05, |
|
"loss": 2.83, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.36397449521785336, |
|
"grad_norm": 1152.3695068359375, |
|
"learning_rate": 1.7573503365214314e-05, |
|
"loss": 3.316, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.36663124335812963, |
|
"grad_norm": 2364.73876953125, |
|
"learning_rate": 1.7555791710945806e-05, |
|
"loss": 3.1473, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.36928799149840597, |
|
"grad_norm": 1560.827392578125, |
|
"learning_rate": 1.7538080056677295e-05, |
|
"loss": 2.875, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.37194473963868224, |
|
"grad_norm": 672.7749633789062, |
|
"learning_rate": 1.7520368402408787e-05, |
|
"loss": 3.3416, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3746014877789586, |
|
"grad_norm": 3212.583740234375, |
|
"learning_rate": 1.750265674814028e-05, |
|
"loss": 2.6347, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.37725823591923485, |
|
"grad_norm": 9892.419921875, |
|
"learning_rate": 1.7484945093871768e-05, |
|
"loss": 2.9356, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3799149840595112, |
|
"grad_norm": 13098.6201171875, |
|
"learning_rate": 1.746723343960326e-05, |
|
"loss": 3.0818, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.38257173219978746, |
|
"grad_norm": 33038.46484375, |
|
"learning_rate": 1.7449521785334753e-05, |
|
"loss": 3.4073, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.38522848034006374, |
|
"grad_norm": 58945.421875, |
|
"learning_rate": 1.743181013106624e-05, |
|
"loss": 3.4505, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.38788522848034007, |
|
"grad_norm": 53823.19921875, |
|
"learning_rate": 1.7414098476797734e-05, |
|
"loss": 3.4398, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.39054197662061635, |
|
"grad_norm": 213358.46875, |
|
"learning_rate": 1.7396386822529226e-05, |
|
"loss": 3.1337, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.3931987247608927, |
|
"grad_norm": 174113.078125, |
|
"learning_rate": 1.7378675168260718e-05, |
|
"loss": 3.6872, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.39585547290116896, |
|
"grad_norm": 110265.9609375, |
|
"learning_rate": 1.7360963513992207e-05, |
|
"loss": 3.5268, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.3985122210414453, |
|
"grad_norm": 125626.78125, |
|
"learning_rate": 1.73432518597237e-05, |
|
"loss": 3.8027, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.40116896918172157, |
|
"grad_norm": 119383.8359375, |
|
"learning_rate": 1.7325540205455192e-05, |
|
"loss": 3.6381, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.4038257173219979, |
|
"grad_norm": 78246.125, |
|
"learning_rate": 1.7307828551186684e-05, |
|
"loss": 3.6688, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.4064824654622742, |
|
"grad_norm": 77016.8671875, |
|
"learning_rate": 1.7290116896918173e-05, |
|
"loss": 3.7796, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.40913921360255046, |
|
"grad_norm": 471759.21875, |
|
"learning_rate": 1.7272405242649665e-05, |
|
"loss": 3.738, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.4117959617428268, |
|
"grad_norm": 108969.1171875, |
|
"learning_rate": 1.7254693588381157e-05, |
|
"loss": 3.4583, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.41445270988310307, |
|
"grad_norm": 44717.91015625, |
|
"learning_rate": 1.7236981934112646e-05, |
|
"loss": 3.0156, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.4171094580233794, |
|
"grad_norm": 56418.765625, |
|
"learning_rate": 1.721927027984414e-05, |
|
"loss": 3.339, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.4197662061636557, |
|
"grad_norm": 82086.234375, |
|
"learning_rate": 1.720155862557563e-05, |
|
"loss": 3.2477, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.422422954303932, |
|
"grad_norm": 38437.12890625, |
|
"learning_rate": 1.718384697130712e-05, |
|
"loss": 3.0923, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4250797024442083, |
|
"grad_norm": 64070.26953125, |
|
"learning_rate": 1.7166135317038612e-05, |
|
"loss": 3.8784, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4277364505844846, |
|
"grad_norm": 96363.0078125, |
|
"learning_rate": 1.7148423662770104e-05, |
|
"loss": 3.1945, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.4303931987247609, |
|
"grad_norm": 101021.7578125, |
|
"learning_rate": 1.7130712008501596e-05, |
|
"loss": 2.9785, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.43304994686503717, |
|
"grad_norm": 33741.50390625, |
|
"learning_rate": 1.7113000354233085e-05, |
|
"loss": 3.0544, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.4357066950053135, |
|
"grad_norm": 18486.07421875, |
|
"learning_rate": 1.7095288699964578e-05, |
|
"loss": 3.3951, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4383634431455898, |
|
"grad_norm": 141817.4375, |
|
"learning_rate": 1.707757704569607e-05, |
|
"loss": 3.8719, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.4410201912858661, |
|
"grad_norm": 18356.125, |
|
"learning_rate": 1.7059865391427562e-05, |
|
"loss": 3.217, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4436769394261424, |
|
"grad_norm": 75286.890625, |
|
"learning_rate": 1.7042153737159054e-05, |
|
"loss": 3.2279, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.4463336875664187, |
|
"grad_norm": 93692.8671875, |
|
"learning_rate": 1.7024442082890543e-05, |
|
"loss": 3.3421, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.448990435706695, |
|
"grad_norm": 137171.109375, |
|
"learning_rate": 1.7006730428622032e-05, |
|
"loss": 3.4727, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.45164718384697133, |
|
"grad_norm": 143812.296875, |
|
"learning_rate": 1.6989018774353524e-05, |
|
"loss": 3.24, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4543039319872476, |
|
"grad_norm": 35345.19921875, |
|
"learning_rate": 1.6971307120085017e-05, |
|
"loss": 3.2903, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.4569606801275239, |
|
"grad_norm": 69917.4375, |
|
"learning_rate": 1.695359546581651e-05, |
|
"loss": 3.1309, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.4596174282678002, |
|
"grad_norm": 71451.5859375, |
|
"learning_rate": 1.6935883811547998e-05, |
|
"loss": 3.8151, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.4622741764080765, |
|
"grad_norm": 54897.4375, |
|
"learning_rate": 1.691817215727949e-05, |
|
"loss": 3.7961, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.46493092454835283, |
|
"grad_norm": 42574.12109375, |
|
"learning_rate": 1.6900460503010982e-05, |
|
"loss": 3.3018, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.4675876726886291, |
|
"grad_norm": 118568.609375, |
|
"learning_rate": 1.6882748848742475e-05, |
|
"loss": 3.4044, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.47024442082890544, |
|
"grad_norm": 141536.96875, |
|
"learning_rate": 1.6865037194473967e-05, |
|
"loss": 3.5705, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.4729011689691817, |
|
"grad_norm": 153274.9375, |
|
"learning_rate": 1.6847325540205456e-05, |
|
"loss": 3.7034, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.47555791710945805, |
|
"grad_norm": 121872.7890625, |
|
"learning_rate": 1.6829613885936948e-05, |
|
"loss": 3.6836, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.4782146652497343, |
|
"grad_norm": 101665.6640625, |
|
"learning_rate": 1.681190223166844e-05, |
|
"loss": 3.5983, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4808714133900106, |
|
"grad_norm": 212873.5, |
|
"learning_rate": 1.6794190577399933e-05, |
|
"loss": 3.3915, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.48352816153028694, |
|
"grad_norm": 19234.345703125, |
|
"learning_rate": 1.677647892313142e-05, |
|
"loss": 3.1403, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.4861849096705632, |
|
"grad_norm": 126968.46875, |
|
"learning_rate": 1.6758767268862914e-05, |
|
"loss": 3.3559, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.48884165781083955, |
|
"grad_norm": 40483.28515625, |
|
"learning_rate": 1.6741055614594403e-05, |
|
"loss": 3.4042, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.4914984059511158, |
|
"grad_norm": 281826.84375, |
|
"learning_rate": 1.6723343960325895e-05, |
|
"loss": 3.5656, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.49415515409139216, |
|
"grad_norm": 112396.421875, |
|
"learning_rate": 1.6705632306057387e-05, |
|
"loss": 3.5217, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.49681190223166843, |
|
"grad_norm": 430567.96875, |
|
"learning_rate": 1.668792065178888e-05, |
|
"loss": 3.784, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.49946865037194477, |
|
"grad_norm": 19857.708984375, |
|
"learning_rate": 1.667020899752037e-05, |
|
"loss": 3.2844, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.502125398512221, |
|
"grad_norm": 153824.828125, |
|
"learning_rate": 1.665249734325186e-05, |
|
"loss": 3.5734, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5047821466524973, |
|
"grad_norm": 555864.875, |
|
"learning_rate": 1.6634785688983353e-05, |
|
"loss": 3.5042, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5074388947927736, |
|
"grad_norm": 1425396.625, |
|
"learning_rate": 1.6617074034714845e-05, |
|
"loss": 3.8919, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.51009564293305, |
|
"grad_norm": 1588321.5, |
|
"learning_rate": 1.6599362380446334e-05, |
|
"loss": 3.7013, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5127523910733263, |
|
"grad_norm": 843313.25, |
|
"learning_rate": 1.6581650726177826e-05, |
|
"loss": 4.0527, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5154091392136025, |
|
"grad_norm": 121270.0859375, |
|
"learning_rate": 1.656393907190932e-05, |
|
"loss": 3.6732, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5180658873538788, |
|
"grad_norm": 194603.609375, |
|
"learning_rate": 1.654622741764081e-05, |
|
"loss": 3.5416, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5207226354941552, |
|
"grad_norm": 103689.84375, |
|
"learning_rate": 1.65285157633723e-05, |
|
"loss": 3.6058, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5233793836344315, |
|
"grad_norm": 148743.953125, |
|
"learning_rate": 1.6510804109103792e-05, |
|
"loss": 3.4376, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5260361317747078, |
|
"grad_norm": 23079.94140625, |
|
"learning_rate": 1.649309245483528e-05, |
|
"loss": 3.524, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.528692879914984, |
|
"grad_norm": 12263.953125, |
|
"learning_rate": 1.6475380800566773e-05, |
|
"loss": 3.1242, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5313496280552603, |
|
"grad_norm": 270958.5625, |
|
"learning_rate": 1.6457669146298265e-05, |
|
"loss": 3.8531, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5340063761955367, |
|
"grad_norm": 145561.640625, |
|
"learning_rate": 1.6439957492029758e-05, |
|
"loss": 3.104, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.536663124335813, |
|
"grad_norm": 104717.5625, |
|
"learning_rate": 1.6422245837761247e-05, |
|
"loss": 3.3674, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5393198724760893, |
|
"grad_norm": 112249.3515625, |
|
"learning_rate": 1.640453418349274e-05, |
|
"loss": 3.2119, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5419766206163655, |
|
"grad_norm": 131700.71875, |
|
"learning_rate": 1.638682252922423e-05, |
|
"loss": 3.6448, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5446333687566419, |
|
"grad_norm": 119026.4140625, |
|
"learning_rate": 1.6369110874955723e-05, |
|
"loss": 3.0097, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5472901168969182, |
|
"grad_norm": 103121.09375, |
|
"learning_rate": 1.6351399220687216e-05, |
|
"loss": 3.4205, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5499468650371945, |
|
"grad_norm": 237787.03125, |
|
"learning_rate": 1.6333687566418704e-05, |
|
"loss": 3.349, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5526036131774708, |
|
"grad_norm": 49652.95703125, |
|
"learning_rate": 1.6315975912150197e-05, |
|
"loss": 3.1665, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.555260361317747, |
|
"grad_norm": 262178.34375, |
|
"learning_rate": 1.629826425788169e-05, |
|
"loss": 3.4743, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.5579171094580234, |
|
"grad_norm": 130814.703125, |
|
"learning_rate": 1.6280552603613178e-05, |
|
"loss": 3.4995, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5605738575982997, |
|
"grad_norm": 273671.09375, |
|
"learning_rate": 1.626284094934467e-05, |
|
"loss": 3.3983, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.563230605738576, |
|
"grad_norm": 385060.25, |
|
"learning_rate": 1.6245129295076162e-05, |
|
"loss": 3.5215, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5658873538788523, |
|
"grad_norm": 165007.71875, |
|
"learning_rate": 1.622741764080765e-05, |
|
"loss": 3.1164, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.5685441020191286, |
|
"grad_norm": 70266.53125, |
|
"learning_rate": 1.6209705986539144e-05, |
|
"loss": 3.1971, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5712008501594049, |
|
"grad_norm": 271687.3125, |
|
"learning_rate": 1.6191994332270636e-05, |
|
"loss": 3.4932, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5738575982996812, |
|
"grad_norm": 35143.67578125, |
|
"learning_rate": 1.6174282678002128e-05, |
|
"loss": 3.4214, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5765143464399575, |
|
"grad_norm": 1173879.625, |
|
"learning_rate": 1.6156571023733617e-05, |
|
"loss": 3.3194, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.5791710945802337, |
|
"grad_norm": 306067.03125, |
|
"learning_rate": 1.613885936946511e-05, |
|
"loss": 3.1417, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5818278427205101, |
|
"grad_norm": 342329.0625, |
|
"learning_rate": 1.61211477151966e-05, |
|
"loss": 3.355, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.5844845908607864, |
|
"grad_norm": 50600.97265625, |
|
"learning_rate": 1.6103436060928094e-05, |
|
"loss": 3.3974, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5871413390010627, |
|
"grad_norm": 360589.03125, |
|
"learning_rate": 1.6085724406659583e-05, |
|
"loss": 3.4514, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.589798087141339, |
|
"grad_norm": 94335.3828125, |
|
"learning_rate": 1.6068012752391075e-05, |
|
"loss": 3.2719, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5924548352816154, |
|
"grad_norm": 53790.76953125, |
|
"learning_rate": 1.6050301098122564e-05, |
|
"loss": 3.3992, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5951115834218916, |
|
"grad_norm": 107421.421875, |
|
"learning_rate": 1.6032589443854056e-05, |
|
"loss": 3.3512, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5977683315621679, |
|
"grad_norm": 142487.859375, |
|
"learning_rate": 1.601487778958555e-05, |
|
"loss": 3.826, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6004250797024442, |
|
"grad_norm": 1261580.75, |
|
"learning_rate": 1.599716613531704e-05, |
|
"loss": 3.7385, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.6030818278427205, |
|
"grad_norm": 648111.0, |
|
"learning_rate": 1.597945448104853e-05, |
|
"loss": 3.2839, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.6057385759829969, |
|
"grad_norm": 326968.125, |
|
"learning_rate": 1.5961742826780022e-05, |
|
"loss": 3.7895, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6083953241232731, |
|
"grad_norm": 808961.5625, |
|
"learning_rate": 1.5944031172511514e-05, |
|
"loss": 3.7051, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.6110520722635494, |
|
"grad_norm": 2958079.0, |
|
"learning_rate": 1.5926319518243006e-05, |
|
"loss": 3.8099, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6137088204038257, |
|
"grad_norm": 314874.03125, |
|
"learning_rate": 1.5908607863974495e-05, |
|
"loss": 3.6654, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6163655685441021, |
|
"grad_norm": 8078548.0, |
|
"learning_rate": 1.5890896209705987e-05, |
|
"loss": 4.018, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6190223166843783, |
|
"grad_norm": 135695.46875, |
|
"learning_rate": 1.587318455543748e-05, |
|
"loss": 3.6651, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.6216790648246546, |
|
"grad_norm": 18501240.0, |
|
"learning_rate": 1.5855472901168972e-05, |
|
"loss": 4.0069, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6243358129649309, |
|
"grad_norm": 4980981.5, |
|
"learning_rate": 1.5837761246900464e-05, |
|
"loss": 3.9174, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6269925611052072, |
|
"grad_norm": 1297274.125, |
|
"learning_rate": 1.5820049592631953e-05, |
|
"loss": 3.3223, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6296493092454836, |
|
"grad_norm": 1378757.625, |
|
"learning_rate": 1.5802337938363442e-05, |
|
"loss": 3.7712, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6323060573857598, |
|
"grad_norm": 2027859.875, |
|
"learning_rate": 1.5784626284094934e-05, |
|
"loss": 3.5468, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6349628055260361, |
|
"grad_norm": 157107.65625, |
|
"learning_rate": 1.5766914629826427e-05, |
|
"loss": 3.5328, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6376195536663124, |
|
"grad_norm": 1103094.75, |
|
"learning_rate": 1.574920297555792e-05, |
|
"loss": 3.6031, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6402763018065888, |
|
"grad_norm": 725449.5, |
|
"learning_rate": 1.5731491321289408e-05, |
|
"loss": 3.9276, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.6429330499468651, |
|
"grad_norm": 214425.640625, |
|
"learning_rate": 1.57137796670209e-05, |
|
"loss": 3.517, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6455897980871413, |
|
"grad_norm": 876419.625, |
|
"learning_rate": 1.5696068012752392e-05, |
|
"loss": 3.4675, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.6482465462274176, |
|
"grad_norm": 1504300.25, |
|
"learning_rate": 1.5678356358483884e-05, |
|
"loss": 3.4772, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.6509032943676939, |
|
"grad_norm": 144657.71875, |
|
"learning_rate": 1.5660644704215377e-05, |
|
"loss": 3.42, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.6535600425079703, |
|
"grad_norm": 371512.40625, |
|
"learning_rate": 1.5642933049946866e-05, |
|
"loss": 3.6802, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6562167906482466, |
|
"grad_norm": 1322714.5, |
|
"learning_rate": 1.5625221395678358e-05, |
|
"loss": 3.805, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.6588735387885228, |
|
"grad_norm": 218897.765625, |
|
"learning_rate": 1.560750974140985e-05, |
|
"loss": 3.252, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6615302869287991, |
|
"grad_norm": 1596077.0, |
|
"learning_rate": 1.5589798087141342e-05, |
|
"loss": 3.626, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.6641870350690755, |
|
"grad_norm": 2922875.75, |
|
"learning_rate": 1.557208643287283e-05, |
|
"loss": 3.5045, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6668437832093518, |
|
"grad_norm": 96812.5859375, |
|
"learning_rate": 1.5554374778604324e-05, |
|
"loss": 3.7078, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.6695005313496281, |
|
"grad_norm": 1580814.125, |
|
"learning_rate": 1.5536663124335812e-05, |
|
"loss": 3.615, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6721572794899043, |
|
"grad_norm": 235169.53125, |
|
"learning_rate": 1.5518951470067305e-05, |
|
"loss": 3.5076, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.6748140276301806, |
|
"grad_norm": 816632.0, |
|
"learning_rate": 1.5501239815798797e-05, |
|
"loss": 4.0074, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.677470775770457, |
|
"grad_norm": 3783126.5, |
|
"learning_rate": 1.548352816153029e-05, |
|
"loss": 3.7162, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6801275239107333, |
|
"grad_norm": 1676969.875, |
|
"learning_rate": 1.5465816507261778e-05, |
|
"loss": 3.9383, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6827842720510096, |
|
"grad_norm": 944205.0, |
|
"learning_rate": 1.544810485299327e-05, |
|
"loss": 3.6335, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.6854410201912858, |
|
"grad_norm": 532299.0, |
|
"learning_rate": 1.5430393198724763e-05, |
|
"loss": 3.776, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6880977683315622, |
|
"grad_norm": 324683.46875, |
|
"learning_rate": 1.5412681544456255e-05, |
|
"loss": 4.0332, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6907545164718385, |
|
"grad_norm": 371158.6875, |
|
"learning_rate": 1.5394969890187744e-05, |
|
"loss": 3.2831, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6934112646121148, |
|
"grad_norm": 626177.8125, |
|
"learning_rate": 1.5377258235919236e-05, |
|
"loss": 3.7419, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.696068012752391, |
|
"grad_norm": 489480.3125, |
|
"learning_rate": 1.535954658165073e-05, |
|
"loss": 3.9135, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6987247608926673, |
|
"grad_norm": 840057.5625, |
|
"learning_rate": 1.534183492738222e-05, |
|
"loss": 3.6214, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.7013815090329437, |
|
"grad_norm": 641658.4375, |
|
"learning_rate": 1.532412327311371e-05, |
|
"loss": 3.9029, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.70403825717322, |
|
"grad_norm": 1129191.0, |
|
"learning_rate": 1.5306411618845202e-05, |
|
"loss": 3.6271, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7066950053134963, |
|
"grad_norm": 758676.8125, |
|
"learning_rate": 1.528869996457669e-05, |
|
"loss": 3.8411, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.7093517534537725, |
|
"grad_norm": 946755.25, |
|
"learning_rate": 1.5270988310308183e-05, |
|
"loss": 3.8184, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.7120085015940489, |
|
"grad_norm": 1282365.625, |
|
"learning_rate": 1.5253276656039675e-05, |
|
"loss": 3.8393, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.7146652497343252, |
|
"grad_norm": 1212575.875, |
|
"learning_rate": 1.5235565001771166e-05, |
|
"loss": 3.6106, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.7173219978746015, |
|
"grad_norm": 2197153.75, |
|
"learning_rate": 1.5217853347502658e-05, |
|
"loss": 3.5554, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7199787460148778, |
|
"grad_norm": 621252.1875, |
|
"learning_rate": 1.520014169323415e-05, |
|
"loss": 3.3832, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.722635494155154, |
|
"grad_norm": 243552.59375, |
|
"learning_rate": 1.5182430038965641e-05, |
|
"loss": 3.4785, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.7252922422954304, |
|
"grad_norm": 3559921.0, |
|
"learning_rate": 1.5164718384697133e-05, |
|
"loss": 3.7972, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.7279489904357067, |
|
"grad_norm": 8816077.0, |
|
"learning_rate": 1.5147006730428624e-05, |
|
"loss": 3.6698, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.730605738575983, |
|
"grad_norm": 2959412.0, |
|
"learning_rate": 1.5129295076160116e-05, |
|
"loss": 3.9389, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.7332624867162593, |
|
"grad_norm": 13276429.0, |
|
"learning_rate": 1.5111583421891607e-05, |
|
"loss": 3.6811, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.7359192348565357, |
|
"grad_norm": 24583468.0, |
|
"learning_rate": 1.5093871767623095e-05, |
|
"loss": 3.9955, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.7385759829968119, |
|
"grad_norm": 11388400.0, |
|
"learning_rate": 1.5076160113354588e-05, |
|
"loss": 3.4851, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.7412327311370882, |
|
"grad_norm": 2901875.5, |
|
"learning_rate": 1.5058448459086078e-05, |
|
"loss": 4.0118, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.7438894792773645, |
|
"grad_norm": 7893670.0, |
|
"learning_rate": 1.504073680481757e-05, |
|
"loss": 4.3674, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.7465462274176408, |
|
"grad_norm": 13170602.0, |
|
"learning_rate": 1.5023025150549063e-05, |
|
"loss": 3.5882, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.7492029755579172, |
|
"grad_norm": 12720932.0, |
|
"learning_rate": 1.5005313496280553e-05, |
|
"loss": 4.7013, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.7518597236981934, |
|
"grad_norm": 7461363.0, |
|
"learning_rate": 1.4987601842012046e-05, |
|
"loss": 3.5194, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.7545164718384697, |
|
"grad_norm": 3747000.25, |
|
"learning_rate": 1.4969890187743536e-05, |
|
"loss": 3.9811, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.757173219978746, |
|
"grad_norm": 2111091.0, |
|
"learning_rate": 1.4952178533475028e-05, |
|
"loss": 3.3212, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.7598299681190224, |
|
"grad_norm": 4919647.5, |
|
"learning_rate": 1.4934466879206519e-05, |
|
"loss": 4.0383, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.7624867162592986, |
|
"grad_norm": 3595169.25, |
|
"learning_rate": 1.4916755224938011e-05, |
|
"loss": 3.7293, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.7651434643995749, |
|
"grad_norm": 1647251.75, |
|
"learning_rate": 1.4899043570669502e-05, |
|
"loss": 4.166, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.7678002125398512, |
|
"grad_norm": 4398145.0, |
|
"learning_rate": 1.4881331916400994e-05, |
|
"loss": 3.4454, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.7704569606801275, |
|
"grad_norm": 3135213.0, |
|
"learning_rate": 1.4863620262132485e-05, |
|
"loss": 4.0135, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7731137088204039, |
|
"grad_norm": 7072787.0, |
|
"learning_rate": 1.4845908607863975e-05, |
|
"loss": 3.4145, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.7757704569606801, |
|
"grad_norm": 2635511.75, |
|
"learning_rate": 1.4828196953595466e-05, |
|
"loss": 3.8201, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.7784272051009564, |
|
"grad_norm": 4616754.5, |
|
"learning_rate": 1.4810485299326958e-05, |
|
"loss": 4.1764, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.7810839532412327, |
|
"grad_norm": 877153.0, |
|
"learning_rate": 1.4792773645058449e-05, |
|
"loss": 3.9471, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.7837407013815091, |
|
"grad_norm": 569671.3125, |
|
"learning_rate": 1.4775061990789941e-05, |
|
"loss": 3.7697, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.7863974495217854, |
|
"grad_norm": 810236.125, |
|
"learning_rate": 1.4757350336521432e-05, |
|
"loss": 4.4753, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.7890541976620616, |
|
"grad_norm": 877906.875, |
|
"learning_rate": 1.4739638682252924e-05, |
|
"loss": 3.6654, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.7917109458023379, |
|
"grad_norm": 481885.46875, |
|
"learning_rate": 1.4721927027984414e-05, |
|
"loss": 4.1253, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.7943676939426142, |
|
"grad_norm": 1338787.0, |
|
"learning_rate": 1.4704215373715907e-05, |
|
"loss": 4.0294, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.7970244420828906, |
|
"grad_norm": 1250065.875, |
|
"learning_rate": 1.4686503719447397e-05, |
|
"loss": 4.7282, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7996811902231669, |
|
"grad_norm": 1604171.375, |
|
"learning_rate": 1.466879206517889e-05, |
|
"loss": 4.0439, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.8023379383634431, |
|
"grad_norm": 512070.90625, |
|
"learning_rate": 1.4651080410910382e-05, |
|
"loss": 3.5779, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.8049946865037194, |
|
"grad_norm": 312113.46875, |
|
"learning_rate": 1.4633368756641872e-05, |
|
"loss": 3.6514, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.8076514346439958, |
|
"grad_norm": 23779.923828125, |
|
"learning_rate": 1.4615657102373361e-05, |
|
"loss": 3.8136, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.8103081827842721, |
|
"grad_norm": 8204.794921875, |
|
"learning_rate": 1.4597945448104854e-05, |
|
"loss": 4.1336, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.8129649309245484, |
|
"grad_norm": 76479.1640625, |
|
"learning_rate": 1.4580233793836344e-05, |
|
"loss": 3.4411, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.8156216790648246, |
|
"grad_norm": 66624.71875, |
|
"learning_rate": 1.4562522139567836e-05, |
|
"loss": 3.8493, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.8182784272051009, |
|
"grad_norm": 22607.904296875, |
|
"learning_rate": 1.4544810485299327e-05, |
|
"loss": 3.2428, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.8209351753453773, |
|
"grad_norm": 119469.640625, |
|
"learning_rate": 1.452709883103082e-05, |
|
"loss": 3.4363, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.8235919234856536, |
|
"grad_norm": 108868.203125, |
|
"learning_rate": 1.4509387176762311e-05, |
|
"loss": 3.5903, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8262486716259299, |
|
"grad_norm": 5543388.0, |
|
"learning_rate": 1.4491675522493802e-05, |
|
"loss": 3.7918, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.8289054197662061, |
|
"grad_norm": 2565445.75, |
|
"learning_rate": 1.4473963868225294e-05, |
|
"loss": 3.8573, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.8315621679064825, |
|
"grad_norm": 702086.4375, |
|
"learning_rate": 1.4456252213956785e-05, |
|
"loss": 3.3944, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.8342189160467588, |
|
"grad_norm": 115243.6484375, |
|
"learning_rate": 1.4438540559688277e-05, |
|
"loss": 3.2222, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.8368756641870351, |
|
"grad_norm": 476268.625, |
|
"learning_rate": 1.4420828905419768e-05, |
|
"loss": 3.6144, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.8395324123273114, |
|
"grad_norm": 65992.0, |
|
"learning_rate": 1.440311725115126e-05, |
|
"loss": 3.1891, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.8421891604675876, |
|
"grad_norm": 1161863.375, |
|
"learning_rate": 1.438540559688275e-05, |
|
"loss": 3.6714, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.844845908607864, |
|
"grad_norm": 185466.84375, |
|
"learning_rate": 1.4367693942614241e-05, |
|
"loss": 3.4372, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.8475026567481403, |
|
"grad_norm": 56940.96875, |
|
"learning_rate": 1.4349982288345732e-05, |
|
"loss": 3.7385, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.8501594048884166, |
|
"grad_norm": 99763.78125, |
|
"learning_rate": 1.4332270634077224e-05, |
|
"loss": 3.5612, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8528161530286928, |
|
"grad_norm": 91525.1328125, |
|
"learning_rate": 1.4314558979808715e-05, |
|
"loss": 3.6116, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.8554729011689692, |
|
"grad_norm": 23506.251953125, |
|
"learning_rate": 1.4296847325540207e-05, |
|
"loss": 3.4268, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.8581296493092455, |
|
"grad_norm": 36794.52734375, |
|
"learning_rate": 1.4279135671271697e-05, |
|
"loss": 3.7912, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.8607863974495218, |
|
"grad_norm": 14971.548828125, |
|
"learning_rate": 1.426142401700319e-05, |
|
"loss": 3.7623, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.8634431455897981, |
|
"grad_norm": 29957.119140625, |
|
"learning_rate": 1.424371236273468e-05, |
|
"loss": 3.5765, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.8660998937300743, |
|
"grad_norm": 24691.1796875, |
|
"learning_rate": 1.4226000708466172e-05, |
|
"loss": 3.4663, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.8687566418703507, |
|
"grad_norm": 21935.2734375, |
|
"learning_rate": 1.4208289054197663e-05, |
|
"loss": 3.6494, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.871413390010627, |
|
"grad_norm": 26350.591796875, |
|
"learning_rate": 1.4190577399929155e-05, |
|
"loss": 3.5611, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.8740701381509033, |
|
"grad_norm": 30286.142578125, |
|
"learning_rate": 1.4172865745660646e-05, |
|
"loss": 3.7046, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.8767268862911796, |
|
"grad_norm": 6965.02734375, |
|
"learning_rate": 1.4155154091392138e-05, |
|
"loss": 3.9012, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.879383634431456, |
|
"grad_norm": 34496.1171875, |
|
"learning_rate": 1.4137442437123627e-05, |
|
"loss": 3.5102, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.8820403825717322, |
|
"grad_norm": 15867.46875, |
|
"learning_rate": 1.411973078285512e-05, |
|
"loss": 3.9485, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.8846971307120085, |
|
"grad_norm": 8408.2509765625, |
|
"learning_rate": 1.410201912858661e-05, |
|
"loss": 4.0955, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.8873538788522848, |
|
"grad_norm": 12868.8935546875, |
|
"learning_rate": 1.4084307474318102e-05, |
|
"loss": 3.8902, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.8900106269925611, |
|
"grad_norm": 39027.8125, |
|
"learning_rate": 1.4066595820049593e-05, |
|
"loss": 3.7809, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.8926673751328374, |
|
"grad_norm": 30144.494140625, |
|
"learning_rate": 1.4048884165781085e-05, |
|
"loss": 3.8368, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.8953241232731137, |
|
"grad_norm": 14916.984375, |
|
"learning_rate": 1.4031172511512576e-05, |
|
"loss": 3.8361, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.89798087141339, |
|
"grad_norm": 10657.8974609375, |
|
"learning_rate": 1.4013460857244068e-05, |
|
"loss": 3.9388, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.9006376195536663, |
|
"grad_norm": 20504.70703125, |
|
"learning_rate": 1.399574920297556e-05, |
|
"loss": 4.257, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.9032943676939427, |
|
"grad_norm": 32460.078125, |
|
"learning_rate": 1.397803754870705e-05, |
|
"loss": 4.0817, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.905951115834219, |
|
"grad_norm": 6730.14404296875, |
|
"learning_rate": 1.3960325894438543e-05, |
|
"loss": 4.2065, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.9086078639744952, |
|
"grad_norm": 17531.017578125, |
|
"learning_rate": 1.3942614240170034e-05, |
|
"loss": 3.5729, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.9112646121147715, |
|
"grad_norm": 17859.064453125, |
|
"learning_rate": 1.3924902585901526e-05, |
|
"loss": 4.3419, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.9139213602550478, |
|
"grad_norm": 99839.4296875, |
|
"learning_rate": 1.3907190931633016e-05, |
|
"loss": 3.9653, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.9165781083953242, |
|
"grad_norm": 13036.796875, |
|
"learning_rate": 1.3889479277364505e-05, |
|
"loss": 3.8463, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.9192348565356004, |
|
"grad_norm": 54209.05859375, |
|
"learning_rate": 1.3871767623095998e-05, |
|
"loss": 3.9493, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.9218916046758767, |
|
"grad_norm": 227248.34375, |
|
"learning_rate": 1.385405596882749e-05, |
|
"loss": 3.7791, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.924548352816153, |
|
"grad_norm": 856476.3125, |
|
"learning_rate": 1.383634431455898e-05, |
|
"loss": 4.2208, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.9272051009564294, |
|
"grad_norm": 373248.40625, |
|
"learning_rate": 1.3818632660290473e-05, |
|
"loss": 4.6665, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.9298618490967057, |
|
"grad_norm": 476773.1875, |
|
"learning_rate": 1.3800921006021963e-05, |
|
"loss": 4.3373, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.9325185972369819, |
|
"grad_norm": 3948952.0, |
|
"learning_rate": 1.3783209351753455e-05, |
|
"loss": 3.9872, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.9351753453772582, |
|
"grad_norm": 131342.296875, |
|
"learning_rate": 1.3765497697484946e-05, |
|
"loss": 4.0315, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.9378320935175345, |
|
"grad_norm": 1021533.8125, |
|
"learning_rate": 1.3747786043216438e-05, |
|
"loss": 3.898, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.9404888416578109, |
|
"grad_norm": 70664288.0, |
|
"learning_rate": 1.3730074388947929e-05, |
|
"loss": 4.0261, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.9431455897980872, |
|
"grad_norm": 1955257.25, |
|
"learning_rate": 1.3712362734679421e-05, |
|
"loss": 3.9837, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.9458023379383634, |
|
"grad_norm": 10510368.0, |
|
"learning_rate": 1.3694651080410912e-05, |
|
"loss": 4.2089, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.9484590860786397, |
|
"grad_norm": 4540049.0, |
|
"learning_rate": 1.3676939426142404e-05, |
|
"loss": 4.0757, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.9511158342189161, |
|
"grad_norm": 1934832.5, |
|
"learning_rate": 1.3659227771873893e-05, |
|
"loss": 3.8116, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.9537725823591924, |
|
"grad_norm": 721523.875, |
|
"learning_rate": 1.3641516117605385e-05, |
|
"loss": 3.8604, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.9564293304994687, |
|
"grad_norm": 3694456.5, |
|
"learning_rate": 1.3623804463336876e-05, |
|
"loss": 4.3438, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.9590860786397449, |
|
"grad_norm": 4130751.5, |
|
"learning_rate": 1.3606092809068368e-05, |
|
"loss": 3.7722, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.9617428267800212, |
|
"grad_norm": 3232915.5, |
|
"learning_rate": 1.3588381154799859e-05, |
|
"loss": 4.1108, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.9643995749202976, |
|
"grad_norm": 5608699.5, |
|
"learning_rate": 1.357066950053135e-05, |
|
"loss": 4.4695, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.9670563230605739, |
|
"grad_norm": 37526024.0, |
|
"learning_rate": 1.3552957846262841e-05, |
|
"loss": 3.8838, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.9697130712008502, |
|
"grad_norm": 11544401.0, |
|
"learning_rate": 1.3535246191994334e-05, |
|
"loss": 3.7949, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.9723698193411264, |
|
"grad_norm": 1559264.75, |
|
"learning_rate": 1.3517534537725824e-05, |
|
"loss": 3.8236, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.9750265674814028, |
|
"grad_norm": 10817994.0, |
|
"learning_rate": 1.3499822883457316e-05, |
|
"loss": 4.0035, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.9776833156216791, |
|
"grad_norm": 20268342.0, |
|
"learning_rate": 1.3482111229188807e-05, |
|
"loss": 3.6612, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.9803400637619554, |
|
"grad_norm": 51181968.0, |
|
"learning_rate": 1.34643995749203e-05, |
|
"loss": 3.7019, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.9829968119022316, |
|
"grad_norm": 74098400.0, |
|
"learning_rate": 1.3446687920651792e-05, |
|
"loss": 3.779, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.9856535600425079, |
|
"grad_norm": 48340468.0, |
|
"learning_rate": 1.3428976266383282e-05, |
|
"loss": 3.6759, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.9883103081827843, |
|
"grad_norm": 8802756.0, |
|
"learning_rate": 1.3411264612114771e-05, |
|
"loss": 3.6238, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.9909670563230606, |
|
"grad_norm": 3833086.75, |
|
"learning_rate": 1.3393552957846263e-05, |
|
"loss": 3.3759, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.9936238044633369, |
|
"grad_norm": 29499648.0, |
|
"learning_rate": 1.3375841303577754e-05, |
|
"loss": 3.6134, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.9962805526036131, |
|
"grad_norm": 6612167.0, |
|
"learning_rate": 1.3358129649309246e-05, |
|
"loss": 3.5491, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.9989373007438895, |
|
"grad_norm": 21236494.0, |
|
"learning_rate": 1.3340417995040737e-05, |
|
"loss": 3.7831, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 3.75178599357605, |
|
"eval_runtime": 744.4128, |
|
"eval_samples_per_second": 20.225, |
|
"eval_steps_per_second": 5.056, |
|
"step": 3764 |
|
}, |
|
{ |
|
"epoch": 1.0015940488841657, |
|
"grad_norm": 40179844.0, |
|
"learning_rate": 1.3322706340772229e-05, |
|
"loss": 3.711, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.004250797024442, |
|
"grad_norm": 17010662.0, |
|
"learning_rate": 1.3304994686503721e-05, |
|
"loss": 3.4946, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.0069075451647185, |
|
"grad_norm": 19932106.0, |
|
"learning_rate": 1.3287283032235212e-05, |
|
"loss": 3.5648, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.0095642933049946, |
|
"grad_norm": 5492312.0, |
|
"learning_rate": 1.3269571377966704e-05, |
|
"loss": 4.0635, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.012221041445271, |
|
"grad_norm": 192937568.0, |
|
"learning_rate": 1.3251859723698195e-05, |
|
"loss": 3.4178, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.0148777895855472, |
|
"grad_norm": 1293443.125, |
|
"learning_rate": 1.3234148069429687e-05, |
|
"loss": 3.9658, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.0175345377258236, |
|
"grad_norm": 158162096.0, |
|
"learning_rate": 1.3216436415161178e-05, |
|
"loss": 3.6695, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.0201912858661, |
|
"grad_norm": 207503072.0, |
|
"learning_rate": 1.319872476089267e-05, |
|
"loss": 4.1104, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.0228480340063761, |
|
"grad_norm": 5859501.0, |
|
"learning_rate": 1.3181013106624159e-05, |
|
"loss": 3.7423, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.0255047821466525, |
|
"grad_norm": 65099376.0, |
|
"learning_rate": 1.3163301452355651e-05, |
|
"loss": 3.8122, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.0281615302869287, |
|
"grad_norm": 13768734.0, |
|
"learning_rate": 1.3145589798087142e-05, |
|
"loss": 3.8062, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.030818278427205, |
|
"grad_norm": 24830612.0, |
|
"learning_rate": 1.3127878143818634e-05, |
|
"loss": 3.5577, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.0334750265674815, |
|
"grad_norm": 109977040.0, |
|
"learning_rate": 1.3110166489550124e-05, |
|
"loss": 3.8904, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.0361317747077576, |
|
"grad_norm": 22621510.0, |
|
"learning_rate": 1.3092454835281617e-05, |
|
"loss": 3.7924, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.038788522848034, |
|
"grad_norm": 15618693.0, |
|
"learning_rate": 1.3074743181013107e-05, |
|
"loss": 3.9009, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.0414452709883104, |
|
"grad_norm": 102296992.0, |
|
"learning_rate": 1.30570315267446e-05, |
|
"loss": 4.0488, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.0441020191285866, |
|
"grad_norm": 180104320.0, |
|
"learning_rate": 1.303931987247609e-05, |
|
"loss": 4.0832, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.046758767268863, |
|
"grad_norm": 8426886.0, |
|
"learning_rate": 1.3021608218207582e-05, |
|
"loss": 3.9811, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.0494155154091391, |
|
"grad_norm": 23817282.0, |
|
"learning_rate": 1.3003896563939073e-05, |
|
"loss": 3.5573, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.0520722635494155, |
|
"grad_norm": 34805012.0, |
|
"learning_rate": 1.2986184909670565e-05, |
|
"loss": 3.6933, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.054729011689692, |
|
"grad_norm": 27546222.0, |
|
"learning_rate": 1.2968473255402056e-05, |
|
"loss": 3.826, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.057385759829968, |
|
"grad_norm": 73101112.0, |
|
"learning_rate": 1.2950761601133548e-05, |
|
"loss": 4.3474, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.0600425079702445, |
|
"grad_norm": 60012056.0, |
|
"learning_rate": 1.2933049946865037e-05, |
|
"loss": 3.4645, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.0626992561105206, |
|
"grad_norm": 10204493.0, |
|
"learning_rate": 1.2915338292596529e-05, |
|
"loss": 3.7942, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.065356004250797, |
|
"grad_norm": 67629928.0, |
|
"learning_rate": 1.289762663832802e-05, |
|
"loss": 3.6377, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.0680127523910734, |
|
"grad_norm": 31746526.0, |
|
"learning_rate": 1.2879914984059512e-05, |
|
"loss": 3.7846, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.0706695005313496, |
|
"grad_norm": 52992448.0, |
|
"learning_rate": 1.2862203329791003e-05, |
|
"loss": 3.2981, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.073326248671626, |
|
"grad_norm": 36022592.0, |
|
"learning_rate": 1.2844491675522495e-05, |
|
"loss": 3.6733, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.0759829968119021, |
|
"grad_norm": 11422725.0, |
|
"learning_rate": 1.2826780021253985e-05, |
|
"loss": 3.5682, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.0786397449521785, |
|
"grad_norm": 77457192.0, |
|
"learning_rate": 1.2809068366985478e-05, |
|
"loss": 3.8538, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.081296493092455, |
|
"grad_norm": 109772792.0, |
|
"learning_rate": 1.279135671271697e-05, |
|
"loss": 4.0151, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.083953241232731, |
|
"grad_norm": 126942304.0, |
|
"learning_rate": 1.277364505844846e-05, |
|
"loss": 4.418, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.0866099893730075, |
|
"grad_norm": 215005632.0, |
|
"learning_rate": 1.2755933404179953e-05, |
|
"loss": 3.6302, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.0892667375132838, |
|
"grad_norm": 18895672.0, |
|
"learning_rate": 1.2738221749911443e-05, |
|
"loss": 4.2548, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.09192348565356, |
|
"grad_norm": 20576284.0, |
|
"learning_rate": 1.2720510095642936e-05, |
|
"loss": 3.9913, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.0945802337938364, |
|
"grad_norm": 90564424.0, |
|
"learning_rate": 1.2702798441374424e-05, |
|
"loss": 3.8335, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.0972369819341126, |
|
"grad_norm": 136458144.0, |
|
"learning_rate": 1.2685086787105915e-05, |
|
"loss": 4.0485, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.099893730074389, |
|
"grad_norm": 175102016.0, |
|
"learning_rate": 1.2667375132837407e-05, |
|
"loss": 4.1181, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.1025504782146653, |
|
"grad_norm": 15060149.0, |
|
"learning_rate": 1.26496634785689e-05, |
|
"loss": 3.753, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.1052072263549415, |
|
"grad_norm": 92020808.0, |
|
"learning_rate": 1.263195182430039e-05, |
|
"loss": 3.9935, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.107863974495218, |
|
"grad_norm": 133574952.0, |
|
"learning_rate": 1.2614240170031882e-05, |
|
"loss": 4.0376, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.110520722635494, |
|
"grad_norm": 69448336.0, |
|
"learning_rate": 1.2596528515763373e-05, |
|
"loss": 3.7264, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.1131774707757705, |
|
"grad_norm": 24695358.0, |
|
"learning_rate": 1.2578816861494865e-05, |
|
"loss": 3.6435, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.1158342189160468, |
|
"grad_norm": 26981000.0, |
|
"learning_rate": 1.2561105207226356e-05, |
|
"loss": 4.1867, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.118490967056323, |
|
"grad_norm": 26429450.0, |
|
"learning_rate": 1.2543393552957848e-05, |
|
"loss": 4.2308, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.1211477151965994, |
|
"grad_norm": 75864056.0, |
|
"learning_rate": 1.2525681898689339e-05, |
|
"loss": 4.1067, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.1238044633368758, |
|
"grad_norm": 53176204.0, |
|
"learning_rate": 1.2507970244420831e-05, |
|
"loss": 4.3122, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.126461211477152, |
|
"grad_norm": 27715404.0, |
|
"learning_rate": 1.2490258590152322e-05, |
|
"loss": 4.0918, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.1291179596174283, |
|
"grad_norm": 6029370.0, |
|
"learning_rate": 1.2472546935883814e-05, |
|
"loss": 4.1725, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.1317747077577045, |
|
"grad_norm": 26051718.0, |
|
"learning_rate": 1.2454835281615303e-05, |
|
"loss": 3.9757, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.134431455897981, |
|
"grad_norm": 77973728.0, |
|
"learning_rate": 1.2437123627346795e-05, |
|
"loss": 3.989, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.1370882040382573, |
|
"grad_norm": 11366385.0, |
|
"learning_rate": 1.2419411973078286e-05, |
|
"loss": 4.3978, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.1397449521785334, |
|
"grad_norm": 19926490.0, |
|
"learning_rate": 1.2401700318809778e-05, |
|
"loss": 3.7446, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.1424017003188098, |
|
"grad_norm": 66211068.0, |
|
"learning_rate": 1.2383988664541268e-05, |
|
"loss": 3.9591, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.145058448459086, |
|
"grad_norm": 7617592.5, |
|
"learning_rate": 1.236627701027276e-05, |
|
"loss": 4.2812, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.1477151965993624, |
|
"grad_norm": 47218612.0, |
|
"learning_rate": 1.2348565356004251e-05, |
|
"loss": 4.137, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.1503719447396388, |
|
"grad_norm": 115950944.0, |
|
"learning_rate": 1.2330853701735743e-05, |
|
"loss": 4.1344, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.153028692879915, |
|
"grad_norm": 27328380.0, |
|
"learning_rate": 1.2313142047467234e-05, |
|
"loss": 4.0865, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.1556854410201913, |
|
"grad_norm": 8267316.5, |
|
"learning_rate": 1.2295430393198726e-05, |
|
"loss": 4.3048, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.1583421891604675, |
|
"grad_norm": 18654644.0, |
|
"learning_rate": 1.2277718738930217e-05, |
|
"loss": 4.4512, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.1609989373007439, |
|
"grad_norm": 123494120.0, |
|
"learning_rate": 1.2260007084661709e-05, |
|
"loss": 4.1863, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 1.1636556854410203, |
|
"grad_norm": 87930224.0, |
|
"learning_rate": 1.2242295430393201e-05, |
|
"loss": 4.1395, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 1.1663124335812964, |
|
"grad_norm": 60926568.0, |
|
"learning_rate": 1.222458377612469e-05, |
|
"loss": 3.9975, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 1.1689691817215728, |
|
"grad_norm": 15561844.0, |
|
"learning_rate": 1.2206872121856181e-05, |
|
"loss": 4.1746, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.171625929861849, |
|
"grad_norm": 14337786.0, |
|
"learning_rate": 1.2189160467587673e-05, |
|
"loss": 4.0762, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 1.1742826780021254, |
|
"grad_norm": 27260074.0, |
|
"learning_rate": 1.2171448813319164e-05, |
|
"loss": 4.3436, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 1.1769394261424018, |
|
"grad_norm": 14445331.0, |
|
"learning_rate": 1.2153737159050656e-05, |
|
"loss": 3.9788, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 1.179596174282678, |
|
"grad_norm": 21041896.0, |
|
"learning_rate": 1.2136025504782147e-05, |
|
"loss": 4.3681, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 1.1822529224229543, |
|
"grad_norm": 15333385.0, |
|
"learning_rate": 1.2118313850513639e-05, |
|
"loss": 4.1638, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.1849096705632305, |
|
"grad_norm": 18882606.0, |
|
"learning_rate": 1.2100602196245131e-05, |
|
"loss": 3.9175, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 1.1875664187035069, |
|
"grad_norm": 6002330.5, |
|
"learning_rate": 1.2082890541976622e-05, |
|
"loss": 4.0274, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 1.1902231668437833, |
|
"grad_norm": 12174502.0, |
|
"learning_rate": 1.2065178887708114e-05, |
|
"loss": 4.0163, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 1.1928799149840594, |
|
"grad_norm": 3046521.75, |
|
"learning_rate": 1.2047467233439604e-05, |
|
"loss": 4.2218, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 1.1955366631243358, |
|
"grad_norm": 7046191.0, |
|
"learning_rate": 1.2029755579171097e-05, |
|
"loss": 3.8047, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.1981934112646122, |
|
"grad_norm": 2158310.5, |
|
"learning_rate": 1.2012043924902587e-05, |
|
"loss": 4.0102, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 1.2008501594048884, |
|
"grad_norm": 1953139.875, |
|
"learning_rate": 1.199433227063408e-05, |
|
"loss": 3.9815, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 1.2035069075451648, |
|
"grad_norm": 10403948.0, |
|
"learning_rate": 1.1976620616365568e-05, |
|
"loss": 4.2106, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 1.206163655685441, |
|
"grad_norm": 1701127.5, |
|
"learning_rate": 1.195890896209706e-05, |
|
"loss": 4.1719, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 1.2088204038257173, |
|
"grad_norm": 1922839.625, |
|
"learning_rate": 1.1941197307828551e-05, |
|
"loss": 4.2, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.2114771519659937, |
|
"grad_norm": 1249251.375, |
|
"learning_rate": 1.1923485653560044e-05, |
|
"loss": 4.3854, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 1.2141339001062699, |
|
"grad_norm": 3677515.25, |
|
"learning_rate": 1.1905773999291534e-05, |
|
"loss": 4.1928, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 1.2167906482465463, |
|
"grad_norm": 1778515.5, |
|
"learning_rate": 1.1888062345023026e-05, |
|
"loss": 4.282, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 1.2194473963868226, |
|
"grad_norm": 2142989.75, |
|
"learning_rate": 1.1870350690754517e-05, |
|
"loss": 4.0862, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 1.2221041445270988, |
|
"grad_norm": 3376149.5, |
|
"learning_rate": 1.185263903648601e-05, |
|
"loss": 4.9249, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.2247608926673752, |
|
"grad_norm": 918137.0625, |
|
"learning_rate": 1.18349273822175e-05, |
|
"loss": 4.4397, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 1.2274176408076514, |
|
"grad_norm": 5548887.5, |
|
"learning_rate": 1.1817215727948992e-05, |
|
"loss": 4.186, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 1.2300743889479278, |
|
"grad_norm": 1206121.0, |
|
"learning_rate": 1.1799504073680483e-05, |
|
"loss": 4.4369, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 1.2327311370882041, |
|
"grad_norm": 1302905.0, |
|
"learning_rate": 1.1781792419411975e-05, |
|
"loss": 4.2492, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 1.2353878852284803, |
|
"grad_norm": 1243181.25, |
|
"learning_rate": 1.1764080765143466e-05, |
|
"loss": 4.3557, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.2380446333687567, |
|
"grad_norm": 1636811.25, |
|
"learning_rate": 1.1746369110874956e-05, |
|
"loss": 4.4305, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 1.2407013815090329, |
|
"grad_norm": 3252745.75, |
|
"learning_rate": 1.1728657456606447e-05, |
|
"loss": 4.4447, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 1.2433581296493093, |
|
"grad_norm": 3218180.0, |
|
"learning_rate": 1.1710945802337939e-05, |
|
"loss": 4.1695, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 1.2460148777895856, |
|
"grad_norm": 7251921.5, |
|
"learning_rate": 1.169323414806943e-05, |
|
"loss": 4.0679, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 1.2486716259298618, |
|
"grad_norm": 3886631.0, |
|
"learning_rate": 1.1675522493800922e-05, |
|
"loss": 3.9159, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.2513283740701382, |
|
"grad_norm": 2420017.75, |
|
"learning_rate": 1.1657810839532412e-05, |
|
"loss": 4.6458, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 1.2539851222104144, |
|
"grad_norm": 1138159.875, |
|
"learning_rate": 1.1640099185263905e-05, |
|
"loss": 4.078, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 1.2566418703506907, |
|
"grad_norm": 930125.875, |
|
"learning_rate": 1.1622387530995395e-05, |
|
"loss": 4.0812, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 1.2592986184909671, |
|
"grad_norm": 3835148.25, |
|
"learning_rate": 1.1604675876726887e-05, |
|
"loss": 4.1012, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 1.2619553666312433, |
|
"grad_norm": 6243373.5, |
|
"learning_rate": 1.158696422245838e-05, |
|
"loss": 3.8252, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.2646121147715197, |
|
"grad_norm": 3021652.25, |
|
"learning_rate": 1.156925256818987e-05, |
|
"loss": 3.9515, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 1.2672688629117959, |
|
"grad_norm": 4503118.5, |
|
"learning_rate": 1.1551540913921363e-05, |
|
"loss": 4.0478, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 1.2699256110520722, |
|
"grad_norm": 5867597.5, |
|
"learning_rate": 1.1533829259652853e-05, |
|
"loss": 4.0726, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 1.2725823591923486, |
|
"grad_norm": 23690828.0, |
|
"learning_rate": 1.1516117605384345e-05, |
|
"loss": 3.5037, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 1.2752391073326248, |
|
"grad_norm": 5260964.5, |
|
"learning_rate": 1.1498405951115834e-05, |
|
"loss": 4.0774, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.2778958554729012, |
|
"grad_norm": 4894551.5, |
|
"learning_rate": 1.1480694296847325e-05, |
|
"loss": 3.7113, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 1.2805526036131774, |
|
"grad_norm": 4784902.0, |
|
"learning_rate": 1.1462982642578817e-05, |
|
"loss": 3.886, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 1.2832093517534537, |
|
"grad_norm": 22511842.0, |
|
"learning_rate": 1.144527098831031e-05, |
|
"loss": 3.7413, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 1.2858660998937301, |
|
"grad_norm": 13445524.0, |
|
"learning_rate": 1.14275593340418e-05, |
|
"loss": 4.3171, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 1.2885228480340063, |
|
"grad_norm": 4879641.0, |
|
"learning_rate": 1.1409847679773292e-05, |
|
"loss": 4.0366, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.2911795961742827, |
|
"grad_norm": 5458451.0, |
|
"learning_rate": 1.1392136025504783e-05, |
|
"loss": 4.0356, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 1.2938363443145589, |
|
"grad_norm": 1152951.125, |
|
"learning_rate": 1.1374424371236275e-05, |
|
"loss": 3.9322, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 1.2964930924548352, |
|
"grad_norm": 1573109.875, |
|
"learning_rate": 1.1356712716967766e-05, |
|
"loss": 3.5684, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 1.2991498405951116, |
|
"grad_norm": 3557934.25, |
|
"learning_rate": 1.1339001062699258e-05, |
|
"loss": 3.8874, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 1.301806588735388, |
|
"grad_norm": 2637183.5, |
|
"learning_rate": 1.1321289408430748e-05, |
|
"loss": 4.0737, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.3044633368756642, |
|
"grad_norm": 1852644.25, |
|
"learning_rate": 1.130357775416224e-05, |
|
"loss": 4.4462, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 1.3071200850159406, |
|
"grad_norm": 7577384.5, |
|
"learning_rate": 1.1285866099893731e-05, |
|
"loss": 3.8546, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 1.3097768331562167, |
|
"grad_norm": 4401453.5, |
|
"learning_rate": 1.1268154445625224e-05, |
|
"loss": 4.0443, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 1.3124335812964931, |
|
"grad_norm": 3643839.75, |
|
"learning_rate": 1.1250442791356712e-05, |
|
"loss": 3.678, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 1.3150903294367695, |
|
"grad_norm": 27145024.0, |
|
"learning_rate": 1.1232731137088205e-05, |
|
"loss": 3.8589, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.3177470775770457, |
|
"grad_norm": 1982266.875, |
|
"learning_rate": 1.1215019482819695e-05, |
|
"loss": 3.587, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 1.320403825717322, |
|
"grad_norm": 2339293.25, |
|
"learning_rate": 1.1197307828551188e-05, |
|
"loss": 3.6116, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 1.3230605738575982, |
|
"grad_norm": 21441204.0, |
|
"learning_rate": 1.1179596174282678e-05, |
|
"loss": 3.4365, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 1.3257173219978746, |
|
"grad_norm": 3329228.0, |
|
"learning_rate": 1.116188452001417e-05, |
|
"loss": 4.184, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 1.328374070138151, |
|
"grad_norm": 2602702.75, |
|
"learning_rate": 1.1144172865745661e-05, |
|
"loss": 3.6095, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.3310308182784272, |
|
"grad_norm": 62917268.0, |
|
"learning_rate": 1.1126461211477153e-05, |
|
"loss": 3.4086, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 1.3336875664187036, |
|
"grad_norm": 9320738.0, |
|
"learning_rate": 1.1108749557208644e-05, |
|
"loss": 3.8485, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 1.3363443145589797, |
|
"grad_norm": 11171778.0, |
|
"learning_rate": 1.1091037902940136e-05, |
|
"loss": 3.5241, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 1.3390010626992561, |
|
"grad_norm": 13504690.0, |
|
"learning_rate": 1.1073326248671628e-05, |
|
"loss": 3.7951, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 1.3416578108395325, |
|
"grad_norm": 1940023.625, |
|
"learning_rate": 1.1055614594403119e-05, |
|
"loss": 3.938, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.3443145589798087, |
|
"grad_norm": 9250230.0, |
|
"learning_rate": 1.1037902940134611e-05, |
|
"loss": 3.6501, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 1.346971307120085, |
|
"grad_norm": 8658494.0, |
|
"learning_rate": 1.10201912858661e-05, |
|
"loss": 3.4101, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 1.3496280552603612, |
|
"grad_norm": 24788584.0, |
|
"learning_rate": 1.100247963159759e-05, |
|
"loss": 3.2665, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 1.3522848034006376, |
|
"grad_norm": 17288262.0, |
|
"learning_rate": 1.0984767977329083e-05, |
|
"loss": 3.9485, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 1.354941551540914, |
|
"grad_norm": 1679803.0, |
|
"learning_rate": 1.0967056323060574e-05, |
|
"loss": 3.7726, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.3575982996811902, |
|
"grad_norm": 14593549.0, |
|
"learning_rate": 1.0949344668792066e-05, |
|
"loss": 4.0024, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 1.3602550478214666, |
|
"grad_norm": 4186409.75, |
|
"learning_rate": 1.0931633014523556e-05, |
|
"loss": 3.6818, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 1.3629117959617427, |
|
"grad_norm": 747755.5625, |
|
"learning_rate": 1.0913921360255049e-05, |
|
"loss": 3.4717, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 1.365568544102019, |
|
"grad_norm": 445103.3125, |
|
"learning_rate": 1.0896209705986541e-05, |
|
"loss": 3.4684, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 1.3682252922422955, |
|
"grad_norm": 1250102.625, |
|
"learning_rate": 1.0878498051718031e-05, |
|
"loss": 3.2248, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.3708820403825717, |
|
"grad_norm": 532045.3125, |
|
"learning_rate": 1.0860786397449524e-05, |
|
"loss": 3.3662, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 1.373538788522848, |
|
"grad_norm": 454849.5625, |
|
"learning_rate": 1.0843074743181014e-05, |
|
"loss": 3.5507, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 1.3761955366631242, |
|
"grad_norm": 3551179.5, |
|
"learning_rate": 1.0825363088912507e-05, |
|
"loss": 3.2755, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 1.3788522848034006, |
|
"grad_norm": 6700418.0, |
|
"learning_rate": 1.0807651434643997e-05, |
|
"loss": 3.2751, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 1.381509032943677, |
|
"grad_norm": 37462192.0, |
|
"learning_rate": 1.078993978037549e-05, |
|
"loss": 3.5327, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.3841657810839532, |
|
"grad_norm": 9333666.0, |
|
"learning_rate": 1.0772228126106978e-05, |
|
"loss": 3.1278, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 1.3868225292242295, |
|
"grad_norm": 16026876.0, |
|
"learning_rate": 1.075451647183847e-05, |
|
"loss": 3.5275, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 1.3894792773645057, |
|
"grad_norm": 24360552.0, |
|
"learning_rate": 1.0736804817569961e-05, |
|
"loss": 3.6815, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 1.392136025504782, |
|
"grad_norm": 12289483.0, |
|
"learning_rate": 1.0719093163301453e-05, |
|
"loss": 3.1039, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 1.3947927736450585, |
|
"grad_norm": 1954500.625, |
|
"learning_rate": 1.0701381509032944e-05, |
|
"loss": 3.3327, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.3974495217853349, |
|
"grad_norm": 5957172.5, |
|
"learning_rate": 1.0683669854764436e-05, |
|
"loss": 3.6985, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 1.400106269925611, |
|
"grad_norm": 136582976.0, |
|
"learning_rate": 1.0665958200495927e-05, |
|
"loss": 3.4845, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 1.4027630180658874, |
|
"grad_norm": 21799228.0, |
|
"learning_rate": 1.0648246546227419e-05, |
|
"loss": 3.4648, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 1.4054197662061636, |
|
"grad_norm": 1183856.625, |
|
"learning_rate": 1.063053489195891e-05, |
|
"loss": 3.2929, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 1.40807651434644, |
|
"grad_norm": 28349394.0, |
|
"learning_rate": 1.0612823237690402e-05, |
|
"loss": 3.611, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.4107332624867164, |
|
"grad_norm": 1230487.75, |
|
"learning_rate": 1.0595111583421892e-05, |
|
"loss": 3.0602, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 1.4133900106269925, |
|
"grad_norm": 29549574.0, |
|
"learning_rate": 1.0577399929153385e-05, |
|
"loss": 3.6129, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 1.416046758767269, |
|
"grad_norm": 65607896.0, |
|
"learning_rate": 1.0559688274884875e-05, |
|
"loss": 3.305, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 1.418703506907545, |
|
"grad_norm": 21593944.0, |
|
"learning_rate": 1.0541976620616366e-05, |
|
"loss": 4.182, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 1.4213602550478215, |
|
"grad_norm": 9913192.0, |
|
"learning_rate": 1.0524264966347856e-05, |
|
"loss": 3.333, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.4240170031880979, |
|
"grad_norm": 5600408.5, |
|
"learning_rate": 1.0506553312079349e-05, |
|
"loss": 3.2001, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 1.426673751328374, |
|
"grad_norm": 4921900.0, |
|
"learning_rate": 1.048884165781084e-05, |
|
"loss": 3.8381, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 1.4293304994686504, |
|
"grad_norm": 22669404.0, |
|
"learning_rate": 1.0471130003542332e-05, |
|
"loss": 3.438, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 1.4319872476089266, |
|
"grad_norm": 11211402.0, |
|
"learning_rate": 1.0453418349273822e-05, |
|
"loss": 3.3608, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 1.434643995749203, |
|
"grad_norm": 10033162.0, |
|
"learning_rate": 1.0435706695005314e-05, |
|
"loss": 3.2148, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.4373007438894794, |
|
"grad_norm": 34627448.0, |
|
"learning_rate": 1.0417995040736805e-05, |
|
"loss": 3.3408, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 1.4399574920297555, |
|
"grad_norm": 19163360.0, |
|
"learning_rate": 1.0400283386468297e-05, |
|
"loss": 3.0767, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 1.442614240170032, |
|
"grad_norm": 11876396.0, |
|
"learning_rate": 1.038257173219979e-05, |
|
"loss": 3.8624, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 1.445270988310308, |
|
"grad_norm": 6485251.5, |
|
"learning_rate": 1.036486007793128e-05, |
|
"loss": 3.4212, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 1.4479277364505845, |
|
"grad_norm": 2855033.5, |
|
"learning_rate": 1.0347148423662772e-05, |
|
"loss": 3.5543, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.4505844845908609, |
|
"grad_norm": 39419356.0, |
|
"learning_rate": 1.0329436769394263e-05, |
|
"loss": 3.6357, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 1.453241232731137, |
|
"grad_norm": 8782708.0, |
|
"learning_rate": 1.0311725115125755e-05, |
|
"loss": 3.7995, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 1.4558979808714134, |
|
"grad_norm": 32046924.0, |
|
"learning_rate": 1.0294013460857244e-05, |
|
"loss": 3.2472, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 1.4585547290116896, |
|
"grad_norm": 30402538.0, |
|
"learning_rate": 1.0276301806588735e-05, |
|
"loss": 3.1715, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 1.461211477151966, |
|
"grad_norm": 19326186.0, |
|
"learning_rate": 1.0258590152320227e-05, |
|
"loss": 3.9161, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.4638682252922424, |
|
"grad_norm": 9990077.0, |
|
"learning_rate": 1.024087849805172e-05, |
|
"loss": 3.849, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 1.4665249734325185, |
|
"grad_norm": 29835254.0, |
|
"learning_rate": 1.022316684378321e-05, |
|
"loss": 3.331, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 1.469181721572795, |
|
"grad_norm": 84350656.0, |
|
"learning_rate": 1.0205455189514702e-05, |
|
"loss": 3.3592, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 1.471838469713071, |
|
"grad_norm": 5173333.5, |
|
"learning_rate": 1.0187743535246193e-05, |
|
"loss": 3.3015, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 1.4744952178533475, |
|
"grad_norm": 3443425.5, |
|
"learning_rate": 1.0170031880977685e-05, |
|
"loss": 3.5236, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.4771519659936239, |
|
"grad_norm": 2188022.75, |
|
"learning_rate": 1.0152320226709175e-05, |
|
"loss": 3.5614, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 1.4798087141339, |
|
"grad_norm": 16931794.0, |
|
"learning_rate": 1.0134608572440668e-05, |
|
"loss": 3.6685, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 1.4824654622741764, |
|
"grad_norm": 10456564.0, |
|
"learning_rate": 1.0116896918172158e-05, |
|
"loss": 3.4864, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 1.4851222104144526, |
|
"grad_norm": 27239420.0, |
|
"learning_rate": 1.009918526390365e-05, |
|
"loss": 3.5637, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 1.487778958554729, |
|
"grad_norm": 16616771.0, |
|
"learning_rate": 1.0081473609635141e-05, |
|
"loss": 3.6085, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.4904357066950054, |
|
"grad_norm": 10221569.0, |
|
"learning_rate": 1.0063761955366632e-05, |
|
"loss": 3.5812, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 1.4930924548352817, |
|
"grad_norm": 1452260.75, |
|
"learning_rate": 1.0046050301098122e-05, |
|
"loss": 3.9326, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 1.495749202975558, |
|
"grad_norm": 3546143.0, |
|
"learning_rate": 1.0028338646829615e-05, |
|
"loss": 3.2541, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 1.4984059511158343, |
|
"grad_norm": 12791246.0, |
|
"learning_rate": 1.0010626992561105e-05, |
|
"loss": 3.4152, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 1.5010626992561105, |
|
"grad_norm": 12529229.0, |
|
"learning_rate": 9.992915338292597e-06, |
|
"loss": 3.0508, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.5037194473963869, |
|
"grad_norm": 9755405.0, |
|
"learning_rate": 9.975203684024088e-06, |
|
"loss": 3.5064, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 1.5063761955366632, |
|
"grad_norm": 6901898.0, |
|
"learning_rate": 9.95749202975558e-06, |
|
"loss": 3.6654, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 1.5090329436769394, |
|
"grad_norm": 9542270.0, |
|
"learning_rate": 9.93978037548707e-06, |
|
"loss": 3.3481, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 1.5116896918172156, |
|
"grad_norm": 14570059.0, |
|
"learning_rate": 9.922068721218563e-06, |
|
"loss": 3.6342, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 1.514346439957492, |
|
"grad_norm": 130252984.0, |
|
"learning_rate": 9.904357066950054e-06, |
|
"loss": 3.3275, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.5170031880977684, |
|
"grad_norm": 12491921.0, |
|
"learning_rate": 9.886645412681544e-06, |
|
"loss": 3.1862, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 1.5196599362380447, |
|
"grad_norm": 171955248.0, |
|
"learning_rate": 9.868933758413036e-06, |
|
"loss": 3.6, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 1.522316684378321, |
|
"grad_norm": 67972536.0, |
|
"learning_rate": 9.851222104144527e-06, |
|
"loss": 3.5839, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 1.524973432518597, |
|
"grad_norm": 19312536.0, |
|
"learning_rate": 9.83351044987602e-06, |
|
"loss": 3.3906, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 1.5276301806588735, |
|
"grad_norm": 39636108.0, |
|
"learning_rate": 9.81579879560751e-06, |
|
"loss": 3.5388, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.5302869287991498, |
|
"grad_norm": 54133548.0, |
|
"learning_rate": 9.798087141339002e-06, |
|
"loss": 3.2938, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 1.5329436769394262, |
|
"grad_norm": 28021788.0, |
|
"learning_rate": 9.780375487070494e-06, |
|
"loss": 3.565, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 1.5356004250797024, |
|
"grad_norm": 12500334.0, |
|
"learning_rate": 9.762663832801983e-06, |
|
"loss": 3.4099, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 1.5382571732199788, |
|
"grad_norm": 20677724.0, |
|
"learning_rate": 9.744952178533476e-06, |
|
"loss": 3.8265, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 1.540913921360255, |
|
"grad_norm": 25849000.0, |
|
"learning_rate": 9.727240524264968e-06, |
|
"loss": 3.5107, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.5435706695005313, |
|
"grad_norm": 7106916.0, |
|
"learning_rate": 9.709528869996458e-06, |
|
"loss": 3.7538, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 1.5462274176408077, |
|
"grad_norm": 78143128.0, |
|
"learning_rate": 9.69181721572795e-06, |
|
"loss": 3.8139, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 1.548884165781084, |
|
"grad_norm": 124880632.0, |
|
"learning_rate": 9.674105561459441e-06, |
|
"loss": 3.4966, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 1.5515409139213603, |
|
"grad_norm": 16674735.0, |
|
"learning_rate": 9.656393907190934e-06, |
|
"loss": 3.7779, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 1.5541976620616365, |
|
"grad_norm": 36204444.0, |
|
"learning_rate": 9.638682252922424e-06, |
|
"loss": 3.5086, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.5568544102019128, |
|
"grad_norm": 7019197.5, |
|
"learning_rate": 9.620970598653915e-06, |
|
"loss": 3.3062, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 1.5595111583421892, |
|
"grad_norm": 14028569.0, |
|
"learning_rate": 9.603258944385407e-06, |
|
"loss": 3.4862, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 1.5621679064824656, |
|
"grad_norm": 24143218.0, |
|
"learning_rate": 9.585547290116898e-06, |
|
"loss": 3.388, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 1.5648246546227418, |
|
"grad_norm": 8635328.0, |
|
"learning_rate": 9.56783563584839e-06, |
|
"loss": 3.9959, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 1.567481402763018, |
|
"grad_norm": 14461347.0, |
|
"learning_rate": 9.55012398157988e-06, |
|
"loss": 3.3619, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.5701381509032943, |
|
"grad_norm": 45164232.0, |
|
"learning_rate": 9.532412327311371e-06, |
|
"loss": 3.7565, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 1.5727948990435707, |
|
"grad_norm": 43768708.0, |
|
"learning_rate": 9.514700673042863e-06, |
|
"loss": 3.2873, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 1.5754516471838471, |
|
"grad_norm": 102944216.0, |
|
"learning_rate": 9.496989018774354e-06, |
|
"loss": 3.5849, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 1.5781083953241233, |
|
"grad_norm": 8864102.0, |
|
"learning_rate": 9.479277364505846e-06, |
|
"loss": 3.3615, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 1.5807651434643994, |
|
"grad_norm": 17926040.0, |
|
"learning_rate": 9.461565710237337e-06, |
|
"loss": 3.3599, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.5834218916046758, |
|
"grad_norm": 563806208.0, |
|
"learning_rate": 9.443854055968829e-06, |
|
"loss": 3.6726, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 1.5860786397449522, |
|
"grad_norm": 4375813.5, |
|
"learning_rate": 9.42614240170032e-06, |
|
"loss": 3.5982, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 1.5887353878852286, |
|
"grad_norm": 23817932.0, |
|
"learning_rate": 9.40843074743181e-06, |
|
"loss": 3.6873, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 1.5913921360255048, |
|
"grad_norm": 3588041.25, |
|
"learning_rate": 9.390719093163302e-06, |
|
"loss": 3.8219, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 1.594048884165781, |
|
"grad_norm": 97096224.0, |
|
"learning_rate": 9.373007438894793e-06, |
|
"loss": 3.5905, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.5967056323060573, |
|
"grad_norm": 4066724.0, |
|
"learning_rate": 9.355295784626285e-06, |
|
"loss": 3.5762, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 1.5993623804463337, |
|
"grad_norm": 44529008.0, |
|
"learning_rate": 9.337584130357776e-06, |
|
"loss": 3.821, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 1.60201912858661, |
|
"grad_norm": 10141793.0, |
|
"learning_rate": 9.319872476089268e-06, |
|
"loss": 3.4989, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 1.6046758767268863, |
|
"grad_norm": 22102744.0, |
|
"learning_rate": 9.302160821820759e-06, |
|
"loss": 3.4363, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 1.6073326248671624, |
|
"grad_norm": 1421525.375, |
|
"learning_rate": 9.284449167552249e-06, |
|
"loss": 3.3543, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.6099893730074388, |
|
"grad_norm": 17624050.0, |
|
"learning_rate": 9.266737513283741e-06, |
|
"loss": 3.5835, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 1.6126461211477152, |
|
"grad_norm": 2787807.5, |
|
"learning_rate": 9.249025859015232e-06, |
|
"loss": 3.7715, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 1.6153028692879916, |
|
"grad_norm": 36419916.0, |
|
"learning_rate": 9.231314204746724e-06, |
|
"loss": 3.2874, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 1.6179596174282678, |
|
"grad_norm": 550304.0, |
|
"learning_rate": 9.213602550478215e-06, |
|
"loss": 3.775, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 1.620616365568544, |
|
"grad_norm": 13110638.0, |
|
"learning_rate": 9.195890896209707e-06, |
|
"loss": 4.0895, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.6232731137088203, |
|
"grad_norm": 153279.40625, |
|
"learning_rate": 9.1781792419412e-06, |
|
"loss": 3.5868, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 1.6259298618490967, |
|
"grad_norm": 274644.03125, |
|
"learning_rate": 9.160467587672688e-06, |
|
"loss": 3.4759, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 1.628586609989373, |
|
"grad_norm": 21545.19921875, |
|
"learning_rate": 9.14275593340418e-06, |
|
"loss": 4.0524, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 1.6312433581296493, |
|
"grad_norm": 27863.1015625, |
|
"learning_rate": 9.125044279135673e-06, |
|
"loss": 3.4133, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 1.6339001062699257, |
|
"grad_norm": 146765.640625, |
|
"learning_rate": 9.107332624867163e-06, |
|
"loss": 3.6765, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.6365568544102018, |
|
"grad_norm": 60709.375, |
|
"learning_rate": 9.089620970598656e-06, |
|
"loss": 3.8558, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 1.6392136025504782, |
|
"grad_norm": 290704.21875, |
|
"learning_rate": 9.071909316330146e-06, |
|
"loss": 3.3615, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 1.6418703506907546, |
|
"grad_norm": 198007.828125, |
|
"learning_rate": 9.054197662061637e-06, |
|
"loss": 3.6759, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 1.6445270988310308, |
|
"grad_norm": 30211.29296875, |
|
"learning_rate": 9.036486007793129e-06, |
|
"loss": 4.1618, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 1.6471838469713072, |
|
"grad_norm": 697217.3125, |
|
"learning_rate": 9.01877435352462e-06, |
|
"loss": 3.5873, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.6498405951115833, |
|
"grad_norm": 311260.34375, |
|
"learning_rate": 9.001062699256112e-06, |
|
"loss": 4.0309, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 1.6524973432518597, |
|
"grad_norm": 7285945.0, |
|
"learning_rate": 8.983351044987602e-06, |
|
"loss": 3.7024, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 1.655154091392136, |
|
"grad_norm": 238075.265625, |
|
"learning_rate": 8.965639390719095e-06, |
|
"loss": 3.7081, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 1.6578108395324125, |
|
"grad_norm": 104777.8828125, |
|
"learning_rate": 8.947927736450585e-06, |
|
"loss": 3.6374, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 1.6604675876726886, |
|
"grad_norm": 45899.98828125, |
|
"learning_rate": 8.930216082182076e-06, |
|
"loss": 3.7753, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.6631243358129648, |
|
"grad_norm": 4903258.0, |
|
"learning_rate": 8.912504427913568e-06, |
|
"loss": 3.7641, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 1.6657810839532412, |
|
"grad_norm": 691504.875, |
|
"learning_rate": 8.894792773645059e-06, |
|
"loss": 3.012, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 1.6684378320935176, |
|
"grad_norm": 7211197.0, |
|
"learning_rate": 8.877081119376551e-06, |
|
"loss": 3.278, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 1.671094580233794, |
|
"grad_norm": 55386.39453125, |
|
"learning_rate": 8.859369465108042e-06, |
|
"loss": 3.5972, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 1.6737513283740701, |
|
"grad_norm": 4803297.5, |
|
"learning_rate": 8.841657810839534e-06, |
|
"loss": 3.5168, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.6764080765143463, |
|
"grad_norm": 153394.5625, |
|
"learning_rate": 8.823946156571024e-06, |
|
"loss": 3.4884, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 1.6790648246546227, |
|
"grad_norm": 105014.6796875, |
|
"learning_rate": 8.806234502302515e-06, |
|
"loss": 3.5724, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 1.681721572794899, |
|
"grad_norm": 425531.6875, |
|
"learning_rate": 8.788522848034007e-06, |
|
"loss": 3.7171, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 1.6843783209351755, |
|
"grad_norm": 881638.625, |
|
"learning_rate": 8.770811193765498e-06, |
|
"loss": 3.5689, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 1.6870350690754516, |
|
"grad_norm": 506417.84375, |
|
"learning_rate": 8.75309953949699e-06, |
|
"loss": 3.3471, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.6896918172157278, |
|
"grad_norm": 218658.8125, |
|
"learning_rate": 8.73538788522848e-06, |
|
"loss": 3.0762, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 1.6923485653560042, |
|
"grad_norm": 3747502.5, |
|
"learning_rate": 8.717676230959973e-06, |
|
"loss": 3.7819, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 1.6950053134962806, |
|
"grad_norm": 402977.15625, |
|
"learning_rate": 8.699964576691463e-06, |
|
"loss": 3.2238, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 1.697662061636557, |
|
"grad_norm": 354610.0, |
|
"learning_rate": 8.682252922422954e-06, |
|
"loss": 3.5365, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 1.7003188097768331, |
|
"grad_norm": 737137.25, |
|
"learning_rate": 8.664541268154446e-06, |
|
"loss": 3.7334, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.7029755579171093, |
|
"grad_norm": 270020.3125, |
|
"learning_rate": 8.646829613885937e-06, |
|
"loss": 3.6183, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 1.7056323060573857, |
|
"grad_norm": 740626.4375, |
|
"learning_rate": 8.629117959617429e-06, |
|
"loss": 3.7487, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 1.708289054197662, |
|
"grad_norm": 1305229.75, |
|
"learning_rate": 8.61140630534892e-06, |
|
"loss": 3.7039, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 1.7109458023379385, |
|
"grad_norm": 172010.875, |
|
"learning_rate": 8.593694651080412e-06, |
|
"loss": 2.9064, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 1.7136025504782146, |
|
"grad_norm": 36386.55859375, |
|
"learning_rate": 8.575982996811903e-06, |
|
"loss": 3.5462, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.7162592986184908, |
|
"grad_norm": 280424.5, |
|
"learning_rate": 8.558271342543393e-06, |
|
"loss": 3.7119, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 1.7189160467587672, |
|
"grad_norm": 65134.73828125, |
|
"learning_rate": 8.540559688274885e-06, |
|
"loss": 4.058, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 1.7215727948990436, |
|
"grad_norm": 66937.53125, |
|
"learning_rate": 8.522848034006378e-06, |
|
"loss": 3.3975, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 1.72422954303932, |
|
"grad_norm": 131224.421875, |
|
"learning_rate": 8.505136379737868e-06, |
|
"loss": 3.4813, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 1.7268862911795961, |
|
"grad_norm": 108172.1640625, |
|
"learning_rate": 8.48742472546936e-06, |
|
"loss": 3.1716, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.7295430393198725, |
|
"grad_norm": 25198.029296875, |
|
"learning_rate": 8.469713071200851e-06, |
|
"loss": 3.6849, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 1.7321997874601487, |
|
"grad_norm": 61498.03515625, |
|
"learning_rate": 8.452001416932342e-06, |
|
"loss": 3.4036, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 1.734856535600425, |
|
"grad_norm": 442683.875, |
|
"learning_rate": 8.434289762663834e-06, |
|
"loss": 3.3497, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 1.7375132837407015, |
|
"grad_norm": 27654.84765625, |
|
"learning_rate": 8.416578108395324e-06, |
|
"loss": 3.2324, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 1.7401700318809776, |
|
"grad_norm": 87875.5546875, |
|
"learning_rate": 8.398866454126817e-06, |
|
"loss": 3.211, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.742826780021254, |
|
"grad_norm": 443493.65625, |
|
"learning_rate": 8.381154799858307e-06, |
|
"loss": 3.5746, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 1.7454835281615302, |
|
"grad_norm": 112091.3046875, |
|
"learning_rate": 8.3634431455898e-06, |
|
"loss": 3.2604, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 1.7481402763018066, |
|
"grad_norm": 37516.62109375, |
|
"learning_rate": 8.34573149132129e-06, |
|
"loss": 3.3058, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 1.750797024442083, |
|
"grad_norm": 98792.796875, |
|
"learning_rate": 8.32801983705278e-06, |
|
"loss": 3.4504, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 1.7534537725823593, |
|
"grad_norm": 24296.8125, |
|
"learning_rate": 8.310308182784273e-06, |
|
"loss": 3.2476, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.7561105207226355, |
|
"grad_norm": 27490.43359375, |
|
"learning_rate": 8.292596528515764e-06, |
|
"loss": 3.4551, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 1.7587672688629117, |
|
"grad_norm": 163381.75, |
|
"learning_rate": 8.274884874247256e-06, |
|
"loss": 3.56, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 1.761424017003188, |
|
"grad_norm": 5022.00244140625, |
|
"learning_rate": 8.257173219978746e-06, |
|
"loss": 3.2829, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 1.7640807651434645, |
|
"grad_norm": 873426.5, |
|
"learning_rate": 8.239461565710239e-06, |
|
"loss": 3.292, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 1.7667375132837408, |
|
"grad_norm": 48760.75390625, |
|
"learning_rate": 8.22174991144173e-06, |
|
"loss": 3.3971, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.769394261424017, |
|
"grad_norm": 22562.328125, |
|
"learning_rate": 8.20403825717322e-06, |
|
"loss": 3.4901, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 1.7720510095642932, |
|
"grad_norm": 110952.984375, |
|
"learning_rate": 8.186326602904712e-06, |
|
"loss": 3.5824, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 1.7747077577045696, |
|
"grad_norm": 11664.615234375, |
|
"learning_rate": 8.168614948636203e-06, |
|
"loss": 3.6433, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 1.777364505844846, |
|
"grad_norm": 296820.28125, |
|
"learning_rate": 8.150903294367695e-06, |
|
"loss": 3.4816, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 1.7800212539851223, |
|
"grad_norm": 28750.556640625, |
|
"learning_rate": 8.133191640099186e-06, |
|
"loss": 3.4851, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.7826780021253985, |
|
"grad_norm": 86309.7890625, |
|
"learning_rate": 8.115479985830678e-06, |
|
"loss": 3.3058, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 1.7853347502656747, |
|
"grad_norm": 91584.7734375, |
|
"learning_rate": 8.097768331562168e-06, |
|
"loss": 3.7495, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 1.787991498405951, |
|
"grad_norm": 132450.96875, |
|
"learning_rate": 8.080056677293659e-06, |
|
"loss": 3.4955, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 1.7906482465462275, |
|
"grad_norm": 134387.046875, |
|
"learning_rate": 8.062345023025151e-06, |
|
"loss": 3.4655, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 1.7933049946865038, |
|
"grad_norm": 74426.6875, |
|
"learning_rate": 8.044633368756642e-06, |
|
"loss": 3.7594, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.79596174282678, |
|
"grad_norm": 58667.3984375, |
|
"learning_rate": 8.026921714488134e-06, |
|
"loss": 3.7655, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 1.7986184909670562, |
|
"grad_norm": 130389.9140625, |
|
"learning_rate": 8.009210060219625e-06, |
|
"loss": 3.673, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 1.8012752391073326, |
|
"grad_norm": 89147.9296875, |
|
"learning_rate": 7.991498405951117e-06, |
|
"loss": 3.2874, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 1.803931987247609, |
|
"grad_norm": 44793.80859375, |
|
"learning_rate": 7.973786751682607e-06, |
|
"loss": 3.2517, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 1.8065887353878853, |
|
"grad_norm": 15245.392578125, |
|
"learning_rate": 7.956075097414098e-06, |
|
"loss": 3.5179, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.8092454835281615, |
|
"grad_norm": 15995.4912109375, |
|
"learning_rate": 7.93836344314559e-06, |
|
"loss": 3.6515, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 1.8119022316684377, |
|
"grad_norm": 16524.787109375, |
|
"learning_rate": 7.920651788877083e-06, |
|
"loss": 3.1618, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 1.814558979808714, |
|
"grad_norm": 42409.20703125, |
|
"learning_rate": 7.902940134608573e-06, |
|
"loss": 3.58, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 1.8172157279489904, |
|
"grad_norm": 10542.6796875, |
|
"learning_rate": 7.885228480340065e-06, |
|
"loss": 3.508, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 1.8198724760892668, |
|
"grad_norm": 25151.1484375, |
|
"learning_rate": 7.867516826071556e-06, |
|
"loss": 3.0635, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.822529224229543, |
|
"grad_norm": 9499.1826171875, |
|
"learning_rate": 7.849805171803047e-06, |
|
"loss": 2.9901, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 1.8251859723698194, |
|
"grad_norm": 54946.984375, |
|
"learning_rate": 7.832093517534539e-06, |
|
"loss": 2.9531, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 1.8278427205100956, |
|
"grad_norm": 10790.599609375, |
|
"learning_rate": 7.81438186326603e-06, |
|
"loss": 3.5882, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 1.830499468650372, |
|
"grad_norm": 13575.8759765625, |
|
"learning_rate": 7.796670208997522e-06, |
|
"loss": 3.3612, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 1.8331562167906483, |
|
"grad_norm": 20945.48046875, |
|
"learning_rate": 7.778958554729012e-06, |
|
"loss": 3.0764, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.8358129649309245, |
|
"grad_norm": 232869.03125, |
|
"learning_rate": 7.761246900460504e-06, |
|
"loss": 3.1716, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 1.8384697130712009, |
|
"grad_norm": 43791.59765625, |
|
"learning_rate": 7.743535246191995e-06, |
|
"loss": 3.2311, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 1.841126461211477, |
|
"grad_norm": 22579.091796875, |
|
"learning_rate": 7.725823591923486e-06, |
|
"loss": 3.4563, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 1.8437832093517534, |
|
"grad_norm": 28530.806640625, |
|
"learning_rate": 7.708111937654978e-06, |
|
"loss": 3.452, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 1.8464399574920298, |
|
"grad_norm": 12486.0390625, |
|
"learning_rate": 7.690400283386468e-06, |
|
"loss": 3.2791, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.8490967056323062, |
|
"grad_norm": 17018.11328125, |
|
"learning_rate": 7.67268862911796e-06, |
|
"loss": 3.6792, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 1.8517534537725824, |
|
"grad_norm": 16199.2470703125, |
|
"learning_rate": 7.654976974849451e-06, |
|
"loss": 3.2561, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 1.8544102019128585, |
|
"grad_norm": 10388.2470703125, |
|
"learning_rate": 7.637265320580944e-06, |
|
"loss": 3.0233, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 1.857066950053135, |
|
"grad_norm": 15407.7548828125, |
|
"learning_rate": 7.619553666312433e-06, |
|
"loss": 3.167, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 1.8597236981934113, |
|
"grad_norm": 26815.095703125, |
|
"learning_rate": 7.601842012043925e-06, |
|
"loss": 3.2972, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.8623804463336877, |
|
"grad_norm": 58698.21875, |
|
"learning_rate": 7.584130357775417e-06, |
|
"loss": 3.2334, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 1.8650371944739639, |
|
"grad_norm": 27274.71875, |
|
"learning_rate": 7.566418703506908e-06, |
|
"loss": 3.1432, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 1.86769394261424, |
|
"grad_norm": 83316.0703125, |
|
"learning_rate": 7.5487070492384e-06, |
|
"loss": 3.1284, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 1.8703506907545164, |
|
"grad_norm": 30122.771484375, |
|
"learning_rate": 7.530995394969891e-06, |
|
"loss": 2.8904, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 1.8730074388947928, |
|
"grad_norm": 40200.9609375, |
|
"learning_rate": 7.513283740701383e-06, |
|
"loss": 3.3255, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.8756641870350692, |
|
"grad_norm": 16342.447265625, |
|
"learning_rate": 7.495572086432873e-06, |
|
"loss": 3.1073, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 1.8783209351753454, |
|
"grad_norm": 14423.703125, |
|
"learning_rate": 7.477860432164365e-06, |
|
"loss": 3.4831, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 1.8809776833156215, |
|
"grad_norm": 34366.14453125, |
|
"learning_rate": 7.460148777895856e-06, |
|
"loss": 3.2063, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 1.883634431455898, |
|
"grad_norm": 70803.8359375, |
|
"learning_rate": 7.4424371236273475e-06, |
|
"loss": 3.5181, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 1.8862911795961743, |
|
"grad_norm": 13800.69140625, |
|
"learning_rate": 7.424725469358839e-06, |
|
"loss": 3.4993, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.8889479277364507, |
|
"grad_norm": 48057.68359375, |
|
"learning_rate": 7.40701381509033e-06, |
|
"loss": 3.1418, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 1.8916046758767269, |
|
"grad_norm": 40145.4921875, |
|
"learning_rate": 7.389302160821822e-06, |
|
"loss": 3.3427, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 1.894261424017003, |
|
"grad_norm": 13148.1484375, |
|
"learning_rate": 7.371590506553312e-06, |
|
"loss": 3.2584, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 1.8969181721572794, |
|
"grad_norm": 10740.6826171875, |
|
"learning_rate": 7.353878852284804e-06, |
|
"loss": 2.8656, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 1.8995749202975558, |
|
"grad_norm": 7270.3818359375, |
|
"learning_rate": 7.336167198016295e-06, |
|
"loss": 3.1929, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.9022316684378322, |
|
"grad_norm": 3250.9072265625, |
|
"learning_rate": 7.318455543747787e-06, |
|
"loss": 3.3218, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 1.9048884165781084, |
|
"grad_norm": 40904.6484375, |
|
"learning_rate": 7.300743889479278e-06, |
|
"loss": 3.0994, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 1.9075451647183845, |
|
"grad_norm": 9426.0009765625, |
|
"learning_rate": 7.2830322352107695e-06, |
|
"loss": 3.2861, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 1.910201912858661, |
|
"grad_norm": 10107.427734375, |
|
"learning_rate": 7.265320580942261e-06, |
|
"loss": 3.3694, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 1.9128586609989373, |
|
"grad_norm": 25632.7734375, |
|
"learning_rate": 7.2476089266737514e-06, |
|
"loss": 3.1918, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.9155154091392137, |
|
"grad_norm": 10823.2509765625, |
|
"learning_rate": 7.229897272405243e-06, |
|
"loss": 3.1984, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 1.9181721572794899, |
|
"grad_norm": 8237.4482421875, |
|
"learning_rate": 7.212185618136734e-06, |
|
"loss": 2.7874, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 1.9208289054197663, |
|
"grad_norm": 4823.09716796875, |
|
"learning_rate": 7.194473963868226e-06, |
|
"loss": 3.2625, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 1.9234856535600424, |
|
"grad_norm": 6276.54150390625, |
|
"learning_rate": 7.176762309599717e-06, |
|
"loss": 3.1739, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 1.9261424017003188, |
|
"grad_norm": 9979.935546875, |
|
"learning_rate": 7.1590506553312085e-06, |
|
"loss": 3.34, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.9287991498405952, |
|
"grad_norm": 3373.656982421875, |
|
"learning_rate": 7.141339001062701e-06, |
|
"loss": 3.4366, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 1.9314558979808714, |
|
"grad_norm": 9178.9404296875, |
|
"learning_rate": 7.1236273467941905e-06, |
|
"loss": 3.245, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 1.9341126461211477, |
|
"grad_norm": 11173.3037109375, |
|
"learning_rate": 7.105915692525682e-06, |
|
"loss": 3.1306, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 1.936769394261424, |
|
"grad_norm": 6969.20849609375, |
|
"learning_rate": 7.088204038257173e-06, |
|
"loss": 3.5482, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 1.9394261424017003, |
|
"grad_norm": 22079.796875, |
|
"learning_rate": 7.070492383988665e-06, |
|
"loss": 3.2338, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.9420828905419767, |
|
"grad_norm": 51803.05078125, |
|
"learning_rate": 7.052780729720157e-06, |
|
"loss": 3.1844, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 1.944739638682253, |
|
"grad_norm": 17502.84375, |
|
"learning_rate": 7.0350690754516485e-06, |
|
"loss": 3.3796, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 1.9473963868225292, |
|
"grad_norm": 4275.10009765625, |
|
"learning_rate": 7.017357421183138e-06, |
|
"loss": 3.0306, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 1.9500531349628054, |
|
"grad_norm": 3620.85400390625, |
|
"learning_rate": 6.99964576691463e-06, |
|
"loss": 3.3635, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 1.9527098831030818, |
|
"grad_norm": 32547.673828125, |
|
"learning_rate": 6.981934112646122e-06, |
|
"loss": 3.1764, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.9553666312433582, |
|
"grad_norm": 5065.5751953125, |
|
"learning_rate": 6.964222458377613e-06, |
|
"loss": 3.1895, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 1.9580233793836346, |
|
"grad_norm": 10395.2060546875, |
|
"learning_rate": 6.946510804109105e-06, |
|
"loss": 3.1655, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 1.9606801275239107, |
|
"grad_norm": 4557.41796875, |
|
"learning_rate": 6.928799149840596e-06, |
|
"loss": 3.0581, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 1.963336875664187, |
|
"grad_norm": 38417.4765625, |
|
"learning_rate": 6.911087495572088e-06, |
|
"loss": 3.0893, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 1.9659936238044633, |
|
"grad_norm": 5107.16796875, |
|
"learning_rate": 6.893375841303578e-06, |
|
"loss": 3.4167, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.9686503719447397, |
|
"grad_norm": 5035.6201171875, |
|
"learning_rate": 6.87566418703507e-06, |
|
"loss": 3.0664, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 1.971307120085016, |
|
"grad_norm": 12651.587890625, |
|
"learning_rate": 6.857952532766561e-06, |
|
"loss": 3.1299, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 1.9739638682252922, |
|
"grad_norm": 7539.5400390625, |
|
"learning_rate": 6.840240878498052e-06, |
|
"loss": 3.0492, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 1.9766206163655684, |
|
"grad_norm": 5577.158203125, |
|
"learning_rate": 6.822529224229544e-06, |
|
"loss": 3.098, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 1.9792773645058448, |
|
"grad_norm": 41558.4921875, |
|
"learning_rate": 6.804817569961035e-06, |
|
"loss": 3.2933, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.9819341126461212, |
|
"grad_norm": 3775.939697265625, |
|
"learning_rate": 6.787105915692527e-06, |
|
"loss": 2.9667, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 1.9845908607863976, |
|
"grad_norm": 30318.9921875, |
|
"learning_rate": 6.769394261424017e-06, |
|
"loss": 3.0666, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 1.9872476089266737, |
|
"grad_norm": 21865.806640625, |
|
"learning_rate": 6.751682607155509e-06, |
|
"loss": 2.9213, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 1.98990435706695, |
|
"grad_norm": 10458.220703125, |
|
"learning_rate": 6.733970952887e-06, |
|
"loss": 3.2567, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 1.9925611052072263, |
|
"grad_norm": 14638.0439453125, |
|
"learning_rate": 6.7162592986184915e-06, |
|
"loss": 3.2132, |
|
"step": 7500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 11292, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7836212920320000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|