task-13-Qwen-Qwen2.5-3B / trainer_state.json
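The JSON below is the saved trainer state for this run: a few header fields (epoch, global_step, eval_steps, etc.) followed by log_history, a list of per-interval records with epoch, grad_norm, learning_rate, loss, and step, logged every 20 optimizer steps. As a minimal sketch, one way to inspect it is to load the file and plot the loss curve and learning-rate schedule; this assumes the file is saved locally as trainer_state.json and that matplotlib is available, neither of which is stated by the repo itself:

```python
# Sketch: load trainer_state.json and plot training loss and the LR schedule.
# Field names (log_history, step, loss, learning_rate) come from the file below;
# matplotlib is an assumed plotting backend, not a requirement of the repo.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only records that carry a training loss (skips any eval-only entries).
records = [r for r in state["log_history"] if "loss" in r]
steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]
lrs = [r["learning_rate"] for r in records]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```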
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 87.85845027455765,
"eval_steps": 500,
"global_step": 18000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.09762050030506407,
"grad_norm": 0.5132213234901428,
"learning_rate": 4e-05,
"loss": 2.1313,
"step": 20
},
{
"epoch": 0.19524100061012814,
"grad_norm": 0.5252098441123962,
"learning_rate": 8e-05,
"loss": 2.0977,
"step": 40
},
{
"epoch": 0.2928615009151922,
"grad_norm": 0.5683884024620056,
"learning_rate": 0.00012,
"loss": 1.9168,
"step": 60
},
{
"epoch": 0.3904820012202563,
"grad_norm": 0.6169227957725525,
"learning_rate": 0.00016,
"loss": 1.8795,
"step": 80
},
{
"epoch": 0.4881025015253203,
"grad_norm": 0.8657425045967102,
"learning_rate": 0.0002,
"loss": 1.8257,
"step": 100
},
{
"epoch": 0.5857230018303844,
"grad_norm": 1.1708447933197021,
"learning_rate": 0.00019980295566502464,
"loss": 1.8,
"step": 120
},
{
"epoch": 0.6833435021354485,
"grad_norm": 1.4427006244659424,
"learning_rate": 0.00019960591133004926,
"loss": 1.75,
"step": 140
},
{
"epoch": 0.7809640024405126,
"grad_norm": 1.190605640411377,
"learning_rate": 0.00019940886699507392,
"loss": 1.6796,
"step": 160
},
{
"epoch": 0.8785845027455765,
"grad_norm": 0.9052116274833679,
"learning_rate": 0.00019921182266009852,
"loss": 1.6721,
"step": 180
},
{
"epoch": 0.9762050030506406,
"grad_norm": 1.3986254930496216,
"learning_rate": 0.00019901477832512317,
"loss": 1.5752,
"step": 200
},
{
"epoch": 1.0738255033557047,
"grad_norm": 1.1646902561187744,
"learning_rate": 0.0001988177339901478,
"loss": 1.6187,
"step": 220
},
{
"epoch": 1.1714460036607688,
"grad_norm": 1.3168349266052246,
"learning_rate": 0.00019862068965517243,
"loss": 1.518,
"step": 240
},
{
"epoch": 1.2690665039658329,
"grad_norm": 1.5947434902191162,
"learning_rate": 0.00019842364532019705,
"loss": 1.5919,
"step": 260
},
{
"epoch": 1.366687004270897,
"grad_norm": 1.550020694732666,
"learning_rate": 0.00019822660098522168,
"loss": 1.5565,
"step": 280
},
{
"epoch": 1.4643075045759608,
"grad_norm": 1.587833285331726,
"learning_rate": 0.0001980295566502463,
"loss": 1.4928,
"step": 300
},
{
"epoch": 1.561928004881025,
"grad_norm": 1.3913565874099731,
"learning_rate": 0.00019783251231527093,
"loss": 1.4603,
"step": 320
},
{
"epoch": 1.659548505186089,
"grad_norm": 1.6511396169662476,
"learning_rate": 0.00019763546798029556,
"loss": 1.3946,
"step": 340
},
{
"epoch": 1.757169005491153,
"grad_norm": 1.6132158041000366,
"learning_rate": 0.00019743842364532022,
"loss": 1.4182,
"step": 360
},
{
"epoch": 1.8547895057962172,
"grad_norm": 2.1648366451263428,
"learning_rate": 0.00019724137931034484,
"loss": 1.4717,
"step": 380
},
{
"epoch": 1.9524100061012812,
"grad_norm": 1.5196492671966553,
"learning_rate": 0.00019704433497536947,
"loss": 1.4192,
"step": 400
},
{
"epoch": 2.0500305064063453,
"grad_norm": 1.676941990852356,
"learning_rate": 0.0001968472906403941,
"loss": 1.378,
"step": 420
},
{
"epoch": 2.1476510067114094,
"grad_norm": 1.834902286529541,
"learning_rate": 0.00019665024630541872,
"loss": 1.3741,
"step": 440
},
{
"epoch": 2.2452715070164735,
"grad_norm": 1.9870941638946533,
"learning_rate": 0.00019645320197044338,
"loss": 1.3165,
"step": 460
},
{
"epoch": 2.3428920073215376,
"grad_norm": 1.682267427444458,
"learning_rate": 0.00019625615763546798,
"loss": 1.3828,
"step": 480
},
{
"epoch": 2.4405125076266017,
"grad_norm": 2.291842222213745,
"learning_rate": 0.00019605911330049263,
"loss": 1.3857,
"step": 500
},
{
"epoch": 2.5381330079316657,
"grad_norm": 2.0962560176849365,
"learning_rate": 0.00019586206896551723,
"loss": 1.2759,
"step": 520
},
{
"epoch": 2.63575350823673,
"grad_norm": 1.6451084613800049,
"learning_rate": 0.0001956650246305419,
"loss": 1.3086,
"step": 540
},
{
"epoch": 2.733374008541794,
"grad_norm": 1.8540750741958618,
"learning_rate": 0.00019546798029556651,
"loss": 1.3416,
"step": 560
},
{
"epoch": 2.830994508846858,
"grad_norm": 1.8368124961853027,
"learning_rate": 0.00019527093596059114,
"loss": 1.2716,
"step": 580
},
{
"epoch": 2.9286150091519216,
"grad_norm": 3.194183588027954,
"learning_rate": 0.00019507389162561577,
"loss": 1.2964,
"step": 600
},
{
"epoch": 3.026235509456986,
"grad_norm": 1.751219630241394,
"learning_rate": 0.0001948768472906404,
"loss": 1.3046,
"step": 620
},
{
"epoch": 3.1238560097620502,
"grad_norm": 1.834823489189148,
"learning_rate": 0.00019467980295566505,
"loss": 1.2149,
"step": 640
},
{
"epoch": 3.221476510067114,
"grad_norm": 1.9562243223190308,
"learning_rate": 0.00019448275862068965,
"loss": 1.209,
"step": 660
},
{
"epoch": 3.319097010372178,
"grad_norm": 2.012437582015991,
"learning_rate": 0.0001942857142857143,
"loss": 1.2196,
"step": 680
},
{
"epoch": 3.416717510677242,
"grad_norm": 2.47426176071167,
"learning_rate": 0.00019408866995073893,
"loss": 1.2974,
"step": 700
},
{
"epoch": 3.514338010982306,
"grad_norm": 2.1828153133392334,
"learning_rate": 0.00019389162561576356,
"loss": 1.2287,
"step": 720
},
{
"epoch": 3.61195851128737,
"grad_norm": 2.335744619369507,
"learning_rate": 0.00019369458128078818,
"loss": 1.2018,
"step": 740
},
{
"epoch": 3.7095790115924343,
"grad_norm": 1.731418490409851,
"learning_rate": 0.0001934975369458128,
"loss": 1.2568,
"step": 760
},
{
"epoch": 3.8071995118974984,
"grad_norm": 2.0934510231018066,
"learning_rate": 0.00019330049261083744,
"loss": 1.2206,
"step": 780
},
{
"epoch": 3.9048200122025625,
"grad_norm": 2.2060680389404297,
"learning_rate": 0.0001931034482758621,
"loss": 1.1898,
"step": 800
},
{
"epoch": 4.002440512507627,
"grad_norm": 3.0342836380004883,
"learning_rate": 0.0001929064039408867,
"loss": 1.2248,
"step": 820
},
{
"epoch": 4.100061012812691,
"grad_norm": 2.1768083572387695,
"learning_rate": 0.00019270935960591135,
"loss": 1.1721,
"step": 840
},
{
"epoch": 4.197681513117755,
"grad_norm": 2.2883739471435547,
"learning_rate": 0.00019251231527093597,
"loss": 1.1117,
"step": 860
},
{
"epoch": 4.295302013422819,
"grad_norm": 2.45024037361145,
"learning_rate": 0.0001923152709359606,
"loss": 1.1392,
"step": 880
},
{
"epoch": 4.392922513727883,
"grad_norm": 1.9696956872940063,
"learning_rate": 0.00019211822660098523,
"loss": 1.0958,
"step": 900
},
{
"epoch": 4.490543014032947,
"grad_norm": 2.3901145458221436,
"learning_rate": 0.00019192118226600986,
"loss": 1.1693,
"step": 920
},
{
"epoch": 4.588163514338011,
"grad_norm": 2.003532648086548,
"learning_rate": 0.0001917241379310345,
"loss": 1.1301,
"step": 940
},
{
"epoch": 4.685784014643075,
"grad_norm": 1.990051031112671,
"learning_rate": 0.0001915270935960591,
"loss": 1.1315,
"step": 960
},
{
"epoch": 4.783404514948139,
"grad_norm": 2.517423152923584,
"learning_rate": 0.00019133004926108376,
"loss": 1.208,
"step": 980
},
{
"epoch": 4.881025015253203,
"grad_norm": 2.311152458190918,
"learning_rate": 0.0001911330049261084,
"loss": 1.1494,
"step": 1000
},
{
"epoch": 4.978645515558267,
"grad_norm": 2.327719211578369,
"learning_rate": 0.00019093596059113302,
"loss": 1.1459,
"step": 1020
},
{
"epoch": 5.0762660158633315,
"grad_norm": 3.1623075008392334,
"learning_rate": 0.00019073891625615765,
"loss": 1.1417,
"step": 1040
},
{
"epoch": 5.173886516168396,
"grad_norm": 2.418928384780884,
"learning_rate": 0.00019054187192118227,
"loss": 1.091,
"step": 1060
},
{
"epoch": 5.27150701647346,
"grad_norm": 2.6035215854644775,
"learning_rate": 0.0001903448275862069,
"loss": 1.0851,
"step": 1080
},
{
"epoch": 5.369127516778524,
"grad_norm": 3.089789628982544,
"learning_rate": 0.00019014778325123153,
"loss": 1.0592,
"step": 1100
},
{
"epoch": 5.466748017083588,
"grad_norm": 2.885105609893799,
"learning_rate": 0.00018995073891625615,
"loss": 1.0781,
"step": 1120
},
{
"epoch": 5.564368517388652,
"grad_norm": 2.3023903369903564,
"learning_rate": 0.0001897536945812808,
"loss": 1.0706,
"step": 1140
},
{
"epoch": 5.661989017693716,
"grad_norm": 2.873560905456543,
"learning_rate": 0.00018955665024630543,
"loss": 1.092,
"step": 1160
},
{
"epoch": 5.75960951799878,
"grad_norm": 2.4178314208984375,
"learning_rate": 0.00018935960591133006,
"loss": 1.0896,
"step": 1180
},
{
"epoch": 5.857230018303844,
"grad_norm": 2.150630474090576,
"learning_rate": 0.0001891625615763547,
"loss": 1.1056,
"step": 1200
},
{
"epoch": 5.954850518608908,
"grad_norm": 3.347947359085083,
"learning_rate": 0.00018896551724137932,
"loss": 1.0832,
"step": 1220
},
{
"epoch": 6.052471018913972,
"grad_norm": 2.737258195877075,
"learning_rate": 0.00018876847290640397,
"loss": 1.0573,
"step": 1240
},
{
"epoch": 6.150091519219036,
"grad_norm": 2.3305180072784424,
"learning_rate": 0.00018857142857142857,
"loss": 0.9491,
"step": 1260
},
{
"epoch": 6.2477120195241005,
"grad_norm": 3.0475850105285645,
"learning_rate": 0.00018837438423645322,
"loss": 1.0221,
"step": 1280
},
{
"epoch": 6.345332519829164,
"grad_norm": 2.7141025066375732,
"learning_rate": 0.00018817733990147782,
"loss": 1.0566,
"step": 1300
},
{
"epoch": 6.442953020134228,
"grad_norm": 2.931290626525879,
"learning_rate": 0.00018798029556650248,
"loss": 1.0178,
"step": 1320
},
{
"epoch": 6.540573520439292,
"grad_norm": 2.9428722858428955,
"learning_rate": 0.0001877832512315271,
"loss": 1.0642,
"step": 1340
},
{
"epoch": 6.638194020744356,
"grad_norm": 2.452775001525879,
"learning_rate": 0.00018758620689655173,
"loss": 1.0928,
"step": 1360
},
{
"epoch": 6.73581452104942,
"grad_norm": 3.380108594894409,
"learning_rate": 0.00018738916256157636,
"loss": 1.0091,
"step": 1380
},
{
"epoch": 6.833435021354484,
"grad_norm": 2.9912617206573486,
"learning_rate": 0.000187192118226601,
"loss": 0.9958,
"step": 1400
},
{
"epoch": 6.931055521659548,
"grad_norm": 2.5559194087982178,
"learning_rate": 0.00018699507389162561,
"loss": 1.0891,
"step": 1420
},
{
"epoch": 7.028676021964612,
"grad_norm": 2.728987693786621,
"learning_rate": 0.00018679802955665024,
"loss": 0.9723,
"step": 1440
},
{
"epoch": 7.126296522269676,
"grad_norm": 2.4664106369018555,
"learning_rate": 0.0001866009852216749,
"loss": 0.9712,
"step": 1460
},
{
"epoch": 7.22391702257474,
"grad_norm": 2.6810712814331055,
"learning_rate": 0.00018640394088669952,
"loss": 0.9408,
"step": 1480
},
{
"epoch": 7.3215375228798045,
"grad_norm": 2.690723419189453,
"learning_rate": 0.00018620689655172415,
"loss": 0.9579,
"step": 1500
},
{
"epoch": 7.419158023184869,
"grad_norm": 2.751676321029663,
"learning_rate": 0.00018600985221674878,
"loss": 0.9959,
"step": 1520
},
{
"epoch": 7.516778523489933,
"grad_norm": 2.6251280307769775,
"learning_rate": 0.0001858128078817734,
"loss": 0.9908,
"step": 1540
},
{
"epoch": 7.614399023794997,
"grad_norm": 2.897099733352661,
"learning_rate": 0.00018561576354679803,
"loss": 0.9631,
"step": 1560
},
{
"epoch": 7.712019524100061,
"grad_norm": 2.0911786556243896,
"learning_rate": 0.00018541871921182269,
"loss": 0.9963,
"step": 1580
},
{
"epoch": 7.809640024405125,
"grad_norm": 2.6954994201660156,
"learning_rate": 0.00018522167487684729,
"loss": 1.0134,
"step": 1600
},
{
"epoch": 7.907260524710189,
"grad_norm": 2.8063347339630127,
"learning_rate": 0.00018502463054187194,
"loss": 0.9732,
"step": 1620
},
{
"epoch": 8.004881025015253,
"grad_norm": 2.0492053031921387,
"learning_rate": 0.00018482758620689654,
"loss": 1.0361,
"step": 1640
},
{
"epoch": 8.102501525320317,
"grad_norm": 3.0692152976989746,
"learning_rate": 0.0001846305418719212,
"loss": 0.9293,
"step": 1660
},
{
"epoch": 8.200122025625381,
"grad_norm": 2.7933707237243652,
"learning_rate": 0.00018443349753694582,
"loss": 0.9101,
"step": 1680
},
{
"epoch": 8.297742525930445,
"grad_norm": 3.628946542739868,
"learning_rate": 0.00018423645320197045,
"loss": 0.9215,
"step": 1700
},
{
"epoch": 8.39536302623551,
"grad_norm": 2.892118215560913,
"learning_rate": 0.0001840394088669951,
"loss": 0.9008,
"step": 1720
},
{
"epoch": 8.492983526540574,
"grad_norm": 3.5419254302978516,
"learning_rate": 0.0001838423645320197,
"loss": 0.9577,
"step": 1740
},
{
"epoch": 8.590604026845638,
"grad_norm": 2.785578489303589,
"learning_rate": 0.00018364532019704436,
"loss": 0.8979,
"step": 1760
},
{
"epoch": 8.688224527150702,
"grad_norm": 3.6454851627349854,
"learning_rate": 0.00018344827586206896,
"loss": 0.9424,
"step": 1780
},
{
"epoch": 8.785845027455766,
"grad_norm": 3.1077752113342285,
"learning_rate": 0.0001832512315270936,
"loss": 0.976,
"step": 1800
},
{
"epoch": 8.88346552776083,
"grad_norm": 2.1347529888153076,
"learning_rate": 0.00018305418719211824,
"loss": 0.9889,
"step": 1820
},
{
"epoch": 8.981086028065894,
"grad_norm": 1.8763928413391113,
"learning_rate": 0.00018285714285714286,
"loss": 0.9595,
"step": 1840
},
{
"epoch": 9.078706528370958,
"grad_norm": 2.5731394290924072,
"learning_rate": 0.0001826600985221675,
"loss": 0.9346,
"step": 1860
},
{
"epoch": 9.176327028676022,
"grad_norm": 2.75944447517395,
"learning_rate": 0.00018246305418719212,
"loss": 0.8926,
"step": 1880
},
{
"epoch": 9.273947528981086,
"grad_norm": 2.7548296451568604,
"learning_rate": 0.00018226600985221675,
"loss": 0.8835,
"step": 1900
},
{
"epoch": 9.37156802928615,
"grad_norm": 3.4645333290100098,
"learning_rate": 0.0001820689655172414,
"loss": 0.8753,
"step": 1920
},
{
"epoch": 9.469188529591214,
"grad_norm": 2.7922091484069824,
"learning_rate": 0.00018187192118226603,
"loss": 0.9187,
"step": 1940
},
{
"epoch": 9.566809029896278,
"grad_norm": 2.257009506225586,
"learning_rate": 0.00018167487684729065,
"loss": 0.9294,
"step": 1960
},
{
"epoch": 9.664429530201343,
"grad_norm": 4.195834159851074,
"learning_rate": 0.00018147783251231528,
"loss": 0.9022,
"step": 1980
},
{
"epoch": 9.762050030506407,
"grad_norm": 2.8687057495117188,
"learning_rate": 0.0001812807881773399,
"loss": 0.8744,
"step": 2000
},
{
"epoch": 9.85967053081147,
"grad_norm": 3.758493661880493,
"learning_rate": 0.00018108374384236456,
"loss": 0.9117,
"step": 2020
},
{
"epoch": 9.957291031116535,
"grad_norm": 3.2609262466430664,
"learning_rate": 0.00018088669950738916,
"loss": 0.9261,
"step": 2040
},
{
"epoch": 10.054911531421599,
"grad_norm": 3.5481553077697754,
"learning_rate": 0.00018068965517241382,
"loss": 0.8786,
"step": 2060
},
{
"epoch": 10.152532031726663,
"grad_norm": 2.8181192874908447,
"learning_rate": 0.00018049261083743842,
"loss": 0.8153,
"step": 2080
},
{
"epoch": 10.250152532031727,
"grad_norm": 2.582590341567993,
"learning_rate": 0.00018029556650246307,
"loss": 0.8763,
"step": 2100
},
{
"epoch": 10.347773032336791,
"grad_norm": 2.50076961517334,
"learning_rate": 0.0001800985221674877,
"loss": 0.8512,
"step": 2120
},
{
"epoch": 10.445393532641855,
"grad_norm": 3.2371861934661865,
"learning_rate": 0.00017990147783251232,
"loss": 0.823,
"step": 2140
},
{
"epoch": 10.54301403294692,
"grad_norm": 2.688570976257324,
"learning_rate": 0.00017970443349753695,
"loss": 0.8853,
"step": 2160
},
{
"epoch": 10.640634533251983,
"grad_norm": 2.4727838039398193,
"learning_rate": 0.00017950738916256158,
"loss": 0.8257,
"step": 2180
},
{
"epoch": 10.738255033557047,
"grad_norm": 3.330667495727539,
"learning_rate": 0.0001793103448275862,
"loss": 0.923,
"step": 2200
},
{
"epoch": 10.835875533862112,
"grad_norm": 2.5213732719421387,
"learning_rate": 0.00017911330049261083,
"loss": 0.8946,
"step": 2220
},
{
"epoch": 10.933496034167176,
"grad_norm": 2.6011056900024414,
"learning_rate": 0.0001789162561576355,
"loss": 0.9194,
"step": 2240
},
{
"epoch": 11.03111653447224,
"grad_norm": 3.4423539638519287,
"learning_rate": 0.00017871921182266011,
"loss": 0.8529,
"step": 2260
},
{
"epoch": 11.128737034777304,
"grad_norm": 3.608583927154541,
"learning_rate": 0.00017852216748768474,
"loss": 0.7944,
"step": 2280
},
{
"epoch": 11.226357535082368,
"grad_norm": 2.567775249481201,
"learning_rate": 0.00017832512315270937,
"loss": 0.7843,
"step": 2300
},
{
"epoch": 11.323978035387432,
"grad_norm": 3.0681939125061035,
"learning_rate": 0.000178128078817734,
"loss": 0.8238,
"step": 2320
},
{
"epoch": 11.421598535692496,
"grad_norm": 2.489577293395996,
"learning_rate": 0.00017793103448275862,
"loss": 0.8829,
"step": 2340
},
{
"epoch": 11.51921903599756,
"grad_norm": 2.9147262573242188,
"learning_rate": 0.00017773399014778328,
"loss": 0.8246,
"step": 2360
},
{
"epoch": 11.616839536302624,
"grad_norm": 2.5094566345214844,
"learning_rate": 0.00017753694581280788,
"loss": 0.8277,
"step": 2380
},
{
"epoch": 11.714460036607688,
"grad_norm": 2.4408226013183594,
"learning_rate": 0.00017733990147783253,
"loss": 0.8722,
"step": 2400
},
{
"epoch": 11.812080536912752,
"grad_norm": 2.5982508659362793,
"learning_rate": 0.00017714285714285713,
"loss": 0.8285,
"step": 2420
},
{
"epoch": 11.909701037217816,
"grad_norm": 4.408588409423828,
"learning_rate": 0.00017694581280788179,
"loss": 0.8042,
"step": 2440
},
{
"epoch": 12.00732153752288,
"grad_norm": 3.4463417530059814,
"learning_rate": 0.0001767487684729064,
"loss": 0.8606,
"step": 2460
},
{
"epoch": 12.104942037827945,
"grad_norm": 3.192249059677124,
"learning_rate": 0.00017655172413793104,
"loss": 0.7847,
"step": 2480
},
{
"epoch": 12.202562538133009,
"grad_norm": 2.760958671569824,
"learning_rate": 0.00017635467980295567,
"loss": 0.7968,
"step": 2500
},
{
"epoch": 12.300183038438073,
"grad_norm": 2.8952383995056152,
"learning_rate": 0.0001761576354679803,
"loss": 0.8226,
"step": 2520
},
{
"epoch": 12.397803538743137,
"grad_norm": 3.6324946880340576,
"learning_rate": 0.00017596059113300495,
"loss": 0.7592,
"step": 2540
},
{
"epoch": 12.495424039048201,
"grad_norm": 4.0287885665893555,
"learning_rate": 0.00017576354679802955,
"loss": 0.8112,
"step": 2560
},
{
"epoch": 12.593044539353265,
"grad_norm": 3.1734702587127686,
"learning_rate": 0.0001755665024630542,
"loss": 0.7847,
"step": 2580
},
{
"epoch": 12.690665039658327,
"grad_norm": 2.9449315071105957,
"learning_rate": 0.00017536945812807883,
"loss": 0.8264,
"step": 2600
},
{
"epoch": 12.788285539963393,
"grad_norm": 3.1391289234161377,
"learning_rate": 0.00017517241379310346,
"loss": 0.8058,
"step": 2620
},
{
"epoch": 12.885906040268456,
"grad_norm": 3.2317001819610596,
"learning_rate": 0.00017497536945812808,
"loss": 0.767,
"step": 2640
},
{
"epoch": 12.98352654057352,
"grad_norm": 3.2640392780303955,
"learning_rate": 0.0001747783251231527,
"loss": 0.8314,
"step": 2660
},
{
"epoch": 13.081147040878584,
"grad_norm": 4.71024227142334,
"learning_rate": 0.00017458128078817734,
"loss": 0.756,
"step": 2680
},
{
"epoch": 13.178767541183648,
"grad_norm": 3.621242046356201,
"learning_rate": 0.000174384236453202,
"loss": 0.7309,
"step": 2700
},
{
"epoch": 13.276388041488712,
"grad_norm": 3.6408748626708984,
"learning_rate": 0.00017418719211822662,
"loss": 0.7143,
"step": 2720
},
{
"epoch": 13.374008541793776,
"grad_norm": 3.296096086502075,
"learning_rate": 0.00017399014778325125,
"loss": 0.7965,
"step": 2740
},
{
"epoch": 13.47162904209884,
"grad_norm": 2.74519944190979,
"learning_rate": 0.00017379310344827587,
"loss": 0.7654,
"step": 2760
},
{
"epoch": 13.569249542403904,
"grad_norm": 2.9242568016052246,
"learning_rate": 0.0001735960591133005,
"loss": 0.7875,
"step": 2780
},
{
"epoch": 13.666870042708968,
"grad_norm": 2.5848984718322754,
"learning_rate": 0.00017339901477832515,
"loss": 0.7594,
"step": 2800
},
{
"epoch": 13.764490543014032,
"grad_norm": 3.9295613765716553,
"learning_rate": 0.00017320197044334975,
"loss": 0.75,
"step": 2820
},
{
"epoch": 13.862111043319096,
"grad_norm": 3.6406261920928955,
"learning_rate": 0.0001730049261083744,
"loss": 0.8149,
"step": 2840
},
{
"epoch": 13.95973154362416,
"grad_norm": 3.069199323654175,
"learning_rate": 0.000172807881773399,
"loss": 0.8217,
"step": 2860
},
{
"epoch": 14.057352043929225,
"grad_norm": 2.788712739944458,
"learning_rate": 0.00017261083743842366,
"loss": 0.7755,
"step": 2880
},
{
"epoch": 14.154972544234289,
"grad_norm": 3.468480110168457,
"learning_rate": 0.00017241379310344826,
"loss": 0.7071,
"step": 2900
},
{
"epoch": 14.252593044539353,
"grad_norm": 2.899951696395874,
"learning_rate": 0.00017221674876847292,
"loss": 0.7368,
"step": 2920
},
{
"epoch": 14.350213544844417,
"grad_norm": 3.6109790802001953,
"learning_rate": 0.00017201970443349754,
"loss": 0.7012,
"step": 2940
},
{
"epoch": 14.44783404514948,
"grad_norm": 3.448408842086792,
"learning_rate": 0.00017182266009852217,
"loss": 0.743,
"step": 2960
},
{
"epoch": 14.545454545454545,
"grad_norm": 2.819427013397217,
"learning_rate": 0.0001716256157635468,
"loss": 0.7552,
"step": 2980
},
{
"epoch": 14.643075045759609,
"grad_norm": 4.412954807281494,
"learning_rate": 0.00017142857142857143,
"loss": 0.7838,
"step": 3000
},
{
"epoch": 14.740695546064673,
"grad_norm": 2.7720842361450195,
"learning_rate": 0.00017123152709359608,
"loss": 0.7589,
"step": 3020
},
{
"epoch": 14.838316046369737,
"grad_norm": 3.3187596797943115,
"learning_rate": 0.0001710344827586207,
"loss": 0.7812,
"step": 3040
},
{
"epoch": 14.935936546674801,
"grad_norm": 2.3551273345947266,
"learning_rate": 0.00017083743842364533,
"loss": 0.764,
"step": 3060
},
{
"epoch": 15.033557046979865,
"grad_norm": 2.663290023803711,
"learning_rate": 0.00017064039408866996,
"loss": 0.7034,
"step": 3080
},
{
"epoch": 15.13117754728493,
"grad_norm": 3.2227704524993896,
"learning_rate": 0.0001704433497536946,
"loss": 0.6878,
"step": 3100
},
{
"epoch": 15.228798047589994,
"grad_norm": 2.819664478302002,
"learning_rate": 0.00017024630541871921,
"loss": 0.6731,
"step": 3120
},
{
"epoch": 15.326418547895058,
"grad_norm": 2.9787933826446533,
"learning_rate": 0.00017004926108374387,
"loss": 0.7036,
"step": 3140
},
{
"epoch": 15.424039048200122,
"grad_norm": 2.4379117488861084,
"learning_rate": 0.00016985221674876847,
"loss": 0.7323,
"step": 3160
},
{
"epoch": 15.521659548505186,
"grad_norm": 1.9959620237350464,
"learning_rate": 0.00016965517241379312,
"loss": 0.7155,
"step": 3180
},
{
"epoch": 15.61928004881025,
"grad_norm": 2.856109619140625,
"learning_rate": 0.00016945812807881772,
"loss": 0.6876,
"step": 3200
},
{
"epoch": 15.716900549115314,
"grad_norm": 3.9589807987213135,
"learning_rate": 0.00016926108374384238,
"loss": 0.7316,
"step": 3220
},
{
"epoch": 15.814521049420378,
"grad_norm": 2.921196460723877,
"learning_rate": 0.000169064039408867,
"loss": 0.7306,
"step": 3240
},
{
"epoch": 15.912141549725442,
"grad_norm": 2.862910270690918,
"learning_rate": 0.00016886699507389163,
"loss": 0.7829,
"step": 3260
},
{
"epoch": 16.009762050030506,
"grad_norm": 2.988609552383423,
"learning_rate": 0.00016866995073891626,
"loss": 0.75,
"step": 3280
},
{
"epoch": 16.107382550335572,
"grad_norm": 3.728930950164795,
"learning_rate": 0.00016847290640394089,
"loss": 0.6083,
"step": 3300
},
{
"epoch": 16.205003050640634,
"grad_norm": 3.5626068115234375,
"learning_rate": 0.00016827586206896554,
"loss": 0.6849,
"step": 3320
},
{
"epoch": 16.3026235509457,
"grad_norm": 2.754389524459839,
"learning_rate": 0.00016807881773399014,
"loss": 0.6635,
"step": 3340
},
{
"epoch": 16.400244051250763,
"grad_norm": 3.2776389122009277,
"learning_rate": 0.0001678817733990148,
"loss": 0.6999,
"step": 3360
},
{
"epoch": 16.49786455155583,
"grad_norm": 3.0710105895996094,
"learning_rate": 0.00016768472906403942,
"loss": 0.6911,
"step": 3380
},
{
"epoch": 16.59548505186089,
"grad_norm": 3.1727585792541504,
"learning_rate": 0.00016748768472906405,
"loss": 0.7238,
"step": 3400
},
{
"epoch": 16.693105552165953,
"grad_norm": 2.671583652496338,
"learning_rate": 0.00016729064039408868,
"loss": 0.6925,
"step": 3420
},
{
"epoch": 16.79072605247102,
"grad_norm": 2.9183971881866455,
"learning_rate": 0.0001670935960591133,
"loss": 0.703,
"step": 3440
},
{
"epoch": 16.888346552776085,
"grad_norm": 3.785710334777832,
"learning_rate": 0.00016689655172413793,
"loss": 0.7245,
"step": 3460
},
{
"epoch": 16.985967053081147,
"grad_norm": 3.435655355453491,
"learning_rate": 0.00016669950738916258,
"loss": 0.7483,
"step": 3480
},
{
"epoch": 17.08358755338621,
"grad_norm": 3.7350969314575195,
"learning_rate": 0.00016650246305418718,
"loss": 0.639,
"step": 3500
},
{
"epoch": 17.181208053691275,
"grad_norm": 3.0420546531677246,
"learning_rate": 0.00016630541871921184,
"loss": 0.675,
"step": 3520
},
{
"epoch": 17.278828553996338,
"grad_norm": 2.1023027896881104,
"learning_rate": 0.00016610837438423646,
"loss": 0.6857,
"step": 3540
},
{
"epoch": 17.376449054301403,
"grad_norm": 2.282754898071289,
"learning_rate": 0.0001659113300492611,
"loss": 0.7028,
"step": 3560
},
{
"epoch": 17.474069554606466,
"grad_norm": 4.962581634521484,
"learning_rate": 0.00016571428571428575,
"loss": 0.6297,
"step": 3580
},
{
"epoch": 17.57169005491153,
"grad_norm": 2.602381944656372,
"learning_rate": 0.00016551724137931035,
"loss": 0.7003,
"step": 3600
},
{
"epoch": 17.669310555216594,
"grad_norm": 4.691868782043457,
"learning_rate": 0.000165320197044335,
"loss": 0.6993,
"step": 3620
},
{
"epoch": 17.76693105552166,
"grad_norm": 3.7989959716796875,
"learning_rate": 0.0001651231527093596,
"loss": 0.6644,
"step": 3640
},
{
"epoch": 17.864551555826722,
"grad_norm": 3.188518524169922,
"learning_rate": 0.00016492610837438425,
"loss": 0.6713,
"step": 3660
},
{
"epoch": 17.962172056131788,
"grad_norm": 3.8618476390838623,
"learning_rate": 0.00016472906403940885,
"loss": 0.6652,
"step": 3680
},
{
"epoch": 18.05979255643685,
"grad_norm": 3.6163158416748047,
"learning_rate": 0.0001645320197044335,
"loss": 0.667,
"step": 3700
},
{
"epoch": 18.157413056741916,
"grad_norm": 3.723688840866089,
"learning_rate": 0.00016433497536945814,
"loss": 0.6456,
"step": 3720
},
{
"epoch": 18.25503355704698,
"grad_norm": 4.452234268188477,
"learning_rate": 0.00016413793103448276,
"loss": 0.627,
"step": 3740
},
{
"epoch": 18.352654057352044,
"grad_norm": 3.0752596855163574,
"learning_rate": 0.0001639408866995074,
"loss": 0.6755,
"step": 3760
},
{
"epoch": 18.450274557657107,
"grad_norm": 3.043836832046509,
"learning_rate": 0.00016374384236453202,
"loss": 0.6861,
"step": 3780
},
{
"epoch": 18.547895057962172,
"grad_norm": 4.210402011871338,
"learning_rate": 0.00016354679802955667,
"loss": 0.6206,
"step": 3800
},
{
"epoch": 18.645515558267235,
"grad_norm": 3.4578044414520264,
"learning_rate": 0.0001633497536945813,
"loss": 0.633,
"step": 3820
},
{
"epoch": 18.7431360585723,
"grad_norm": 3.9487128257751465,
"learning_rate": 0.00016315270935960593,
"loss": 0.6479,
"step": 3840
},
{
"epoch": 18.840756558877363,
"grad_norm": 3.114673376083374,
"learning_rate": 0.00016295566502463055,
"loss": 0.6468,
"step": 3860
},
{
"epoch": 18.93837705918243,
"grad_norm": 3.7751824855804443,
"learning_rate": 0.00016275862068965518,
"loss": 0.6695,
"step": 3880
},
{
"epoch": 19.03599755948749,
"grad_norm": 2.7188830375671387,
"learning_rate": 0.0001625615763546798,
"loss": 0.6507,
"step": 3900
},
{
"epoch": 19.133618059792557,
"grad_norm": 3.5054094791412354,
"learning_rate": 0.00016236453201970446,
"loss": 0.5542,
"step": 3920
},
{
"epoch": 19.23123856009762,
"grad_norm": 2.4097495079040527,
"learning_rate": 0.00016216748768472906,
"loss": 0.602,
"step": 3940
},
{
"epoch": 19.328859060402685,
"grad_norm": 2.925482749938965,
"learning_rate": 0.00016197044334975372,
"loss": 0.6493,
"step": 3960
},
{
"epoch": 19.426479560707747,
"grad_norm": 4.706211566925049,
"learning_rate": 0.00016177339901477832,
"loss": 0.6285,
"step": 3980
},
{
"epoch": 19.524100061012813,
"grad_norm": 3.257904052734375,
"learning_rate": 0.00016157635467980297,
"loss": 0.6515,
"step": 4000
},
{
"epoch": 19.621720561317876,
"grad_norm": 3.0172128677368164,
"learning_rate": 0.0001613793103448276,
"loss": 0.6426,
"step": 4020
},
{
"epoch": 19.71934106162294,
"grad_norm": 2.948984146118164,
"learning_rate": 0.00016118226600985222,
"loss": 0.6487,
"step": 4040
},
{
"epoch": 19.816961561928004,
"grad_norm": 3.070138931274414,
"learning_rate": 0.00016098522167487685,
"loss": 0.6695,
"step": 4060
},
{
"epoch": 19.91458206223307,
"grad_norm": 3.364335060119629,
"learning_rate": 0.00016078817733990148,
"loss": 0.6443,
"step": 4080
},
{
"epoch": 20.012202562538132,
"grad_norm": 3.131267547607422,
"learning_rate": 0.00016059113300492613,
"loss": 0.6403,
"step": 4100
},
{
"epoch": 20.109823062843198,
"grad_norm": 2.4083542823791504,
"learning_rate": 0.00016039408866995073,
"loss": 0.5922,
"step": 4120
},
{
"epoch": 20.20744356314826,
"grad_norm": 4.872425556182861,
"learning_rate": 0.00016019704433497539,
"loss": 0.6166,
"step": 4140
},
{
"epoch": 20.305064063453326,
"grad_norm": 6.9143853187561035,
"learning_rate": 0.00016,
"loss": 0.6023,
"step": 4160
},
{
"epoch": 20.40268456375839,
"grad_norm": 2.4565210342407227,
"learning_rate": 0.00015980295566502464,
"loss": 0.6154,
"step": 4180
},
{
"epoch": 20.500305064063454,
"grad_norm": 2.886202096939087,
"learning_rate": 0.00015960591133004927,
"loss": 0.5861,
"step": 4200
},
{
"epoch": 20.597925564368516,
"grad_norm": 3.0811331272125244,
"learning_rate": 0.0001594088669950739,
"loss": 0.6445,
"step": 4220
},
{
"epoch": 20.695546064673582,
"grad_norm": 3.5066580772399902,
"learning_rate": 0.00015921182266009852,
"loss": 0.6133,
"step": 4240
},
{
"epoch": 20.793166564978645,
"grad_norm": 3.8073158264160156,
"learning_rate": 0.00015901477832512318,
"loss": 0.6133,
"step": 4260
},
{
"epoch": 20.89078706528371,
"grad_norm": 4.436833381652832,
"learning_rate": 0.00015881773399014778,
"loss": 0.6243,
"step": 4280
},
{
"epoch": 20.988407565588773,
"grad_norm": 2.7935214042663574,
"learning_rate": 0.00015862068965517243,
"loss": 0.6349,
"step": 4300
},
{
"epoch": 21.08602806589384,
"grad_norm": 3.224860668182373,
"learning_rate": 0.00015842364532019706,
"loss": 0.5906,
"step": 4320
},
{
"epoch": 21.1836485661989,
"grad_norm": 2.9267752170562744,
"learning_rate": 0.00015822660098522168,
"loss": 0.5512,
"step": 4340
},
{
"epoch": 21.281269066503967,
"grad_norm": 3.137066125869751,
"learning_rate": 0.0001580295566502463,
"loss": 0.5764,
"step": 4360
},
{
"epoch": 21.37888956680903,
"grad_norm": 3.112293004989624,
"learning_rate": 0.00015783251231527094,
"loss": 0.6045,
"step": 4380
},
{
"epoch": 21.476510067114095,
"grad_norm": 2.6162259578704834,
"learning_rate": 0.0001576354679802956,
"loss": 0.6009,
"step": 4400
},
{
"epoch": 21.574130567419157,
"grad_norm": 2.924473285675049,
"learning_rate": 0.0001574384236453202,
"loss": 0.589,
"step": 4420
},
{
"epoch": 21.671751067724223,
"grad_norm": 3.2589287757873535,
"learning_rate": 0.00015724137931034485,
"loss": 0.6078,
"step": 4440
},
{
"epoch": 21.769371568029285,
"grad_norm": 3.4130911827087402,
"learning_rate": 0.00015704433497536945,
"loss": 0.6177,
"step": 4460
},
{
"epoch": 21.86699206833435,
"grad_norm": 3.0816001892089844,
"learning_rate": 0.0001568472906403941,
"loss": 0.6077,
"step": 4480
},
{
"epoch": 21.964612568639414,
"grad_norm": 2.875441789627075,
"learning_rate": 0.00015665024630541873,
"loss": 0.6127,
"step": 4500
},
{
"epoch": 22.06223306894448,
"grad_norm": 4.020274639129639,
"learning_rate": 0.00015645320197044335,
"loss": 0.5673,
"step": 4520
},
{
"epoch": 22.15985356924954,
"grad_norm": 3.365691661834717,
"learning_rate": 0.00015625615763546798,
"loss": 0.5201,
"step": 4540
},
{
"epoch": 22.257474069554608,
"grad_norm": 3.449277400970459,
"learning_rate": 0.0001560591133004926,
"loss": 0.5657,
"step": 4560
},
{
"epoch": 22.35509456985967,
"grad_norm": 3.7012288570404053,
"learning_rate": 0.00015586206896551724,
"loss": 0.6035,
"step": 4580
},
{
"epoch": 22.452715070164736,
"grad_norm": 3.5211081504821777,
"learning_rate": 0.0001556650246305419,
"loss": 0.6173,
"step": 4600
},
{
"epoch": 22.550335570469798,
"grad_norm": 3.026588201522827,
"learning_rate": 0.00015546798029556652,
"loss": 0.6004,
"step": 4620
},
{
"epoch": 22.647956070774864,
"grad_norm": 2.7548885345458984,
"learning_rate": 0.00015527093596059114,
"loss": 0.5633,
"step": 4640
},
{
"epoch": 22.745576571079926,
"grad_norm": 5.050055027008057,
"learning_rate": 0.00015507389162561577,
"loss": 0.6061,
"step": 4660
},
{
"epoch": 22.843197071384992,
"grad_norm": 3.0278573036193848,
"learning_rate": 0.0001548768472906404,
"loss": 0.5607,
"step": 4680
},
{
"epoch": 22.940817571690054,
"grad_norm": 3.17149019241333,
"learning_rate": 0.00015467980295566505,
"loss": 0.5829,
"step": 4700
},
{
"epoch": 23.03843807199512,
"grad_norm": 2.5521585941314697,
"learning_rate": 0.00015448275862068965,
"loss": 0.5723,
"step": 4720
},
{
"epoch": 23.136058572300183,
"grad_norm": 2.7798378467559814,
"learning_rate": 0.0001542857142857143,
"loss": 0.5373,
"step": 4740
},
{
"epoch": 23.23367907260525,
"grad_norm": 3.4025466442108154,
"learning_rate": 0.0001540886699507389,
"loss": 0.5445,
"step": 4760
},
{
"epoch": 23.33129957291031,
"grad_norm": 3.9419145584106445,
"learning_rate": 0.00015389162561576356,
"loss": 0.5677,
"step": 4780
},
{
"epoch": 23.428920073215377,
"grad_norm": 2.300863265991211,
"learning_rate": 0.00015369458128078816,
"loss": 0.5941,
"step": 4800
},
{
"epoch": 23.52654057352044,
"grad_norm": 3.25654673576355,
"learning_rate": 0.00015349753694581282,
"loss": 0.5688,
"step": 4820
},
{
"epoch": 23.624161073825505,
"grad_norm": 3.1517579555511475,
"learning_rate": 0.00015330049261083744,
"loss": 0.5481,
"step": 4840
},
{
"epoch": 23.721781574130567,
"grad_norm": 2.5366251468658447,
"learning_rate": 0.00015310344827586207,
"loss": 0.5725,
"step": 4860
},
{
"epoch": 23.819402074435633,
"grad_norm": 4.309774875640869,
"learning_rate": 0.00015290640394088672,
"loss": 0.574,
"step": 4880
},
{
"epoch": 23.917022574740695,
"grad_norm": 3.031926155090332,
"learning_rate": 0.00015270935960591132,
"loss": 0.5431,
"step": 4900
},
{
"epoch": 24.01464307504576,
"grad_norm": 2.574500560760498,
"learning_rate": 0.00015251231527093598,
"loss": 0.5967,
"step": 4920
},
{
"epoch": 24.112263575350823,
"grad_norm": 2.556105136871338,
"learning_rate": 0.0001523152709359606,
"loss": 0.5419,
"step": 4940
},
{
"epoch": 24.20988407565589,
"grad_norm": 2.412322998046875,
"learning_rate": 0.00015211822660098523,
"loss": 0.5342,
"step": 4960
},
{
"epoch": 24.30750457596095,
"grad_norm": 2.39802622795105,
"learning_rate": 0.00015192118226600986,
"loss": 0.5249,
"step": 4980
},
{
"epoch": 24.405125076266017,
"grad_norm": 2.854398727416992,
"learning_rate": 0.00015172413793103449,
"loss": 0.5468,
"step": 5000
},
{
"epoch": 24.50274557657108,
"grad_norm": 2.8961057662963867,
"learning_rate": 0.0001515270935960591,
"loss": 0.5313,
"step": 5020
},
{
"epoch": 24.600366076876146,
"grad_norm": 3.2031073570251465,
"learning_rate": 0.00015133004926108377,
"loss": 0.5718,
"step": 5040
},
{
"epoch": 24.697986577181208,
"grad_norm": 4.338870525360107,
"learning_rate": 0.00015113300492610837,
"loss": 0.5415,
"step": 5060
},
{
"epoch": 24.795607077486274,
"grad_norm": 3.46842360496521,
"learning_rate": 0.00015093596059113302,
"loss": 0.5546,
"step": 5080
},
{
"epoch": 24.893227577791336,
"grad_norm": 2.853489637374878,
"learning_rate": 0.00015073891625615765,
"loss": 0.5691,
"step": 5100
},
{
"epoch": 24.990848078096402,
"grad_norm": 3.427720785140991,
"learning_rate": 0.00015054187192118228,
"loss": 0.5795,
"step": 5120
},
{
"epoch": 25.088468578401464,
"grad_norm": 3.2862656116485596,
"learning_rate": 0.0001503448275862069,
"loss": 0.5109,
"step": 5140
},
{
"epoch": 25.18608907870653,
"grad_norm": 3.383563756942749,
"learning_rate": 0.00015014778325123153,
"loss": 0.4983,
"step": 5160
},
{
"epoch": 25.283709579011592,
"grad_norm": 3.3909354209899902,
"learning_rate": 0.00014995073891625618,
"loss": 0.5164,
"step": 5180
},
{
"epoch": 25.381330079316655,
"grad_norm": 2.616955041885376,
"learning_rate": 0.00014975369458128078,
"loss": 0.5347,
"step": 5200
},
{
"epoch": 25.47895057962172,
"grad_norm": 2.7965965270996094,
"learning_rate": 0.00014955665024630544,
"loss": 0.5386,
"step": 5220
},
{
"epoch": 25.576571079926783,
"grad_norm": 2.9817397594451904,
"learning_rate": 0.00014935960591133004,
"loss": 0.5001,
"step": 5240
},
{
"epoch": 25.67419158023185,
"grad_norm": 2.527992010116577,
"learning_rate": 0.0001491625615763547,
"loss": 0.5572,
"step": 5260
},
{
"epoch": 25.77181208053691,
"grad_norm": 4.047604560852051,
"learning_rate": 0.00014896551724137932,
"loss": 0.5429,
"step": 5280
},
{
"epoch": 25.869432580841977,
"grad_norm": 3.2753515243530273,
"learning_rate": 0.00014876847290640395,
"loss": 0.5461,
"step": 5300
},
{
"epoch": 25.96705308114704,
"grad_norm": 3.5623252391815186,
"learning_rate": 0.00014857142857142857,
"loss": 0.571,
"step": 5320
},
{
"epoch": 26.064673581452105,
"grad_norm": 4.602993965148926,
"learning_rate": 0.0001483743842364532,
"loss": 0.4858,
"step": 5340
},
{
"epoch": 26.162294081757167,
"grad_norm": 3.4932191371917725,
"learning_rate": 0.00014817733990147783,
"loss": 0.5374,
"step": 5360
},
{
"epoch": 26.259914582062233,
"grad_norm": 2.595555305480957,
"learning_rate": 0.00014798029556650248,
"loss": 0.5217,
"step": 5380
},
{
"epoch": 26.357535082367296,
"grad_norm": 2.3642492294311523,
"learning_rate": 0.0001477832512315271,
"loss": 0.5055,
"step": 5400
},
{
"epoch": 26.45515558267236,
"grad_norm": 3.9272634983062744,
"learning_rate": 0.00014758620689655174,
"loss": 0.5535,
"step": 5420
},
{
"epoch": 26.552776082977424,
"grad_norm": 4.050607204437256,
"learning_rate": 0.00014738916256157636,
"loss": 0.5019,
"step": 5440
},
{
"epoch": 26.65039658328249,
"grad_norm": 3.2770299911499023,
"learning_rate": 0.000147192118226601,
"loss": 0.4922,
"step": 5460
},
{
"epoch": 26.748017083587552,
"grad_norm": 3.96409273147583,
"learning_rate": 0.00014699507389162562,
"loss": 0.5165,
"step": 5480
},
{
"epoch": 26.845637583892618,
"grad_norm": 4.587811470031738,
"learning_rate": 0.00014679802955665024,
"loss": 0.5513,
"step": 5500
},
{
"epoch": 26.94325808419768,
"grad_norm": 4.558196067810059,
"learning_rate": 0.0001466009852216749,
"loss": 0.5227,
"step": 5520
},
{
"epoch": 27.040878584502746,
"grad_norm": 3.807441473007202,
"learning_rate": 0.0001464039408866995,
"loss": 0.5141,
"step": 5540
},
{
"epoch": 27.13849908480781,
"grad_norm": 2.2902328968048096,
"learning_rate": 0.00014620689655172415,
"loss": 0.4822,
"step": 5560
},
{
"epoch": 27.236119585112874,
"grad_norm": 4.3950886726379395,
"learning_rate": 0.00014600985221674875,
"loss": 0.5136,
"step": 5580
},
{
"epoch": 27.333740085417936,
"grad_norm": 4.0127482414245605,
"learning_rate": 0.0001458128078817734,
"loss": 0.5299,
"step": 5600
},
{
"epoch": 27.431360585723002,
"grad_norm": 4.659334182739258,
"learning_rate": 0.00014561576354679803,
"loss": 0.4764,
"step": 5620
},
{
"epoch": 27.528981086028065,
"grad_norm": 4.769715785980225,
"learning_rate": 0.00014541871921182266,
"loss": 0.5236,
"step": 5640
},
{
"epoch": 27.62660158633313,
"grad_norm": 3.8856427669525146,
"learning_rate": 0.00014522167487684732,
"loss": 0.5028,
"step": 5660
},
{
"epoch": 27.724222086638193,
"grad_norm": 3.183850049972534,
"learning_rate": 0.00014502463054187192,
"loss": 0.4945,
"step": 5680
},
{
"epoch": 27.82184258694326,
"grad_norm": 3.1610593795776367,
"learning_rate": 0.00014482758620689657,
"loss": 0.4963,
"step": 5700
},
{
"epoch": 27.91946308724832,
"grad_norm": 4.054819107055664,
"learning_rate": 0.0001446305418719212,
"loss": 0.547,
"step": 5720
},
{
"epoch": 28.017083587553387,
"grad_norm": 2.7358503341674805,
"learning_rate": 0.00014443349753694582,
"loss": 0.5387,
"step": 5740
},
{
"epoch": 28.11470408785845,
"grad_norm": 2.403042793273926,
"learning_rate": 0.00014423645320197045,
"loss": 0.4593,
"step": 5760
},
{
"epoch": 28.212324588163515,
"grad_norm": 3.3207452297210693,
"learning_rate": 0.00014403940886699508,
"loss": 0.4842,
"step": 5780
},
{
"epoch": 28.309945088468577,
"grad_norm": 3.0579757690429688,
"learning_rate": 0.0001438423645320197,
"loss": 0.4754,
"step": 5800
},
{
"epoch": 28.407565588773643,
"grad_norm": 4.5140700340271,
"learning_rate": 0.00014364532019704436,
"loss": 0.5128,
"step": 5820
},
{
"epoch": 28.505186089078705,
"grad_norm": 3.541874885559082,
"learning_rate": 0.00014344827586206896,
"loss": 0.5187,
"step": 5840
},
{
"epoch": 28.60280658938377,
"grad_norm": 3.214235782623291,
"learning_rate": 0.00014325123152709361,
"loss": 0.475,
"step": 5860
},
{
"epoch": 28.700427089688834,
"grad_norm": 4.037768363952637,
"learning_rate": 0.00014305418719211824,
"loss": 0.4733,
"step": 5880
},
{
"epoch": 28.7980475899939,
"grad_norm": 3.0469048023223877,
"learning_rate": 0.00014285714285714287,
"loss": 0.5181,
"step": 5900
},
{
"epoch": 28.89566809029896,
"grad_norm": 3.3396294116973877,
"learning_rate": 0.0001426600985221675,
"loss": 0.5062,
"step": 5920
},
{
"epoch": 28.993288590604028,
"grad_norm": 3.4280455112457275,
"learning_rate": 0.00014246305418719212,
"loss": 0.5232,
"step": 5940
},
{
"epoch": 29.09090909090909,
"grad_norm": 3.8690781593322754,
"learning_rate": 0.00014226600985221678,
"loss": 0.4744,
"step": 5960
},
{
"epoch": 29.188529591214156,
"grad_norm": 3.1680831909179688,
"learning_rate": 0.00014206896551724138,
"loss": 0.4679,
"step": 5980
},
{
"epoch": 29.286150091519218,
"grad_norm": 3.752593755722046,
"learning_rate": 0.00014187192118226603,
"loss": 0.444,
"step": 6000
},
{
"epoch": 29.383770591824284,
"grad_norm": 4.88236141204834,
"learning_rate": 0.00014167487684729063,
"loss": 0.4639,
"step": 6020
},
{
"epoch": 29.481391092129346,
"grad_norm": 3.7870137691497803,
"learning_rate": 0.00014147783251231528,
"loss": 0.4873,
"step": 6040
},
{
"epoch": 29.579011592434412,
"grad_norm": 3.091411590576172,
"learning_rate": 0.0001412807881773399,
"loss": 0.4834,
"step": 6060
},
{
"epoch": 29.676632092739474,
"grad_norm": 2.7498538494110107,
"learning_rate": 0.00014108374384236454,
"loss": 0.4846,
"step": 6080
},
{
"epoch": 29.77425259304454,
"grad_norm": 3.2043850421905518,
"learning_rate": 0.00014088669950738917,
"loss": 0.4983,
"step": 6100
},
{
"epoch": 29.871873093349603,
"grad_norm": 3.270357847213745,
"learning_rate": 0.0001406896551724138,
"loss": 0.4803,
"step": 6120
},
{
"epoch": 29.96949359365467,
"grad_norm": 3.031405210494995,
"learning_rate": 0.00014049261083743842,
"loss": 0.5287,
"step": 6140
},
{
"epoch": 30.06711409395973,
"grad_norm": 3.390765905380249,
"learning_rate": 0.00014029556650246307,
"loss": 0.4619,
"step": 6160
},
{
"epoch": 30.164734594264797,
"grad_norm": 3.2783963680267334,
"learning_rate": 0.0001400985221674877,
"loss": 0.4328,
"step": 6180
},
{
"epoch": 30.26235509456986,
"grad_norm": 3.6925759315490723,
"learning_rate": 0.00013990147783251233,
"loss": 0.487,
"step": 6200
},
{
"epoch": 30.359975594874925,
"grad_norm": 3.0115065574645996,
"learning_rate": 0.00013970443349753696,
"loss": 0.467,
"step": 6220
},
{
"epoch": 30.457596095179987,
"grad_norm": 4.561310291290283,
"learning_rate": 0.00013950738916256158,
"loss": 0.4801,
"step": 6240
},
{
"epoch": 30.555216595485053,
"grad_norm": 3.2879674434661865,
"learning_rate": 0.0001393103448275862,
"loss": 0.4638,
"step": 6260
},
{
"epoch": 30.652837095790115,
"grad_norm": 2.793945789337158,
"learning_rate": 0.00013911330049261084,
"loss": 0.463,
"step": 6280
},
{
"epoch": 30.75045759609518,
"grad_norm": 3.615793466567993,
"learning_rate": 0.0001389162561576355,
"loss": 0.4907,
"step": 6300
},
{
"epoch": 30.848078096400243,
"grad_norm": 3.160133123397827,
"learning_rate": 0.0001387192118226601,
"loss": 0.477,
"step": 6320
},
{
"epoch": 30.94569859670531,
"grad_norm": 3.62670636177063,
"learning_rate": 0.00013852216748768475,
"loss": 0.4945,
"step": 6340
},
{
"epoch": 31.04331909701037,
"grad_norm": 3.346158981323242,
"learning_rate": 0.00013832512315270935,
"loss": 0.4543,
"step": 6360
},
{
"epoch": 31.140939597315437,
"grad_norm": 2.8707423210144043,
"learning_rate": 0.000138128078817734,
"loss": 0.4352,
"step": 6380
},
{
"epoch": 31.2385600976205,
"grad_norm": 2.5617620944976807,
"learning_rate": 0.00013793103448275863,
"loss": 0.4611,
"step": 6400
},
{
"epoch": 31.336180597925566,
"grad_norm": 3.2273828983306885,
"learning_rate": 0.00013773399014778325,
"loss": 0.4593,
"step": 6420
},
{
"epoch": 31.433801098230628,
"grad_norm": 3.502797842025757,
"learning_rate": 0.00013753694581280788,
"loss": 0.4717,
"step": 6440
},
{
"epoch": 31.531421598535694,
"grad_norm": 3.9278218746185303,
"learning_rate": 0.0001373399014778325,
"loss": 0.4813,
"step": 6460
},
{
"epoch": 31.629042098840756,
"grad_norm": 3.013709545135498,
"learning_rate": 0.00013714285714285716,
"loss": 0.4305,
"step": 6480
},
{
"epoch": 31.726662599145822,
"grad_norm": 2.661198377609253,
"learning_rate": 0.0001369458128078818,
"loss": 0.4495,
"step": 6500
},
{
"epoch": 31.824283099450884,
"grad_norm": 2.6343297958374023,
"learning_rate": 0.00013674876847290642,
"loss": 0.4809,
"step": 6520
},
{
"epoch": 31.92190359975595,
"grad_norm": 6.334170818328857,
"learning_rate": 0.00013655172413793104,
"loss": 0.4576,
"step": 6540
},
{
"epoch": 32.01952410006101,
"grad_norm": 3.728727102279663,
"learning_rate": 0.00013635467980295567,
"loss": 0.5034,
"step": 6560
},
{
"epoch": 32.117144600366075,
"grad_norm": 2.0572702884674072,
"learning_rate": 0.0001361576354679803,
"loss": 0.4161,
"step": 6580
},
{
"epoch": 32.214765100671144,
"grad_norm": 2.7006356716156006,
"learning_rate": 0.00013596059113300492,
"loss": 0.4357,
"step": 6600
},
{
"epoch": 32.31238560097621,
"grad_norm": 3.526782989501953,
"learning_rate": 0.00013576354679802955,
"loss": 0.4367,
"step": 6620
},
{
"epoch": 32.41000610128127,
"grad_norm": 3.240647792816162,
"learning_rate": 0.0001355665024630542,
"loss": 0.4416,
"step": 6640
},
{
"epoch": 32.50762660158633,
"grad_norm": 2.965851306915283,
"learning_rate": 0.0001353694581280788,
"loss": 0.4649,
"step": 6660
},
{
"epoch": 32.6052471018914,
"grad_norm": 3.028812885284424,
"learning_rate": 0.00013517241379310346,
"loss": 0.4381,
"step": 6680
},
{
"epoch": 32.70286760219646,
"grad_norm": 4.041370391845703,
"learning_rate": 0.0001349753694581281,
"loss": 0.4671,
"step": 6700
},
{
"epoch": 32.800488102501525,
"grad_norm": 5.677656650543213,
"learning_rate": 0.00013477832512315271,
"loss": 0.4718,
"step": 6720
},
{
"epoch": 32.89810860280659,
"grad_norm": 3.1538727283477783,
"learning_rate": 0.00013458128078817737,
"loss": 0.4705,
"step": 6740
},
{
"epoch": 32.99572910311166,
"grad_norm": 3.8186867237091064,
"learning_rate": 0.00013438423645320197,
"loss": 0.4724,
"step": 6760
},
{
"epoch": 33.09334960341672,
"grad_norm": 2.8248584270477295,
"learning_rate": 0.00013418719211822662,
"loss": 0.4399,
"step": 6780
},
{
"epoch": 33.19097010372178,
"grad_norm": 2.2694895267486572,
"learning_rate": 0.00013399014778325122,
"loss": 0.4147,
"step": 6800
},
{
"epoch": 33.288590604026844,
"grad_norm": 3.305610418319702,
"learning_rate": 0.00013379310344827588,
"loss": 0.4028,
"step": 6820
},
{
"epoch": 33.38621110433191,
"grad_norm": 3.610136032104492,
"learning_rate": 0.0001335960591133005,
"loss": 0.4319,
"step": 6840
},
{
"epoch": 33.483831604636975,
"grad_norm": 3.4783689975738525,
"learning_rate": 0.00013339901477832513,
"loss": 0.4361,
"step": 6860
},
{
"epoch": 33.58145210494204,
"grad_norm": 3.0984203815460205,
"learning_rate": 0.00013320197044334976,
"loss": 0.4488,
"step": 6880
},
{
"epoch": 33.6790726052471,
"grad_norm": 3.1558122634887695,
"learning_rate": 0.00013300492610837438,
"loss": 0.4262,
"step": 6900
},
{
"epoch": 33.77669310555217,
"grad_norm": 4.813379764556885,
"learning_rate": 0.000132807881773399,
"loss": 0.452,
"step": 6920
},
{
"epoch": 33.87431360585723,
"grad_norm": 3.047551393508911,
"learning_rate": 0.00013261083743842364,
"loss": 0.4517,
"step": 6940
},
{
"epoch": 33.971934106162294,
"grad_norm": 3.0880701541900635,
"learning_rate": 0.0001324137931034483,
"loss": 0.5147,
"step": 6960
},
{
"epoch": 34.06955460646736,
"grad_norm": 2.824169874191284,
"learning_rate": 0.00013221674876847292,
"loss": 0.4017,
"step": 6980
},
{
"epoch": 34.16717510677242,
"grad_norm": 3.1136012077331543,
"learning_rate": 0.00013201970443349755,
"loss": 0.4291,
"step": 7000
},
{
"epoch": 34.26479560707749,
"grad_norm": 4.246958255767822,
"learning_rate": 0.00013182266009852217,
"loss": 0.4318,
"step": 7020
},
{
"epoch": 34.36241610738255,
"grad_norm": 2.4655661582946777,
"learning_rate": 0.0001316256157635468,
"loss": 0.4283,
"step": 7040
},
{
"epoch": 34.46003660768761,
"grad_norm": 4.322596549987793,
"learning_rate": 0.00013142857142857143,
"loss": 0.4323,
"step": 7060
},
{
"epoch": 34.557657107992675,
"grad_norm": 4.425800800323486,
"learning_rate": 0.00013123152709359608,
"loss": 0.4376,
"step": 7080
},
{
"epoch": 34.655277608297745,
"grad_norm": 3.796889305114746,
"learning_rate": 0.00013103448275862068,
"loss": 0.4276,
"step": 7100
},
{
"epoch": 34.75289810860281,
"grad_norm": 3.9222586154937744,
"learning_rate": 0.00013083743842364534,
"loss": 0.4658,
"step": 7120
},
{
"epoch": 34.85051860890787,
"grad_norm": 4.5007548332214355,
"learning_rate": 0.00013064039408866994,
"loss": 0.4293,
"step": 7140
},
{
"epoch": 34.94813910921293,
"grad_norm": 3.0858423709869385,
"learning_rate": 0.0001304433497536946,
"loss": 0.4214,
"step": 7160
},
{
"epoch": 35.045759609518,
"grad_norm": 3.586949586868286,
"learning_rate": 0.00013024630541871922,
"loss": 0.4199,
"step": 7180
},
{
"epoch": 35.14338010982306,
"grad_norm": 2.916937828063965,
"learning_rate": 0.00013004926108374385,
"loss": 0.4071,
"step": 7200
},
{
"epoch": 35.241000610128125,
"grad_norm": 3.1324169635772705,
"learning_rate": 0.00012985221674876847,
"loss": 0.4151,
"step": 7220
},
{
"epoch": 35.33862111043319,
"grad_norm": 2.8730344772338867,
"learning_rate": 0.0001296551724137931,
"loss": 0.3984,
"step": 7240
},
{
"epoch": 35.43624161073826,
"grad_norm": 3.0865273475646973,
"learning_rate": 0.00012945812807881775,
"loss": 0.4273,
"step": 7260
},
{
"epoch": 35.53386211104332,
"grad_norm": 4.397771835327148,
"learning_rate": 0.00012926108374384238,
"loss": 0.4232,
"step": 7280
},
{
"epoch": 35.63148261134838,
"grad_norm": 2.4203243255615234,
"learning_rate": 0.000129064039408867,
"loss": 0.4035,
"step": 7300
},
{
"epoch": 35.729103111653444,
"grad_norm": 2.94404673576355,
"learning_rate": 0.00012886699507389164,
"loss": 0.4332,
"step": 7320
},
{
"epoch": 35.82672361195851,
"grad_norm": 3.4141249656677246,
"learning_rate": 0.00012866995073891626,
"loss": 0.4484,
"step": 7340
},
{
"epoch": 35.924344112263576,
"grad_norm": 2.8227927684783936,
"learning_rate": 0.0001284729064039409,
"loss": 0.4509,
"step": 7360
},
{
"epoch": 36.02196461256864,
"grad_norm": 2.768937110900879,
"learning_rate": 0.00012827586206896552,
"loss": 0.4391,
"step": 7380
},
{
"epoch": 36.1195851128737,
"grad_norm": 4.155871391296387,
"learning_rate": 0.00012807881773399014,
"loss": 0.3954,
"step": 7400
},
{
"epoch": 36.21720561317877,
"grad_norm": 2.484731912612915,
"learning_rate": 0.0001278817733990148,
"loss": 0.4363,
"step": 7420
},
{
"epoch": 36.31482611348383,
"grad_norm": 2.7758595943450928,
"learning_rate": 0.0001276847290640394,
"loss": 0.4058,
"step": 7440
},
{
"epoch": 36.412446613788894,
"grad_norm": 3.9609923362731934,
"learning_rate": 0.00012748768472906405,
"loss": 0.3845,
"step": 7460
},
{
"epoch": 36.51006711409396,
"grad_norm": 3.963120222091675,
"learning_rate": 0.00012729064039408868,
"loss": 0.4301,
"step": 7480
},
{
"epoch": 36.607687614399026,
"grad_norm": 2.77718448638916,
"learning_rate": 0.0001270935960591133,
"loss": 0.4034,
"step": 7500
},
{
"epoch": 36.70530811470409,
"grad_norm": 3.6000113487243652,
"learning_rate": 0.00012689655172413793,
"loss": 0.4087,
"step": 7520
},
{
"epoch": 36.80292861500915,
"grad_norm": 3.4430975914001465,
"learning_rate": 0.00012669950738916256,
"loss": 0.4109,
"step": 7540
},
{
"epoch": 36.90054911531421,
"grad_norm": 3.3932645320892334,
"learning_rate": 0.00012650246305418721,
"loss": 0.4394,
"step": 7560
},
{
"epoch": 36.99816961561928,
"grad_norm": 4.054554462432861,
"learning_rate": 0.00012630541871921181,
"loss": 0.4203,
"step": 7580
},
{
"epoch": 37.095790115924345,
"grad_norm": 2.8766210079193115,
"learning_rate": 0.00012610837438423647,
"loss": 0.3861,
"step": 7600
},
{
"epoch": 37.19341061622941,
"grad_norm": 4.115131855010986,
"learning_rate": 0.0001259113300492611,
"loss": 0.4236,
"step": 7620
},
{
"epoch": 37.29103111653447,
"grad_norm": 2.776914358139038,
"learning_rate": 0.00012571428571428572,
"loss": 0.4244,
"step": 7640
},
{
"epoch": 37.38865161683954,
"grad_norm": 3.8428800106048584,
"learning_rate": 0.00012551724137931035,
"loss": 0.4028,
"step": 7660
},
{
"epoch": 37.4862721171446,
"grad_norm": 3.028683662414551,
"learning_rate": 0.00012532019704433498,
"loss": 0.4127,
"step": 7680
},
{
"epoch": 37.58389261744966,
"grad_norm": 2.678617477416992,
"learning_rate": 0.0001251231527093596,
"loss": 0.4251,
"step": 7700
},
{
"epoch": 37.681513117754726,
"grad_norm": 3.496917247772217,
"learning_rate": 0.00012492610837438423,
"loss": 0.404,
"step": 7720
},
{
"epoch": 37.779133618059795,
"grad_norm": 4.018653869628906,
"learning_rate": 0.00012472906403940889,
"loss": 0.4028,
"step": 7740
},
{
"epoch": 37.87675411836486,
"grad_norm": 3.317580223083496,
"learning_rate": 0.0001245320197044335,
"loss": 0.4032,
"step": 7760
},
{
"epoch": 37.97437461866992,
"grad_norm": 3.7693002223968506,
"learning_rate": 0.00012433497536945814,
"loss": 0.3935,
"step": 7780
},
{
"epoch": 38.07199511897498,
"grad_norm": 2.809558629989624,
"learning_rate": 0.00012413793103448277,
"loss": 0.4113,
"step": 7800
},
{
"epoch": 38.16961561928005,
"grad_norm": 3.2092092037200928,
"learning_rate": 0.0001239408866995074,
"loss": 0.4019,
"step": 7820
},
{
"epoch": 38.267236119585114,
"grad_norm": 3.3514404296875,
"learning_rate": 0.00012374384236453202,
"loss": 0.4013,
"step": 7840
},
{
"epoch": 38.364856619890176,
"grad_norm": 3.9514451026916504,
"learning_rate": 0.00012354679802955667,
"loss": 0.3889,
"step": 7860
},
{
"epoch": 38.46247712019524,
"grad_norm": 2.7896828651428223,
"learning_rate": 0.00012334975369458127,
"loss": 0.377,
"step": 7880
},
{
"epoch": 38.56009762050031,
"grad_norm": 3.522840738296509,
"learning_rate": 0.00012315270935960593,
"loss": 0.4158,
"step": 7900
},
{
"epoch": 38.65771812080537,
"grad_norm": 3.422250270843506,
"learning_rate": 0.00012295566502463053,
"loss": 0.3837,
"step": 7920
},
{
"epoch": 38.75533862111043,
"grad_norm": 3.0469913482666016,
"learning_rate": 0.00012275862068965518,
"loss": 0.4036,
"step": 7940
},
{
"epoch": 38.852959121415495,
"grad_norm": 2.904141664505005,
"learning_rate": 0.0001225615763546798,
"loss": 0.3928,
"step": 7960
},
{
"epoch": 38.950579621720564,
"grad_norm": 3.7538552284240723,
"learning_rate": 0.00012236453201970444,
"loss": 0.4092,
"step": 7980
},
{
"epoch": 39.04820012202563,
"grad_norm": 3.562114715576172,
"learning_rate": 0.00012216748768472906,
"loss": 0.3982,
"step": 8000
},
{
"epoch": 39.14582062233069,
"grad_norm": 2.4931962490081787,
"learning_rate": 0.00012197044334975369,
"loss": 0.3547,
"step": 8020
},
{
"epoch": 39.24344112263575,
"grad_norm": 2.461050271987915,
"learning_rate": 0.00012177339901477833,
"loss": 0.3762,
"step": 8040
},
{
"epoch": 39.34106162294082,
"grad_norm": 3.1320595741271973,
"learning_rate": 0.00012157635467980295,
"loss": 0.3907,
"step": 8060
},
{
"epoch": 39.43868212324588,
"grad_norm": 3.044754981994629,
"learning_rate": 0.00012137931034482759,
"loss": 0.4068,
"step": 8080
},
{
"epoch": 39.536302623550945,
"grad_norm": 2.9243273735046387,
"learning_rate": 0.00012118226600985223,
"loss": 0.3903,
"step": 8100
},
{
"epoch": 39.63392312385601,
"grad_norm": 4.234837055206299,
"learning_rate": 0.00012098522167487685,
"loss": 0.3841,
"step": 8120
},
{
"epoch": 39.73154362416108,
"grad_norm": 3.993495464324951,
"learning_rate": 0.00012078817733990148,
"loss": 0.4082,
"step": 8140
},
{
"epoch": 39.82916412446614,
"grad_norm": 3.8363142013549805,
"learning_rate": 0.00012059113300492611,
"loss": 0.3939,
"step": 8160
},
{
"epoch": 39.9267846247712,
"grad_norm": 4.398952007293701,
"learning_rate": 0.00012039408866995075,
"loss": 0.4145,
"step": 8180
},
{
"epoch": 40.024405125076264,
"grad_norm": 2.7002291679382324,
"learning_rate": 0.00012019704433497539,
"loss": 0.386,
"step": 8200
},
{
"epoch": 40.12202562538133,
"grad_norm": 3.1867945194244385,
"learning_rate": 0.00012,
"loss": 0.3924,
"step": 8220
},
{
"epoch": 40.219646125686396,
"grad_norm": 2.9179584980010986,
"learning_rate": 0.00011980295566502464,
"loss": 0.3741,
"step": 8240
},
{
"epoch": 40.31726662599146,
"grad_norm": 5.108730316162109,
"learning_rate": 0.00011960591133004926,
"loss": 0.371,
"step": 8260
},
{
"epoch": 40.41488712629652,
"grad_norm": 3.4418270587921143,
"learning_rate": 0.0001194088669950739,
"loss": 0.3845,
"step": 8280
},
{
"epoch": 40.51250762660159,
"grad_norm": 3.245562791824341,
"learning_rate": 0.00011921182266009854,
"loss": 0.375,
"step": 8300
},
{
"epoch": 40.61012812690665,
"grad_norm": 2.6644446849823,
"learning_rate": 0.00011901477832512315,
"loss": 0.3839,
"step": 8320
},
{
"epoch": 40.707748627211714,
"grad_norm": 4.975727558135986,
"learning_rate": 0.00011881773399014779,
"loss": 0.3889,
"step": 8340
},
{
"epoch": 40.80536912751678,
"grad_norm": 3.6427066326141357,
"learning_rate": 0.0001186206896551724,
"loss": 0.393,
"step": 8360
},
{
"epoch": 40.902989627821846,
"grad_norm": 3.7799060344696045,
"learning_rate": 0.00011842364532019705,
"loss": 0.3894,
"step": 8380
},
{
"epoch": 41.00061012812691,
"grad_norm": 4.170138835906982,
"learning_rate": 0.00011822660098522169,
"loss": 0.3965,
"step": 8400
},
{
"epoch": 41.09823062843197,
"grad_norm": 2.660006523132324,
"learning_rate": 0.00011802955665024631,
"loss": 0.3412,
"step": 8420
},
{
"epoch": 41.19585112873703,
"grad_norm": 3.9118030071258545,
"learning_rate": 0.00011783251231527096,
"loss": 0.3608,
"step": 8440
},
{
"epoch": 41.2934716290421,
"grad_norm": 4.68622350692749,
"learning_rate": 0.00011763546798029557,
"loss": 0.3742,
"step": 8460
},
{
"epoch": 41.391092129347165,
"grad_norm": 2.5423784255981445,
"learning_rate": 0.00011743842364532021,
"loss": 0.3901,
"step": 8480
},
{
"epoch": 41.48871262965223,
"grad_norm": 3.6446280479431152,
"learning_rate": 0.00011724137931034482,
"loss": 0.3518,
"step": 8500
},
{
"epoch": 41.58633312995729,
"grad_norm": 2.6701178550720215,
"learning_rate": 0.00011704433497536946,
"loss": 0.3809,
"step": 8520
},
{
"epoch": 41.68395363026236,
"grad_norm": 3.226100206375122,
"learning_rate": 0.0001168472906403941,
"loss": 0.3834,
"step": 8540
},
{
"epoch": 41.78157413056742,
"grad_norm": 3.4181952476501465,
"learning_rate": 0.00011665024630541872,
"loss": 0.4098,
"step": 8560
},
{
"epoch": 41.87919463087248,
"grad_norm": 2.9190330505371094,
"learning_rate": 0.00011645320197044336,
"loss": 0.3838,
"step": 8580
},
{
"epoch": 41.976815131177545,
"grad_norm": 4.082178115844727,
"learning_rate": 0.00011625615763546797,
"loss": 0.4109,
"step": 8600
},
{
"epoch": 42.074435631482615,
"grad_norm": 2.899162530899048,
"learning_rate": 0.00011605911330049261,
"loss": 0.3624,
"step": 8620
},
{
"epoch": 42.17205613178768,
"grad_norm": 2.4065990447998047,
"learning_rate": 0.00011586206896551725,
"loss": 0.3573,
"step": 8640
},
{
"epoch": 42.26967663209274,
"grad_norm": 2.818037509918213,
"learning_rate": 0.00011566502463054188,
"loss": 0.3699,
"step": 8660
},
{
"epoch": 42.3672971323978,
"grad_norm": 2.8875226974487305,
"learning_rate": 0.00011546798029556651,
"loss": 0.3489,
"step": 8680
},
{
"epoch": 42.464917632702864,
"grad_norm": 3.0840396881103516,
"learning_rate": 0.00011527093596059113,
"loss": 0.3733,
"step": 8700
},
{
"epoch": 42.56253813300793,
"grad_norm": 2.6554925441741943,
"learning_rate": 0.00011507389162561578,
"loss": 0.3541,
"step": 8720
},
{
"epoch": 42.660158633312996,
"grad_norm": 2.766045331954956,
"learning_rate": 0.00011487684729064042,
"loss": 0.3682,
"step": 8740
},
{
"epoch": 42.75777913361806,
"grad_norm": 3.0672762393951416,
"learning_rate": 0.00011467980295566503,
"loss": 0.3943,
"step": 8760
},
{
"epoch": 42.85539963392312,
"grad_norm": 2.898484468460083,
"learning_rate": 0.00011448275862068967,
"loss": 0.3702,
"step": 8780
},
{
"epoch": 42.95302013422819,
"grad_norm": 2.7023797035217285,
"learning_rate": 0.00011428571428571428,
"loss": 0.388,
"step": 8800
},
{
"epoch": 43.05064063453325,
"grad_norm": 2.4088499546051025,
"learning_rate": 0.00011408866995073892,
"loss": 0.3615,
"step": 8820
},
{
"epoch": 43.148261134838314,
"grad_norm": 2.3739655017852783,
"learning_rate": 0.00011389162561576354,
"loss": 0.3703,
"step": 8840
},
{
"epoch": 43.24588163514338,
"grad_norm": 3.2558271884918213,
"learning_rate": 0.00011369458128078818,
"loss": 0.3478,
"step": 8860
},
{
"epoch": 43.343502135448446,
"grad_norm": 2.931380271911621,
"learning_rate": 0.00011349753694581282,
"loss": 0.3553,
"step": 8880
},
{
"epoch": 43.44112263575351,
"grad_norm": 2.5165908336639404,
"learning_rate": 0.00011330049261083743,
"loss": 0.3495,
"step": 8900
},
{
"epoch": 43.53874313605857,
"grad_norm": 3.5619068145751953,
"learning_rate": 0.00011310344827586207,
"loss": 0.3692,
"step": 8920
},
{
"epoch": 43.63636363636363,
"grad_norm": 2.39534068107605,
"learning_rate": 0.0001129064039408867,
"loss": 0.3674,
"step": 8940
},
{
"epoch": 43.7339841366687,
"grad_norm": 3.495316505432129,
"learning_rate": 0.00011270935960591134,
"loss": 0.367,
"step": 8960
},
{
"epoch": 43.831604636973765,
"grad_norm": 2.8195016384124756,
"learning_rate": 0.00011251231527093598,
"loss": 0.411,
"step": 8980
},
{
"epoch": 43.92922513727883,
"grad_norm": 3.446014165878296,
"learning_rate": 0.0001123152709359606,
"loss": 0.3774,
"step": 9000
},
{
"epoch": 44.02684563758389,
"grad_norm": 3.0228703022003174,
"learning_rate": 0.00011211822660098524,
"loss": 0.3479,
"step": 9020
},
{
"epoch": 44.12446613788896,
"grad_norm": 4.042842864990234,
"learning_rate": 0.00011192118226600985,
"loss": 0.3567,
"step": 9040
},
{
"epoch": 44.22208663819402,
"grad_norm": 2.5165748596191406,
"learning_rate": 0.00011172413793103449,
"loss": 0.357,
"step": 9060
},
{
"epoch": 44.31970713849908,
"grad_norm": 2.9104301929473877,
"learning_rate": 0.00011152709359605913,
"loss": 0.3478,
"step": 9080
},
{
"epoch": 44.417327638804146,
"grad_norm": 5.000180244445801,
"learning_rate": 0.00011133004926108374,
"loss": 0.3372,
"step": 9100
},
{
"epoch": 44.514948139109215,
"grad_norm": 2.7573766708374023,
"learning_rate": 0.00011113300492610838,
"loss": 0.3574,
"step": 9120
},
{
"epoch": 44.61256863941428,
"grad_norm": 3.473818778991699,
"learning_rate": 0.000110935960591133,
"loss": 0.3666,
"step": 9140
},
{
"epoch": 44.71018913971934,
"grad_norm": 4.236100196838379,
"learning_rate": 0.00011073891625615764,
"loss": 0.3612,
"step": 9160
},
{
"epoch": 44.8078096400244,
"grad_norm": 5.279041290283203,
"learning_rate": 0.00011054187192118227,
"loss": 0.3694,
"step": 9180
},
{
"epoch": 44.90543014032947,
"grad_norm": 3.0009076595306396,
"learning_rate": 0.0001103448275862069,
"loss": 0.3629,
"step": 9200
},
{
"epoch": 45.003050640634534,
"grad_norm": 3.358452796936035,
"learning_rate": 0.00011014778325123153,
"loss": 0.3584,
"step": 9220
},
{
"epoch": 45.100671140939596,
"grad_norm": 2.9341399669647217,
"learning_rate": 0.00010995073891625616,
"loss": 0.3437,
"step": 9240
},
{
"epoch": 45.19829164124466,
"grad_norm": 3.1249337196350098,
"learning_rate": 0.0001097536945812808,
"loss": 0.3551,
"step": 9260
},
{
"epoch": 45.29591214154973,
"grad_norm": 2.4878969192504883,
"learning_rate": 0.00010955665024630541,
"loss": 0.3379,
"step": 9280
},
{
"epoch": 45.39353264185479,
"grad_norm": 3.114165782928467,
"learning_rate": 0.00010935960591133006,
"loss": 0.3616,
"step": 9300
},
{
"epoch": 45.49115314215985,
"grad_norm": 3.0727782249450684,
"learning_rate": 0.0001091625615763547,
"loss": 0.348,
"step": 9320
},
{
"epoch": 45.588773642464915,
"grad_norm": 2.9487972259521484,
"learning_rate": 0.00010896551724137931,
"loss": 0.3397,
"step": 9340
},
{
"epoch": 45.686394142769984,
"grad_norm": 3.0654473304748535,
"learning_rate": 0.00010876847290640395,
"loss": 0.3515,
"step": 9360
},
{
"epoch": 45.78401464307505,
"grad_norm": 4.303600311279297,
"learning_rate": 0.00010857142857142856,
"loss": 0.3586,
"step": 9380
},
{
"epoch": 45.88163514338011,
"grad_norm": 2.946246385574341,
"learning_rate": 0.0001083743842364532,
"loss": 0.3436,
"step": 9400
},
{
"epoch": 45.97925564368517,
"grad_norm": 2.4360456466674805,
"learning_rate": 0.00010817733990147785,
"loss": 0.3766,
"step": 9420
},
{
"epoch": 46.07687614399024,
"grad_norm": 2.8351433277130127,
"learning_rate": 0.00010798029556650246,
"loss": 0.3547,
"step": 9440
},
{
"epoch": 46.1744966442953,
"grad_norm": 2.6005990505218506,
"learning_rate": 0.0001077832512315271,
"loss": 0.3333,
"step": 9460
},
{
"epoch": 46.272117144600365,
"grad_norm": 2.52091121673584,
"learning_rate": 0.00010758620689655173,
"loss": 0.3507,
"step": 9480
},
{
"epoch": 46.36973764490543,
"grad_norm": 3.0750203132629395,
"learning_rate": 0.00010738916256157637,
"loss": 0.3376,
"step": 9500
},
{
"epoch": 46.4673581452105,
"grad_norm": 3.353597640991211,
"learning_rate": 0.00010719211822660098,
"loss": 0.3362,
"step": 9520
},
{
"epoch": 46.56497864551556,
"grad_norm": 3.786407232284546,
"learning_rate": 0.00010699507389162562,
"loss": 0.3774,
"step": 9540
},
{
"epoch": 46.66259914582062,
"grad_norm": 3.2476627826690674,
"learning_rate": 0.00010679802955665026,
"loss": 0.3423,
"step": 9560
},
{
"epoch": 46.760219646125684,
"grad_norm": 2.966078281402588,
"learning_rate": 0.00010660098522167488,
"loss": 0.3382,
"step": 9580
},
{
"epoch": 46.85784014643075,
"grad_norm": 3.7173826694488525,
"learning_rate": 0.00010640394088669952,
"loss": 0.3512,
"step": 9600
},
{
"epoch": 46.955460646735816,
"grad_norm": 3.6152524948120117,
"learning_rate": 0.00010620689655172413,
"loss": 0.3499,
"step": 9620
},
{
"epoch": 47.05308114704088,
"grad_norm": 3.6383986473083496,
"learning_rate": 0.00010600985221674877,
"loss": 0.3442,
"step": 9640
},
{
"epoch": 47.15070164734594,
"grad_norm": 2.636918306350708,
"learning_rate": 0.00010581280788177341,
"loss": 0.3355,
"step": 9660
},
{
"epoch": 47.24832214765101,
"grad_norm": 3.8844096660614014,
"learning_rate": 0.00010561576354679802,
"loss": 0.3389,
"step": 9680
},
{
"epoch": 47.34594264795607,
"grad_norm": 4.149389743804932,
"learning_rate": 0.00010541871921182267,
"loss": 0.3168,
"step": 9700
},
{
"epoch": 47.443563148261134,
"grad_norm": 3.205845832824707,
"learning_rate": 0.00010522167487684729,
"loss": 0.3247,
"step": 9720
},
{
"epoch": 47.5411836485662,
"grad_norm": 3.4177889823913574,
"learning_rate": 0.00010502463054187193,
"loss": 0.3472,
"step": 9740
},
{
"epoch": 47.638804148871266,
"grad_norm": 3.2508625984191895,
"learning_rate": 0.00010482758620689656,
"loss": 0.3354,
"step": 9760
},
{
"epoch": 47.73642464917633,
"grad_norm": 3.2071492671966553,
"learning_rate": 0.00010463054187192119,
"loss": 0.3515,
"step": 9780
},
{
"epoch": 47.83404514948139,
"grad_norm": 2.505859613418579,
"learning_rate": 0.00010443349753694583,
"loss": 0.3654,
"step": 9800
},
{
"epoch": 47.93166564978645,
"grad_norm": 3.092602491378784,
"learning_rate": 0.00010423645320197044,
"loss": 0.3551,
"step": 9820
},
{
"epoch": 48.02928615009152,
"grad_norm": 3.411740303039551,
"learning_rate": 0.00010403940886699508,
"loss": 0.3445,
"step": 9840
},
{
"epoch": 48.126906650396585,
"grad_norm": 2.587663412094116,
"learning_rate": 0.00010384236453201972,
"loss": 0.3132,
"step": 9860
},
{
"epoch": 48.22452715070165,
"grad_norm": 2.244938850402832,
"learning_rate": 0.00010364532019704434,
"loss": 0.3327,
"step": 9880
},
{
"epoch": 48.32214765100671,
"grad_norm": 3.426699638366699,
"learning_rate": 0.00010344827586206898,
"loss": 0.3163,
"step": 9900
},
{
"epoch": 48.41976815131178,
"grad_norm": 2.600964069366455,
"learning_rate": 0.00010325123152709359,
"loss": 0.3318,
"step": 9920
},
{
"epoch": 48.51738865161684,
"grad_norm": 2.5745320320129395,
"learning_rate": 0.00010305418719211823,
"loss": 0.3302,
"step": 9940
},
{
"epoch": 48.6150091519219,
"grad_norm": 2.9485421180725098,
"learning_rate": 0.00010285714285714286,
"loss": 0.3468,
"step": 9960
},
{
"epoch": 48.712629652226966,
"grad_norm": 2.783953905105591,
"learning_rate": 0.00010266009852216748,
"loss": 0.3339,
"step": 9980
},
{
"epoch": 48.810250152532035,
"grad_norm": 3.2114439010620117,
"learning_rate": 0.00010246305418719213,
"loss": 0.3496,
"step": 10000
},
{
"epoch": 48.9078706528371,
"grad_norm": 4.33662748336792,
"learning_rate": 0.00010226600985221675,
"loss": 0.3358,
"step": 10020
},
{
"epoch": 49.00549115314216,
"grad_norm": 2.714755058288574,
"learning_rate": 0.0001020689655172414,
"loss": 0.3677,
"step": 10040
},
{
"epoch": 49.10311165344722,
"grad_norm": 2.1904876232147217,
"learning_rate": 0.00010187192118226601,
"loss": 0.2878,
"step": 10060
},
{
"epoch": 49.20073215375229,
"grad_norm": 2.530484676361084,
"learning_rate": 0.00010167487684729065,
"loss": 0.3221,
"step": 10080
},
{
"epoch": 49.298352654057354,
"grad_norm": 3.1762654781341553,
"learning_rate": 0.00010147783251231529,
"loss": 0.3427,
"step": 10100
},
{
"epoch": 49.395973154362416,
"grad_norm": 3.0370638370513916,
"learning_rate": 0.0001012807881773399,
"loss": 0.3466,
"step": 10120
},
{
"epoch": 49.49359365466748,
"grad_norm": 2.5626463890075684,
"learning_rate": 0.00010108374384236454,
"loss": 0.3218,
"step": 10140
},
{
"epoch": 49.59121415497255,
"grad_norm": 3.4357545375823975,
"learning_rate": 0.00010088669950738916,
"loss": 0.3312,
"step": 10160
},
{
"epoch": 49.68883465527761,
"grad_norm": 2.810955762863159,
"learning_rate": 0.0001006896551724138,
"loss": 0.3363,
"step": 10180
},
{
"epoch": 49.78645515558267,
"grad_norm": 3.8722000122070312,
"learning_rate": 0.00010049261083743844,
"loss": 0.3251,
"step": 10200
},
{
"epoch": 49.884075655887735,
"grad_norm": 3.185521364212036,
"learning_rate": 0.00010029556650246305,
"loss": 0.3429,
"step": 10220
},
{
"epoch": 49.981696156192804,
"grad_norm": 2.707853078842163,
"learning_rate": 0.00010009852216748769,
"loss": 0.3548,
"step": 10240
},
{
"epoch": 50.079316656497866,
"grad_norm": 2.749464511871338,
"learning_rate": 9.990147783251232e-05,
"loss": 0.3294,
"step": 10260
},
{
"epoch": 50.17693715680293,
"grad_norm": 3.4640865325927734,
"learning_rate": 9.970443349753696e-05,
"loss": 0.3204,
"step": 10280
},
{
"epoch": 50.27455765710799,
"grad_norm": 3.4412505626678467,
"learning_rate": 9.950738916256159e-05,
"loss": 0.3316,
"step": 10300
},
{
"epoch": 50.37217815741306,
"grad_norm": 4.671158790588379,
"learning_rate": 9.931034482758621e-05,
"loss": 0.3092,
"step": 10320
},
{
"epoch": 50.46979865771812,
"grad_norm": 2.812875986099243,
"learning_rate": 9.911330049261084e-05,
"loss": 0.3217,
"step": 10340
},
{
"epoch": 50.567419158023185,
"grad_norm": 2.600764513015747,
"learning_rate": 9.891625615763547e-05,
"loss": 0.3525,
"step": 10360
},
{
"epoch": 50.66503965832825,
"grad_norm": 2.8875558376312256,
"learning_rate": 9.871921182266011e-05,
"loss": 0.3267,
"step": 10380
},
{
"epoch": 50.76266015863331,
"grad_norm": 2.479055643081665,
"learning_rate": 9.852216748768474e-05,
"loss": 0.3283,
"step": 10400
},
{
"epoch": 50.86028065893838,
"grad_norm": 3.4580044746398926,
"learning_rate": 9.832512315270936e-05,
"loss": 0.3388,
"step": 10420
},
{
"epoch": 50.95790115924344,
"grad_norm": 2.68265962600708,
"learning_rate": 9.812807881773399e-05,
"loss": 0.3309,
"step": 10440
},
{
"epoch": 51.0555216595485,
"grad_norm": 2.545677661895752,
"learning_rate": 9.793103448275862e-05,
"loss": 0.3221,
"step": 10460
},
{
"epoch": 51.153142159853566,
"grad_norm": 2.899627685546875,
"learning_rate": 9.773399014778326e-05,
"loss": 0.3084,
"step": 10480
},
{
"epoch": 51.250762660158635,
"grad_norm": 2.948960781097412,
"learning_rate": 9.753694581280788e-05,
"loss": 0.3273,
"step": 10500
},
{
"epoch": 51.3483831604637,
"grad_norm": 2.9379513263702393,
"learning_rate": 9.733990147783252e-05,
"loss": 0.3315,
"step": 10520
},
{
"epoch": 51.44600366076876,
"grad_norm": 2.543419599533081,
"learning_rate": 9.714285714285715e-05,
"loss": 0.3258,
"step": 10540
},
{
"epoch": 51.54362416107382,
"grad_norm": 2.7236459255218506,
"learning_rate": 9.694581280788178e-05,
"loss": 0.3129,
"step": 10560
},
{
"epoch": 51.64124466137889,
"grad_norm": 3.11745548248291,
"learning_rate": 9.67487684729064e-05,
"loss": 0.3038,
"step": 10580
},
{
"epoch": 51.738865161683954,
"grad_norm": 3.6259920597076416,
"learning_rate": 9.655172413793105e-05,
"loss": 0.3269,
"step": 10600
},
{
"epoch": 51.836485661989016,
"grad_norm": 3.4961044788360596,
"learning_rate": 9.635467980295567e-05,
"loss": 0.336,
"step": 10620
},
{
"epoch": 51.93410616229408,
"grad_norm": 3.01009202003479,
"learning_rate": 9.61576354679803e-05,
"loss": 0.3297,
"step": 10640
},
{
"epoch": 52.03172666259915,
"grad_norm": 3.047903060913086,
"learning_rate": 9.596059113300493e-05,
"loss": 0.3295,
"step": 10660
},
{
"epoch": 52.12934716290421,
"grad_norm": 2.8521170616149902,
"learning_rate": 9.576354679802955e-05,
"loss": 0.2952,
"step": 10680
},
{
"epoch": 52.22696766320927,
"grad_norm": 2.8909034729003906,
"learning_rate": 9.55665024630542e-05,
"loss": 0.3128,
"step": 10700
},
{
"epoch": 52.324588163514335,
"grad_norm": 3.2134296894073486,
"learning_rate": 9.536945812807882e-05,
"loss": 0.3175,
"step": 10720
},
{
"epoch": 52.422208663819404,
"grad_norm": 3.113543748855591,
"learning_rate": 9.517241379310345e-05,
"loss": 0.3305,
"step": 10740
},
{
"epoch": 52.51982916412447,
"grad_norm": 2.3091633319854736,
"learning_rate": 9.497536945812808e-05,
"loss": 0.3032,
"step": 10760
},
{
"epoch": 52.61744966442953,
"grad_norm": 2.7626681327819824,
"learning_rate": 9.477832512315272e-05,
"loss": 0.3071,
"step": 10780
},
{
"epoch": 52.71507016473459,
"grad_norm": 2.6978394985198975,
"learning_rate": 9.458128078817734e-05,
"loss": 0.3424,
"step": 10800
},
{
"epoch": 52.81269066503966,
"grad_norm": 4.549131393432617,
"learning_rate": 9.438423645320199e-05,
"loss": 0.3086,
"step": 10820
},
{
"epoch": 52.91031116534472,
"grad_norm": 3.3548974990844727,
"learning_rate": 9.418719211822661e-05,
"loss": 0.3414,
"step": 10840
},
{
"epoch": 53.007931665649785,
"grad_norm": 2.191990852355957,
"learning_rate": 9.399014778325124e-05,
"loss": 0.3195,
"step": 10860
},
{
"epoch": 53.10555216595485,
"grad_norm": 2.8169941902160645,
"learning_rate": 9.379310344827587e-05,
"loss": 0.2971,
"step": 10880
},
{
"epoch": 53.20317266625992,
"grad_norm": 2.4809463024139404,
"learning_rate": 9.35960591133005e-05,
"loss": 0.3032,
"step": 10900
},
{
"epoch": 53.30079316656498,
"grad_norm": 2.8981711864471436,
"learning_rate": 9.339901477832512e-05,
"loss": 0.3139,
"step": 10920
},
{
"epoch": 53.39841366687004,
"grad_norm": 2.901442050933838,
"learning_rate": 9.320197044334976e-05,
"loss": 0.3197,
"step": 10940
},
{
"epoch": 53.496034167175104,
"grad_norm": 3.1128933429718018,
"learning_rate": 9.300492610837439e-05,
"loss": 0.3109,
"step": 10960
},
{
"epoch": 53.59365466748017,
"grad_norm": 2.6892173290252686,
"learning_rate": 9.280788177339902e-05,
"loss": 0.3153,
"step": 10980
},
{
"epoch": 53.691275167785236,
"grad_norm": 3.1847739219665527,
"learning_rate": 9.261083743842364e-05,
"loss": 0.3135,
"step": 11000
},
{
"epoch": 53.7888956680903,
"grad_norm": 3.1111955642700195,
"learning_rate": 9.241379310344827e-05,
"loss": 0.3472,
"step": 11020
},
{
"epoch": 53.88651616839536,
"grad_norm": 2.667539119720459,
"learning_rate": 9.221674876847291e-05,
"loss": 0.3107,
"step": 11040
},
{
"epoch": 53.98413666870043,
"grad_norm": 2.1500725746154785,
"learning_rate": 9.201970443349755e-05,
"loss": 0.3192,
"step": 11060
},
{
"epoch": 54.08175716900549,
"grad_norm": 3.6513638496398926,
"learning_rate": 9.182266009852218e-05,
"loss": 0.2974,
"step": 11080
},
{
"epoch": 54.179377669310554,
"grad_norm": 3.226287364959717,
"learning_rate": 9.16256157635468e-05,
"loss": 0.3216,
"step": 11100
},
{
"epoch": 54.27699816961562,
"grad_norm": 3.4577550888061523,
"learning_rate": 9.142857142857143e-05,
"loss": 0.2999,
"step": 11120
},
{
"epoch": 54.374618669920686,
"grad_norm": 2.047478199005127,
"learning_rate": 9.123152709359606e-05,
"loss": 0.3139,
"step": 11140
},
{
"epoch": 54.47223917022575,
"grad_norm": 3.0338408946990967,
"learning_rate": 9.10344827586207e-05,
"loss": 0.2954,
"step": 11160
},
{
"epoch": 54.56985967053081,
"grad_norm": 2.6099050045013428,
"learning_rate": 9.083743842364533e-05,
"loss": 0.3218,
"step": 11180
},
{
"epoch": 54.66748017083587,
"grad_norm": 3.248973846435547,
"learning_rate": 9.064039408866995e-05,
"loss": 0.3243,
"step": 11200
},
{
"epoch": 54.76510067114094,
"grad_norm": 4.767118453979492,
"learning_rate": 9.044334975369458e-05,
"loss": 0.315,
"step": 11220
},
{
"epoch": 54.862721171446005,
"grad_norm": 2.872119188308716,
"learning_rate": 9.024630541871921e-05,
"loss": 0.3032,
"step": 11240
},
{
"epoch": 54.96034167175107,
"grad_norm": 3.499648094177246,
"learning_rate": 9.004926108374385e-05,
"loss": 0.3141,
"step": 11260
},
{
"epoch": 55.05796217205613,
"grad_norm": 3.0000522136688232,
"learning_rate": 8.985221674876848e-05,
"loss": 0.3153,
"step": 11280
},
{
"epoch": 55.1555826723612,
"grad_norm": 2.2861599922180176,
"learning_rate": 8.96551724137931e-05,
"loss": 0.3258,
"step": 11300
},
{
"epoch": 55.25320317266626,
"grad_norm": 2.980668306350708,
"learning_rate": 8.945812807881774e-05,
"loss": 0.3099,
"step": 11320
},
{
"epoch": 55.35082367297132,
"grad_norm": 2.286050319671631,
"learning_rate": 8.926108374384237e-05,
"loss": 0.2931,
"step": 11340
},
{
"epoch": 55.448444173276386,
"grad_norm": 4.078646659851074,
"learning_rate": 8.9064039408867e-05,
"loss": 0.3142,
"step": 11360
},
{
"epoch": 55.546064673581455,
"grad_norm": 2.150973320007324,
"learning_rate": 8.886699507389164e-05,
"loss": 0.2839,
"step": 11380
},
{
"epoch": 55.64368517388652,
"grad_norm": 2.671983242034912,
"learning_rate": 8.866995073891627e-05,
"loss": 0.2981,
"step": 11400
},
{
"epoch": 55.74130567419158,
"grad_norm": 3.199276924133301,
"learning_rate": 8.847290640394089e-05,
"loss": 0.304,
"step": 11420
},
{
"epoch": 55.83892617449664,
"grad_norm": 2.477468967437744,
"learning_rate": 8.827586206896552e-05,
"loss": 0.3288,
"step": 11440
},
{
"epoch": 55.93654667480171,
"grad_norm": 2.3130173683166504,
"learning_rate": 8.807881773399015e-05,
"loss": 0.321,
"step": 11460
},
{
"epoch": 56.034167175106774,
"grad_norm": 3.1496715545654297,
"learning_rate": 8.788177339901477e-05,
"loss": 0.2992,
"step": 11480
},
{
"epoch": 56.131787675411836,
"grad_norm": 3.3296494483947754,
"learning_rate": 8.768472906403941e-05,
"loss": 0.3023,
"step": 11500
},
{
"epoch": 56.2294081757169,
"grad_norm": 2.992814540863037,
"learning_rate": 8.748768472906404e-05,
"loss": 0.291,
"step": 11520
},
{
"epoch": 56.32702867602197,
"grad_norm": 2.981858015060425,
"learning_rate": 8.729064039408867e-05,
"loss": 0.2908,
"step": 11540
},
{
"epoch": 56.42464917632703,
"grad_norm": 3.968040704727173,
"learning_rate": 8.709359605911331e-05,
"loss": 0.2963,
"step": 11560
},
{
"epoch": 56.52226967663209,
"grad_norm": 3.6845455169677734,
"learning_rate": 8.689655172413794e-05,
"loss": 0.3137,
"step": 11580
},
{
"epoch": 56.619890176937155,
"grad_norm": 3.8928792476654053,
"learning_rate": 8.669950738916258e-05,
"loss": 0.2971,
"step": 11600
},
{
"epoch": 56.717510677242224,
"grad_norm": 2.064180374145508,
"learning_rate": 8.65024630541872e-05,
"loss": 0.3067,
"step": 11620
},
{
"epoch": 56.815131177547286,
"grad_norm": 2.8107266426086426,
"learning_rate": 8.630541871921183e-05,
"loss": 0.2972,
"step": 11640
},
{
"epoch": 56.91275167785235,
"grad_norm": 2.747004270553589,
"learning_rate": 8.610837438423646e-05,
"loss": 0.3183,
"step": 11660
},
{
"epoch": 57.01037217815741,
"grad_norm": 2.0700557231903076,
"learning_rate": 8.591133004926109e-05,
"loss": 0.3075,
"step": 11680
},
{
"epoch": 57.10799267846248,
"grad_norm": 3.1093757152557373,
"learning_rate": 8.571428571428571e-05,
"loss": 0.2756,
"step": 11700
},
{
"epoch": 57.20561317876754,
"grad_norm": 2.34448504447937,
"learning_rate": 8.551724137931035e-05,
"loss": 0.2898,
"step": 11720
},
{
"epoch": 57.303233679072605,
"grad_norm": 3.3790042400360107,
"learning_rate": 8.532019704433498e-05,
"loss": 0.3081,
"step": 11740
},
{
"epoch": 57.40085417937767,
"grad_norm": 2.700956106185913,
"learning_rate": 8.512315270935961e-05,
"loss": 0.2915,
"step": 11760
},
{
"epoch": 57.49847467968274,
"grad_norm": 2.6353628635406494,
"learning_rate": 8.492610837438423e-05,
"loss": 0.3063,
"step": 11780
},
{
"epoch": 57.5960951799878,
"grad_norm": 2.56706166267395,
"learning_rate": 8.472906403940886e-05,
"loss": 0.3005,
"step": 11800
},
{
"epoch": 57.69371568029286,
"grad_norm": 4.074772357940674,
"learning_rate": 8.45320197044335e-05,
"loss": 0.3007,
"step": 11820
},
{
"epoch": 57.79133618059792,
"grad_norm": 2.786485433578491,
"learning_rate": 8.433497536945813e-05,
"loss": 0.3141,
"step": 11840
},
{
"epoch": 57.88895668090299,
"grad_norm": 2.9513659477233887,
"learning_rate": 8.413793103448277e-05,
"loss": 0.3216,
"step": 11860
},
{
"epoch": 57.986577181208055,
"grad_norm": 3.126004219055176,
"learning_rate": 8.39408866995074e-05,
"loss": 0.3,
"step": 11880
},
{
"epoch": 58.08419768151312,
"grad_norm": 2.20534348487854,
"learning_rate": 8.374384236453202e-05,
"loss": 0.2891,
"step": 11900
},
{
"epoch": 58.18181818181818,
"grad_norm": 4.753482818603516,
"learning_rate": 8.354679802955665e-05,
"loss": 0.3019,
"step": 11920
},
{
"epoch": 58.27943868212325,
"grad_norm": 3.1038873195648193,
"learning_rate": 8.334975369458129e-05,
"loss": 0.283,
"step": 11940
},
{
"epoch": 58.37705918242831,
"grad_norm": 2.9366559982299805,
"learning_rate": 8.315270935960592e-05,
"loss": 0.302,
"step": 11960
},
{
"epoch": 58.474679682733374,
"grad_norm": 3.008777379989624,
"learning_rate": 8.295566502463055e-05,
"loss": 0.3256,
"step": 11980
},
{
"epoch": 58.572300183038436,
"grad_norm": 2.7105023860931396,
"learning_rate": 8.275862068965517e-05,
"loss": 0.2959,
"step": 12000
},
{
"epoch": 58.669920683343506,
"grad_norm": 2.762347936630249,
"learning_rate": 8.25615763546798e-05,
"loss": 0.2826,
"step": 12020
},
{
"epoch": 58.76754118364857,
"grad_norm": 2.8366870880126953,
"learning_rate": 8.236453201970443e-05,
"loss": 0.302,
"step": 12040
},
{
"epoch": 58.86516168395363,
"grad_norm": 2.721994400024414,
"learning_rate": 8.216748768472907e-05,
"loss": 0.2966,
"step": 12060
},
{
"epoch": 58.96278218425869,
"grad_norm": 2.988464117050171,
"learning_rate": 8.19704433497537e-05,
"loss": 0.2899,
"step": 12080
},
{
"epoch": 59.060402684563755,
"grad_norm": 2.6657352447509766,
"learning_rate": 8.177339901477834e-05,
"loss": 0.2889,
"step": 12100
},
{
"epoch": 59.158023184868824,
"grad_norm": 3.703511953353882,
"learning_rate": 8.157635467980296e-05,
"loss": 0.2794,
"step": 12120
},
{
"epoch": 59.25564368517389,
"grad_norm": 2.9937832355499268,
"learning_rate": 8.137931034482759e-05,
"loss": 0.2896,
"step": 12140
},
{
"epoch": 59.35326418547895,
"grad_norm": 3.188159704208374,
"learning_rate": 8.118226600985223e-05,
"loss": 0.2885,
"step": 12160
},
{
"epoch": 59.45088468578401,
"grad_norm": 2.8724703788757324,
"learning_rate": 8.098522167487686e-05,
"loss": 0.2959,
"step": 12180
},
{
"epoch": 59.54850518608908,
"grad_norm": 3.351435422897339,
"learning_rate": 8.078817733990148e-05,
"loss": 0.2867,
"step": 12200
},
{
"epoch": 59.64612568639414,
"grad_norm": 2.5625758171081543,
"learning_rate": 8.059113300492611e-05,
"loss": 0.3042,
"step": 12220
},
{
"epoch": 59.743746186699205,
"grad_norm": 3.3796396255493164,
"learning_rate": 8.039408866995074e-05,
"loss": 0.301,
"step": 12240
},
{
"epoch": 59.84136668700427,
"grad_norm": 2.787851572036743,
"learning_rate": 8.019704433497537e-05,
"loss": 0.3072,
"step": 12260
},
{
"epoch": 59.93898718730934,
"grad_norm": 2.9104974269866943,
"learning_rate": 8e-05,
"loss": 0.3059,
"step": 12280
},
{
"epoch": 60.0366076876144,
"grad_norm": 2.957249879837036,
"learning_rate": 7.980295566502463e-05,
"loss": 0.2965,
"step": 12300
},
{
"epoch": 60.13422818791946,
"grad_norm": 2.2982118129730225,
"learning_rate": 7.960591133004926e-05,
"loss": 0.2703,
"step": 12320
},
{
"epoch": 60.231848688224524,
"grad_norm": 3.548534870147705,
"learning_rate": 7.940886699507389e-05,
"loss": 0.2843,
"step": 12340
},
{
"epoch": 60.32946918852959,
"grad_norm": 2.3399384021759033,
"learning_rate": 7.921182266009853e-05,
"loss": 0.2855,
"step": 12360
},
{
"epoch": 60.427089688834656,
"grad_norm": 3.4186365604400635,
"learning_rate": 7.901477832512316e-05,
"loss": 0.2942,
"step": 12380
},
{
"epoch": 60.52471018913972,
"grad_norm": 2.572951316833496,
"learning_rate": 7.88177339901478e-05,
"loss": 0.2918,
"step": 12400
},
{
"epoch": 60.62233068944478,
"grad_norm": 2.1056010723114014,
"learning_rate": 7.862068965517242e-05,
"loss": 0.3051,
"step": 12420
},
{
"epoch": 60.71995118974985,
"grad_norm": 4.122783184051514,
"learning_rate": 7.842364532019705e-05,
"loss": 0.2811,
"step": 12440
},
{
"epoch": 60.81757169005491,
"grad_norm": 2.3634865283966064,
"learning_rate": 7.822660098522168e-05,
"loss": 0.3063,
"step": 12460
},
{
"epoch": 60.915192190359974,
"grad_norm": 3.362290143966675,
"learning_rate": 7.80295566502463e-05,
"loss": 0.2954,
"step": 12480
},
{
"epoch": 61.01281269066504,
"grad_norm": 4.63106632232666,
"learning_rate": 7.783251231527095e-05,
"loss": 0.2855,
"step": 12500
},
{
"epoch": 61.110433190970106,
"grad_norm": 3.6261041164398193,
"learning_rate": 7.763546798029557e-05,
"loss": 0.2792,
"step": 12520
},
{
"epoch": 61.20805369127517,
"grad_norm": 2.869415760040283,
"learning_rate": 7.74384236453202e-05,
"loss": 0.2833,
"step": 12540
},
{
"epoch": 61.30567419158023,
"grad_norm": 2.7370972633361816,
"learning_rate": 7.724137931034483e-05,
"loss": 0.2997,
"step": 12560
},
{
"epoch": 61.40329469188529,
"grad_norm": 3.5397825241088867,
"learning_rate": 7.704433497536945e-05,
"loss": 0.2799,
"step": 12580
},
{
"epoch": 61.50091519219036,
"grad_norm": 2.3903191089630127,
"learning_rate": 7.684729064039408e-05,
"loss": 0.2857,
"step": 12600
},
{
"epoch": 61.598535692495425,
"grad_norm": 3.3589389324188232,
"learning_rate": 7.665024630541872e-05,
"loss": 0.2823,
"step": 12620
},
{
"epoch": 61.69615619280049,
"grad_norm": 4.420291423797607,
"learning_rate": 7.645320197044336e-05,
"loss": 0.2895,
"step": 12640
},
{
"epoch": 61.79377669310555,
"grad_norm": 3.060859441757202,
"learning_rate": 7.625615763546799e-05,
"loss": 0.2859,
"step": 12660
},
{
"epoch": 61.89139719341062,
"grad_norm": 3.5927321910858154,
"learning_rate": 7.605911330049262e-05,
"loss": 0.2954,
"step": 12680
},
{
"epoch": 61.98901769371568,
"grad_norm": 2.7577738761901855,
"learning_rate": 7.586206896551724e-05,
"loss": 0.2832,
"step": 12700
},
{
"epoch": 62.08663819402074,
"grad_norm": 4.519462585449219,
"learning_rate": 7.566502463054188e-05,
"loss": 0.2695,
"step": 12720
},
{
"epoch": 62.184258694325806,
"grad_norm": 2.231842279434204,
"learning_rate": 7.546798029556651e-05,
"loss": 0.2894,
"step": 12740
},
{
"epoch": 62.281879194630875,
"grad_norm": 3.5176825523376465,
"learning_rate": 7.527093596059114e-05,
"loss": 0.2749,
"step": 12760
},
{
"epoch": 62.37949969493594,
"grad_norm": 3.319891929626465,
"learning_rate": 7.507389162561577e-05,
"loss": 0.2909,
"step": 12780
},
{
"epoch": 62.477120195241,
"grad_norm": 2.778862237930298,
"learning_rate": 7.487684729064039e-05,
"loss": 0.2816,
"step": 12800
},
{
"epoch": 62.57474069554606,
"grad_norm": 2.7136170864105225,
"learning_rate": 7.467980295566502e-05,
"loss": 0.286,
"step": 12820
},
{
"epoch": 62.67236119585113,
"grad_norm": 2.841850519180298,
"learning_rate": 7.448275862068966e-05,
"loss": 0.3078,
"step": 12840
},
{
"epoch": 62.769981696156194,
"grad_norm": 3.159632682800293,
"learning_rate": 7.428571428571429e-05,
"loss": 0.2693,
"step": 12860
},
{
"epoch": 62.867602196461256,
"grad_norm": 2.638611078262329,
"learning_rate": 7.408866995073891e-05,
"loss": 0.2838,
"step": 12880
},
{
"epoch": 62.96522269676632,
"grad_norm": 3.453857421875,
"learning_rate": 7.389162561576355e-05,
"loss": 0.2892,
"step": 12900
},
{
"epoch": 63.06284319707139,
"grad_norm": 3.6586861610412598,
"learning_rate": 7.369458128078818e-05,
"loss": 0.2626,
"step": 12920
},
{
"epoch": 63.16046369737645,
"grad_norm": 3.8204469680786133,
"learning_rate": 7.349753694581281e-05,
"loss": 0.2834,
"step": 12940
},
{
"epoch": 63.25808419768151,
"grad_norm": 1.7463505268096924,
"learning_rate": 7.330049261083745e-05,
"loss": 0.2909,
"step": 12960
},
{
"epoch": 63.355704697986575,
"grad_norm": 1.687853217124939,
"learning_rate": 7.310344827586208e-05,
"loss": 0.2892,
"step": 12980
},
{
"epoch": 63.453325198291644,
"grad_norm": 2.835196018218994,
"learning_rate": 7.29064039408867e-05,
"loss": 0.2763,
"step": 13000
},
{
"epoch": 63.550945698596706,
"grad_norm": 3.77742862701416,
"learning_rate": 7.270935960591133e-05,
"loss": 0.2834,
"step": 13020
},
{
"epoch": 63.64856619890177,
"grad_norm": 2.1246883869171143,
"learning_rate": 7.251231527093596e-05,
"loss": 0.2859,
"step": 13040
},
{
"epoch": 63.74618669920683,
"grad_norm": 3.592597246170044,
"learning_rate": 7.23152709359606e-05,
"loss": 0.2865,
"step": 13060
},
{
"epoch": 63.8438071995119,
"grad_norm": 2.8954873085021973,
"learning_rate": 7.211822660098523e-05,
"loss": 0.2855,
"step": 13080
},
{
"epoch": 63.94142769981696,
"grad_norm": 2.266686201095581,
"learning_rate": 7.192118226600985e-05,
"loss": 0.2814,
"step": 13100
},
{
"epoch": 64.03904820012202,
"grad_norm": 1.9330942630767822,
"learning_rate": 7.172413793103448e-05,
"loss": 0.2832,
"step": 13120
},
{
"epoch": 64.1366687004271,
"grad_norm": 4.008347511291504,
"learning_rate": 7.152709359605912e-05,
"loss": 0.2762,
"step": 13140
},
{
"epoch": 64.23428920073215,
"grad_norm": 2.2452552318573,
"learning_rate": 7.133004926108375e-05,
"loss": 0.269,
"step": 13160
},
{
"epoch": 64.33190970103722,
"grad_norm": 7.247570991516113,
"learning_rate": 7.113300492610839e-05,
"loss": 0.2652,
"step": 13180
},
{
"epoch": 64.42953020134229,
"grad_norm": 4.846076488494873,
"learning_rate": 7.093596059113302e-05,
"loss": 0.2766,
"step": 13200
},
{
"epoch": 64.52715070164734,
"grad_norm": 3.444746732711792,
"learning_rate": 7.073891625615764e-05,
"loss": 0.2789,
"step": 13220
},
{
"epoch": 64.62477120195241,
"grad_norm": 2.506460428237915,
"learning_rate": 7.054187192118227e-05,
"loss": 0.279,
"step": 13240
},
{
"epoch": 64.72239170225747,
"grad_norm": 3.3973569869995117,
"learning_rate": 7.03448275862069e-05,
"loss": 0.2887,
"step": 13260
},
{
"epoch": 64.82001220256254,
"grad_norm": 3.14697265625,
"learning_rate": 7.014778325123154e-05,
"loss": 0.2813,
"step": 13280
},
{
"epoch": 64.9176327028676,
"grad_norm": 4.694430828094482,
"learning_rate": 6.995073891625616e-05,
"loss": 0.3026,
"step": 13300
},
{
"epoch": 65.01525320317266,
"grad_norm": 2.2463550567626953,
"learning_rate": 6.975369458128079e-05,
"loss": 0.2739,
"step": 13320
},
{
"epoch": 65.11287370347773,
"grad_norm": 2.907592535018921,
"learning_rate": 6.955665024630542e-05,
"loss": 0.2783,
"step": 13340
},
{
"epoch": 65.2104942037828,
"grad_norm": 2.9708614349365234,
"learning_rate": 6.935960591133005e-05,
"loss": 0.2718,
"step": 13360
},
{
"epoch": 65.30811470408786,
"grad_norm": 2.7227044105529785,
"learning_rate": 6.916256157635467e-05,
"loss": 0.2615,
"step": 13380
},
{
"epoch": 65.40573520439293,
"grad_norm": 2.3960001468658447,
"learning_rate": 6.896551724137931e-05,
"loss": 0.2822,
"step": 13400
},
{
"epoch": 65.50335570469798,
"grad_norm": 2.032240629196167,
"learning_rate": 6.876847290640394e-05,
"loss": 0.282,
"step": 13420
},
{
"epoch": 65.60097620500305,
"grad_norm": 2.5334010124206543,
"learning_rate": 6.857142857142858e-05,
"loss": 0.2771,
"step": 13440
},
{
"epoch": 65.69859670530812,
"grad_norm": 7.930431842803955,
"learning_rate": 6.837438423645321e-05,
"loss": 0.2878,
"step": 13460
},
{
"epoch": 65.79621720561317,
"grad_norm": 2.709092378616333,
"learning_rate": 6.817733990147784e-05,
"loss": 0.2797,
"step": 13480
},
{
"epoch": 65.89383770591824,
"grad_norm": 4.455546855926514,
"learning_rate": 6.798029556650246e-05,
"loss": 0.2803,
"step": 13500
},
{
"epoch": 65.99145820622331,
"grad_norm": 4.6384077072143555,
"learning_rate": 6.77832512315271e-05,
"loss": 0.2764,
"step": 13520
},
{
"epoch": 66.08907870652837,
"grad_norm": 2.7529897689819336,
"learning_rate": 6.758620689655173e-05,
"loss": 0.2614,
"step": 13540
},
{
"epoch": 66.18669920683344,
"grad_norm": 2.0837860107421875,
"learning_rate": 6.738916256157636e-05,
"loss": 0.2696,
"step": 13560
},
{
"epoch": 66.2843197071385,
"grad_norm": 1.6655378341674805,
"learning_rate": 6.719211822660098e-05,
"loss": 0.2781,
"step": 13580
},
{
"epoch": 66.38194020744356,
"grad_norm": 1.8926398754119873,
"learning_rate": 6.699507389162561e-05,
"loss": 0.273,
"step": 13600
},
{
"epoch": 66.47956070774863,
"grad_norm": 1.8903833627700806,
"learning_rate": 6.679802955665025e-05,
"loss": 0.2683,
"step": 13620
},
{
"epoch": 66.57718120805369,
"grad_norm": 3.0182383060455322,
"learning_rate": 6.660098522167488e-05,
"loss": 0.2685,
"step": 13640
},
{
"epoch": 66.67480170835876,
"grad_norm": 3.0081100463867188,
"learning_rate": 6.64039408866995e-05,
"loss": 0.2915,
"step": 13660
},
{
"epoch": 66.77242220866383,
"grad_norm": 2.345440149307251,
"learning_rate": 6.620689655172415e-05,
"loss": 0.2707,
"step": 13680
},
{
"epoch": 66.87004270896888,
"grad_norm": 2.430608034133911,
"learning_rate": 6.600985221674877e-05,
"loss": 0.2675,
"step": 13700
},
{
"epoch": 66.96766320927395,
"grad_norm": 4.09646463394165,
"learning_rate": 6.58128078817734e-05,
"loss": 0.2886,
"step": 13720
},
{
"epoch": 67.065283709579,
"grad_norm": 2.696843147277832,
"learning_rate": 6.561576354679804e-05,
"loss": 0.2743,
"step": 13740
},
{
"epoch": 67.16290420988408,
"grad_norm": 1.8098782300949097,
"learning_rate": 6.541871921182267e-05,
"loss": 0.2629,
"step": 13760
},
{
"epoch": 67.26052471018915,
"grad_norm": 2.604454278945923,
"learning_rate": 6.52216748768473e-05,
"loss": 0.2701,
"step": 13780
},
{
"epoch": 67.3581452104942,
"grad_norm": 2.6400327682495117,
"learning_rate": 6.502463054187192e-05,
"loss": 0.2791,
"step": 13800
},
{
"epoch": 67.45576571079927,
"grad_norm": 2.6029961109161377,
"learning_rate": 6.482758620689655e-05,
"loss": 0.2753,
"step": 13820
},
{
"epoch": 67.55338621110434,
"grad_norm": 2.493805170059204,
"learning_rate": 6.463054187192119e-05,
"loss": 0.2654,
"step": 13840
},
{
"epoch": 67.6510067114094,
"grad_norm": 3.1555075645446777,
"learning_rate": 6.443349753694582e-05,
"loss": 0.2701,
"step": 13860
},
{
"epoch": 67.74862721171446,
"grad_norm": 4.280105113983154,
"learning_rate": 6.423645320197044e-05,
"loss": 0.2732,
"step": 13880
},
{
"epoch": 67.84624771201952,
"grad_norm": 2.8167061805725098,
"learning_rate": 6.403940886699507e-05,
"loss": 0.2755,
"step": 13900
},
{
"epoch": 67.94386821232459,
"grad_norm": 3.5046565532684326,
"learning_rate": 6.38423645320197e-05,
"loss": 0.2831,
"step": 13920
},
{
"epoch": 68.04148871262966,
"grad_norm": 2.4737610816955566,
"learning_rate": 6.364532019704434e-05,
"loss": 0.2737,
"step": 13940
},
{
"epoch": 68.13910921293471,
"grad_norm": 1.996193766593933,
"learning_rate": 6.344827586206897e-05,
"loss": 0.2637,
"step": 13960
},
{
"epoch": 68.23672971323978,
"grad_norm": 2.7088236808776855,
"learning_rate": 6.325123152709361e-05,
"loss": 0.2683,
"step": 13980
},
{
"epoch": 68.33435021354484,
"grad_norm": 2.344050168991089,
"learning_rate": 6.305418719211823e-05,
"loss": 0.2685,
"step": 14000
},
{
"epoch": 68.4319707138499,
"grad_norm": 3.3628969192504883,
"learning_rate": 6.285714285714286e-05,
"loss": 0.2728,
"step": 14020
},
{
"epoch": 68.52959121415498,
"grad_norm": 2.8613572120666504,
"learning_rate": 6.266009852216749e-05,
"loss": 0.2668,
"step": 14040
},
{
"epoch": 68.62721171446003,
"grad_norm": 2.752930164337158,
"learning_rate": 6.246305418719212e-05,
"loss": 0.2753,
"step": 14060
},
{
"epoch": 68.7248322147651,
"grad_norm": 2.426806926727295,
"learning_rate": 6.226600985221676e-05,
"loss": 0.2545,
"step": 14080
},
{
"epoch": 68.82245271507017,
"grad_norm": 2.4970877170562744,
"learning_rate": 6.206896551724138e-05,
"loss": 0.2733,
"step": 14100
},
{
"epoch": 68.92007321537523,
"grad_norm": 2.6764674186706543,
"learning_rate": 6.187192118226601e-05,
"loss": 0.2726,
"step": 14120
},
{
"epoch": 69.0176937156803,
"grad_norm": 2.3702871799468994,
"learning_rate": 6.167487684729064e-05,
"loss": 0.2701,
"step": 14140
},
{
"epoch": 69.11531421598535,
"grad_norm": 3.5141944885253906,
"learning_rate": 6.147783251231526e-05,
"loss": 0.2643,
"step": 14160
},
{
"epoch": 69.21293471629042,
"grad_norm": 2.7750911712646484,
"learning_rate": 6.12807881773399e-05,
"loss": 0.248,
"step": 14180
},
{
"epoch": 69.31055521659549,
"grad_norm": 4.1003217697143555,
"learning_rate": 6.108374384236453e-05,
"loss": 0.2618,
"step": 14200
},
{
"epoch": 69.40817571690054,
"grad_norm": 2.183353900909424,
"learning_rate": 6.0886699507389166e-05,
"loss": 0.2618,
"step": 14220
},
{
"epoch": 69.50579621720561,
"grad_norm": 2.447449207305908,
"learning_rate": 6.068965517241379e-05,
"loss": 0.284,
"step": 14240
},
{
"epoch": 69.60341671751068,
"grad_norm": 2.466543674468994,
"learning_rate": 6.049261083743843e-05,
"loss": 0.269,
"step": 14260
},
{
"epoch": 69.70103721781574,
"grad_norm": 3.8052902221679688,
"learning_rate": 6.0295566502463054e-05,
"loss": 0.2681,
"step": 14280
},
{
"epoch": 69.79865771812081,
"grad_norm": 3.1913719177246094,
"learning_rate": 6.0098522167487695e-05,
"loss": 0.2677,
"step": 14300
},
{
"epoch": 69.89627821842586,
"grad_norm": 1.6767873764038086,
"learning_rate": 5.990147783251232e-05,
"loss": 0.2739,
"step": 14320
},
{
"epoch": 69.99389871873093,
"grad_norm": 2.805734634399414,
"learning_rate": 5.970443349753695e-05,
"loss": 0.2686,
"step": 14340
},
{
"epoch": 70.091519219036,
"grad_norm": 2.671316146850586,
"learning_rate": 5.9507389162561576e-05,
"loss": 0.2558,
"step": 14360
},
{
"epoch": 70.18913971934106,
"grad_norm": 2.5105350017547607,
"learning_rate": 5.93103448275862e-05,
"loss": 0.2692,
"step": 14380
},
{
"epoch": 70.28676021964613,
"grad_norm": 2.0773072242736816,
"learning_rate": 5.9113300492610844e-05,
"loss": 0.266,
"step": 14400
},
{
"epoch": 70.3843807199512,
"grad_norm": 2.2632055282592773,
"learning_rate": 5.891625615763548e-05,
"loss": 0.2624,
"step": 14420
},
{
"epoch": 70.48200122025625,
"grad_norm": 3.4696826934814453,
"learning_rate": 5.8719211822660105e-05,
"loss": 0.2616,
"step": 14440
},
{
"epoch": 70.57962172056132,
"grad_norm": 2.471937417984009,
"learning_rate": 5.852216748768473e-05,
"loss": 0.2534,
"step": 14460
},
{
"epoch": 70.67724222086638,
"grad_norm": 2.4318599700927734,
"learning_rate": 5.832512315270936e-05,
"loss": 0.2661,
"step": 14480
},
{
"epoch": 70.77486272117144,
"grad_norm": 2.773090362548828,
"learning_rate": 5.8128078817733986e-05,
"loss": 0.283,
"step": 14500
},
{
"epoch": 70.87248322147651,
"grad_norm": 2.120820999145508,
"learning_rate": 5.7931034482758627e-05,
"loss": 0.2668,
"step": 14520
},
{
"epoch": 70.97010372178157,
"grad_norm": 2.614382028579712,
"learning_rate": 5.7733990147783254e-05,
"loss": 0.2722,
"step": 14540
},
{
"epoch": 71.06772422208664,
"grad_norm": 2.954516649246216,
"learning_rate": 5.753694581280789e-05,
"loss": 0.2571,
"step": 14560
},
{
"epoch": 71.16534472239171,
"grad_norm": 2.9351367950439453,
"learning_rate": 5.7339901477832515e-05,
"loss": 0.2659,
"step": 14580
},
{
"epoch": 71.26296522269676,
"grad_norm": 2.757805347442627,
"learning_rate": 5.714285714285714e-05,
"loss": 0.2461,
"step": 14600
},
{
"epoch": 71.36058572300183,
"grad_norm": 3.4546825885772705,
"learning_rate": 5.694581280788177e-05,
"loss": 0.2655,
"step": 14620
},
{
"epoch": 71.45820622330689,
"grad_norm": 2.822056531906128,
"learning_rate": 5.674876847290641e-05,
"loss": 0.2542,
"step": 14640
},
{
"epoch": 71.55582672361196,
"grad_norm": 2.4004786014556885,
"learning_rate": 5.6551724137931037e-05,
"loss": 0.2489,
"step": 14660
},
{
"epoch": 71.65344722391703,
"grad_norm": 3.2715816497802734,
"learning_rate": 5.635467980295567e-05,
"loss": 0.2669,
"step": 14680
},
{
"epoch": 71.75106772422208,
"grad_norm": 4.031295299530029,
"learning_rate": 5.61576354679803e-05,
"loss": 0.2729,
"step": 14700
},
{
"epoch": 71.84868822452715,
"grad_norm": 3.0305051803588867,
"learning_rate": 5.5960591133004925e-05,
"loss": 0.272,
"step": 14720
},
{
"epoch": 71.94630872483222,
"grad_norm": 2.170488119125366,
"learning_rate": 5.5763546798029565e-05,
"loss": 0.2771,
"step": 14740
},
{
"epoch": 72.04392922513728,
"grad_norm": 2.89032244682312,
"learning_rate": 5.556650246305419e-05,
"loss": 0.2665,
"step": 14760
},
{
"epoch": 72.14154972544235,
"grad_norm": 2.4803104400634766,
"learning_rate": 5.536945812807882e-05,
"loss": 0.2653,
"step": 14780
},
{
"epoch": 72.2391702257474,
"grad_norm": 2.525521755218506,
"learning_rate": 5.517241379310345e-05,
"loss": 0.2595,
"step": 14800
},
{
"epoch": 72.33679072605247,
"grad_norm": 2.121696710586548,
"learning_rate": 5.497536945812808e-05,
"loss": 0.2557,
"step": 14820
},
{
"epoch": 72.43441122635754,
"grad_norm": 1.8344529867172241,
"learning_rate": 5.477832512315271e-05,
"loss": 0.2512,
"step": 14840
},
{
"epoch": 72.5320317266626,
"grad_norm": 2.196624517440796,
"learning_rate": 5.458128078817735e-05,
"loss": 0.2474,
"step": 14860
},
{
"epoch": 72.62965222696766,
"grad_norm": 3.387305974960327,
"learning_rate": 5.4384236453201975e-05,
"loss": 0.2696,
"step": 14880
},
{
"epoch": 72.72727272727273,
"grad_norm": 2.481462240219116,
"learning_rate": 5.41871921182266e-05,
"loss": 0.2681,
"step": 14900
},
{
"epoch": 72.82489322757779,
"grad_norm": 2.6742024421691895,
"learning_rate": 5.399014778325123e-05,
"loss": 0.2553,
"step": 14920
},
{
"epoch": 72.92251372788286,
"grad_norm": 2.590111494064331,
"learning_rate": 5.379310344827586e-05,
"loss": 0.265,
"step": 14940
},
{
"epoch": 73.02013422818791,
"grad_norm": 2.311305046081543,
"learning_rate": 5.359605911330049e-05,
"loss": 0.2644,
"step": 14960
},
{
"epoch": 73.11775472849298,
"grad_norm": 2.502192974090576,
"learning_rate": 5.339901477832513e-05,
"loss": 0.2634,
"step": 14980
},
{
"epoch": 73.21537522879805,
"grad_norm": 2.5767018795013428,
"learning_rate": 5.320197044334976e-05,
"loss": 0.2625,
"step": 15000
},
{
"epoch": 73.31299572910311,
"grad_norm": 3.005783796310425,
"learning_rate": 5.3004926108374385e-05,
"loss": 0.2589,
"step": 15020
},
{
"epoch": 73.41061622940818,
"grad_norm": 2.7578892707824707,
"learning_rate": 5.280788177339901e-05,
"loss": 0.2519,
"step": 15040
},
{
"epoch": 73.50823672971325,
"grad_norm": 3.286733627319336,
"learning_rate": 5.2610837438423646e-05,
"loss": 0.2603,
"step": 15060
},
{
"epoch": 73.6058572300183,
"grad_norm": 2.323225975036621,
"learning_rate": 5.241379310344828e-05,
"loss": 0.2576,
"step": 15080
},
{
"epoch": 73.70347773032337,
"grad_norm": 2.407222032546997,
"learning_rate": 5.2216748768472914e-05,
"loss": 0.253,
"step": 15100
},
{
"epoch": 73.80109823062843,
"grad_norm": 3.0755960941314697,
"learning_rate": 5.201970443349754e-05,
"loss": 0.261,
"step": 15120
},
{
"epoch": 73.8987187309335,
"grad_norm": 1.9469565153121948,
"learning_rate": 5.182266009852217e-05,
"loss": 0.2556,
"step": 15140
},
{
"epoch": 73.99633923123857,
"grad_norm": 3.5689964294433594,
"learning_rate": 5.1625615763546795e-05,
"loss": 0.2718,
"step": 15160
},
{
"epoch": 74.09395973154362,
"grad_norm": 1.9299124479293823,
"learning_rate": 5.142857142857143e-05,
"loss": 0.2497,
"step": 15180
},
{
"epoch": 74.19158023184869,
"grad_norm": 2.1597163677215576,
"learning_rate": 5.123152709359606e-05,
"loss": 0.2526,
"step": 15200
},
{
"epoch": 74.28920073215376,
"grad_norm": 2.4359443187713623,
"learning_rate": 5.10344827586207e-05,
"loss": 0.2557,
"step": 15220
},
{
"epoch": 74.38682123245881,
"grad_norm": 2.449601411819458,
"learning_rate": 5.0837438423645324e-05,
"loss": 0.2628,
"step": 15240
},
{
"epoch": 74.48444173276388,
"grad_norm": 2.5450046062469482,
"learning_rate": 5.064039408866995e-05,
"loss": 0.2683,
"step": 15260
},
{
"epoch": 74.58206223306894,
"grad_norm": 2.499568462371826,
"learning_rate": 5.044334975369458e-05,
"loss": 0.2456,
"step": 15280
},
{
"epoch": 74.67968273337401,
"grad_norm": 2.276536703109741,
"learning_rate": 5.024630541871922e-05,
"loss": 0.2613,
"step": 15300
},
{
"epoch": 74.77730323367908,
"grad_norm": 6.047021865844727,
"learning_rate": 5.0049261083743846e-05,
"loss": 0.2591,
"step": 15320
},
{
"epoch": 74.87492373398413,
"grad_norm": 2.7853705883026123,
"learning_rate": 4.985221674876848e-05,
"loss": 0.2584,
"step": 15340
},
{
"epoch": 74.9725442342892,
"grad_norm": 2.658870220184326,
"learning_rate": 4.9655172413793107e-05,
"loss": 0.2485,
"step": 15360
},
{
"epoch": 75.07016473459427,
"grad_norm": 1.9290242195129395,
"learning_rate": 4.9458128078817734e-05,
"loss": 0.2456,
"step": 15380
},
{
"epoch": 75.16778523489933,
"grad_norm": 2.4340288639068604,
"learning_rate": 4.926108374384237e-05,
"loss": 0.2517,
"step": 15400
},
{
"epoch": 75.2654057352044,
"grad_norm": 1.7368818521499634,
"learning_rate": 4.9064039408866995e-05,
"loss": 0.245,
"step": 15420
},
{
"epoch": 75.36302623550945,
"grad_norm": 3.224472999572754,
"learning_rate": 4.886699507389163e-05,
"loss": 0.2512,
"step": 15440
},
{
"epoch": 75.46064673581452,
"grad_norm": 2.9347827434539795,
"learning_rate": 4.866995073891626e-05,
"loss": 0.252,
"step": 15460
},
{
"epoch": 75.55826723611959,
"grad_norm": 3.1281368732452393,
"learning_rate": 4.847290640394089e-05,
"loss": 0.2662,
"step": 15480
},
{
"epoch": 75.65588773642465,
"grad_norm": 2.1834158897399902,
"learning_rate": 4.827586206896552e-05,
"loss": 0.2549,
"step": 15500
},
{
"epoch": 75.75350823672972,
"grad_norm": 2.4959053993225098,
"learning_rate": 4.807881773399015e-05,
"loss": 0.2489,
"step": 15520
},
{
"epoch": 75.85112873703477,
"grad_norm": 1.9630552530288696,
"learning_rate": 4.788177339901478e-05,
"loss": 0.2685,
"step": 15540
},
{
"epoch": 75.94874923733984,
"grad_norm": 2.9730660915374756,
"learning_rate": 4.768472906403941e-05,
"loss": 0.2568,
"step": 15560
},
{
"epoch": 76.04636973764491,
"grad_norm": 2.492307186126709,
"learning_rate": 4.748768472906404e-05,
"loss": 0.254,
"step": 15580
},
{
"epoch": 76.14399023794996,
"grad_norm": 2.1463494300842285,
"learning_rate": 4.729064039408867e-05,
"loss": 0.2623,
"step": 15600
},
{
"epoch": 76.24161073825503,
"grad_norm": 2.957017421722412,
"learning_rate": 4.7093596059113306e-05,
"loss": 0.2576,
"step": 15620
},
{
"epoch": 76.3392312385601,
"grad_norm": 2.1611711978912354,
"learning_rate": 4.689655172413793e-05,
"loss": 0.2447,
"step": 15640
},
{
"epoch": 76.43685173886516,
"grad_norm": 3.1399998664855957,
"learning_rate": 4.669950738916256e-05,
"loss": 0.2586,
"step": 15660
},
{
"epoch": 76.53447223917023,
"grad_norm": 2.817157030105591,
"learning_rate": 4.6502463054187194e-05,
"loss": 0.2439,
"step": 15680
},
{
"epoch": 76.63209273947528,
"grad_norm": 1.3343191146850586,
"learning_rate": 4.630541871921182e-05,
"loss": 0.2522,
"step": 15700
},
{
"epoch": 76.72971323978035,
"grad_norm": 2.9455504417419434,
"learning_rate": 4.6108374384236455e-05,
"loss": 0.2606,
"step": 15720
},
{
"epoch": 76.82733374008542,
"grad_norm": 2.981264352798462,
"learning_rate": 4.591133004926109e-05,
"loss": 0.2482,
"step": 15740
},
{
"epoch": 76.92495424039048,
"grad_norm": 2.9296011924743652,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.2578,
"step": 15760
},
{
"epoch": 77.02257474069555,
"grad_norm": 2.8159282207489014,
"learning_rate": 4.551724137931035e-05,
"loss": 0.2571,
"step": 15780
},
{
"epoch": 77.12019524100062,
"grad_norm": 2.184053421020508,
"learning_rate": 4.532019704433498e-05,
"loss": 0.2548,
"step": 15800
},
{
"epoch": 77.21781574130567,
"grad_norm": 2.1801810264587402,
"learning_rate": 4.5123152709359604e-05,
"loss": 0.2367,
"step": 15820
},
{
"epoch": 77.31543624161074,
"grad_norm": 2.510050058364868,
"learning_rate": 4.492610837438424e-05,
"loss": 0.2522,
"step": 15840
},
{
"epoch": 77.4130567419158,
"grad_norm": 2.849837303161621,
"learning_rate": 4.472906403940887e-05,
"loss": 0.2524,
"step": 15860
},
{
"epoch": 77.51067724222086,
"grad_norm": 3.769998788833618,
"learning_rate": 4.45320197044335e-05,
"loss": 0.2568,
"step": 15880
},
{
"epoch": 77.60829774252593,
"grad_norm": 3.2575082778930664,
"learning_rate": 4.433497536945813e-05,
"loss": 0.2565,
"step": 15900
},
{
"epoch": 77.70591824283099,
"grad_norm": 2.199042797088623,
"learning_rate": 4.413793103448276e-05,
"loss": 0.2508,
"step": 15920
},
{
"epoch": 77.80353874313606,
"grad_norm": 1.9908735752105713,
"learning_rate": 4.394088669950739e-05,
"loss": 0.2612,
"step": 15940
},
{
"epoch": 77.90115924344113,
"grad_norm": 2.091723680496216,
"learning_rate": 4.374384236453202e-05,
"loss": 0.2491,
"step": 15960
},
{
"epoch": 77.99877974374618,
"grad_norm": 2.705829381942749,
"learning_rate": 4.3546798029556655e-05,
"loss": 0.2596,
"step": 15980
},
{
"epoch": 78.09640024405125,
"grad_norm": 2.6604998111724854,
"learning_rate": 4.334975369458129e-05,
"loss": 0.2475,
"step": 16000
},
{
"epoch": 78.19402074435631,
"grad_norm": 2.4286489486694336,
"learning_rate": 4.3152709359605916e-05,
"loss": 0.2509,
"step": 16020
},
{
"epoch": 78.29164124466138,
"grad_norm": 3.3478493690490723,
"learning_rate": 4.295566502463054e-05,
"loss": 0.2491,
"step": 16040
},
{
"epoch": 78.38926174496645,
"grad_norm": 2.9512908458709717,
"learning_rate": 4.275862068965518e-05,
"loss": 0.2362,
"step": 16060
},
{
"epoch": 78.4868822452715,
"grad_norm": 2.0870890617370605,
"learning_rate": 4.2561576354679804e-05,
"loss": 0.2546,
"step": 16080
},
{
"epoch": 78.58450274557657,
"grad_norm": 2.3549749851226807,
"learning_rate": 4.236453201970443e-05,
"loss": 0.2544,
"step": 16100
},
{
"epoch": 78.68212324588164,
"grad_norm": 2.296377658843994,
"learning_rate": 4.2167487684729065e-05,
"loss": 0.2524,
"step": 16120
},
{
"epoch": 78.7797437461867,
"grad_norm": 2.9563801288604736,
"learning_rate": 4.19704433497537e-05,
"loss": 0.2534,
"step": 16140
},
{
"epoch": 78.87736424649177,
"grad_norm": 3.3844058513641357,
"learning_rate": 4.1773399014778326e-05,
"loss": 0.2629,
"step": 16160
},
{
"epoch": 78.97498474679682,
"grad_norm": 1.9131345748901367,
"learning_rate": 4.157635467980296e-05,
"loss": 0.2478,
"step": 16180
},
{
"epoch": 79.07260524710189,
"grad_norm": 3.4866435527801514,
"learning_rate": 4.1379310344827587e-05,
"loss": 0.2464,
"step": 16200
},
{
"epoch": 79.17022574740696,
"grad_norm": 2.0751941204071045,
"learning_rate": 4.1182266009852214e-05,
"loss": 0.2435,
"step": 16220
},
{
"epoch": 79.26784624771201,
"grad_norm": 1.776879072189331,
"learning_rate": 4.098522167487685e-05,
"loss": 0.2501,
"step": 16240
},
{
"epoch": 79.36546674801708,
"grad_norm": 3.9006545543670654,
"learning_rate": 4.078817733990148e-05,
"loss": 0.2586,
"step": 16260
},
{
"epoch": 79.46308724832215,
"grad_norm": 2.390000581741333,
"learning_rate": 4.0591133004926115e-05,
"loss": 0.2408,
"step": 16280
},
{
"epoch": 79.56070774862721,
"grad_norm": 3.1795706748962402,
"learning_rate": 4.039408866995074e-05,
"loss": 0.2469,
"step": 16300
},
{
"epoch": 79.65832824893228,
"grad_norm": 2.6821188926696777,
"learning_rate": 4.019704433497537e-05,
"loss": 0.2429,
"step": 16320
},
{
"epoch": 79.75594874923733,
"grad_norm": 3.01457142829895,
"learning_rate": 4e-05,
"loss": 0.2598,
"step": 16340
},
{
"epoch": 79.8535692495424,
"grad_norm": 2.8440592288970947,
"learning_rate": 3.980295566502463e-05,
"loss": 0.2494,
"step": 16360
},
{
"epoch": 79.95118974984747,
"grad_norm": 3.210845708847046,
"learning_rate": 3.9605911330049264e-05,
"loss": 0.2521,
"step": 16380
},
{
"epoch": 80.04881025015253,
"grad_norm": 3.9740731716156006,
"learning_rate": 3.94088669950739e-05,
"loss": 0.2537,
"step": 16400
},
{
"epoch": 80.1464307504576,
"grad_norm": 2.3433115482330322,
"learning_rate": 3.9211822660098525e-05,
"loss": 0.2441,
"step": 16420
},
{
"epoch": 80.24405125076267,
"grad_norm": 2.5279314517974854,
"learning_rate": 3.901477832512315e-05,
"loss": 0.2564,
"step": 16440
},
{
"epoch": 80.34167175106772,
"grad_norm": 2.8062689304351807,
"learning_rate": 3.8817733990147786e-05,
"loss": 0.245,
"step": 16460
},
{
"epoch": 80.43929225137279,
"grad_norm": 1.9689416885375977,
"learning_rate": 3.862068965517241e-05,
"loss": 0.2497,
"step": 16480
},
{
"epoch": 80.53691275167785,
"grad_norm": 2.462744951248169,
"learning_rate": 3.842364532019704e-05,
"loss": 0.2525,
"step": 16500
},
{
"epoch": 80.63453325198292,
"grad_norm": 1.9201568365097046,
"learning_rate": 3.822660098522168e-05,
"loss": 0.2495,
"step": 16520
},
{
"epoch": 80.73215375228799,
"grad_norm": 1.7118130922317505,
"learning_rate": 3.802955665024631e-05,
"loss": 0.2415,
"step": 16540
},
{
"epoch": 80.82977425259304,
"grad_norm": 2.311931848526001,
"learning_rate": 3.783251231527094e-05,
"loss": 0.247,
"step": 16560
},
{
"epoch": 80.92739475289811,
"grad_norm": 2.030750274658203,
"learning_rate": 3.763546798029557e-05,
"loss": 0.2415,
"step": 16580
},
{
"epoch": 81.02501525320318,
"grad_norm": 1.949194312095642,
"learning_rate": 3.7438423645320196e-05,
"loss": 0.2555,
"step": 16600
},
{
"epoch": 81.12263575350823,
"grad_norm": 1.8409544229507446,
"learning_rate": 3.724137931034483e-05,
"loss": 0.2412,
"step": 16620
},
{
"epoch": 81.2202562538133,
"grad_norm": 2.5164377689361572,
"learning_rate": 3.704433497536946e-05,
"loss": 0.2326,
"step": 16640
},
{
"epoch": 81.31787675411836,
"grad_norm": 2.3859026432037354,
"learning_rate": 3.684729064039409e-05,
"loss": 0.2499,
"step": 16660
},
{
"epoch": 81.41549725442343,
"grad_norm": 2.753124713897705,
"learning_rate": 3.6650246305418725e-05,
"loss": 0.2504,
"step": 16680
},
{
"epoch": 81.5131177547285,
"grad_norm": 2.294701099395752,
"learning_rate": 3.645320197044335e-05,
"loss": 0.2433,
"step": 16700
},
{
"epoch": 81.61073825503355,
"grad_norm": 2.179985761642456,
"learning_rate": 3.625615763546798e-05,
"loss": 0.2511,
"step": 16720
},
{
"epoch": 81.70835875533862,
"grad_norm": 2.242023229598999,
"learning_rate": 3.605911330049261e-05,
"loss": 0.2558,
"step": 16740
},
{
"epoch": 81.80597925564369,
"grad_norm": 2.9500415325164795,
"learning_rate": 3.586206896551724e-05,
"loss": 0.2423,
"step": 16760
},
{
"epoch": 81.90359975594875,
"grad_norm": 2.372332811355591,
"learning_rate": 3.5665024630541874e-05,
"loss": 0.2503,
"step": 16780
},
{
"epoch": 82.00122025625382,
"grad_norm": 2.8338615894317627,
"learning_rate": 3.546798029556651e-05,
"loss": 0.2442,
"step": 16800
},
{
"epoch": 82.09884075655887,
"grad_norm": 2.5122156143188477,
"learning_rate": 3.5270935960591135e-05,
"loss": 0.2386,
"step": 16820
},
{
"epoch": 82.19646125686394,
"grad_norm": 2.6733508110046387,
"learning_rate": 3.507389162561577e-05,
"loss": 0.2376,
"step": 16840
},
{
"epoch": 82.29408175716901,
"grad_norm": 1.9639496803283691,
"learning_rate": 3.4876847290640396e-05,
"loss": 0.2366,
"step": 16860
},
{
"epoch": 82.39170225747407,
"grad_norm": 2.2403128147125244,
"learning_rate": 3.467980295566502e-05,
"loss": 0.2478,
"step": 16880
},
{
"epoch": 82.48932275777914,
"grad_norm": 2.3874387741088867,
"learning_rate": 3.4482758620689657e-05,
"loss": 0.2561,
"step": 16900
},
{
"epoch": 82.5869432580842,
"grad_norm": 3.6774182319641113,
"learning_rate": 3.428571428571429e-05,
"loss": 0.2448,
"step": 16920
},
{
"epoch": 82.68456375838926,
"grad_norm": 1.8325834274291992,
"learning_rate": 3.408866995073892e-05,
"loss": 0.2515,
"step": 16940
},
{
"epoch": 82.78218425869433,
"grad_norm": 2.846112012863159,
"learning_rate": 3.389162561576355e-05,
"loss": 0.2481,
"step": 16960
},
{
"epoch": 82.87980475899938,
"grad_norm": 3.7636115550994873,
"learning_rate": 3.369458128078818e-05,
"loss": 0.2529,
"step": 16980
},
{
"epoch": 82.97742525930445,
"grad_norm": 2.4501962661743164,
"learning_rate": 3.3497536945812806e-05,
"loss": 0.2344,
"step": 17000
},
{
"epoch": 83.07504575960952,
"grad_norm": 2.4377410411834717,
"learning_rate": 3.330049261083744e-05,
"loss": 0.2373,
"step": 17020
},
{
"epoch": 83.17266625991458,
"grad_norm": 2.180765151977539,
"learning_rate": 3.310344827586207e-05,
"loss": 0.2395,
"step": 17040
},
{
"epoch": 83.27028676021965,
"grad_norm": 3.2704169750213623,
"learning_rate": 3.29064039408867e-05,
"loss": 0.2407,
"step": 17060
},
{
"epoch": 83.36790726052472,
"grad_norm": 2.74991512298584,
"learning_rate": 3.2709359605911334e-05,
"loss": 0.2351,
"step": 17080
},
{
"epoch": 83.46552776082977,
"grad_norm": 1.780633807182312,
"learning_rate": 3.251231527093596e-05,
"loss": 0.2379,
"step": 17100
},
{
"epoch": 83.56314826113484,
"grad_norm": 2.352802038192749,
"learning_rate": 3.2315270935960595e-05,
"loss": 0.244,
"step": 17120
},
{
"epoch": 83.6607687614399,
"grad_norm": 3.505608320236206,
"learning_rate": 3.211822660098522e-05,
"loss": 0.2443,
"step": 17140
},
{
"epoch": 83.75838926174497,
"grad_norm": 2.568233013153076,
"learning_rate": 3.192118226600985e-05,
"loss": 0.2499,
"step": 17160
},
{
"epoch": 83.85600976205004,
"grad_norm": 1.864367961883545,
"learning_rate": 3.172413793103448e-05,
"loss": 0.2543,
"step": 17180
},
{
"epoch": 83.95363026235509,
"grad_norm": 2.386052370071411,
"learning_rate": 3.152709359605912e-05,
"loss": 0.2505,
"step": 17200
},
{
"epoch": 84.05125076266016,
"grad_norm": 4.361128330230713,
"learning_rate": 3.1330049261083744e-05,
"loss": 0.2505,
"step": 17220
},
{
"epoch": 84.14887126296523,
"grad_norm": 1.4861139059066772,
"learning_rate": 3.113300492610838e-05,
"loss": 0.2314,
"step": 17240
},
{
"epoch": 84.24649176327028,
"grad_norm": 1.9692414999008179,
"learning_rate": 3.0935960591133005e-05,
"loss": 0.2499,
"step": 17260
},
{
"epoch": 84.34411226357535,
"grad_norm": 2.245277166366577,
"learning_rate": 3.073891625615763e-05,
"loss": 0.243,
"step": 17280
},
{
"epoch": 84.44173276388041,
"grad_norm": 2.0669002532958984,
"learning_rate": 3.0541871921182266e-05,
"loss": 0.2388,
"step": 17300
},
{
"epoch": 84.53935326418548,
"grad_norm": 2.377110004425049,
"learning_rate": 3.0344827586206897e-05,
"loss": 0.2431,
"step": 17320
},
{
"epoch": 84.63697376449055,
"grad_norm": 2.4260573387145996,
"learning_rate": 3.0147783251231527e-05,
"loss": 0.2393,
"step": 17340
},
{
"epoch": 84.7345942647956,
"grad_norm": 1.7577930688858032,
"learning_rate": 2.995073891625616e-05,
"loss": 0.2444,
"step": 17360
},
{
"epoch": 84.83221476510067,
"grad_norm": 2.4844295978546143,
"learning_rate": 2.9753694581280788e-05,
"loss": 0.2474,
"step": 17380
},
{
"epoch": 84.92983526540573,
"grad_norm": 2.7530508041381836,
"learning_rate": 2.9556650246305422e-05,
"loss": 0.2459,
"step": 17400
},
{
"epoch": 85.0274557657108,
"grad_norm": 1.6418040990829468,
"learning_rate": 2.9359605911330052e-05,
"loss": 0.2491,
"step": 17420
},
{
"epoch": 85.12507626601587,
"grad_norm": 2.0329489707946777,
"learning_rate": 2.916256157635468e-05,
"loss": 0.2426,
"step": 17440
},
{
"epoch": 85.22269676632092,
"grad_norm": 1.6439207792282104,
"learning_rate": 2.8965517241379313e-05,
"loss": 0.2351,
"step": 17460
},
{
"epoch": 85.32031726662599,
"grad_norm": 1.6182892322540283,
"learning_rate": 2.8768472906403944e-05,
"loss": 0.2468,
"step": 17480
},
{
"epoch": 85.41793776693106,
"grad_norm": 3.263887882232666,
"learning_rate": 2.857142857142857e-05,
"loss": 0.2426,
"step": 17500
},
{
"epoch": 85.51555826723612,
"grad_norm": 3.062742233276367,
"learning_rate": 2.8374384236453205e-05,
"loss": 0.2386,
"step": 17520
},
{
"epoch": 85.61317876754119,
"grad_norm": 2.8203582763671875,
"learning_rate": 2.8177339901477835e-05,
"loss": 0.2407,
"step": 17540
},
{
"epoch": 85.71079926784624,
"grad_norm": 2.3993334770202637,
"learning_rate": 2.7980295566502462e-05,
"loss": 0.2418,
"step": 17560
},
{
"epoch": 85.80841976815131,
"grad_norm": 1.7914482355117798,
"learning_rate": 2.7783251231527096e-05,
"loss": 0.2377,
"step": 17580
},
{
"epoch": 85.90604026845638,
"grad_norm": 3.20501971244812,
"learning_rate": 2.7586206896551727e-05,
"loss": 0.2398,
"step": 17600
},
{
"epoch": 86.00366076876143,
"grad_norm": 1.6623684167861938,
"learning_rate": 2.7389162561576354e-05,
"loss": 0.2442,
"step": 17620
},
{
"epoch": 86.1012812690665,
"grad_norm": 2.3433034420013428,
"learning_rate": 2.7192118226600988e-05,
"loss": 0.2358,
"step": 17640
},
{
"epoch": 86.19890176937157,
"grad_norm": 2.6188597679138184,
"learning_rate": 2.6995073891625615e-05,
"loss": 0.2336,
"step": 17660
},
{
"epoch": 86.29652226967663,
"grad_norm": 3.1089391708374023,
"learning_rate": 2.6798029556650245e-05,
"loss": 0.239,
"step": 17680
},
{
"epoch": 86.3941427699817,
"grad_norm": 2.378998041152954,
"learning_rate": 2.660098522167488e-05,
"loss": 0.2336,
"step": 17700
},
{
"epoch": 86.49176327028675,
"grad_norm": 2.4956347942352295,
"learning_rate": 2.6403940886699506e-05,
"loss": 0.2497,
"step": 17720
},
{
"epoch": 86.58938377059182,
"grad_norm": 2.529139757156372,
"learning_rate": 2.620689655172414e-05,
"loss": 0.2436,
"step": 17740
},
{
"epoch": 86.68700427089689,
"grad_norm": 2.6899948120117188,
"learning_rate": 2.600985221674877e-05,
"loss": 0.2445,
"step": 17760
},
{
"epoch": 86.78462477120195,
"grad_norm": 1.8922455310821533,
"learning_rate": 2.5812807881773398e-05,
"loss": 0.2366,
"step": 17780
},
{
"epoch": 86.88224527150702,
"grad_norm": 1.9104729890823364,
"learning_rate": 2.561576354679803e-05,
"loss": 0.2345,
"step": 17800
},
{
"epoch": 86.97986577181209,
"grad_norm": 3.2369461059570312,
"learning_rate": 2.5418719211822662e-05,
"loss": 0.2515,
"step": 17820
},
{
"epoch": 87.07748627211714,
"grad_norm": 2.2592508792877197,
"learning_rate": 2.522167487684729e-05,
"loss": 0.2333,
"step": 17840
},
{
"epoch": 87.17510677242221,
"grad_norm": 2.302445888519287,
"learning_rate": 2.5024630541871923e-05,
"loss": 0.2308,
"step": 17860
},
{
"epoch": 87.27272727272727,
"grad_norm": 2.0607619285583496,
"learning_rate": 2.4827586206896553e-05,
"loss": 0.2323,
"step": 17880
},
{
"epoch": 87.37034777303234,
"grad_norm": 2.4503376483917236,
"learning_rate": 2.4630541871921184e-05,
"loss": 0.2399,
"step": 17900
},
{
"epoch": 87.4679682733374,
"grad_norm": 1.7061033248901367,
"learning_rate": 2.4433497536945814e-05,
"loss": 0.249,
"step": 17920
},
{
"epoch": 87.56558877364246,
"grad_norm": 2.1557867527008057,
"learning_rate": 2.4236453201970445e-05,
"loss": 0.243,
"step": 17940
},
{
"epoch": 87.66320927394753,
"grad_norm": 2.0752928256988525,
"learning_rate": 2.4039408866995075e-05,
"loss": 0.236,
"step": 17960
},
{
"epoch": 87.7608297742526,
"grad_norm": 1.9939770698547363,
"learning_rate": 2.3842364532019706e-05,
"loss": 0.24,
"step": 17980
},
{
"epoch": 87.85845027455765,
"grad_norm": 2.043842315673828,
"learning_rate": 2.3645320197044336e-05,
"loss": 0.2438,
"step": 18000
}
],
"logging_steps": 20,
"max_steps": 20400,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.586273126839091e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}