|
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 87.85845027455765,
|
|
"eval_steps": 500,
|
|
"global_step": 18000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.09762050030506407,
|
|
"grad_norm": 0.5132213234901428,
|
|
"learning_rate": 4e-05,
|
|
"loss": 2.1313,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.19524100061012814,
|
|
"grad_norm": 0.5252098441123962,
|
|
"learning_rate": 8e-05,
|
|
"loss": 2.0977,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2928615009151922,
|
|
"grad_norm": 0.5683884024620056,
|
|
"learning_rate": 0.00012,
|
|
"loss": 1.9168,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.3904820012202563,
|
|
"grad_norm": 0.6169227957725525,
|
|
"learning_rate": 0.00016,
|
|
"loss": 1.8795,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.4881025015253203,
|
|
"grad_norm": 0.8657425045967102,
|
|
"learning_rate": 0.0002,
|
|
"loss": 1.8257,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.5857230018303844,
|
|
"grad_norm": 1.1708447933197021,
|
|
"learning_rate": 0.00019980295566502464,
|
|
"loss": 1.8,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.6833435021354485,
|
|
"grad_norm": 1.4427006244659424,
|
|
"learning_rate": 0.00019960591133004926,
|
|
"loss": 1.75,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.7809640024405126,
|
|
"grad_norm": 1.190605640411377,
|
|
"learning_rate": 0.00019940886699507392,
|
|
"loss": 1.6796,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.8785845027455765,
|
|
"grad_norm": 0.9052116274833679,
|
|
"learning_rate": 0.00019921182266009852,
|
|
"loss": 1.6721,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.9762050030506406,
|
|
"grad_norm": 1.3986254930496216,
|
|
"learning_rate": 0.00019901477832512317,
|
|
"loss": 1.5752,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.0738255033557047,
|
|
"grad_norm": 1.1646902561187744,
|
|
"learning_rate": 0.0001988177339901478,
|
|
"loss": 1.6187,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.1714460036607688,
|
|
"grad_norm": 1.3168349266052246,
|
|
"learning_rate": 0.00019862068965517243,
|
|
"loss": 1.518,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.2690665039658329,
|
|
"grad_norm": 1.5947434902191162,
|
|
"learning_rate": 0.00019842364532019705,
|
|
"loss": 1.5919,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.366687004270897,
|
|
"grad_norm": 1.550020694732666,
|
|
"learning_rate": 0.00019822660098522168,
|
|
"loss": 1.5565,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.4643075045759608,
|
|
"grad_norm": 1.587833285331726,
|
|
"learning_rate": 0.0001980295566502463,
|
|
"loss": 1.4928,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.561928004881025,
|
|
"grad_norm": 1.3913565874099731,
|
|
"learning_rate": 0.00019783251231527093,
|
|
"loss": 1.4603,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.659548505186089,
|
|
"grad_norm": 1.6511396169662476,
|
|
"learning_rate": 0.00019763546798029556,
|
|
"loss": 1.3946,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.757169005491153,
|
|
"grad_norm": 1.6132158041000366,
|
|
"learning_rate": 0.00019743842364532022,
|
|
"loss": 1.4182,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.8547895057962172,
|
|
"grad_norm": 2.1648366451263428,
|
|
"learning_rate": 0.00019724137931034484,
|
|
"loss": 1.4717,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.9524100061012812,
|
|
"grad_norm": 1.5196492671966553,
|
|
"learning_rate": 0.00019704433497536947,
|
|
"loss": 1.4192,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.0500305064063453,
|
|
"grad_norm": 1.676941990852356,
|
|
"learning_rate": 0.0001968472906403941,
|
|
"loss": 1.378,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.1476510067114094,
|
|
"grad_norm": 1.834902286529541,
|
|
"learning_rate": 0.00019665024630541872,
|
|
"loss": 1.3741,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.2452715070164735,
|
|
"grad_norm": 1.9870941638946533,
|
|
"learning_rate": 0.00019645320197044338,
|
|
"loss": 1.3165,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.3428920073215376,
|
|
"grad_norm": 1.682267427444458,
|
|
"learning_rate": 0.00019625615763546798,
|
|
"loss": 1.3828,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.4405125076266017,
|
|
"grad_norm": 2.291842222213745,
|
|
"learning_rate": 0.00019605911330049263,
|
|
"loss": 1.3857,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.5381330079316657,
|
|
"grad_norm": 2.0962560176849365,
|
|
"learning_rate": 0.00019586206896551723,
|
|
"loss": 1.2759,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.63575350823673,
|
|
"grad_norm": 1.6451084613800049,
|
|
"learning_rate": 0.0001956650246305419,
|
|
"loss": 1.3086,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.733374008541794,
|
|
"grad_norm": 1.8540750741958618,
|
|
"learning_rate": 0.00019546798029556651,
|
|
"loss": 1.3416,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.830994508846858,
|
|
"grad_norm": 1.8368124961853027,
|
|
"learning_rate": 0.00019527093596059114,
|
|
"loss": 1.2716,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.9286150091519216,
|
|
"grad_norm": 3.194183588027954,
|
|
"learning_rate": 0.00019507389162561577,
|
|
"loss": 1.2964,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 3.026235509456986,
|
|
"grad_norm": 1.751219630241394,
|
|
"learning_rate": 0.0001948768472906404,
|
|
"loss": 1.3046,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 3.1238560097620502,
|
|
"grad_norm": 1.834823489189148,
|
|
"learning_rate": 0.00019467980295566505,
|
|
"loss": 1.2149,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 3.221476510067114,
|
|
"grad_norm": 1.9562243223190308,
|
|
"learning_rate": 0.00019448275862068965,
|
|
"loss": 1.209,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 3.319097010372178,
|
|
"grad_norm": 2.012437582015991,
|
|
"learning_rate": 0.0001942857142857143,
|
|
"loss": 1.2196,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 3.416717510677242,
|
|
"grad_norm": 2.47426176071167,
|
|
"learning_rate": 0.00019408866995073893,
|
|
"loss": 1.2974,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 3.514338010982306,
|
|
"grad_norm": 2.1828153133392334,
|
|
"learning_rate": 0.00019389162561576356,
|
|
"loss": 1.2287,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 3.61195851128737,
|
|
"grad_norm": 2.335744619369507,
|
|
"learning_rate": 0.00019369458128078818,
|
|
"loss": 1.2018,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 3.7095790115924343,
|
|
"grad_norm": 1.731418490409851,
|
|
"learning_rate": 0.0001934975369458128,
|
|
"loss": 1.2568,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 3.8071995118974984,
|
|
"grad_norm": 2.0934510231018066,
|
|
"learning_rate": 0.00019330049261083744,
|
|
"loss": 1.2206,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 3.9048200122025625,
|
|
"grad_norm": 2.2060680389404297,
|
|
"learning_rate": 0.0001931034482758621,
|
|
"loss": 1.1898,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 4.002440512507627,
|
|
"grad_norm": 3.0342836380004883,
|
|
"learning_rate": 0.0001929064039408867,
|
|
"loss": 1.2248,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 4.100061012812691,
|
|
"grad_norm": 2.1768083572387695,
|
|
"learning_rate": 0.00019270935960591135,
|
|
"loss": 1.1721,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 4.197681513117755,
|
|
"grad_norm": 2.2883739471435547,
|
|
"learning_rate": 0.00019251231527093597,
|
|
"loss": 1.1117,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 4.295302013422819,
|
|
"grad_norm": 2.45024037361145,
|
|
"learning_rate": 0.0001923152709359606,
|
|
"loss": 1.1392,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 4.392922513727883,
|
|
"grad_norm": 1.9696956872940063,
|
|
"learning_rate": 0.00019211822660098523,
|
|
"loss": 1.0958,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 4.490543014032947,
|
|
"grad_norm": 2.3901145458221436,
|
|
"learning_rate": 0.00019192118226600986,
|
|
"loss": 1.1693,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 4.588163514338011,
|
|
"grad_norm": 2.003532648086548,
|
|
"learning_rate": 0.0001917241379310345,
|
|
"loss": 1.1301,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 4.685784014643075,
|
|
"grad_norm": 1.990051031112671,
|
|
"learning_rate": 0.0001915270935960591,
|
|
"loss": 1.1315,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 4.783404514948139,
|
|
"grad_norm": 2.517423152923584,
|
|
"learning_rate": 0.00019133004926108376,
|
|
"loss": 1.208,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 4.881025015253203,
|
|
"grad_norm": 2.311152458190918,
|
|
"learning_rate": 0.0001911330049261084,
|
|
"loss": 1.1494,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 4.978645515558267,
|
|
"grad_norm": 2.327719211578369,
|
|
"learning_rate": 0.00019093596059113302,
|
|
"loss": 1.1459,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 5.0762660158633315,
|
|
"grad_norm": 3.1623075008392334,
|
|
"learning_rate": 0.00019073891625615765,
|
|
"loss": 1.1417,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 5.173886516168396,
|
|
"grad_norm": 2.418928384780884,
|
|
"learning_rate": 0.00019054187192118227,
|
|
"loss": 1.091,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 5.27150701647346,
|
|
"grad_norm": 2.6035215854644775,
|
|
"learning_rate": 0.0001903448275862069,
|
|
"loss": 1.0851,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 5.369127516778524,
|
|
"grad_norm": 3.089789628982544,
|
|
"learning_rate": 0.00019014778325123153,
|
|
"loss": 1.0592,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 5.466748017083588,
|
|
"grad_norm": 2.885105609893799,
|
|
"learning_rate": 0.00018995073891625615,
|
|
"loss": 1.0781,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 5.564368517388652,
|
|
"grad_norm": 2.3023903369903564,
|
|
"learning_rate": 0.0001897536945812808,
|
|
"loss": 1.0706,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 5.661989017693716,
|
|
"grad_norm": 2.873560905456543,
|
|
"learning_rate": 0.00018955665024630543,
|
|
"loss": 1.092,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 5.75960951799878,
|
|
"grad_norm": 2.4178314208984375,
|
|
"learning_rate": 0.00018935960591133006,
|
|
"loss": 1.0896,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 5.857230018303844,
|
|
"grad_norm": 2.150630474090576,
|
|
"learning_rate": 0.0001891625615763547,
|
|
"loss": 1.1056,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 5.954850518608908,
|
|
"grad_norm": 3.347947359085083,
|
|
"learning_rate": 0.00018896551724137932,
|
|
"loss": 1.0832,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 6.052471018913972,
|
|
"grad_norm": 2.737258195877075,
|
|
"learning_rate": 0.00018876847290640397,
|
|
"loss": 1.0573,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 6.150091519219036,
|
|
"grad_norm": 2.3305180072784424,
|
|
"learning_rate": 0.00018857142857142857,
|
|
"loss": 0.9491,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 6.2477120195241005,
|
|
"grad_norm": 3.0475850105285645,
|
|
"learning_rate": 0.00018837438423645322,
|
|
"loss": 1.0221,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 6.345332519829164,
|
|
"grad_norm": 2.7141025066375732,
|
|
"learning_rate": 0.00018817733990147782,
|
|
"loss": 1.0566,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 6.442953020134228,
|
|
"grad_norm": 2.931290626525879,
|
|
"learning_rate": 0.00018798029556650248,
|
|
"loss": 1.0178,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 6.540573520439292,
|
|
"grad_norm": 2.9428722858428955,
|
|
"learning_rate": 0.0001877832512315271,
|
|
"loss": 1.0642,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 6.638194020744356,
|
|
"grad_norm": 2.452775001525879,
|
|
"learning_rate": 0.00018758620689655173,
|
|
"loss": 1.0928,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 6.73581452104942,
|
|
"grad_norm": 3.380108594894409,
|
|
"learning_rate": 0.00018738916256157636,
|
|
"loss": 1.0091,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 6.833435021354484,
|
|
"grad_norm": 2.9912617206573486,
|
|
"learning_rate": 0.000187192118226601,
|
|
"loss": 0.9958,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 6.931055521659548,
|
|
"grad_norm": 2.5559194087982178,
|
|
"learning_rate": 0.00018699507389162561,
|
|
"loss": 1.0891,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 7.028676021964612,
|
|
"grad_norm": 2.728987693786621,
|
|
"learning_rate": 0.00018679802955665024,
|
|
"loss": 0.9723,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 7.126296522269676,
|
|
"grad_norm": 2.4664106369018555,
|
|
"learning_rate": 0.0001866009852216749,
|
|
"loss": 0.9712,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 7.22391702257474,
|
|
"grad_norm": 2.6810712814331055,
|
|
"learning_rate": 0.00018640394088669952,
|
|
"loss": 0.9408,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 7.3215375228798045,
|
|
"grad_norm": 2.690723419189453,
|
|
"learning_rate": 0.00018620689655172415,
|
|
"loss": 0.9579,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 7.419158023184869,
|
|
"grad_norm": 2.751676321029663,
|
|
"learning_rate": 0.00018600985221674878,
|
|
"loss": 0.9959,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 7.516778523489933,
|
|
"grad_norm": 2.6251280307769775,
|
|
"learning_rate": 0.0001858128078817734,
|
|
"loss": 0.9908,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 7.614399023794997,
|
|
"grad_norm": 2.897099733352661,
|
|
"learning_rate": 0.00018561576354679803,
|
|
"loss": 0.9631,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 7.712019524100061,
|
|
"grad_norm": 2.0911786556243896,
|
|
"learning_rate": 0.00018541871921182269,
|
|
"loss": 0.9963,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 7.809640024405125,
|
|
"grad_norm": 2.6954994201660156,
|
|
"learning_rate": 0.00018522167487684729,
|
|
"loss": 1.0134,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 7.907260524710189,
|
|
"grad_norm": 2.8063347339630127,
|
|
"learning_rate": 0.00018502463054187194,
|
|
"loss": 0.9732,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 8.004881025015253,
|
|
"grad_norm": 2.0492053031921387,
|
|
"learning_rate": 0.00018482758620689654,
|
|
"loss": 1.0361,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 8.102501525320317,
|
|
"grad_norm": 3.0692152976989746,
|
|
"learning_rate": 0.0001846305418719212,
|
|
"loss": 0.9293,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 8.200122025625381,
|
|
"grad_norm": 2.7933707237243652,
|
|
"learning_rate": 0.00018443349753694582,
|
|
"loss": 0.9101,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 8.297742525930445,
|
|
"grad_norm": 3.628946542739868,
|
|
"learning_rate": 0.00018423645320197045,
|
|
"loss": 0.9215,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 8.39536302623551,
|
|
"grad_norm": 2.892118215560913,
|
|
"learning_rate": 0.0001840394088669951,
|
|
"loss": 0.9008,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 8.492983526540574,
|
|
"grad_norm": 3.5419254302978516,
|
|
"learning_rate": 0.0001838423645320197,
|
|
"loss": 0.9577,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 8.590604026845638,
|
|
"grad_norm": 2.785578489303589,
|
|
"learning_rate": 0.00018364532019704436,
|
|
"loss": 0.8979,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 8.688224527150702,
|
|
"grad_norm": 3.6454851627349854,
|
|
"learning_rate": 0.00018344827586206896,
|
|
"loss": 0.9424,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 8.785845027455766,
|
|
"grad_norm": 3.1077752113342285,
|
|
"learning_rate": 0.0001832512315270936,
|
|
"loss": 0.976,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 8.88346552776083,
|
|
"grad_norm": 2.1347529888153076,
|
|
"learning_rate": 0.00018305418719211824,
|
|
"loss": 0.9889,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 8.981086028065894,
|
|
"grad_norm": 1.8763928413391113,
|
|
"learning_rate": 0.00018285714285714286,
|
|
"loss": 0.9595,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 9.078706528370958,
|
|
"grad_norm": 2.5731394290924072,
|
|
"learning_rate": 0.0001826600985221675,
|
|
"loss": 0.9346,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 9.176327028676022,
|
|
"grad_norm": 2.75944447517395,
|
|
"learning_rate": 0.00018246305418719212,
|
|
"loss": 0.8926,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 9.273947528981086,
|
|
"grad_norm": 2.7548296451568604,
|
|
"learning_rate": 0.00018226600985221675,
|
|
"loss": 0.8835,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 9.37156802928615,
|
|
"grad_norm": 3.4645333290100098,
|
|
"learning_rate": 0.0001820689655172414,
|
|
"loss": 0.8753,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 9.469188529591214,
|
|
"grad_norm": 2.7922091484069824,
|
|
"learning_rate": 0.00018187192118226603,
|
|
"loss": 0.9187,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 9.566809029896278,
|
|
"grad_norm": 2.257009506225586,
|
|
"learning_rate": 0.00018167487684729065,
|
|
"loss": 0.9294,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 9.664429530201343,
|
|
"grad_norm": 4.195834159851074,
|
|
"learning_rate": 0.00018147783251231528,
|
|
"loss": 0.9022,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 9.762050030506407,
|
|
"grad_norm": 2.8687057495117188,
|
|
"learning_rate": 0.0001812807881773399,
|
|
"loss": 0.8744,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 9.85967053081147,
|
|
"grad_norm": 3.758493661880493,
|
|
"learning_rate": 0.00018108374384236456,
|
|
"loss": 0.9117,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 9.957291031116535,
|
|
"grad_norm": 3.2609262466430664,
|
|
"learning_rate": 0.00018088669950738916,
|
|
"loss": 0.9261,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 10.054911531421599,
|
|
"grad_norm": 3.5481553077697754,
|
|
"learning_rate": 0.00018068965517241382,
|
|
"loss": 0.8786,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 10.152532031726663,
|
|
"grad_norm": 2.8181192874908447,
|
|
"learning_rate": 0.00018049261083743842,
|
|
"loss": 0.8153,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 10.250152532031727,
|
|
"grad_norm": 2.582590341567993,
|
|
"learning_rate": 0.00018029556650246307,
|
|
"loss": 0.8763,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 10.347773032336791,
|
|
"grad_norm": 2.50076961517334,
|
|
"learning_rate": 0.0001800985221674877,
|
|
"loss": 0.8512,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 10.445393532641855,
|
|
"grad_norm": 3.2371861934661865,
|
|
"learning_rate": 0.00017990147783251232,
|
|
"loss": 0.823,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 10.54301403294692,
|
|
"grad_norm": 2.688570976257324,
|
|
"learning_rate": 0.00017970443349753695,
|
|
"loss": 0.8853,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 10.640634533251983,
|
|
"grad_norm": 2.4727838039398193,
|
|
"learning_rate": 0.00017950738916256158,
|
|
"loss": 0.8257,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 10.738255033557047,
|
|
"grad_norm": 3.330667495727539,
|
|
"learning_rate": 0.0001793103448275862,
|
|
"loss": 0.923,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 10.835875533862112,
|
|
"grad_norm": 2.5213732719421387,
|
|
"learning_rate": 0.00017911330049261083,
|
|
"loss": 0.8946,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 10.933496034167176,
|
|
"grad_norm": 2.6011056900024414,
|
|
"learning_rate": 0.0001789162561576355,
|
|
"loss": 0.9194,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 11.03111653447224,
|
|
"grad_norm": 3.4423539638519287,
|
|
"learning_rate": 0.00017871921182266011,
|
|
"loss": 0.8529,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 11.128737034777304,
|
|
"grad_norm": 3.608583927154541,
|
|
"learning_rate": 0.00017852216748768474,
|
|
"loss": 0.7944,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 11.226357535082368,
|
|
"grad_norm": 2.567775249481201,
|
|
"learning_rate": 0.00017832512315270937,
|
|
"loss": 0.7843,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 11.323978035387432,
|
|
"grad_norm": 3.0681939125061035,
|
|
"learning_rate": 0.000178128078817734,
|
|
"loss": 0.8238,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 11.421598535692496,
|
|
"grad_norm": 2.489577293395996,
|
|
"learning_rate": 0.00017793103448275862,
|
|
"loss": 0.8829,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 11.51921903599756,
|
|
"grad_norm": 2.9147262573242188,
|
|
"learning_rate": 0.00017773399014778328,
|
|
"loss": 0.8246,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 11.616839536302624,
|
|
"grad_norm": 2.5094566345214844,
|
|
"learning_rate": 0.00017753694581280788,
|
|
"loss": 0.8277,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 11.714460036607688,
|
|
"grad_norm": 2.4408226013183594,
|
|
"learning_rate": 0.00017733990147783253,
|
|
"loss": 0.8722,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 11.812080536912752,
|
|
"grad_norm": 2.5982508659362793,
|
|
"learning_rate": 0.00017714285714285713,
|
|
"loss": 0.8285,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 11.909701037217816,
|
|
"grad_norm": 4.408588409423828,
|
|
"learning_rate": 0.00017694581280788179,
|
|
"loss": 0.8042,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 12.00732153752288,
|
|
"grad_norm": 3.4463417530059814,
|
|
"learning_rate": 0.0001767487684729064,
|
|
"loss": 0.8606,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 12.104942037827945,
|
|
"grad_norm": 3.192249059677124,
|
|
"learning_rate": 0.00017655172413793104,
|
|
"loss": 0.7847,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 12.202562538133009,
|
|
"grad_norm": 2.760958671569824,
|
|
"learning_rate": 0.00017635467980295567,
|
|
"loss": 0.7968,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 12.300183038438073,
|
|
"grad_norm": 2.8952383995056152,
|
|
"learning_rate": 0.0001761576354679803,
|
|
"loss": 0.8226,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 12.397803538743137,
|
|
"grad_norm": 3.6324946880340576,
|
|
"learning_rate": 0.00017596059113300495,
|
|
"loss": 0.7592,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 12.495424039048201,
|
|
"grad_norm": 4.0287885665893555,
|
|
"learning_rate": 0.00017576354679802955,
|
|
"loss": 0.8112,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 12.593044539353265,
|
|
"grad_norm": 3.1734702587127686,
|
|
"learning_rate": 0.0001755665024630542,
|
|
"loss": 0.7847,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 12.690665039658327,
|
|
"grad_norm": 2.9449315071105957,
|
|
"learning_rate": 0.00017536945812807883,
|
|
"loss": 0.8264,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 12.788285539963393,
|
|
"grad_norm": 3.1391289234161377,
|
|
"learning_rate": 0.00017517241379310346,
|
|
"loss": 0.8058,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 12.885906040268456,
|
|
"grad_norm": 3.2317001819610596,
|
|
"learning_rate": 0.00017497536945812808,
|
|
"loss": 0.767,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 12.98352654057352,
|
|
"grad_norm": 3.2640392780303955,
|
|
"learning_rate": 0.0001747783251231527,
|
|
"loss": 0.8314,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 13.081147040878584,
|
|
"grad_norm": 4.71024227142334,
|
|
"learning_rate": 0.00017458128078817734,
|
|
"loss": 0.756,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 13.178767541183648,
|
|
"grad_norm": 3.621242046356201,
|
|
"learning_rate": 0.000174384236453202,
|
|
"loss": 0.7309,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 13.276388041488712,
|
|
"grad_norm": 3.6408748626708984,
|
|
"learning_rate": 0.00017418719211822662,
|
|
"loss": 0.7143,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 13.374008541793776,
|
|
"grad_norm": 3.296096086502075,
|
|
"learning_rate": 0.00017399014778325125,
|
|
"loss": 0.7965,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 13.47162904209884,
|
|
"grad_norm": 2.74519944190979,
|
|
"learning_rate": 0.00017379310344827587,
|
|
"loss": 0.7654,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 13.569249542403904,
|
|
"grad_norm": 2.9242568016052246,
|
|
"learning_rate": 0.0001735960591133005,
|
|
"loss": 0.7875,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 13.666870042708968,
|
|
"grad_norm": 2.5848984718322754,
|
|
"learning_rate": 0.00017339901477832515,
|
|
"loss": 0.7594,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 13.764490543014032,
|
|
"grad_norm": 3.9295613765716553,
|
|
"learning_rate": 0.00017320197044334975,
|
|
"loss": 0.75,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 13.862111043319096,
|
|
"grad_norm": 3.6406261920928955,
|
|
"learning_rate": 0.0001730049261083744,
|
|
"loss": 0.8149,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 13.95973154362416,
|
|
"grad_norm": 3.069199323654175,
|
|
"learning_rate": 0.000172807881773399,
|
|
"loss": 0.8217,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 14.057352043929225,
|
|
"grad_norm": 2.788712739944458,
|
|
"learning_rate": 0.00017261083743842366,
|
|
"loss": 0.7755,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 14.154972544234289,
|
|
"grad_norm": 3.468480110168457,
|
|
"learning_rate": 0.00017241379310344826,
|
|
"loss": 0.7071,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 14.252593044539353,
|
|
"grad_norm": 2.899951696395874,
|
|
"learning_rate": 0.00017221674876847292,
|
|
"loss": 0.7368,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 14.350213544844417,
|
|
"grad_norm": 3.6109790802001953,
|
|
"learning_rate": 0.00017201970443349754,
|
|
"loss": 0.7012,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 14.44783404514948,
|
|
"grad_norm": 3.448408842086792,
|
|
"learning_rate": 0.00017182266009852217,
|
|
"loss": 0.743,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 14.545454545454545,
|
|
"grad_norm": 2.819427013397217,
|
|
"learning_rate": 0.0001716256157635468,
|
|
"loss": 0.7552,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 14.643075045759609,
|
|
"grad_norm": 4.412954807281494,
|
|
"learning_rate": 0.00017142857142857143,
|
|
"loss": 0.7838,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 14.740695546064673,
|
|
"grad_norm": 2.7720842361450195,
|
|
"learning_rate": 0.00017123152709359608,
|
|
"loss": 0.7589,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 14.838316046369737,
|
|
"grad_norm": 3.3187596797943115,
|
|
"learning_rate": 0.0001710344827586207,
|
|
"loss": 0.7812,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 14.935936546674801,
|
|
"grad_norm": 2.3551273345947266,
|
|
"learning_rate": 0.00017083743842364533,
|
|
"loss": 0.764,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 15.033557046979865,
|
|
"grad_norm": 2.663290023803711,
|
|
"learning_rate": 0.00017064039408866996,
|
|
"loss": 0.7034,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 15.13117754728493,
|
|
"grad_norm": 3.2227704524993896,
|
|
"learning_rate": 0.0001704433497536946,
|
|
"loss": 0.6878,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 15.228798047589994,
|
|
"grad_norm": 2.819664478302002,
|
|
"learning_rate": 0.00017024630541871921,
|
|
"loss": 0.6731,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 15.326418547895058,
|
|
"grad_norm": 2.9787933826446533,
|
|
"learning_rate": 0.00017004926108374387,
|
|
"loss": 0.7036,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 15.424039048200122,
|
|
"grad_norm": 2.4379117488861084,
|
|
"learning_rate": 0.00016985221674876847,
|
|
"loss": 0.7323,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 15.521659548505186,
|
|
"grad_norm": 1.9959620237350464,
|
|
"learning_rate": 0.00016965517241379312,
|
|
"loss": 0.7155,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 15.61928004881025,
|
|
"grad_norm": 2.856109619140625,
|
|
"learning_rate": 0.00016945812807881772,
|
|
"loss": 0.6876,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 15.716900549115314,
|
|
"grad_norm": 3.9589807987213135,
|
|
"learning_rate": 0.00016926108374384238,
|
|
"loss": 0.7316,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 15.814521049420378,
|
|
"grad_norm": 2.921196460723877,
|
|
"learning_rate": 0.000169064039408867,
|
|
"loss": 0.7306,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 15.912141549725442,
|
|
"grad_norm": 2.862910270690918,
|
|
"learning_rate": 0.00016886699507389163,
|
|
"loss": 0.7829,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 16.009762050030506,
|
|
"grad_norm": 2.988609552383423,
|
|
"learning_rate": 0.00016866995073891626,
|
|
"loss": 0.75,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 16.107382550335572,
|
|
"grad_norm": 3.728930950164795,
|
|
"learning_rate": 0.00016847290640394089,
|
|
"loss": 0.6083,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 16.205003050640634,
|
|
"grad_norm": 3.5626068115234375,
|
|
"learning_rate": 0.00016827586206896554,
|
|
"loss": 0.6849,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 16.3026235509457,
|
|
"grad_norm": 2.754389524459839,
|
|
"learning_rate": 0.00016807881773399014,
|
|
"loss": 0.6635,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 16.400244051250763,
|
|
"grad_norm": 3.2776389122009277,
|
|
"learning_rate": 0.0001678817733990148,
|
|
"loss": 0.6999,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 16.49786455155583,
|
|
"grad_norm": 3.0710105895996094,
|
|
"learning_rate": 0.00016768472906403942,
|
|
"loss": 0.6911,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 16.59548505186089,
|
|
"grad_norm": 3.1727585792541504,
|
|
"learning_rate": 0.00016748768472906405,
|
|
"loss": 0.7238,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 16.693105552165953,
|
|
"grad_norm": 2.671583652496338,
|
|
"learning_rate": 0.00016729064039408868,
|
|
"loss": 0.6925,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 16.79072605247102,
|
|
"grad_norm": 2.9183971881866455,
|
|
"learning_rate": 0.0001670935960591133,
|
|
"loss": 0.703,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 16.888346552776085,
|
|
"grad_norm": 3.785710334777832,
|
|
"learning_rate": 0.00016689655172413793,
|
|
"loss": 0.7245,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 16.985967053081147,
|
|
"grad_norm": 3.435655355453491,
|
|
"learning_rate": 0.00016669950738916258,
|
|
"loss": 0.7483,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 17.08358755338621,
|
|
"grad_norm": 3.7350969314575195,
|
|
"learning_rate": 0.00016650246305418718,
|
|
"loss": 0.639,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 17.181208053691275,
|
|
"grad_norm": 3.0420546531677246,
|
|
"learning_rate": 0.00016630541871921184,
|
|
"loss": 0.675,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 17.278828553996338,
|
|
"grad_norm": 2.1023027896881104,
|
|
"learning_rate": 0.00016610837438423646,
|
|
"loss": 0.6857,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 17.376449054301403,
|
|
"grad_norm": 2.282754898071289,
|
|
"learning_rate": 0.0001659113300492611,
|
|
"loss": 0.7028,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 17.474069554606466,
|
|
"grad_norm": 4.962581634521484,
|
|
"learning_rate": 0.00016571428571428575,
|
|
"loss": 0.6297,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 17.57169005491153,
|
|
"grad_norm": 2.602381944656372,
|
|
"learning_rate": 0.00016551724137931035,
|
|
"loss": 0.7003,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 17.669310555216594,
|
|
"grad_norm": 4.691868782043457,
|
|
"learning_rate": 0.000165320197044335,
|
|
"loss": 0.6993,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 17.76693105552166,
|
|
"grad_norm": 3.7989959716796875,
|
|
"learning_rate": 0.0001651231527093596,
|
|
"loss": 0.6644,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 17.864551555826722,
|
|
"grad_norm": 3.188518524169922,
|
|
"learning_rate": 0.00016492610837438425,
|
|
"loss": 0.6713,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 17.962172056131788,
|
|
"grad_norm": 3.8618476390838623,
|
|
"learning_rate": 0.00016472906403940885,
|
|
"loss": 0.6652,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 18.05979255643685,
|
|
"grad_norm": 3.6163158416748047,
|
|
"learning_rate": 0.0001645320197044335,
|
|
"loss": 0.667,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 18.157413056741916,
|
|
"grad_norm": 3.723688840866089,
|
|
"learning_rate": 0.00016433497536945814,
|
|
"loss": 0.6456,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 18.25503355704698,
|
|
"grad_norm": 4.452234268188477,
|
|
"learning_rate": 0.00016413793103448276,
|
|
"loss": 0.627,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 18.352654057352044,
|
|
"grad_norm": 3.0752596855163574,
|
|
"learning_rate": 0.0001639408866995074,
|
|
"loss": 0.6755,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 18.450274557657107,
|
|
"grad_norm": 3.043836832046509,
|
|
"learning_rate": 0.00016374384236453202,
|
|
"loss": 0.6861,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 18.547895057962172,
|
|
"grad_norm": 4.210402011871338,
|
|
"learning_rate": 0.00016354679802955667,
|
|
"loss": 0.6206,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 18.645515558267235,
|
|
"grad_norm": 3.4578044414520264,
|
|
"learning_rate": 0.0001633497536945813,
|
|
"loss": 0.633,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 18.7431360585723,
|
|
"grad_norm": 3.9487128257751465,
|
|
"learning_rate": 0.00016315270935960593,
|
|
"loss": 0.6479,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 18.840756558877363,
|
|
"grad_norm": 3.114673376083374,
|
|
"learning_rate": 0.00016295566502463055,
|
|
"loss": 0.6468,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 18.93837705918243,
|
|
"grad_norm": 3.7751824855804443,
|
|
"learning_rate": 0.00016275862068965518,
|
|
"loss": 0.6695,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 19.03599755948749,
|
|
"grad_norm": 2.7188830375671387,
|
|
"learning_rate": 0.0001625615763546798,
|
|
"loss": 0.6507,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 19.133618059792557,
|
|
"grad_norm": 3.5054094791412354,
|
|
"learning_rate": 0.00016236453201970446,
|
|
"loss": 0.5542,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 19.23123856009762,
|
|
"grad_norm": 2.4097495079040527,
|
|
"learning_rate": 0.00016216748768472906,
|
|
"loss": 0.602,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 19.328859060402685,
|
|
"grad_norm": 2.925482749938965,
|
|
"learning_rate": 0.00016197044334975372,
|
|
"loss": 0.6493,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 19.426479560707747,
|
|
"grad_norm": 4.706211566925049,
|
|
"learning_rate": 0.00016177339901477832,
|
|
"loss": 0.6285,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 19.524100061012813,
|
|
"grad_norm": 3.257904052734375,
|
|
"learning_rate": 0.00016157635467980297,
|
|
"loss": 0.6515,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 19.621720561317876,
|
|
"grad_norm": 3.0172128677368164,
|
|
"learning_rate": 0.0001613793103448276,
|
|
"loss": 0.6426,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 19.71934106162294,
|
|
"grad_norm": 2.948984146118164,
|
|
"learning_rate": 0.00016118226600985222,
|
|
"loss": 0.6487,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 19.816961561928004,
|
|
"grad_norm": 3.070138931274414,
|
|
"learning_rate": 0.00016098522167487685,
|
|
"loss": 0.6695,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 19.91458206223307,
|
|
"grad_norm": 3.364335060119629,
|
|
"learning_rate": 0.00016078817733990148,
|
|
"loss": 0.6443,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 20.012202562538132,
|
|
"grad_norm": 3.131267547607422,
|
|
"learning_rate": 0.00016059113300492613,
|
|
"loss": 0.6403,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 20.109823062843198,
|
|
"grad_norm": 2.4083542823791504,
|
|
"learning_rate": 0.00016039408866995073,
|
|
"loss": 0.5922,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 20.20744356314826,
|
|
"grad_norm": 4.872425556182861,
|
|
"learning_rate": 0.00016019704433497539,
|
|
"loss": 0.6166,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 20.305064063453326,
|
|
"grad_norm": 6.9143853187561035,
|
|
"learning_rate": 0.00016,
|
|
"loss": 0.6023,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 20.40268456375839,
|
|
"grad_norm": 2.4565210342407227,
|
|
"learning_rate": 0.00015980295566502464,
|
|
"loss": 0.6154,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 20.500305064063454,
|
|
"grad_norm": 2.886202096939087,
|
|
"learning_rate": 0.00015960591133004927,
|
|
"loss": 0.5861,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 20.597925564368516,
|
|
"grad_norm": 3.0811331272125244,
|
|
"learning_rate": 0.0001594088669950739,
|
|
"loss": 0.6445,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 20.695546064673582,
|
|
"grad_norm": 3.5066580772399902,
|
|
"learning_rate": 0.00015921182266009852,
|
|
"loss": 0.6133,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 20.793166564978645,
|
|
"grad_norm": 3.8073158264160156,
|
|
"learning_rate": 0.00015901477832512318,
|
|
"loss": 0.6133,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 20.89078706528371,
|
|
"grad_norm": 4.436833381652832,
|
|
"learning_rate": 0.00015881773399014778,
|
|
"loss": 0.6243,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 20.988407565588773,
|
|
"grad_norm": 2.7935214042663574,
|
|
"learning_rate": 0.00015862068965517243,
|
|
"loss": 0.6349,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 21.08602806589384,
|
|
"grad_norm": 3.224860668182373,
|
|
"learning_rate": 0.00015842364532019706,
|
|
"loss": 0.5906,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 21.1836485661989,
|
|
"grad_norm": 2.9267752170562744,
|
|
"learning_rate": 0.00015822660098522168,
|
|
"loss": 0.5512,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 21.281269066503967,
|
|
"grad_norm": 3.137066125869751,
|
|
"learning_rate": 0.0001580295566502463,
|
|
"loss": 0.5764,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 21.37888956680903,
|
|
"grad_norm": 3.112293004989624,
|
|
"learning_rate": 0.00015783251231527094,
|
|
"loss": 0.6045,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 21.476510067114095,
|
|
"grad_norm": 2.6162259578704834,
|
|
"learning_rate": 0.0001576354679802956,
|
|
"loss": 0.6009,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 21.574130567419157,
|
|
"grad_norm": 2.924473285675049,
|
|
"learning_rate": 0.0001574384236453202,
|
|
"loss": 0.589,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 21.671751067724223,
|
|
"grad_norm": 3.2589287757873535,
|
|
"learning_rate": 0.00015724137931034485,
|
|
"loss": 0.6078,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 21.769371568029285,
|
|
"grad_norm": 3.4130911827087402,
|
|
"learning_rate": 0.00015704433497536945,
|
|
"loss": 0.6177,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 21.86699206833435,
|
|
"grad_norm": 3.0816001892089844,
|
|
"learning_rate": 0.0001568472906403941,
|
|
"loss": 0.6077,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 21.964612568639414,
|
|
"grad_norm": 2.875441789627075,
|
|
"learning_rate": 0.00015665024630541873,
|
|
"loss": 0.6127,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 22.06223306894448,
|
|
"grad_norm": 4.020274639129639,
|
|
"learning_rate": 0.00015645320197044335,
|
|
"loss": 0.5673,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 22.15985356924954,
|
|
"grad_norm": 3.365691661834717,
|
|
"learning_rate": 0.00015625615763546798,
|
|
"loss": 0.5201,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 22.257474069554608,
|
|
"grad_norm": 3.449277400970459,
|
|
"learning_rate": 0.0001560591133004926,
|
|
"loss": 0.5657,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 22.35509456985967,
|
|
"grad_norm": 3.7012288570404053,
|
|
"learning_rate": 0.00015586206896551724,
|
|
"loss": 0.6035,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 22.452715070164736,
|
|
"grad_norm": 3.5211081504821777,
|
|
"learning_rate": 0.0001556650246305419,
|
|
"loss": 0.6173,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 22.550335570469798,
|
|
"grad_norm": 3.026588201522827,
|
|
"learning_rate": 0.00015546798029556652,
|
|
"loss": 0.6004,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 22.647956070774864,
|
|
"grad_norm": 2.7548885345458984,
|
|
"learning_rate": 0.00015527093596059114,
|
|
"loss": 0.5633,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 22.745576571079926,
|
|
"grad_norm": 5.050055027008057,
|
|
"learning_rate": 0.00015507389162561577,
|
|
"loss": 0.6061,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 22.843197071384992,
|
|
"grad_norm": 3.0278573036193848,
|
|
"learning_rate": 0.0001548768472906404,
|
|
"loss": 0.5607,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 22.940817571690054,
|
|
"grad_norm": 3.17149019241333,
|
|
"learning_rate": 0.00015467980295566505,
|
|
"loss": 0.5829,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 23.03843807199512,
|
|
"grad_norm": 2.5521585941314697,
|
|
"learning_rate": 0.00015448275862068965,
|
|
"loss": 0.5723,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 23.136058572300183,
|
|
"grad_norm": 2.7798378467559814,
|
|
"learning_rate": 0.0001542857142857143,
|
|
"loss": 0.5373,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 23.23367907260525,
|
|
"grad_norm": 3.4025466442108154,
|
|
"learning_rate": 0.0001540886699507389,
|
|
"loss": 0.5445,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 23.33129957291031,
|
|
"grad_norm": 3.9419145584106445,
|
|
"learning_rate": 0.00015389162561576356,
|
|
"loss": 0.5677,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 23.428920073215377,
|
|
"grad_norm": 2.300863265991211,
|
|
"learning_rate": 0.00015369458128078816,
|
|
"loss": 0.5941,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 23.52654057352044,
|
|
"grad_norm": 3.25654673576355,
|
|
"learning_rate": 0.00015349753694581282,
|
|
"loss": 0.5688,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 23.624161073825505,
|
|
"grad_norm": 3.1517579555511475,
|
|
"learning_rate": 0.00015330049261083744,
|
|
"loss": 0.5481,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 23.721781574130567,
|
|
"grad_norm": 2.5366251468658447,
|
|
"learning_rate": 0.00015310344827586207,
|
|
"loss": 0.5725,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 23.819402074435633,
|
|
"grad_norm": 4.309774875640869,
|
|
"learning_rate": 0.00015290640394088672,
|
|
"loss": 0.574,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 23.917022574740695,
|
|
"grad_norm": 3.031926155090332,
|
|
"learning_rate": 0.00015270935960591132,
|
|
"loss": 0.5431,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 24.01464307504576,
|
|
"grad_norm": 2.574500560760498,
|
|
"learning_rate": 0.00015251231527093598,
|
|
"loss": 0.5967,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 24.112263575350823,
|
|
"grad_norm": 2.556105136871338,
|
|
"learning_rate": 0.0001523152709359606,
|
|
"loss": 0.5419,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 24.20988407565589,
|
|
"grad_norm": 2.412322998046875,
|
|
"learning_rate": 0.00015211822660098523,
|
|
"loss": 0.5342,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 24.30750457596095,
|
|
"grad_norm": 2.39802622795105,
|
|
"learning_rate": 0.00015192118226600986,
|
|
"loss": 0.5249,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 24.405125076266017,
|
|
"grad_norm": 2.854398727416992,
|
|
"learning_rate": 0.00015172413793103449,
|
|
"loss": 0.5468,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 24.50274557657108,
|
|
"grad_norm": 2.8961057662963867,
|
|
"learning_rate": 0.0001515270935960591,
|
|
"loss": 0.5313,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 24.600366076876146,
|
|
"grad_norm": 3.2031073570251465,
|
|
"learning_rate": 0.00015133004926108377,
|
|
"loss": 0.5718,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 24.697986577181208,
|
|
"grad_norm": 4.338870525360107,
|
|
"learning_rate": 0.00015113300492610837,
|
|
"loss": 0.5415,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 24.795607077486274,
|
|
"grad_norm": 3.46842360496521,
|
|
"learning_rate": 0.00015093596059113302,
|
|
"loss": 0.5546,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 24.893227577791336,
|
|
"grad_norm": 2.853489637374878,
|
|
"learning_rate": 0.00015073891625615765,
|
|
"loss": 0.5691,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 24.990848078096402,
|
|
"grad_norm": 3.427720785140991,
|
|
"learning_rate": 0.00015054187192118228,
|
|
"loss": 0.5795,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 25.088468578401464,
|
|
"grad_norm": 3.2862656116485596,
|
|
"learning_rate": 0.0001503448275862069,
|
|
"loss": 0.5109,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 25.18608907870653,
|
|
"grad_norm": 3.383563756942749,
|
|
"learning_rate": 0.00015014778325123153,
|
|
"loss": 0.4983,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 25.283709579011592,
|
|
"grad_norm": 3.3909354209899902,
|
|
"learning_rate": 0.00014995073891625618,
|
|
"loss": 0.5164,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 25.381330079316655,
|
|
"grad_norm": 2.616955041885376,
|
|
"learning_rate": 0.00014975369458128078,
|
|
"loss": 0.5347,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 25.47895057962172,
|
|
"grad_norm": 2.7965965270996094,
|
|
"learning_rate": 0.00014955665024630544,
|
|
"loss": 0.5386,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 25.576571079926783,
|
|
"grad_norm": 2.9817397594451904,
|
|
"learning_rate": 0.00014935960591133004,
|
|
"loss": 0.5001,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 25.67419158023185,
|
|
"grad_norm": 2.527992010116577,
|
|
"learning_rate": 0.0001491625615763547,
|
|
"loss": 0.5572,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 25.77181208053691,
|
|
"grad_norm": 4.047604560852051,
|
|
"learning_rate": 0.00014896551724137932,
|
|
"loss": 0.5429,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 25.869432580841977,
|
|
"grad_norm": 3.2753515243530273,
|
|
"learning_rate": 0.00014876847290640395,
|
|
"loss": 0.5461,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 25.96705308114704,
|
|
"grad_norm": 3.5623252391815186,
|
|
"learning_rate": 0.00014857142857142857,
|
|
"loss": 0.571,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 26.064673581452105,
|
|
"grad_norm": 4.602993965148926,
|
|
"learning_rate": 0.0001483743842364532,
|
|
"loss": 0.4858,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 26.162294081757167,
|
|
"grad_norm": 3.4932191371917725,
|
|
"learning_rate": 0.00014817733990147783,
|
|
"loss": 0.5374,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 26.259914582062233,
|
|
"grad_norm": 2.595555305480957,
|
|
"learning_rate": 0.00014798029556650248,
|
|
"loss": 0.5217,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 26.357535082367296,
|
|
"grad_norm": 2.3642492294311523,
|
|
"learning_rate": 0.0001477832512315271,
|
|
"loss": 0.5055,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 26.45515558267236,
|
|
"grad_norm": 3.9272634983062744,
|
|
"learning_rate": 0.00014758620689655174,
|
|
"loss": 0.5535,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 26.552776082977424,
|
|
"grad_norm": 4.050607204437256,
|
|
"learning_rate": 0.00014738916256157636,
|
|
"loss": 0.5019,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 26.65039658328249,
|
|
"grad_norm": 3.2770299911499023,
|
|
"learning_rate": 0.000147192118226601,
|
|
"loss": 0.4922,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 26.748017083587552,
|
|
"grad_norm": 3.96409273147583,
|
|
"learning_rate": 0.00014699507389162562,
|
|
"loss": 0.5165,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 26.845637583892618,
|
|
"grad_norm": 4.587811470031738,
|
|
"learning_rate": 0.00014679802955665024,
|
|
"loss": 0.5513,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 26.94325808419768,
|
|
"grad_norm": 4.558196067810059,
|
|
"learning_rate": 0.0001466009852216749,
|
|
"loss": 0.5227,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 27.040878584502746,
|
|
"grad_norm": 3.807441473007202,
|
|
"learning_rate": 0.0001464039408866995,
|
|
"loss": 0.5141,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 27.13849908480781,
|
|
"grad_norm": 2.2902328968048096,
|
|
"learning_rate": 0.00014620689655172415,
|
|
"loss": 0.4822,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 27.236119585112874,
|
|
"grad_norm": 4.3950886726379395,
|
|
"learning_rate": 0.00014600985221674875,
|
|
"loss": 0.5136,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 27.333740085417936,
|
|
"grad_norm": 4.0127482414245605,
|
|
"learning_rate": 0.0001458128078817734,
|
|
"loss": 0.5299,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 27.431360585723002,
|
|
"grad_norm": 4.659334182739258,
|
|
"learning_rate": 0.00014561576354679803,
|
|
"loss": 0.4764,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 27.528981086028065,
|
|
"grad_norm": 4.769715785980225,
|
|
"learning_rate": 0.00014541871921182266,
|
|
"loss": 0.5236,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 27.62660158633313,
|
|
"grad_norm": 3.8856427669525146,
|
|
"learning_rate": 0.00014522167487684732,
|
|
"loss": 0.5028,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 27.724222086638193,
|
|
"grad_norm": 3.183850049972534,
|
|
"learning_rate": 0.00014502463054187192,
|
|
"loss": 0.4945,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 27.82184258694326,
|
|
"grad_norm": 3.1610593795776367,
|
|
"learning_rate": 0.00014482758620689657,
|
|
"loss": 0.4963,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 27.91946308724832,
|
|
"grad_norm": 4.054819107055664,
|
|
"learning_rate": 0.0001446305418719212,
|
|
"loss": 0.547,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 28.017083587553387,
|
|
"grad_norm": 2.7358503341674805,
|
|
"learning_rate": 0.00014443349753694582,
|
|
"loss": 0.5387,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 28.11470408785845,
|
|
"grad_norm": 2.403042793273926,
|
|
"learning_rate": 0.00014423645320197045,
|
|
"loss": 0.4593,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 28.212324588163515,
|
|
"grad_norm": 3.3207452297210693,
|
|
"learning_rate": 0.00014403940886699508,
|
|
"loss": 0.4842,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 28.309945088468577,
|
|
"grad_norm": 3.0579757690429688,
|
|
"learning_rate": 0.0001438423645320197,
|
|
"loss": 0.4754,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 28.407565588773643,
|
|
"grad_norm": 4.5140700340271,
|
|
"learning_rate": 0.00014364532019704436,
|
|
"loss": 0.5128,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 28.505186089078705,
|
|
"grad_norm": 3.541874885559082,
|
|
"learning_rate": 0.00014344827586206896,
|
|
"loss": 0.5187,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 28.60280658938377,
|
|
"grad_norm": 3.214235782623291,
|
|
"learning_rate": 0.00014325123152709361,
|
|
"loss": 0.475,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 28.700427089688834,
|
|
"grad_norm": 4.037768363952637,
|
|
"learning_rate": 0.00014305418719211824,
|
|
"loss": 0.4733,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 28.7980475899939,
|
|
"grad_norm": 3.0469048023223877,
|
|
"learning_rate": 0.00014285714285714287,
|
|
"loss": 0.5181,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 28.89566809029896,
|
|
"grad_norm": 3.3396294116973877,
|
|
"learning_rate": 0.0001426600985221675,
|
|
"loss": 0.5062,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 28.993288590604028,
|
|
"grad_norm": 3.4280455112457275,
|
|
"learning_rate": 0.00014246305418719212,
|
|
"loss": 0.5232,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 29.09090909090909,
|
|
"grad_norm": 3.8690781593322754,
|
|
"learning_rate": 0.00014226600985221678,
|
|
"loss": 0.4744,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 29.188529591214156,
|
|
"grad_norm": 3.1680831909179688,
|
|
"learning_rate": 0.00014206896551724138,
|
|
"loss": 0.4679,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 29.286150091519218,
|
|
"grad_norm": 3.752593755722046,
|
|
"learning_rate": 0.00014187192118226603,
|
|
"loss": 0.444,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 29.383770591824284,
|
|
"grad_norm": 4.88236141204834,
|
|
"learning_rate": 0.00014167487684729063,
|
|
"loss": 0.4639,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 29.481391092129346,
|
|
"grad_norm": 3.7870137691497803,
|
|
"learning_rate": 0.00014147783251231528,
|
|
"loss": 0.4873,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 29.579011592434412,
|
|
"grad_norm": 3.091411590576172,
|
|
"learning_rate": 0.0001412807881773399,
|
|
"loss": 0.4834,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 29.676632092739474,
|
|
"grad_norm": 2.7498538494110107,
|
|
"learning_rate": 0.00014108374384236454,
|
|
"loss": 0.4846,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 29.77425259304454,
|
|
"grad_norm": 3.2043850421905518,
|
|
"learning_rate": 0.00014088669950738917,
|
|
"loss": 0.4983,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 29.871873093349603,
|
|
"grad_norm": 3.270357847213745,
|
|
"learning_rate": 0.0001406896551724138,
|
|
"loss": 0.4803,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 29.96949359365467,
|
|
"grad_norm": 3.031405210494995,
|
|
"learning_rate": 0.00014049261083743842,
|
|
"loss": 0.5287,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 30.06711409395973,
|
|
"grad_norm": 3.390765905380249,
|
|
"learning_rate": 0.00014029556650246307,
|
|
"loss": 0.4619,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 30.164734594264797,
|
|
"grad_norm": 3.2783963680267334,
|
|
"learning_rate": 0.0001400985221674877,
|
|
"loss": 0.4328,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 30.26235509456986,
|
|
"grad_norm": 3.6925759315490723,
|
|
"learning_rate": 0.00013990147783251233,
|
|
"loss": 0.487,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 30.359975594874925,
|
|
"grad_norm": 3.0115065574645996,
|
|
"learning_rate": 0.00013970443349753696,
|
|
"loss": 0.467,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 30.457596095179987,
|
|
"grad_norm": 4.561310291290283,
|
|
"learning_rate": 0.00013950738916256158,
|
|
"loss": 0.4801,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 30.555216595485053,
|
|
"grad_norm": 3.2879674434661865,
|
|
"learning_rate": 0.0001393103448275862,
|
|
"loss": 0.4638,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 30.652837095790115,
|
|
"grad_norm": 2.793945789337158,
|
|
"learning_rate": 0.00013911330049261084,
|
|
"loss": 0.463,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 30.75045759609518,
|
|
"grad_norm": 3.615793466567993,
|
|
"learning_rate": 0.0001389162561576355,
|
|
"loss": 0.4907,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 30.848078096400243,
|
|
"grad_norm": 3.160133123397827,
|
|
"learning_rate": 0.0001387192118226601,
|
|
"loss": 0.477,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 30.94569859670531,
|
|
"grad_norm": 3.62670636177063,
|
|
"learning_rate": 0.00013852216748768475,
|
|
"loss": 0.4945,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 31.04331909701037,
|
|
"grad_norm": 3.346158981323242,
|
|
"learning_rate": 0.00013832512315270935,
|
|
"loss": 0.4543,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 31.140939597315437,
|
|
"grad_norm": 2.8707423210144043,
|
|
"learning_rate": 0.000138128078817734,
|
|
"loss": 0.4352,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 31.2385600976205,
|
|
"grad_norm": 2.5617620944976807,
|
|
"learning_rate": 0.00013793103448275863,
|
|
"loss": 0.4611,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 31.336180597925566,
|
|
"grad_norm": 3.2273828983306885,
|
|
"learning_rate": 0.00013773399014778325,
|
|
"loss": 0.4593,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 31.433801098230628,
|
|
"grad_norm": 3.502797842025757,
|
|
"learning_rate": 0.00013753694581280788,
|
|
"loss": 0.4717,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 31.531421598535694,
|
|
"grad_norm": 3.9278218746185303,
|
|
"learning_rate": 0.0001373399014778325,
|
|
"loss": 0.4813,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 31.629042098840756,
|
|
"grad_norm": 3.013709545135498,
|
|
"learning_rate": 0.00013714285714285716,
|
|
"loss": 0.4305,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 31.726662599145822,
|
|
"grad_norm": 2.661198377609253,
|
|
"learning_rate": 0.0001369458128078818,
|
|
"loss": 0.4495,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 31.824283099450884,
|
|
"grad_norm": 2.6343297958374023,
|
|
"learning_rate": 0.00013674876847290642,
|
|
"loss": 0.4809,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 31.92190359975595,
|
|
"grad_norm": 6.334170818328857,
|
|
"learning_rate": 0.00013655172413793104,
|
|
"loss": 0.4576,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 32.01952410006101,
|
|
"grad_norm": 3.728727102279663,
|
|
"learning_rate": 0.00013635467980295567,
|
|
"loss": 0.5034,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 32.117144600366075,
|
|
"grad_norm": 2.0572702884674072,
|
|
"learning_rate": 0.0001361576354679803,
|
|
"loss": 0.4161,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 32.214765100671144,
|
|
"grad_norm": 2.7006356716156006,
|
|
"learning_rate": 0.00013596059113300492,
|
|
"loss": 0.4357,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 32.31238560097621,
|
|
"grad_norm": 3.526782989501953,
|
|
"learning_rate": 0.00013576354679802955,
|
|
"loss": 0.4367,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 32.41000610128127,
|
|
"grad_norm": 3.240647792816162,
|
|
"learning_rate": 0.0001355665024630542,
|
|
"loss": 0.4416,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 32.50762660158633,
|
|
"grad_norm": 2.965851306915283,
|
|
"learning_rate": 0.0001353694581280788,
|
|
"loss": 0.4649,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 32.6052471018914,
|
|
"grad_norm": 3.028812885284424,
|
|
"learning_rate": 0.00013517241379310346,
|
|
"loss": 0.4381,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 32.70286760219646,
|
|
"grad_norm": 4.041370391845703,
|
|
"learning_rate": 0.0001349753694581281,
|
|
"loss": 0.4671,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 32.800488102501525,
|
|
"grad_norm": 5.677656650543213,
|
|
"learning_rate": 0.00013477832512315271,
|
|
"loss": 0.4718,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 32.89810860280659,
|
|
"grad_norm": 3.1538727283477783,
|
|
"learning_rate": 0.00013458128078817737,
|
|
"loss": 0.4705,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"epoch": 32.99572910311166,
|
|
"grad_norm": 3.8186867237091064,
|
|
"learning_rate": 0.00013438423645320197,
|
|
"loss": 0.4724,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"epoch": 33.09334960341672,
|
|
"grad_norm": 2.8248584270477295,
|
|
"learning_rate": 0.00013418719211822662,
|
|
"loss": 0.4399,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"epoch": 33.19097010372178,
|
|
"grad_norm": 2.2694895267486572,
|
|
"learning_rate": 0.00013399014778325122,
|
|
"loss": 0.4147,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 33.288590604026844,
|
|
"grad_norm": 3.305610418319702,
|
|
"learning_rate": 0.00013379310344827588,
|
|
"loss": 0.4028,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"epoch": 33.38621110433191,
|
|
"grad_norm": 3.610136032104492,
|
|
"learning_rate": 0.0001335960591133005,
|
|
"loss": 0.4319,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"epoch": 33.483831604636975,
|
|
"grad_norm": 3.4783689975738525,
|
|
"learning_rate": 0.00013339901477832513,
|
|
"loss": 0.4361,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"epoch": 33.58145210494204,
|
|
"grad_norm": 3.0984203815460205,
|
|
"learning_rate": 0.00013320197044334976,
|
|
"loss": 0.4488,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"epoch": 33.6790726052471,
|
|
"grad_norm": 3.1558122634887695,
|
|
"learning_rate": 0.00013300492610837438,
|
|
"loss": 0.4262,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 33.77669310555217,
|
|
"grad_norm": 4.813379764556885,
|
|
"learning_rate": 0.000132807881773399,
|
|
"loss": 0.452,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"epoch": 33.87431360585723,
|
|
"grad_norm": 3.047551393508911,
|
|
"learning_rate": 0.00013261083743842364,
|
|
"loss": 0.4517,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"epoch": 33.971934106162294,
|
|
"grad_norm": 3.0880701541900635,
|
|
"learning_rate": 0.0001324137931034483,
|
|
"loss": 0.5147,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"epoch": 34.06955460646736,
|
|
"grad_norm": 2.824169874191284,
|
|
"learning_rate": 0.00013221674876847292,
|
|
"loss": 0.4017,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"epoch": 34.16717510677242,
|
|
"grad_norm": 3.1136012077331543,
|
|
"learning_rate": 0.00013201970443349755,
|
|
"loss": 0.4291,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 34.26479560707749,
|
|
"grad_norm": 4.246958255767822,
|
|
"learning_rate": 0.00013182266009852217,
|
|
"loss": 0.4318,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"epoch": 34.36241610738255,
|
|
"grad_norm": 2.4655661582946777,
|
|
"learning_rate": 0.0001316256157635468,
|
|
"loss": 0.4283,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"epoch": 34.46003660768761,
|
|
"grad_norm": 4.322596549987793,
|
|
"learning_rate": 0.00013142857142857143,
|
|
"loss": 0.4323,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"epoch": 34.557657107992675,
|
|
"grad_norm": 4.425800800323486,
|
|
"learning_rate": 0.00013123152709359608,
|
|
"loss": 0.4376,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 34.655277608297745,
|
|
"grad_norm": 3.796889305114746,
|
|
"learning_rate": 0.00013103448275862068,
|
|
"loss": 0.4276,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 34.75289810860281,
|
|
"grad_norm": 3.9222586154937744,
|
|
"learning_rate": 0.00013083743842364534,
|
|
"loss": 0.4658,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"epoch": 34.85051860890787,
|
|
"grad_norm": 4.5007548332214355,
|
|
"learning_rate": 0.00013064039408866994,
|
|
"loss": 0.4293,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"epoch": 34.94813910921293,
|
|
"grad_norm": 3.0858423709869385,
|
|
"learning_rate": 0.0001304433497536946,
|
|
"loss": 0.4214,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"epoch": 35.045759609518,
|
|
"grad_norm": 3.586949586868286,
|
|
"learning_rate": 0.00013024630541871922,
|
|
"loss": 0.4199,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"epoch": 35.14338010982306,
|
|
"grad_norm": 2.916937828063965,
|
|
"learning_rate": 0.00013004926108374385,
|
|
"loss": 0.4071,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 35.241000610128125,
|
|
"grad_norm": 3.1324169635772705,
|
|
"learning_rate": 0.00012985221674876847,
|
|
"loss": 0.4151,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"epoch": 35.33862111043319,
|
|
"grad_norm": 2.8730344772338867,
|
|
"learning_rate": 0.0001296551724137931,
|
|
"loss": 0.3984,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"epoch": 35.43624161073826,
|
|
"grad_norm": 3.0865273475646973,
|
|
"learning_rate": 0.00012945812807881775,
|
|
"loss": 0.4273,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"epoch": 35.53386211104332,
|
|
"grad_norm": 4.397771835327148,
|
|
"learning_rate": 0.00012926108374384238,
|
|
"loss": 0.4232,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"epoch": 35.63148261134838,
|
|
"grad_norm": 2.4203243255615234,
|
|
"learning_rate": 0.000129064039408867,
|
|
"loss": 0.4035,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 35.729103111653444,
|
|
"grad_norm": 2.94404673576355,
|
|
"learning_rate": 0.00012886699507389164,
|
|
"loss": 0.4332,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"epoch": 35.82672361195851,
|
|
"grad_norm": 3.4141249656677246,
|
|
"learning_rate": 0.00012866995073891626,
|
|
"loss": 0.4484,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"epoch": 35.924344112263576,
|
|
"grad_norm": 2.8227927684783936,
|
|
"learning_rate": 0.0001284729064039409,
|
|
"loss": 0.4509,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"epoch": 36.02196461256864,
|
|
"grad_norm": 2.768937110900879,
|
|
"learning_rate": 0.00012827586206896552,
|
|
"loss": 0.4391,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"epoch": 36.1195851128737,
|
|
"grad_norm": 4.155871391296387,
|
|
"learning_rate": 0.00012807881773399014,
|
|
"loss": 0.3954,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 36.21720561317877,
|
|
"grad_norm": 2.484731912612915,
|
|
"learning_rate": 0.0001278817733990148,
|
|
"loss": 0.4363,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"epoch": 36.31482611348383,
|
|
"grad_norm": 2.7758595943450928,
|
|
"learning_rate": 0.0001276847290640394,
|
|
"loss": 0.4058,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"epoch": 36.412446613788894,
|
|
"grad_norm": 3.9609923362731934,
|
|
"learning_rate": 0.00012748768472906405,
|
|
"loss": 0.3845,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"epoch": 36.51006711409396,
|
|
"grad_norm": 3.963120222091675,
|
|
"learning_rate": 0.00012729064039408868,
|
|
"loss": 0.4301,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"epoch": 36.607687614399026,
|
|
"grad_norm": 2.77718448638916,
|
|
"learning_rate": 0.0001270935960591133,
|
|
"loss": 0.4034,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 36.70530811470409,
|
|
"grad_norm": 3.6000113487243652,
|
|
"learning_rate": 0.00012689655172413793,
|
|
"loss": 0.4087,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"epoch": 36.80292861500915,
|
|
"grad_norm": 3.4430975914001465,
|
|
"learning_rate": 0.00012669950738916256,
|
|
"loss": 0.4109,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"epoch": 36.90054911531421,
|
|
"grad_norm": 3.3932645320892334,
|
|
"learning_rate": 0.00012650246305418721,
|
|
"loss": 0.4394,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"epoch": 36.99816961561928,
|
|
"grad_norm": 4.054554462432861,
|
|
"learning_rate": 0.00012630541871921181,
|
|
"loss": 0.4203,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"epoch": 37.095790115924345,
|
|
"grad_norm": 2.8766210079193115,
|
|
"learning_rate": 0.00012610837438423647,
|
|
"loss": 0.3861,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 37.19341061622941,
|
|
"grad_norm": 4.115131855010986,
|
|
"learning_rate": 0.0001259113300492611,
|
|
"loss": 0.4236,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"epoch": 37.29103111653447,
|
|
"grad_norm": 2.776914358139038,
|
|
"learning_rate": 0.00012571428571428572,
|
|
"loss": 0.4244,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"epoch": 37.38865161683954,
|
|
"grad_norm": 3.8428800106048584,
|
|
"learning_rate": 0.00012551724137931035,
|
|
"loss": 0.4028,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"epoch": 37.4862721171446,
|
|
"grad_norm": 3.028683662414551,
|
|
"learning_rate": 0.00012532019704433498,
|
|
"loss": 0.4127,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"epoch": 37.58389261744966,
|
|
"grad_norm": 2.678617477416992,
|
|
"learning_rate": 0.0001251231527093596,
|
|
"loss": 0.4251,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"epoch": 37.681513117754726,
|
|
"grad_norm": 3.496917247772217,
|
|
"learning_rate": 0.00012492610837438423,
|
|
"loss": 0.404,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"epoch": 37.779133618059795,
|
|
"grad_norm": 4.018653869628906,
|
|
"learning_rate": 0.00012472906403940889,
|
|
"loss": 0.4028,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"epoch": 37.87675411836486,
|
|
"grad_norm": 3.317580223083496,
|
|
"learning_rate": 0.0001245320197044335,
|
|
"loss": 0.4032,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"epoch": 37.97437461866992,
|
|
"grad_norm": 3.7693002223968506,
|
|
"learning_rate": 0.00012433497536945814,
|
|
"loss": 0.3935,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"epoch": 38.07199511897498,
|
|
"grad_norm": 2.809558629989624,
|
|
"learning_rate": 0.00012413793103448277,
|
|
"loss": 0.4113,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"epoch": 38.16961561928005,
|
|
"grad_norm": 3.2092092037200928,
|
|
"learning_rate": 0.0001239408866995074,
|
|
"loss": 0.4019,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"epoch": 38.267236119585114,
|
|
"grad_norm": 3.3514404296875,
|
|
"learning_rate": 0.00012374384236453202,
|
|
"loss": 0.4013,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"epoch": 38.364856619890176,
|
|
"grad_norm": 3.9514451026916504,
|
|
"learning_rate": 0.00012354679802955667,
|
|
"loss": 0.3889,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"epoch": 38.46247712019524,
|
|
"grad_norm": 2.7896828651428223,
|
|
"learning_rate": 0.00012334975369458127,
|
|
"loss": 0.377,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"epoch": 38.56009762050031,
|
|
"grad_norm": 3.522840738296509,
|
|
"learning_rate": 0.00012315270935960593,
|
|
"loss": 0.4158,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"epoch": 38.65771812080537,
|
|
"grad_norm": 3.422250270843506,
|
|
"learning_rate": 0.00012295566502463053,
|
|
"loss": 0.3837,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"epoch": 38.75533862111043,
|
|
"grad_norm": 3.0469913482666016,
|
|
"learning_rate": 0.00012275862068965518,
|
|
"loss": 0.4036,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"epoch": 38.852959121415495,
|
|
"grad_norm": 2.904141664505005,
|
|
"learning_rate": 0.0001225615763546798,
|
|
"loss": 0.3928,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"epoch": 38.950579621720564,
|
|
"grad_norm": 3.7538552284240723,
|
|
"learning_rate": 0.00012236453201970444,
|
|
"loss": 0.4092,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"epoch": 39.04820012202563,
|
|
"grad_norm": 3.562114715576172,
|
|
"learning_rate": 0.00012216748768472906,
|
|
"loss": 0.3982,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 39.14582062233069,
|
|
"grad_norm": 2.4931962490081787,
|
|
"learning_rate": 0.00012197044334975369,
|
|
"loss": 0.3547,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"epoch": 39.24344112263575,
|
|
"grad_norm": 2.461050271987915,
|
|
"learning_rate": 0.00012177339901477833,
|
|
"loss": 0.3762,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"epoch": 39.34106162294082,
|
|
"grad_norm": 3.1320595741271973,
|
|
"learning_rate": 0.00012157635467980295,
|
|
"loss": 0.3907,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"epoch": 39.43868212324588,
|
|
"grad_norm": 3.044754981994629,
|
|
"learning_rate": 0.00012137931034482759,
|
|
"loss": 0.4068,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"epoch": 39.536302623550945,
|
|
"grad_norm": 2.9243273735046387,
|
|
"learning_rate": 0.00012118226600985223,
|
|
"loss": 0.3903,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 39.63392312385601,
|
|
"grad_norm": 4.234837055206299,
|
|
"learning_rate": 0.00012098522167487685,
|
|
"loss": 0.3841,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"epoch": 39.73154362416108,
|
|
"grad_norm": 3.993495464324951,
|
|
"learning_rate": 0.00012078817733990148,
|
|
"loss": 0.4082,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"epoch": 39.82916412446614,
|
|
"grad_norm": 3.8363142013549805,
|
|
"learning_rate": 0.00012059113300492611,
|
|
"loss": 0.3939,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"epoch": 39.9267846247712,
|
|
"grad_norm": 4.398952007293701,
|
|
"learning_rate": 0.00012039408866995075,
|
|
"loss": 0.4145,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"epoch": 40.024405125076264,
|
|
"grad_norm": 2.7002291679382324,
|
|
"learning_rate": 0.00012019704433497539,
|
|
"loss": 0.386,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"epoch": 40.12202562538133,
|
|
"grad_norm": 3.1867945194244385,
|
|
"learning_rate": 0.00012,
|
|
"loss": 0.3924,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"epoch": 40.219646125686396,
|
|
"grad_norm": 2.9179584980010986,
|
|
"learning_rate": 0.00011980295566502464,
|
|
"loss": 0.3741,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"epoch": 40.31726662599146,
|
|
"grad_norm": 5.108730316162109,
|
|
"learning_rate": 0.00011960591133004926,
|
|
"loss": 0.371,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"epoch": 40.41488712629652,
|
|
"grad_norm": 3.4418270587921143,
|
|
"learning_rate": 0.0001194088669950739,
|
|
"loss": 0.3845,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"epoch": 40.51250762660159,
|
|
"grad_norm": 3.245562791824341,
|
|
"learning_rate": 0.00011921182266009854,
|
|
"loss": 0.375,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"epoch": 40.61012812690665,
|
|
"grad_norm": 2.6644446849823,
|
|
"learning_rate": 0.00011901477832512315,
|
|
"loss": 0.3839,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"epoch": 40.707748627211714,
|
|
"grad_norm": 4.975727558135986,
|
|
"learning_rate": 0.00011881773399014779,
|
|
"loss": 0.3889,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"epoch": 40.80536912751678,
|
|
"grad_norm": 3.6427066326141357,
|
|
"learning_rate": 0.0001186206896551724,
|
|
"loss": 0.393,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"epoch": 40.902989627821846,
|
|
"grad_norm": 3.7799060344696045,
|
|
"learning_rate": 0.00011842364532019705,
|
|
"loss": 0.3894,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"epoch": 41.00061012812691,
|
|
"grad_norm": 4.170138835906982,
|
|
"learning_rate": 0.00011822660098522169,
|
|
"loss": 0.3965,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"epoch": 41.09823062843197,
|
|
"grad_norm": 2.660006523132324,
|
|
"learning_rate": 0.00011802955665024631,
|
|
"loss": 0.3412,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"epoch": 41.19585112873703,
|
|
"grad_norm": 3.9118030071258545,
|
|
"learning_rate": 0.00011783251231527096,
|
|
"loss": 0.3608,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"epoch": 41.2934716290421,
|
|
"grad_norm": 4.68622350692749,
|
|
"learning_rate": 0.00011763546798029557,
|
|
"loss": 0.3742,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"epoch": 41.391092129347165,
|
|
"grad_norm": 2.5423784255981445,
|
|
"learning_rate": 0.00011743842364532021,
|
|
"loss": 0.3901,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"epoch": 41.48871262965223,
|
|
"grad_norm": 3.6446280479431152,
|
|
"learning_rate": 0.00011724137931034482,
|
|
"loss": 0.3518,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 41.58633312995729,
|
|
"grad_norm": 2.6701178550720215,
|
|
"learning_rate": 0.00011704433497536946,
|
|
"loss": 0.3809,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"epoch": 41.68395363026236,
|
|
"grad_norm": 3.226100206375122,
|
|
"learning_rate": 0.0001168472906403941,
|
|
"loss": 0.3834,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"epoch": 41.78157413056742,
|
|
"grad_norm": 3.4181952476501465,
|
|
"learning_rate": 0.00011665024630541872,
|
|
"loss": 0.4098,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"epoch": 41.87919463087248,
|
|
"grad_norm": 2.9190330505371094,
|
|
"learning_rate": 0.00011645320197044336,
|
|
"loss": 0.3838,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"epoch": 41.976815131177545,
|
|
"grad_norm": 4.082178115844727,
|
|
"learning_rate": 0.00011625615763546797,
|
|
"loss": 0.4109,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"epoch": 42.074435631482615,
|
|
"grad_norm": 2.899162530899048,
|
|
"learning_rate": 0.00011605911330049261,
|
|
"loss": 0.3624,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"epoch": 42.17205613178768,
|
|
"grad_norm": 2.4065990447998047,
|
|
"learning_rate": 0.00011586206896551725,
|
|
"loss": 0.3573,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"epoch": 42.26967663209274,
|
|
"grad_norm": 2.818037509918213,
|
|
"learning_rate": 0.00011566502463054188,
|
|
"loss": 0.3699,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"epoch": 42.3672971323978,
|
|
"grad_norm": 2.8875226974487305,
|
|
"learning_rate": 0.00011546798029556651,
|
|
"loss": 0.3489,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"epoch": 42.464917632702864,
|
|
"grad_norm": 3.0840396881103516,
|
|
"learning_rate": 0.00011527093596059113,
|
|
"loss": 0.3733,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"epoch": 42.56253813300793,
|
|
"grad_norm": 2.6554925441741943,
|
|
"learning_rate": 0.00011507389162561578,
|
|
"loss": 0.3541,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"epoch": 42.660158633312996,
|
|
"grad_norm": 2.766045331954956,
|
|
"learning_rate": 0.00011487684729064042,
|
|
"loss": 0.3682,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"epoch": 42.75777913361806,
|
|
"grad_norm": 3.0672762393951416,
|
|
"learning_rate": 0.00011467980295566503,
|
|
"loss": 0.3943,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"epoch": 42.85539963392312,
|
|
"grad_norm": 2.898484468460083,
|
|
"learning_rate": 0.00011448275862068967,
|
|
"loss": 0.3702,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"epoch": 42.95302013422819,
|
|
"grad_norm": 2.7023797035217285,
|
|
"learning_rate": 0.00011428571428571428,
|
|
"loss": 0.388,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"epoch": 43.05064063453325,
|
|
"grad_norm": 2.4088499546051025,
|
|
"learning_rate": 0.00011408866995073892,
|
|
"loss": 0.3615,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"epoch": 43.148261134838314,
|
|
"grad_norm": 2.3739655017852783,
|
|
"learning_rate": 0.00011389162561576354,
|
|
"loss": 0.3703,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"epoch": 43.24588163514338,
|
|
"grad_norm": 3.2558271884918213,
|
|
"learning_rate": 0.00011369458128078818,
|
|
"loss": 0.3478,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"epoch": 43.343502135448446,
|
|
"grad_norm": 2.931380271911621,
|
|
"learning_rate": 0.00011349753694581282,
|
|
"loss": 0.3553,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"epoch": 43.44112263575351,
|
|
"grad_norm": 2.5165908336639404,
|
|
"learning_rate": 0.00011330049261083743,
|
|
"loss": 0.3495,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"epoch": 43.53874313605857,
|
|
"grad_norm": 3.5619068145751953,
|
|
"learning_rate": 0.00011310344827586207,
|
|
"loss": 0.3692,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"epoch": 43.63636363636363,
|
|
"grad_norm": 2.39534068107605,
|
|
"learning_rate": 0.0001129064039408867,
|
|
"loss": 0.3674,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"epoch": 43.7339841366687,
|
|
"grad_norm": 3.495316505432129,
|
|
"learning_rate": 0.00011270935960591134,
|
|
"loss": 0.367,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"epoch": 43.831604636973765,
|
|
"grad_norm": 2.8195016384124756,
|
|
"learning_rate": 0.00011251231527093598,
|
|
"loss": 0.411,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"epoch": 43.92922513727883,
|
|
"grad_norm": 3.446014165878296,
|
|
"learning_rate": 0.0001123152709359606,
|
|
"loss": 0.3774,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 44.02684563758389,
|
|
"grad_norm": 3.0228703022003174,
|
|
"learning_rate": 0.00011211822660098524,
|
|
"loss": 0.3479,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"epoch": 44.12446613788896,
|
|
"grad_norm": 4.042842864990234,
|
|
"learning_rate": 0.00011192118226600985,
|
|
"loss": 0.3567,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"epoch": 44.22208663819402,
|
|
"grad_norm": 2.5165748596191406,
|
|
"learning_rate": 0.00011172413793103449,
|
|
"loss": 0.357,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"epoch": 44.31970713849908,
|
|
"grad_norm": 2.9104301929473877,
|
|
"learning_rate": 0.00011152709359605913,
|
|
"loss": 0.3478,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"epoch": 44.417327638804146,
|
|
"grad_norm": 5.000180244445801,
|
|
"learning_rate": 0.00011133004926108374,
|
|
"loss": 0.3372,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"epoch": 44.514948139109215,
|
|
"grad_norm": 2.7573766708374023,
|
|
"learning_rate": 0.00011113300492610838,
|
|
"loss": 0.3574,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"epoch": 44.61256863941428,
|
|
"grad_norm": 3.473818778991699,
|
|
"learning_rate": 0.000110935960591133,
|
|
"loss": 0.3666,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"epoch": 44.71018913971934,
|
|
"grad_norm": 4.236100196838379,
|
|
"learning_rate": 0.00011073891625615764,
|
|
"loss": 0.3612,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"epoch": 44.8078096400244,
|
|
"grad_norm": 5.279041290283203,
|
|
"learning_rate": 0.00011054187192118227,
|
|
"loss": 0.3694,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"epoch": 44.90543014032947,
|
|
"grad_norm": 3.0009076595306396,
|
|
"learning_rate": 0.0001103448275862069,
|
|
"loss": 0.3629,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"epoch": 45.003050640634534,
|
|
"grad_norm": 3.358452796936035,
|
|
"learning_rate": 0.00011014778325123153,
|
|
"loss": 0.3584,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"epoch": 45.100671140939596,
|
|
"grad_norm": 2.9341399669647217,
|
|
"learning_rate": 0.00010995073891625616,
|
|
"loss": 0.3437,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"epoch": 45.19829164124466,
|
|
"grad_norm": 3.1249337196350098,
|
|
"learning_rate": 0.0001097536945812808,
|
|
"loss": 0.3551,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"epoch": 45.29591214154973,
|
|
"grad_norm": 2.4878969192504883,
|
|
"learning_rate": 0.00010955665024630541,
|
|
"loss": 0.3379,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"epoch": 45.39353264185479,
|
|
"grad_norm": 3.114165782928467,
|
|
"learning_rate": 0.00010935960591133006,
|
|
"loss": 0.3616,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"epoch": 45.49115314215985,
|
|
"grad_norm": 3.0727782249450684,
|
|
"learning_rate": 0.0001091625615763547,
|
|
"loss": 0.348,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"epoch": 45.588773642464915,
|
|
"grad_norm": 2.9487972259521484,
|
|
"learning_rate": 0.00010896551724137931,
|
|
"loss": 0.3397,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"epoch": 45.686394142769984,
|
|
"grad_norm": 3.0654473304748535,
|
|
"learning_rate": 0.00010876847290640395,
|
|
"loss": 0.3515,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"epoch": 45.78401464307505,
|
|
"grad_norm": 4.303600311279297,
|
|
"learning_rate": 0.00010857142857142856,
|
|
"loss": 0.3586,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"epoch": 45.88163514338011,
|
|
"grad_norm": 2.946246385574341,
|
|
"learning_rate": 0.0001083743842364532,
|
|
"loss": 0.3436,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"epoch": 45.97925564368517,
|
|
"grad_norm": 2.4360456466674805,
|
|
"learning_rate": 0.00010817733990147785,
|
|
"loss": 0.3766,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"epoch": 46.07687614399024,
|
|
"grad_norm": 2.8351433277130127,
|
|
"learning_rate": 0.00010798029556650246,
|
|
"loss": 0.3547,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"epoch": 46.1744966442953,
|
|
"grad_norm": 2.6005990505218506,
|
|
"learning_rate": 0.0001077832512315271,
|
|
"loss": 0.3333,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"epoch": 46.272117144600365,
|
|
"grad_norm": 2.52091121673584,
|
|
"learning_rate": 0.00010758620689655173,
|
|
"loss": 0.3507,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"epoch": 46.36973764490543,
|
|
"grad_norm": 3.0750203132629395,
|
|
"learning_rate": 0.00010738916256157637,
|
|
"loss": 0.3376,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 46.4673581452105,
|
|
"grad_norm": 3.353597640991211,
|
|
"learning_rate": 0.00010719211822660098,
|
|
"loss": 0.3362,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"epoch": 46.56497864551556,
|
|
"grad_norm": 3.786407232284546,
|
|
"learning_rate": 0.00010699507389162562,
|
|
"loss": 0.3774,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"epoch": 46.66259914582062,
|
|
"grad_norm": 3.2476627826690674,
|
|
"learning_rate": 0.00010679802955665026,
|
|
"loss": 0.3423,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"epoch": 46.760219646125684,
|
|
"grad_norm": 2.966078281402588,
|
|
"learning_rate": 0.00010660098522167488,
|
|
"loss": 0.3382,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"epoch": 46.85784014643075,
|
|
"grad_norm": 3.7173826694488525,
|
|
"learning_rate": 0.00010640394088669952,
|
|
"loss": 0.3512,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"epoch": 46.955460646735816,
|
|
"grad_norm": 3.6152524948120117,
|
|
"learning_rate": 0.00010620689655172413,
|
|
"loss": 0.3499,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"epoch": 47.05308114704088,
|
|
"grad_norm": 3.6383986473083496,
|
|
"learning_rate": 0.00010600985221674877,
|
|
"loss": 0.3442,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"epoch": 47.15070164734594,
|
|
"grad_norm": 2.636918306350708,
|
|
"learning_rate": 0.00010581280788177341,
|
|
"loss": 0.3355,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"epoch": 47.24832214765101,
|
|
"grad_norm": 3.8844096660614014,
|
|
"learning_rate": 0.00010561576354679802,
|
|
"loss": 0.3389,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"epoch": 47.34594264795607,
|
|
"grad_norm": 4.149389743804932,
|
|
"learning_rate": 0.00010541871921182267,
|
|
"loss": 0.3168,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"epoch": 47.443563148261134,
|
|
"grad_norm": 3.205845832824707,
|
|
"learning_rate": 0.00010522167487684729,
|
|
"loss": 0.3247,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"epoch": 47.5411836485662,
|
|
"grad_norm": 3.4177889823913574,
|
|
"learning_rate": 0.00010502463054187193,
|
|
"loss": 0.3472,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"epoch": 47.638804148871266,
|
|
"grad_norm": 3.2508625984191895,
|
|
"learning_rate": 0.00010482758620689656,
|
|
"loss": 0.3354,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"epoch": 47.73642464917633,
|
|
"grad_norm": 3.2071492671966553,
|
|
"learning_rate": 0.00010463054187192119,
|
|
"loss": 0.3515,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"epoch": 47.83404514948139,
|
|
"grad_norm": 2.505859613418579,
|
|
"learning_rate": 0.00010443349753694583,
|
|
"loss": 0.3654,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"epoch": 47.93166564978645,
|
|
"grad_norm": 3.092602491378784,
|
|
"learning_rate": 0.00010423645320197044,
|
|
"loss": 0.3551,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"epoch": 48.02928615009152,
|
|
"grad_norm": 3.411740303039551,
|
|
"learning_rate": 0.00010403940886699508,
|
|
"loss": 0.3445,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"epoch": 48.126906650396585,
|
|
"grad_norm": 2.587663412094116,
|
|
"learning_rate": 0.00010384236453201972,
|
|
"loss": 0.3132,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"epoch": 48.22452715070165,
|
|
"grad_norm": 2.244938850402832,
|
|
"learning_rate": 0.00010364532019704434,
|
|
"loss": 0.3327,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"epoch": 48.32214765100671,
|
|
"grad_norm": 3.426699638366699,
|
|
"learning_rate": 0.00010344827586206898,
|
|
"loss": 0.3163,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"epoch": 48.41976815131178,
|
|
"grad_norm": 2.600964069366455,
|
|
"learning_rate": 0.00010325123152709359,
|
|
"loss": 0.3318,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"epoch": 48.51738865161684,
|
|
"grad_norm": 2.5745320320129395,
|
|
"learning_rate": 0.00010305418719211823,
|
|
"loss": 0.3302,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"epoch": 48.6150091519219,
|
|
"grad_norm": 2.9485421180725098,
|
|
"learning_rate": 0.00010285714285714286,
|
|
"loss": 0.3468,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"epoch": 48.712629652226966,
|
|
"grad_norm": 2.783953905105591,
|
|
"learning_rate": 0.00010266009852216748,
|
|
"loss": 0.3339,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"epoch": 48.810250152532035,
|
|
"grad_norm": 3.2114439010620117,
|
|
"learning_rate": 0.00010246305418719213,
|
|
"loss": 0.3496,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 48.9078706528371,
|
|
"grad_norm": 4.33662748336792,
|
|
"learning_rate": 0.00010226600985221675,
|
|
"loss": 0.3358,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"epoch": 49.00549115314216,
|
|
"grad_norm": 2.714755058288574,
|
|
"learning_rate": 0.0001020689655172414,
|
|
"loss": 0.3677,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"epoch": 49.10311165344722,
|
|
"grad_norm": 2.1904876232147217,
|
|
"learning_rate": 0.00010187192118226601,
|
|
"loss": 0.2878,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"epoch": 49.20073215375229,
|
|
"grad_norm": 2.530484676361084,
|
|
"learning_rate": 0.00010167487684729065,
|
|
"loss": 0.3221,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"epoch": 49.298352654057354,
|
|
"grad_norm": 3.1762654781341553,
|
|
"learning_rate": 0.00010147783251231529,
|
|
"loss": 0.3427,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"epoch": 49.395973154362416,
|
|
"grad_norm": 3.0370638370513916,
|
|
"learning_rate": 0.0001012807881773399,
|
|
"loss": 0.3466,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"epoch": 49.49359365466748,
|
|
"grad_norm": 2.5626463890075684,
|
|
"learning_rate": 0.00010108374384236454,
|
|
"loss": 0.3218,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"epoch": 49.59121415497255,
|
|
"grad_norm": 3.4357545375823975,
|
|
"learning_rate": 0.00010088669950738916,
|
|
"loss": 0.3312,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"epoch": 49.68883465527761,
|
|
"grad_norm": 2.810955762863159,
|
|
"learning_rate": 0.0001006896551724138,
|
|
"loss": 0.3363,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"epoch": 49.78645515558267,
|
|
"grad_norm": 3.8722000122070312,
|
|
"learning_rate": 0.00010049261083743844,
|
|
"loss": 0.3251,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"epoch": 49.884075655887735,
|
|
"grad_norm": 3.185521364212036,
|
|
"learning_rate": 0.00010029556650246305,
|
|
"loss": 0.3429,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"epoch": 49.981696156192804,
|
|
"grad_norm": 2.707853078842163,
|
|
"learning_rate": 0.00010009852216748769,
|
|
"loss": 0.3548,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"epoch": 50.079316656497866,
|
|
"grad_norm": 2.749464511871338,
|
|
"learning_rate": 9.990147783251232e-05,
|
|
"loss": 0.3294,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"epoch": 50.17693715680293,
|
|
"grad_norm": 3.4640865325927734,
|
|
"learning_rate": 9.970443349753696e-05,
|
|
"loss": 0.3204,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"epoch": 50.27455765710799,
|
|
"grad_norm": 3.4412505626678467,
|
|
"learning_rate": 9.950738916256159e-05,
|
|
"loss": 0.3316,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"epoch": 50.37217815741306,
|
|
"grad_norm": 4.671158790588379,
|
|
"learning_rate": 9.931034482758621e-05,
|
|
"loss": 0.3092,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"epoch": 50.46979865771812,
|
|
"grad_norm": 2.812875986099243,
|
|
"learning_rate": 9.911330049261084e-05,
|
|
"loss": 0.3217,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"epoch": 50.567419158023185,
|
|
"grad_norm": 2.600764513015747,
|
|
"learning_rate": 9.891625615763547e-05,
|
|
"loss": 0.3525,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"epoch": 50.66503965832825,
|
|
"grad_norm": 2.8875558376312256,
|
|
"learning_rate": 9.871921182266011e-05,
|
|
"loss": 0.3267,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"epoch": 50.76266015863331,
|
|
"grad_norm": 2.479055643081665,
|
|
"learning_rate": 9.852216748768474e-05,
|
|
"loss": 0.3283,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"epoch": 50.86028065893838,
|
|
"grad_norm": 3.4580044746398926,
|
|
"learning_rate": 9.832512315270936e-05,
|
|
"loss": 0.3388,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"epoch": 50.95790115924344,
|
|
"grad_norm": 2.68265962600708,
|
|
"learning_rate": 9.812807881773399e-05,
|
|
"loss": 0.3309,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"epoch": 51.0555216595485,
|
|
"grad_norm": 2.545677661895752,
|
|
"learning_rate": 9.793103448275862e-05,
|
|
"loss": 0.3221,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"epoch": 51.153142159853566,
|
|
"grad_norm": 2.899627685546875,
|
|
"learning_rate": 9.773399014778326e-05,
|
|
"loss": 0.3084,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"epoch": 51.250762660158635,
|
|
"grad_norm": 2.948960781097412,
|
|
"learning_rate": 9.753694581280788e-05,
|
|
"loss": 0.3273,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 51.3483831604637,
|
|
"grad_norm": 2.9379513263702393,
|
|
"learning_rate": 9.733990147783252e-05,
|
|
"loss": 0.3315,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"epoch": 51.44600366076876,
|
|
"grad_norm": 2.543419599533081,
|
|
"learning_rate": 9.714285714285715e-05,
|
|
"loss": 0.3258,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"epoch": 51.54362416107382,
|
|
"grad_norm": 2.7236459255218506,
|
|
"learning_rate": 9.694581280788178e-05,
|
|
"loss": 0.3129,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"epoch": 51.64124466137889,
|
|
"grad_norm": 3.11745548248291,
|
|
"learning_rate": 9.67487684729064e-05,
|
|
"loss": 0.3038,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"epoch": 51.738865161683954,
|
|
"grad_norm": 3.6259920597076416,
|
|
"learning_rate": 9.655172413793105e-05,
|
|
"loss": 0.3269,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"epoch": 51.836485661989016,
|
|
"grad_norm": 3.4961044788360596,
|
|
"learning_rate": 9.635467980295567e-05,
|
|
"loss": 0.336,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"epoch": 51.93410616229408,
|
|
"grad_norm": 3.01009202003479,
|
|
"learning_rate": 9.61576354679803e-05,
|
|
"loss": 0.3297,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"epoch": 52.03172666259915,
|
|
"grad_norm": 3.047903060913086,
|
|
"learning_rate": 9.596059113300493e-05,
|
|
"loss": 0.3295,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"epoch": 52.12934716290421,
|
|
"grad_norm": 2.8521170616149902,
|
|
"learning_rate": 9.576354679802955e-05,
|
|
"loss": 0.2952,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"epoch": 52.22696766320927,
|
|
"grad_norm": 2.8909034729003906,
|
|
"learning_rate": 9.55665024630542e-05,
|
|
"loss": 0.3128,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"epoch": 52.324588163514335,
|
|
"grad_norm": 3.2134296894073486,
|
|
"learning_rate": 9.536945812807882e-05,
|
|
"loss": 0.3175,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"epoch": 52.422208663819404,
|
|
"grad_norm": 3.113543748855591,
|
|
"learning_rate": 9.517241379310345e-05,
|
|
"loss": 0.3305,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"epoch": 52.51982916412447,
|
|
"grad_norm": 2.3091633319854736,
|
|
"learning_rate": 9.497536945812808e-05,
|
|
"loss": 0.3032,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"epoch": 52.61744966442953,
|
|
"grad_norm": 2.7626681327819824,
|
|
"learning_rate": 9.477832512315272e-05,
|
|
"loss": 0.3071,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"epoch": 52.71507016473459,
|
|
"grad_norm": 2.6978394985198975,
|
|
"learning_rate": 9.458128078817734e-05,
|
|
"loss": 0.3424,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"epoch": 52.81269066503966,
|
|
"grad_norm": 4.549131393432617,
|
|
"learning_rate": 9.438423645320199e-05,
|
|
"loss": 0.3086,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"epoch": 52.91031116534472,
|
|
"grad_norm": 3.3548974990844727,
|
|
"learning_rate": 9.418719211822661e-05,
|
|
"loss": 0.3414,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"epoch": 53.007931665649785,
|
|
"grad_norm": 2.191990852355957,
|
|
"learning_rate": 9.399014778325124e-05,
|
|
"loss": 0.3195,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"epoch": 53.10555216595485,
|
|
"grad_norm": 2.8169941902160645,
|
|
"learning_rate": 9.379310344827587e-05,
|
|
"loss": 0.2971,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"epoch": 53.20317266625992,
|
|
"grad_norm": 2.4809463024139404,
|
|
"learning_rate": 9.35960591133005e-05,
|
|
"loss": 0.3032,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"epoch": 53.30079316656498,
|
|
"grad_norm": 2.8981711864471436,
|
|
"learning_rate": 9.339901477832512e-05,
|
|
"loss": 0.3139,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"epoch": 53.39841366687004,
|
|
"grad_norm": 2.901442050933838,
|
|
"learning_rate": 9.320197044334976e-05,
|
|
"loss": 0.3197,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"epoch": 53.496034167175104,
|
|
"grad_norm": 3.1128933429718018,
|
|
"learning_rate": 9.300492610837439e-05,
|
|
"loss": 0.3109,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"epoch": 53.59365466748017,
|
|
"grad_norm": 2.6892173290252686,
|
|
"learning_rate": 9.280788177339902e-05,
|
|
"loss": 0.3153,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"epoch": 53.691275167785236,
|
|
"grad_norm": 3.1847739219665527,
|
|
"learning_rate": 9.261083743842364e-05,
|
|
"loss": 0.3135,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 53.7888956680903,
|
|
"grad_norm": 3.1111955642700195,
|
|
"learning_rate": 9.241379310344827e-05,
|
|
"loss": 0.3472,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"epoch": 53.88651616839536,
|
|
"grad_norm": 2.667539119720459,
|
|
"learning_rate": 9.221674876847291e-05,
|
|
"loss": 0.3107,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"epoch": 53.98413666870043,
|
|
"grad_norm": 2.1500725746154785,
|
|
"learning_rate": 9.201970443349755e-05,
|
|
"loss": 0.3192,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"epoch": 54.08175716900549,
|
|
"grad_norm": 3.6513638496398926,
|
|
"learning_rate": 9.182266009852218e-05,
|
|
"loss": 0.2974,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"epoch": 54.179377669310554,
|
|
"grad_norm": 3.226287364959717,
|
|
"learning_rate": 9.16256157635468e-05,
|
|
"loss": 0.3216,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"epoch": 54.27699816961562,
|
|
"grad_norm": 3.4577550888061523,
|
|
"learning_rate": 9.142857142857143e-05,
|
|
"loss": 0.2999,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"epoch": 54.374618669920686,
|
|
"grad_norm": 2.047478199005127,
|
|
"learning_rate": 9.123152709359606e-05,
|
|
"loss": 0.3139,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"epoch": 54.47223917022575,
|
|
"grad_norm": 3.0338408946990967,
|
|
"learning_rate": 9.10344827586207e-05,
|
|
"loss": 0.2954,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"epoch": 54.56985967053081,
|
|
"grad_norm": 2.6099050045013428,
|
|
"learning_rate": 9.083743842364533e-05,
|
|
"loss": 0.3218,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"epoch": 54.66748017083587,
|
|
"grad_norm": 3.248973846435547,
|
|
"learning_rate": 9.064039408866995e-05,
|
|
"loss": 0.3243,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"epoch": 54.76510067114094,
|
|
"grad_norm": 4.767118453979492,
|
|
"learning_rate": 9.044334975369458e-05,
|
|
"loss": 0.315,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"epoch": 54.862721171446005,
|
|
"grad_norm": 2.872119188308716,
|
|
"learning_rate": 9.024630541871921e-05,
|
|
"loss": 0.3032,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"epoch": 54.96034167175107,
|
|
"grad_norm": 3.499648094177246,
|
|
"learning_rate": 9.004926108374385e-05,
|
|
"loss": 0.3141,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"epoch": 55.05796217205613,
|
|
"grad_norm": 3.0000522136688232,
|
|
"learning_rate": 8.985221674876848e-05,
|
|
"loss": 0.3153,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"epoch": 55.1555826723612,
|
|
"grad_norm": 2.2861599922180176,
|
|
"learning_rate": 8.96551724137931e-05,
|
|
"loss": 0.3258,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"epoch": 55.25320317266626,
|
|
"grad_norm": 2.980668306350708,
|
|
"learning_rate": 8.945812807881774e-05,
|
|
"loss": 0.3099,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"epoch": 55.35082367297132,
|
|
"grad_norm": 2.286050319671631,
|
|
"learning_rate": 8.926108374384237e-05,
|
|
"loss": 0.2931,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"epoch": 55.448444173276386,
|
|
"grad_norm": 4.078646659851074,
|
|
"learning_rate": 8.9064039408867e-05,
|
|
"loss": 0.3142,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"epoch": 55.546064673581455,
|
|
"grad_norm": 2.150973320007324,
|
|
"learning_rate": 8.886699507389164e-05,
|
|
"loss": 0.2839,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"epoch": 55.64368517388652,
|
|
"grad_norm": 2.671983242034912,
|
|
"learning_rate": 8.866995073891627e-05,
|
|
"loss": 0.2981,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"epoch": 55.74130567419158,
|
|
"grad_norm": 3.199276924133301,
|
|
"learning_rate": 8.847290640394089e-05,
|
|
"loss": 0.304,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"epoch": 55.83892617449664,
|
|
"grad_norm": 2.477468967437744,
|
|
"learning_rate": 8.827586206896552e-05,
|
|
"loss": 0.3288,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"epoch": 55.93654667480171,
|
|
"grad_norm": 2.3130173683166504,
|
|
"learning_rate": 8.807881773399015e-05,
|
|
"loss": 0.321,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"epoch": 56.034167175106774,
|
|
"grad_norm": 3.1496715545654297,
|
|
"learning_rate": 8.788177339901477e-05,
|
|
"loss": 0.2992,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"epoch": 56.131787675411836,
|
|
"grad_norm": 3.3296494483947754,
|
|
"learning_rate": 8.768472906403941e-05,
|
|
"loss": 0.3023,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 56.2294081757169,
|
|
"grad_norm": 2.992814540863037,
|
|
"learning_rate": 8.748768472906404e-05,
|
|
"loss": 0.291,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"epoch": 56.32702867602197,
|
|
"grad_norm": 2.981858015060425,
|
|
"learning_rate": 8.729064039408867e-05,
|
|
"loss": 0.2908,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"epoch": 56.42464917632703,
|
|
"grad_norm": 3.968040704727173,
|
|
"learning_rate": 8.709359605911331e-05,
|
|
"loss": 0.2963,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"epoch": 56.52226967663209,
|
|
"grad_norm": 3.6845455169677734,
|
|
"learning_rate": 8.689655172413794e-05,
|
|
"loss": 0.3137,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"epoch": 56.619890176937155,
|
|
"grad_norm": 3.8928792476654053,
|
|
"learning_rate": 8.669950738916258e-05,
|
|
"loss": 0.2971,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"epoch": 56.717510677242224,
|
|
"grad_norm": 2.064180374145508,
|
|
"learning_rate": 8.65024630541872e-05,
|
|
"loss": 0.3067,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"epoch": 56.815131177547286,
|
|
"grad_norm": 2.8107266426086426,
|
|
"learning_rate": 8.630541871921183e-05,
|
|
"loss": 0.2972,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"epoch": 56.91275167785235,
|
|
"grad_norm": 2.747004270553589,
|
|
"learning_rate": 8.610837438423646e-05,
|
|
"loss": 0.3183,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"epoch": 57.01037217815741,
|
|
"grad_norm": 2.0700557231903076,
|
|
"learning_rate": 8.591133004926109e-05,
|
|
"loss": 0.3075,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"epoch": 57.10799267846248,
|
|
"grad_norm": 3.1093757152557373,
|
|
"learning_rate": 8.571428571428571e-05,
|
|
"loss": 0.2756,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"epoch": 57.20561317876754,
|
|
"grad_norm": 2.34448504447937,
|
|
"learning_rate": 8.551724137931035e-05,
|
|
"loss": 0.2898,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"epoch": 57.303233679072605,
|
|
"grad_norm": 3.3790042400360107,
|
|
"learning_rate": 8.532019704433498e-05,
|
|
"loss": 0.3081,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"epoch": 57.40085417937767,
|
|
"grad_norm": 2.700956106185913,
|
|
"learning_rate": 8.512315270935961e-05,
|
|
"loss": 0.2915,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"epoch": 57.49847467968274,
|
|
"grad_norm": 2.6353628635406494,
|
|
"learning_rate": 8.492610837438423e-05,
|
|
"loss": 0.3063,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"epoch": 57.5960951799878,
|
|
"grad_norm": 2.56706166267395,
|
|
"learning_rate": 8.472906403940886e-05,
|
|
"loss": 0.3005,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"epoch": 57.69371568029286,
|
|
"grad_norm": 4.074772357940674,
|
|
"learning_rate": 8.45320197044335e-05,
|
|
"loss": 0.3007,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"epoch": 57.79133618059792,
|
|
"grad_norm": 2.786485433578491,
|
|
"learning_rate": 8.433497536945813e-05,
|
|
"loss": 0.3141,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"epoch": 57.88895668090299,
|
|
"grad_norm": 2.9513659477233887,
|
|
"learning_rate": 8.413793103448277e-05,
|
|
"loss": 0.3216,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"epoch": 57.986577181208055,
|
|
"grad_norm": 3.126004219055176,
|
|
"learning_rate": 8.39408866995074e-05,
|
|
"loss": 0.3,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"epoch": 58.08419768151312,
|
|
"grad_norm": 2.20534348487854,
|
|
"learning_rate": 8.374384236453202e-05,
|
|
"loss": 0.2891,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"epoch": 58.18181818181818,
|
|
"grad_norm": 4.753482818603516,
|
|
"learning_rate": 8.354679802955665e-05,
|
|
"loss": 0.3019,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"epoch": 58.27943868212325,
|
|
"grad_norm": 3.1038873195648193,
|
|
"learning_rate": 8.334975369458129e-05,
|
|
"loss": 0.283,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"epoch": 58.37705918242831,
|
|
"grad_norm": 2.9366559982299805,
|
|
"learning_rate": 8.315270935960592e-05,
|
|
"loss": 0.302,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"epoch": 58.474679682733374,
|
|
"grad_norm": 3.008777379989624,
|
|
"learning_rate": 8.295566502463055e-05,
|
|
"loss": 0.3256,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"epoch": 58.572300183038436,
|
|
"grad_norm": 2.7105023860931396,
|
|
"learning_rate": 8.275862068965517e-05,
|
|
"loss": 0.2959,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 58.669920683343506,
|
|
"grad_norm": 2.762347936630249,
|
|
"learning_rate": 8.25615763546798e-05,
|
|
"loss": 0.2826,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"epoch": 58.76754118364857,
|
|
"grad_norm": 2.8366870880126953,
|
|
"learning_rate": 8.236453201970443e-05,
|
|
"loss": 0.302,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"epoch": 58.86516168395363,
|
|
"grad_norm": 2.721994400024414,
|
|
"learning_rate": 8.216748768472907e-05,
|
|
"loss": 0.2966,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"epoch": 58.96278218425869,
|
|
"grad_norm": 2.988464117050171,
|
|
"learning_rate": 8.19704433497537e-05,
|
|
"loss": 0.2899,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"epoch": 59.060402684563755,
|
|
"grad_norm": 2.6657352447509766,
|
|
"learning_rate": 8.177339901477834e-05,
|
|
"loss": 0.2889,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"epoch": 59.158023184868824,
|
|
"grad_norm": 3.703511953353882,
|
|
"learning_rate": 8.157635467980296e-05,
|
|
"loss": 0.2794,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"epoch": 59.25564368517389,
|
|
"grad_norm": 2.9937832355499268,
|
|
"learning_rate": 8.137931034482759e-05,
|
|
"loss": 0.2896,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"epoch": 59.35326418547895,
|
|
"grad_norm": 3.188159704208374,
|
|
"learning_rate": 8.118226600985223e-05,
|
|
"loss": 0.2885,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"epoch": 59.45088468578401,
|
|
"grad_norm": 2.8724703788757324,
|
|
"learning_rate": 8.098522167487686e-05,
|
|
"loss": 0.2959,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"epoch": 59.54850518608908,
|
|
"grad_norm": 3.351435422897339,
|
|
"learning_rate": 8.078817733990148e-05,
|
|
"loss": 0.2867,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"epoch": 59.64612568639414,
|
|
"grad_norm": 2.5625758171081543,
|
|
"learning_rate": 8.059113300492611e-05,
|
|
"loss": 0.3042,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"epoch": 59.743746186699205,
|
|
"grad_norm": 3.3796396255493164,
|
|
"learning_rate": 8.039408866995074e-05,
|
|
"loss": 0.301,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"epoch": 59.84136668700427,
|
|
"grad_norm": 2.787851572036743,
|
|
"learning_rate": 8.019704433497537e-05,
|
|
"loss": 0.3072,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"epoch": 59.93898718730934,
|
|
"grad_norm": 2.9104974269866943,
|
|
"learning_rate": 8e-05,
|
|
"loss": 0.3059,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"epoch": 60.0366076876144,
|
|
"grad_norm": 2.957249879837036,
|
|
"learning_rate": 7.980295566502463e-05,
|
|
"loss": 0.2965,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"epoch": 60.13422818791946,
|
|
"grad_norm": 2.2982118129730225,
|
|
"learning_rate": 7.960591133004926e-05,
|
|
"loss": 0.2703,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"epoch": 60.231848688224524,
|
|
"grad_norm": 3.548534870147705,
|
|
"learning_rate": 7.940886699507389e-05,
|
|
"loss": 0.2843,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"epoch": 60.32946918852959,
|
|
"grad_norm": 2.3399384021759033,
|
|
"learning_rate": 7.921182266009853e-05,
|
|
"loss": 0.2855,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"epoch": 60.427089688834656,
|
|
"grad_norm": 3.4186365604400635,
|
|
"learning_rate": 7.901477832512316e-05,
|
|
"loss": 0.2942,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"epoch": 60.52471018913972,
|
|
"grad_norm": 2.572951316833496,
|
|
"learning_rate": 7.88177339901478e-05,
|
|
"loss": 0.2918,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"epoch": 60.62233068944478,
|
|
"grad_norm": 2.1056010723114014,
|
|
"learning_rate": 7.862068965517242e-05,
|
|
"loss": 0.3051,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"epoch": 60.71995118974985,
|
|
"grad_norm": 4.122783184051514,
|
|
"learning_rate": 7.842364532019705e-05,
|
|
"loss": 0.2811,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"epoch": 60.81757169005491,
|
|
"grad_norm": 2.3634865283966064,
|
|
"learning_rate": 7.822660098522168e-05,
|
|
"loss": 0.3063,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"epoch": 60.915192190359974,
|
|
"grad_norm": 3.362290143966675,
|
|
"learning_rate": 7.80295566502463e-05,
|
|
"loss": 0.2954,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"epoch": 61.01281269066504,
|
|
"grad_norm": 4.63106632232666,
|
|
"learning_rate": 7.783251231527095e-05,
|
|
"loss": 0.2855,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 61.110433190970106,
|
|
"grad_norm": 3.6261041164398193,
|
|
"learning_rate": 7.763546798029557e-05,
|
|
"loss": 0.2792,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"epoch": 61.20805369127517,
|
|
"grad_norm": 2.869415760040283,
|
|
"learning_rate": 7.74384236453202e-05,
|
|
"loss": 0.2833,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"epoch": 61.30567419158023,
|
|
"grad_norm": 2.7370972633361816,
|
|
"learning_rate": 7.724137931034483e-05,
|
|
"loss": 0.2997,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"epoch": 61.40329469188529,
|
|
"grad_norm": 3.5397825241088867,
|
|
"learning_rate": 7.704433497536945e-05,
|
|
"loss": 0.2799,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"epoch": 61.50091519219036,
|
|
"grad_norm": 2.3903191089630127,
|
|
"learning_rate": 7.684729064039408e-05,
|
|
"loss": 0.2857,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"epoch": 61.598535692495425,
|
|
"grad_norm": 3.3589389324188232,
|
|
"learning_rate": 7.665024630541872e-05,
|
|
"loss": 0.2823,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"epoch": 61.69615619280049,
|
|
"grad_norm": 4.420291423797607,
|
|
"learning_rate": 7.645320197044336e-05,
|
|
"loss": 0.2895,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"epoch": 61.79377669310555,
|
|
"grad_norm": 3.060859441757202,
|
|
"learning_rate": 7.625615763546799e-05,
|
|
"loss": 0.2859,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"epoch": 61.89139719341062,
|
|
"grad_norm": 3.5927321910858154,
|
|
"learning_rate": 7.605911330049262e-05,
|
|
"loss": 0.2954,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"epoch": 61.98901769371568,
|
|
"grad_norm": 2.7577738761901855,
|
|
"learning_rate": 7.586206896551724e-05,
|
|
"loss": 0.2832,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"epoch": 62.08663819402074,
|
|
"grad_norm": 4.519462585449219,
|
|
"learning_rate": 7.566502463054188e-05,
|
|
"loss": 0.2695,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"epoch": 62.184258694325806,
|
|
"grad_norm": 2.231842279434204,
|
|
"learning_rate": 7.546798029556651e-05,
|
|
"loss": 0.2894,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"epoch": 62.281879194630875,
|
|
"grad_norm": 3.5176825523376465,
|
|
"learning_rate": 7.527093596059114e-05,
|
|
"loss": 0.2749,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"epoch": 62.37949969493594,
|
|
"grad_norm": 3.319891929626465,
|
|
"learning_rate": 7.507389162561577e-05,
|
|
"loss": 0.2909,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"epoch": 62.477120195241,
|
|
"grad_norm": 2.778862237930298,
|
|
"learning_rate": 7.487684729064039e-05,
|
|
"loss": 0.2816,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"epoch": 62.57474069554606,
|
|
"grad_norm": 2.7136170864105225,
|
|
"learning_rate": 7.467980295566502e-05,
|
|
"loss": 0.286,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"epoch": 62.67236119585113,
|
|
"grad_norm": 2.841850519180298,
|
|
"learning_rate": 7.448275862068966e-05,
|
|
"loss": 0.3078,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"epoch": 62.769981696156194,
|
|
"grad_norm": 3.159632682800293,
|
|
"learning_rate": 7.428571428571429e-05,
|
|
"loss": 0.2693,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"epoch": 62.867602196461256,
|
|
"grad_norm": 2.638611078262329,
|
|
"learning_rate": 7.408866995073891e-05,
|
|
"loss": 0.2838,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"epoch": 62.96522269676632,
|
|
"grad_norm": 3.453857421875,
|
|
"learning_rate": 7.389162561576355e-05,
|
|
"loss": 0.2892,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"epoch": 63.06284319707139,
|
|
"grad_norm": 3.6586861610412598,
|
|
"learning_rate": 7.369458128078818e-05,
|
|
"loss": 0.2626,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"epoch": 63.16046369737645,
|
|
"grad_norm": 3.8204469680786133,
|
|
"learning_rate": 7.349753694581281e-05,
|
|
"loss": 0.2834,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"epoch": 63.25808419768151,
|
|
"grad_norm": 1.7463505268096924,
|
|
"learning_rate": 7.330049261083745e-05,
|
|
"loss": 0.2909,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"epoch": 63.355704697986575,
|
|
"grad_norm": 1.687853217124939,
|
|
"learning_rate": 7.310344827586208e-05,
|
|
"loss": 0.2892,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"epoch": 63.453325198291644,
|
|
"grad_norm": 2.835196018218994,
|
|
"learning_rate": 7.29064039408867e-05,
|
|
"loss": 0.2763,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"epoch": 63.550945698596706,
|
|
"grad_norm": 3.77742862701416,
|
|
"learning_rate": 7.270935960591133e-05,
|
|
"loss": 0.2834,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"epoch": 63.64856619890177,
|
|
"grad_norm": 2.1246883869171143,
|
|
"learning_rate": 7.251231527093596e-05,
|
|
"loss": 0.2859,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"epoch": 63.74618669920683,
|
|
"grad_norm": 3.592597246170044,
|
|
"learning_rate": 7.23152709359606e-05,
|
|
"loss": 0.2865,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"epoch": 63.8438071995119,
|
|
"grad_norm": 2.8954873085021973,
|
|
"learning_rate": 7.211822660098523e-05,
|
|
"loss": 0.2855,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"epoch": 63.94142769981696,
|
|
"grad_norm": 2.266686201095581,
|
|
"learning_rate": 7.192118226600985e-05,
|
|
"loss": 0.2814,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"epoch": 64.03904820012202,
|
|
"grad_norm": 1.9330942630767822,
|
|
"learning_rate": 7.172413793103448e-05,
|
|
"loss": 0.2832,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"epoch": 64.1366687004271,
|
|
"grad_norm": 4.008347511291504,
|
|
"learning_rate": 7.152709359605912e-05,
|
|
"loss": 0.2762,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"epoch": 64.23428920073215,
|
|
"grad_norm": 2.2452552318573,
|
|
"learning_rate": 7.133004926108375e-05,
|
|
"loss": 0.269,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"epoch": 64.33190970103722,
|
|
"grad_norm": 7.247570991516113,
|
|
"learning_rate": 7.113300492610839e-05,
|
|
"loss": 0.2652,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"epoch": 64.42953020134229,
|
|
"grad_norm": 4.846076488494873,
|
|
"learning_rate": 7.093596059113302e-05,
|
|
"loss": 0.2766,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"epoch": 64.52715070164734,
|
|
"grad_norm": 3.444746732711792,
|
|
"learning_rate": 7.073891625615764e-05,
|
|
"loss": 0.2789,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"epoch": 64.62477120195241,
|
|
"grad_norm": 2.506460428237915,
|
|
"learning_rate": 7.054187192118227e-05,
|
|
"loss": 0.279,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"epoch": 64.72239170225747,
|
|
"grad_norm": 3.3973569869995117,
|
|
"learning_rate": 7.03448275862069e-05,
|
|
"loss": 0.2887,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"epoch": 64.82001220256254,
|
|
"grad_norm": 3.14697265625,
|
|
"learning_rate": 7.014778325123154e-05,
|
|
"loss": 0.2813,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"epoch": 64.9176327028676,
|
|
"grad_norm": 4.694430828094482,
|
|
"learning_rate": 6.995073891625616e-05,
|
|
"loss": 0.3026,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"epoch": 65.01525320317266,
|
|
"grad_norm": 2.2463550567626953,
|
|
"learning_rate": 6.975369458128079e-05,
|
|
"loss": 0.2739,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"epoch": 65.11287370347773,
|
|
"grad_norm": 2.907592535018921,
|
|
"learning_rate": 6.955665024630542e-05,
|
|
"loss": 0.2783,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"epoch": 65.2104942037828,
|
|
"grad_norm": 2.9708614349365234,
|
|
"learning_rate": 6.935960591133005e-05,
|
|
"loss": 0.2718,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"epoch": 65.30811470408786,
|
|
"grad_norm": 2.7227044105529785,
|
|
"learning_rate": 6.916256157635467e-05,
|
|
"loss": 0.2615,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"epoch": 65.40573520439293,
|
|
"grad_norm": 2.3960001468658447,
|
|
"learning_rate": 6.896551724137931e-05,
|
|
"loss": 0.2822,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"epoch": 65.50335570469798,
|
|
"grad_norm": 2.032240629196167,
|
|
"learning_rate": 6.876847290640394e-05,
|
|
"loss": 0.282,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"epoch": 65.60097620500305,
|
|
"grad_norm": 2.5334010124206543,
|
|
"learning_rate": 6.857142857142858e-05,
|
|
"loss": 0.2771,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"epoch": 65.69859670530812,
|
|
"grad_norm": 7.930431842803955,
|
|
"learning_rate": 6.837438423645321e-05,
|
|
"loss": 0.2878,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"epoch": 65.79621720561317,
|
|
"grad_norm": 2.709092378616333,
|
|
"learning_rate": 6.817733990147784e-05,
|
|
"loss": 0.2797,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"epoch": 65.89383770591824,
|
|
"grad_norm": 4.455546855926514,
|
|
"learning_rate": 6.798029556650246e-05,
|
|
"loss": 0.2803,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"epoch": 65.99145820622331,
|
|
"grad_norm": 4.6384077072143555,
|
|
"learning_rate": 6.77832512315271e-05,
|
|
"loss": 0.2764,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"epoch": 66.08907870652837,
|
|
"grad_norm": 2.7529897689819336,
|
|
"learning_rate": 6.758620689655173e-05,
|
|
"loss": 0.2614,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"epoch": 66.18669920683344,
|
|
"grad_norm": 2.0837860107421875,
|
|
"learning_rate": 6.738916256157636e-05,
|
|
"loss": 0.2696,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"epoch": 66.2843197071385,
|
|
"grad_norm": 1.6655378341674805,
|
|
"learning_rate": 6.719211822660098e-05,
|
|
"loss": 0.2781,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"epoch": 66.38194020744356,
|
|
"grad_norm": 1.8926398754119873,
|
|
"learning_rate": 6.699507389162561e-05,
|
|
"loss": 0.273,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"epoch": 66.47956070774863,
|
|
"grad_norm": 1.8903833627700806,
|
|
"learning_rate": 6.679802955665025e-05,
|
|
"loss": 0.2683,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"epoch": 66.57718120805369,
|
|
"grad_norm": 3.0182383060455322,
|
|
"learning_rate": 6.660098522167488e-05,
|
|
"loss": 0.2685,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"epoch": 66.67480170835876,
|
|
"grad_norm": 3.0081100463867188,
|
|
"learning_rate": 6.64039408866995e-05,
|
|
"loss": 0.2915,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"epoch": 66.77242220866383,
|
|
"grad_norm": 2.345440149307251,
|
|
"learning_rate": 6.620689655172415e-05,
|
|
"loss": 0.2707,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"epoch": 66.87004270896888,
|
|
"grad_norm": 2.430608034133911,
|
|
"learning_rate": 6.600985221674877e-05,
|
|
"loss": 0.2675,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"epoch": 66.96766320927395,
|
|
"grad_norm": 4.09646463394165,
|
|
"learning_rate": 6.58128078817734e-05,
|
|
"loss": 0.2886,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"epoch": 67.065283709579,
|
|
"grad_norm": 2.696843147277832,
|
|
"learning_rate": 6.561576354679804e-05,
|
|
"loss": 0.2743,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"epoch": 67.16290420988408,
|
|
"grad_norm": 1.8098782300949097,
|
|
"learning_rate": 6.541871921182267e-05,
|
|
"loss": 0.2629,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"epoch": 67.26052471018915,
|
|
"grad_norm": 2.604454278945923,
|
|
"learning_rate": 6.52216748768473e-05,
|
|
"loss": 0.2701,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"epoch": 67.3581452104942,
|
|
"grad_norm": 2.6400327682495117,
|
|
"learning_rate": 6.502463054187192e-05,
|
|
"loss": 0.2791,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"epoch": 67.45576571079927,
|
|
"grad_norm": 2.6029961109161377,
|
|
"learning_rate": 6.482758620689655e-05,
|
|
"loss": 0.2753,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"epoch": 67.55338621110434,
|
|
"grad_norm": 2.493805170059204,
|
|
"learning_rate": 6.463054187192119e-05,
|
|
"loss": 0.2654,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"epoch": 67.6510067114094,
|
|
"grad_norm": 3.1555075645446777,
|
|
"learning_rate": 6.443349753694582e-05,
|
|
"loss": 0.2701,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"epoch": 67.74862721171446,
|
|
"grad_norm": 4.280105113983154,
|
|
"learning_rate": 6.423645320197044e-05,
|
|
"loss": 0.2732,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"epoch": 67.84624771201952,
|
|
"grad_norm": 2.8167061805725098,
|
|
"learning_rate": 6.403940886699507e-05,
|
|
"loss": 0.2755,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"epoch": 67.94386821232459,
|
|
"grad_norm": 3.5046565532684326,
|
|
"learning_rate": 6.38423645320197e-05,
|
|
"loss": 0.2831,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"epoch": 68.04148871262966,
|
|
"grad_norm": 2.4737610816955566,
|
|
"learning_rate": 6.364532019704434e-05,
|
|
"loss": 0.2737,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"epoch": 68.13910921293471,
|
|
"grad_norm": 1.996193766593933,
|
|
"learning_rate": 6.344827586206897e-05,
|
|
"loss": 0.2637,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"epoch": 68.23672971323978,
|
|
"grad_norm": 2.7088236808776855,
|
|
"learning_rate": 6.325123152709361e-05,
|
|
"loss": 0.2683,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"epoch": 68.33435021354484,
|
|
"grad_norm": 2.344050168991089,
|
|
"learning_rate": 6.305418719211823e-05,
|
|
"loss": 0.2685,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"epoch": 68.4319707138499,
|
|
"grad_norm": 3.3628969192504883,
|
|
"learning_rate": 6.285714285714286e-05,
|
|
"loss": 0.2728,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"epoch": 68.52959121415498,
|
|
"grad_norm": 2.8613572120666504,
|
|
"learning_rate": 6.266009852216749e-05,
|
|
"loss": 0.2668,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"epoch": 68.62721171446003,
|
|
"grad_norm": 2.752930164337158,
|
|
"learning_rate": 6.246305418719212e-05,
|
|
"loss": 0.2753,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"epoch": 68.7248322147651,
|
|
"grad_norm": 2.426806926727295,
|
|
"learning_rate": 6.226600985221676e-05,
|
|
"loss": 0.2545,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"epoch": 68.82245271507017,
|
|
"grad_norm": 2.4970877170562744,
|
|
"learning_rate": 6.206896551724138e-05,
|
|
"loss": 0.2733,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"epoch": 68.92007321537523,
|
|
"grad_norm": 2.6764674186706543,
|
|
"learning_rate": 6.187192118226601e-05,
|
|
"loss": 0.2726,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"epoch": 69.0176937156803,
|
|
"grad_norm": 2.3702871799468994,
|
|
"learning_rate": 6.167487684729064e-05,
|
|
"loss": 0.2701,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"epoch": 69.11531421598535,
|
|
"grad_norm": 3.5141944885253906,
|
|
"learning_rate": 6.147783251231526e-05,
|
|
"loss": 0.2643,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"epoch": 69.21293471629042,
|
|
"grad_norm": 2.7750911712646484,
|
|
"learning_rate": 6.12807881773399e-05,
|
|
"loss": 0.248,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"epoch": 69.31055521659549,
|
|
"grad_norm": 4.1003217697143555,
|
|
"learning_rate": 6.108374384236453e-05,
|
|
"loss": 0.2618,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"epoch": 69.40817571690054,
|
|
"grad_norm": 2.183353900909424,
|
|
"learning_rate": 6.0886699507389166e-05,
|
|
"loss": 0.2618,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"epoch": 69.50579621720561,
|
|
"grad_norm": 2.447449207305908,
|
|
"learning_rate": 6.068965517241379e-05,
|
|
"loss": 0.284,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"epoch": 69.60341671751068,
|
|
"grad_norm": 2.466543674468994,
|
|
"learning_rate": 6.049261083743843e-05,
|
|
"loss": 0.269,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"epoch": 69.70103721781574,
|
|
"grad_norm": 3.8052902221679688,
|
|
"learning_rate": 6.0295566502463054e-05,
|
|
"loss": 0.2681,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"epoch": 69.79865771812081,
|
|
"grad_norm": 3.1913719177246094,
|
|
"learning_rate": 6.0098522167487695e-05,
|
|
"loss": 0.2677,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"epoch": 69.89627821842586,
|
|
"grad_norm": 1.6767873764038086,
|
|
"learning_rate": 5.990147783251232e-05,
|
|
"loss": 0.2739,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"epoch": 69.99389871873093,
|
|
"grad_norm": 2.805734634399414,
|
|
"learning_rate": 5.970443349753695e-05,
|
|
"loss": 0.2686,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"epoch": 70.091519219036,
|
|
"grad_norm": 2.671316146850586,
|
|
"learning_rate": 5.9507389162561576e-05,
|
|
"loss": 0.2558,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"epoch": 70.18913971934106,
|
|
"grad_norm": 2.5105350017547607,
|
|
"learning_rate": 5.93103448275862e-05,
|
|
"loss": 0.2692,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"epoch": 70.28676021964613,
|
|
"grad_norm": 2.0773072242736816,
|
|
"learning_rate": 5.9113300492610844e-05,
|
|
"loss": 0.266,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"epoch": 70.3843807199512,
|
|
"grad_norm": 2.2632055282592773,
|
|
"learning_rate": 5.891625615763548e-05,
|
|
"loss": 0.2624,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"epoch": 70.48200122025625,
|
|
"grad_norm": 3.4696826934814453,
|
|
"learning_rate": 5.8719211822660105e-05,
|
|
"loss": 0.2616,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"epoch": 70.57962172056132,
|
|
"grad_norm": 2.471937417984009,
|
|
"learning_rate": 5.852216748768473e-05,
|
|
"loss": 0.2534,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"epoch": 70.67724222086638,
|
|
"grad_norm": 2.4318599700927734,
|
|
"learning_rate": 5.832512315270936e-05,
|
|
"loss": 0.2661,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"epoch": 70.77486272117144,
|
|
"grad_norm": 2.773090362548828,
|
|
"learning_rate": 5.8128078817733986e-05,
|
|
"loss": 0.283,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"epoch": 70.87248322147651,
|
|
"grad_norm": 2.120820999145508,
|
|
"learning_rate": 5.7931034482758627e-05,
|
|
"loss": 0.2668,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"epoch": 70.97010372178157,
|
|
"grad_norm": 2.614382028579712,
|
|
"learning_rate": 5.7733990147783254e-05,
|
|
"loss": 0.2722,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"epoch": 71.06772422208664,
|
|
"grad_norm": 2.954516649246216,
|
|
"learning_rate": 5.753694581280789e-05,
|
|
"loss": 0.2571,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"epoch": 71.16534472239171,
|
|
"grad_norm": 2.9351367950439453,
|
|
"learning_rate": 5.7339901477832515e-05,
|
|
"loss": 0.2659,
|
|
"step": 14580
|
|
},
|
|
{
|
|
"epoch": 71.26296522269676,
|
|
"grad_norm": 2.757805347442627,
|
|
"learning_rate": 5.714285714285714e-05,
|
|
"loss": 0.2461,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"epoch": 71.36058572300183,
|
|
"grad_norm": 3.4546825885772705,
|
|
"learning_rate": 5.694581280788177e-05,
|
|
"loss": 0.2655,
|
|
"step": 14620
|
|
},
|
|
{
|
|
"epoch": 71.45820622330689,
|
|
"grad_norm": 2.822056531906128,
|
|
"learning_rate": 5.674876847290641e-05,
|
|
"loss": 0.2542,
|
|
"step": 14640
|
|
},
|
|
{
|
|
"epoch": 71.55582672361196,
|
|
"grad_norm": 2.4004786014556885,
|
|
"learning_rate": 5.6551724137931037e-05,
|
|
"loss": 0.2489,
|
|
"step": 14660
|
|
},
|
|
{
|
|
"epoch": 71.65344722391703,
|
|
"grad_norm": 3.2715816497802734,
|
|
"learning_rate": 5.635467980295567e-05,
|
|
"loss": 0.2669,
|
|
"step": 14680
|
|
},
|
|
{
|
|
"epoch": 71.75106772422208,
|
|
"grad_norm": 4.031295299530029,
|
|
"learning_rate": 5.61576354679803e-05,
|
|
"loss": 0.2729,
|
|
"step": 14700
|
|
},
|
|
{
|
|
"epoch": 71.84868822452715,
|
|
"grad_norm": 3.0305051803588867,
|
|
"learning_rate": 5.5960591133004925e-05,
|
|
"loss": 0.272,
|
|
"step": 14720
|
|
},
|
|
{
|
|
"epoch": 71.94630872483222,
|
|
"grad_norm": 2.170488119125366,
|
|
"learning_rate": 5.5763546798029565e-05,
|
|
"loss": 0.2771,
|
|
"step": 14740
|
|
},
|
|
{
|
|
"epoch": 72.04392922513728,
|
|
"grad_norm": 2.89032244682312,
|
|
"learning_rate": 5.556650246305419e-05,
|
|
"loss": 0.2665,
|
|
"step": 14760
|
|
},
|
|
{
|
|
"epoch": 72.14154972544235,
|
|
"grad_norm": 2.4803104400634766,
|
|
"learning_rate": 5.536945812807882e-05,
|
|
"loss": 0.2653,
|
|
"step": 14780
|
|
},
|
|
{
|
|
"epoch": 72.2391702257474,
|
|
"grad_norm": 2.525521755218506,
|
|
"learning_rate": 5.517241379310345e-05,
|
|
"loss": 0.2595,
|
|
"step": 14800
|
|
},
|
|
{
|
|
"epoch": 72.33679072605247,
|
|
"grad_norm": 2.121696710586548,
|
|
"learning_rate": 5.497536945812808e-05,
|
|
"loss": 0.2557,
|
|
"step": 14820
|
|
},
|
|
{
|
|
"epoch": 72.43441122635754,
|
|
"grad_norm": 1.8344529867172241,
|
|
"learning_rate": 5.477832512315271e-05,
|
|
"loss": 0.2512,
|
|
"step": 14840
|
|
},
|
|
{
|
|
"epoch": 72.5320317266626,
|
|
"grad_norm": 2.196624517440796,
|
|
"learning_rate": 5.458128078817735e-05,
|
|
"loss": 0.2474,
|
|
"step": 14860
|
|
},
|
|
{
|
|
"epoch": 72.62965222696766,
|
|
"grad_norm": 3.387305974960327,
|
|
"learning_rate": 5.4384236453201975e-05,
|
|
"loss": 0.2696,
|
|
"step": 14880
|
|
},
|
|
{
|
|
"epoch": 72.72727272727273,
|
|
"grad_norm": 2.481462240219116,
|
|
"learning_rate": 5.41871921182266e-05,
|
|
"loss": 0.2681,
|
|
"step": 14900
|
|
},
|
|
{
|
|
"epoch": 72.82489322757779,
|
|
"grad_norm": 2.6742024421691895,
|
|
"learning_rate": 5.399014778325123e-05,
|
|
"loss": 0.2553,
|
|
"step": 14920
|
|
},
|
|
{
|
|
"epoch": 72.92251372788286,
|
|
"grad_norm": 2.590111494064331,
|
|
"learning_rate": 5.379310344827586e-05,
|
|
"loss": 0.265,
|
|
"step": 14940
|
|
},
|
|
{
|
|
"epoch": 73.02013422818791,
|
|
"grad_norm": 2.311305046081543,
|
|
"learning_rate": 5.359605911330049e-05,
|
|
"loss": 0.2644,
|
|
"step": 14960
|
|
},
|
|
{
|
|
"epoch": 73.11775472849298,
|
|
"grad_norm": 2.502192974090576,
|
|
"learning_rate": 5.339901477832513e-05,
|
|
"loss": 0.2634,
|
|
"step": 14980
|
|
},
|
|
{
|
|
"epoch": 73.21537522879805,
|
|
"grad_norm": 2.5767018795013428,
|
|
"learning_rate": 5.320197044334976e-05,
|
|
"loss": 0.2625,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 73.31299572910311,
|
|
"grad_norm": 3.005783796310425,
|
|
"learning_rate": 5.3004926108374385e-05,
|
|
"loss": 0.2589,
|
|
"step": 15020
|
|
},
|
|
{
|
|
"epoch": 73.41061622940818,
|
|
"grad_norm": 2.7578892707824707,
|
|
"learning_rate": 5.280788177339901e-05,
|
|
"loss": 0.2519,
|
|
"step": 15040
|
|
},
|
|
{
|
|
"epoch": 73.50823672971325,
|
|
"grad_norm": 3.286733627319336,
|
|
"learning_rate": 5.2610837438423646e-05,
|
|
"loss": 0.2603,
|
|
"step": 15060
|
|
},
|
|
{
|
|
"epoch": 73.6058572300183,
|
|
"grad_norm": 2.323225975036621,
|
|
"learning_rate": 5.241379310344828e-05,
|
|
"loss": 0.2576,
|
|
"step": 15080
|
|
},
|
|
{
|
|
"epoch": 73.70347773032337,
|
|
"grad_norm": 2.407222032546997,
|
|
"learning_rate": 5.2216748768472914e-05,
|
|
"loss": 0.253,
|
|
"step": 15100
|
|
},
|
|
{
|
|
"epoch": 73.80109823062843,
|
|
"grad_norm": 3.0755960941314697,
|
|
"learning_rate": 5.201970443349754e-05,
|
|
"loss": 0.261,
|
|
"step": 15120
|
|
},
|
|
{
|
|
"epoch": 73.8987187309335,
|
|
"grad_norm": 1.9469565153121948,
|
|
"learning_rate": 5.182266009852217e-05,
|
|
"loss": 0.2556,
|
|
"step": 15140
|
|
},
|
|
{
|
|
"epoch": 73.99633923123857,
|
|
"grad_norm": 3.5689964294433594,
|
|
"learning_rate": 5.1625615763546795e-05,
|
|
"loss": 0.2718,
|
|
"step": 15160
|
|
},
|
|
{
|
|
"epoch": 74.09395973154362,
|
|
"grad_norm": 1.9299124479293823,
|
|
"learning_rate": 5.142857142857143e-05,
|
|
"loss": 0.2497,
|
|
"step": 15180
|
|
},
|
|
{
|
|
"epoch": 74.19158023184869,
|
|
"grad_norm": 2.1597163677215576,
|
|
"learning_rate": 5.123152709359606e-05,
|
|
"loss": 0.2526,
|
|
"step": 15200
|
|
},
|
|
{
|
|
"epoch": 74.28920073215376,
|
|
"grad_norm": 2.4359443187713623,
|
|
"learning_rate": 5.10344827586207e-05,
|
|
"loss": 0.2557,
|
|
"step": 15220
|
|
},
|
|
{
|
|
"epoch": 74.38682123245881,
|
|
"grad_norm": 2.449601411819458,
|
|
"learning_rate": 5.0837438423645324e-05,
|
|
"loss": 0.2628,
|
|
"step": 15240
|
|
},
|
|
{
|
|
"epoch": 74.48444173276388,
|
|
"grad_norm": 2.5450046062469482,
|
|
"learning_rate": 5.064039408866995e-05,
|
|
"loss": 0.2683,
|
|
"step": 15260
|
|
},
|
|
{
|
|
"epoch": 74.58206223306894,
|
|
"grad_norm": 2.499568462371826,
|
|
"learning_rate": 5.044334975369458e-05,
|
|
"loss": 0.2456,
|
|
"step": 15280
|
|
},
|
|
{
|
|
"epoch": 74.67968273337401,
|
|
"grad_norm": 2.276536703109741,
|
|
"learning_rate": 5.024630541871922e-05,
|
|
"loss": 0.2613,
|
|
"step": 15300
|
|
},
|
|
{
|
|
"epoch": 74.77730323367908,
|
|
"grad_norm": 6.047021865844727,
|
|
"learning_rate": 5.0049261083743846e-05,
|
|
"loss": 0.2591,
|
|
"step": 15320
|
|
},
|
|
{
|
|
"epoch": 74.87492373398413,
|
|
"grad_norm": 2.7853705883026123,
|
|
"learning_rate": 4.985221674876848e-05,
|
|
"loss": 0.2584,
|
|
"step": 15340
|
|
},
|
|
{
|
|
"epoch": 74.9725442342892,
|
|
"grad_norm": 2.658870220184326,
|
|
"learning_rate": 4.9655172413793107e-05,
|
|
"loss": 0.2485,
|
|
"step": 15360
|
|
},
|
|
{
|
|
"epoch": 75.07016473459427,
|
|
"grad_norm": 1.9290242195129395,
|
|
"learning_rate": 4.9458128078817734e-05,
|
|
"loss": 0.2456,
|
|
"step": 15380
|
|
},
|
|
{
|
|
"epoch": 75.16778523489933,
|
|
"grad_norm": 2.4340288639068604,
|
|
"learning_rate": 4.926108374384237e-05,
|
|
"loss": 0.2517,
|
|
"step": 15400
|
|
},
|
|
{
|
|
"epoch": 75.2654057352044,
|
|
"grad_norm": 1.7368818521499634,
|
|
"learning_rate": 4.9064039408866995e-05,
|
|
"loss": 0.245,
|
|
"step": 15420
|
|
},
|
|
{
|
|
"epoch": 75.36302623550945,
|
|
"grad_norm": 3.224472999572754,
|
|
"learning_rate": 4.886699507389163e-05,
|
|
"loss": 0.2512,
|
|
"step": 15440
|
|
},
|
|
{
|
|
"epoch": 75.46064673581452,
|
|
"grad_norm": 2.9347827434539795,
|
|
"learning_rate": 4.866995073891626e-05,
|
|
"loss": 0.252,
|
|
"step": 15460
|
|
},
|
|
{
|
|
"epoch": 75.55826723611959,
|
|
"grad_norm": 3.1281368732452393,
|
|
"learning_rate": 4.847290640394089e-05,
|
|
"loss": 0.2662,
|
|
"step": 15480
|
|
},
|
|
{
|
|
"epoch": 75.65588773642465,
|
|
"grad_norm": 2.1834158897399902,
|
|
"learning_rate": 4.827586206896552e-05,
|
|
"loss": 0.2549,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"epoch": 75.75350823672972,
|
|
"grad_norm": 2.4959053993225098,
|
|
"learning_rate": 4.807881773399015e-05,
|
|
"loss": 0.2489,
|
|
"step": 15520
|
|
},
|
|
{
|
|
"epoch": 75.85112873703477,
|
|
"grad_norm": 1.9630552530288696,
|
|
"learning_rate": 4.788177339901478e-05,
|
|
"loss": 0.2685,
|
|
"step": 15540
|
|
},
|
|
{
|
|
"epoch": 75.94874923733984,
|
|
"grad_norm": 2.9730660915374756,
|
|
"learning_rate": 4.768472906403941e-05,
|
|
"loss": 0.2568,
|
|
"step": 15560
|
|
},
|
|
{
|
|
"epoch": 76.04636973764491,
|
|
"grad_norm": 2.492307186126709,
|
|
"learning_rate": 4.748768472906404e-05,
|
|
"loss": 0.254,
|
|
"step": 15580
|
|
},
|
|
{
|
|
"epoch": 76.14399023794996,
|
|
"grad_norm": 2.1463494300842285,
|
|
"learning_rate": 4.729064039408867e-05,
|
|
"loss": 0.2623,
|
|
"step": 15600
|
|
},
|
|
{
|
|
"epoch": 76.24161073825503,
|
|
"grad_norm": 2.957017421722412,
|
|
"learning_rate": 4.7093596059113306e-05,
|
|
"loss": 0.2576,
|
|
"step": 15620
|
|
},
|
|
{
|
|
"epoch": 76.3392312385601,
|
|
"grad_norm": 2.1611711978912354,
|
|
"learning_rate": 4.689655172413793e-05,
|
|
"loss": 0.2447,
|
|
"step": 15640
|
|
},
|
|
{
|
|
"epoch": 76.43685173886516,
|
|
"grad_norm": 3.1399998664855957,
|
|
"learning_rate": 4.669950738916256e-05,
|
|
"loss": 0.2586,
|
|
"step": 15660
|
|
},
|
|
{
|
|
"epoch": 76.53447223917023,
|
|
"grad_norm": 2.817157030105591,
|
|
"learning_rate": 4.6502463054187194e-05,
|
|
"loss": 0.2439,
|
|
"step": 15680
|
|
},
|
|
{
|
|
"epoch": 76.63209273947528,
|
|
"grad_norm": 1.3343191146850586,
|
|
"learning_rate": 4.630541871921182e-05,
|
|
"loss": 0.2522,
|
|
"step": 15700
|
|
},
|
|
{
|
|
"epoch": 76.72971323978035,
|
|
"grad_norm": 2.9455504417419434,
|
|
"learning_rate": 4.6108374384236455e-05,
|
|
"loss": 0.2606,
|
|
"step": 15720
|
|
},
|
|
{
|
|
"epoch": 76.82733374008542,
|
|
"grad_norm": 2.981264352798462,
|
|
"learning_rate": 4.591133004926109e-05,
|
|
"loss": 0.2482,
|
|
"step": 15740
|
|
},
|
|
{
|
|
"epoch": 76.92495424039048,
|
|
"grad_norm": 2.9296011924743652,
|
|
"learning_rate": 4.5714285714285716e-05,
|
|
"loss": 0.2578,
|
|
"step": 15760
|
|
},
|
|
{
|
|
"epoch": 77.02257474069555,
|
|
"grad_norm": 2.8159282207489014,
|
|
"learning_rate": 4.551724137931035e-05,
|
|
"loss": 0.2571,
|
|
"step": 15780
|
|
},
|
|
{
|
|
"epoch": 77.12019524100062,
|
|
"grad_norm": 2.184053421020508,
|
|
"learning_rate": 4.532019704433498e-05,
|
|
"loss": 0.2548,
|
|
"step": 15800
|
|
},
|
|
{
|
|
"epoch": 77.21781574130567,
|
|
"grad_norm": 2.1801810264587402,
|
|
"learning_rate": 4.5123152709359604e-05,
|
|
"loss": 0.2367,
|
|
"step": 15820
|
|
},
|
|
{
|
|
"epoch": 77.31543624161074,
|
|
"grad_norm": 2.510050058364868,
|
|
"learning_rate": 4.492610837438424e-05,
|
|
"loss": 0.2522,
|
|
"step": 15840
|
|
},
|
|
{
|
|
"epoch": 77.4130567419158,
|
|
"grad_norm": 2.849837303161621,
|
|
"learning_rate": 4.472906403940887e-05,
|
|
"loss": 0.2524,
|
|
"step": 15860
|
|
},
|
|
{
|
|
"epoch": 77.51067724222086,
|
|
"grad_norm": 3.769998788833618,
|
|
"learning_rate": 4.45320197044335e-05,
|
|
"loss": 0.2568,
|
|
"step": 15880
|
|
},
|
|
{
|
|
"epoch": 77.60829774252593,
|
|
"grad_norm": 3.2575082778930664,
|
|
"learning_rate": 4.433497536945813e-05,
|
|
"loss": 0.2565,
|
|
"step": 15900
|
|
},
|
|
{
|
|
"epoch": 77.70591824283099,
|
|
"grad_norm": 2.199042797088623,
|
|
"learning_rate": 4.413793103448276e-05,
|
|
"loss": 0.2508,
|
|
"step": 15920
|
|
},
|
|
{
|
|
"epoch": 77.80353874313606,
|
|
"grad_norm": 1.9908735752105713,
|
|
"learning_rate": 4.394088669950739e-05,
|
|
"loss": 0.2612,
|
|
"step": 15940
|
|
},
|
|
{
|
|
"epoch": 77.90115924344113,
|
|
"grad_norm": 2.091723680496216,
|
|
"learning_rate": 4.374384236453202e-05,
|
|
"loss": 0.2491,
|
|
"step": 15960
|
|
},
|
|
{
|
|
"epoch": 77.99877974374618,
|
|
"grad_norm": 2.705829381942749,
|
|
"learning_rate": 4.3546798029556655e-05,
|
|
"loss": 0.2596,
|
|
"step": 15980
|
|
},
|
|
{
|
|
"epoch": 78.09640024405125,
|
|
"grad_norm": 2.6604998111724854,
|
|
"learning_rate": 4.334975369458129e-05,
|
|
"loss": 0.2475,
|
|
"step": 16000
|
|
},
|
|
{
|
|
"epoch": 78.19402074435631,
|
|
"grad_norm": 2.4286489486694336,
|
|
"learning_rate": 4.3152709359605916e-05,
|
|
"loss": 0.2509,
|
|
"step": 16020
|
|
},
|
|
{
|
|
"epoch": 78.29164124466138,
|
|
"grad_norm": 3.3478493690490723,
|
|
"learning_rate": 4.295566502463054e-05,
|
|
"loss": 0.2491,
|
|
"step": 16040
|
|
},
|
|
{
|
|
"epoch": 78.38926174496645,
|
|
"grad_norm": 2.9512908458709717,
|
|
"learning_rate": 4.275862068965518e-05,
|
|
"loss": 0.2362,
|
|
"step": 16060
|
|
},
|
|
{
|
|
"epoch": 78.4868822452715,
|
|
"grad_norm": 2.0870890617370605,
|
|
"learning_rate": 4.2561576354679804e-05,
|
|
"loss": 0.2546,
|
|
"step": 16080
|
|
},
|
|
{
|
|
"epoch": 78.58450274557657,
|
|
"grad_norm": 2.3549749851226807,
|
|
"learning_rate": 4.236453201970443e-05,
|
|
"loss": 0.2544,
|
|
"step": 16100
|
|
},
|
|
{
|
|
"epoch": 78.68212324588164,
|
|
"grad_norm": 2.296377658843994,
|
|
"learning_rate": 4.2167487684729065e-05,
|
|
"loss": 0.2524,
|
|
"step": 16120
|
|
},
|
|
{
|
|
"epoch": 78.7797437461867,
|
|
"grad_norm": 2.9563801288604736,
|
|
"learning_rate": 4.19704433497537e-05,
|
|
"loss": 0.2534,
|
|
"step": 16140
|
|
},
|
|
{
|
|
"epoch": 78.87736424649177,
|
|
"grad_norm": 3.3844058513641357,
|
|
"learning_rate": 4.1773399014778326e-05,
|
|
"loss": 0.2629,
|
|
"step": 16160
|
|
},
|
|
{
|
|
"epoch": 78.97498474679682,
|
|
"grad_norm": 1.9131345748901367,
|
|
"learning_rate": 4.157635467980296e-05,
|
|
"loss": 0.2478,
|
|
"step": 16180
|
|
},
|
|
{
|
|
"epoch": 79.07260524710189,
|
|
"grad_norm": 3.4866435527801514,
|
|
"learning_rate": 4.1379310344827587e-05,
|
|
"loss": 0.2464,
|
|
"step": 16200
|
|
},
|
|
{
|
|
"epoch": 79.17022574740696,
|
|
"grad_norm": 2.0751941204071045,
|
|
"learning_rate": 4.1182266009852214e-05,
|
|
"loss": 0.2435,
|
|
"step": 16220
|
|
},
|
|
{
|
|
"epoch": 79.26784624771201,
|
|
"grad_norm": 1.776879072189331,
|
|
"learning_rate": 4.098522167487685e-05,
|
|
"loss": 0.2501,
|
|
"step": 16240
|
|
},
|
|
{
|
|
"epoch": 79.36546674801708,
|
|
"grad_norm": 3.9006545543670654,
|
|
"learning_rate": 4.078817733990148e-05,
|
|
"loss": 0.2586,
|
|
"step": 16260
|
|
},
|
|
{
|
|
"epoch": 79.46308724832215,
|
|
"grad_norm": 2.390000581741333,
|
|
"learning_rate": 4.0591133004926115e-05,
|
|
"loss": 0.2408,
|
|
"step": 16280
|
|
},
|
|
{
|
|
"epoch": 79.56070774862721,
|
|
"grad_norm": 3.1795706748962402,
|
|
"learning_rate": 4.039408866995074e-05,
|
|
"loss": 0.2469,
|
|
"step": 16300
|
|
},
|
|
{
|
|
"epoch": 79.65832824893228,
|
|
"grad_norm": 2.6821188926696777,
|
|
"learning_rate": 4.019704433497537e-05,
|
|
"loss": 0.2429,
|
|
"step": 16320
|
|
},
|
|
{
|
|
"epoch": 79.75594874923733,
|
|
"grad_norm": 3.01457142829895,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.2598,
|
|
"step": 16340
|
|
},
|
|
{
|
|
"epoch": 79.8535692495424,
|
|
"grad_norm": 2.8440592288970947,
|
|
"learning_rate": 3.980295566502463e-05,
|
|
"loss": 0.2494,
|
|
"step": 16360
|
|
},
|
|
{
|
|
"epoch": 79.95118974984747,
|
|
"grad_norm": 3.210845708847046,
|
|
"learning_rate": 3.9605911330049264e-05,
|
|
"loss": 0.2521,
|
|
"step": 16380
|
|
},
|
|
{
|
|
"epoch": 80.04881025015253,
|
|
"grad_norm": 3.9740731716156006,
|
|
"learning_rate": 3.94088669950739e-05,
|
|
"loss": 0.2537,
|
|
"step": 16400
|
|
},
|
|
{
|
|
"epoch": 80.1464307504576,
|
|
"grad_norm": 2.3433115482330322,
|
|
"learning_rate": 3.9211822660098525e-05,
|
|
"loss": 0.2441,
|
|
"step": 16420
|
|
},
|
|
{
|
|
"epoch": 80.24405125076267,
|
|
"grad_norm": 2.5279314517974854,
|
|
"learning_rate": 3.901477832512315e-05,
|
|
"loss": 0.2564,
|
|
"step": 16440
|
|
},
|
|
{
|
|
"epoch": 80.34167175106772,
|
|
"grad_norm": 2.8062689304351807,
|
|
"learning_rate": 3.8817733990147786e-05,
|
|
"loss": 0.245,
|
|
"step": 16460
|
|
},
|
|
{
|
|
"epoch": 80.43929225137279,
|
|
"grad_norm": 1.9689416885375977,
|
|
"learning_rate": 3.862068965517241e-05,
|
|
"loss": 0.2497,
|
|
"step": 16480
|
|
},
|
|
{
|
|
"epoch": 80.53691275167785,
|
|
"grad_norm": 2.462744951248169,
|
|
"learning_rate": 3.842364532019704e-05,
|
|
"loss": 0.2525,
|
|
"step": 16500
|
|
},
|
|
{
|
|
"epoch": 80.63453325198292,
|
|
"grad_norm": 1.9201568365097046,
|
|
"learning_rate": 3.822660098522168e-05,
|
|
"loss": 0.2495,
|
|
"step": 16520
|
|
},
|
|
{
|
|
"epoch": 80.73215375228799,
|
|
"grad_norm": 1.7118130922317505,
|
|
"learning_rate": 3.802955665024631e-05,
|
|
"loss": 0.2415,
|
|
"step": 16540
|
|
},
|
|
{
|
|
"epoch": 80.82977425259304,
|
|
"grad_norm": 2.311931848526001,
|
|
"learning_rate": 3.783251231527094e-05,
|
|
"loss": 0.247,
|
|
"step": 16560
|
|
},
|
|
{
|
|
"epoch": 80.92739475289811,
|
|
"grad_norm": 2.030750274658203,
|
|
"learning_rate": 3.763546798029557e-05,
|
|
"loss": 0.2415,
|
|
"step": 16580
|
|
},
|
|
{
|
|
"epoch": 81.02501525320318,
|
|
"grad_norm": 1.949194312095642,
|
|
"learning_rate": 3.7438423645320196e-05,
|
|
"loss": 0.2555,
|
|
"step": 16600
|
|
},
|
|
{
|
|
"epoch": 81.12263575350823,
|
|
"grad_norm": 1.8409544229507446,
|
|
"learning_rate": 3.724137931034483e-05,
|
|
"loss": 0.2412,
|
|
"step": 16620
|
|
},
|
|
{
|
|
"epoch": 81.2202562538133,
|
|
"grad_norm": 2.5164377689361572,
|
|
"learning_rate": 3.704433497536946e-05,
|
|
"loss": 0.2326,
|
|
"step": 16640
|
|
},
|
|
{
|
|
"epoch": 81.31787675411836,
|
|
"grad_norm": 2.3859026432037354,
|
|
"learning_rate": 3.684729064039409e-05,
|
|
"loss": 0.2499,
|
|
"step": 16660
|
|
},
|
|
{
|
|
"epoch": 81.41549725442343,
|
|
"grad_norm": 2.753124713897705,
|
|
"learning_rate": 3.6650246305418725e-05,
|
|
"loss": 0.2504,
|
|
"step": 16680
|
|
},
|
|
{
|
|
"epoch": 81.5131177547285,
|
|
"grad_norm": 2.294701099395752,
|
|
"learning_rate": 3.645320197044335e-05,
|
|
"loss": 0.2433,
|
|
"step": 16700
|
|
},
|
|
{
|
|
"epoch": 81.61073825503355,
|
|
"grad_norm": 2.179985761642456,
|
|
"learning_rate": 3.625615763546798e-05,
|
|
"loss": 0.2511,
|
|
"step": 16720
|
|
},
|
|
{
|
|
"epoch": 81.70835875533862,
|
|
"grad_norm": 2.242023229598999,
|
|
"learning_rate": 3.605911330049261e-05,
|
|
"loss": 0.2558,
|
|
"step": 16740
|
|
},
|
|
{
|
|
"epoch": 81.80597925564369,
|
|
"grad_norm": 2.9500415325164795,
|
|
"learning_rate": 3.586206896551724e-05,
|
|
"loss": 0.2423,
|
|
"step": 16760
|
|
},
|
|
{
|
|
"epoch": 81.90359975594875,
|
|
"grad_norm": 2.372332811355591,
|
|
"learning_rate": 3.5665024630541874e-05,
|
|
"loss": 0.2503,
|
|
"step": 16780
|
|
},
|
|
{
|
|
"epoch": 82.00122025625382,
|
|
"grad_norm": 2.8338615894317627,
|
|
"learning_rate": 3.546798029556651e-05,
|
|
"loss": 0.2442,
|
|
"step": 16800
|
|
},
|
|
{
|
|
"epoch": 82.09884075655887,
|
|
"grad_norm": 2.5122156143188477,
|
|
"learning_rate": 3.5270935960591135e-05,
|
|
"loss": 0.2386,
|
|
"step": 16820
|
|
},
|
|
{
|
|
"epoch": 82.19646125686394,
|
|
"grad_norm": 2.6733508110046387,
|
|
"learning_rate": 3.507389162561577e-05,
|
|
"loss": 0.2376,
|
|
"step": 16840
|
|
},
|
|
{
|
|
"epoch": 82.29408175716901,
|
|
"grad_norm": 1.9639496803283691,
|
|
"learning_rate": 3.4876847290640396e-05,
|
|
"loss": 0.2366,
|
|
"step": 16860
|
|
},
|
|
{
|
|
"epoch": 82.39170225747407,
|
|
"grad_norm": 2.2403128147125244,
|
|
"learning_rate": 3.467980295566502e-05,
|
|
"loss": 0.2478,
|
|
"step": 16880
|
|
},
|
|
{
|
|
"epoch": 82.48932275777914,
|
|
"grad_norm": 2.3874387741088867,
|
|
"learning_rate": 3.4482758620689657e-05,
|
|
"loss": 0.2561,
|
|
"step": 16900
|
|
},
|
|
{
|
|
"epoch": 82.5869432580842,
|
|
"grad_norm": 3.6774182319641113,
|
|
"learning_rate": 3.428571428571429e-05,
|
|
"loss": 0.2448,
|
|
"step": 16920
|
|
},
|
|
{
|
|
"epoch": 82.68456375838926,
|
|
"grad_norm": 1.8325834274291992,
|
|
"learning_rate": 3.408866995073892e-05,
|
|
"loss": 0.2515,
|
|
"step": 16940
|
|
},
|
|
{
|
|
"epoch": 82.78218425869433,
|
|
"grad_norm": 2.846112012863159,
|
|
"learning_rate": 3.389162561576355e-05,
|
|
"loss": 0.2481,
|
|
"step": 16960
|
|
},
|
|
{
|
|
"epoch": 82.87980475899938,
|
|
"grad_norm": 3.7636115550994873,
|
|
"learning_rate": 3.369458128078818e-05,
|
|
"loss": 0.2529,
|
|
"step": 16980
|
|
},
|
|
{
|
|
"epoch": 82.97742525930445,
|
|
"grad_norm": 2.4501962661743164,
|
|
"learning_rate": 3.3497536945812806e-05,
|
|
"loss": 0.2344,
|
|
"step": 17000
|
|
},
|
|
{
|
|
"epoch": 83.07504575960952,
|
|
"grad_norm": 2.4377410411834717,
|
|
"learning_rate": 3.330049261083744e-05,
|
|
"loss": 0.2373,
|
|
"step": 17020
|
|
},
|
|
{
|
|
"epoch": 83.17266625991458,
|
|
"grad_norm": 2.180765151977539,
|
|
"learning_rate": 3.310344827586207e-05,
|
|
"loss": 0.2395,
|
|
"step": 17040
|
|
},
|
|
{
|
|
"epoch": 83.27028676021965,
|
|
"grad_norm": 3.2704169750213623,
|
|
"learning_rate": 3.29064039408867e-05,
|
|
"loss": 0.2407,
|
|
"step": 17060
|
|
},
|
|
{
|
|
"epoch": 83.36790726052472,
|
|
"grad_norm": 2.74991512298584,
|
|
"learning_rate": 3.2709359605911334e-05,
|
|
"loss": 0.2351,
|
|
"step": 17080
|
|
},
|
|
{
|
|
"epoch": 83.46552776082977,
|
|
"grad_norm": 1.780633807182312,
|
|
"learning_rate": 3.251231527093596e-05,
|
|
"loss": 0.2379,
|
|
"step": 17100
|
|
},
|
|
{
|
|
"epoch": 83.56314826113484,
|
|
"grad_norm": 2.352802038192749,
|
|
"learning_rate": 3.2315270935960595e-05,
|
|
"loss": 0.244,
|
|
"step": 17120
|
|
},
|
|
{
|
|
"epoch": 83.6607687614399,
|
|
"grad_norm": 3.505608320236206,
|
|
"learning_rate": 3.211822660098522e-05,
|
|
"loss": 0.2443,
|
|
"step": 17140
|
|
},
|
|
{
|
|
"epoch": 83.75838926174497,
|
|
"grad_norm": 2.568233013153076,
|
|
"learning_rate": 3.192118226600985e-05,
|
|
"loss": 0.2499,
|
|
"step": 17160
|
|
},
|
|
{
|
|
"epoch": 83.85600976205004,
|
|
"grad_norm": 1.864367961883545,
|
|
"learning_rate": 3.172413793103448e-05,
|
|
"loss": 0.2543,
|
|
"step": 17180
|
|
},
|
|
{
|
|
"epoch": 83.95363026235509,
|
|
"grad_norm": 2.386052370071411,
|
|
"learning_rate": 3.152709359605912e-05,
|
|
"loss": 0.2505,
|
|
"step": 17200
|
|
},
|
|
{
|
|
"epoch": 84.05125076266016,
|
|
"grad_norm": 4.361128330230713,
|
|
"learning_rate": 3.1330049261083744e-05,
|
|
"loss": 0.2505,
|
|
"step": 17220
|
|
},
|
|
{
|
|
"epoch": 84.14887126296523,
|
|
"grad_norm": 1.4861139059066772,
|
|
"learning_rate": 3.113300492610838e-05,
|
|
"loss": 0.2314,
|
|
"step": 17240
|
|
},
|
|
{
|
|
"epoch": 84.24649176327028,
|
|
"grad_norm": 1.9692414999008179,
|
|
"learning_rate": 3.0935960591133005e-05,
|
|
"loss": 0.2499,
|
|
"step": 17260
|
|
},
|
|
{
|
|
"epoch": 84.34411226357535,
|
|
"grad_norm": 2.245277166366577,
|
|
"learning_rate": 3.073891625615763e-05,
|
|
"loss": 0.243,
|
|
"step": 17280
|
|
},
|
|
{
|
|
"epoch": 84.44173276388041,
|
|
"grad_norm": 2.0669002532958984,
|
|
"learning_rate": 3.0541871921182266e-05,
|
|
"loss": 0.2388,
|
|
"step": 17300
|
|
},
|
|
{
|
|
"epoch": 84.53935326418548,
|
|
"grad_norm": 2.377110004425049,
|
|
"learning_rate": 3.0344827586206897e-05,
|
|
"loss": 0.2431,
|
|
"step": 17320
|
|
},
|
|
{
|
|
"epoch": 84.63697376449055,
|
|
"grad_norm": 2.4260573387145996,
|
|
"learning_rate": 3.0147783251231527e-05,
|
|
"loss": 0.2393,
|
|
"step": 17340
|
|
},
|
|
{
|
|
"epoch": 84.7345942647956,
|
|
"grad_norm": 1.7577930688858032,
|
|
"learning_rate": 2.995073891625616e-05,
|
|
"loss": 0.2444,
|
|
"step": 17360
|
|
},
|
|
{
|
|
"epoch": 84.83221476510067,
|
|
"grad_norm": 2.4844295978546143,
|
|
"learning_rate": 2.9753694581280788e-05,
|
|
"loss": 0.2474,
|
|
"step": 17380
|
|
},
|
|
{
|
|
"epoch": 84.92983526540573,
|
|
"grad_norm": 2.7530508041381836,
|
|
"learning_rate": 2.9556650246305422e-05,
|
|
"loss": 0.2459,
|
|
"step": 17400
|
|
},
|
|
{
|
|
"epoch": 85.0274557657108,
|
|
"grad_norm": 1.6418040990829468,
|
|
"learning_rate": 2.9359605911330052e-05,
|
|
"loss": 0.2491,
|
|
"step": 17420
|
|
},
|
|
{
|
|
"epoch": 85.12507626601587,
|
|
"grad_norm": 2.0329489707946777,
|
|
"learning_rate": 2.916256157635468e-05,
|
|
"loss": 0.2426,
|
|
"step": 17440
|
|
},
|
|
{
|
|
"epoch": 85.22269676632092,
|
|
"grad_norm": 1.6439207792282104,
|
|
"learning_rate": 2.8965517241379313e-05,
|
|
"loss": 0.2351,
|
|
"step": 17460
|
|
},
|
|
{
|
|
"epoch": 85.32031726662599,
|
|
"grad_norm": 1.6182892322540283,
|
|
"learning_rate": 2.8768472906403944e-05,
|
|
"loss": 0.2468,
|
|
"step": 17480
|
|
},
|
|
{
|
|
"epoch": 85.41793776693106,
|
|
"grad_norm": 3.263887882232666,
|
|
"learning_rate": 2.857142857142857e-05,
|
|
"loss": 0.2426,
|
|
"step": 17500
|
|
},
|
|
{
|
|
"epoch": 85.51555826723612,
|
|
"grad_norm": 3.062742233276367,
|
|
"learning_rate": 2.8374384236453205e-05,
|
|
"loss": 0.2386,
|
|
"step": 17520
|
|
},
|
|
{
|
|
"epoch": 85.61317876754119,
|
|
"grad_norm": 2.8203582763671875,
|
|
"learning_rate": 2.8177339901477835e-05,
|
|
"loss": 0.2407,
|
|
"step": 17540
|
|
},
|
|
{
|
|
"epoch": 85.71079926784624,
|
|
"grad_norm": 2.3993334770202637,
|
|
"learning_rate": 2.7980295566502462e-05,
|
|
"loss": 0.2418,
|
|
"step": 17560
|
|
},
|
|
{
|
|
"epoch": 85.80841976815131,
|
|
"grad_norm": 1.7914482355117798,
|
|
"learning_rate": 2.7783251231527096e-05,
|
|
"loss": 0.2377,
|
|
"step": 17580
|
|
},
|
|
{
|
|
"epoch": 85.90604026845638,
|
|
"grad_norm": 3.20501971244812,
|
|
"learning_rate": 2.7586206896551727e-05,
|
|
"loss": 0.2398,
|
|
"step": 17600
|
|
},
|
|
{
|
|
"epoch": 86.00366076876143,
|
|
"grad_norm": 1.6623684167861938,
|
|
"learning_rate": 2.7389162561576354e-05,
|
|
"loss": 0.2442,
|
|
"step": 17620
|
|
},
|
|
{
|
|
"epoch": 86.1012812690665,
|
|
"grad_norm": 2.3433034420013428,
|
|
"learning_rate": 2.7192118226600988e-05,
|
|
"loss": 0.2358,
|
|
"step": 17640
|
|
},
|
|
{
|
|
"epoch": 86.19890176937157,
|
|
"grad_norm": 2.6188597679138184,
|
|
"learning_rate": 2.6995073891625615e-05,
|
|
"loss": 0.2336,
|
|
"step": 17660
|
|
},
|
|
{
|
|
"epoch": 86.29652226967663,
|
|
"grad_norm": 3.1089391708374023,
|
|
"learning_rate": 2.6798029556650245e-05,
|
|
"loss": 0.239,
|
|
"step": 17680
|
|
},
|
|
{
|
|
"epoch": 86.3941427699817,
|
|
"grad_norm": 2.378998041152954,
|
|
"learning_rate": 2.660098522167488e-05,
|
|
"loss": 0.2336,
|
|
"step": 17700
|
|
},
|
|
{
|
|
"epoch": 86.49176327028675,
|
|
"grad_norm": 2.4956347942352295,
|
|
"learning_rate": 2.6403940886699506e-05,
|
|
"loss": 0.2497,
|
|
"step": 17720
|
|
},
|
|
{
|
|
"epoch": 86.58938377059182,
|
|
"grad_norm": 2.529139757156372,
|
|
"learning_rate": 2.620689655172414e-05,
|
|
"loss": 0.2436,
|
|
"step": 17740
|
|
},
|
|
{
|
|
"epoch": 86.68700427089689,
|
|
"grad_norm": 2.6899948120117188,
|
|
"learning_rate": 2.600985221674877e-05,
|
|
"loss": 0.2445,
|
|
"step": 17760
|
|
},
|
|
{
|
|
"epoch": 86.78462477120195,
|
|
"grad_norm": 1.8922455310821533,
|
|
"learning_rate": 2.5812807881773398e-05,
|
|
"loss": 0.2366,
|
|
"step": 17780
|
|
},
|
|
{
|
|
"epoch": 86.88224527150702,
|
|
"grad_norm": 1.9104729890823364,
|
|
"learning_rate": 2.561576354679803e-05,
|
|
"loss": 0.2345,
|
|
"step": 17800
|
|
},
|
|
{
|
|
"epoch": 86.97986577181209,
|
|
"grad_norm": 3.2369461059570312,
|
|
"learning_rate": 2.5418719211822662e-05,
|
|
"loss": 0.2515,
|
|
"step": 17820
|
|
},
|
|
{
|
|
"epoch": 87.07748627211714,
|
|
"grad_norm": 2.2592508792877197,
|
|
"learning_rate": 2.522167487684729e-05,
|
|
"loss": 0.2333,
|
|
"step": 17840
|
|
},
|
|
{
|
|
"epoch": 87.17510677242221,
|
|
"grad_norm": 2.302445888519287,
|
|
"learning_rate": 2.5024630541871923e-05,
|
|
"loss": 0.2308,
|
|
"step": 17860
|
|
},
|
|
{
|
|
"epoch": 87.27272727272727,
|
|
"grad_norm": 2.0607619285583496,
|
|
"learning_rate": 2.4827586206896553e-05,
|
|
"loss": 0.2323,
|
|
"step": 17880
|
|
},
|
|
{
|
|
"epoch": 87.37034777303234,
|
|
"grad_norm": 2.4503376483917236,
|
|
"learning_rate": 2.4630541871921184e-05,
|
|
"loss": 0.2399,
|
|
"step": 17900
|
|
},
|
|
{
|
|
"epoch": 87.4679682733374,
|
|
"grad_norm": 1.7061033248901367,
|
|
"learning_rate": 2.4433497536945814e-05,
|
|
"loss": 0.249,
|
|
"step": 17920
|
|
},
|
|
{
|
|
"epoch": 87.56558877364246,
|
|
"grad_norm": 2.1557867527008057,
|
|
"learning_rate": 2.4236453201970445e-05,
|
|
"loss": 0.243,
|
|
"step": 17940
|
|
},
|
|
{
|
|
"epoch": 87.66320927394753,
|
|
"grad_norm": 2.0752928256988525,
|
|
"learning_rate": 2.4039408866995075e-05,
|
|
"loss": 0.236,
|
|
"step": 17960
|
|
},
|
|
{
|
|
"epoch": 87.7608297742526,
|
|
"grad_norm": 1.9939770698547363,
|
|
"learning_rate": 2.3842364532019706e-05,
|
|
"loss": 0.24,
|
|
"step": 17980
|
|
},
|
|
{
|
|
"epoch": 87.85845027455765,
|
|
"grad_norm": 2.043842315673828,
|
|
"learning_rate": 2.3645320197044336e-05,
|
|
"loss": 0.2438,
|
|
"step": 18000
|
|
}
|
|
],
|
|
"logging_steps": 20,
|
|
"max_steps": 20400,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 100,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.586273126839091e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|