en-ar-translator / trainer_state.json
moussaKam's picture
upload
c9e27f4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500.0,
"global_step": 18789,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003193357815743254,
"grad_norm": 1.9077140880050831,
"learning_rate": 2.1287919105907397e-07,
"loss": 0.293,
"step": 20
},
{
"epoch": 0.006386715631486508,
"grad_norm": 1.2741126203759185,
"learning_rate": 4.2575838211814794e-07,
"loss": 0.2755,
"step": 40
},
{
"epoch": 0.009580073447229762,
"grad_norm": 0.9579957272491677,
"learning_rate": 6.38637573177222e-07,
"loss": 0.2663,
"step": 60
},
{
"epoch": 0.012773431262973017,
"grad_norm": 0.7572791753798676,
"learning_rate": 8.515167642362959e-07,
"loss": 0.2483,
"step": 80
},
{
"epoch": 0.01596678907871627,
"grad_norm": 0.7631906150781687,
"learning_rate": 1.0643959552953699e-06,
"loss": 0.2345,
"step": 100
},
{
"epoch": 0.019160146894459523,
"grad_norm": 0.6910231975995771,
"learning_rate": 1.277275146354444e-06,
"loss": 0.2325,
"step": 120
},
{
"epoch": 0.02235350471020278,
"grad_norm": 0.6148039205762914,
"learning_rate": 1.490154337413518e-06,
"loss": 0.2204,
"step": 140
},
{
"epoch": 0.025546862525946033,
"grad_norm": 0.5789874935423447,
"learning_rate": 1.7030335284725918e-06,
"loss": 0.2098,
"step": 160
},
{
"epoch": 0.028740220341689285,
"grad_norm": 0.7476463496285388,
"learning_rate": 1.915912719531666e-06,
"loss": 0.2032,
"step": 180
},
{
"epoch": 0.03193357815743254,
"grad_norm": 0.7428740533196578,
"learning_rate": 2.1287919105907398e-06,
"loss": 0.2012,
"step": 200
},
{
"epoch": 0.035126935973175795,
"grad_norm": 0.7037948631808884,
"learning_rate": 2.341671101649814e-06,
"loss": 0.2009,
"step": 220
},
{
"epoch": 0.038320293788919046,
"grad_norm": 0.7550244833274011,
"learning_rate": 2.554550292708888e-06,
"loss": 0.1909,
"step": 240
},
{
"epoch": 0.041513651604662305,
"grad_norm": 0.6563323462206586,
"learning_rate": 2.7674294837679623e-06,
"loss": 0.1921,
"step": 260
},
{
"epoch": 0.04470700942040556,
"grad_norm": 0.6164020880535914,
"learning_rate": 2.980308674827036e-06,
"loss": 0.18,
"step": 280
},
{
"epoch": 0.04790036723614881,
"grad_norm": 0.6690736448965662,
"learning_rate": 3.1931878658861097e-06,
"loss": 0.1775,
"step": 300
},
{
"epoch": 0.05109372505189207,
"grad_norm": 0.6290006787340572,
"learning_rate": 3.4060670569451835e-06,
"loss": 0.1806,
"step": 320
},
{
"epoch": 0.05428708286763532,
"grad_norm": 0.6635212199407985,
"learning_rate": 3.6189462480042583e-06,
"loss": 0.1779,
"step": 340
},
{
"epoch": 0.05748044068337857,
"grad_norm": 0.6197593777818997,
"learning_rate": 3.831825439063332e-06,
"loss": 0.1702,
"step": 360
},
{
"epoch": 0.06067379849912183,
"grad_norm": 0.7111109613175086,
"learning_rate": 4.044704630122406e-06,
"loss": 0.1726,
"step": 380
},
{
"epoch": 0.06386715631486509,
"grad_norm": 0.7014844198325911,
"learning_rate": 4.2575838211814795e-06,
"loss": 0.1677,
"step": 400
},
{
"epoch": 0.06706051413060833,
"grad_norm": 0.6960545421354288,
"learning_rate": 4.470463012240554e-06,
"loss": 0.1661,
"step": 420
},
{
"epoch": 0.07025387194635159,
"grad_norm": 0.6526527280390234,
"learning_rate": 4.683342203299628e-06,
"loss": 0.1599,
"step": 440
},
{
"epoch": 0.07344722976209485,
"grad_norm": 0.6231986280201518,
"learning_rate": 4.896221394358702e-06,
"loss": 0.1608,
"step": 460
},
{
"epoch": 0.07664058757783809,
"grad_norm": 0.6963837819044139,
"learning_rate": 5.109100585417776e-06,
"loss": 0.1622,
"step": 480
},
{
"epoch": 0.07983394539358135,
"grad_norm": 0.6109671730909707,
"learning_rate": 5.32197977647685e-06,
"loss": 0.1598,
"step": 500
},
{
"epoch": 0.08302730320932461,
"grad_norm": 0.5371960923403704,
"learning_rate": 5.534858967535925e-06,
"loss": 0.1594,
"step": 520
},
{
"epoch": 0.08622066102506785,
"grad_norm": 0.5367820828152228,
"learning_rate": 5.747738158594997e-06,
"loss": 0.1596,
"step": 540
},
{
"epoch": 0.08941401884081111,
"grad_norm": 0.5470627592374788,
"learning_rate": 5.960617349654072e-06,
"loss": 0.1572,
"step": 560
},
{
"epoch": 0.09260737665655437,
"grad_norm": 0.6499813395079859,
"learning_rate": 6.173496540713145e-06,
"loss": 0.1608,
"step": 580
},
{
"epoch": 0.09580073447229762,
"grad_norm": 0.5979118456987372,
"learning_rate": 6.386375731772219e-06,
"loss": 0.1556,
"step": 600
},
{
"epoch": 0.09899409228804087,
"grad_norm": 0.6192365185802242,
"learning_rate": 6.5992549228312945e-06,
"loss": 0.1558,
"step": 620
},
{
"epoch": 0.10218745010378413,
"grad_norm": 0.6113365486687661,
"learning_rate": 6.812134113890367e-06,
"loss": 0.1529,
"step": 640
},
{
"epoch": 0.10538080791952738,
"grad_norm": 0.5734345240030044,
"learning_rate": 7.025013304949441e-06,
"loss": 0.1569,
"step": 660
},
{
"epoch": 0.10857416573527064,
"grad_norm": 0.5661084962098065,
"learning_rate": 7.2378924960085166e-06,
"loss": 0.1516,
"step": 680
},
{
"epoch": 0.1117675235510139,
"grad_norm": 0.5846067384703525,
"learning_rate": 7.450771687067589e-06,
"loss": 0.15,
"step": 700
},
{
"epoch": 0.11496088136675714,
"grad_norm": 0.6374604468076045,
"learning_rate": 7.663650878126664e-06,
"loss": 0.1595,
"step": 720
},
{
"epoch": 0.1181542391825004,
"grad_norm": 0.6518179456002492,
"learning_rate": 7.876530069185738e-06,
"loss": 0.1534,
"step": 740
},
{
"epoch": 0.12134759699824366,
"grad_norm": 0.6418033051046158,
"learning_rate": 8.089409260244812e-06,
"loss": 0.1544,
"step": 760
},
{
"epoch": 0.1245409548139869,
"grad_norm": 0.5560485727454112,
"learning_rate": 8.302288451303886e-06,
"loss": 0.1519,
"step": 780
},
{
"epoch": 0.12773431262973017,
"grad_norm": 0.5218192022156695,
"learning_rate": 8.515167642362959e-06,
"loss": 0.1526,
"step": 800
},
{
"epoch": 0.1309276704454734,
"grad_norm": 0.5873485030073047,
"learning_rate": 8.728046833422033e-06,
"loss": 0.1477,
"step": 820
},
{
"epoch": 0.13412102826121666,
"grad_norm": 0.52521133386056,
"learning_rate": 8.940926024481108e-06,
"loss": 0.1462,
"step": 840
},
{
"epoch": 0.13731438607695992,
"grad_norm": 0.49868364100795765,
"learning_rate": 9.153805215540182e-06,
"loss": 0.1459,
"step": 860
},
{
"epoch": 0.14050774389270318,
"grad_norm": 0.46689167597503883,
"learning_rate": 9.366684406599256e-06,
"loss": 0.1522,
"step": 880
},
{
"epoch": 0.14370110170844644,
"grad_norm": 0.5355408722325465,
"learning_rate": 9.57956359765833e-06,
"loss": 0.1512,
"step": 900
},
{
"epoch": 0.1468944595241897,
"grad_norm": 0.46406202651388007,
"learning_rate": 9.792442788717403e-06,
"loss": 0.1503,
"step": 920
},
{
"epoch": 0.15008781733993293,
"grad_norm": 0.5480845186900385,
"learning_rate": 1.0005321979776476e-05,
"loss": 0.1474,
"step": 940
},
{
"epoch": 0.15328117515567619,
"grad_norm": 0.5541284715722858,
"learning_rate": 1.0218201170835552e-05,
"loss": 0.1469,
"step": 960
},
{
"epoch": 0.15647453297141944,
"grad_norm": 0.6186186072342443,
"learning_rate": 1.0431080361894626e-05,
"loss": 0.1495,
"step": 980
},
{
"epoch": 0.1596678907871627,
"grad_norm": 0.5921353515589192,
"learning_rate": 1.06439595529537e-05,
"loss": 0.1463,
"step": 1000
},
{
"epoch": 0.16286124860290596,
"grad_norm": 0.5433507613364311,
"learning_rate": 1.0856838744012775e-05,
"loss": 0.1472,
"step": 1020
},
{
"epoch": 0.16605460641864922,
"grad_norm": 0.5979565391588779,
"learning_rate": 1.106971793507185e-05,
"loss": 0.1506,
"step": 1040
},
{
"epoch": 0.16924796423439245,
"grad_norm": 0.48287899522708827,
"learning_rate": 1.128259712613092e-05,
"loss": 0.1486,
"step": 1060
},
{
"epoch": 0.1724413220501357,
"grad_norm": 0.5655815878985752,
"learning_rate": 1.1495476317189994e-05,
"loss": 0.1457,
"step": 1080
},
{
"epoch": 0.17563467986587897,
"grad_norm": 0.512570124033007,
"learning_rate": 1.170835550824907e-05,
"loss": 0.1461,
"step": 1100
},
{
"epoch": 0.17882803768162223,
"grad_norm": 0.46185823489920147,
"learning_rate": 1.1921234699308145e-05,
"loss": 0.144,
"step": 1120
},
{
"epoch": 0.18202139549736548,
"grad_norm": 0.44757029208547805,
"learning_rate": 1.2134113890367219e-05,
"loss": 0.1473,
"step": 1140
},
{
"epoch": 0.18521475331310874,
"grad_norm": 0.4396528603456714,
"learning_rate": 1.234699308142629e-05,
"loss": 0.142,
"step": 1160
},
{
"epoch": 0.188408111128852,
"grad_norm": 0.4916233270385887,
"learning_rate": 1.2559872272485364e-05,
"loss": 0.1459,
"step": 1180
},
{
"epoch": 0.19160146894459523,
"grad_norm": 0.49164646615860214,
"learning_rate": 1.2772751463544439e-05,
"loss": 0.1442,
"step": 1200
},
{
"epoch": 0.1947948267603385,
"grad_norm": 0.47947460832408273,
"learning_rate": 1.2985630654603515e-05,
"loss": 0.1491,
"step": 1220
},
{
"epoch": 0.19798818457608175,
"grad_norm": 0.5469424153214216,
"learning_rate": 1.3198509845662589e-05,
"loss": 0.1502,
"step": 1240
},
{
"epoch": 0.201181542391825,
"grad_norm": 0.4638562794822929,
"learning_rate": 1.3411389036721663e-05,
"loss": 0.1442,
"step": 1260
},
{
"epoch": 0.20437490020756827,
"grad_norm": 0.44344070089234844,
"learning_rate": 1.3624268227780734e-05,
"loss": 0.1488,
"step": 1280
},
{
"epoch": 0.20756825802331152,
"grad_norm": 0.47717361703152655,
"learning_rate": 1.3837147418839808e-05,
"loss": 0.1488,
"step": 1300
},
{
"epoch": 0.21076161583905476,
"grad_norm": 0.4703261039559369,
"learning_rate": 1.4050026609898883e-05,
"loss": 0.145,
"step": 1320
},
{
"epoch": 0.21395497365479801,
"grad_norm": 0.45451915177321617,
"learning_rate": 1.4262905800957957e-05,
"loss": 0.1514,
"step": 1340
},
{
"epoch": 0.21714833147054127,
"grad_norm": 0.4705441248128481,
"learning_rate": 1.4475784992017033e-05,
"loss": 0.1487,
"step": 1360
},
{
"epoch": 0.22034168928628453,
"grad_norm": 0.4369632636042999,
"learning_rate": 1.4688664183076104e-05,
"loss": 0.1459,
"step": 1380
},
{
"epoch": 0.2235350471020278,
"grad_norm": 0.42608257790275605,
"learning_rate": 1.4901543374135178e-05,
"loss": 0.1455,
"step": 1400
},
{
"epoch": 0.22672840491777105,
"grad_norm": 0.50356002082837,
"learning_rate": 1.5114422565194253e-05,
"loss": 0.1451,
"step": 1420
},
{
"epoch": 0.22992176273351428,
"grad_norm": 0.4561937833231143,
"learning_rate": 1.5327301756253327e-05,
"loss": 0.1477,
"step": 1440
},
{
"epoch": 0.23311512054925754,
"grad_norm": 0.40765594909388037,
"learning_rate": 1.55401809473124e-05,
"loss": 0.1425,
"step": 1460
},
{
"epoch": 0.2363084783650008,
"grad_norm": 0.487476872013174,
"learning_rate": 1.5753060138371476e-05,
"loss": 0.1412,
"step": 1480
},
{
"epoch": 0.23950183618074405,
"grad_norm": 0.4680001690322545,
"learning_rate": 1.596593932943055e-05,
"loss": 0.1443,
"step": 1500
},
{
"epoch": 0.2426951939964873,
"grad_norm": 0.41230879655966063,
"learning_rate": 1.6178818520489624e-05,
"loss": 0.1455,
"step": 1520
},
{
"epoch": 0.24588855181223057,
"grad_norm": 0.4315075010200903,
"learning_rate": 1.63916977115487e-05,
"loss": 0.1453,
"step": 1540
},
{
"epoch": 0.2490819096279738,
"grad_norm": 0.3880821656792041,
"learning_rate": 1.6604576902607773e-05,
"loss": 0.1367,
"step": 1560
},
{
"epoch": 0.2522752674437171,
"grad_norm": 0.4170991591966089,
"learning_rate": 1.6817456093666847e-05,
"loss": 0.1444,
"step": 1580
},
{
"epoch": 0.25546862525946035,
"grad_norm": 0.4352470317730404,
"learning_rate": 1.7030335284725918e-05,
"loss": 0.1462,
"step": 1600
},
{
"epoch": 0.25866198307520355,
"grad_norm": 0.41926976953754025,
"learning_rate": 1.7243214475784992e-05,
"loss": 0.1427,
"step": 1620
},
{
"epoch": 0.2618553408909468,
"grad_norm": 0.4067020140616968,
"learning_rate": 1.7456093666844067e-05,
"loss": 0.1435,
"step": 1640
},
{
"epoch": 0.26504869870669007,
"grad_norm": 0.4568169742722482,
"learning_rate": 1.766897285790314e-05,
"loss": 0.1425,
"step": 1660
},
{
"epoch": 0.2682420565224333,
"grad_norm": 0.4952678328950158,
"learning_rate": 1.7881852048962215e-05,
"loss": 0.1411,
"step": 1680
},
{
"epoch": 0.2714354143381766,
"grad_norm": 0.36574600840843885,
"learning_rate": 1.809473124002129e-05,
"loss": 0.1424,
"step": 1700
},
{
"epoch": 0.27462877215391984,
"grad_norm": 0.40710244186170225,
"learning_rate": 1.8307610431080364e-05,
"loss": 0.1435,
"step": 1720
},
{
"epoch": 0.2778221299696631,
"grad_norm": 0.41415797524036474,
"learning_rate": 1.852048962213944e-05,
"loss": 0.1443,
"step": 1740
},
{
"epoch": 0.28101548778540636,
"grad_norm": 0.38093938436737673,
"learning_rate": 1.8733368813198513e-05,
"loss": 0.1459,
"step": 1760
},
{
"epoch": 0.2842088456011496,
"grad_norm": 0.36699157301783514,
"learning_rate": 1.8946248004257587e-05,
"loss": 0.1503,
"step": 1780
},
{
"epoch": 0.2874022034168929,
"grad_norm": 0.4426133669933364,
"learning_rate": 1.915912719531666e-05,
"loss": 0.1458,
"step": 1800
},
{
"epoch": 0.29059556123263613,
"grad_norm": 0.37577866094305634,
"learning_rate": 1.9372006386375732e-05,
"loss": 0.1437,
"step": 1820
},
{
"epoch": 0.2937889190483794,
"grad_norm": 0.3853315977661372,
"learning_rate": 1.9584885577434807e-05,
"loss": 0.1424,
"step": 1840
},
{
"epoch": 0.29698227686412265,
"grad_norm": 0.39658703817733554,
"learning_rate": 1.979776476849388e-05,
"loss": 0.143,
"step": 1860
},
{
"epoch": 0.30017563467986585,
"grad_norm": 0.34168487028906286,
"learning_rate": 1.9999999827423154e-05,
"loss": 0.1472,
"step": 1880
},
{
"epoch": 0.3033689924956091,
"grad_norm": 0.428099056379712,
"learning_rate": 1.9999923893706236e-05,
"loss": 0.1424,
"step": 1900
},
{
"epoch": 0.30656235031135237,
"grad_norm": 0.47981664372403626,
"learning_rate": 1.9999709899719893e-05,
"loss": 0.1414,
"step": 1920
},
{
"epoch": 0.30975570812709563,
"grad_norm": 0.4495236935209742,
"learning_rate": 1.9999357848418547e-05,
"loss": 0.1432,
"step": 1940
},
{
"epoch": 0.3129490659428389,
"grad_norm": 0.4335827442743115,
"learning_rate": 1.999886774466267e-05,
"loss": 0.1449,
"step": 1960
},
{
"epoch": 0.31614242375858215,
"grad_norm": 0.3740214770732922,
"learning_rate": 1.9998239595218693e-05,
"loss": 0.1455,
"step": 1980
},
{
"epoch": 0.3193357815743254,
"grad_norm": 0.35431822792110484,
"learning_rate": 1.999747340875894e-05,
"loss": 0.14,
"step": 2000
},
{
"epoch": 0.32252913939006866,
"grad_norm": 0.37271856793106084,
"learning_rate": 1.9996569195861474e-05,
"loss": 0.1433,
"step": 2020
},
{
"epoch": 0.3257224972058119,
"grad_norm": 0.36904721824612496,
"learning_rate": 1.999552696900998e-05,
"loss": 0.1474,
"step": 2040
},
{
"epoch": 0.3289158550215552,
"grad_norm": 0.4329302625174645,
"learning_rate": 1.9994346742593577e-05,
"loss": 0.1409,
"step": 2060
},
{
"epoch": 0.33210921283729844,
"grad_norm": 0.4659341494260738,
"learning_rate": 1.999302853290663e-05,
"loss": 0.1453,
"step": 2080
},
{
"epoch": 0.3353025706530417,
"grad_norm": 0.40127911103988617,
"learning_rate": 1.9991572358148522e-05,
"loss": 0.1396,
"step": 2100
},
{
"epoch": 0.3384959284687849,
"grad_norm": 0.3087442313177786,
"learning_rate": 1.9989978238423383e-05,
"loss": 0.1474,
"step": 2120
},
{
"epoch": 0.34168928628452816,
"grad_norm": 0.37193584969289195,
"learning_rate": 1.9988246195739846e-05,
"loss": 0.1422,
"step": 2140
},
{
"epoch": 0.3448826441002714,
"grad_norm": 0.3814913494874711,
"learning_rate": 1.998637625401072e-05,
"loss": 0.1422,
"step": 2160
},
{
"epoch": 0.3480760019160147,
"grad_norm": 0.34898690836821306,
"learning_rate": 1.9984368439052668e-05,
"loss": 0.1396,
"step": 2180
},
{
"epoch": 0.35126935973175794,
"grad_norm": 0.3951747534549505,
"learning_rate": 1.9982222778585845e-05,
"loss": 0.1458,
"step": 2200
},
{
"epoch": 0.3544627175475012,
"grad_norm": 0.34562618895160807,
"learning_rate": 1.9979939302233524e-05,
"loss": 0.1402,
"step": 2220
},
{
"epoch": 0.35765607536324445,
"grad_norm": 0.388573729018997,
"learning_rate": 1.9977518041521683e-05,
"loss": 0.1402,
"step": 2240
},
{
"epoch": 0.3608494331789877,
"grad_norm": 0.37200937634013864,
"learning_rate": 1.9974959029878568e-05,
"loss": 0.1438,
"step": 2260
},
{
"epoch": 0.36404279099473097,
"grad_norm": 0.3965761915716373,
"learning_rate": 1.9972262302634228e-05,
"loss": 0.1401,
"step": 2280
},
{
"epoch": 0.3672361488104742,
"grad_norm": 0.3173011648096044,
"learning_rate": 1.996942789702004e-05,
"loss": 0.1392,
"step": 2300
},
{
"epoch": 0.3704295066262175,
"grad_norm": 0.3017733588197737,
"learning_rate": 1.996645585216818e-05,
"loss": 0.1424,
"step": 2320
},
{
"epoch": 0.37362286444196074,
"grad_norm": 0.3363017850364413,
"learning_rate": 1.9963346209111084e-05,
"loss": 0.1396,
"step": 2340
},
{
"epoch": 0.376816222257704,
"grad_norm": 0.36015352029461045,
"learning_rate": 1.9960099010780906e-05,
"loss": 0.1364,
"step": 2360
},
{
"epoch": 0.3800095800734472,
"grad_norm": 0.3944315476618534,
"learning_rate": 1.995671430200889e-05,
"loss": 0.1367,
"step": 2380
},
{
"epoch": 0.38320293788919046,
"grad_norm": 0.3511161782236592,
"learning_rate": 1.9953192129524774e-05,
"loss": 0.134,
"step": 2400
},
{
"epoch": 0.3863962957049337,
"grad_norm": 0.28417767585244963,
"learning_rate": 1.994953254195613e-05,
"loss": 0.1345,
"step": 2420
},
{
"epoch": 0.389589653520677,
"grad_norm": 0.33674300015583525,
"learning_rate": 1.9945735589827714e-05,
"loss": 0.1414,
"step": 2440
},
{
"epoch": 0.39278301133642024,
"grad_norm": 0.35738285989377994,
"learning_rate": 1.9941801325560748e-05,
"loss": 0.1379,
"step": 2460
},
{
"epoch": 0.3959763691521635,
"grad_norm": 0.3281943522856012,
"learning_rate": 1.9937729803472198e-05,
"loss": 0.1377,
"step": 2480
},
{
"epoch": 0.39916972696790676,
"grad_norm": 0.45986080956623454,
"learning_rate": 1.9933521079774043e-05,
"loss": 0.1375,
"step": 2500
},
{
"epoch": 0.40236308478365,
"grad_norm": 0.3345998745429948,
"learning_rate": 1.9929175212572473e-05,
"loss": 0.1376,
"step": 2520
},
{
"epoch": 0.4055564425993933,
"grad_norm": 0.3589364581224636,
"learning_rate": 1.9924692261867107e-05,
"loss": 0.136,
"step": 2540
},
{
"epoch": 0.40874980041513653,
"grad_norm": 0.3214892613419847,
"learning_rate": 1.9920072289550152e-05,
"loss": 0.1375,
"step": 2560
},
{
"epoch": 0.4119431582308798,
"grad_norm": 0.293317969518762,
"learning_rate": 1.9915315359405556e-05,
"loss": 0.1396,
"step": 2580
},
{
"epoch": 0.41513651604662305,
"grad_norm": 0.30656393646036983,
"learning_rate": 1.9910421537108124e-05,
"loss": 0.1417,
"step": 2600
},
{
"epoch": 0.41832987386236625,
"grad_norm": 0.3145662089610958,
"learning_rate": 1.990539089022262e-05,
"loss": 0.1361,
"step": 2620
},
{
"epoch": 0.4215232316781095,
"grad_norm": 0.3321283029025178,
"learning_rate": 1.9900223488202807e-05,
"loss": 0.1374,
"step": 2640
},
{
"epoch": 0.42471658949385277,
"grad_norm": 0.3121994972039942,
"learning_rate": 1.9894919402390527e-05,
"loss": 0.1369,
"step": 2660
},
{
"epoch": 0.42790994730959603,
"grad_norm": 0.3207245571484729,
"learning_rate": 1.9889478706014687e-05,
"loss": 0.1365,
"step": 2680
},
{
"epoch": 0.4311033051253393,
"grad_norm": 0.3214891191577016,
"learning_rate": 1.9883901474190258e-05,
"loss": 0.134,
"step": 2700
},
{
"epoch": 0.43429666294108255,
"grad_norm": 0.2948703624055813,
"learning_rate": 1.9878187783917246e-05,
"loss": 0.1358,
"step": 2720
},
{
"epoch": 0.4374900207568258,
"grad_norm": 0.2962758480800377,
"learning_rate": 1.9872337714079604e-05,
"loss": 0.1353,
"step": 2740
},
{
"epoch": 0.44068337857256906,
"grad_norm": 0.28750701401056433,
"learning_rate": 1.9866351345444172e-05,
"loss": 0.1397,
"step": 2760
},
{
"epoch": 0.4438767363883123,
"grad_norm": 0.33961478398235684,
"learning_rate": 1.9860228760659547e-05,
"loss": 0.1395,
"step": 2780
},
{
"epoch": 0.4470700942040556,
"grad_norm": 0.3327701106038422,
"learning_rate": 1.9853970044254942e-05,
"loss": 0.1362,
"step": 2800
},
{
"epoch": 0.45026345201979884,
"grad_norm": 0.34046480079798697,
"learning_rate": 1.9847575282639022e-05,
"loss": 0.1357,
"step": 2820
},
{
"epoch": 0.4534568098355421,
"grad_norm": 0.2591827835319709,
"learning_rate": 1.984104456409871e-05,
"loss": 0.1319,
"step": 2840
},
{
"epoch": 0.4566501676512853,
"grad_norm": 0.31099418106495114,
"learning_rate": 1.983437797879797e-05,
"loss": 0.134,
"step": 2860
},
{
"epoch": 0.45984352546702856,
"grad_norm": 0.34942376213984855,
"learning_rate": 1.9827575618776556e-05,
"loss": 0.1353,
"step": 2880
},
{
"epoch": 0.4630368832827718,
"grad_norm": 0.29857742338407706,
"learning_rate": 1.9820637577948746e-05,
"loss": 0.1336,
"step": 2900
},
{
"epoch": 0.4662302410985151,
"grad_norm": 0.2701149477986023,
"learning_rate": 1.9813563952102056e-05,
"loss": 0.1338,
"step": 2920
},
{
"epoch": 0.46942359891425833,
"grad_norm": 0.35582085328111446,
"learning_rate": 1.980635483889589e-05,
"loss": 0.1325,
"step": 2940
},
{
"epoch": 0.4726169567300016,
"grad_norm": 0.36536478089468427,
"learning_rate": 1.979901033786022e-05,
"loss": 0.138,
"step": 2960
},
{
"epoch": 0.47581031454574485,
"grad_norm": 0.34482414871566835,
"learning_rate": 1.9791530550394197e-05,
"loss": 0.14,
"step": 2980
},
{
"epoch": 0.4790036723614881,
"grad_norm": 0.313925122152452,
"learning_rate": 1.9783915579764755e-05,
"loss": 0.1349,
"step": 3000
},
{
"epoch": 0.48219703017723137,
"grad_norm": 0.33065001108381514,
"learning_rate": 1.9776165531105182e-05,
"loss": 0.1334,
"step": 3020
},
{
"epoch": 0.4853903879929746,
"grad_norm": 0.33106961743791363,
"learning_rate": 1.9768280511413676e-05,
"loss": 0.1346,
"step": 3040
},
{
"epoch": 0.4885837458087179,
"grad_norm": 0.3038455922499442,
"learning_rate": 1.9760260629551856e-05,
"loss": 0.13,
"step": 3060
},
{
"epoch": 0.49177710362446114,
"grad_norm": 0.32774568750571736,
"learning_rate": 1.975210599624327e-05,
"loss": 0.1317,
"step": 3080
},
{
"epoch": 0.4949704614402044,
"grad_norm": 0.27913297393743014,
"learning_rate": 1.9743816724071864e-05,
"loss": 0.1299,
"step": 3100
},
{
"epoch": 0.4981638192559476,
"grad_norm": 0.25535801906865635,
"learning_rate": 1.9735392927480425e-05,
"loss": 0.1341,
"step": 3120
},
{
"epoch": 0.5013571770716909,
"grad_norm": 0.3450201878469047,
"learning_rate": 1.9726834722768998e-05,
"loss": 0.1307,
"step": 3140
},
{
"epoch": 0.5045505348874342,
"grad_norm": 0.3355377854047922,
"learning_rate": 1.9718142228093286e-05,
"loss": 0.1373,
"step": 3160
},
{
"epoch": 0.5077438927031774,
"grad_norm": 0.29501763605746917,
"learning_rate": 1.9709315563463022e-05,
"loss": 0.1329,
"step": 3180
},
{
"epoch": 0.5109372505189207,
"grad_norm": 0.29498443847446687,
"learning_rate": 1.9700354850740305e-05,
"loss": 0.1302,
"step": 3200
},
{
"epoch": 0.514130608334664,
"grad_norm": 0.3374549804904556,
"learning_rate": 1.969126021363791e-05,
"loss": 0.1332,
"step": 3220
},
{
"epoch": 0.5173239661504071,
"grad_norm": 0.2937151476643792,
"learning_rate": 1.9682031777717602e-05,
"loss": 0.1289,
"step": 3240
},
{
"epoch": 0.5205173239661504,
"grad_norm": 0.34027338318157424,
"learning_rate": 1.9672669670388387e-05,
"loss": 0.1335,
"step": 3260
},
{
"epoch": 0.5237106817818936,
"grad_norm": 0.2958186800446919,
"learning_rate": 1.966317402090475e-05,
"loss": 0.1321,
"step": 3280
},
{
"epoch": 0.5269040395976369,
"grad_norm": 0.2937191900726174,
"learning_rate": 1.9653544960364886e-05,
"loss": 0.132,
"step": 3300
},
{
"epoch": 0.5300973974133801,
"grad_norm": 0.3133245335540435,
"learning_rate": 1.9643782621708875e-05,
"loss": 0.1311,
"step": 3320
},
{
"epoch": 0.5332907552291234,
"grad_norm": 0.29304130620982005,
"learning_rate": 1.963388713971685e-05,
"loss": 0.1355,
"step": 3340
},
{
"epoch": 0.5364841130448667,
"grad_norm": 0.31292116477262744,
"learning_rate": 1.962385865100715e-05,
"loss": 0.1351,
"step": 3360
},
{
"epoch": 0.5396774708606099,
"grad_norm": 0.26493353801679925,
"learning_rate": 1.9613697294034403e-05,
"loss": 0.1315,
"step": 3380
},
{
"epoch": 0.5428708286763532,
"grad_norm": 0.2630906644626646,
"learning_rate": 1.9603403209087655e-05,
"loss": 0.1312,
"step": 3400
},
{
"epoch": 0.5460641864920964,
"grad_norm": 0.26085033107366945,
"learning_rate": 1.9592976538288392e-05,
"loss": 0.1296,
"step": 3420
},
{
"epoch": 0.5492575443078397,
"grad_norm": 0.2940107915040809,
"learning_rate": 1.9582417425588615e-05,
"loss": 0.1305,
"step": 3440
},
{
"epoch": 0.5524509021235829,
"grad_norm": 0.2648782222390229,
"learning_rate": 1.9571726016768825e-05,
"loss": 0.1298,
"step": 3460
},
{
"epoch": 0.5556442599393262,
"grad_norm": 0.25767016449009617,
"learning_rate": 1.9560902459436027e-05,
"loss": 0.1287,
"step": 3480
},
{
"epoch": 0.5588376177550695,
"grad_norm": 0.304832191804209,
"learning_rate": 1.9549946903021676e-05,
"loss": 0.1335,
"step": 3500
},
{
"epoch": 0.5620309755708127,
"grad_norm": 0.2814622371172937,
"learning_rate": 1.953885949877963e-05,
"loss": 0.1287,
"step": 3520
},
{
"epoch": 0.565224333386556,
"grad_norm": 0.27565323140470793,
"learning_rate": 1.9527640399784066e-05,
"loss": 0.132,
"step": 3540
},
{
"epoch": 0.5684176912022992,
"grad_norm": 0.2874659718873625,
"learning_rate": 1.9516289760927337e-05,
"loss": 0.1306,
"step": 3560
},
{
"epoch": 0.5716110490180425,
"grad_norm": 0.24637256127265056,
"learning_rate": 1.9504807738917864e-05,
"loss": 0.1294,
"step": 3580
},
{
"epoch": 0.5748044068337858,
"grad_norm": 0.2683166797652062,
"learning_rate": 1.949319449227796e-05,
"loss": 0.1265,
"step": 3600
},
{
"epoch": 0.577997764649529,
"grad_norm": 0.2991655914571407,
"learning_rate": 1.9481450181341636e-05,
"loss": 0.1307,
"step": 3620
},
{
"epoch": 0.5811911224652723,
"grad_norm": 0.2629061135815468,
"learning_rate": 1.9469574968252405e-05,
"loss": 0.131,
"step": 3640
},
{
"epoch": 0.5843844802810155,
"grad_norm": 0.30352941453895776,
"learning_rate": 1.9457569016961025e-05,
"loss": 0.1315,
"step": 3660
},
{
"epoch": 0.5875778380967588,
"grad_norm": 0.32189790257315865,
"learning_rate": 1.9445432493223243e-05,
"loss": 0.1301,
"step": 3680
},
{
"epoch": 0.590771195912502,
"grad_norm": 0.2262924484205468,
"learning_rate": 1.943316556459751e-05,
"loss": 0.1265,
"step": 3700
},
{
"epoch": 0.5939645537282453,
"grad_norm": 0.2711892071402863,
"learning_rate": 1.9420768400442657e-05,
"loss": 0.1271,
"step": 3720
},
{
"epoch": 0.5971579115439885,
"grad_norm": 0.256185445894437,
"learning_rate": 1.9408241171915576e-05,
"loss": 0.1277,
"step": 3740
},
{
"epoch": 0.6003512693597317,
"grad_norm": 0.25593240031460607,
"learning_rate": 1.9395584051968833e-05,
"loss": 0.1287,
"step": 3760
},
{
"epoch": 0.603544627175475,
"grad_norm": 0.2979762688925845,
"learning_rate": 1.9382797215348303e-05,
"loss": 0.1287,
"step": 3780
},
{
"epoch": 0.6067379849912182,
"grad_norm": 0.2761523818504427,
"learning_rate": 1.936988083859073e-05,
"loss": 0.1289,
"step": 3800
},
{
"epoch": 0.6099313428069615,
"grad_norm": 0.31322754272354847,
"learning_rate": 1.935683510002133e-05,
"loss": 0.1289,
"step": 3820
},
{
"epoch": 0.6131247006227047,
"grad_norm": 0.32118979161692,
"learning_rate": 1.934366017975128e-05,
"loss": 0.1291,
"step": 3840
},
{
"epoch": 0.616318058438448,
"grad_norm": 0.3965221736956701,
"learning_rate": 1.9330356259675277e-05,
"loss": 0.1291,
"step": 3860
},
{
"epoch": 0.6195114162541913,
"grad_norm": 0.23124317796079472,
"learning_rate": 1.9316923523468988e-05,
"loss": 0.127,
"step": 3880
},
{
"epoch": 0.6227047740699345,
"grad_norm": 0.26107711518189003,
"learning_rate": 1.9303362156586554e-05,
"loss": 0.1267,
"step": 3900
},
{
"epoch": 0.6258981318856778,
"grad_norm": 0.23776916842759366,
"learning_rate": 1.9289672346257988e-05,
"loss": 0.1246,
"step": 3920
},
{
"epoch": 0.629091489701421,
"grad_norm": 0.26149208748799935,
"learning_rate": 1.9275854281486626e-05,
"loss": 0.1251,
"step": 3940
},
{
"epoch": 0.6322848475171643,
"grad_norm": 0.2488232391306922,
"learning_rate": 1.9261908153046485e-05,
"loss": 0.1268,
"step": 3960
},
{
"epoch": 0.6354782053329076,
"grad_norm": 0.2541408784103856,
"learning_rate": 1.924783415347966e-05,
"loss": 0.1271,
"step": 3980
},
{
"epoch": 0.6386715631486508,
"grad_norm": 0.2731460017360242,
"learning_rate": 1.9233632477093655e-05,
"loss": 0.1255,
"step": 4000
},
{
"epoch": 0.6418649209643941,
"grad_norm": 0.22543278383772555,
"learning_rate": 1.9219303319958675e-05,
"loss": 0.1252,
"step": 4020
},
{
"epoch": 0.6450582787801373,
"grad_norm": 0.3043429192344086,
"learning_rate": 1.9204846879904966e-05,
"loss": 0.1261,
"step": 4040
},
{
"epoch": 0.6482516365958806,
"grad_norm": 0.2787988968661325,
"learning_rate": 1.9190263356520044e-05,
"loss": 0.1285,
"step": 4060
},
{
"epoch": 0.6514449944116238,
"grad_norm": 0.28334459179072036,
"learning_rate": 1.9175552951145953e-05,
"loss": 0.1312,
"step": 4080
},
{
"epoch": 0.6546383522273671,
"grad_norm": 0.2699681672265312,
"learning_rate": 1.91607158668765e-05,
"loss": 0.128,
"step": 4100
},
{
"epoch": 0.6578317100431104,
"grad_norm": 0.2653884535783852,
"learning_rate": 1.9145752308554422e-05,
"loss": 0.1236,
"step": 4120
},
{
"epoch": 0.6610250678588536,
"grad_norm": 0.24370062543889062,
"learning_rate": 1.913066248276859e-05,
"loss": 0.1267,
"step": 4140
},
{
"epoch": 0.6642184256745969,
"grad_norm": 0.268136522123535,
"learning_rate": 1.911544659785112e-05,
"loss": 0.1251,
"step": 4160
},
{
"epoch": 0.6674117834903401,
"grad_norm": 0.2804716904650792,
"learning_rate": 1.9100104863874535e-05,
"loss": 0.1282,
"step": 4180
},
{
"epoch": 0.6706051413060834,
"grad_norm": 0.25256532175596885,
"learning_rate": 1.9084637492648834e-05,
"loss": 0.1291,
"step": 4200
},
{
"epoch": 0.6737984991218267,
"grad_norm": 0.20654144385500267,
"learning_rate": 1.9069044697718596e-05,
"loss": 0.1275,
"step": 4220
},
{
"epoch": 0.6769918569375698,
"grad_norm": 0.3170119063036349,
"learning_rate": 1.9053326694359996e-05,
"loss": 0.1252,
"step": 4240
},
{
"epoch": 0.6801852147533131,
"grad_norm": 0.2518310103396095,
"learning_rate": 1.9037483699577866e-05,
"loss": 0.1252,
"step": 4260
},
{
"epoch": 0.6833785725690563,
"grad_norm": 0.24576567016775977,
"learning_rate": 1.9021515932102687e-05,
"loss": 0.1262,
"step": 4280
},
{
"epoch": 0.6865719303847996,
"grad_norm": 0.2272326194356311,
"learning_rate": 1.9005423612387564e-05,
"loss": 0.1277,
"step": 4300
},
{
"epoch": 0.6897652882005428,
"grad_norm": 0.2241851322819629,
"learning_rate": 1.8989206962605183e-05,
"loss": 0.1254,
"step": 4320
},
{
"epoch": 0.6929586460162861,
"grad_norm": 0.28963794959769024,
"learning_rate": 1.8972866206644756e-05,
"loss": 0.1269,
"step": 4340
},
{
"epoch": 0.6961520038320294,
"grad_norm": 0.27244182001640865,
"learning_rate": 1.8956401570108918e-05,
"loss": 0.1268,
"step": 4360
},
{
"epoch": 0.6993453616477726,
"grad_norm": 0.23505587827589292,
"learning_rate": 1.893981328031061e-05,
"loss": 0.128,
"step": 4380
},
{
"epoch": 0.7025387194635159,
"grad_norm": 0.2636460746314892,
"learning_rate": 1.8923101566269956e-05,
"loss": 0.1268,
"step": 4400
},
{
"epoch": 0.7057320772792591,
"grad_norm": 0.287020211559549,
"learning_rate": 1.890626665871108e-05,
"loss": 0.1251,
"step": 4420
},
{
"epoch": 0.7089254350950024,
"grad_norm": 0.3666813610337495,
"learning_rate": 1.8889308790058944e-05,
"loss": 0.122,
"step": 4440
},
{
"epoch": 0.7121187929107456,
"grad_norm": 0.24200632888509,
"learning_rate": 1.887222819443612e-05,
"loss": 0.1234,
"step": 4460
},
{
"epoch": 0.7153121507264889,
"grad_norm": 0.3142721600257018,
"learning_rate": 1.8855025107659565e-05,
"loss": 0.1247,
"step": 4480
},
{
"epoch": 0.7185055085422322,
"grad_norm": 0.2542404530052441,
"learning_rate": 1.8837699767237363e-05,
"loss": 0.1267,
"step": 4500
},
{
"epoch": 0.7216988663579754,
"grad_norm": 0.2513575844512111,
"learning_rate": 1.882025241236546e-05,
"loss": 0.1254,
"step": 4520
},
{
"epoch": 0.7248922241737187,
"grad_norm": 0.24131168314941073,
"learning_rate": 1.880268328392433e-05,
"loss": 0.1251,
"step": 4540
},
{
"epoch": 0.7280855819894619,
"grad_norm": 0.22534176261187136,
"learning_rate": 1.878499262447569e-05,
"loss": 0.1241,
"step": 4560
},
{
"epoch": 0.7312789398052052,
"grad_norm": 0.2812964686320165,
"learning_rate": 1.8767180678259113e-05,
"loss": 0.1257,
"step": 4580
},
{
"epoch": 0.7344722976209485,
"grad_norm": 0.23889076217882216,
"learning_rate": 1.874924769118868e-05,
"loss": 0.1273,
"step": 4600
},
{
"epoch": 0.7376656554366917,
"grad_norm": 0.27177520658222915,
"learning_rate": 1.873119391084958e-05,
"loss": 0.125,
"step": 4620
},
{
"epoch": 0.740859013252435,
"grad_norm": 0.21614884950104765,
"learning_rate": 1.8713019586494687e-05,
"loss": 0.1244,
"step": 4640
},
{
"epoch": 0.7440523710681782,
"grad_norm": 0.26972504495310423,
"learning_rate": 1.869472496904112e-05,
"loss": 0.1278,
"step": 4660
},
{
"epoch": 0.7472457288839215,
"grad_norm": 0.26832330471480753,
"learning_rate": 1.867631031106679e-05,
"loss": 0.1217,
"step": 4680
},
{
"epoch": 0.7504390866996647,
"grad_norm": 0.2204440405656476,
"learning_rate": 1.8657775866806885e-05,
"loss": 0.1226,
"step": 4700
},
{
"epoch": 0.753632444515408,
"grad_norm": 0.25422716012201274,
"learning_rate": 1.86391218921504e-05,
"loss": 0.1264,
"step": 4720
},
{
"epoch": 0.7568258023311512,
"grad_norm": 0.2334817914407686,
"learning_rate": 1.8620348644636572e-05,
"loss": 0.123,
"step": 4740
},
{
"epoch": 0.7600191601468944,
"grad_norm": 0.24736892038169794,
"learning_rate": 1.8601456383451325e-05,
"loss": 0.1245,
"step": 4760
},
{
"epoch": 0.7632125179626377,
"grad_norm": 0.23376798290995154,
"learning_rate": 1.8582445369423716e-05,
"loss": 0.1259,
"step": 4780
},
{
"epoch": 0.7664058757783809,
"grad_norm": 0.24649571282123517,
"learning_rate": 1.8563315865022318e-05,
"loss": 0.125,
"step": 4800
},
{
"epoch": 0.7695992335941242,
"grad_norm": 0.24228210919101548,
"learning_rate": 1.8544068134351585e-05,
"loss": 0.1225,
"step": 4820
},
{
"epoch": 0.7727925914098674,
"grad_norm": 0.25429442512742784,
"learning_rate": 1.852470244314824e-05,
"loss": 0.1261,
"step": 4840
},
{
"epoch": 0.7759859492256107,
"grad_norm": 0.2309045224127907,
"learning_rate": 1.850521905877756e-05,
"loss": 0.1249,
"step": 4860
},
{
"epoch": 0.779179307041354,
"grad_norm": 0.25672801367790765,
"learning_rate": 1.848561825022973e-05,
"loss": 0.1234,
"step": 4880
},
{
"epoch": 0.7823726648570972,
"grad_norm": 0.2473205806486083,
"learning_rate": 1.8465900288116098e-05,
"loss": 0.1284,
"step": 4900
},
{
"epoch": 0.7855660226728405,
"grad_norm": 0.3035165882865362,
"learning_rate": 1.844606544466545e-05,
"loss": 0.1237,
"step": 4920
},
{
"epoch": 0.7887593804885837,
"grad_norm": 0.26837139940976074,
"learning_rate": 1.8426113993720255e-05,
"loss": 0.1252,
"step": 4940
},
{
"epoch": 0.791952738304327,
"grad_norm": 0.26373147498792854,
"learning_rate": 1.840604621073288e-05,
"loss": 0.1227,
"step": 4960
},
{
"epoch": 0.7951460961200703,
"grad_norm": 0.2581673321881109,
"learning_rate": 1.8385862372761784e-05,
"loss": 0.1273,
"step": 4980
},
{
"epoch": 0.7983394539358135,
"grad_norm": 0.26439250344256154,
"learning_rate": 1.83655627584677e-05,
"loss": 0.1218,
"step": 5000
},
{
"epoch": 0.8015328117515568,
"grad_norm": 0.2816537144327537,
"learning_rate": 1.8345147648109784e-05,
"loss": 0.1263,
"step": 5020
},
{
"epoch": 0.8047261695673,
"grad_norm": 0.2647977758183829,
"learning_rate": 1.8324617323541738e-05,
"loss": 0.1238,
"step": 5040
},
{
"epoch": 0.8079195273830433,
"grad_norm": 0.2593258946289472,
"learning_rate": 1.830397206820794e-05,
"loss": 0.1246,
"step": 5060
},
{
"epoch": 0.8111128851987865,
"grad_norm": 0.22990124735756534,
"learning_rate": 1.8283212167139513e-05,
"loss": 0.1226,
"step": 5080
},
{
"epoch": 0.8143062430145298,
"grad_norm": 0.27455958743278586,
"learning_rate": 1.8262337906950385e-05,
"loss": 0.1261,
"step": 5100
},
{
"epoch": 0.8174996008302731,
"grad_norm": 0.2608809929482469,
"learning_rate": 1.8241349575833352e-05,
"loss": 0.1226,
"step": 5120
},
{
"epoch": 0.8206929586460163,
"grad_norm": 0.2640419564306298,
"learning_rate": 1.822024746355608e-05,
"loss": 0.1381,
"step": 5140
},
{
"epoch": 0.8238863164617596,
"grad_norm": 0.29262015087553245,
"learning_rate": 1.8199031861457123e-05,
"loss": 0.1214,
"step": 5160
},
{
"epoch": 0.8270796742775028,
"grad_norm": 0.2319619995331439,
"learning_rate": 1.8177703062441882e-05,
"loss": 0.1232,
"step": 5180
},
{
"epoch": 0.8302730320932461,
"grad_norm": 0.26293647732336844,
"learning_rate": 1.815626136097857e-05,
"loss": 0.1233,
"step": 5200
},
{
"epoch": 0.8334663899089892,
"grad_norm": 0.24081197327765444,
"learning_rate": 1.8134707053094146e-05,
"loss": 0.1202,
"step": 5220
},
{
"epoch": 0.8366597477247325,
"grad_norm": 0.2736597574126886,
"learning_rate": 1.8113040436370236e-05,
"loss": 0.1189,
"step": 5240
},
{
"epoch": 0.8398531055404758,
"grad_norm": 0.22867160064093073,
"learning_rate": 1.809126180993901e-05,
"loss": 0.1227,
"step": 5260
},
{
"epoch": 0.843046463356219,
"grad_norm": 0.20241019354224027,
"learning_rate": 1.8069371474479055e-05,
"loss": 0.1207,
"step": 5280
},
{
"epoch": 0.8462398211719623,
"grad_norm": 0.23512641329119113,
"learning_rate": 1.8047369732211236e-05,
"loss": 0.1227,
"step": 5300
},
{
"epoch": 0.8494331789877055,
"grad_norm": 0.21831678736014193,
"learning_rate": 1.8025256886894512e-05,
"loss": 0.1263,
"step": 5320
},
{
"epoch": 0.8526265368034488,
"grad_norm": 0.22942586598137038,
"learning_rate": 1.800303324382174e-05,
"loss": 0.1226,
"step": 5340
},
{
"epoch": 0.8558198946191921,
"grad_norm": 0.22565630953315605,
"learning_rate": 1.7980699109815476e-05,
"loss": 0.1227,
"step": 5360
},
{
"epoch": 0.8590132524349353,
"grad_norm": 0.2110233708822902,
"learning_rate": 1.795825479322372e-05,
"loss": 0.123,
"step": 5380
},
{
"epoch": 0.8622066102506786,
"grad_norm": 0.2588140422630483,
"learning_rate": 1.793570060391567e-05,
"loss": 0.1233,
"step": 5400
},
{
"epoch": 0.8653999680664218,
"grad_norm": 0.20643049269214508,
"learning_rate": 1.791303685327744e-05,
"loss": 0.1216,
"step": 5420
},
{
"epoch": 0.8685933258821651,
"grad_norm": 0.2450716780518527,
"learning_rate": 1.7890263854207766e-05,
"loss": 0.1187,
"step": 5440
},
{
"epoch": 0.8717866836979083,
"grad_norm": 0.2626908104568787,
"learning_rate": 1.7867381921113672e-05,
"loss": 0.1318,
"step": 5460
},
{
"epoch": 0.8749800415136516,
"grad_norm": 0.21046084433838286,
"learning_rate": 1.784439136990616e-05,
"loss": 0.1216,
"step": 5480
},
{
"epoch": 0.8781733993293949,
"grad_norm": 0.22390590052286838,
"learning_rate": 1.7821292517995802e-05,
"loss": 0.1222,
"step": 5500
},
{
"epoch": 0.8813667571451381,
"grad_norm": 0.21545360667161884,
"learning_rate": 1.7798085684288408e-05,
"loss": 0.1245,
"step": 5520
},
{
"epoch": 0.8845601149608814,
"grad_norm": 0.23969169247272867,
"learning_rate": 1.777477118918058e-05,
"loss": 0.1199,
"step": 5540
},
{
"epoch": 0.8877534727766246,
"grad_norm": 0.25616719983123853,
"learning_rate": 1.7751349354555315e-05,
"loss": 0.12,
"step": 5560
},
{
"epoch": 0.8909468305923679,
"grad_norm": 0.2327465548031593,
"learning_rate": 1.7727820503777563e-05,
"loss": 0.1188,
"step": 5580
},
{
"epoch": 0.8941401884081112,
"grad_norm": 0.2704312448776363,
"learning_rate": 1.770418496168973e-05,
"loss": 0.1266,
"step": 5600
},
{
"epoch": 0.8973335462238544,
"grad_norm": 0.280731488755357,
"learning_rate": 1.7680443054607247e-05,
"loss": 0.1186,
"step": 5620
},
{
"epoch": 0.9005269040395977,
"grad_norm": 0.2190704544630761,
"learning_rate": 1.7656595110314003e-05,
"loss": 0.1227,
"step": 5640
},
{
"epoch": 0.9037202618553409,
"grad_norm": 0.2676299073758256,
"learning_rate": 1.7632641458057874e-05,
"loss": 0.1166,
"step": 5660
},
{
"epoch": 0.9069136196710842,
"grad_norm": 0.2699663729747412,
"learning_rate": 1.7608582428546142e-05,
"loss": 0.1245,
"step": 5680
},
{
"epoch": 0.9101069774868275,
"grad_norm": 0.38105163760645616,
"learning_rate": 1.7584418353940943e-05,
"loss": 0.1218,
"step": 5700
},
{
"epoch": 0.9133003353025706,
"grad_norm": 0.23014658636555574,
"learning_rate": 1.756014956785468e-05,
"loss": 0.1181,
"step": 5720
},
{
"epoch": 0.9164936931183139,
"grad_norm": 0.24389786019248447,
"learning_rate": 1.7535776405345428e-05,
"loss": 0.1196,
"step": 5740
},
{
"epoch": 0.9196870509340571,
"grad_norm": 0.26113050468693977,
"learning_rate": 1.7511299202912275e-05,
"loss": 0.1202,
"step": 5760
},
{
"epoch": 0.9228804087498004,
"grad_norm": 0.2078740201372768,
"learning_rate": 1.7486718298490713e-05,
"loss": 0.124,
"step": 5780
},
{
"epoch": 0.9260737665655436,
"grad_norm": 0.3157327866928938,
"learning_rate": 1.7462034031447954e-05,
"loss": 0.1252,
"step": 5800
},
{
"epoch": 0.9292671243812869,
"grad_norm": 0.21114581099853116,
"learning_rate": 1.7437246742578246e-05,
"loss": 0.1204,
"step": 5820
},
{
"epoch": 0.9324604821970301,
"grad_norm": 0.2200062852329027,
"learning_rate": 1.7412356774098175e-05,
"loss": 0.1249,
"step": 5840
},
{
"epoch": 0.9356538400127734,
"grad_norm": 0.2739829354403811,
"learning_rate": 1.7387364469641928e-05,
"loss": 0.1207,
"step": 5860
},
{
"epoch": 0.9388471978285167,
"grad_norm": 0.22036300962797467,
"learning_rate": 1.736227017425656e-05,
"loss": 0.1182,
"step": 5880
},
{
"epoch": 0.9420405556442599,
"grad_norm": 0.2010246775840929,
"learning_rate": 1.7337074234397228e-05,
"loss": 0.1199,
"step": 5900
},
{
"epoch": 0.9452339134600032,
"grad_norm": 0.22961494443205888,
"learning_rate": 1.7311776997922404e-05,
"loss": 0.1207,
"step": 5920
},
{
"epoch": 0.9484272712757464,
"grad_norm": 0.26165957694875003,
"learning_rate": 1.7286378814089072e-05,
"loss": 0.1188,
"step": 5940
},
{
"epoch": 0.9516206290914897,
"grad_norm": 0.22131834255107544,
"learning_rate": 1.726088003354791e-05,
"loss": 0.1205,
"step": 5960
},
{
"epoch": 0.954813986907233,
"grad_norm": 0.2549539175287136,
"learning_rate": 1.7235281008338452e-05,
"loss": 0.1213,
"step": 5980
},
{
"epoch": 0.9580073447229762,
"grad_norm": 0.2427772520556814,
"learning_rate": 1.720958209188422e-05,
"loss": 0.1211,
"step": 6000
},
{
"epoch": 0.9612007025387195,
"grad_norm": 0.2442539895798861,
"learning_rate": 1.7183783638987845e-05,
"loss": 0.1193,
"step": 6020
},
{
"epoch": 0.9643940603544627,
"grad_norm": 0.23954523978335746,
"learning_rate": 1.7157886005826173e-05,
"loss": 0.1196,
"step": 6040
},
{
"epoch": 0.967587418170206,
"grad_norm": 0.20571373812832114,
"learning_rate": 1.7131889549945348e-05,
"loss": 0.1149,
"step": 6060
},
{
"epoch": 0.9707807759859493,
"grad_norm": 0.22749917178842363,
"learning_rate": 1.710579463025587e-05,
"loss": 0.1176,
"step": 6080
},
{
"epoch": 0.9739741338016925,
"grad_norm": 0.23012462875837292,
"learning_rate": 1.7079601607027643e-05,
"loss": 0.1186,
"step": 6100
},
{
"epoch": 0.9771674916174358,
"grad_norm": 0.20338632953694447,
"learning_rate": 1.7053310841885012e-05,
"loss": 0.1187,
"step": 6120
},
{
"epoch": 0.980360849433179,
"grad_norm": 0.23280208486194112,
"learning_rate": 1.7026922697801746e-05,
"loss": 0.1196,
"step": 6140
},
{
"epoch": 0.9835542072489223,
"grad_norm": 0.20786109950948006,
"learning_rate": 1.7000437539096046e-05,
"loss": 0.1202,
"step": 6160
},
{
"epoch": 0.9867475650646655,
"grad_norm": 0.21375986615043702,
"learning_rate": 1.6973855731425507e-05,
"loss": 0.1159,
"step": 6180
},
{
"epoch": 0.9899409228804088,
"grad_norm": 0.20748661803980806,
"learning_rate": 1.694717764178208e-05,
"loss": 0.1153,
"step": 6200
},
{
"epoch": 0.993134280696152,
"grad_norm": 0.22516009929996467,
"learning_rate": 1.692040363848699e-05,
"loss": 0.1204,
"step": 6220
},
{
"epoch": 0.9963276385118952,
"grad_norm": 0.2595564019615457,
"learning_rate": 1.6893534091185658e-05,
"loss": 0.1197,
"step": 6240
},
{
"epoch": 0.9995209963276385,
"grad_norm": 0.18297342882482412,
"learning_rate": 1.686656937084261e-05,
"loss": 0.1151,
"step": 6260
},
{
"epoch": 1.0027143541433818,
"grad_norm": 0.22852815920466457,
"learning_rate": 1.6839509849736326e-05,
"loss": 0.0949,
"step": 6280
},
{
"epoch": 1.005907711959125,
"grad_norm": 0.19728357385077158,
"learning_rate": 1.6812355901454132e-05,
"loss": 0.0872,
"step": 6300
},
{
"epoch": 1.0091010697748684,
"grad_norm": 0.2623149708691154,
"learning_rate": 1.678510790088702e-05,
"loss": 0.0887,
"step": 6320
},
{
"epoch": 1.0122944275906116,
"grad_norm": 0.18893451371595926,
"learning_rate": 1.6757766224224483e-05,
"loss": 0.0919,
"step": 6340
},
{
"epoch": 1.0154877854063549,
"grad_norm": 0.21837196710349846,
"learning_rate": 1.673033124894932e-05,
"loss": 0.0871,
"step": 6360
},
{
"epoch": 1.0186811432220981,
"grad_norm": 0.19258941847945746,
"learning_rate": 1.670280335383242e-05,
"loss": 0.0885,
"step": 6380
},
{
"epoch": 1.0218745010378414,
"grad_norm": 0.19005062378076065,
"learning_rate": 1.667518291892754e-05,
"loss": 0.0893,
"step": 6400
},
{
"epoch": 1.0250678588535846,
"grad_norm": 0.20663392660314553,
"learning_rate": 1.6647470325566045e-05,
"loss": 0.0891,
"step": 6420
},
{
"epoch": 1.028261216669328,
"grad_norm": 0.22234403999553295,
"learning_rate": 1.6619665956351664e-05,
"loss": 0.0881,
"step": 6440
},
{
"epoch": 1.0314545744850712,
"grad_norm": 0.2218548835051233,
"learning_rate": 1.6591770195155185e-05,
"loss": 0.0891,
"step": 6460
},
{
"epoch": 1.0346479323008142,
"grad_norm": 0.19448202424429442,
"learning_rate": 1.6563783427109173e-05,
"loss": 0.0882,
"step": 6480
},
{
"epoch": 1.0378412901165575,
"grad_norm": 0.2042849289860482,
"learning_rate": 1.6535706038602637e-05,
"loss": 0.0878,
"step": 6500
},
{
"epoch": 1.0410346479323007,
"grad_norm": 0.2512755796704539,
"learning_rate": 1.6507538417275716e-05,
"loss": 0.0875,
"step": 6520
},
{
"epoch": 1.044228005748044,
"grad_norm": 0.2131890646498463,
"learning_rate": 1.6479280952014304e-05,
"loss": 0.0898,
"step": 6540
},
{
"epoch": 1.0474213635637872,
"grad_norm": 0.21427122055121073,
"learning_rate": 1.6450934032944698e-05,
"loss": 0.088,
"step": 6560
},
{
"epoch": 1.0506147213795305,
"grad_norm": 0.2110500487102777,
"learning_rate": 1.64224980514282e-05,
"loss": 0.0877,
"step": 6580
},
{
"epoch": 1.0538080791952738,
"grad_norm": 0.21674633072630997,
"learning_rate": 1.6393973400055737e-05,
"loss": 0.0919,
"step": 6600
},
{
"epoch": 1.057001437011017,
"grad_norm": 0.20250013575431305,
"learning_rate": 1.63653604726424e-05,
"loss": 0.0878,
"step": 6620
},
{
"epoch": 1.0601947948267603,
"grad_norm": 0.22853386096908568,
"learning_rate": 1.6336659664222048e-05,
"loss": 0.0865,
"step": 6640
},
{
"epoch": 1.0633881526425035,
"grad_norm": 0.23371366704528887,
"learning_rate": 1.630787137104183e-05,
"loss": 0.0917,
"step": 6660
},
{
"epoch": 1.0665815104582468,
"grad_norm": 0.2520515744099512,
"learning_rate": 1.6278995990556725e-05,
"loss": 0.0885,
"step": 6680
},
{
"epoch": 1.06977486827399,
"grad_norm": 0.226518466734716,
"learning_rate": 1.6250033921424038e-05,
"loss": 0.089,
"step": 6700
},
{
"epoch": 1.0729682260897333,
"grad_norm": 0.19588721298026593,
"learning_rate": 1.6220985563497933e-05,
"loss": 0.0893,
"step": 6720
},
{
"epoch": 1.0761615839054766,
"grad_norm": 0.20545809450126928,
"learning_rate": 1.6191851317823864e-05,
"loss": 0.0878,
"step": 6740
},
{
"epoch": 1.0793549417212198,
"grad_norm": 0.19233602078710613,
"learning_rate": 1.6162631586633076e-05,
"loss": 0.0866,
"step": 6760
},
{
"epoch": 1.082548299536963,
"grad_norm": 0.16678814329219444,
"learning_rate": 1.6133326773337033e-05,
"loss": 0.0871,
"step": 6780
},
{
"epoch": 1.0857416573527063,
"grad_norm": 0.1872528998042832,
"learning_rate": 1.610393728252186e-05,
"loss": 0.0855,
"step": 6800
},
{
"epoch": 1.0889350151684496,
"grad_norm": 0.2125566089494784,
"learning_rate": 1.6074463519942747e-05,
"loss": 0.0868,
"step": 6820
},
{
"epoch": 1.0921283729841929,
"grad_norm": 0.2174911829451179,
"learning_rate": 1.604490589251835e-05,
"loss": 0.0883,
"step": 6840
},
{
"epoch": 1.095321730799936,
"grad_norm": 0.18461972367391402,
"learning_rate": 1.6015264808325172e-05,
"loss": 0.0866,
"step": 6860
},
{
"epoch": 1.0985150886156794,
"grad_norm": 0.21622527426814506,
"learning_rate": 1.5985540676591938e-05,
"loss": 0.0863,
"step": 6880
},
{
"epoch": 1.1017084464314226,
"grad_norm": 0.22055823564651658,
"learning_rate": 1.5955733907693938e-05,
"loss": 0.0864,
"step": 6900
},
{
"epoch": 1.1049018042471659,
"grad_norm": 0.21748955927958816,
"learning_rate": 1.592584491314735e-05,
"loss": 0.0914,
"step": 6920
},
{
"epoch": 1.1080951620629091,
"grad_norm": 0.19288286925997916,
"learning_rate": 1.589587410560359e-05,
"loss": 0.0886,
"step": 6940
},
{
"epoch": 1.1112885198786524,
"grad_norm": 0.22073550271753697,
"learning_rate": 1.586582189884357e-05,
"loss": 0.0874,
"step": 6960
},
{
"epoch": 1.1144818776943957,
"grad_norm": 0.19094293529375386,
"learning_rate": 1.5835688707772035e-05,
"loss": 0.0855,
"step": 6980
},
{
"epoch": 1.117675235510139,
"grad_norm": 0.21947645518408387,
"learning_rate": 1.5805474948411792e-05,
"loss": 0.0891,
"step": 7000
},
{
"epoch": 1.1208685933258822,
"grad_norm": 0.19228306320542188,
"learning_rate": 1.5775181037897995e-05,
"loss": 0.0864,
"step": 7020
},
{
"epoch": 1.1240619511416254,
"grad_norm": 0.2416878479220072,
"learning_rate": 1.5744807394472372e-05,
"loss": 0.0892,
"step": 7040
},
{
"epoch": 1.1272553089573687,
"grad_norm": 0.2763423491442259,
"learning_rate": 1.5714354437477454e-05,
"loss": 0.0903,
"step": 7060
},
{
"epoch": 1.130448666773112,
"grad_norm": 9.009817853561485,
"learning_rate": 1.568382258735078e-05,
"loss": 0.0896,
"step": 7080
},
{
"epoch": 1.1336420245888552,
"grad_norm": 0.21069452452749907,
"learning_rate": 1.5653212265619114e-05,
"loss": 0.0908,
"step": 7100
},
{
"epoch": 1.1368353824045985,
"grad_norm": 0.20407807891775565,
"learning_rate": 1.5622523894892587e-05,
"loss": 0.0908,
"step": 7120
},
{
"epoch": 1.1400287402203417,
"grad_norm": 0.2619102068507488,
"learning_rate": 1.5591757898858907e-05,
"loss": 0.0872,
"step": 7140
},
{
"epoch": 1.143222098036085,
"grad_norm": 0.20634106575751654,
"learning_rate": 1.556091470227747e-05,
"loss": 0.0875,
"step": 7160
},
{
"epoch": 1.1464154558518282,
"grad_norm": 0.23775033570197862,
"learning_rate": 1.5529994730973522e-05,
"loss": 0.0868,
"step": 7180
},
{
"epoch": 1.1496088136675715,
"grad_norm": 0.20245603598906314,
"learning_rate": 1.549899841183227e-05,
"loss": 0.0868,
"step": 7200
},
{
"epoch": 1.1528021714833148,
"grad_norm": 0.19815804657454472,
"learning_rate": 1.546792617279299e-05,
"loss": 0.0899,
"step": 7220
},
{
"epoch": 1.155995529299058,
"grad_norm": 0.18751806743751373,
"learning_rate": 1.5436778442843107e-05,
"loss": 0.0884,
"step": 7240
},
{
"epoch": 1.1591888871148013,
"grad_norm": 0.22312780655020503,
"learning_rate": 1.5405555652012302e-05,
"loss": 0.0895,
"step": 7260
},
{
"epoch": 1.1623822449305445,
"grad_norm": 0.1924743563793643,
"learning_rate": 1.5374258231366546e-05,
"loss": 0.0881,
"step": 7280
},
{
"epoch": 1.1655756027462878,
"grad_norm": 0.20844406290416265,
"learning_rate": 1.5342886613002155e-05,
"loss": 0.0867,
"step": 7300
},
{
"epoch": 1.168768960562031,
"grad_norm": 0.1761650680293785,
"learning_rate": 1.531144123003984e-05,
"loss": 0.087,
"step": 7320
},
{
"epoch": 1.1719623183777743,
"grad_norm": 0.1914806702266616,
"learning_rate": 1.5279922516618702e-05,
"loss": 0.0866,
"step": 7340
},
{
"epoch": 1.1751556761935176,
"grad_norm": 0.2112719185689836,
"learning_rate": 1.5248330907890272e-05,
"loss": 0.0867,
"step": 7360
},
{
"epoch": 1.1783490340092608,
"grad_norm": 0.20744289591360074,
"learning_rate": 1.5216666840012455e-05,
"loss": 0.0848,
"step": 7380
},
{
"epoch": 1.1815423918250039,
"grad_norm": 0.21602516707177483,
"learning_rate": 1.5184930750143565e-05,
"loss": 0.0889,
"step": 7400
},
{
"epoch": 1.1847357496407471,
"grad_norm": 0.1942180064010259,
"learning_rate": 1.515312307643624e-05,
"loss": 0.0871,
"step": 7420
},
{
"epoch": 1.1879291074564904,
"grad_norm": 0.1809045891368503,
"learning_rate": 1.5121244258031427e-05,
"loss": 0.0887,
"step": 7440
},
{
"epoch": 1.1911224652722336,
"grad_norm": 0.21509016663666897,
"learning_rate": 1.50892947350523e-05,
"loss": 0.0875,
"step": 7460
},
{
"epoch": 1.194315823087977,
"grad_norm": 0.22222425875493532,
"learning_rate": 1.5057274948598192e-05,
"loss": 0.0904,
"step": 7480
},
{
"epoch": 1.1975091809037202,
"grad_norm": 0.17436626344650585,
"learning_rate": 1.5025185340738499e-05,
"loss": 0.0869,
"step": 7500
},
{
"epoch": 1.2007025387194634,
"grad_norm": 0.2315956494531892,
"learning_rate": 1.4993026354506588e-05,
"loss": 0.0893,
"step": 7520
},
{
"epoch": 1.2038958965352067,
"grad_norm": 0.19438867498932094,
"learning_rate": 1.4960798433893664e-05,
"loss": 0.0898,
"step": 7540
},
{
"epoch": 1.20708925435095,
"grad_norm": 0.21507570120321423,
"learning_rate": 1.492850202384266e-05,
"loss": 0.0888,
"step": 7560
},
{
"epoch": 1.2102826121666932,
"grad_norm": 0.1756005064132717,
"learning_rate": 1.4896137570242068e-05,
"loss": 0.0886,
"step": 7580
},
{
"epoch": 1.2134759699824365,
"grad_norm": 0.21082827374254784,
"learning_rate": 1.486370551991981e-05,
"loss": 0.0877,
"step": 7600
},
{
"epoch": 1.2166693277981797,
"grad_norm": 0.25062287626591706,
"learning_rate": 1.483120632063706e-05,
"loss": 0.0889,
"step": 7620
},
{
"epoch": 1.219862685613923,
"grad_norm": 0.18123970615998264,
"learning_rate": 1.4798640421082047e-05,
"loss": 0.0886,
"step": 7640
},
{
"epoch": 1.2230560434296662,
"grad_norm": 0.21468260494577018,
"learning_rate": 1.4766008270863883e-05,
"loss": 0.0906,
"step": 7660
},
{
"epoch": 1.2262494012454095,
"grad_norm": 0.18876901647341507,
"learning_rate": 1.4733310320506343e-05,
"loss": 0.0882,
"step": 7680
},
{
"epoch": 1.2294427590611527,
"grad_norm": 0.19790235853542382,
"learning_rate": 1.4700547021441642e-05,
"loss": 0.0877,
"step": 7700
},
{
"epoch": 1.232636116876896,
"grad_norm": 0.18688689214473558,
"learning_rate": 1.4667718826004214e-05,
"loss": 0.0882,
"step": 7720
},
{
"epoch": 1.2358294746926393,
"grad_norm": 0.1951758945258833,
"learning_rate": 1.463482618742446e-05,
"loss": 0.0869,
"step": 7740
},
{
"epoch": 1.2390228325083825,
"grad_norm": 0.19995389074426362,
"learning_rate": 1.4601869559822488e-05,
"loss": 0.0872,
"step": 7760
},
{
"epoch": 1.2422161903241258,
"grad_norm": 0.2218492641305999,
"learning_rate": 1.4568849398201855e-05,
"loss": 0.0883,
"step": 7780
},
{
"epoch": 1.245409548139869,
"grad_norm": 0.18443852015389814,
"learning_rate": 1.4535766158443265e-05,
"loss": 0.087,
"step": 7800
},
{
"epoch": 1.2486029059556123,
"grad_norm": 0.19503753956864983,
"learning_rate": 1.45026202972983e-05,
"loss": 0.0885,
"step": 7820
},
{
"epoch": 1.2517962637713556,
"grad_norm": 0.19853902671151866,
"learning_rate": 1.446941227238309e-05,
"loss": 0.0861,
"step": 7840
},
{
"epoch": 1.2549896215870988,
"grad_norm": 0.21865153532249126,
"learning_rate": 1.4436142542172009e-05,
"loss": 0.0886,
"step": 7860
},
{
"epoch": 1.258182979402842,
"grad_norm": 0.20818634190936489,
"learning_rate": 1.4402811565991353e-05,
"loss": 0.0889,
"step": 7880
},
{
"epoch": 1.2613763372185853,
"grad_norm": 0.23080624800369903,
"learning_rate": 1.436941980401297e-05,
"loss": 0.0858,
"step": 7900
},
{
"epoch": 1.2645696950343286,
"grad_norm": 0.19862256058128666,
"learning_rate": 1.4335967717247941e-05,
"loss": 0.0865,
"step": 7920
},
{
"epoch": 1.2677630528500718,
"grad_norm": 0.18954472715597112,
"learning_rate": 1.4302455767540189e-05,
"loss": 0.0886,
"step": 7940
},
{
"epoch": 1.270956410665815,
"grad_norm": 0.18922957380652522,
"learning_rate": 1.4268884417560119e-05,
"loss": 0.0881,
"step": 7960
},
{
"epoch": 1.2741497684815584,
"grad_norm": 0.23661467243107595,
"learning_rate": 1.4235254130798213e-05,
"loss": 0.0884,
"step": 7980
},
{
"epoch": 1.2773431262973016,
"grad_norm": 0.21028360452170922,
"learning_rate": 1.4201565371558657e-05,
"loss": 0.0858,
"step": 8000
},
{
"epoch": 1.2805364841130449,
"grad_norm": 0.1857031394163611,
"learning_rate": 1.4167818604952906e-05,
"loss": 0.0865,
"step": 8020
},
{
"epoch": 1.2837298419287881,
"grad_norm": 0.227255800263239,
"learning_rate": 1.4134014296893275e-05,
"loss": 0.0884,
"step": 8040
},
{
"epoch": 1.2869231997445314,
"grad_norm": 0.26940362233973403,
"learning_rate": 1.4100152914086504e-05,
"loss": 0.0845,
"step": 8060
},
{
"epoch": 1.2901165575602747,
"grad_norm": 0.22762705633128913,
"learning_rate": 1.4066234924027318e-05,
"loss": 0.0863,
"step": 8080
},
{
"epoch": 1.293309915376018,
"grad_norm": 0.24522046661200322,
"learning_rate": 1.4032260794991956e-05,
"loss": 0.0854,
"step": 8100
},
{
"epoch": 1.2965032731917612,
"grad_norm": 0.17298541823238414,
"learning_rate": 1.3998230996031736e-05,
"loss": 0.0884,
"step": 8120
},
{
"epoch": 1.2996966310075044,
"grad_norm": 0.21973030306429478,
"learning_rate": 1.3964145996966555e-05,
"loss": 0.0879,
"step": 8140
},
{
"epoch": 1.3028899888232477,
"grad_norm": 0.18077115808310013,
"learning_rate": 1.3930006268378407e-05,
"loss": 0.089,
"step": 8160
},
{
"epoch": 1.306083346638991,
"grad_norm": 0.18437205616695954,
"learning_rate": 1.3895812281604895e-05,
"loss": 0.0887,
"step": 8180
},
{
"epoch": 1.3092767044547342,
"grad_norm": 0.22324698589088907,
"learning_rate": 1.386156450873271e-05,
"loss": 0.1099,
"step": 8200
},
{
"epoch": 1.3124700622704775,
"grad_norm": 0.1866174959700542,
"learning_rate": 1.382726342259113e-05,
"loss": 0.0899,
"step": 8220
},
{
"epoch": 1.3156634200862207,
"grad_norm": 0.22011208651394024,
"learning_rate": 1.3792909496745475e-05,
"loss": 0.0869,
"step": 8240
},
{
"epoch": 1.318856777901964,
"grad_norm": 0.21878645198323823,
"learning_rate": 1.3758503205490583e-05,
"loss": 0.0859,
"step": 8260
},
{
"epoch": 1.3220501357177072,
"grad_norm": 0.1869477105143079,
"learning_rate": 1.3724045023844253e-05,
"loss": 0.0898,
"step": 8280
},
{
"epoch": 1.3252434935334505,
"grad_norm": 0.21199782150015953,
"learning_rate": 1.3689535427540687e-05,
"loss": 0.0861,
"step": 8300
},
{
"epoch": 1.3284368513491938,
"grad_norm": 0.18518093738165986,
"learning_rate": 1.3654974893023934e-05,
"loss": 0.0908,
"step": 8320
},
{
"epoch": 1.331630209164937,
"grad_norm": 0.18688147397601756,
"learning_rate": 1.3620363897441289e-05,
"loss": 0.0868,
"step": 8340
},
{
"epoch": 1.3348235669806803,
"grad_norm": 0.2067483479178462,
"learning_rate": 1.358570291863673e-05,
"loss": 0.0884,
"step": 8360
},
{
"epoch": 1.3380169247964235,
"grad_norm": 0.21329007217550264,
"learning_rate": 1.3550992435144304e-05,
"loss": 0.086,
"step": 8380
},
{
"epoch": 1.3412102826121668,
"grad_norm": 0.18073209909106028,
"learning_rate": 1.3516232926181529e-05,
"loss": 0.0868,
"step": 8400
},
{
"epoch": 1.34440364042791,
"grad_norm": 0.23014446893395585,
"learning_rate": 1.3481424871642778e-05,
"loss": 0.088,
"step": 8420
},
{
"epoch": 1.3475969982436533,
"grad_norm": 0.3028280433486724,
"learning_rate": 1.3446568752092643e-05,
"loss": 0.0848,
"step": 8440
},
{
"epoch": 1.3507903560593966,
"grad_norm": 0.20888924306544646,
"learning_rate": 1.3411665048759313e-05,
"loss": 0.0885,
"step": 8460
},
{
"epoch": 1.3539837138751398,
"grad_norm": 0.22324045695426223,
"learning_rate": 1.3376714243527925e-05,
"loss": 0.0901,
"step": 8480
},
{
"epoch": 1.357177071690883,
"grad_norm": 0.19474459814659545,
"learning_rate": 1.3341716818933912e-05,
"loss": 0.088,
"step": 8500
},
{
"epoch": 1.3603704295066263,
"grad_norm": 0.22602725655780065,
"learning_rate": 1.3306673258156334e-05,
"loss": 0.0867,
"step": 8520
},
{
"epoch": 1.3635637873223696,
"grad_norm": 0.23360209320607728,
"learning_rate": 1.3271584045011217e-05,
"loss": 0.0886,
"step": 8540
},
{
"epoch": 1.3667571451381129,
"grad_norm": 0.1873427703628018,
"learning_rate": 1.3236449663944875e-05,
"loss": 0.0866,
"step": 8560
},
{
"epoch": 1.369950502953856,
"grad_norm": 0.1985433902478951,
"learning_rate": 1.3201270600027208e-05,
"loss": 0.0876,
"step": 8580
},
{
"epoch": 1.3731438607695992,
"grad_norm": 0.18896595210872472,
"learning_rate": 1.3166047338945019e-05,
"loss": 0.0861,
"step": 8600
},
{
"epoch": 1.3763372185853424,
"grad_norm": 0.22957720239257226,
"learning_rate": 1.3130780366995297e-05,
"loss": 0.0853,
"step": 8620
},
{
"epoch": 1.3795305764010857,
"grad_norm": 0.1933824287848287,
"learning_rate": 1.3095470171078512e-05,
"loss": 0.0867,
"step": 8640
},
{
"epoch": 1.382723934216829,
"grad_norm": 0.22324019535172776,
"learning_rate": 1.3060117238691894e-05,
"loss": 0.085,
"step": 8660
},
{
"epoch": 1.3859172920325722,
"grad_norm": 0.2316030267030887,
"learning_rate": 1.3024722057922696e-05,
"loss": 0.0841,
"step": 8680
},
{
"epoch": 1.3891106498483154,
"grad_norm": 0.1973247421696361,
"learning_rate": 1.2989285117441452e-05,
"loss": 0.0878,
"step": 8700
},
{
"epoch": 1.3923040076640587,
"grad_norm": 0.2080002656842217,
"learning_rate": 1.2953806906495244e-05,
"loss": 0.0883,
"step": 8720
},
{
"epoch": 1.395497365479802,
"grad_norm": 0.18517780070734782,
"learning_rate": 1.2918287914900933e-05,
"loss": 0.0852,
"step": 8740
},
{
"epoch": 1.3986907232955452,
"grad_norm": 0.19697224463698385,
"learning_rate": 1.2882728633038406e-05,
"loss": 0.0855,
"step": 8760
},
{
"epoch": 1.4018840811112885,
"grad_norm": 0.19736259450538857,
"learning_rate": 1.2847129551843807e-05,
"loss": 0.0876,
"step": 8780
},
{
"epoch": 1.4050774389270317,
"grad_norm": 0.18942542996017805,
"learning_rate": 1.2811491162802744e-05,
"loss": 0.0884,
"step": 8800
},
{
"epoch": 1.408270796742775,
"grad_norm": 0.19254196108878727,
"learning_rate": 1.277581395794353e-05,
"loss": 0.088,
"step": 8820
},
{
"epoch": 1.4114641545585183,
"grad_norm": 0.24282865106690285,
"learning_rate": 1.2740098429830357e-05,
"loss": 0.0891,
"step": 8840
},
{
"epoch": 1.4146575123742615,
"grad_norm": 0.23984915406072307,
"learning_rate": 1.2704345071556525e-05,
"loss": 0.0886,
"step": 8860
},
{
"epoch": 1.4178508701900048,
"grad_norm": 0.2184606228075661,
"learning_rate": 1.2668554376737619e-05,
"loss": 0.087,
"step": 8880
},
{
"epoch": 1.421044228005748,
"grad_norm": 0.19798737334853378,
"learning_rate": 1.2632726839504693e-05,
"loss": 0.0875,
"step": 8900
},
{
"epoch": 1.4242375858214913,
"grad_norm": 0.23442081669151446,
"learning_rate": 1.2596862954497458e-05,
"loss": 0.0849,
"step": 8920
},
{
"epoch": 1.4274309436372346,
"grad_norm": 0.21286909537115775,
"learning_rate": 1.2560963216857447e-05,
"loss": 0.0845,
"step": 8940
},
{
"epoch": 1.4306243014529778,
"grad_norm": 0.19037684375350825,
"learning_rate": 1.2525028122221172e-05,
"loss": 0.0857,
"step": 8960
},
{
"epoch": 1.433817659268721,
"grad_norm": 0.18725372186680364,
"learning_rate": 1.24890581667133e-05,
"loss": 0.0875,
"step": 8980
},
{
"epoch": 1.4370110170844643,
"grad_norm": 0.20844623553872596,
"learning_rate": 1.2453053846939783e-05,
"loss": 0.0898,
"step": 9000
},
{
"epoch": 1.4402043749002076,
"grad_norm": 0.21140506066201004,
"learning_rate": 1.2417015659981007e-05,
"loss": 0.0883,
"step": 9020
},
{
"epoch": 1.4433977327159508,
"grad_norm": 0.2064339774677841,
"learning_rate": 1.2380944103384946e-05,
"loss": 0.0849,
"step": 9040
},
{
"epoch": 1.446591090531694,
"grad_norm": 0.17652746458033255,
"learning_rate": 1.2344839675160271e-05,
"loss": 0.0867,
"step": 9060
},
{
"epoch": 1.4497844483474374,
"grad_norm": 0.19101046403484023,
"learning_rate": 1.2308702873769486e-05,
"loss": 0.0865,
"step": 9080
},
{
"epoch": 1.4529778061631806,
"grad_norm": 0.19778410360898788,
"learning_rate": 1.227253419812204e-05,
"loss": 0.0876,
"step": 9100
},
{
"epoch": 1.4561711639789239,
"grad_norm": 0.1884773288145621,
"learning_rate": 1.2236334147567442e-05,
"loss": 0.0873,
"step": 9120
},
{
"epoch": 1.4593645217946671,
"grad_norm": 0.22741564087867433,
"learning_rate": 1.2200103221888365e-05,
"loss": 0.0842,
"step": 9140
},
{
"epoch": 1.4625578796104104,
"grad_norm": 0.19382271044214394,
"learning_rate": 1.2163841921293761e-05,
"loss": 0.0846,
"step": 9160
},
{
"epoch": 1.4657512374261537,
"grad_norm": 0.2225438873976966,
"learning_rate": 1.2127550746411932e-05,
"loss": 0.086,
"step": 9180
},
{
"epoch": 1.468944595241897,
"grad_norm": 0.20309796630710175,
"learning_rate": 1.2091230198283626e-05,
"loss": 0.0872,
"step": 9200
},
{
"epoch": 1.4721379530576402,
"grad_norm": 0.21309103603253518,
"learning_rate": 1.2054880778355122e-05,
"loss": 0.0856,
"step": 9220
},
{
"epoch": 1.4753313108733834,
"grad_norm": 0.20007800804028458,
"learning_rate": 1.201850298847132e-05,
"loss": 0.0843,
"step": 9240
},
{
"epoch": 1.4785246686891267,
"grad_norm": 0.22102981325152446,
"learning_rate": 1.198209733086878e-05,
"loss": 0.0865,
"step": 9260
},
{
"epoch": 1.48171802650487,
"grad_norm": 0.2509432577302147,
"learning_rate": 1.194566430816882e-05,
"loss": 0.0872,
"step": 9280
},
{
"epoch": 1.4849113843206132,
"grad_norm": 0.21078643240774367,
"learning_rate": 1.1909204423370564e-05,
"loss": 0.0856,
"step": 9300
},
{
"epoch": 1.4881047421363562,
"grad_norm": 0.22252302888210984,
"learning_rate": 1.1872718179843994e-05,
"loss": 0.0838,
"step": 9320
},
{
"epoch": 1.4912980999520995,
"grad_norm": 0.18987560853570382,
"learning_rate": 1.1836206081323003e-05,
"loss": 0.085,
"step": 9340
},
{
"epoch": 1.4944914577678428,
"grad_norm": 0.19549774907184778,
"learning_rate": 1.1799668631898445e-05,
"loss": 0.0877,
"step": 9360
},
{
"epoch": 1.497684815583586,
"grad_norm": 0.19228104758868642,
"learning_rate": 1.176310633601117e-05,
"loss": 0.0956,
"step": 9380
},
{
"epoch": 1.5008781733993293,
"grad_norm": 0.20819820045783494,
"learning_rate": 1.1726519698445056e-05,
"loss": 0.0867,
"step": 9400
},
{
"epoch": 1.5040715312150725,
"grad_norm": 0.20733767509582143,
"learning_rate": 1.1689909224320062e-05,
"loss": 0.0863,
"step": 9420
},
{
"epoch": 1.5072648890308158,
"grad_norm": 0.20925265086202188,
"learning_rate": 1.165327541908522e-05,
"loss": 0.0861,
"step": 9440
},
{
"epoch": 1.510458246846559,
"grad_norm": 0.18493554321077676,
"learning_rate": 1.1616618788511684e-05,
"loss": 0.0849,
"step": 9460
},
{
"epoch": 1.5136516046623023,
"grad_norm": 0.18797864341732143,
"learning_rate": 1.1579939838685731e-05,
"loss": 0.085,
"step": 9480
},
{
"epoch": 1.5168449624780456,
"grad_norm": 0.2242101441050116,
"learning_rate": 1.154323907600179e-05,
"loss": 0.0867,
"step": 9500
},
{
"epoch": 1.5200383202937888,
"grad_norm": 0.17084103768025352,
"learning_rate": 1.1506517007155432e-05,
"loss": 0.0838,
"step": 9520
},
{
"epoch": 1.523231678109532,
"grad_norm": 0.18934207218377755,
"learning_rate": 1.1469774139136389e-05,
"loss": 0.0857,
"step": 9540
},
{
"epoch": 1.5264250359252753,
"grad_norm": 0.2265706734312821,
"learning_rate": 1.1433010979221545e-05,
"loss": 0.0866,
"step": 9560
},
{
"epoch": 1.5296183937410186,
"grad_norm": 0.22302910930406783,
"learning_rate": 1.1396228034967942e-05,
"loss": 0.0841,
"step": 9580
},
{
"epoch": 1.5328117515567619,
"grad_norm": 0.20180278303765992,
"learning_rate": 1.1359425814205767e-05,
"loss": 0.0863,
"step": 9600
},
{
"epoch": 1.5360051093725051,
"grad_norm": 0.22800639526769467,
"learning_rate": 1.132260482503133e-05,
"loss": 0.0873,
"step": 9620
},
{
"epoch": 1.5391984671882484,
"grad_norm": 0.21277101714684102,
"learning_rate": 1.1285765575800076e-05,
"loss": 0.0874,
"step": 9640
},
{
"epoch": 1.5423918250039916,
"grad_norm": 0.18816604414097163,
"learning_rate": 1.1248908575119539e-05,
"loss": 0.0862,
"step": 9660
},
{
"epoch": 1.545585182819735,
"grad_norm": 0.20138026843291984,
"learning_rate": 1.1212034331842338e-05,
"loss": 0.0856,
"step": 9680
},
{
"epoch": 1.5487785406354782,
"grad_norm": 0.18862474943057217,
"learning_rate": 1.1175143355059144e-05,
"loss": 0.085,
"step": 9700
},
{
"epoch": 1.5519718984512214,
"grad_norm": 0.18561382698856643,
"learning_rate": 1.1138236154091656e-05,
"loss": 0.0852,
"step": 9720
},
{
"epoch": 1.5551652562669647,
"grad_norm": 0.18884644793283215,
"learning_rate": 1.1101313238485552e-05,
"loss": 0.0839,
"step": 9740
},
{
"epoch": 1.558358614082708,
"grad_norm": 0.17345642894126198,
"learning_rate": 1.1064375118003487e-05,
"loss": 0.0844,
"step": 9760
},
{
"epoch": 1.5615519718984512,
"grad_norm": 0.1991026940192444,
"learning_rate": 1.1027422302618032e-05,
"loss": 0.0846,
"step": 9780
},
{
"epoch": 1.5647453297141944,
"grad_norm": 0.22201127220587602,
"learning_rate": 1.099045530250463e-05,
"loss": 0.0823,
"step": 9800
},
{
"epoch": 1.5679386875299377,
"grad_norm": 0.23679974637337212,
"learning_rate": 1.0953474628034562e-05,
"loss": 0.087,
"step": 9820
},
{
"epoch": 1.571132045345681,
"grad_norm": 0.18945453405405135,
"learning_rate": 1.0916480789767907e-05,
"loss": 0.0861,
"step": 9840
},
{
"epoch": 1.5743254031614242,
"grad_norm": 0.18943349755537386,
"learning_rate": 1.0879474298446479e-05,
"loss": 0.0831,
"step": 9860
},
{
"epoch": 1.5775187609771675,
"grad_norm": 0.20905996320818215,
"learning_rate": 1.0842455664986782e-05,
"loss": 0.0858,
"step": 9880
},
{
"epoch": 1.5807121187929107,
"grad_norm": 0.1863924849638652,
"learning_rate": 1.0805425400472956e-05,
"loss": 0.0856,
"step": 9900
},
{
"epoch": 1.583905476608654,
"grad_norm": 0.2091352813903984,
"learning_rate": 1.076838401614972e-05,
"loss": 0.0857,
"step": 9920
},
{
"epoch": 1.5870988344243973,
"grad_norm": 0.24371561015345014,
"learning_rate": 1.0731332023415319e-05,
"loss": 0.089,
"step": 9940
},
{
"epoch": 1.5902921922401405,
"grad_norm": 0.2128926213672918,
"learning_rate": 1.0694269933814456e-05,
"loss": 0.084,
"step": 9960
},
{
"epoch": 1.5934855500558838,
"grad_norm": 0.21916373322291655,
"learning_rate": 1.0657198259031232e-05,
"loss": 0.0826,
"step": 9980
},
{
"epoch": 1.596678907871627,
"grad_norm": 0.1824350216961259,
"learning_rate": 1.0620117510882083e-05,
"loss": 0.0864,
"step": 10000
},
{
"epoch": 1.5998722656873703,
"grad_norm": 0.17881824213547054,
"learning_rate": 1.058302820130871e-05,
"loss": 0.0839,
"step": 10020
},
{
"epoch": 1.6030656235031135,
"grad_norm": 0.19729212364378013,
"learning_rate": 1.0545930842371022e-05,
"loss": 0.0854,
"step": 10040
},
{
"epoch": 1.6062589813188568,
"grad_norm": 0.2087951067289451,
"learning_rate": 1.0508825946240053e-05,
"loss": 0.085,
"step": 10060
},
{
"epoch": 1.6094523391346,
"grad_norm": 0.19718155636666373,
"learning_rate": 1.0471714025190897e-05,
"loss": 0.0856,
"step": 10080
},
{
"epoch": 1.6126456969503433,
"grad_norm": 0.20228614287118912,
"learning_rate": 1.0434595591595635e-05,
"loss": 0.0853,
"step": 10100
},
{
"epoch": 1.6158390547660866,
"grad_norm": 0.18693736149298812,
"learning_rate": 1.0397471157916263e-05,
"loss": 0.0849,
"step": 10120
},
{
"epoch": 1.6190324125818298,
"grad_norm": 0.191886465159157,
"learning_rate": 1.0360341236697611e-05,
"loss": 0.0838,
"step": 10140
},
{
"epoch": 1.622225770397573,
"grad_norm": 0.20503489329364863,
"learning_rate": 1.0323206340560275e-05,
"loss": 0.0856,
"step": 10160
},
{
"epoch": 1.6254191282133164,
"grad_norm": 0.24260576208421464,
"learning_rate": 1.028606698219353e-05,
"loss": 0.0865,
"step": 10180
},
{
"epoch": 1.6286124860290596,
"grad_norm": 0.22639324906871056,
"learning_rate": 1.0248923674348268e-05,
"loss": 0.0859,
"step": 10200
},
{
"epoch": 1.6318058438448029,
"grad_norm": 0.176153514574258,
"learning_rate": 1.0211776929829893e-05,
"loss": 0.0867,
"step": 10220
},
{
"epoch": 1.6349992016605461,
"grad_norm": 0.1877599319113198,
"learning_rate": 1.0174627261491268e-05,
"loss": 0.0829,
"step": 10240
},
{
"epoch": 1.6381925594762894,
"grad_norm": 0.19998890698860952,
"learning_rate": 1.0137475182225617e-05,
"loss": 0.0841,
"step": 10260
},
{
"epoch": 1.6413859172920326,
"grad_norm": 0.21610758072730218,
"learning_rate": 1.0100321204959449e-05,
"loss": 0.0841,
"step": 10280
},
{
"epoch": 1.644579275107776,
"grad_norm": 0.18193308572064754,
"learning_rate": 1.0063165842645484e-05,
"loss": 0.0849,
"step": 10300
},
{
"epoch": 1.6477726329235192,
"grad_norm": 0.20248453381225345,
"learning_rate": 1.0026009608255555e-05,
"loss": 0.0845,
"step": 10320
},
{
"epoch": 1.6509659907392624,
"grad_norm": 0.2320711692298343,
"learning_rate": 9.988853014773542e-06,
"loss": 0.0852,
"step": 10340
},
{
"epoch": 1.6541593485550057,
"grad_norm": 0.19584194025576318,
"learning_rate": 9.951696575188278e-06,
"loss": 0.085,
"step": 10360
},
{
"epoch": 1.657352706370749,
"grad_norm": 0.21553283351451755,
"learning_rate": 9.914540802486474e-06,
"loss": 0.0856,
"step": 10380
},
{
"epoch": 1.6605460641864922,
"grad_norm": 0.18489531692862257,
"learning_rate": 9.877386209645633e-06,
"loss": 0.0858,
"step": 10400
},
{
"epoch": 1.6637394220022355,
"grad_norm": 0.20763123012361048,
"learning_rate": 9.84023330962697e-06,
"loss": 0.0852,
"step": 10420
},
{
"epoch": 1.6669327798179787,
"grad_norm": 0.19596129486655178,
"learning_rate": 9.803082615368323e-06,
"loss": 0.0835,
"step": 10440
},
{
"epoch": 1.670126137633722,
"grad_norm": 0.18532596629268455,
"learning_rate": 9.765934639777087e-06,
"loss": 0.0841,
"step": 10460
},
{
"epoch": 1.6733194954494652,
"grad_norm": 0.17232132477688097,
"learning_rate": 9.728789895723109e-06,
"loss": 0.0835,
"step": 10480
},
{
"epoch": 1.6765128532652085,
"grad_norm": 0.19279722790694026,
"learning_rate": 9.691648896031642e-06,
"loss": 0.0877,
"step": 10500
},
{
"epoch": 1.6797062110809517,
"grad_norm": 0.20508136042678382,
"learning_rate": 9.65451215347622e-06,
"loss": 0.0849,
"step": 10520
},
{
"epoch": 1.682899568896695,
"grad_norm": 0.2030859465238483,
"learning_rate": 9.61738018077162e-06,
"loss": 0.0828,
"step": 10540
},
{
"epoch": 1.6860929267124383,
"grad_norm": 0.21547506058080174,
"learning_rate": 9.580253490566753e-06,
"loss": 0.0837,
"step": 10560
},
{
"epoch": 1.6892862845281815,
"grad_norm": 0.22700049094169877,
"learning_rate": 9.543132595437612e-06,
"loss": 0.0849,
"step": 10580
},
{
"epoch": 1.6924796423439248,
"grad_norm": 0.19256501341459278,
"learning_rate": 9.506018007880169e-06,
"loss": 0.0845,
"step": 10600
},
{
"epoch": 1.695673000159668,
"grad_norm": 0.20795374910309583,
"learning_rate": 9.468910240303324e-06,
"loss": 0.0819,
"step": 10620
},
{
"epoch": 1.6988663579754113,
"grad_norm": 0.19009887752439592,
"learning_rate": 9.431809805021815e-06,
"loss": 0.0816,
"step": 10640
},
{
"epoch": 1.7020597157911546,
"grad_norm": 0.19799389204125842,
"learning_rate": 9.394717214249147e-06,
"loss": 0.0851,
"step": 10660
},
{
"epoch": 1.7052530736068978,
"grad_norm": 0.2271148145004972,
"learning_rate": 9.357632980090528e-06,
"loss": 0.0852,
"step": 10680
},
{
"epoch": 1.7084464314226409,
"grad_norm": 0.2344519009086231,
"learning_rate": 9.320557614535787e-06,
"loss": 0.0831,
"step": 10700
},
{
"epoch": 1.7116397892383841,
"grad_norm": 0.23925944266837562,
"learning_rate": 9.283491629452315e-06,
"loss": 0.0853,
"step": 10720
},
{
"epoch": 1.7148331470541274,
"grad_norm": 0.20081263527645807,
"learning_rate": 9.246435536577999e-06,
"loss": 0.085,
"step": 10740
},
{
"epoch": 1.7180265048698706,
"grad_norm": 0.20700627503253236,
"learning_rate": 9.20938984751415e-06,
"loss": 0.0851,
"step": 10760
},
{
"epoch": 1.7212198626856139,
"grad_norm": 0.201866541369534,
"learning_rate": 9.172355073718439e-06,
"loss": 0.0842,
"step": 10780
},
{
"epoch": 1.7244132205013571,
"grad_norm": 0.20280888722283474,
"learning_rate": 9.135331726497843e-06,
"loss": 0.0822,
"step": 10800
},
{
"epoch": 1.7276065783171004,
"grad_norm": 0.19344393506408358,
"learning_rate": 9.09832031700158e-06,
"loss": 0.0828,
"step": 10820
},
{
"epoch": 1.7307999361328437,
"grad_norm": 0.16624847494447237,
"learning_rate": 9.06132135621406e-06,
"loss": 0.0829,
"step": 10840
},
{
"epoch": 1.733993293948587,
"grad_norm": 0.21724180904413368,
"learning_rate": 9.024335354947812e-06,
"loss": 0.0838,
"step": 10860
},
{
"epoch": 1.7371866517643302,
"grad_norm": 0.23846515088949718,
"learning_rate": 8.987362823836461e-06,
"loss": 0.0852,
"step": 10880
},
{
"epoch": 1.7403800095800734,
"grad_norm": 0.20925184991512286,
"learning_rate": 8.950404273327646e-06,
"loss": 0.0834,
"step": 10900
},
{
"epoch": 1.7435733673958167,
"grad_norm": 0.1716514985288543,
"learning_rate": 8.913460213675998e-06,
"loss": 0.0836,
"step": 10920
},
{
"epoch": 1.74676672521156,
"grad_norm": 0.20494704853492865,
"learning_rate": 8.876531154936084e-06,
"loss": 0.0817,
"step": 10940
},
{
"epoch": 1.7499600830273032,
"grad_norm": 0.24261768980108364,
"learning_rate": 8.839617606955355e-06,
"loss": 0.0842,
"step": 10960
},
{
"epoch": 1.7531534408430465,
"grad_norm": 0.2090462161591236,
"learning_rate": 8.802720079367136e-06,
"loss": 0.0828,
"step": 10980
},
{
"epoch": 1.7563467986587897,
"grad_norm": 0.18696550737565137,
"learning_rate": 8.765839081583564e-06,
"loss": 0.082,
"step": 11000
},
{
"epoch": 1.759540156474533,
"grad_norm": 0.19121809093030445,
"learning_rate": 8.72897512278856e-06,
"loss": 0.0848,
"step": 11020
},
{
"epoch": 1.7627335142902762,
"grad_norm": 0.2102957076954447,
"learning_rate": 8.692128711930805e-06,
"loss": 0.084,
"step": 11040
},
{
"epoch": 1.7659268721060195,
"grad_norm": 0.20622666368626175,
"learning_rate": 8.655300357716716e-06,
"loss": 0.0845,
"step": 11060
},
{
"epoch": 1.7691202299217628,
"grad_norm": 0.2091346262679367,
"learning_rate": 8.618490568603409e-06,
"loss": 0.0821,
"step": 11080
},
{
"epoch": 1.772313587737506,
"grad_norm": 0.18255707260911555,
"learning_rate": 8.581699852791696e-06,
"loss": 0.0824,
"step": 11100
},
{
"epoch": 1.7755069455532493,
"grad_norm": 0.2201418888200012,
"learning_rate": 8.54492871821905e-06,
"loss": 0.0836,
"step": 11120
},
{
"epoch": 1.7787003033689925,
"grad_norm": 0.1875915274082898,
"learning_rate": 8.508177672552617e-06,
"loss": 0.0842,
"step": 11140
},
{
"epoch": 1.7818936611847358,
"grad_norm": 0.19600313792607987,
"learning_rate": 8.471447223182179e-06,
"loss": 0.0836,
"step": 11160
},
{
"epoch": 1.785087019000479,
"grad_norm": 0.19719362419525954,
"learning_rate": 8.434737877213172e-06,
"loss": 0.0856,
"step": 11180
},
{
"epoch": 1.788280376816222,
"grad_norm": 0.17156639629201742,
"learning_rate": 8.398050141459674e-06,
"loss": 0.0819,
"step": 11200
},
{
"epoch": 1.7914737346319654,
"grad_norm": 0.2039792577946715,
"learning_rate": 8.361384522437402e-06,
"loss": 0.0827,
"step": 11220
},
{
"epoch": 1.7946670924477086,
"grad_norm": 0.19179744785660258,
"learning_rate": 8.324741526356738e-06,
"loss": 0.0826,
"step": 11240
},
{
"epoch": 1.7978604502634519,
"grad_norm": 0.18215189474588084,
"learning_rate": 8.288121659115727e-06,
"loss": 0.0819,
"step": 11260
},
{
"epoch": 1.8010538080791951,
"grad_norm": 0.1644377928850563,
"learning_rate": 8.251525426293084e-06,
"loss": 0.0827,
"step": 11280
},
{
"epoch": 1.8042471658949384,
"grad_norm": 0.21222246533392128,
"learning_rate": 8.21495333314123e-06,
"loss": 0.0843,
"step": 11300
},
{
"epoch": 1.8074405237106816,
"grad_norm": 0.25181863269369087,
"learning_rate": 8.178405884579317e-06,
"loss": 0.0842,
"step": 11320
},
{
"epoch": 1.810633881526425,
"grad_norm": 0.2109399815982731,
"learning_rate": 8.141883585186241e-06,
"loss": 0.0829,
"step": 11340
},
{
"epoch": 1.8138272393421682,
"grad_norm": 0.18073042845539122,
"learning_rate": 8.10538693919369e-06,
"loss": 0.0834,
"step": 11360
},
{
"epoch": 1.8170205971579114,
"grad_norm": 0.20526943895282074,
"learning_rate": 8.068916450479174e-06,
"loss": 0.081,
"step": 11380
},
{
"epoch": 1.8202139549736547,
"grad_norm": 0.19361555670993416,
"learning_rate": 8.03247262255908e-06,
"loss": 0.0836,
"step": 11400
},
{
"epoch": 1.823407312789398,
"grad_norm": 0.24389934893406925,
"learning_rate": 7.996055958581703e-06,
"loss": 0.0828,
"step": 11420
},
{
"epoch": 1.8266006706051412,
"grad_norm": 0.1877153126969613,
"learning_rate": 7.959666961320314e-06,
"loss": 0.0823,
"step": 11440
},
{
"epoch": 1.8297940284208845,
"grad_norm": 0.19815842442257633,
"learning_rate": 7.923306133166218e-06,
"loss": 0.0827,
"step": 11460
},
{
"epoch": 1.8329873862366277,
"grad_norm": 0.21678547999171613,
"learning_rate": 7.886973976121797e-06,
"loss": 0.0821,
"step": 11480
},
{
"epoch": 1.836180744052371,
"grad_norm": 0.21618607294885436,
"learning_rate": 7.850670991793621e-06,
"loss": 0.0847,
"step": 11500
},
{
"epoch": 1.8393741018681142,
"grad_norm": 0.1704593983368394,
"learning_rate": 7.81439768138548e-06,
"loss": 0.082,
"step": 11520
},
{
"epoch": 1.8425674596838575,
"grad_norm": 0.18606341720829214,
"learning_rate": 7.778154545691481e-06,
"loss": 0.0812,
"step": 11540
},
{
"epoch": 1.8457608174996007,
"grad_norm": 0.21208825422427718,
"learning_rate": 7.741942085089146e-06,
"loss": 0.083,
"step": 11560
},
{
"epoch": 1.848954175315344,
"grad_norm": 0.18782574055868467,
"learning_rate": 7.705760799532485e-06,
"loss": 0.0828,
"step": 11580
},
{
"epoch": 1.8521475331310873,
"grad_norm": 0.19574167645932028,
"learning_rate": 7.669611188545103e-06,
"loss": 0.083,
"step": 11600
},
{
"epoch": 1.8553408909468305,
"grad_norm": 0.2065298678199762,
"learning_rate": 7.6334937512133e-06,
"loss": 0.0825,
"step": 11620
},
{
"epoch": 1.8585342487625738,
"grad_norm": 0.1977503317300438,
"learning_rate": 7.597408986179184e-06,
"loss": 0.0806,
"step": 11640
},
{
"epoch": 1.861727606578317,
"grad_norm": 0.20586182397186595,
"learning_rate": 7.561357391633789e-06,
"loss": 0.0824,
"step": 11660
},
{
"epoch": 1.8649209643940603,
"grad_norm": 0.21998998145214102,
"learning_rate": 7.525339465310183e-06,
"loss": 0.0838,
"step": 11680
},
{
"epoch": 1.8681143222098036,
"grad_norm": 0.24487809053970366,
"learning_rate": 7.4893557044766145e-06,
"loss": 0.0821,
"step": 11700
},
{
"epoch": 1.8713076800255468,
"grad_norm": 0.18687218223534408,
"learning_rate": 7.453406605929637e-06,
"loss": 0.0806,
"step": 11720
},
{
"epoch": 1.87450103784129,
"grad_norm": 0.17318503959159254,
"learning_rate": 7.417492665987247e-06,
"loss": 0.0819,
"step": 11740
},
{
"epoch": 1.8776943956570333,
"grad_norm": 0.18945197729794094,
"learning_rate": 7.3816143804820454e-06,
"loss": 0.0835,
"step": 11760
},
{
"epoch": 1.8808877534727766,
"grad_norm": 0.20142501192350587,
"learning_rate": 7.345772244754377e-06,
"loss": 0.0844,
"step": 11780
},
{
"epoch": 1.8840811112885198,
"grad_norm": 0.20568732816869706,
"learning_rate": 7.309966753645496e-06,
"loss": 0.0801,
"step": 11800
},
{
"epoch": 1.887274469104263,
"grad_norm": 0.20182816399217324,
"learning_rate": 7.274198401490744e-06,
"loss": 0.0846,
"step": 11820
},
{
"epoch": 1.8904678269200064,
"grad_norm": 0.20018924573509358,
"learning_rate": 7.2384676821127135e-06,
"loss": 0.0798,
"step": 11840
},
{
"epoch": 1.8936611847357496,
"grad_norm": 0.28199792560782483,
"learning_rate": 7.202775088814429e-06,
"loss": 0.0815,
"step": 11860
},
{
"epoch": 1.8968545425514929,
"grad_norm": 0.22764478972933266,
"learning_rate": 7.1671211143725485e-06,
"loss": 0.0815,
"step": 11880
},
{
"epoch": 1.9000479003672361,
"grad_norm": 0.1981593984765646,
"learning_rate": 7.131506251030547e-06,
"loss": 0.0809,
"step": 11900
},
{
"epoch": 1.9032412581829794,
"grad_norm": 0.20992169378762218,
"learning_rate": 7.095930990491933e-06,
"loss": 0.0809,
"step": 11920
},
{
"epoch": 1.9064346159987227,
"grad_norm": 0.19005910859773092,
"learning_rate": 7.060395823913447e-06,
"loss": 0.0842,
"step": 11940
},
{
"epoch": 1.909627973814466,
"grad_norm": 0.19205175219083725,
"learning_rate": 7.024901241898292e-06,
"loss": 0.0819,
"step": 11960
},
{
"epoch": 1.9128213316302092,
"grad_norm": 0.20008872943717196,
"learning_rate": 6.9894477344893505e-06,
"loss": 0.0819,
"step": 11980
},
{
"epoch": 1.9160146894459524,
"grad_norm": 0.1773872749793287,
"learning_rate": 6.9540357911624336e-06,
"loss": 0.0823,
"step": 12000
},
{
"epoch": 1.9192080472616957,
"grad_norm": 0.19417086960624413,
"learning_rate": 6.918665900819497e-06,
"loss": 0.0791,
"step": 12020
},
{
"epoch": 1.922401405077439,
"grad_norm": 0.1814650138072353,
"learning_rate": 6.883338551781923e-06,
"loss": 0.0811,
"step": 12040
},
{
"epoch": 1.9255947628931822,
"grad_norm": 0.1702657944804681,
"learning_rate": 6.8480542317837505e-06,
"loss": 0.0803,
"step": 12060
},
{
"epoch": 1.9287881207089255,
"grad_norm": 0.18416550882743182,
"learning_rate": 6.812813427964963e-06,
"loss": 0.081,
"step": 12080
},
{
"epoch": 1.9319814785246687,
"grad_norm": 0.21054620503327667,
"learning_rate": 6.77761662686475e-06,
"loss": 0.0837,
"step": 12100
},
{
"epoch": 1.935174836340412,
"grad_norm": 0.1788773690242681,
"learning_rate": 6.742464314414791e-06,
"loss": 0.0809,
"step": 12120
},
{
"epoch": 1.9383681941561552,
"grad_norm": 0.19629223674022553,
"learning_rate": 6.707356975932559e-06,
"loss": 0.0821,
"step": 12140
},
{
"epoch": 1.9415615519718985,
"grad_norm": 0.17739114236704748,
"learning_rate": 6.672295096114597e-06,
"loss": 0.0816,
"step": 12160
},
{
"epoch": 1.9447549097876418,
"grad_norm": 0.20468934483234205,
"learning_rate": 6.637279159029851e-06,
"loss": 0.0827,
"step": 12180
},
{
"epoch": 1.947948267603385,
"grad_norm": 0.16608032221866548,
"learning_rate": 6.602309648112968e-06,
"loss": 0.0792,
"step": 12200
},
{
"epoch": 1.9511416254191283,
"grad_norm": 0.1759677545684069,
"learning_rate": 6.567387046157632e-06,
"loss": 0.0785,
"step": 12220
},
{
"epoch": 1.9543349832348715,
"grad_norm": 0.18405948214393053,
"learning_rate": 6.532511835309896e-06,
"loss": 0.0822,
"step": 12240
},
{
"epoch": 1.9575283410506148,
"grad_norm": 0.2012173937759783,
"learning_rate": 6.497684497061531e-06,
"loss": 0.0818,
"step": 12260
},
{
"epoch": 1.960721698866358,
"grad_norm": 0.2057906504416338,
"learning_rate": 6.462905512243359e-06,
"loss": 0.0806,
"step": 12280
},
{
"epoch": 1.9639150566821013,
"grad_norm": 0.20687177701805626,
"learning_rate": 6.428175361018643e-06,
"loss": 0.0794,
"step": 12300
},
{
"epoch": 1.9671084144978446,
"grad_norm": 0.2064196549144857,
"learning_rate": 6.393494522876428e-06,
"loss": 0.0816,
"step": 12320
},
{
"epoch": 1.9703017723135878,
"grad_norm": 0.2133102540844893,
"learning_rate": 6.358863476624948e-06,
"loss": 0.0821,
"step": 12340
},
{
"epoch": 1.973495130129331,
"grad_norm": 0.18497415279048168,
"learning_rate": 6.324282700385e-06,
"loss": 0.0824,
"step": 12360
},
{
"epoch": 1.9766884879450743,
"grad_norm": 0.19520821054839646,
"learning_rate": 6.289752671583344e-06,
"loss": 0.0792,
"step": 12380
},
{
"epoch": 1.9798818457608176,
"grad_norm": 0.18726221094986775,
"learning_rate": 6.255273866946119e-06,
"loss": 0.0799,
"step": 12400
},
{
"epoch": 1.9830752035765609,
"grad_norm": 0.19525199269461027,
"learning_rate": 6.22084676249225e-06,
"loss": 0.0796,
"step": 12420
},
{
"epoch": 1.9862685613923041,
"grad_norm": 0.16345775381577554,
"learning_rate": 6.186471833526888e-06,
"loss": 0.082,
"step": 12440
},
{
"epoch": 1.9894619192080474,
"grad_norm": 0.1972221294843483,
"learning_rate": 6.15214955463484e-06,
"loss": 0.0787,
"step": 12460
},
{
"epoch": 1.9926552770237906,
"grad_norm": 0.1935374722805669,
"learning_rate": 6.117880399674016e-06,
"loss": 0.0827,
"step": 12480
},
{
"epoch": 1.995848634839534,
"grad_norm": 0.18315518408993714,
"learning_rate": 6.083664841768901e-06,
"loss": 0.0816,
"step": 12500
},
{
"epoch": 1.9990419926552772,
"grad_norm": 0.16860052008855017,
"learning_rate": 6.049503353304e-06,
"loss": 0.0844,
"step": 12520
},
{
"epoch": 2.0022353504710204,
"grad_norm": 0.18498027675472176,
"learning_rate": 6.015396405917333e-06,
"loss": 0.061,
"step": 12540
},
{
"epoch": 2.0054287082867637,
"grad_norm": 0.20247862079416473,
"learning_rate": 5.98134447049392e-06,
"loss": 0.0494,
"step": 12560
},
{
"epoch": 2.008622066102507,
"grad_norm": 0.17717972255777836,
"learning_rate": 5.947348017159272e-06,
"loss": 0.0496,
"step": 12580
},
{
"epoch": 2.01181542391825,
"grad_norm": 0.17560899509079128,
"learning_rate": 5.913407515272918e-06,
"loss": 0.0484,
"step": 12600
},
{
"epoch": 2.0150087817339934,
"grad_norm": 0.2107019559801837,
"learning_rate": 5.879523433421903e-06,
"loss": 0.0455,
"step": 12620
},
{
"epoch": 2.0182021395497367,
"grad_norm": 0.17228228604398835,
"learning_rate": 5.845696239414336e-06,
"loss": 0.0481,
"step": 12640
},
{
"epoch": 2.02139549736548,
"grad_norm": 0.16576058508327604,
"learning_rate": 5.8119264002729244e-06,
"loss": 0.0484,
"step": 12660
},
{
"epoch": 2.0245888551812232,
"grad_norm": 0.17885300287909717,
"learning_rate": 5.778214382228524e-06,
"loss": 0.047,
"step": 12680
},
{
"epoch": 2.0277822129969665,
"grad_norm": 0.20671449403256986,
"learning_rate": 5.744560650713704e-06,
"loss": 0.0471,
"step": 12700
},
{
"epoch": 2.0309755708127097,
"grad_norm": 0.20083359478447635,
"learning_rate": 5.710965670356332e-06,
"loss": 0.0479,
"step": 12720
},
{
"epoch": 2.034168928628453,
"grad_norm": 0.18961936533749266,
"learning_rate": 5.6774299049731325e-06,
"loss": 0.0478,
"step": 12740
},
{
"epoch": 2.0373622864441963,
"grad_norm": 0.21979140727547378,
"learning_rate": 5.643953817563318e-06,
"loss": 0.0453,
"step": 12760
},
{
"epoch": 2.0405556442599395,
"grad_norm": 0.16165099720000836,
"learning_rate": 5.610537870302164e-06,
"loss": 0.0476,
"step": 12780
},
{
"epoch": 2.0437490020756828,
"grad_norm": 0.18343428699528758,
"learning_rate": 5.577182524534657e-06,
"loss": 0.0478,
"step": 12800
},
{
"epoch": 2.046942359891426,
"grad_norm": 0.17215552651589366,
"learning_rate": 5.5438882407691e-06,
"loss": 0.0472,
"step": 12820
},
{
"epoch": 2.0501357177071693,
"grad_norm": 0.1624976046442029,
"learning_rate": 5.510655478670769e-06,
"loss": 0.0478,
"step": 12840
},
{
"epoch": 2.0533290755229126,
"grad_norm": 0.22026015940397797,
"learning_rate": 5.4774846970555615e-06,
"loss": 0.0461,
"step": 12860
},
{
"epoch": 2.056522433338656,
"grad_norm": 0.17519613837123435,
"learning_rate": 5.444376353883678e-06,
"loss": 0.0462,
"step": 12880
},
{
"epoch": 2.059715791154399,
"grad_norm": 0.18277575133361915,
"learning_rate": 5.411330906253269e-06,
"loss": 0.0455,
"step": 12900
},
{
"epoch": 2.0629091489701423,
"grad_norm": 0.18787731365044255,
"learning_rate": 5.378348810394143e-06,
"loss": 0.0462,
"step": 12920
},
{
"epoch": 2.066102506785885,
"grad_norm": 0.18201430894959444,
"learning_rate": 5.3454305216614766e-06,
"loss": 0.0473,
"step": 12940
},
{
"epoch": 2.0692958646016284,
"grad_norm": 0.1904233887751224,
"learning_rate": 5.312576494529507e-06,
"loss": 0.0494,
"step": 12960
},
{
"epoch": 2.0724892224173717,
"grad_norm": 0.18985642952053444,
"learning_rate": 5.279787182585271e-06,
"loss": 0.0462,
"step": 12980
},
{
"epoch": 2.075682580233115,
"grad_norm": 0.1582812242047444,
"learning_rate": 5.247063038522329e-06,
"loss": 0.0469,
"step": 13000
},
{
"epoch": 2.078875938048858,
"grad_norm": 0.19286531510895663,
"learning_rate": 5.21440451413455e-06,
"loss": 0.0465,
"step": 13020
},
{
"epoch": 2.0820692958646014,
"grad_norm": 0.22047888942684946,
"learning_rate": 5.181812060309825e-06,
"loss": 0.0463,
"step": 13040
},
{
"epoch": 2.0852626536803447,
"grad_norm": 0.22499631209380672,
"learning_rate": 5.149286127023874e-06,
"loss": 0.0467,
"step": 13060
},
{
"epoch": 2.088456011496088,
"grad_norm": 0.18796568419290619,
"learning_rate": 5.1168271633340235e-06,
"loss": 0.0471,
"step": 13080
},
{
"epoch": 2.091649369311831,
"grad_norm": 0.1796719273681106,
"learning_rate": 5.084435617373018e-06,
"loss": 0.048,
"step": 13100
},
{
"epoch": 2.0948427271275745,
"grad_norm": 0.1916078526748605,
"learning_rate": 5.052111936342812e-06,
"loss": 0.0467,
"step": 13120
},
{
"epoch": 2.0980360849433177,
"grad_norm": 0.19878847514842057,
"learning_rate": 5.019856566508412e-06,
"loss": 0.0478,
"step": 13140
},
{
"epoch": 2.101229442759061,
"grad_norm": 0.2088933392167675,
"learning_rate": 4.9876699531917186e-06,
"loss": 0.0473,
"step": 13160
},
{
"epoch": 2.1044228005748042,
"grad_norm": 0.20402583213332395,
"learning_rate": 4.95555254076536e-06,
"loss": 0.0457,
"step": 13180
},
{
"epoch": 2.1076161583905475,
"grad_norm": 0.16605435030952836,
"learning_rate": 4.923504772646573e-06,
"loss": 0.0473,
"step": 13200
},
{
"epoch": 2.1108095162062908,
"grad_norm": 0.17651776985556464,
"learning_rate": 4.891527091291071e-06,
"loss": 0.0477,
"step": 13220
},
{
"epoch": 2.114002874022034,
"grad_norm": 0.1763790661182835,
"learning_rate": 4.859619938186947e-06,
"loss": 0.0456,
"step": 13240
},
{
"epoch": 2.1171962318377773,
"grad_norm": 0.18886660022445972,
"learning_rate": 4.827783753848575e-06,
"loss": 0.0455,
"step": 13260
},
{
"epoch": 2.1203895896535205,
"grad_norm": 0.2059211240085781,
"learning_rate": 4.796018977810514e-06,
"loss": 0.0457,
"step": 13280
},
{
"epoch": 2.123582947469264,
"grad_norm": 0.19168043665328116,
"learning_rate": 4.76432604862145e-06,
"loss": 0.046,
"step": 13300
},
{
"epoch": 2.126776305285007,
"grad_norm": 0.17778767466228898,
"learning_rate": 4.732705403838159e-06,
"loss": 0.0465,
"step": 13320
},
{
"epoch": 2.1299696631007503,
"grad_norm": 0.170308319213917,
"learning_rate": 4.701157480019429e-06,
"loss": 0.0474,
"step": 13340
},
{
"epoch": 2.1331630209164936,
"grad_norm": 0.1711104888651996,
"learning_rate": 4.669682712720065e-06,
"loss": 0.0462,
"step": 13360
},
{
"epoch": 2.136356378732237,
"grad_norm": 0.1825464435577293,
"learning_rate": 4.638281536484854e-06,
"loss": 0.0485,
"step": 13380
},
{
"epoch": 2.13954973654798,
"grad_norm": 0.1835185156049789,
"learning_rate": 4.606954384842587e-06,
"loss": 0.0455,
"step": 13400
},
{
"epoch": 2.1427430943637233,
"grad_norm": 0.19538449656271248,
"learning_rate": 4.575701690300051e-06,
"loss": 0.0457,
"step": 13420
},
{
"epoch": 2.1459364521794666,
"grad_norm": 0.20119853731280407,
"learning_rate": 4.544523884336073e-06,
"loss": 0.0462,
"step": 13440
},
{
"epoch": 2.14912980999521,
"grad_norm": 0.19230165287264112,
"learning_rate": 4.513421397395563e-06,
"loss": 0.0449,
"step": 13460
},
{
"epoch": 2.152323167810953,
"grad_norm": 0.19371541515972485,
"learning_rate": 4.482394658883557e-06,
"loss": 0.0465,
"step": 13480
},
{
"epoch": 2.1555165256266964,
"grad_norm": 0.2749584429863373,
"learning_rate": 4.451444097159301e-06,
"loss": 0.0465,
"step": 13500
},
{
"epoch": 2.1587098834424396,
"grad_norm": 0.181430213502962,
"learning_rate": 4.4205701395303424e-06,
"loss": 0.0469,
"step": 13520
},
{
"epoch": 2.161903241258183,
"grad_norm": 0.21832000463916046,
"learning_rate": 4.38977321224661e-06,
"loss": 0.0472,
"step": 13540
},
{
"epoch": 2.165096599073926,
"grad_norm": 0.36594927042777403,
"learning_rate": 4.3590537404945535e-06,
"loss": 0.0471,
"step": 13560
},
{
"epoch": 2.1682899568896694,
"grad_norm": 0.19062769875876745,
"learning_rate": 4.3284121483912525e-06,
"loss": 0.0464,
"step": 13580
},
{
"epoch": 2.1714833147054127,
"grad_norm": 0.18521477830070004,
"learning_rate": 4.297848858978569e-06,
"loss": 0.0461,
"step": 13600
},
{
"epoch": 2.174676672521156,
"grad_norm": 0.2064934921930085,
"learning_rate": 4.2673642942173184e-06,
"loss": 0.0451,
"step": 13620
},
{
"epoch": 2.177870030336899,
"grad_norm": 0.19089143723142035,
"learning_rate": 4.236958874981423e-06,
"loss": 0.0448,
"step": 13640
},
{
"epoch": 2.1810633881526424,
"grad_norm": 0.17162658742427372,
"learning_rate": 4.206633021052115e-06,
"loss": 0.0453,
"step": 13660
},
{
"epoch": 2.1842567459683857,
"grad_norm": 0.18039037729927956,
"learning_rate": 4.176387151112134e-06,
"loss": 0.0455,
"step": 13680
},
{
"epoch": 2.187450103784129,
"grad_norm": 0.16510411035975564,
"learning_rate": 4.1462216827399585e-06,
"loss": 0.0446,
"step": 13700
},
{
"epoch": 2.190643461599872,
"grad_norm": 0.2215703230886645,
"learning_rate": 4.116137032404026e-06,
"loss": 0.0453,
"step": 13720
},
{
"epoch": 2.1938368194156155,
"grad_norm": 0.18140462418275824,
"learning_rate": 4.0861336154569855e-06,
"loss": 0.0446,
"step": 13740
},
{
"epoch": 2.1970301772313587,
"grad_norm": 0.164963005058681,
"learning_rate": 4.056211846129977e-06,
"loss": 0.0451,
"step": 13760
},
{
"epoch": 2.200223535047102,
"grad_norm": 0.22161978868062865,
"learning_rate": 4.0263721375269e-06,
"loss": 0.0439,
"step": 13780
},
{
"epoch": 2.2034168928628453,
"grad_norm": 0.18997163122166422,
"learning_rate": 3.99661490161871e-06,
"loss": 0.0452,
"step": 13800
},
{
"epoch": 2.2066102506785885,
"grad_norm": 0.19721572060634018,
"learning_rate": 3.966940549237728e-06,
"loss": 0.046,
"step": 13820
},
{
"epoch": 2.2098036084943318,
"grad_norm": 0.1613696871656721,
"learning_rate": 3.937349490071989e-06,
"loss": 0.0451,
"step": 13840
},
{
"epoch": 2.212996966310075,
"grad_norm": 0.23649764683113925,
"learning_rate": 3.9078421326595575e-06,
"loss": 0.0473,
"step": 13860
},
{
"epoch": 2.2161903241258183,
"grad_norm": 0.15900455957581072,
"learning_rate": 3.8784188843829075e-06,
"loss": 0.0467,
"step": 13880
},
{
"epoch": 2.2193836819415615,
"grad_norm": 0.16623211370488078,
"learning_rate": 3.849080151463284e-06,
"loss": 0.0447,
"step": 13900
},
{
"epoch": 2.222577039757305,
"grad_norm": 0.23855246445899472,
"learning_rate": 3.819826338955115e-06,
"loss": 0.045,
"step": 13920
},
{
"epoch": 2.225770397573048,
"grad_norm": 0.16852273819977373,
"learning_rate": 3.7906578507403925e-06,
"loss": 0.044,
"step": 13940
},
{
"epoch": 2.2289637553887913,
"grad_norm": 0.19176422233347587,
"learning_rate": 3.761575089523114e-06,
"loss": 0.0451,
"step": 13960
},
{
"epoch": 2.2321571132045346,
"grad_norm": 0.19217003400101632,
"learning_rate": 3.7325784568237267e-06,
"loss": 0.0456,
"step": 13980
},
{
"epoch": 2.235350471020278,
"grad_norm": 0.2142815186061357,
"learning_rate": 3.7036683529735616e-06,
"loss": 0.0438,
"step": 14000
},
{
"epoch": 2.238543828836021,
"grad_norm": 0.16980952681099654,
"learning_rate": 3.6748451771093386e-06,
"loss": 0.0456,
"step": 14020
},
{
"epoch": 2.2417371866517644,
"grad_norm": 0.20792979968816608,
"learning_rate": 3.6461093271676216e-06,
"loss": 0.045,
"step": 14040
},
{
"epoch": 2.2449305444675076,
"grad_norm": 0.19749481308114683,
"learning_rate": 3.6174611998793486e-06,
"loss": 0.0455,
"step": 14060
},
{
"epoch": 2.248123902283251,
"grad_norm": 0.208757882997406,
"learning_rate": 3.5889011907643523e-06,
"loss": 0.0468,
"step": 14080
},
{
"epoch": 2.251317260098994,
"grad_norm": 0.18603971145921822,
"learning_rate": 3.5604296941258854e-06,
"loss": 0.0456,
"step": 14100
},
{
"epoch": 2.2545106179147374,
"grad_norm": 0.24232186850665094,
"learning_rate": 3.532047103045185e-06,
"loss": 0.0442,
"step": 14120
},
{
"epoch": 2.2577039757304806,
"grad_norm": 0.24810029826855062,
"learning_rate": 3.503753809376059e-06,
"loss": 0.0463,
"step": 14140
},
{
"epoch": 2.260897333546224,
"grad_norm": 0.23406287255675895,
"learning_rate": 3.475550203739452e-06,
"loss": 0.0451,
"step": 14160
},
{
"epoch": 2.264090691361967,
"grad_norm": 0.17282967387502232,
"learning_rate": 3.4474366755180644e-06,
"loss": 0.0453,
"step": 14180
},
{
"epoch": 2.2672840491777104,
"grad_norm": 0.21126534883401732,
"learning_rate": 3.419413612850976e-06,
"loss": 0.0461,
"step": 14200
},
{
"epoch": 2.2704774069934537,
"grad_norm": 0.16104640464566056,
"learning_rate": 3.391481402628297e-06,
"loss": 0.0476,
"step": 14220
},
{
"epoch": 2.273670764809197,
"grad_norm": 0.21435527733602905,
"learning_rate": 3.363640430485804e-06,
"loss": 0.0446,
"step": 14240
},
{
"epoch": 2.27686412262494,
"grad_norm": 0.18548507359762656,
"learning_rate": 3.3358910807996325e-06,
"loss": 0.0451,
"step": 14260
},
{
"epoch": 2.2800574804406835,
"grad_norm": 0.19423383437023095,
"learning_rate": 3.3082337366809704e-06,
"loss": 0.0448,
"step": 14280
},
{
"epoch": 2.2832508382564267,
"grad_norm": 0.17237074664312235,
"learning_rate": 3.2806687799707647e-06,
"loss": 0.0459,
"step": 14300
},
{
"epoch": 2.28644419607217,
"grad_norm": 0.22791506612179063,
"learning_rate": 3.253196591234443e-06,
"loss": 0.0449,
"step": 14320
},
{
"epoch": 2.2896375538879132,
"grad_norm": 0.18890323777751128,
"learning_rate": 3.2258175497566678e-06,
"loss": 0.0449,
"step": 14340
},
{
"epoch": 2.2928309117036565,
"grad_norm": 0.22098418299523961,
"learning_rate": 3.198532033536107e-06,
"loss": 0.0437,
"step": 14360
},
{
"epoch": 2.2960242695193998,
"grad_norm": 0.22834203263219127,
"learning_rate": 3.1713404192801945e-06,
"loss": 0.0462,
"step": 14380
},
{
"epoch": 2.299217627335143,
"grad_norm": 0.19033969048906568,
"learning_rate": 3.144243082399947e-06,
"loss": 0.0454,
"step": 14400
},
{
"epoch": 2.3024109851508863,
"grad_norm": 0.1772642418355086,
"learning_rate": 3.1172403970047725e-06,
"loss": 0.0441,
"step": 14420
},
{
"epoch": 2.3056043429666295,
"grad_norm": 0.2048657544909403,
"learning_rate": 3.0903327358973168e-06,
"loss": 0.0446,
"step": 14440
},
{
"epoch": 2.308797700782373,
"grad_norm": 0.18540450076918674,
"learning_rate": 3.0635204705682976e-06,
"loss": 0.0451,
"step": 14460
},
{
"epoch": 2.311991058598116,
"grad_norm": 0.18445665460036134,
"learning_rate": 3.0368039711913867e-06,
"loss": 0.0459,
"step": 14480
},
{
"epoch": 2.3151844164138593,
"grad_norm": 0.22336940402363192,
"learning_rate": 3.0101836066181033e-06,
"loss": 0.0455,
"step": 14500
},
{
"epoch": 2.3183777742296026,
"grad_norm": 0.16285692399794796,
"learning_rate": 2.983659744372721e-06,
"loss": 0.045,
"step": 14520
},
{
"epoch": 2.321571132045346,
"grad_norm": 0.19697000745739243,
"learning_rate": 2.9572327506471775e-06,
"loss": 0.0454,
"step": 14540
},
{
"epoch": 2.324764489861089,
"grad_norm": 0.1950278510185452,
"learning_rate": 2.9309029902960395e-06,
"loss": 0.0452,
"step": 14560
},
{
"epoch": 2.3279578476768323,
"grad_norm": 0.1926073736357789,
"learning_rate": 2.9046708268314494e-06,
"loss": 0.0455,
"step": 14580
},
{
"epoch": 2.3311512054925756,
"grad_norm": 0.5787988360468825,
"learning_rate": 2.8785366224181265e-06,
"loss": 0.047,
"step": 14600
},
{
"epoch": 2.334344563308319,
"grad_norm": 0.19178497872154512,
"learning_rate": 2.8525007378683433e-06,
"loss": 0.0441,
"step": 14620
},
{
"epoch": 2.337537921124062,
"grad_norm": 0.20463851817417028,
"learning_rate": 2.8265635326369557e-06,
"loss": 0.0443,
"step": 14640
},
{
"epoch": 2.3407312789398054,
"grad_norm": 0.18832526122080892,
"learning_rate": 2.8007253648164502e-06,
"loss": 0.0447,
"step": 14660
},
{
"epoch": 2.3439246367555486,
"grad_norm": 0.25535504048141416,
"learning_rate": 2.7749865911319786e-06,
"loss": 0.0462,
"step": 14680
},
{
"epoch": 2.347117994571292,
"grad_norm": 0.2783926831983617,
"learning_rate": 2.74934756693645e-06,
"loss": 0.0461,
"step": 14700
},
{
"epoch": 2.350311352387035,
"grad_norm": 0.1799001156488928,
"learning_rate": 2.7238086462056125e-06,
"loss": 0.0451,
"step": 14720
},
{
"epoch": 2.3535047102027784,
"grad_norm": 0.22749744937087824,
"learning_rate": 2.6983701815331844e-06,
"loss": 0.0449,
"step": 14740
},
{
"epoch": 2.3566980680185217,
"grad_norm": 0.192235427214562,
"learning_rate": 2.6730325241259605e-06,
"loss": 0.0447,
"step": 14760
},
{
"epoch": 2.359891425834265,
"grad_norm": 0.1779393552771597,
"learning_rate": 2.647796023798991e-06,
"loss": 0.0455,
"step": 14780
},
{
"epoch": 2.3630847836500077,
"grad_norm": 0.17636063193070986,
"learning_rate": 2.6226610289707235e-06,
"loss": 0.0453,
"step": 14800
},
{
"epoch": 2.3662781414657514,
"grad_norm": 0.17751151289004394,
"learning_rate": 2.5976278866582226e-06,
"loss": 0.0439,
"step": 14820
},
{
"epoch": 2.3694714992814943,
"grad_norm": 0.1612714192997329,
"learning_rate": 2.5726969424723514e-06,
"loss": 0.0451,
"step": 14840
},
{
"epoch": 2.372664857097238,
"grad_norm": 0.19257379967637422,
"learning_rate": 2.5478685406130143e-06,
"loss": 0.0535,
"step": 14860
},
{
"epoch": 2.3758582149129808,
"grad_norm": 0.18593345377491236,
"learning_rate": 2.5231430238644106e-06,
"loss": 0.045,
"step": 14880
},
{
"epoch": 2.3790515727287245,
"grad_norm": 0.19051880160399431,
"learning_rate": 2.4985207335902863e-06,
"loss": 0.0451,
"step": 14900
},
{
"epoch": 2.3822449305444673,
"grad_norm": 0.18531119849649635,
"learning_rate": 2.4740020097292318e-06,
"loss": 0.0426,
"step": 14920
},
{
"epoch": 2.385438288360211,
"grad_norm": 0.23011458580940014,
"learning_rate": 2.4495871907899816e-06,
"loss": 0.0456,
"step": 14940
},
{
"epoch": 2.388631646175954,
"grad_norm": 0.22814782369226178,
"learning_rate": 2.425276613846755e-06,
"loss": 0.0458,
"step": 14960
},
{
"epoch": 2.3918250039916975,
"grad_norm": 0.18964633782059312,
"learning_rate": 2.401070614534585e-06,
"loss": 0.0445,
"step": 14980
},
{
"epoch": 2.3950183618074403,
"grad_norm": 0.18585844070460122,
"learning_rate": 2.3769695270446903e-06,
"loss": 0.0433,
"step": 15000
},
{
"epoch": 2.398211719623184,
"grad_norm": 0.2173023589979796,
"learning_rate": 2.352973684119868e-06,
"loss": 0.0452,
"step": 15020
},
{
"epoch": 2.401405077438927,
"grad_norm": 0.1888223260670983,
"learning_rate": 2.329083417049899e-06,
"loss": 0.0453,
"step": 15040
},
{
"epoch": 2.40459843525467,
"grad_norm": 0.2000345304946633,
"learning_rate": 2.3052990556669587e-06,
"loss": 0.0443,
"step": 15060
},
{
"epoch": 2.4077917930704134,
"grad_norm": 0.350402818921811,
"learning_rate": 2.2816209283410815e-06,
"loss": 0.0446,
"step": 15080
},
{
"epoch": 2.4109851508861566,
"grad_norm": 0.17540258992531277,
"learning_rate": 2.258049361975616e-06,
"loss": 0.0448,
"step": 15100
},
{
"epoch": 2.4141785087019,
"grad_norm": 0.2240022668610996,
"learning_rate": 2.234584682002726e-06,
"loss": 0.0436,
"step": 15120
},
{
"epoch": 2.417371866517643,
"grad_norm": 0.19377910419185784,
"learning_rate": 2.211227212378877e-06,
"loss": 0.0449,
"step": 15140
},
{
"epoch": 2.4205652243333864,
"grad_norm": 0.18307979574559963,
"learning_rate": 2.1879772755803763e-06,
"loss": 0.0437,
"step": 15160
},
{
"epoch": 2.4237585821491296,
"grad_norm": 0.18479960232316164,
"learning_rate": 2.1648351925989253e-06,
"loss": 0.0469,
"step": 15180
},
{
"epoch": 2.426951939964873,
"grad_norm": 0.19121025995799099,
"learning_rate": 2.1418012829371735e-06,
"loss": 0.0438,
"step": 15200
},
{
"epoch": 2.430145297780616,
"grad_norm": 0.19858616833926596,
"learning_rate": 2.1188758646043206e-06,
"loss": 0.044,
"step": 15220
},
{
"epoch": 2.4333386555963594,
"grad_norm": 0.18772227683807235,
"learning_rate": 2.0960592541117143e-06,
"loss": 0.0452,
"step": 15240
},
{
"epoch": 2.4365320134121027,
"grad_norm": 0.1743929147084694,
"learning_rate": 2.0733517664684944e-06,
"loss": 0.0438,
"step": 15260
},
{
"epoch": 2.439725371227846,
"grad_norm": 0.18605377215327853,
"learning_rate": 2.050753715177236e-06,
"loss": 0.0464,
"step": 15280
},
{
"epoch": 2.442918729043589,
"grad_norm": 0.19099944392969617,
"learning_rate": 2.0282654122296154e-06,
"loss": 0.0434,
"step": 15300
},
{
"epoch": 2.4461120868593325,
"grad_norm": 0.19579885958359836,
"learning_rate": 2.0058871681021087e-06,
"loss": 0.0433,
"step": 15320
},
{
"epoch": 2.4493054446750757,
"grad_norm": 0.2037719797424841,
"learning_rate": 1.983619291751716e-06,
"loss": 0.0445,
"step": 15340
},
{
"epoch": 2.452498802490819,
"grad_norm": 0.2288507482341902,
"learning_rate": 1.961462090611673e-06,
"loss": 0.0445,
"step": 15360
},
{
"epoch": 2.4556921603065622,
"grad_norm": 0.18192991033918157,
"learning_rate": 1.9394158705872244e-06,
"loss": 0.0453,
"step": 15380
},
{
"epoch": 2.4588855181223055,
"grad_norm": 0.2180936188857526,
"learning_rate": 1.9174809360513935e-06,
"loss": 0.045,
"step": 15400
},
{
"epoch": 2.4620788759380487,
"grad_norm": 0.1894861914106852,
"learning_rate": 1.8956575898407847e-06,
"loss": 0.0464,
"step": 15420
},
{
"epoch": 2.465272233753792,
"grad_norm": 0.2021847245639915,
"learning_rate": 1.8739461332513953e-06,
"loss": 0.0459,
"step": 15440
},
{
"epoch": 2.4684655915695353,
"grad_norm": 0.1992201840351267,
"learning_rate": 1.85234686603446e-06,
"loss": 0.044,
"step": 15460
},
{
"epoch": 2.4716589493852785,
"grad_norm": 0.18202769181733872,
"learning_rate": 1.8308600863923164e-06,
"loss": 0.0464,
"step": 15480
},
{
"epoch": 2.474852307201022,
"grad_norm": 0.17956705043459079,
"learning_rate": 1.8094860909742795e-06,
"loss": 0.0457,
"step": 15500
},
{
"epoch": 2.478045665016765,
"grad_norm": 0.1780847660838803,
"learning_rate": 1.78822517487255e-06,
"loss": 0.044,
"step": 15520
},
{
"epoch": 2.4812390228325083,
"grad_norm": 0.19200813107543122,
"learning_rate": 1.7670776316181427e-06,
"loss": 0.0432,
"step": 15540
},
{
"epoch": 2.4844323806482516,
"grad_norm": 0.2516917996505797,
"learning_rate": 1.746043753176836e-06,
"loss": 0.0448,
"step": 15560
},
{
"epoch": 2.487625738463995,
"grad_norm": 0.17194174394098138,
"learning_rate": 1.7251238299451301e-06,
"loss": 0.0449,
"step": 15580
},
{
"epoch": 2.490819096279738,
"grad_norm": 0.17011442140145003,
"learning_rate": 1.7043181507462448e-06,
"loss": 0.0457,
"step": 15600
},
{
"epoch": 2.4940124540954813,
"grad_norm": 0.17376564573157416,
"learning_rate": 1.6836270028261326e-06,
"loss": 0.0446,
"step": 15620
},
{
"epoch": 2.4972058119112246,
"grad_norm": 0.2600424543600025,
"learning_rate": 1.66305067184952e-06,
"loss": 0.0435,
"step": 15640
},
{
"epoch": 2.500399169726968,
"grad_norm": 0.1728773334170149,
"learning_rate": 1.6425894418959433e-06,
"loss": 0.0444,
"step": 15660
},
{
"epoch": 2.503592527542711,
"grad_norm": 0.2117397902480935,
"learning_rate": 1.6222435954558435e-06,
"loss": 0.0424,
"step": 15680
},
{
"epoch": 2.5067858853584544,
"grad_norm": 0.20379918000728395,
"learning_rate": 1.6020134134266674e-06,
"loss": 0.0449,
"step": 15700
},
{
"epoch": 2.5099792431741976,
"grad_norm": 0.3110350981628874,
"learning_rate": 1.5818991751089762e-06,
"loss": 0.0434,
"step": 15720
},
{
"epoch": 2.513172600989941,
"grad_norm": 0.18429144606858047,
"learning_rate": 1.5619011582025988e-06,
"loss": 0.0439,
"step": 15740
},
{
"epoch": 2.516365958805684,
"grad_norm": 0.1756584956115843,
"learning_rate": 1.5420196388027963e-06,
"loss": 0.0423,
"step": 15760
},
{
"epoch": 2.5195593166214274,
"grad_norm": 0.18747969624165203,
"learning_rate": 1.5222548913964508e-06,
"loss": 0.0432,
"step": 15780
},
{
"epoch": 2.5227526744371707,
"grad_norm": 0.17351521964113906,
"learning_rate": 1.5026071888582771e-06,
"loss": 0.0428,
"step": 15800
},
{
"epoch": 2.525946032252914,
"grad_norm": 0.1763855716931325,
"learning_rate": 1.4830768024470487e-06,
"loss": 0.0437,
"step": 15820
},
{
"epoch": 2.529139390068657,
"grad_norm": 0.19172367578038851,
"learning_rate": 1.4636640018018556e-06,
"loss": 0.0436,
"step": 15840
},
{
"epoch": 2.5323327478844004,
"grad_norm": 0.18955098367053075,
"learning_rate": 1.4443690549383904e-06,
"loss": 0.0422,
"step": 15860
},
{
"epoch": 2.5355261057001437,
"grad_norm": 0.2062297852474484,
"learning_rate": 1.4251922282452356e-06,
"loss": 0.0423,
"step": 15880
},
{
"epoch": 2.538719463515887,
"grad_norm": 0.184016665131291,
"learning_rate": 1.4061337864801916e-06,
"loss": 0.0441,
"step": 15900
},
{
"epoch": 2.54191282133163,
"grad_norm": 0.21880976113017805,
"learning_rate": 1.3871939927666189e-06,
"loss": 0.046,
"step": 15920
},
{
"epoch": 2.5451061791473735,
"grad_norm": 0.17335074095350983,
"learning_rate": 1.3683731085898144e-06,
"loss": 0.0441,
"step": 15940
},
{
"epoch": 2.5482995369631167,
"grad_norm": 0.19234479041549446,
"learning_rate": 1.349671393793388e-06,
"loss": 0.0427,
"step": 15960
},
{
"epoch": 2.55149289477886,
"grad_norm": 0.18631232012636342,
"learning_rate": 1.3310891065756814e-06,
"loss": 0.0435,
"step": 15980
},
{
"epoch": 2.5546862525946032,
"grad_norm": 0.19243767802224285,
"learning_rate": 1.3126265034862084e-06,
"loss": 0.0441,
"step": 16000
},
{
"epoch": 2.5578796104103465,
"grad_norm": 0.22553668043830372,
"learning_rate": 1.2942838394221002e-06,
"loss": 0.0438,
"step": 16020
},
{
"epoch": 2.5610729682260898,
"grad_norm": 0.2414806098978672,
"learning_rate": 1.2760613676246037e-06,
"loss": 0.0455,
"step": 16040
},
{
"epoch": 2.564266326041833,
"grad_norm": 0.17562297042382372,
"learning_rate": 1.2579593396755652e-06,
"loss": 0.0437,
"step": 16060
},
{
"epoch": 2.5674596838575763,
"grad_norm": 0.1714929007989254,
"learning_rate": 1.2399780054939758e-06,
"loss": 0.0435,
"step": 16080
},
{
"epoch": 2.5706530416733195,
"grad_norm": 0.18944429187488632,
"learning_rate": 1.2221176133325097e-06,
"loss": 0.0432,
"step": 16100
},
{
"epoch": 2.573846399489063,
"grad_norm": 0.18830587754770226,
"learning_rate": 1.2043784097740951e-06,
"loss": 0.044,
"step": 16120
},
{
"epoch": 2.577039757304806,
"grad_norm": 0.20515213794452525,
"learning_rate": 1.1867606397285191e-06,
"loss": 0.0444,
"step": 16140
},
{
"epoch": 2.5802331151205493,
"grad_norm": 0.2068320912840683,
"learning_rate": 1.1692645464290441e-06,
"loss": 0.0443,
"step": 16160
},
{
"epoch": 2.5834264729362926,
"grad_norm": 0.2065451583149461,
"learning_rate": 1.151890371429042e-06,
"loss": 0.0447,
"step": 16180
},
{
"epoch": 2.586619830752036,
"grad_norm": 0.20955876801496184,
"learning_rate": 1.1346383545986629e-06,
"loss": 0.043,
"step": 16200
},
{
"epoch": 2.589813188567779,
"grad_norm": 0.18475336946843543,
"learning_rate": 1.117508734121535e-06,
"loss": 0.0439,
"step": 16220
},
{
"epoch": 2.5930065463835223,
"grad_norm": 0.19250755490602636,
"learning_rate": 1.1005017464914568e-06,
"loss": 0.0431,
"step": 16240
},
{
"epoch": 2.5961999041992656,
"grad_norm": 0.2138444193531275,
"learning_rate": 1.0836176265091448e-06,
"loss": 0.0447,
"step": 16260
},
{
"epoch": 2.599393262015009,
"grad_norm": 0.19283181561318452,
"learning_rate": 1.0668566072789876e-06,
"loss": 0.0434,
"step": 16280
},
{
"epoch": 2.602586619830752,
"grad_norm": 0.19258136254237682,
"learning_rate": 1.05021892020583e-06,
"loss": 0.0452,
"step": 16300
},
{
"epoch": 2.6057799776464954,
"grad_norm": 0.239296573931001,
"learning_rate": 1.0337047949917777e-06,
"loss": 0.0432,
"step": 16320
},
{
"epoch": 2.6089733354622386,
"grad_norm": 0.18442185794546465,
"learning_rate": 1.0173144596330231e-06,
"loss": 0.0439,
"step": 16340
},
{
"epoch": 2.612166693277982,
"grad_norm": 0.17759720874685755,
"learning_rate": 1.0010481404166972e-06,
"loss": 0.0434,
"step": 16360
},
{
"epoch": 2.615360051093725,
"grad_norm": 0.1999834786965281,
"learning_rate": 9.849060619177553e-07,
"loss": 0.0446,
"step": 16380
},
{
"epoch": 2.6185534089094684,
"grad_norm": 0.21313365667220596,
"learning_rate": 9.688884469958604e-07,
"loss": 0.0434,
"step": 16400
},
{
"epoch": 2.6217467667252117,
"grad_norm": 0.19320209419752543,
"learning_rate": 9.5299551679232e-07,
"loss": 0.0445,
"step": 16420
},
{
"epoch": 2.624940124540955,
"grad_norm": 0.17847623577962735,
"learning_rate": 9.372274907270251e-07,
"loss": 0.0437,
"step": 16440
},
{
"epoch": 2.628133482356698,
"grad_norm": 0.23166885515187532,
"learning_rate": 9.215845864954287e-07,
"loss": 0.0419,
"step": 16460
},
{
"epoch": 2.6313268401724415,
"grad_norm": 0.18325681984081477,
"learning_rate": 9.060670200655286e-07,
"loss": 0.0439,
"step": 16480
},
{
"epoch": 2.6345201979881847,
"grad_norm": 0.20540975477642068,
"learning_rate": 8.906750056748947e-07,
"loss": 0.0448,
"step": 16500
},
{
"epoch": 2.637713555803928,
"grad_norm": 0.1786617783763284,
"learning_rate": 8.754087558277113e-07,
"loss": 0.0444,
"step": 16520
},
{
"epoch": 2.6409069136196712,
"grad_norm": 0.1901267431080617,
"learning_rate": 8.602684812918416e-07,
"loss": 0.0438,
"step": 16540
},
{
"epoch": 2.6441002714354145,
"grad_norm": 0.18259614623005302,
"learning_rate": 8.452543910959121e-07,
"loss": 0.0432,
"step": 16560
},
{
"epoch": 2.6472936292511577,
"grad_norm": 0.18713135077039142,
"learning_rate": 8.303666925264331e-07,
"loss": 0.0437,
"step": 16580
},
{
"epoch": 2.650486987066901,
"grad_norm": 0.1801858452235725,
"learning_rate": 8.156055911249394e-07,
"loss": 0.0448,
"step": 16600
},
{
"epoch": 2.6536803448826443,
"grad_norm": 0.17771380124624228,
"learning_rate": 8.00971290685143e-07,
"loss": 0.0445,
"step": 16620
},
{
"epoch": 2.6568737026983875,
"grad_norm": 0.22250062270982698,
"learning_rate": 7.864639932501294e-07,
"loss": 0.0427,
"step": 16640
},
{
"epoch": 2.6600670605141303,
"grad_norm": 0.20866465188062733,
"learning_rate": 7.720838991095602e-07,
"loss": 0.0427,
"step": 16660
},
{
"epoch": 2.663260418329874,
"grad_norm": 0.2055356708135395,
"learning_rate": 7.578312067969162e-07,
"loss": 0.043,
"step": 16680
},
{
"epoch": 2.666453776145617,
"grad_norm": 0.20698005060615937,
"learning_rate": 7.437061130867473e-07,
"loss": 0.0442,
"step": 16700
},
{
"epoch": 2.6696471339613606,
"grad_norm": 0.20876117607511466,
"learning_rate": 7.297088129919616e-07,
"loss": 0.0498,
"step": 16720
},
{
"epoch": 2.6728404917771034,
"grad_norm": 0.24032862358776724,
"learning_rate": 7.158394997611329e-07,
"loss": 0.0429,
"step": 16740
},
{
"epoch": 2.676033849592847,
"grad_norm": 0.20969273760927634,
"learning_rate": 7.020983648758318e-07,
"loss": 0.0447,
"step": 16760
},
{
"epoch": 2.67922720740859,
"grad_norm": 0.2174374325052259,
"learning_rate": 6.884855980479777e-07,
"loss": 0.0452,
"step": 16780
},
{
"epoch": 2.6824205652243336,
"grad_norm": 0.18004577133887417,
"learning_rate": 6.750013872172301e-07,
"loss": 0.0438,
"step": 16800
},
{
"epoch": 2.6856139230400764,
"grad_norm": 0.2035569950209219,
"learning_rate": 6.616459185483793e-07,
"loss": 0.0438,
"step": 16820
},
{
"epoch": 2.68880728085582,
"grad_norm": 0.20132465630515528,
"learning_rate": 6.484193764287938e-07,
"loss": 0.0445,
"step": 16840
},
{
"epoch": 2.692000638671563,
"grad_norm": 0.1712570311869676,
"learning_rate": 6.353219434658587e-07,
"loss": 0.0432,
"step": 16860
},
{
"epoch": 2.6951939964873066,
"grad_norm": 0.19144286472815933,
"learning_rate": 6.223538004844587e-07,
"loss": 0.0426,
"step": 16880
},
{
"epoch": 2.6983873543030494,
"grad_norm": 0.1761969500556086,
"learning_rate": 6.095151265244937e-07,
"loss": 0.0436,
"step": 16900
},
{
"epoch": 2.701580712118793,
"grad_norm": 0.18412941719997428,
"learning_rate": 5.968060988383884e-07,
"loss": 0.0419,
"step": 16920
},
{
"epoch": 2.704774069934536,
"grad_norm": 0.2088468477123862,
"learning_rate": 5.842268928886563e-07,
"loss": 0.0435,
"step": 16940
},
{
"epoch": 2.7079674277502797,
"grad_norm": 0.21087568774149862,
"learning_rate": 5.717776823454746e-07,
"loss": 0.0434,
"step": 16960
},
{
"epoch": 2.7111607855660225,
"grad_norm": 0.20533012449268137,
"learning_rate": 5.594586390842915e-07,
"loss": 0.0436,
"step": 16980
},
{
"epoch": 2.714354143381766,
"grad_norm": 0.23130477787372275,
"learning_rate": 5.472699331834408e-07,
"loss": 0.0434,
"step": 17000
},
{
"epoch": 2.717547501197509,
"grad_norm": 0.19246797825033052,
"learning_rate": 5.352117329218065e-07,
"loss": 0.0443,
"step": 17020
},
{
"epoch": 2.7207408590132527,
"grad_norm": 0.19825650332574749,
"learning_rate": 5.23284204776493e-07,
"loss": 0.0432,
"step": 17040
},
{
"epoch": 2.7239342168289955,
"grad_norm": 0.19435989820475502,
"learning_rate": 5.1148751342053e-07,
"loss": 0.0437,
"step": 17060
},
{
"epoch": 2.727127574644739,
"grad_norm": 0.17105286427984273,
"learning_rate": 4.998218217205941e-07,
"loss": 0.0431,
"step": 17080
},
{
"epoch": 2.730320932460482,
"grad_norm": 0.2076555517606956,
"learning_rate": 4.882872907347657e-07,
"loss": 0.0441,
"step": 17100
},
{
"epoch": 2.7335142902762257,
"grad_norm": 0.17467573768445724,
"learning_rate": 4.768840797103014e-07,
"loss": 0.0426,
"step": 17120
},
{
"epoch": 2.7367076480919685,
"grad_norm": 0.23656714472082974,
"learning_rate": 4.6561234608143993e-07,
"loss": 0.0442,
"step": 17140
},
{
"epoch": 2.739901005907712,
"grad_norm": 0.1991265479506836,
"learning_rate": 4.544722454672223e-07,
"loss": 0.0443,
"step": 17160
},
{
"epoch": 2.743094363723455,
"grad_norm": 0.16764542580219924,
"learning_rate": 4.434639316693479e-07,
"loss": 0.0441,
"step": 17180
},
{
"epoch": 2.7462877215391983,
"grad_norm": 0.18540914909816514,
"learning_rate": 4.3258755667005104e-07,
"loss": 0.0427,
"step": 17200
},
{
"epoch": 2.7494810793549416,
"grad_norm": 0.16756011986354746,
"learning_rate": 4.218432706300013e-07,
"loss": 0.0442,
"step": 17220
},
{
"epoch": 2.752674437170685,
"grad_norm": 0.19477880662403543,
"learning_rate": 4.1123122188623024e-07,
"loss": 0.0419,
"step": 17240
},
{
"epoch": 2.755867794986428,
"grad_norm": 0.16692137735923454,
"learning_rate": 4.0075155695008193e-07,
"loss": 0.0439,
"step": 17260
},
{
"epoch": 2.7590611528021713,
"grad_norm": 0.27371092487152754,
"learning_rate": 3.904044205051938e-07,
"loss": 0.0415,
"step": 17280
},
{
"epoch": 2.7622545106179146,
"grad_norm": 0.1730044575542229,
"learning_rate": 3.801899554055011e-07,
"loss": 0.0434,
"step": 17300
},
{
"epoch": 2.765447868433658,
"grad_norm": 0.2957249889697754,
"learning_rate": 3.7010830267325546e-07,
"loss": 0.0432,
"step": 17320
},
{
"epoch": 2.768641226249401,
"grad_norm": 0.20211132503418788,
"learning_rate": 3.601596014970843e-07,
"loss": 0.0448,
"step": 17340
},
{
"epoch": 2.7718345840651444,
"grad_norm": 0.2192148080396869,
"learning_rate": 3.5034398923007195e-07,
"loss": 0.0429,
"step": 17360
},
{
"epoch": 2.7750279418808876,
"grad_norm": 0.19416701667619607,
"learning_rate": 3.40661601387855e-07,
"loss": 0.0442,
"step": 17380
},
{
"epoch": 2.778221299696631,
"grad_norm": 0.2194341949029401,
"learning_rate": 3.311125716467578e-07,
"loss": 0.0451,
"step": 17400
},
{
"epoch": 2.781414657512374,
"grad_norm": 0.23997919053006325,
"learning_rate": 3.216970318419488e-07,
"loss": 0.0433,
"step": 17420
},
{
"epoch": 2.7846080153281174,
"grad_norm": 0.20048137685529088,
"learning_rate": 3.1241511196561045e-07,
"loss": 0.0436,
"step": 17440
},
{
"epoch": 2.7878013731438607,
"grad_norm": 0.18418386343058352,
"learning_rate": 3.0326694016515555e-07,
"loss": 0.0431,
"step": 17460
},
{
"epoch": 2.790994730959604,
"grad_norm": 0.18647531186123847,
"learning_rate": 2.9425264274144937e-07,
"loss": 0.0441,
"step": 17480
},
{
"epoch": 2.794188088775347,
"grad_norm": 0.18103520276457064,
"learning_rate": 2.8537234414707573e-07,
"loss": 0.0424,
"step": 17500
},
{
"epoch": 2.7973814465910904,
"grad_norm": 0.175838788085868,
"learning_rate": 2.766261669846071e-07,
"loss": 0.0428,
"step": 17520
},
{
"epoch": 2.8005748044068337,
"grad_norm": 0.18597288140297774,
"learning_rate": 2.680142320049195e-07,
"loss": 0.0461,
"step": 17540
},
{
"epoch": 2.803768162222577,
"grad_norm": 0.19306825995055335,
"learning_rate": 2.5953665810552586e-07,
"loss": 0.0432,
"step": 17560
},
{
"epoch": 2.8069615200383202,
"grad_norm": 0.19244074182083917,
"learning_rate": 2.5119356232892965e-07,
"loss": 0.0447,
"step": 17580
},
{
"epoch": 2.8101548778540635,
"grad_norm": 0.20041935845397732,
"learning_rate": 2.4298505986101397e-07,
"loss": 0.0417,
"step": 17600
},
{
"epoch": 2.8133482356698067,
"grad_norm": 0.1897352035064278,
"learning_rate": 2.3491126402944597e-07,
"loss": 0.0447,
"step": 17620
},
{
"epoch": 2.81654159348555,
"grad_norm": 0.1859749113332233,
"learning_rate": 2.269722863021162e-07,
"loss": 0.0441,
"step": 17640
},
{
"epoch": 2.8197349513012933,
"grad_norm": 0.18154530556190202,
"learning_rate": 2.191682362856018e-07,
"loss": 0.0449,
"step": 17660
},
{
"epoch": 2.8229283091170365,
"grad_norm": 0.19576462753720822,
"learning_rate": 2.1149922172364557e-07,
"loss": 0.043,
"step": 17680
},
{
"epoch": 2.8261216669327798,
"grad_norm": 0.19317600637380156,
"learning_rate": 2.0396534849567384e-07,
"loss": 0.0435,
"step": 17700
},
{
"epoch": 2.829315024748523,
"grad_norm": 0.18270539963789229,
"learning_rate": 1.9656672061533876e-07,
"loss": 0.0448,
"step": 17720
},
{
"epoch": 2.8325083825642663,
"grad_norm": 0.25190362174641373,
"learning_rate": 1.8930344022907055e-07,
"loss": 0.0433,
"step": 17740
},
{
"epoch": 2.8357017403800096,
"grad_norm": 0.19271629305777457,
"learning_rate": 1.8217560761467744e-07,
"loss": 0.0442,
"step": 17760
},
{
"epoch": 2.838895098195753,
"grad_norm": 0.6386981477198299,
"learning_rate": 1.7518332117995695e-07,
"loss": 0.0431,
"step": 17780
},
{
"epoch": 2.842088456011496,
"grad_norm": 0.20346250081845807,
"learning_rate": 1.6832667746134236e-07,
"loss": 0.0422,
"step": 17800
},
{
"epoch": 2.8452818138272393,
"grad_norm": 0.17777460027714007,
"learning_rate": 1.6160577112255827e-07,
"loss": 0.0425,
"step": 17820
},
{
"epoch": 2.8484751716429826,
"grad_norm": 0.255413137859645,
"learning_rate": 1.5502069495332616e-07,
"loss": 0.0435,
"step": 17840
},
{
"epoch": 2.851668529458726,
"grad_norm": 0.19607428087584267,
"learning_rate": 1.4857153986807649e-07,
"loss": 0.0418,
"step": 17860
},
{
"epoch": 2.854861887274469,
"grad_norm": 0.1780772888139799,
"learning_rate": 1.4225839490469628e-07,
"loss": 0.0427,
"step": 17880
},
{
"epoch": 2.8580552450902124,
"grad_norm": 0.21241047060680943,
"learning_rate": 1.3608134722329803e-07,
"loss": 0.0437,
"step": 17900
},
{
"epoch": 2.8612486029059556,
"grad_norm": 0.19239115510673255,
"learning_rate": 1.3004048210501718e-07,
"loss": 0.0434,
"step": 17920
},
{
"epoch": 2.864441960721699,
"grad_norm": 0.18795522932841213,
"learning_rate": 1.2413588295083656e-07,
"loss": 0.0431,
"step": 17940
},
{
"epoch": 2.867635318537442,
"grad_norm": 0.18585931164828967,
"learning_rate": 1.183676312804305e-07,
"loss": 0.0442,
"step": 17960
},
{
"epoch": 2.8708286763531854,
"grad_norm": 0.18075501501439709,
"learning_rate": 1.1273580673104245e-07,
"loss": 0.0444,
"step": 17980
},
{
"epoch": 2.8740220341689287,
"grad_norm": 0.19563735408076433,
"learning_rate": 1.072404870563859e-07,
"loss": 0.0447,
"step": 18000
},
{
"epoch": 2.877215391984672,
"grad_norm": 0.19825850897569677,
"learning_rate": 1.0188174812557073e-07,
"loss": 0.0439,
"step": 18020
},
{
"epoch": 2.880408749800415,
"grad_norm": 0.17410835997084562,
"learning_rate": 9.665966392205295e-08,
"loss": 0.0446,
"step": 18040
},
{
"epoch": 2.8836021076161584,
"grad_norm": 0.17894750425194603,
"learning_rate": 9.157430654261778e-08,
"loss": 0.0444,
"step": 18060
},
{
"epoch": 2.8867954654319017,
"grad_norm": 0.1932898053763739,
"learning_rate": 8.662574619637931e-08,
"loss": 0.043,
"step": 18080
},
{
"epoch": 2.889988823247645,
"grad_norm": 0.19451425195215136,
"learning_rate": 8.18140512038157e-08,
"loss": 0.0428,
"step": 18100
},
{
"epoch": 2.893182181063388,
"grad_norm": 0.18451759369547344,
"learning_rate": 7.713928799582215e-08,
"loss": 0.0443,
"step": 18120
},
{
"epoch": 2.8963755388791315,
"grad_norm": 0.21235909068408473,
"learning_rate": 7.260152111279839e-08,
"loss": 0.0443,
"step": 18140
},
{
"epoch": 2.8995688966948747,
"grad_norm": 0.18028750928095402,
"learning_rate": 6.82008132037515e-08,
"loss": 0.0425,
"step": 18160
},
{
"epoch": 2.902762254510618,
"grad_norm": 0.1865997727595832,
"learning_rate": 6.393722502543665e-08,
"loss": 0.045,
"step": 18180
},
{
"epoch": 2.9059556123263612,
"grad_norm": 0.18553943543624984,
"learning_rate": 5.981081544151446e-08,
"loss": 0.0428,
"step": 18200
},
{
"epoch": 2.9091489701421045,
"grad_norm": 0.19032355954882516,
"learning_rate": 5.5821641421741625e-08,
"loss": 0.0443,
"step": 18220
},
{
"epoch": 2.9123423279578478,
"grad_norm": 0.18084808651831624,
"learning_rate": 5.196975804117932e-08,
"loss": 0.0435,
"step": 18240
},
{
"epoch": 2.915535685773591,
"grad_norm": 0.21753074838538441,
"learning_rate": 4.825521847944048e-08,
"loss": 0.0418,
"step": 18260
},
{
"epoch": 2.9187290435893343,
"grad_norm": 0.1883119176872824,
"learning_rate": 4.467807401994706e-08,
"loss": 0.0426,
"step": 18280
},
{
"epoch": 2.9219224014050775,
"grad_norm": 0.17894355455146954,
"learning_rate": 4.123837404922726e-08,
"loss": 0.0429,
"step": 18300
},
{
"epoch": 2.925115759220821,
"grad_norm": 0.20477512942702414,
"learning_rate": 3.7936166056233845e-08,
"loss": 0.0421,
"step": 18320
},
{
"epoch": 2.928309117036564,
"grad_norm": 0.17982986336579576,
"learning_rate": 3.4771495631686914e-08,
"loss": 0.0433,
"step": 18340
},
{
"epoch": 2.9315024748523073,
"grad_norm": 0.19778942398473365,
"learning_rate": 3.174440646744326e-08,
"loss": 0.0434,
"step": 18360
},
{
"epoch": 2.9346958326680506,
"grad_norm": 0.1840797815880338,
"learning_rate": 2.8854940355895756e-08,
"loss": 0.0422,
"step": 18380
},
{
"epoch": 2.937889190483794,
"grad_norm": 0.20492139151779767,
"learning_rate": 2.6103137189394945e-08,
"loss": 0.0433,
"step": 18400
},
{
"epoch": 2.941082548299537,
"grad_norm": 0.18649980327789625,
"learning_rate": 2.3489034959698342e-08,
"loss": 0.0423,
"step": 18420
},
{
"epoch": 2.9442759061152803,
"grad_norm": 0.18710560587786274,
"learning_rate": 2.1012669757446423e-08,
"loss": 0.0447,
"step": 18440
},
{
"epoch": 2.9474692639310236,
"grad_norm": 0.1950435200815635,
"learning_rate": 1.8674075771665246e-08,
"loss": 0.0441,
"step": 18460
},
{
"epoch": 2.950662621746767,
"grad_norm": 0.23718279280034166,
"learning_rate": 1.647328528929126e-08,
"loss": 0.0443,
"step": 18480
},
{
"epoch": 2.95385597956251,
"grad_norm": 0.1828813035697597,
"learning_rate": 1.441032869472725e-08,
"loss": 0.0434,
"step": 18500
},
{
"epoch": 2.9570493373782534,
"grad_norm": 0.18330846523906766,
"learning_rate": 1.2485234469425955e-08,
"loss": 0.0447,
"step": 18520
},
{
"epoch": 2.9602426951939966,
"grad_norm": 0.19409145822202675,
"learning_rate": 1.0698029191491543e-08,
"loss": 0.0424,
"step": 18540
},
{
"epoch": 2.96343605300974,
"grad_norm": 0.19044949836984276,
"learning_rate": 9.048737535317654e-09,
"loss": 0.0421,
"step": 18560
},
{
"epoch": 2.966629410825483,
"grad_norm": 0.24441457630679606,
"learning_rate": 7.5373822712399e-09,
"loss": 0.0429,
"step": 18580
},
{
"epoch": 2.9698227686412264,
"grad_norm": 0.17238090085354812,
"learning_rate": 6.163984265230571e-09,
"loss": 0.0436,
"step": 18600
},
{
"epoch": 2.9730161264569697,
"grad_norm": 0.17724705729907833,
"learning_rate": 4.928562478603294e-09,
"loss": 0.0438,
"step": 18620
},
{
"epoch": 2.9762094842727125,
"grad_norm": 0.17813664105789478,
"learning_rate": 3.831133967754363e-09,
"loss": 0.0443,
"step": 18640
},
{
"epoch": 2.979402842088456,
"grad_norm": 0.19807197965691153,
"learning_rate": 2.8717138839262638e-09,
"loss": 0.0423,
"step": 18660
},
{
"epoch": 2.982596199904199,
"grad_norm": 0.20792602007313574,
"learning_rate": 2.050315473000053e-09,
"loss": 0.0437,
"step": 18680
},
{
"epoch": 2.9857895577199427,
"grad_norm": 0.18039436629311245,
"learning_rate": 1.3669500753099586e-09,
"loss": 0.0449,
"step": 18700
},
{
"epoch": 2.9889829155356855,
"grad_norm": 0.16777557295223433,
"learning_rate": 8.216271254901653e-10,
"loss": 0.0433,
"step": 18720
},
{
"epoch": 2.992176273351429,
"grad_norm": 0.21329586917274732,
"learning_rate": 4.1435415233936903e-10,
"loss": 0.0437,
"step": 18740
},
{
"epoch": 2.995369631167172,
"grad_norm": 0.17382850175198178,
"learning_rate": 1.451367787230762e-10,
"loss": 0.0434,
"step": 18760
},
{
"epoch": 2.9985629889829157,
"grad_norm": 0.19059012257580193,
"learning_rate": 1.3978721492557968e-11,
"loss": 0.0444,
"step": 18780
}
],
"logging_steps": 20,
"max_steps": 18789,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6401980526886912.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}