{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500.0, "global_step": 18789, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003193357815743254, "grad_norm": 1.9077140880050831, "learning_rate": 2.1287919105907397e-07, "loss": 0.293, "step": 20 }, { "epoch": 0.006386715631486508, "grad_norm": 1.2741126203759185, "learning_rate": 4.2575838211814794e-07, "loss": 0.2755, "step": 40 }, { "epoch": 0.009580073447229762, "grad_norm": 0.9579957272491677, "learning_rate": 6.38637573177222e-07, "loss": 0.2663, "step": 60 }, { "epoch": 0.012773431262973017, "grad_norm": 0.7572791753798676, "learning_rate": 8.515167642362959e-07, "loss": 0.2483, "step": 80 }, { "epoch": 0.01596678907871627, "grad_norm": 0.7631906150781687, "learning_rate": 1.0643959552953699e-06, "loss": 0.2345, "step": 100 }, { "epoch": 0.019160146894459523, "grad_norm": 0.6910231975995771, "learning_rate": 1.277275146354444e-06, "loss": 0.2325, "step": 120 }, { "epoch": 0.02235350471020278, "grad_norm": 0.6148039205762914, "learning_rate": 1.490154337413518e-06, "loss": 0.2204, "step": 140 }, { "epoch": 0.025546862525946033, "grad_norm": 0.5789874935423447, "learning_rate": 1.7030335284725918e-06, "loss": 0.2098, "step": 160 }, { "epoch": 0.028740220341689285, "grad_norm": 0.7476463496285388, "learning_rate": 1.915912719531666e-06, "loss": 0.2032, "step": 180 }, { "epoch": 0.03193357815743254, "grad_norm": 0.7428740533196578, "learning_rate": 2.1287919105907398e-06, "loss": 0.2012, "step": 200 }, { "epoch": 0.035126935973175795, "grad_norm": 0.7037948631808884, "learning_rate": 2.341671101649814e-06, "loss": 0.2009, "step": 220 }, { "epoch": 0.038320293788919046, "grad_norm": 0.7550244833274011, "learning_rate": 2.554550292708888e-06, "loss": 0.1909, "step": 240 }, { "epoch": 0.041513651604662305, "grad_norm": 0.6563323462206586, "learning_rate": 2.7674294837679623e-06, "loss": 0.1921, "step": 260 }, { "epoch": 0.04470700942040556, "grad_norm": 0.6164020880535914, "learning_rate": 2.980308674827036e-06, "loss": 0.18, "step": 280 }, { "epoch": 0.04790036723614881, "grad_norm": 0.6690736448965662, "learning_rate": 3.1931878658861097e-06, "loss": 0.1775, "step": 300 }, { "epoch": 0.05109372505189207, "grad_norm": 0.6290006787340572, "learning_rate": 3.4060670569451835e-06, "loss": 0.1806, "step": 320 }, { "epoch": 0.05428708286763532, "grad_norm": 0.6635212199407985, "learning_rate": 3.6189462480042583e-06, "loss": 0.1779, "step": 340 }, { "epoch": 0.05748044068337857, "grad_norm": 0.6197593777818997, "learning_rate": 3.831825439063332e-06, "loss": 0.1702, "step": 360 }, { "epoch": 0.06067379849912183, "grad_norm": 0.7111109613175086, "learning_rate": 4.044704630122406e-06, "loss": 0.1726, "step": 380 }, { "epoch": 0.06386715631486509, "grad_norm": 0.7014844198325911, "learning_rate": 4.2575838211814795e-06, "loss": 0.1677, "step": 400 }, { "epoch": 0.06706051413060833, "grad_norm": 0.6960545421354288, "learning_rate": 4.470463012240554e-06, "loss": 0.1661, "step": 420 }, { "epoch": 0.07025387194635159, "grad_norm": 0.6526527280390234, "learning_rate": 4.683342203299628e-06, "loss": 0.1599, "step": 440 }, { "epoch": 0.07344722976209485, "grad_norm": 0.6231986280201518, "learning_rate": 4.896221394358702e-06, "loss": 0.1608, "step": 460 }, { "epoch": 0.07664058757783809, "grad_norm": 0.6963837819044139, "learning_rate": 5.109100585417776e-06, "loss": 0.1622, "step": 480 }, { "epoch": 0.07983394539358135, "grad_norm": 0.6109671730909707, "learning_rate": 5.32197977647685e-06, "loss": 0.1598, "step": 500 }, { "epoch": 0.08302730320932461, "grad_norm": 0.5371960923403704, "learning_rate": 5.534858967535925e-06, "loss": 0.1594, "step": 520 }, { "epoch": 0.08622066102506785, "grad_norm": 0.5367820828152228, "learning_rate": 5.747738158594997e-06, "loss": 0.1596, "step": 540 }, { "epoch": 0.08941401884081111, "grad_norm": 0.5470627592374788, "learning_rate": 5.960617349654072e-06, "loss": 0.1572, "step": 560 }, { "epoch": 0.09260737665655437, "grad_norm": 0.6499813395079859, "learning_rate": 6.173496540713145e-06, "loss": 0.1608, "step": 580 }, { "epoch": 0.09580073447229762, "grad_norm": 0.5979118456987372, "learning_rate": 6.386375731772219e-06, "loss": 0.1556, "step": 600 }, { "epoch": 0.09899409228804087, "grad_norm": 0.6192365185802242, "learning_rate": 6.5992549228312945e-06, "loss": 0.1558, "step": 620 }, { "epoch": 0.10218745010378413, "grad_norm": 0.6113365486687661, "learning_rate": 6.812134113890367e-06, "loss": 0.1529, "step": 640 }, { "epoch": 0.10538080791952738, "grad_norm": 0.5734345240030044, "learning_rate": 7.025013304949441e-06, "loss": 0.1569, "step": 660 }, { "epoch": 0.10857416573527064, "grad_norm": 0.5661084962098065, "learning_rate": 7.2378924960085166e-06, "loss": 0.1516, "step": 680 }, { "epoch": 0.1117675235510139, "grad_norm": 0.5846067384703525, "learning_rate": 7.450771687067589e-06, "loss": 0.15, "step": 700 }, { "epoch": 0.11496088136675714, "grad_norm": 0.6374604468076045, "learning_rate": 7.663650878126664e-06, "loss": 0.1595, "step": 720 }, { "epoch": 0.1181542391825004, "grad_norm": 0.6518179456002492, "learning_rate": 7.876530069185738e-06, "loss": 0.1534, "step": 740 }, { "epoch": 0.12134759699824366, "grad_norm": 0.6418033051046158, "learning_rate": 8.089409260244812e-06, "loss": 0.1544, "step": 760 }, { "epoch": 0.1245409548139869, "grad_norm": 0.5560485727454112, "learning_rate": 8.302288451303886e-06, "loss": 0.1519, "step": 780 }, { "epoch": 0.12773431262973017, "grad_norm": 0.5218192022156695, "learning_rate": 8.515167642362959e-06, "loss": 0.1526, "step": 800 }, { "epoch": 0.1309276704454734, "grad_norm": 0.5873485030073047, "learning_rate": 8.728046833422033e-06, "loss": 0.1477, "step": 820 }, { "epoch": 0.13412102826121666, "grad_norm": 0.52521133386056, "learning_rate": 8.940926024481108e-06, "loss": 0.1462, "step": 840 }, { "epoch": 0.13731438607695992, "grad_norm": 0.49868364100795765, "learning_rate": 9.153805215540182e-06, "loss": 0.1459, "step": 860 }, { "epoch": 0.14050774389270318, "grad_norm": 0.46689167597503883, "learning_rate": 9.366684406599256e-06, "loss": 0.1522, "step": 880 }, { "epoch": 0.14370110170844644, "grad_norm": 0.5355408722325465, "learning_rate": 9.57956359765833e-06, "loss": 0.1512, "step": 900 }, { "epoch": 0.1468944595241897, "grad_norm": 0.46406202651388007, "learning_rate": 9.792442788717403e-06, "loss": 0.1503, "step": 920 }, { "epoch": 0.15008781733993293, "grad_norm": 0.5480845186900385, "learning_rate": 1.0005321979776476e-05, "loss": 0.1474, "step": 940 }, { "epoch": 0.15328117515567619, "grad_norm": 0.5541284715722858, "learning_rate": 1.0218201170835552e-05, "loss": 0.1469, "step": 960 }, { "epoch": 0.15647453297141944, "grad_norm": 0.6186186072342443, "learning_rate": 1.0431080361894626e-05, "loss": 0.1495, "step": 980 }, { "epoch": 0.1596678907871627, "grad_norm": 0.5921353515589192, "learning_rate": 1.06439595529537e-05, "loss": 0.1463, "step": 1000 }, { "epoch": 0.16286124860290596, "grad_norm": 0.5433507613364311, "learning_rate": 1.0856838744012775e-05, "loss": 0.1472, "step": 1020 }, { "epoch": 0.16605460641864922, "grad_norm": 0.5979565391588779, "learning_rate": 1.106971793507185e-05, "loss": 0.1506, "step": 1040 }, { "epoch": 0.16924796423439245, "grad_norm": 0.48287899522708827, "learning_rate": 1.128259712613092e-05, "loss": 0.1486, "step": 1060 }, { "epoch": 0.1724413220501357, "grad_norm": 0.5655815878985752, "learning_rate": 1.1495476317189994e-05, "loss": 0.1457, "step": 1080 }, { "epoch": 0.17563467986587897, "grad_norm": 0.512570124033007, "learning_rate": 1.170835550824907e-05, "loss": 0.1461, "step": 1100 }, { "epoch": 0.17882803768162223, "grad_norm": 0.46185823489920147, "learning_rate": 1.1921234699308145e-05, "loss": 0.144, "step": 1120 }, { "epoch": 0.18202139549736548, "grad_norm": 0.44757029208547805, "learning_rate": 1.2134113890367219e-05, "loss": 0.1473, "step": 1140 }, { "epoch": 0.18521475331310874, "grad_norm": 0.4396528603456714, "learning_rate": 1.234699308142629e-05, "loss": 0.142, "step": 1160 }, { "epoch": 0.188408111128852, "grad_norm": 0.4916233270385887, "learning_rate": 1.2559872272485364e-05, "loss": 0.1459, "step": 1180 }, { "epoch": 0.19160146894459523, "grad_norm": 0.49164646615860214, "learning_rate": 1.2772751463544439e-05, "loss": 0.1442, "step": 1200 }, { "epoch": 0.1947948267603385, "grad_norm": 0.47947460832408273, "learning_rate": 1.2985630654603515e-05, "loss": 0.1491, "step": 1220 }, { "epoch": 0.19798818457608175, "grad_norm": 0.5469424153214216, "learning_rate": 1.3198509845662589e-05, "loss": 0.1502, "step": 1240 }, { "epoch": 0.201181542391825, "grad_norm": 0.4638562794822929, "learning_rate": 1.3411389036721663e-05, "loss": 0.1442, "step": 1260 }, { "epoch": 0.20437490020756827, "grad_norm": 0.44344070089234844, "learning_rate": 1.3624268227780734e-05, "loss": 0.1488, "step": 1280 }, { "epoch": 0.20756825802331152, "grad_norm": 0.47717361703152655, "learning_rate": 1.3837147418839808e-05, "loss": 0.1488, "step": 1300 }, { "epoch": 0.21076161583905476, "grad_norm": 0.4703261039559369, "learning_rate": 1.4050026609898883e-05, "loss": 0.145, "step": 1320 }, { "epoch": 0.21395497365479801, "grad_norm": 0.45451915177321617, "learning_rate": 1.4262905800957957e-05, "loss": 0.1514, "step": 1340 }, { "epoch": 0.21714833147054127, "grad_norm": 0.4705441248128481, "learning_rate": 1.4475784992017033e-05, "loss": 0.1487, "step": 1360 }, { "epoch": 0.22034168928628453, "grad_norm": 0.4369632636042999, "learning_rate": 1.4688664183076104e-05, "loss": 0.1459, "step": 1380 }, { "epoch": 0.2235350471020278, "grad_norm": 0.42608257790275605, "learning_rate": 1.4901543374135178e-05, "loss": 0.1455, "step": 1400 }, { "epoch": 0.22672840491777105, "grad_norm": 0.50356002082837, "learning_rate": 1.5114422565194253e-05, "loss": 0.1451, "step": 1420 }, { "epoch": 0.22992176273351428, "grad_norm": 0.4561937833231143, "learning_rate": 1.5327301756253327e-05, "loss": 0.1477, "step": 1440 }, { "epoch": 0.23311512054925754, "grad_norm": 0.40765594909388037, "learning_rate": 1.55401809473124e-05, "loss": 0.1425, "step": 1460 }, { "epoch": 0.2363084783650008, "grad_norm": 0.487476872013174, "learning_rate": 1.5753060138371476e-05, "loss": 0.1412, "step": 1480 }, { "epoch": 0.23950183618074405, "grad_norm": 0.4680001690322545, "learning_rate": 1.596593932943055e-05, "loss": 0.1443, "step": 1500 }, { "epoch": 0.2426951939964873, "grad_norm": 0.41230879655966063, "learning_rate": 1.6178818520489624e-05, "loss": 0.1455, "step": 1520 }, { "epoch": 0.24588855181223057, "grad_norm": 0.4315075010200903, "learning_rate": 1.63916977115487e-05, "loss": 0.1453, "step": 1540 }, { "epoch": 0.2490819096279738, "grad_norm": 0.3880821656792041, "learning_rate": 1.6604576902607773e-05, "loss": 0.1367, "step": 1560 }, { "epoch": 0.2522752674437171, "grad_norm": 0.4170991591966089, "learning_rate": 1.6817456093666847e-05, "loss": 0.1444, "step": 1580 }, { "epoch": 0.25546862525946035, "grad_norm": 0.4352470317730404, "learning_rate": 1.7030335284725918e-05, "loss": 0.1462, "step": 1600 }, { "epoch": 0.25866198307520355, "grad_norm": 0.41926976953754025, "learning_rate": 1.7243214475784992e-05, "loss": 0.1427, "step": 1620 }, { "epoch": 0.2618553408909468, "grad_norm": 0.4067020140616968, "learning_rate": 1.7456093666844067e-05, "loss": 0.1435, "step": 1640 }, { "epoch": 0.26504869870669007, "grad_norm": 0.4568169742722482, "learning_rate": 1.766897285790314e-05, "loss": 0.1425, "step": 1660 }, { "epoch": 0.2682420565224333, "grad_norm": 0.4952678328950158, "learning_rate": 1.7881852048962215e-05, "loss": 0.1411, "step": 1680 }, { "epoch": 0.2714354143381766, "grad_norm": 0.36574600840843885, "learning_rate": 1.809473124002129e-05, "loss": 0.1424, "step": 1700 }, { "epoch": 0.27462877215391984, "grad_norm": 0.40710244186170225, "learning_rate": 1.8307610431080364e-05, "loss": 0.1435, "step": 1720 }, { "epoch": 0.2778221299696631, "grad_norm": 0.41415797524036474, "learning_rate": 1.852048962213944e-05, "loss": 0.1443, "step": 1740 }, { "epoch": 0.28101548778540636, "grad_norm": 0.38093938436737673, "learning_rate": 1.8733368813198513e-05, "loss": 0.1459, "step": 1760 }, { "epoch": 0.2842088456011496, "grad_norm": 0.36699157301783514, "learning_rate": 1.8946248004257587e-05, "loss": 0.1503, "step": 1780 }, { "epoch": 0.2874022034168929, "grad_norm": 0.4426133669933364, "learning_rate": 1.915912719531666e-05, "loss": 0.1458, "step": 1800 }, { "epoch": 0.29059556123263613, "grad_norm": 0.37577866094305634, "learning_rate": 1.9372006386375732e-05, "loss": 0.1437, "step": 1820 }, { "epoch": 0.2937889190483794, "grad_norm": 0.3853315977661372, "learning_rate": 1.9584885577434807e-05, "loss": 0.1424, "step": 1840 }, { "epoch": 0.29698227686412265, "grad_norm": 0.39658703817733554, "learning_rate": 1.979776476849388e-05, "loss": 0.143, "step": 1860 }, { "epoch": 0.30017563467986585, "grad_norm": 0.34168487028906286, "learning_rate": 1.9999999827423154e-05, "loss": 0.1472, "step": 1880 }, { "epoch": 0.3033689924956091, "grad_norm": 0.428099056379712, "learning_rate": 1.9999923893706236e-05, "loss": 0.1424, "step": 1900 }, { "epoch": 0.30656235031135237, "grad_norm": 0.47981664372403626, "learning_rate": 1.9999709899719893e-05, "loss": 0.1414, "step": 1920 }, { "epoch": 0.30975570812709563, "grad_norm": 0.4495236935209742, "learning_rate": 1.9999357848418547e-05, "loss": 0.1432, "step": 1940 }, { "epoch": 0.3129490659428389, "grad_norm": 0.4335827442743115, "learning_rate": 1.999886774466267e-05, "loss": 0.1449, "step": 1960 }, { "epoch": 0.31614242375858215, "grad_norm": 0.3740214770732922, "learning_rate": 1.9998239595218693e-05, "loss": 0.1455, "step": 1980 }, { "epoch": 0.3193357815743254, "grad_norm": 0.35431822792110484, "learning_rate": 1.999747340875894e-05, "loss": 0.14, "step": 2000 }, { "epoch": 0.32252913939006866, "grad_norm": 0.37271856793106084, "learning_rate": 1.9996569195861474e-05, "loss": 0.1433, "step": 2020 }, { "epoch": 0.3257224972058119, "grad_norm": 0.36904721824612496, "learning_rate": 1.999552696900998e-05, "loss": 0.1474, "step": 2040 }, { "epoch": 0.3289158550215552, "grad_norm": 0.4329302625174645, "learning_rate": 1.9994346742593577e-05, "loss": 0.1409, "step": 2060 }, { "epoch": 0.33210921283729844, "grad_norm": 0.4659341494260738, "learning_rate": 1.999302853290663e-05, "loss": 0.1453, "step": 2080 }, { "epoch": 0.3353025706530417, "grad_norm": 0.40127911103988617, "learning_rate": 1.9991572358148522e-05, "loss": 0.1396, "step": 2100 }, { "epoch": 0.3384959284687849, "grad_norm": 0.3087442313177786, "learning_rate": 1.9989978238423383e-05, "loss": 0.1474, "step": 2120 }, { "epoch": 0.34168928628452816, "grad_norm": 0.37193584969289195, "learning_rate": 1.9988246195739846e-05, "loss": 0.1422, "step": 2140 }, { "epoch": 0.3448826441002714, "grad_norm": 0.3814913494874711, "learning_rate": 1.998637625401072e-05, "loss": 0.1422, "step": 2160 }, { "epoch": 0.3480760019160147, "grad_norm": 0.34898690836821306, "learning_rate": 1.9984368439052668e-05, "loss": 0.1396, "step": 2180 }, { "epoch": 0.35126935973175794, "grad_norm": 0.3951747534549505, "learning_rate": 1.9982222778585845e-05, "loss": 0.1458, "step": 2200 }, { "epoch": 0.3544627175475012, "grad_norm": 0.34562618895160807, "learning_rate": 1.9979939302233524e-05, "loss": 0.1402, "step": 2220 }, { "epoch": 0.35765607536324445, "grad_norm": 0.388573729018997, "learning_rate": 1.9977518041521683e-05, "loss": 0.1402, "step": 2240 }, { "epoch": 0.3608494331789877, "grad_norm": 0.37200937634013864, "learning_rate": 1.9974959029878568e-05, "loss": 0.1438, "step": 2260 }, { "epoch": 0.36404279099473097, "grad_norm": 0.3965761915716373, "learning_rate": 1.9972262302634228e-05, "loss": 0.1401, "step": 2280 }, { "epoch": 0.3672361488104742, "grad_norm": 0.3173011648096044, "learning_rate": 1.996942789702004e-05, "loss": 0.1392, "step": 2300 }, { "epoch": 0.3704295066262175, "grad_norm": 0.3017733588197737, "learning_rate": 1.996645585216818e-05, "loss": 0.1424, "step": 2320 }, { "epoch": 0.37362286444196074, "grad_norm": 0.3363017850364413, "learning_rate": 1.9963346209111084e-05, "loss": 0.1396, "step": 2340 }, { "epoch": 0.376816222257704, "grad_norm": 0.36015352029461045, "learning_rate": 1.9960099010780906e-05, "loss": 0.1364, "step": 2360 }, { "epoch": 0.3800095800734472, "grad_norm": 0.3944315476618534, "learning_rate": 1.995671430200889e-05, "loss": 0.1367, "step": 2380 }, { "epoch": 0.38320293788919046, "grad_norm": 0.3511161782236592, "learning_rate": 1.9953192129524774e-05, "loss": 0.134, "step": 2400 }, { "epoch": 0.3863962957049337, "grad_norm": 0.28417767585244963, "learning_rate": 1.994953254195613e-05, "loss": 0.1345, "step": 2420 }, { "epoch": 0.389589653520677, "grad_norm": 0.33674300015583525, "learning_rate": 1.9945735589827714e-05, "loss": 0.1414, "step": 2440 }, { "epoch": 0.39278301133642024, "grad_norm": 0.35738285989377994, "learning_rate": 1.9941801325560748e-05, "loss": 0.1379, "step": 2460 }, { "epoch": 0.3959763691521635, "grad_norm": 0.3281943522856012, "learning_rate": 1.9937729803472198e-05, "loss": 0.1377, "step": 2480 }, { "epoch": 0.39916972696790676, "grad_norm": 0.45986080956623454, "learning_rate": 1.9933521079774043e-05, "loss": 0.1375, "step": 2500 }, { "epoch": 0.40236308478365, "grad_norm": 0.3345998745429948, "learning_rate": 1.9929175212572473e-05, "loss": 0.1376, "step": 2520 }, { "epoch": 0.4055564425993933, "grad_norm": 0.3589364581224636, "learning_rate": 1.9924692261867107e-05, "loss": 0.136, "step": 2540 }, { "epoch": 0.40874980041513653, "grad_norm": 0.3214892613419847, "learning_rate": 1.9920072289550152e-05, "loss": 0.1375, "step": 2560 }, { "epoch": 0.4119431582308798, "grad_norm": 0.293317969518762, "learning_rate": 1.9915315359405556e-05, "loss": 0.1396, "step": 2580 }, { "epoch": 0.41513651604662305, "grad_norm": 0.30656393646036983, "learning_rate": 1.9910421537108124e-05, "loss": 0.1417, "step": 2600 }, { "epoch": 0.41832987386236625, "grad_norm": 0.3145662089610958, "learning_rate": 1.990539089022262e-05, "loss": 0.1361, "step": 2620 }, { "epoch": 0.4215232316781095, "grad_norm": 0.3321283029025178, "learning_rate": 1.9900223488202807e-05, "loss": 0.1374, "step": 2640 }, { "epoch": 0.42471658949385277, "grad_norm": 0.3121994972039942, "learning_rate": 1.9894919402390527e-05, "loss": 0.1369, "step": 2660 }, { "epoch": 0.42790994730959603, "grad_norm": 0.3207245571484729, "learning_rate": 1.9889478706014687e-05, "loss": 0.1365, "step": 2680 }, { "epoch": 0.4311033051253393, "grad_norm": 0.3214891191577016, "learning_rate": 1.9883901474190258e-05, "loss": 0.134, "step": 2700 }, { "epoch": 0.43429666294108255, "grad_norm": 0.2948703624055813, "learning_rate": 1.9878187783917246e-05, "loss": 0.1358, "step": 2720 }, { "epoch": 0.4374900207568258, "grad_norm": 0.2962758480800377, "learning_rate": 1.9872337714079604e-05, "loss": 0.1353, "step": 2740 }, { "epoch": 0.44068337857256906, "grad_norm": 0.28750701401056433, "learning_rate": 1.9866351345444172e-05, "loss": 0.1397, "step": 2760 }, { "epoch": 0.4438767363883123, "grad_norm": 0.33961478398235684, "learning_rate": 1.9860228760659547e-05, "loss": 0.1395, "step": 2780 }, { "epoch": 0.4470700942040556, "grad_norm": 0.3327701106038422, "learning_rate": 1.9853970044254942e-05, "loss": 0.1362, "step": 2800 }, { "epoch": 0.45026345201979884, "grad_norm": 0.34046480079798697, "learning_rate": 1.9847575282639022e-05, "loss": 0.1357, "step": 2820 }, { "epoch": 0.4534568098355421, "grad_norm": 0.2591827835319709, "learning_rate": 1.984104456409871e-05, "loss": 0.1319, "step": 2840 }, { "epoch": 0.4566501676512853, "grad_norm": 0.31099418106495114, "learning_rate": 1.983437797879797e-05, "loss": 0.134, "step": 2860 }, { "epoch": 0.45984352546702856, "grad_norm": 0.34942376213984855, "learning_rate": 1.9827575618776556e-05, "loss": 0.1353, "step": 2880 }, { "epoch": 0.4630368832827718, "grad_norm": 0.29857742338407706, "learning_rate": 1.9820637577948746e-05, "loss": 0.1336, "step": 2900 }, { "epoch": 0.4662302410985151, "grad_norm": 0.2701149477986023, "learning_rate": 1.9813563952102056e-05, "loss": 0.1338, "step": 2920 }, { "epoch": 0.46942359891425833, "grad_norm": 0.35582085328111446, "learning_rate": 1.980635483889589e-05, "loss": 0.1325, "step": 2940 }, { "epoch": 0.4726169567300016, "grad_norm": 0.36536478089468427, "learning_rate": 1.979901033786022e-05, "loss": 0.138, "step": 2960 }, { "epoch": 0.47581031454574485, "grad_norm": 0.34482414871566835, "learning_rate": 1.9791530550394197e-05, "loss": 0.14, "step": 2980 }, { "epoch": 0.4790036723614881, "grad_norm": 0.313925122152452, "learning_rate": 1.9783915579764755e-05, "loss": 0.1349, "step": 3000 }, { "epoch": 0.48219703017723137, "grad_norm": 0.33065001108381514, "learning_rate": 1.9776165531105182e-05, "loss": 0.1334, "step": 3020 }, { "epoch": 0.4853903879929746, "grad_norm": 0.33106961743791363, "learning_rate": 1.9768280511413676e-05, "loss": 0.1346, "step": 3040 }, { "epoch": 0.4885837458087179, "grad_norm": 0.3038455922499442, "learning_rate": 1.9760260629551856e-05, "loss": 0.13, "step": 3060 }, { "epoch": 0.49177710362446114, "grad_norm": 0.32774568750571736, "learning_rate": 1.975210599624327e-05, "loss": 0.1317, "step": 3080 }, { "epoch": 0.4949704614402044, "grad_norm": 0.27913297393743014, "learning_rate": 1.9743816724071864e-05, "loss": 0.1299, "step": 3100 }, { "epoch": 0.4981638192559476, "grad_norm": 0.25535801906865635, "learning_rate": 1.9735392927480425e-05, "loss": 0.1341, "step": 3120 }, { "epoch": 0.5013571770716909, "grad_norm": 0.3450201878469047, "learning_rate": 1.9726834722768998e-05, "loss": 0.1307, "step": 3140 }, { "epoch": 0.5045505348874342, "grad_norm": 0.3355377854047922, "learning_rate": 1.9718142228093286e-05, "loss": 0.1373, "step": 3160 }, { "epoch": 0.5077438927031774, "grad_norm": 0.29501763605746917, "learning_rate": 1.9709315563463022e-05, "loss": 0.1329, "step": 3180 }, { "epoch": 0.5109372505189207, "grad_norm": 0.29498443847446687, "learning_rate": 1.9700354850740305e-05, "loss": 0.1302, "step": 3200 }, { "epoch": 0.514130608334664, "grad_norm": 0.3374549804904556, "learning_rate": 1.969126021363791e-05, "loss": 0.1332, "step": 3220 }, { "epoch": 0.5173239661504071, "grad_norm": 0.2937151476643792, "learning_rate": 1.9682031777717602e-05, "loss": 0.1289, "step": 3240 }, { "epoch": 0.5205173239661504, "grad_norm": 0.34027338318157424, "learning_rate": 1.9672669670388387e-05, "loss": 0.1335, "step": 3260 }, { "epoch": 0.5237106817818936, "grad_norm": 0.2958186800446919, "learning_rate": 1.966317402090475e-05, "loss": 0.1321, "step": 3280 }, { "epoch": 0.5269040395976369, "grad_norm": 0.2937191900726174, "learning_rate": 1.9653544960364886e-05, "loss": 0.132, "step": 3300 }, { "epoch": 0.5300973974133801, "grad_norm": 0.3133245335540435, "learning_rate": 1.9643782621708875e-05, "loss": 0.1311, "step": 3320 }, { "epoch": 0.5332907552291234, "grad_norm": 0.29304130620982005, "learning_rate": 1.963388713971685e-05, "loss": 0.1355, "step": 3340 }, { "epoch": 0.5364841130448667, "grad_norm": 0.31292116477262744, "learning_rate": 1.962385865100715e-05, "loss": 0.1351, "step": 3360 }, { "epoch": 0.5396774708606099, "grad_norm": 0.26493353801679925, "learning_rate": 1.9613697294034403e-05, "loss": 0.1315, "step": 3380 }, { "epoch": 0.5428708286763532, "grad_norm": 0.2630906644626646, "learning_rate": 1.9603403209087655e-05, "loss": 0.1312, "step": 3400 }, { "epoch": 0.5460641864920964, "grad_norm": 0.26085033107366945, "learning_rate": 1.9592976538288392e-05, "loss": 0.1296, "step": 3420 }, { "epoch": 0.5492575443078397, "grad_norm": 0.2940107915040809, "learning_rate": 1.9582417425588615e-05, "loss": 0.1305, "step": 3440 }, { "epoch": 0.5524509021235829, "grad_norm": 0.2648782222390229, "learning_rate": 1.9571726016768825e-05, "loss": 0.1298, "step": 3460 }, { "epoch": 0.5556442599393262, "grad_norm": 0.25767016449009617, "learning_rate": 1.9560902459436027e-05, "loss": 0.1287, "step": 3480 }, { "epoch": 0.5588376177550695, "grad_norm": 0.304832191804209, "learning_rate": 1.9549946903021676e-05, "loss": 0.1335, "step": 3500 }, { "epoch": 0.5620309755708127, "grad_norm": 0.2814622371172937, "learning_rate": 1.953885949877963e-05, "loss": 0.1287, "step": 3520 }, { "epoch": 0.565224333386556, "grad_norm": 0.27565323140470793, "learning_rate": 1.9527640399784066e-05, "loss": 0.132, "step": 3540 }, { "epoch": 0.5684176912022992, "grad_norm": 0.2874659718873625, "learning_rate": 1.9516289760927337e-05, "loss": 0.1306, "step": 3560 }, { "epoch": 0.5716110490180425, "grad_norm": 0.24637256127265056, "learning_rate": 1.9504807738917864e-05, "loss": 0.1294, "step": 3580 }, { "epoch": 0.5748044068337858, "grad_norm": 0.2683166797652062, "learning_rate": 1.949319449227796e-05, "loss": 0.1265, "step": 3600 }, { "epoch": 0.577997764649529, "grad_norm": 0.2991655914571407, "learning_rate": 1.9481450181341636e-05, "loss": 0.1307, "step": 3620 }, { "epoch": 0.5811911224652723, "grad_norm": 0.2629061135815468, "learning_rate": 1.9469574968252405e-05, "loss": 0.131, "step": 3640 }, { "epoch": 0.5843844802810155, "grad_norm": 0.30352941453895776, "learning_rate": 1.9457569016961025e-05, "loss": 0.1315, "step": 3660 }, { "epoch": 0.5875778380967588, "grad_norm": 0.32189790257315865, "learning_rate": 1.9445432493223243e-05, "loss": 0.1301, "step": 3680 }, { "epoch": 0.590771195912502, "grad_norm": 0.2262924484205468, "learning_rate": 1.943316556459751e-05, "loss": 0.1265, "step": 3700 }, { "epoch": 0.5939645537282453, "grad_norm": 0.2711892071402863, "learning_rate": 1.9420768400442657e-05, "loss": 0.1271, "step": 3720 }, { "epoch": 0.5971579115439885, "grad_norm": 0.256185445894437, "learning_rate": 1.9408241171915576e-05, "loss": 0.1277, "step": 3740 }, { "epoch": 0.6003512693597317, "grad_norm": 0.25593240031460607, "learning_rate": 1.9395584051968833e-05, "loss": 0.1287, "step": 3760 }, { "epoch": 0.603544627175475, "grad_norm": 0.2979762688925845, "learning_rate": 1.9382797215348303e-05, "loss": 0.1287, "step": 3780 }, { "epoch": 0.6067379849912182, "grad_norm": 0.2761523818504427, "learning_rate": 1.936988083859073e-05, "loss": 0.1289, "step": 3800 }, { "epoch": 0.6099313428069615, "grad_norm": 0.31322754272354847, "learning_rate": 1.935683510002133e-05, "loss": 0.1289, "step": 3820 }, { "epoch": 0.6131247006227047, "grad_norm": 0.32118979161692, "learning_rate": 1.934366017975128e-05, "loss": 0.1291, "step": 3840 }, { "epoch": 0.616318058438448, "grad_norm": 0.3965221736956701, "learning_rate": 1.9330356259675277e-05, "loss": 0.1291, "step": 3860 }, { "epoch": 0.6195114162541913, "grad_norm": 0.23124317796079472, "learning_rate": 1.9316923523468988e-05, "loss": 0.127, "step": 3880 }, { "epoch": 0.6227047740699345, "grad_norm": 0.26107711518189003, "learning_rate": 1.9303362156586554e-05, "loss": 0.1267, "step": 3900 }, { "epoch": 0.6258981318856778, "grad_norm": 0.23776916842759366, "learning_rate": 1.9289672346257988e-05, "loss": 0.1246, "step": 3920 }, { "epoch": 0.629091489701421, "grad_norm": 0.26149208748799935, "learning_rate": 1.9275854281486626e-05, "loss": 0.1251, "step": 3940 }, { "epoch": 0.6322848475171643, "grad_norm": 0.2488232391306922, "learning_rate": 1.9261908153046485e-05, "loss": 0.1268, "step": 3960 }, { "epoch": 0.6354782053329076, "grad_norm": 0.2541408784103856, "learning_rate": 1.924783415347966e-05, "loss": 0.1271, "step": 3980 }, { "epoch": 0.6386715631486508, "grad_norm": 0.2731460017360242, "learning_rate": 1.9233632477093655e-05, "loss": 0.1255, "step": 4000 }, { "epoch": 0.6418649209643941, "grad_norm": 0.22543278383772555, "learning_rate": 1.9219303319958675e-05, "loss": 0.1252, "step": 4020 }, { "epoch": 0.6450582787801373, "grad_norm": 0.3043429192344086, "learning_rate": 1.9204846879904966e-05, "loss": 0.1261, "step": 4040 }, { "epoch": 0.6482516365958806, "grad_norm": 0.2787988968661325, "learning_rate": 1.9190263356520044e-05, "loss": 0.1285, "step": 4060 }, { "epoch": 0.6514449944116238, "grad_norm": 0.28334459179072036, "learning_rate": 1.9175552951145953e-05, "loss": 0.1312, "step": 4080 }, { "epoch": 0.6546383522273671, "grad_norm": 0.2699681672265312, "learning_rate": 1.91607158668765e-05, "loss": 0.128, "step": 4100 }, { "epoch": 0.6578317100431104, "grad_norm": 0.2653884535783852, "learning_rate": 1.9145752308554422e-05, "loss": 0.1236, "step": 4120 }, { "epoch": 0.6610250678588536, "grad_norm": 0.24370062543889062, "learning_rate": 1.913066248276859e-05, "loss": 0.1267, "step": 4140 }, { "epoch": 0.6642184256745969, "grad_norm": 0.268136522123535, "learning_rate": 1.911544659785112e-05, "loss": 0.1251, "step": 4160 }, { "epoch": 0.6674117834903401, "grad_norm": 0.2804716904650792, "learning_rate": 1.9100104863874535e-05, "loss": 0.1282, "step": 4180 }, { "epoch": 0.6706051413060834, "grad_norm": 0.25256532175596885, "learning_rate": 1.9084637492648834e-05, "loss": 0.1291, "step": 4200 }, { "epoch": 0.6737984991218267, "grad_norm": 0.20654144385500267, "learning_rate": 1.9069044697718596e-05, "loss": 0.1275, "step": 4220 }, { "epoch": 0.6769918569375698, "grad_norm": 0.3170119063036349, "learning_rate": 1.9053326694359996e-05, "loss": 0.1252, "step": 4240 }, { "epoch": 0.6801852147533131, "grad_norm": 0.2518310103396095, "learning_rate": 1.9037483699577866e-05, "loss": 0.1252, "step": 4260 }, { "epoch": 0.6833785725690563, "grad_norm": 0.24576567016775977, "learning_rate": 1.9021515932102687e-05, "loss": 0.1262, "step": 4280 }, { "epoch": 0.6865719303847996, "grad_norm": 0.2272326194356311, "learning_rate": 1.9005423612387564e-05, "loss": 0.1277, "step": 4300 }, { "epoch": 0.6897652882005428, "grad_norm": 0.2241851322819629, "learning_rate": 1.8989206962605183e-05, "loss": 0.1254, "step": 4320 }, { "epoch": 0.6929586460162861, "grad_norm": 0.28963794959769024, "learning_rate": 1.8972866206644756e-05, "loss": 0.1269, "step": 4340 }, { "epoch": 0.6961520038320294, "grad_norm": 0.27244182001640865, "learning_rate": 1.8956401570108918e-05, "loss": 0.1268, "step": 4360 }, { "epoch": 0.6993453616477726, "grad_norm": 0.23505587827589292, "learning_rate": 1.893981328031061e-05, "loss": 0.128, "step": 4380 }, { "epoch": 0.7025387194635159, "grad_norm": 0.2636460746314892, "learning_rate": 1.8923101566269956e-05, "loss": 0.1268, "step": 4400 }, { "epoch": 0.7057320772792591, "grad_norm": 0.287020211559549, "learning_rate": 1.890626665871108e-05, "loss": 0.1251, "step": 4420 }, { "epoch": 0.7089254350950024, "grad_norm": 0.3666813610337495, "learning_rate": 1.8889308790058944e-05, "loss": 0.122, "step": 4440 }, { "epoch": 0.7121187929107456, "grad_norm": 0.24200632888509, "learning_rate": 1.887222819443612e-05, "loss": 0.1234, "step": 4460 }, { "epoch": 0.7153121507264889, "grad_norm": 0.3142721600257018, "learning_rate": 1.8855025107659565e-05, "loss": 0.1247, "step": 4480 }, { "epoch": 0.7185055085422322, "grad_norm": 0.2542404530052441, "learning_rate": 1.8837699767237363e-05, "loss": 0.1267, "step": 4500 }, { "epoch": 0.7216988663579754, "grad_norm": 0.2513575844512111, "learning_rate": 1.882025241236546e-05, "loss": 0.1254, "step": 4520 }, { "epoch": 0.7248922241737187, "grad_norm": 0.24131168314941073, "learning_rate": 1.880268328392433e-05, "loss": 0.1251, "step": 4540 }, { "epoch": 0.7280855819894619, "grad_norm": 0.22534176261187136, "learning_rate": 1.878499262447569e-05, "loss": 0.1241, "step": 4560 }, { "epoch": 0.7312789398052052, "grad_norm": 0.2812964686320165, "learning_rate": 1.8767180678259113e-05, "loss": 0.1257, "step": 4580 }, { "epoch": 0.7344722976209485, "grad_norm": 0.23889076217882216, "learning_rate": 1.874924769118868e-05, "loss": 0.1273, "step": 4600 }, { "epoch": 0.7376656554366917, "grad_norm": 0.27177520658222915, "learning_rate": 1.873119391084958e-05, "loss": 0.125, "step": 4620 }, { "epoch": 0.740859013252435, "grad_norm": 0.21614884950104765, "learning_rate": 1.8713019586494687e-05, "loss": 0.1244, "step": 4640 }, { "epoch": 0.7440523710681782, "grad_norm": 0.26972504495310423, "learning_rate": 1.869472496904112e-05, "loss": 0.1278, "step": 4660 }, { "epoch": 0.7472457288839215, "grad_norm": 0.26832330471480753, "learning_rate": 1.867631031106679e-05, "loss": 0.1217, "step": 4680 }, { "epoch": 0.7504390866996647, "grad_norm": 0.2204440405656476, "learning_rate": 1.8657775866806885e-05, "loss": 0.1226, "step": 4700 }, { "epoch": 0.753632444515408, "grad_norm": 0.25422716012201274, "learning_rate": 1.86391218921504e-05, "loss": 0.1264, "step": 4720 }, { "epoch": 0.7568258023311512, "grad_norm": 0.2334817914407686, "learning_rate": 1.8620348644636572e-05, "loss": 0.123, "step": 4740 }, { "epoch": 0.7600191601468944, "grad_norm": 0.24736892038169794, "learning_rate": 1.8601456383451325e-05, "loss": 0.1245, "step": 4760 }, { "epoch": 0.7632125179626377, "grad_norm": 0.23376798290995154, "learning_rate": 1.8582445369423716e-05, "loss": 0.1259, "step": 4780 }, { "epoch": 0.7664058757783809, "grad_norm": 0.24649571282123517, "learning_rate": 1.8563315865022318e-05, "loss": 0.125, "step": 4800 }, { "epoch": 0.7695992335941242, "grad_norm": 0.24228210919101548, "learning_rate": 1.8544068134351585e-05, "loss": 0.1225, "step": 4820 }, { "epoch": 0.7727925914098674, "grad_norm": 0.25429442512742784, "learning_rate": 1.852470244314824e-05, "loss": 0.1261, "step": 4840 }, { "epoch": 0.7759859492256107, "grad_norm": 0.2309045224127907, "learning_rate": 1.850521905877756e-05, "loss": 0.1249, "step": 4860 }, { "epoch": 0.779179307041354, "grad_norm": 0.25672801367790765, "learning_rate": 1.848561825022973e-05, "loss": 0.1234, "step": 4880 }, { "epoch": 0.7823726648570972, "grad_norm": 0.2473205806486083, "learning_rate": 1.8465900288116098e-05, "loss": 0.1284, "step": 4900 }, { "epoch": 0.7855660226728405, "grad_norm": 0.3035165882865362, "learning_rate": 1.844606544466545e-05, "loss": 0.1237, "step": 4920 }, { "epoch": 0.7887593804885837, "grad_norm": 0.26837139940976074, "learning_rate": 1.8426113993720255e-05, "loss": 0.1252, "step": 4940 }, { "epoch": 0.791952738304327, "grad_norm": 0.26373147498792854, "learning_rate": 1.840604621073288e-05, "loss": 0.1227, "step": 4960 }, { "epoch": 0.7951460961200703, "grad_norm": 0.2581673321881109, "learning_rate": 1.8385862372761784e-05, "loss": 0.1273, "step": 4980 }, { "epoch": 0.7983394539358135, "grad_norm": 0.26439250344256154, "learning_rate": 1.83655627584677e-05, "loss": 0.1218, "step": 5000 }, { "epoch": 0.8015328117515568, "grad_norm": 0.2816537144327537, "learning_rate": 1.8345147648109784e-05, "loss": 0.1263, "step": 5020 }, { "epoch": 0.8047261695673, "grad_norm": 0.2647977758183829, "learning_rate": 1.8324617323541738e-05, "loss": 0.1238, "step": 5040 }, { "epoch": 0.8079195273830433, "grad_norm": 0.2593258946289472, "learning_rate": 1.830397206820794e-05, "loss": 0.1246, "step": 5060 }, { "epoch": 0.8111128851987865, "grad_norm": 0.22990124735756534, "learning_rate": 1.8283212167139513e-05, "loss": 0.1226, "step": 5080 }, { "epoch": 0.8143062430145298, "grad_norm": 0.27455958743278586, "learning_rate": 1.8262337906950385e-05, "loss": 0.1261, "step": 5100 }, { "epoch": 0.8174996008302731, "grad_norm": 0.2608809929482469, "learning_rate": 1.8241349575833352e-05, "loss": 0.1226, "step": 5120 }, { "epoch": 0.8206929586460163, "grad_norm": 0.2640419564306298, "learning_rate": 1.822024746355608e-05, "loss": 0.1381, "step": 5140 }, { "epoch": 0.8238863164617596, "grad_norm": 0.29262015087553245, "learning_rate": 1.8199031861457123e-05, "loss": 0.1214, "step": 5160 }, { "epoch": 0.8270796742775028, "grad_norm": 0.2319619995331439, "learning_rate": 1.8177703062441882e-05, "loss": 0.1232, "step": 5180 }, { "epoch": 0.8302730320932461, "grad_norm": 0.26293647732336844, "learning_rate": 1.815626136097857e-05, "loss": 0.1233, "step": 5200 }, { "epoch": 0.8334663899089892, "grad_norm": 0.24081197327765444, "learning_rate": 1.8134707053094146e-05, "loss": 0.1202, "step": 5220 }, { "epoch": 0.8366597477247325, "grad_norm": 0.2736597574126886, "learning_rate": 1.8113040436370236e-05, "loss": 0.1189, "step": 5240 }, { "epoch": 0.8398531055404758, "grad_norm": 0.22867160064093073, "learning_rate": 1.809126180993901e-05, "loss": 0.1227, "step": 5260 }, { "epoch": 0.843046463356219, "grad_norm": 0.20241019354224027, "learning_rate": 1.8069371474479055e-05, "loss": 0.1207, "step": 5280 }, { "epoch": 0.8462398211719623, "grad_norm": 0.23512641329119113, "learning_rate": 1.8047369732211236e-05, "loss": 0.1227, "step": 5300 }, { "epoch": 0.8494331789877055, "grad_norm": 0.21831678736014193, "learning_rate": 1.8025256886894512e-05, "loss": 0.1263, "step": 5320 }, { "epoch": 0.8526265368034488, "grad_norm": 0.22942586598137038, "learning_rate": 1.800303324382174e-05, "loss": 0.1226, "step": 5340 }, { "epoch": 0.8558198946191921, "grad_norm": 0.22565630953315605, "learning_rate": 1.7980699109815476e-05, "loss": 0.1227, "step": 5360 }, { "epoch": 0.8590132524349353, "grad_norm": 0.2110233708822902, "learning_rate": 1.795825479322372e-05, "loss": 0.123, "step": 5380 }, { "epoch": 0.8622066102506786, "grad_norm": 0.2588140422630483, "learning_rate": 1.793570060391567e-05, "loss": 0.1233, "step": 5400 }, { "epoch": 0.8653999680664218, "grad_norm": 0.20643049269214508, "learning_rate": 1.791303685327744e-05, "loss": 0.1216, "step": 5420 }, { "epoch": 0.8685933258821651, "grad_norm": 0.2450716780518527, "learning_rate": 1.7890263854207766e-05, "loss": 0.1187, "step": 5440 }, { "epoch": 0.8717866836979083, "grad_norm": 0.2626908104568787, "learning_rate": 1.7867381921113672e-05, "loss": 0.1318, "step": 5460 }, { "epoch": 0.8749800415136516, "grad_norm": 0.21046084433838286, "learning_rate": 1.784439136990616e-05, "loss": 0.1216, "step": 5480 }, { "epoch": 0.8781733993293949, "grad_norm": 0.22390590052286838, "learning_rate": 1.7821292517995802e-05, "loss": 0.1222, "step": 5500 }, { "epoch": 0.8813667571451381, "grad_norm": 0.21545360667161884, "learning_rate": 1.7798085684288408e-05, "loss": 0.1245, "step": 5520 }, { "epoch": 0.8845601149608814, "grad_norm": 0.23969169247272867, "learning_rate": 1.777477118918058e-05, "loss": 0.1199, "step": 5540 }, { "epoch": 0.8877534727766246, "grad_norm": 0.25616719983123853, "learning_rate": 1.7751349354555315e-05, "loss": 0.12, "step": 5560 }, { "epoch": 0.8909468305923679, "grad_norm": 0.2327465548031593, "learning_rate": 1.7727820503777563e-05, "loss": 0.1188, "step": 5580 }, { "epoch": 0.8941401884081112, "grad_norm": 0.2704312448776363, "learning_rate": 1.770418496168973e-05, "loss": 0.1266, "step": 5600 }, { "epoch": 0.8973335462238544, "grad_norm": 0.280731488755357, "learning_rate": 1.7680443054607247e-05, "loss": 0.1186, "step": 5620 }, { "epoch": 0.9005269040395977, "grad_norm": 0.2190704544630761, "learning_rate": 1.7656595110314003e-05, "loss": 0.1227, "step": 5640 }, { "epoch": 0.9037202618553409, "grad_norm": 0.2676299073758256, "learning_rate": 1.7632641458057874e-05, "loss": 0.1166, "step": 5660 }, { "epoch": 0.9069136196710842, "grad_norm": 0.2699663729747412, "learning_rate": 1.7608582428546142e-05, "loss": 0.1245, "step": 5680 }, { "epoch": 0.9101069774868275, "grad_norm": 0.38105163760645616, "learning_rate": 1.7584418353940943e-05, "loss": 0.1218, "step": 5700 }, { "epoch": 0.9133003353025706, "grad_norm": 0.23014658636555574, "learning_rate": 1.756014956785468e-05, "loss": 0.1181, "step": 5720 }, { "epoch": 0.9164936931183139, "grad_norm": 0.24389786019248447, "learning_rate": 1.7535776405345428e-05, "loss": 0.1196, "step": 5740 }, { "epoch": 0.9196870509340571, "grad_norm": 0.26113050468693977, "learning_rate": 1.7511299202912275e-05, "loss": 0.1202, "step": 5760 }, { "epoch": 0.9228804087498004, "grad_norm": 0.2078740201372768, "learning_rate": 1.7486718298490713e-05, "loss": 0.124, "step": 5780 }, { "epoch": 0.9260737665655436, "grad_norm": 0.3157327866928938, "learning_rate": 1.7462034031447954e-05, "loss": 0.1252, "step": 5800 }, { "epoch": 0.9292671243812869, "grad_norm": 0.21114581099853116, "learning_rate": 1.7437246742578246e-05, "loss": 0.1204, "step": 5820 }, { "epoch": 0.9324604821970301, "grad_norm": 0.2200062852329027, "learning_rate": 1.7412356774098175e-05, "loss": 0.1249, "step": 5840 }, { "epoch": 0.9356538400127734, "grad_norm": 0.2739829354403811, "learning_rate": 1.7387364469641928e-05, "loss": 0.1207, "step": 5860 }, { "epoch": 0.9388471978285167, "grad_norm": 0.22036300962797467, "learning_rate": 1.736227017425656e-05, "loss": 0.1182, "step": 5880 }, { "epoch": 0.9420405556442599, "grad_norm": 0.2010246775840929, "learning_rate": 1.7337074234397228e-05, "loss": 0.1199, "step": 5900 }, { "epoch": 0.9452339134600032, "grad_norm": 0.22961494443205888, "learning_rate": 1.7311776997922404e-05, "loss": 0.1207, "step": 5920 }, { "epoch": 0.9484272712757464, "grad_norm": 0.26165957694875003, "learning_rate": 1.7286378814089072e-05, "loss": 0.1188, "step": 5940 }, { "epoch": 0.9516206290914897, "grad_norm": 0.22131834255107544, "learning_rate": 1.726088003354791e-05, "loss": 0.1205, "step": 5960 }, { "epoch": 0.954813986907233, "grad_norm": 0.2549539175287136, "learning_rate": 1.7235281008338452e-05, "loss": 0.1213, "step": 5980 }, { "epoch": 0.9580073447229762, "grad_norm": 0.2427772520556814, "learning_rate": 1.720958209188422e-05, "loss": 0.1211, "step": 6000 }, { "epoch": 0.9612007025387195, "grad_norm": 0.2442539895798861, "learning_rate": 1.7183783638987845e-05, "loss": 0.1193, "step": 6020 }, { "epoch": 0.9643940603544627, "grad_norm": 0.23954523978335746, "learning_rate": 1.7157886005826173e-05, "loss": 0.1196, "step": 6040 }, { "epoch": 0.967587418170206, "grad_norm": 0.20571373812832114, "learning_rate": 1.7131889549945348e-05, "loss": 0.1149, "step": 6060 }, { "epoch": 0.9707807759859493, "grad_norm": 0.22749917178842363, "learning_rate": 1.710579463025587e-05, "loss": 0.1176, "step": 6080 }, { "epoch": 0.9739741338016925, "grad_norm": 0.23012462875837292, "learning_rate": 1.7079601607027643e-05, "loss": 0.1186, "step": 6100 }, { "epoch": 0.9771674916174358, "grad_norm": 0.20338632953694447, "learning_rate": 1.7053310841885012e-05, "loss": 0.1187, "step": 6120 }, { "epoch": 0.980360849433179, "grad_norm": 0.23280208486194112, "learning_rate": 1.7026922697801746e-05, "loss": 0.1196, "step": 6140 }, { "epoch": 0.9835542072489223, "grad_norm": 0.20786109950948006, "learning_rate": 1.7000437539096046e-05, "loss": 0.1202, "step": 6160 }, { "epoch": 0.9867475650646655, "grad_norm": 0.21375986615043702, "learning_rate": 1.6973855731425507e-05, "loss": 0.1159, "step": 6180 }, { "epoch": 0.9899409228804088, "grad_norm": 0.20748661803980806, "learning_rate": 1.694717764178208e-05, "loss": 0.1153, "step": 6200 }, { "epoch": 0.993134280696152, "grad_norm": 0.22516009929996467, "learning_rate": 1.692040363848699e-05, "loss": 0.1204, "step": 6220 }, { "epoch": 0.9963276385118952, "grad_norm": 0.2595564019615457, "learning_rate": 1.6893534091185658e-05, "loss": 0.1197, "step": 6240 }, { "epoch": 0.9995209963276385, "grad_norm": 0.18297342882482412, "learning_rate": 1.686656937084261e-05, "loss": 0.1151, "step": 6260 }, { "epoch": 1.0027143541433818, "grad_norm": 0.22852815920466457, "learning_rate": 1.6839509849736326e-05, "loss": 0.0949, "step": 6280 }, { "epoch": 1.005907711959125, "grad_norm": 0.19728357385077158, "learning_rate": 1.6812355901454132e-05, "loss": 0.0872, "step": 6300 }, { "epoch": 1.0091010697748684, "grad_norm": 0.2623149708691154, "learning_rate": 1.678510790088702e-05, "loss": 0.0887, "step": 6320 }, { "epoch": 1.0122944275906116, "grad_norm": 0.18893451371595926, "learning_rate": 1.6757766224224483e-05, "loss": 0.0919, "step": 6340 }, { "epoch": 1.0154877854063549, "grad_norm": 0.21837196710349846, "learning_rate": 1.673033124894932e-05, "loss": 0.0871, "step": 6360 }, { "epoch": 1.0186811432220981, "grad_norm": 0.19258941847945746, "learning_rate": 1.670280335383242e-05, "loss": 0.0885, "step": 6380 }, { "epoch": 1.0218745010378414, "grad_norm": 0.19005062378076065, "learning_rate": 1.667518291892754e-05, "loss": 0.0893, "step": 6400 }, { "epoch": 1.0250678588535846, "grad_norm": 0.20663392660314553, "learning_rate": 1.6647470325566045e-05, "loss": 0.0891, "step": 6420 }, { "epoch": 1.028261216669328, "grad_norm": 0.22234403999553295, "learning_rate": 1.6619665956351664e-05, "loss": 0.0881, "step": 6440 }, { "epoch": 1.0314545744850712, "grad_norm": 0.2218548835051233, "learning_rate": 1.6591770195155185e-05, "loss": 0.0891, "step": 6460 }, { "epoch": 1.0346479323008142, "grad_norm": 0.19448202424429442, "learning_rate": 1.6563783427109173e-05, "loss": 0.0882, "step": 6480 }, { "epoch": 1.0378412901165575, "grad_norm": 0.2042849289860482, "learning_rate": 1.6535706038602637e-05, "loss": 0.0878, "step": 6500 }, { "epoch": 1.0410346479323007, "grad_norm": 0.2512755796704539, "learning_rate": 1.6507538417275716e-05, "loss": 0.0875, "step": 6520 }, { "epoch": 1.044228005748044, "grad_norm": 0.2131890646498463, "learning_rate": 1.6479280952014304e-05, "loss": 0.0898, "step": 6540 }, { "epoch": 1.0474213635637872, "grad_norm": 0.21427122055121073, "learning_rate": 1.6450934032944698e-05, "loss": 0.088, "step": 6560 }, { "epoch": 1.0506147213795305, "grad_norm": 0.2110500487102777, "learning_rate": 1.64224980514282e-05, "loss": 0.0877, "step": 6580 }, { "epoch": 1.0538080791952738, "grad_norm": 0.21674633072630997, "learning_rate": 1.6393973400055737e-05, "loss": 0.0919, "step": 6600 }, { "epoch": 1.057001437011017, "grad_norm": 0.20250013575431305, "learning_rate": 1.63653604726424e-05, "loss": 0.0878, "step": 6620 }, { "epoch": 1.0601947948267603, "grad_norm": 0.22853386096908568, "learning_rate": 1.6336659664222048e-05, "loss": 0.0865, "step": 6640 }, { "epoch": 1.0633881526425035, "grad_norm": 0.23371366704528887, "learning_rate": 1.630787137104183e-05, "loss": 0.0917, "step": 6660 }, { "epoch": 1.0665815104582468, "grad_norm": 0.2520515744099512, "learning_rate": 1.6278995990556725e-05, "loss": 0.0885, "step": 6680 }, { "epoch": 1.06977486827399, "grad_norm": 0.226518466734716, "learning_rate": 1.6250033921424038e-05, "loss": 0.089, "step": 6700 }, { "epoch": 1.0729682260897333, "grad_norm": 0.19588721298026593, "learning_rate": 1.6220985563497933e-05, "loss": 0.0893, "step": 6720 }, { "epoch": 1.0761615839054766, "grad_norm": 0.20545809450126928, "learning_rate": 1.6191851317823864e-05, "loss": 0.0878, "step": 6740 }, { "epoch": 1.0793549417212198, "grad_norm": 0.19233602078710613, "learning_rate": 1.6162631586633076e-05, "loss": 0.0866, "step": 6760 }, { "epoch": 1.082548299536963, "grad_norm": 0.16678814329219444, "learning_rate": 1.6133326773337033e-05, "loss": 0.0871, "step": 6780 }, { "epoch": 1.0857416573527063, "grad_norm": 0.1872528998042832, "learning_rate": 1.610393728252186e-05, "loss": 0.0855, "step": 6800 }, { "epoch": 1.0889350151684496, "grad_norm": 0.2125566089494784, "learning_rate": 1.6074463519942747e-05, "loss": 0.0868, "step": 6820 }, { "epoch": 1.0921283729841929, "grad_norm": 0.2174911829451179, "learning_rate": 1.604490589251835e-05, "loss": 0.0883, "step": 6840 }, { "epoch": 1.095321730799936, "grad_norm": 0.18461972367391402, "learning_rate": 1.6015264808325172e-05, "loss": 0.0866, "step": 6860 }, { "epoch": 1.0985150886156794, "grad_norm": 0.21622527426814506, "learning_rate": 1.5985540676591938e-05, "loss": 0.0863, "step": 6880 }, { "epoch": 1.1017084464314226, "grad_norm": 0.22055823564651658, "learning_rate": 1.5955733907693938e-05, "loss": 0.0864, "step": 6900 }, { "epoch": 1.1049018042471659, "grad_norm": 0.21748955927958816, "learning_rate": 1.592584491314735e-05, "loss": 0.0914, "step": 6920 }, { "epoch": 1.1080951620629091, "grad_norm": 0.19288286925997916, "learning_rate": 1.589587410560359e-05, "loss": 0.0886, "step": 6940 }, { "epoch": 1.1112885198786524, "grad_norm": 0.22073550271753697, "learning_rate": 1.586582189884357e-05, "loss": 0.0874, "step": 6960 }, { "epoch": 1.1144818776943957, "grad_norm": 0.19094293529375386, "learning_rate": 1.5835688707772035e-05, "loss": 0.0855, "step": 6980 }, { "epoch": 1.117675235510139, "grad_norm": 0.21947645518408387, "learning_rate": 1.5805474948411792e-05, "loss": 0.0891, "step": 7000 }, { "epoch": 1.1208685933258822, "grad_norm": 0.19228306320542188, "learning_rate": 1.5775181037897995e-05, "loss": 0.0864, "step": 7020 }, { "epoch": 1.1240619511416254, "grad_norm": 0.2416878479220072, "learning_rate": 1.5744807394472372e-05, "loss": 0.0892, "step": 7040 }, { "epoch": 1.1272553089573687, "grad_norm": 0.2763423491442259, "learning_rate": 1.5714354437477454e-05, "loss": 0.0903, "step": 7060 }, { "epoch": 1.130448666773112, "grad_norm": 9.009817853561485, "learning_rate": 1.568382258735078e-05, "loss": 0.0896, "step": 7080 }, { "epoch": 1.1336420245888552, "grad_norm": 0.21069452452749907, "learning_rate": 1.5653212265619114e-05, "loss": 0.0908, "step": 7100 }, { "epoch": 1.1368353824045985, "grad_norm": 0.20407807891775565, "learning_rate": 1.5622523894892587e-05, "loss": 0.0908, "step": 7120 }, { "epoch": 1.1400287402203417, "grad_norm": 0.2619102068507488, "learning_rate": 1.5591757898858907e-05, "loss": 0.0872, "step": 7140 }, { "epoch": 1.143222098036085, "grad_norm": 0.20634106575751654, "learning_rate": 1.556091470227747e-05, "loss": 0.0875, "step": 7160 }, { "epoch": 1.1464154558518282, "grad_norm": 0.23775033570197862, "learning_rate": 1.5529994730973522e-05, "loss": 0.0868, "step": 7180 }, { "epoch": 1.1496088136675715, "grad_norm": 0.20245603598906314, "learning_rate": 1.549899841183227e-05, "loss": 0.0868, "step": 7200 }, { "epoch": 1.1528021714833148, "grad_norm": 0.19815804657454472, "learning_rate": 1.546792617279299e-05, "loss": 0.0899, "step": 7220 }, { "epoch": 1.155995529299058, "grad_norm": 0.18751806743751373, "learning_rate": 1.5436778442843107e-05, "loss": 0.0884, "step": 7240 }, { "epoch": 1.1591888871148013, "grad_norm": 0.22312780655020503, "learning_rate": 1.5405555652012302e-05, "loss": 0.0895, "step": 7260 }, { "epoch": 1.1623822449305445, "grad_norm": 0.1924743563793643, "learning_rate": 1.5374258231366546e-05, "loss": 0.0881, "step": 7280 }, { "epoch": 1.1655756027462878, "grad_norm": 0.20844406290416265, "learning_rate": 1.5342886613002155e-05, "loss": 0.0867, "step": 7300 }, { "epoch": 1.168768960562031, "grad_norm": 0.1761650680293785, "learning_rate": 1.531144123003984e-05, "loss": 0.087, "step": 7320 }, { "epoch": 1.1719623183777743, "grad_norm": 0.1914806702266616, "learning_rate": 1.5279922516618702e-05, "loss": 0.0866, "step": 7340 }, { "epoch": 1.1751556761935176, "grad_norm": 0.2112719185689836, "learning_rate": 1.5248330907890272e-05, "loss": 0.0867, "step": 7360 }, { "epoch": 1.1783490340092608, "grad_norm": 0.20744289591360074, "learning_rate": 1.5216666840012455e-05, "loss": 0.0848, "step": 7380 }, { "epoch": 1.1815423918250039, "grad_norm": 0.21602516707177483, "learning_rate": 1.5184930750143565e-05, "loss": 0.0889, "step": 7400 }, { "epoch": 1.1847357496407471, "grad_norm": 0.1942180064010259, "learning_rate": 1.515312307643624e-05, "loss": 0.0871, "step": 7420 }, { "epoch": 1.1879291074564904, "grad_norm": 0.1809045891368503, "learning_rate": 1.5121244258031427e-05, "loss": 0.0887, "step": 7440 }, { "epoch": 1.1911224652722336, "grad_norm": 0.21509016663666897, "learning_rate": 1.50892947350523e-05, "loss": 0.0875, "step": 7460 }, { "epoch": 1.194315823087977, "grad_norm": 0.22222425875493532, "learning_rate": 1.5057274948598192e-05, "loss": 0.0904, "step": 7480 }, { "epoch": 1.1975091809037202, "grad_norm": 0.17436626344650585, "learning_rate": 1.5025185340738499e-05, "loss": 0.0869, "step": 7500 }, { "epoch": 1.2007025387194634, "grad_norm": 0.2315956494531892, "learning_rate": 1.4993026354506588e-05, "loss": 0.0893, "step": 7520 }, { "epoch": 1.2038958965352067, "grad_norm": 0.19438867498932094, "learning_rate": 1.4960798433893664e-05, "loss": 0.0898, "step": 7540 }, { "epoch": 1.20708925435095, "grad_norm": 0.21507570120321423, "learning_rate": 1.492850202384266e-05, "loss": 0.0888, "step": 7560 }, { "epoch": 1.2102826121666932, "grad_norm": 0.1756005064132717, "learning_rate": 1.4896137570242068e-05, "loss": 0.0886, "step": 7580 }, { "epoch": 1.2134759699824365, "grad_norm": 0.21082827374254784, "learning_rate": 1.486370551991981e-05, "loss": 0.0877, "step": 7600 }, { "epoch": 1.2166693277981797, "grad_norm": 0.25062287626591706, "learning_rate": 1.483120632063706e-05, "loss": 0.0889, "step": 7620 }, { "epoch": 1.219862685613923, "grad_norm": 0.18123970615998264, "learning_rate": 1.4798640421082047e-05, "loss": 0.0886, "step": 7640 }, { "epoch": 1.2230560434296662, "grad_norm": 0.21468260494577018, "learning_rate": 1.4766008270863883e-05, "loss": 0.0906, "step": 7660 }, { "epoch": 1.2262494012454095, "grad_norm": 0.18876901647341507, "learning_rate": 1.4733310320506343e-05, "loss": 0.0882, "step": 7680 }, { "epoch": 1.2294427590611527, "grad_norm": 0.19790235853542382, "learning_rate": 1.4700547021441642e-05, "loss": 0.0877, "step": 7700 }, { "epoch": 1.232636116876896, "grad_norm": 0.18688689214473558, "learning_rate": 1.4667718826004214e-05, "loss": 0.0882, "step": 7720 }, { "epoch": 1.2358294746926393, "grad_norm": 0.1951758945258833, "learning_rate": 1.463482618742446e-05, "loss": 0.0869, "step": 7740 }, { "epoch": 1.2390228325083825, "grad_norm": 0.19995389074426362, "learning_rate": 1.4601869559822488e-05, "loss": 0.0872, "step": 7760 }, { "epoch": 1.2422161903241258, "grad_norm": 0.2218492641305999, "learning_rate": 1.4568849398201855e-05, "loss": 0.0883, "step": 7780 }, { "epoch": 1.245409548139869, "grad_norm": 0.18443852015389814, "learning_rate": 1.4535766158443265e-05, "loss": 0.087, "step": 7800 }, { "epoch": 1.2486029059556123, "grad_norm": 0.19503753956864983, "learning_rate": 1.45026202972983e-05, "loss": 0.0885, "step": 7820 }, { "epoch": 1.2517962637713556, "grad_norm": 0.19853902671151866, "learning_rate": 1.446941227238309e-05, "loss": 0.0861, "step": 7840 }, { "epoch": 1.2549896215870988, "grad_norm": 0.21865153532249126, "learning_rate": 1.4436142542172009e-05, "loss": 0.0886, "step": 7860 }, { "epoch": 1.258182979402842, "grad_norm": 0.20818634190936489, "learning_rate": 1.4402811565991353e-05, "loss": 0.0889, "step": 7880 }, { "epoch": 1.2613763372185853, "grad_norm": 0.23080624800369903, "learning_rate": 1.436941980401297e-05, "loss": 0.0858, "step": 7900 }, { "epoch": 1.2645696950343286, "grad_norm": 0.19862256058128666, "learning_rate": 1.4335967717247941e-05, "loss": 0.0865, "step": 7920 }, { "epoch": 1.2677630528500718, "grad_norm": 0.18954472715597112, "learning_rate": 1.4302455767540189e-05, "loss": 0.0886, "step": 7940 }, { "epoch": 1.270956410665815, "grad_norm": 0.18922957380652522, "learning_rate": 1.4268884417560119e-05, "loss": 0.0881, "step": 7960 }, { "epoch": 1.2741497684815584, "grad_norm": 0.23661467243107595, "learning_rate": 1.4235254130798213e-05, "loss": 0.0884, "step": 7980 }, { "epoch": 1.2773431262973016, "grad_norm": 0.21028360452170922, "learning_rate": 1.4201565371558657e-05, "loss": 0.0858, "step": 8000 }, { "epoch": 1.2805364841130449, "grad_norm": 0.1857031394163611, "learning_rate": 1.4167818604952906e-05, "loss": 0.0865, "step": 8020 }, { "epoch": 1.2837298419287881, "grad_norm": 0.227255800263239, "learning_rate": 1.4134014296893275e-05, "loss": 0.0884, "step": 8040 }, { "epoch": 1.2869231997445314, "grad_norm": 0.26940362233973403, "learning_rate": 1.4100152914086504e-05, "loss": 0.0845, "step": 8060 }, { "epoch": 1.2901165575602747, "grad_norm": 0.22762705633128913, "learning_rate": 1.4066234924027318e-05, "loss": 0.0863, "step": 8080 }, { "epoch": 1.293309915376018, "grad_norm": 0.24522046661200322, "learning_rate": 1.4032260794991956e-05, "loss": 0.0854, "step": 8100 }, { "epoch": 1.2965032731917612, "grad_norm": 0.17298541823238414, "learning_rate": 1.3998230996031736e-05, "loss": 0.0884, "step": 8120 }, { "epoch": 1.2996966310075044, "grad_norm": 0.21973030306429478, "learning_rate": 1.3964145996966555e-05, "loss": 0.0879, "step": 8140 }, { "epoch": 1.3028899888232477, "grad_norm": 0.18077115808310013, "learning_rate": 1.3930006268378407e-05, "loss": 0.089, "step": 8160 }, { "epoch": 1.306083346638991, "grad_norm": 0.18437205616695954, "learning_rate": 1.3895812281604895e-05, "loss": 0.0887, "step": 8180 }, { "epoch": 1.3092767044547342, "grad_norm": 0.22324698589088907, "learning_rate": 1.386156450873271e-05, "loss": 0.1099, "step": 8200 }, { "epoch": 1.3124700622704775, "grad_norm": 0.1866174959700542, "learning_rate": 1.382726342259113e-05, "loss": 0.0899, "step": 8220 }, { "epoch": 1.3156634200862207, "grad_norm": 0.22011208651394024, "learning_rate": 1.3792909496745475e-05, "loss": 0.0869, "step": 8240 }, { "epoch": 1.318856777901964, "grad_norm": 0.21878645198323823, "learning_rate": 1.3758503205490583e-05, "loss": 0.0859, "step": 8260 }, { "epoch": 1.3220501357177072, "grad_norm": 0.1869477105143079, "learning_rate": 1.3724045023844253e-05, "loss": 0.0898, "step": 8280 }, { "epoch": 1.3252434935334505, "grad_norm": 0.21199782150015953, "learning_rate": 1.3689535427540687e-05, "loss": 0.0861, "step": 8300 }, { "epoch": 1.3284368513491938, "grad_norm": 0.18518093738165986, "learning_rate": 1.3654974893023934e-05, "loss": 0.0908, "step": 8320 }, { "epoch": 1.331630209164937, "grad_norm": 0.18688147397601756, "learning_rate": 1.3620363897441289e-05, "loss": 0.0868, "step": 8340 }, { "epoch": 1.3348235669806803, "grad_norm": 0.2067483479178462, "learning_rate": 1.358570291863673e-05, "loss": 0.0884, "step": 8360 }, { "epoch": 1.3380169247964235, "grad_norm": 0.21329007217550264, "learning_rate": 1.3550992435144304e-05, "loss": 0.086, "step": 8380 }, { "epoch": 1.3412102826121668, "grad_norm": 0.18073209909106028, "learning_rate": 1.3516232926181529e-05, "loss": 0.0868, "step": 8400 }, { "epoch": 1.34440364042791, "grad_norm": 0.23014446893395585, "learning_rate": 1.3481424871642778e-05, "loss": 0.088, "step": 8420 }, { "epoch": 1.3475969982436533, "grad_norm": 0.3028280433486724, "learning_rate": 1.3446568752092643e-05, "loss": 0.0848, "step": 8440 }, { "epoch": 1.3507903560593966, "grad_norm": 0.20888924306544646, "learning_rate": 1.3411665048759313e-05, "loss": 0.0885, "step": 8460 }, { "epoch": 1.3539837138751398, "grad_norm": 0.22324045695426223, "learning_rate": 1.3376714243527925e-05, "loss": 0.0901, "step": 8480 }, { "epoch": 1.357177071690883, "grad_norm": 0.19474459814659545, "learning_rate": 1.3341716818933912e-05, "loss": 0.088, "step": 8500 }, { "epoch": 1.3603704295066263, "grad_norm": 0.22602725655780065, "learning_rate": 1.3306673258156334e-05, "loss": 0.0867, "step": 8520 }, { "epoch": 1.3635637873223696, "grad_norm": 0.23360209320607728, "learning_rate": 1.3271584045011217e-05, "loss": 0.0886, "step": 8540 }, { "epoch": 1.3667571451381129, "grad_norm": 0.1873427703628018, "learning_rate": 1.3236449663944875e-05, "loss": 0.0866, "step": 8560 }, { "epoch": 1.369950502953856, "grad_norm": 0.1985433902478951, "learning_rate": 1.3201270600027208e-05, "loss": 0.0876, "step": 8580 }, { "epoch": 1.3731438607695992, "grad_norm": 0.18896595210872472, "learning_rate": 1.3166047338945019e-05, "loss": 0.0861, "step": 8600 }, { "epoch": 1.3763372185853424, "grad_norm": 0.22957720239257226, "learning_rate": 1.3130780366995297e-05, "loss": 0.0853, "step": 8620 }, { "epoch": 1.3795305764010857, "grad_norm": 0.1933824287848287, "learning_rate": 1.3095470171078512e-05, "loss": 0.0867, "step": 8640 }, { "epoch": 1.382723934216829, "grad_norm": 0.22324019535172776, "learning_rate": 1.3060117238691894e-05, "loss": 0.085, "step": 8660 }, { "epoch": 1.3859172920325722, "grad_norm": 0.2316030267030887, "learning_rate": 1.3024722057922696e-05, "loss": 0.0841, "step": 8680 }, { "epoch": 1.3891106498483154, "grad_norm": 0.1973247421696361, "learning_rate": 1.2989285117441452e-05, "loss": 0.0878, "step": 8700 }, { "epoch": 1.3923040076640587, "grad_norm": 0.2080002656842217, "learning_rate": 1.2953806906495244e-05, "loss": 0.0883, "step": 8720 }, { "epoch": 1.395497365479802, "grad_norm": 0.18517780070734782, "learning_rate": 1.2918287914900933e-05, "loss": 0.0852, "step": 8740 }, { "epoch": 1.3986907232955452, "grad_norm": 0.19697224463698385, "learning_rate": 1.2882728633038406e-05, "loss": 0.0855, "step": 8760 }, { "epoch": 1.4018840811112885, "grad_norm": 0.19736259450538857, "learning_rate": 1.2847129551843807e-05, "loss": 0.0876, "step": 8780 }, { "epoch": 1.4050774389270317, "grad_norm": 0.18942542996017805, "learning_rate": 1.2811491162802744e-05, "loss": 0.0884, "step": 8800 }, { "epoch": 1.408270796742775, "grad_norm": 0.19254196108878727, "learning_rate": 1.277581395794353e-05, "loss": 0.088, "step": 8820 }, { "epoch": 1.4114641545585183, "grad_norm": 0.24282865106690285, "learning_rate": 1.2740098429830357e-05, "loss": 0.0891, "step": 8840 }, { "epoch": 1.4146575123742615, "grad_norm": 0.23984915406072307, "learning_rate": 1.2704345071556525e-05, "loss": 0.0886, "step": 8860 }, { "epoch": 1.4178508701900048, "grad_norm": 0.2184606228075661, "learning_rate": 1.2668554376737619e-05, "loss": 0.087, "step": 8880 }, { "epoch": 1.421044228005748, "grad_norm": 0.19798737334853378, "learning_rate": 1.2632726839504693e-05, "loss": 0.0875, "step": 8900 }, { "epoch": 1.4242375858214913, "grad_norm": 0.23442081669151446, "learning_rate": 1.2596862954497458e-05, "loss": 0.0849, "step": 8920 }, { "epoch": 1.4274309436372346, "grad_norm": 0.21286909537115775, "learning_rate": 1.2560963216857447e-05, "loss": 0.0845, "step": 8940 }, { "epoch": 1.4306243014529778, "grad_norm": 0.19037684375350825, "learning_rate": 1.2525028122221172e-05, "loss": 0.0857, "step": 8960 }, { "epoch": 1.433817659268721, "grad_norm": 0.18725372186680364, "learning_rate": 1.24890581667133e-05, "loss": 0.0875, "step": 8980 }, { "epoch": 1.4370110170844643, "grad_norm": 0.20844623553872596, "learning_rate": 1.2453053846939783e-05, "loss": 0.0898, "step": 9000 }, { "epoch": 1.4402043749002076, "grad_norm": 0.21140506066201004, "learning_rate": 1.2417015659981007e-05, "loss": 0.0883, "step": 9020 }, { "epoch": 1.4433977327159508, "grad_norm": 0.2064339774677841, "learning_rate": 1.2380944103384946e-05, "loss": 0.0849, "step": 9040 }, { "epoch": 1.446591090531694, "grad_norm": 0.17652746458033255, "learning_rate": 1.2344839675160271e-05, "loss": 0.0867, "step": 9060 }, { "epoch": 1.4497844483474374, "grad_norm": 0.19101046403484023, "learning_rate": 1.2308702873769486e-05, "loss": 0.0865, "step": 9080 }, { "epoch": 1.4529778061631806, "grad_norm": 0.19778410360898788, "learning_rate": 1.227253419812204e-05, "loss": 0.0876, "step": 9100 }, { "epoch": 1.4561711639789239, "grad_norm": 0.1884773288145621, "learning_rate": 1.2236334147567442e-05, "loss": 0.0873, "step": 9120 }, { "epoch": 1.4593645217946671, "grad_norm": 0.22741564087867433, "learning_rate": 1.2200103221888365e-05, "loss": 0.0842, "step": 9140 }, { "epoch": 1.4625578796104104, "grad_norm": 0.19382271044214394, "learning_rate": 1.2163841921293761e-05, "loss": 0.0846, "step": 9160 }, { "epoch": 1.4657512374261537, "grad_norm": 0.2225438873976966, "learning_rate": 1.2127550746411932e-05, "loss": 0.086, "step": 9180 }, { "epoch": 1.468944595241897, "grad_norm": 0.20309796630710175, "learning_rate": 1.2091230198283626e-05, "loss": 0.0872, "step": 9200 }, { "epoch": 1.4721379530576402, "grad_norm": 0.21309103603253518, "learning_rate": 1.2054880778355122e-05, "loss": 0.0856, "step": 9220 }, { "epoch": 1.4753313108733834, "grad_norm": 0.20007800804028458, "learning_rate": 1.201850298847132e-05, "loss": 0.0843, "step": 9240 }, { "epoch": 1.4785246686891267, "grad_norm": 0.22102981325152446, "learning_rate": 1.198209733086878e-05, "loss": 0.0865, "step": 9260 }, { "epoch": 1.48171802650487, "grad_norm": 0.2509432577302147, "learning_rate": 1.194566430816882e-05, "loss": 0.0872, "step": 9280 }, { "epoch": 1.4849113843206132, "grad_norm": 0.21078643240774367, "learning_rate": 1.1909204423370564e-05, "loss": 0.0856, "step": 9300 }, { "epoch": 1.4881047421363562, "grad_norm": 0.22252302888210984, "learning_rate": 1.1872718179843994e-05, "loss": 0.0838, "step": 9320 }, { "epoch": 1.4912980999520995, "grad_norm": 0.18987560853570382, "learning_rate": 1.1836206081323003e-05, "loss": 0.085, "step": 9340 }, { "epoch": 1.4944914577678428, "grad_norm": 0.19549774907184778, "learning_rate": 1.1799668631898445e-05, "loss": 0.0877, "step": 9360 }, { "epoch": 1.497684815583586, "grad_norm": 0.19228104758868642, "learning_rate": 1.176310633601117e-05, "loss": 0.0956, "step": 9380 }, { "epoch": 1.5008781733993293, "grad_norm": 0.20819820045783494, "learning_rate": 1.1726519698445056e-05, "loss": 0.0867, "step": 9400 }, { "epoch": 1.5040715312150725, "grad_norm": 0.20733767509582143, "learning_rate": 1.1689909224320062e-05, "loss": 0.0863, "step": 9420 }, { "epoch": 1.5072648890308158, "grad_norm": 0.20925265086202188, "learning_rate": 1.165327541908522e-05, "loss": 0.0861, "step": 9440 }, { "epoch": 1.510458246846559, "grad_norm": 0.18493554321077676, "learning_rate": 1.1616618788511684e-05, "loss": 0.0849, "step": 9460 }, { "epoch": 1.5136516046623023, "grad_norm": 0.18797864341732143, "learning_rate": 1.1579939838685731e-05, "loss": 0.085, "step": 9480 }, { "epoch": 1.5168449624780456, "grad_norm": 0.2242101441050116, "learning_rate": 1.154323907600179e-05, "loss": 0.0867, "step": 9500 }, { "epoch": 1.5200383202937888, "grad_norm": 0.17084103768025352, "learning_rate": 1.1506517007155432e-05, "loss": 0.0838, "step": 9520 }, { "epoch": 1.523231678109532, "grad_norm": 0.18934207218377755, "learning_rate": 1.1469774139136389e-05, "loss": 0.0857, "step": 9540 }, { "epoch": 1.5264250359252753, "grad_norm": 0.2265706734312821, "learning_rate": 1.1433010979221545e-05, "loss": 0.0866, "step": 9560 }, { "epoch": 1.5296183937410186, "grad_norm": 0.22302910930406783, "learning_rate": 1.1396228034967942e-05, "loss": 0.0841, "step": 9580 }, { "epoch": 1.5328117515567619, "grad_norm": 0.20180278303765992, "learning_rate": 1.1359425814205767e-05, "loss": 0.0863, "step": 9600 }, { "epoch": 1.5360051093725051, "grad_norm": 0.22800639526769467, "learning_rate": 1.132260482503133e-05, "loss": 0.0873, "step": 9620 }, { "epoch": 1.5391984671882484, "grad_norm": 0.21277101714684102, "learning_rate": 1.1285765575800076e-05, "loss": 0.0874, "step": 9640 }, { "epoch": 1.5423918250039916, "grad_norm": 0.18816604414097163, "learning_rate": 1.1248908575119539e-05, "loss": 0.0862, "step": 9660 }, { "epoch": 1.545585182819735, "grad_norm": 0.20138026843291984, "learning_rate": 1.1212034331842338e-05, "loss": 0.0856, "step": 9680 }, { "epoch": 1.5487785406354782, "grad_norm": 0.18862474943057217, "learning_rate": 1.1175143355059144e-05, "loss": 0.085, "step": 9700 }, { "epoch": 1.5519718984512214, "grad_norm": 0.18561382698856643, "learning_rate": 1.1138236154091656e-05, "loss": 0.0852, "step": 9720 }, { "epoch": 1.5551652562669647, "grad_norm": 0.18884644793283215, "learning_rate": 1.1101313238485552e-05, "loss": 0.0839, "step": 9740 }, { "epoch": 1.558358614082708, "grad_norm": 0.17345642894126198, "learning_rate": 1.1064375118003487e-05, "loss": 0.0844, "step": 9760 }, { "epoch": 1.5615519718984512, "grad_norm": 0.1991026940192444, "learning_rate": 1.1027422302618032e-05, "loss": 0.0846, "step": 9780 }, { "epoch": 1.5647453297141944, "grad_norm": 0.22201127220587602, "learning_rate": 1.099045530250463e-05, "loss": 0.0823, "step": 9800 }, { "epoch": 1.5679386875299377, "grad_norm": 0.23679974637337212, "learning_rate": 1.0953474628034562e-05, "loss": 0.087, "step": 9820 }, { "epoch": 1.571132045345681, "grad_norm": 0.18945453405405135, "learning_rate": 1.0916480789767907e-05, "loss": 0.0861, "step": 9840 }, { "epoch": 1.5743254031614242, "grad_norm": 0.18943349755537386, "learning_rate": 1.0879474298446479e-05, "loss": 0.0831, "step": 9860 }, { "epoch": 1.5775187609771675, "grad_norm": 0.20905996320818215, "learning_rate": 1.0842455664986782e-05, "loss": 0.0858, "step": 9880 }, { "epoch": 1.5807121187929107, "grad_norm": 0.1863924849638652, "learning_rate": 1.0805425400472956e-05, "loss": 0.0856, "step": 9900 }, { "epoch": 1.583905476608654, "grad_norm": 0.2091352813903984, "learning_rate": 1.076838401614972e-05, "loss": 0.0857, "step": 9920 }, { "epoch": 1.5870988344243973, "grad_norm": 0.24371561015345014, "learning_rate": 1.0731332023415319e-05, "loss": 0.089, "step": 9940 }, { "epoch": 1.5902921922401405, "grad_norm": 0.2128926213672918, "learning_rate": 1.0694269933814456e-05, "loss": 0.084, "step": 9960 }, { "epoch": 1.5934855500558838, "grad_norm": 0.21916373322291655, "learning_rate": 1.0657198259031232e-05, "loss": 0.0826, "step": 9980 }, { "epoch": 1.596678907871627, "grad_norm": 0.1824350216961259, "learning_rate": 1.0620117510882083e-05, "loss": 0.0864, "step": 10000 }, { "epoch": 1.5998722656873703, "grad_norm": 0.17881824213547054, "learning_rate": 1.058302820130871e-05, "loss": 0.0839, "step": 10020 }, { "epoch": 1.6030656235031135, "grad_norm": 0.19729212364378013, "learning_rate": 1.0545930842371022e-05, "loss": 0.0854, "step": 10040 }, { "epoch": 1.6062589813188568, "grad_norm": 0.2087951067289451, "learning_rate": 1.0508825946240053e-05, "loss": 0.085, "step": 10060 }, { "epoch": 1.6094523391346, "grad_norm": 0.19718155636666373, "learning_rate": 1.0471714025190897e-05, "loss": 0.0856, "step": 10080 }, { "epoch": 1.6126456969503433, "grad_norm": 0.20228614287118912, "learning_rate": 1.0434595591595635e-05, "loss": 0.0853, "step": 10100 }, { "epoch": 1.6158390547660866, "grad_norm": 0.18693736149298812, "learning_rate": 1.0397471157916263e-05, "loss": 0.0849, "step": 10120 }, { "epoch": 1.6190324125818298, "grad_norm": 0.191886465159157, "learning_rate": 1.0360341236697611e-05, "loss": 0.0838, "step": 10140 }, { "epoch": 1.622225770397573, "grad_norm": 0.20503489329364863, "learning_rate": 1.0323206340560275e-05, "loss": 0.0856, "step": 10160 }, { "epoch": 1.6254191282133164, "grad_norm": 0.24260576208421464, "learning_rate": 1.028606698219353e-05, "loss": 0.0865, "step": 10180 }, { "epoch": 1.6286124860290596, "grad_norm": 0.22639324906871056, "learning_rate": 1.0248923674348268e-05, "loss": 0.0859, "step": 10200 }, { "epoch": 1.6318058438448029, "grad_norm": 0.176153514574258, "learning_rate": 1.0211776929829893e-05, "loss": 0.0867, "step": 10220 }, { "epoch": 1.6349992016605461, "grad_norm": 0.1877599319113198, "learning_rate": 1.0174627261491268e-05, "loss": 0.0829, "step": 10240 }, { "epoch": 1.6381925594762894, "grad_norm": 0.19998890698860952, "learning_rate": 1.0137475182225617e-05, "loss": 0.0841, "step": 10260 }, { "epoch": 1.6413859172920326, "grad_norm": 0.21610758072730218, "learning_rate": 1.0100321204959449e-05, "loss": 0.0841, "step": 10280 }, { "epoch": 1.644579275107776, "grad_norm": 0.18193308572064754, "learning_rate": 1.0063165842645484e-05, "loss": 0.0849, "step": 10300 }, { "epoch": 1.6477726329235192, "grad_norm": 0.20248453381225345, "learning_rate": 1.0026009608255555e-05, "loss": 0.0845, "step": 10320 }, { "epoch": 1.6509659907392624, "grad_norm": 0.2320711692298343, "learning_rate": 9.988853014773542e-06, "loss": 0.0852, "step": 10340 }, { "epoch": 1.6541593485550057, "grad_norm": 0.19584194025576318, "learning_rate": 9.951696575188278e-06, "loss": 0.085, "step": 10360 }, { "epoch": 1.657352706370749, "grad_norm": 0.21553283351451755, "learning_rate": 9.914540802486474e-06, "loss": 0.0856, "step": 10380 }, { "epoch": 1.6605460641864922, "grad_norm": 0.18489531692862257, "learning_rate": 9.877386209645633e-06, "loss": 0.0858, "step": 10400 }, { "epoch": 1.6637394220022355, "grad_norm": 0.20763123012361048, "learning_rate": 9.84023330962697e-06, "loss": 0.0852, "step": 10420 }, { "epoch": 1.6669327798179787, "grad_norm": 0.19596129486655178, "learning_rate": 9.803082615368323e-06, "loss": 0.0835, "step": 10440 }, { "epoch": 1.670126137633722, "grad_norm": 0.18532596629268455, "learning_rate": 9.765934639777087e-06, "loss": 0.0841, "step": 10460 }, { "epoch": 1.6733194954494652, "grad_norm": 0.17232132477688097, "learning_rate": 9.728789895723109e-06, "loss": 0.0835, "step": 10480 }, { "epoch": 1.6765128532652085, "grad_norm": 0.19279722790694026, "learning_rate": 9.691648896031642e-06, "loss": 0.0877, "step": 10500 }, { "epoch": 1.6797062110809517, "grad_norm": 0.20508136042678382, "learning_rate": 9.65451215347622e-06, "loss": 0.0849, "step": 10520 }, { "epoch": 1.682899568896695, "grad_norm": 0.2030859465238483, "learning_rate": 9.61738018077162e-06, "loss": 0.0828, "step": 10540 }, { "epoch": 1.6860929267124383, "grad_norm": 0.21547506058080174, "learning_rate": 9.580253490566753e-06, "loss": 0.0837, "step": 10560 }, { "epoch": 1.6892862845281815, "grad_norm": 0.22700049094169877, "learning_rate": 9.543132595437612e-06, "loss": 0.0849, "step": 10580 }, { "epoch": 1.6924796423439248, "grad_norm": 0.19256501341459278, "learning_rate": 9.506018007880169e-06, "loss": 0.0845, "step": 10600 }, { "epoch": 1.695673000159668, "grad_norm": 0.20795374910309583, "learning_rate": 9.468910240303324e-06, "loss": 0.0819, "step": 10620 }, { "epoch": 1.6988663579754113, "grad_norm": 0.19009887752439592, "learning_rate": 9.431809805021815e-06, "loss": 0.0816, "step": 10640 }, { "epoch": 1.7020597157911546, "grad_norm": 0.19799389204125842, "learning_rate": 9.394717214249147e-06, "loss": 0.0851, "step": 10660 }, { "epoch": 1.7052530736068978, "grad_norm": 0.2271148145004972, "learning_rate": 9.357632980090528e-06, "loss": 0.0852, "step": 10680 }, { "epoch": 1.7084464314226409, "grad_norm": 0.2344519009086231, "learning_rate": 9.320557614535787e-06, "loss": 0.0831, "step": 10700 }, { "epoch": 1.7116397892383841, "grad_norm": 0.23925944266837562, "learning_rate": 9.283491629452315e-06, "loss": 0.0853, "step": 10720 }, { "epoch": 1.7148331470541274, "grad_norm": 0.20081263527645807, "learning_rate": 9.246435536577999e-06, "loss": 0.085, "step": 10740 }, { "epoch": 1.7180265048698706, "grad_norm": 0.20700627503253236, "learning_rate": 9.20938984751415e-06, "loss": 0.0851, "step": 10760 }, { "epoch": 1.7212198626856139, "grad_norm": 0.201866541369534, "learning_rate": 9.172355073718439e-06, "loss": 0.0842, "step": 10780 }, { "epoch": 1.7244132205013571, "grad_norm": 0.20280888722283474, "learning_rate": 9.135331726497843e-06, "loss": 0.0822, "step": 10800 }, { "epoch": 1.7276065783171004, "grad_norm": 0.19344393506408358, "learning_rate": 9.09832031700158e-06, "loss": 0.0828, "step": 10820 }, { "epoch": 1.7307999361328437, "grad_norm": 0.16624847494447237, "learning_rate": 9.06132135621406e-06, "loss": 0.0829, "step": 10840 }, { "epoch": 1.733993293948587, "grad_norm": 0.21724180904413368, "learning_rate": 9.024335354947812e-06, "loss": 0.0838, "step": 10860 }, { "epoch": 1.7371866517643302, "grad_norm": 0.23846515088949718, "learning_rate": 8.987362823836461e-06, "loss": 0.0852, "step": 10880 }, { "epoch": 1.7403800095800734, "grad_norm": 0.20925184991512286, "learning_rate": 8.950404273327646e-06, "loss": 0.0834, "step": 10900 }, { "epoch": 1.7435733673958167, "grad_norm": 0.1716514985288543, "learning_rate": 8.913460213675998e-06, "loss": 0.0836, "step": 10920 }, { "epoch": 1.74676672521156, "grad_norm": 0.20494704853492865, "learning_rate": 8.876531154936084e-06, "loss": 0.0817, "step": 10940 }, { "epoch": 1.7499600830273032, "grad_norm": 0.24261768980108364, "learning_rate": 8.839617606955355e-06, "loss": 0.0842, "step": 10960 }, { "epoch": 1.7531534408430465, "grad_norm": 0.2090462161591236, "learning_rate": 8.802720079367136e-06, "loss": 0.0828, "step": 10980 }, { "epoch": 1.7563467986587897, "grad_norm": 0.18696550737565137, "learning_rate": 8.765839081583564e-06, "loss": 0.082, "step": 11000 }, { "epoch": 1.759540156474533, "grad_norm": 0.19121809093030445, "learning_rate": 8.72897512278856e-06, "loss": 0.0848, "step": 11020 }, { "epoch": 1.7627335142902762, "grad_norm": 0.2102957076954447, "learning_rate": 8.692128711930805e-06, "loss": 0.084, "step": 11040 }, { "epoch": 1.7659268721060195, "grad_norm": 0.20622666368626175, "learning_rate": 8.655300357716716e-06, "loss": 0.0845, "step": 11060 }, { "epoch": 1.7691202299217628, "grad_norm": 0.2091346262679367, "learning_rate": 8.618490568603409e-06, "loss": 0.0821, "step": 11080 }, { "epoch": 1.772313587737506, "grad_norm": 0.18255707260911555, "learning_rate": 8.581699852791696e-06, "loss": 0.0824, "step": 11100 }, { "epoch": 1.7755069455532493, "grad_norm": 0.2201418888200012, "learning_rate": 8.54492871821905e-06, "loss": 0.0836, "step": 11120 }, { "epoch": 1.7787003033689925, "grad_norm": 0.1875915274082898, "learning_rate": 8.508177672552617e-06, "loss": 0.0842, "step": 11140 }, { "epoch": 1.7818936611847358, "grad_norm": 0.19600313792607987, "learning_rate": 8.471447223182179e-06, "loss": 0.0836, "step": 11160 }, { "epoch": 1.785087019000479, "grad_norm": 0.19719362419525954, "learning_rate": 8.434737877213172e-06, "loss": 0.0856, "step": 11180 }, { "epoch": 1.788280376816222, "grad_norm": 0.17156639629201742, "learning_rate": 8.398050141459674e-06, "loss": 0.0819, "step": 11200 }, { "epoch": 1.7914737346319654, "grad_norm": 0.2039792577946715, "learning_rate": 8.361384522437402e-06, "loss": 0.0827, "step": 11220 }, { "epoch": 1.7946670924477086, "grad_norm": 0.19179744785660258, "learning_rate": 8.324741526356738e-06, "loss": 0.0826, "step": 11240 }, { "epoch": 1.7978604502634519, "grad_norm": 0.18215189474588084, "learning_rate": 8.288121659115727e-06, "loss": 0.0819, "step": 11260 }, { "epoch": 1.8010538080791951, "grad_norm": 0.1644377928850563, "learning_rate": 8.251525426293084e-06, "loss": 0.0827, "step": 11280 }, { "epoch": 1.8042471658949384, "grad_norm": 0.21222246533392128, "learning_rate": 8.21495333314123e-06, "loss": 0.0843, "step": 11300 }, { "epoch": 1.8074405237106816, "grad_norm": 0.25181863269369087, "learning_rate": 8.178405884579317e-06, "loss": 0.0842, "step": 11320 }, { "epoch": 1.810633881526425, "grad_norm": 0.2109399815982731, "learning_rate": 8.141883585186241e-06, "loss": 0.0829, "step": 11340 }, { "epoch": 1.8138272393421682, "grad_norm": 0.18073042845539122, "learning_rate": 8.10538693919369e-06, "loss": 0.0834, "step": 11360 }, { "epoch": 1.8170205971579114, "grad_norm": 0.20526943895282074, "learning_rate": 8.068916450479174e-06, "loss": 0.081, "step": 11380 }, { "epoch": 1.8202139549736547, "grad_norm": 0.19361555670993416, "learning_rate": 8.03247262255908e-06, "loss": 0.0836, "step": 11400 }, { "epoch": 1.823407312789398, "grad_norm": 0.24389934893406925, "learning_rate": 7.996055958581703e-06, "loss": 0.0828, "step": 11420 }, { "epoch": 1.8266006706051412, "grad_norm": 0.1877153126969613, "learning_rate": 7.959666961320314e-06, "loss": 0.0823, "step": 11440 }, { "epoch": 1.8297940284208845, "grad_norm": 0.19815842442257633, "learning_rate": 7.923306133166218e-06, "loss": 0.0827, "step": 11460 }, { "epoch": 1.8329873862366277, "grad_norm": 0.21678547999171613, "learning_rate": 7.886973976121797e-06, "loss": 0.0821, "step": 11480 }, { "epoch": 1.836180744052371, "grad_norm": 0.21618607294885436, "learning_rate": 7.850670991793621e-06, "loss": 0.0847, "step": 11500 }, { "epoch": 1.8393741018681142, "grad_norm": 0.1704593983368394, "learning_rate": 7.81439768138548e-06, "loss": 0.082, "step": 11520 }, { "epoch": 1.8425674596838575, "grad_norm": 0.18606341720829214, "learning_rate": 7.778154545691481e-06, "loss": 0.0812, "step": 11540 }, { "epoch": 1.8457608174996007, "grad_norm": 0.21208825422427718, "learning_rate": 7.741942085089146e-06, "loss": 0.083, "step": 11560 }, { "epoch": 1.848954175315344, "grad_norm": 0.18782574055868467, "learning_rate": 7.705760799532485e-06, "loss": 0.0828, "step": 11580 }, { "epoch": 1.8521475331310873, "grad_norm": 0.19574167645932028, "learning_rate": 7.669611188545103e-06, "loss": 0.083, "step": 11600 }, { "epoch": 1.8553408909468305, "grad_norm": 0.2065298678199762, "learning_rate": 7.6334937512133e-06, "loss": 0.0825, "step": 11620 }, { "epoch": 1.8585342487625738, "grad_norm": 0.1977503317300438, "learning_rate": 7.597408986179184e-06, "loss": 0.0806, "step": 11640 }, { "epoch": 1.861727606578317, "grad_norm": 0.20586182397186595, "learning_rate": 7.561357391633789e-06, "loss": 0.0824, "step": 11660 }, { "epoch": 1.8649209643940603, "grad_norm": 0.21998998145214102, "learning_rate": 7.525339465310183e-06, "loss": 0.0838, "step": 11680 }, { "epoch": 1.8681143222098036, "grad_norm": 0.24487809053970366, "learning_rate": 7.4893557044766145e-06, "loss": 0.0821, "step": 11700 }, { "epoch": 1.8713076800255468, "grad_norm": 0.18687218223534408, "learning_rate": 7.453406605929637e-06, "loss": 0.0806, "step": 11720 }, { "epoch": 1.87450103784129, "grad_norm": 0.17318503959159254, "learning_rate": 7.417492665987247e-06, "loss": 0.0819, "step": 11740 }, { "epoch": 1.8776943956570333, "grad_norm": 0.18945197729794094, "learning_rate": 7.3816143804820454e-06, "loss": 0.0835, "step": 11760 }, { "epoch": 1.8808877534727766, "grad_norm": 0.20142501192350587, "learning_rate": 7.345772244754377e-06, "loss": 0.0844, "step": 11780 }, { "epoch": 1.8840811112885198, "grad_norm": 0.20568732816869706, "learning_rate": 7.309966753645496e-06, "loss": 0.0801, "step": 11800 }, { "epoch": 1.887274469104263, "grad_norm": 0.20182816399217324, "learning_rate": 7.274198401490744e-06, "loss": 0.0846, "step": 11820 }, { "epoch": 1.8904678269200064, "grad_norm": 0.20018924573509358, "learning_rate": 7.2384676821127135e-06, "loss": 0.0798, "step": 11840 }, { "epoch": 1.8936611847357496, "grad_norm": 0.28199792560782483, "learning_rate": 7.202775088814429e-06, "loss": 0.0815, "step": 11860 }, { "epoch": 1.8968545425514929, "grad_norm": 0.22764478972933266, "learning_rate": 7.1671211143725485e-06, "loss": 0.0815, "step": 11880 }, { "epoch": 1.9000479003672361, "grad_norm": 0.1981593984765646, "learning_rate": 7.131506251030547e-06, "loss": 0.0809, "step": 11900 }, { "epoch": 1.9032412581829794, "grad_norm": 0.20992169378762218, "learning_rate": 7.095930990491933e-06, "loss": 0.0809, "step": 11920 }, { "epoch": 1.9064346159987227, "grad_norm": 0.19005910859773092, "learning_rate": 7.060395823913447e-06, "loss": 0.0842, "step": 11940 }, { "epoch": 1.909627973814466, "grad_norm": 0.19205175219083725, "learning_rate": 7.024901241898292e-06, "loss": 0.0819, "step": 11960 }, { "epoch": 1.9128213316302092, "grad_norm": 0.20008872943717196, "learning_rate": 6.9894477344893505e-06, "loss": 0.0819, "step": 11980 }, { "epoch": 1.9160146894459524, "grad_norm": 0.1773872749793287, "learning_rate": 6.9540357911624336e-06, "loss": 0.0823, "step": 12000 }, { "epoch": 1.9192080472616957, "grad_norm": 0.19417086960624413, "learning_rate": 6.918665900819497e-06, "loss": 0.0791, "step": 12020 }, { "epoch": 1.922401405077439, "grad_norm": 0.1814650138072353, "learning_rate": 6.883338551781923e-06, "loss": 0.0811, "step": 12040 }, { "epoch": 1.9255947628931822, "grad_norm": 0.1702657944804681, "learning_rate": 6.8480542317837505e-06, "loss": 0.0803, "step": 12060 }, { "epoch": 1.9287881207089255, "grad_norm": 0.18416550882743182, "learning_rate": 6.812813427964963e-06, "loss": 0.081, "step": 12080 }, { "epoch": 1.9319814785246687, "grad_norm": 0.21054620503327667, "learning_rate": 6.77761662686475e-06, "loss": 0.0837, "step": 12100 }, { "epoch": 1.935174836340412, "grad_norm": 0.1788773690242681, "learning_rate": 6.742464314414791e-06, "loss": 0.0809, "step": 12120 }, { "epoch": 1.9383681941561552, "grad_norm": 0.19629223674022553, "learning_rate": 6.707356975932559e-06, "loss": 0.0821, "step": 12140 }, { "epoch": 1.9415615519718985, "grad_norm": 0.17739114236704748, "learning_rate": 6.672295096114597e-06, "loss": 0.0816, "step": 12160 }, { "epoch": 1.9447549097876418, "grad_norm": 0.20468934483234205, "learning_rate": 6.637279159029851e-06, "loss": 0.0827, "step": 12180 }, { "epoch": 1.947948267603385, "grad_norm": 0.16608032221866548, "learning_rate": 6.602309648112968e-06, "loss": 0.0792, "step": 12200 }, { "epoch": 1.9511416254191283, "grad_norm": 0.1759677545684069, "learning_rate": 6.567387046157632e-06, "loss": 0.0785, "step": 12220 }, { "epoch": 1.9543349832348715, "grad_norm": 0.18405948214393053, "learning_rate": 6.532511835309896e-06, "loss": 0.0822, "step": 12240 }, { "epoch": 1.9575283410506148, "grad_norm": 0.2012173937759783, "learning_rate": 6.497684497061531e-06, "loss": 0.0818, "step": 12260 }, { "epoch": 1.960721698866358, "grad_norm": 0.2057906504416338, "learning_rate": 6.462905512243359e-06, "loss": 0.0806, "step": 12280 }, { "epoch": 1.9639150566821013, "grad_norm": 0.20687177701805626, "learning_rate": 6.428175361018643e-06, "loss": 0.0794, "step": 12300 }, { "epoch": 1.9671084144978446, "grad_norm": 0.2064196549144857, "learning_rate": 6.393494522876428e-06, "loss": 0.0816, "step": 12320 }, { "epoch": 1.9703017723135878, "grad_norm": 0.2133102540844893, "learning_rate": 6.358863476624948e-06, "loss": 0.0821, "step": 12340 }, { "epoch": 1.973495130129331, "grad_norm": 0.18497415279048168, "learning_rate": 6.324282700385e-06, "loss": 0.0824, "step": 12360 }, { "epoch": 1.9766884879450743, "grad_norm": 0.19520821054839646, "learning_rate": 6.289752671583344e-06, "loss": 0.0792, "step": 12380 }, { "epoch": 1.9798818457608176, "grad_norm": 0.18726221094986775, "learning_rate": 6.255273866946119e-06, "loss": 0.0799, "step": 12400 }, { "epoch": 1.9830752035765609, "grad_norm": 0.19525199269461027, "learning_rate": 6.22084676249225e-06, "loss": 0.0796, "step": 12420 }, { "epoch": 1.9862685613923041, "grad_norm": 0.16345775381577554, "learning_rate": 6.186471833526888e-06, "loss": 0.082, "step": 12440 }, { "epoch": 1.9894619192080474, "grad_norm": 0.1972221294843483, "learning_rate": 6.15214955463484e-06, "loss": 0.0787, "step": 12460 }, { "epoch": 1.9926552770237906, "grad_norm": 0.1935374722805669, "learning_rate": 6.117880399674016e-06, "loss": 0.0827, "step": 12480 }, { "epoch": 1.995848634839534, "grad_norm": 0.18315518408993714, "learning_rate": 6.083664841768901e-06, "loss": 0.0816, "step": 12500 }, { "epoch": 1.9990419926552772, "grad_norm": 0.16860052008855017, "learning_rate": 6.049503353304e-06, "loss": 0.0844, "step": 12520 }, { "epoch": 2.0022353504710204, "grad_norm": 0.18498027675472176, "learning_rate": 6.015396405917333e-06, "loss": 0.061, "step": 12540 }, { "epoch": 2.0054287082867637, "grad_norm": 0.20247862079416473, "learning_rate": 5.98134447049392e-06, "loss": 0.0494, "step": 12560 }, { "epoch": 2.008622066102507, "grad_norm": 0.17717972255777836, "learning_rate": 5.947348017159272e-06, "loss": 0.0496, "step": 12580 }, { "epoch": 2.01181542391825, "grad_norm": 0.17560899509079128, "learning_rate": 5.913407515272918e-06, "loss": 0.0484, "step": 12600 }, { "epoch": 2.0150087817339934, "grad_norm": 0.2107019559801837, "learning_rate": 5.879523433421903e-06, "loss": 0.0455, "step": 12620 }, { "epoch": 2.0182021395497367, "grad_norm": 0.17228228604398835, "learning_rate": 5.845696239414336e-06, "loss": 0.0481, "step": 12640 }, { "epoch": 2.02139549736548, "grad_norm": 0.16576058508327604, "learning_rate": 5.8119264002729244e-06, "loss": 0.0484, "step": 12660 }, { "epoch": 2.0245888551812232, "grad_norm": 0.17885300287909717, "learning_rate": 5.778214382228524e-06, "loss": 0.047, "step": 12680 }, { "epoch": 2.0277822129969665, "grad_norm": 0.20671449403256986, "learning_rate": 5.744560650713704e-06, "loss": 0.0471, "step": 12700 }, { "epoch": 2.0309755708127097, "grad_norm": 0.20083359478447635, "learning_rate": 5.710965670356332e-06, "loss": 0.0479, "step": 12720 }, { "epoch": 2.034168928628453, "grad_norm": 0.18961936533749266, "learning_rate": 5.6774299049731325e-06, "loss": 0.0478, "step": 12740 }, { "epoch": 2.0373622864441963, "grad_norm": 0.21979140727547378, "learning_rate": 5.643953817563318e-06, "loss": 0.0453, "step": 12760 }, { "epoch": 2.0405556442599395, "grad_norm": 0.16165099720000836, "learning_rate": 5.610537870302164e-06, "loss": 0.0476, "step": 12780 }, { "epoch": 2.0437490020756828, "grad_norm": 0.18343428699528758, "learning_rate": 5.577182524534657e-06, "loss": 0.0478, "step": 12800 }, { "epoch": 2.046942359891426, "grad_norm": 0.17215552651589366, "learning_rate": 5.5438882407691e-06, "loss": 0.0472, "step": 12820 }, { "epoch": 2.0501357177071693, "grad_norm": 0.1624976046442029, "learning_rate": 5.510655478670769e-06, "loss": 0.0478, "step": 12840 }, { "epoch": 2.0533290755229126, "grad_norm": 0.22026015940397797, "learning_rate": 5.4774846970555615e-06, "loss": 0.0461, "step": 12860 }, { "epoch": 2.056522433338656, "grad_norm": 0.17519613837123435, "learning_rate": 5.444376353883678e-06, "loss": 0.0462, "step": 12880 }, { "epoch": 2.059715791154399, "grad_norm": 0.18277575133361915, "learning_rate": 5.411330906253269e-06, "loss": 0.0455, "step": 12900 }, { "epoch": 2.0629091489701423, "grad_norm": 0.18787731365044255, "learning_rate": 5.378348810394143e-06, "loss": 0.0462, "step": 12920 }, { "epoch": 2.066102506785885, "grad_norm": 0.18201430894959444, "learning_rate": 5.3454305216614766e-06, "loss": 0.0473, "step": 12940 }, { "epoch": 2.0692958646016284, "grad_norm": 0.1904233887751224, "learning_rate": 5.312576494529507e-06, "loss": 0.0494, "step": 12960 }, { "epoch": 2.0724892224173717, "grad_norm": 0.18985642952053444, "learning_rate": 5.279787182585271e-06, "loss": 0.0462, "step": 12980 }, { "epoch": 2.075682580233115, "grad_norm": 0.1582812242047444, "learning_rate": 5.247063038522329e-06, "loss": 0.0469, "step": 13000 }, { "epoch": 2.078875938048858, "grad_norm": 0.19286531510895663, "learning_rate": 5.21440451413455e-06, "loss": 0.0465, "step": 13020 }, { "epoch": 2.0820692958646014, "grad_norm": 0.22047888942684946, "learning_rate": 5.181812060309825e-06, "loss": 0.0463, "step": 13040 }, { "epoch": 2.0852626536803447, "grad_norm": 0.22499631209380672, "learning_rate": 5.149286127023874e-06, "loss": 0.0467, "step": 13060 }, { "epoch": 2.088456011496088, "grad_norm": 0.18796568419290619, "learning_rate": 5.1168271633340235e-06, "loss": 0.0471, "step": 13080 }, { "epoch": 2.091649369311831, "grad_norm": 0.1796719273681106, "learning_rate": 5.084435617373018e-06, "loss": 0.048, "step": 13100 }, { "epoch": 2.0948427271275745, "grad_norm": 0.1916078526748605, "learning_rate": 5.052111936342812e-06, "loss": 0.0467, "step": 13120 }, { "epoch": 2.0980360849433177, "grad_norm": 0.19878847514842057, "learning_rate": 5.019856566508412e-06, "loss": 0.0478, "step": 13140 }, { "epoch": 2.101229442759061, "grad_norm": 0.2088933392167675, "learning_rate": 4.9876699531917186e-06, "loss": 0.0473, "step": 13160 }, { "epoch": 2.1044228005748042, "grad_norm": 0.20402583213332395, "learning_rate": 4.95555254076536e-06, "loss": 0.0457, "step": 13180 }, { "epoch": 2.1076161583905475, "grad_norm": 0.16605435030952836, "learning_rate": 4.923504772646573e-06, "loss": 0.0473, "step": 13200 }, { "epoch": 2.1108095162062908, "grad_norm": 0.17651776985556464, "learning_rate": 4.891527091291071e-06, "loss": 0.0477, "step": 13220 }, { "epoch": 2.114002874022034, "grad_norm": 0.1763790661182835, "learning_rate": 4.859619938186947e-06, "loss": 0.0456, "step": 13240 }, { "epoch": 2.1171962318377773, "grad_norm": 0.18886660022445972, "learning_rate": 4.827783753848575e-06, "loss": 0.0455, "step": 13260 }, { "epoch": 2.1203895896535205, "grad_norm": 0.2059211240085781, "learning_rate": 4.796018977810514e-06, "loss": 0.0457, "step": 13280 }, { "epoch": 2.123582947469264, "grad_norm": 0.19168043665328116, "learning_rate": 4.76432604862145e-06, "loss": 0.046, "step": 13300 }, { "epoch": 2.126776305285007, "grad_norm": 0.17778767466228898, "learning_rate": 4.732705403838159e-06, "loss": 0.0465, "step": 13320 }, { "epoch": 2.1299696631007503, "grad_norm": 0.170308319213917, "learning_rate": 4.701157480019429e-06, "loss": 0.0474, "step": 13340 }, { "epoch": 2.1331630209164936, "grad_norm": 0.1711104888651996, "learning_rate": 4.669682712720065e-06, "loss": 0.0462, "step": 13360 }, { "epoch": 2.136356378732237, "grad_norm": 0.1825464435577293, "learning_rate": 4.638281536484854e-06, "loss": 0.0485, "step": 13380 }, { "epoch": 2.13954973654798, "grad_norm": 0.1835185156049789, "learning_rate": 4.606954384842587e-06, "loss": 0.0455, "step": 13400 }, { "epoch": 2.1427430943637233, "grad_norm": 0.19538449656271248, "learning_rate": 4.575701690300051e-06, "loss": 0.0457, "step": 13420 }, { "epoch": 2.1459364521794666, "grad_norm": 0.20119853731280407, "learning_rate": 4.544523884336073e-06, "loss": 0.0462, "step": 13440 }, { "epoch": 2.14912980999521, "grad_norm": 0.19230165287264112, "learning_rate": 4.513421397395563e-06, "loss": 0.0449, "step": 13460 }, { "epoch": 2.152323167810953, "grad_norm": 0.19371541515972485, "learning_rate": 4.482394658883557e-06, "loss": 0.0465, "step": 13480 }, { "epoch": 2.1555165256266964, "grad_norm": 0.2749584429863373, "learning_rate": 4.451444097159301e-06, "loss": 0.0465, "step": 13500 }, { "epoch": 2.1587098834424396, "grad_norm": 0.181430213502962, "learning_rate": 4.4205701395303424e-06, "loss": 0.0469, "step": 13520 }, { "epoch": 2.161903241258183, "grad_norm": 0.21832000463916046, "learning_rate": 4.38977321224661e-06, "loss": 0.0472, "step": 13540 }, { "epoch": 2.165096599073926, "grad_norm": 0.36594927042777403, "learning_rate": 4.3590537404945535e-06, "loss": 0.0471, "step": 13560 }, { "epoch": 2.1682899568896694, "grad_norm": 0.19062769875876745, "learning_rate": 4.3284121483912525e-06, "loss": 0.0464, "step": 13580 }, { "epoch": 2.1714833147054127, "grad_norm": 0.18521477830070004, "learning_rate": 4.297848858978569e-06, "loss": 0.0461, "step": 13600 }, { "epoch": 2.174676672521156, "grad_norm": 0.2064934921930085, "learning_rate": 4.2673642942173184e-06, "loss": 0.0451, "step": 13620 }, { "epoch": 2.177870030336899, "grad_norm": 0.19089143723142035, "learning_rate": 4.236958874981423e-06, "loss": 0.0448, "step": 13640 }, { "epoch": 2.1810633881526424, "grad_norm": 0.17162658742427372, "learning_rate": 4.206633021052115e-06, "loss": 0.0453, "step": 13660 }, { "epoch": 2.1842567459683857, "grad_norm": 0.18039037729927956, "learning_rate": 4.176387151112134e-06, "loss": 0.0455, "step": 13680 }, { "epoch": 2.187450103784129, "grad_norm": 0.16510411035975564, "learning_rate": 4.1462216827399585e-06, "loss": 0.0446, "step": 13700 }, { "epoch": 2.190643461599872, "grad_norm": 0.2215703230886645, "learning_rate": 4.116137032404026e-06, "loss": 0.0453, "step": 13720 }, { "epoch": 2.1938368194156155, "grad_norm": 0.18140462418275824, "learning_rate": 4.0861336154569855e-06, "loss": 0.0446, "step": 13740 }, { "epoch": 2.1970301772313587, "grad_norm": 0.164963005058681, "learning_rate": 4.056211846129977e-06, "loss": 0.0451, "step": 13760 }, { "epoch": 2.200223535047102, "grad_norm": 0.22161978868062865, "learning_rate": 4.0263721375269e-06, "loss": 0.0439, "step": 13780 }, { "epoch": 2.2034168928628453, "grad_norm": 0.18997163122166422, "learning_rate": 3.99661490161871e-06, "loss": 0.0452, "step": 13800 }, { "epoch": 2.2066102506785885, "grad_norm": 0.19721572060634018, "learning_rate": 3.966940549237728e-06, "loss": 0.046, "step": 13820 }, { "epoch": 2.2098036084943318, "grad_norm": 0.1613696871656721, "learning_rate": 3.937349490071989e-06, "loss": 0.0451, "step": 13840 }, { "epoch": 2.212996966310075, "grad_norm": 0.23649764683113925, "learning_rate": 3.9078421326595575e-06, "loss": 0.0473, "step": 13860 }, { "epoch": 2.2161903241258183, "grad_norm": 0.15900455957581072, "learning_rate": 3.8784188843829075e-06, "loss": 0.0467, "step": 13880 }, { "epoch": 2.2193836819415615, "grad_norm": 0.16623211370488078, "learning_rate": 3.849080151463284e-06, "loss": 0.0447, "step": 13900 }, { "epoch": 2.222577039757305, "grad_norm": 0.23855246445899472, "learning_rate": 3.819826338955115e-06, "loss": 0.045, "step": 13920 }, { "epoch": 2.225770397573048, "grad_norm": 0.16852273819977373, "learning_rate": 3.7906578507403925e-06, "loss": 0.044, "step": 13940 }, { "epoch": 2.2289637553887913, "grad_norm": 0.19176422233347587, "learning_rate": 3.761575089523114e-06, "loss": 0.0451, "step": 13960 }, { "epoch": 2.2321571132045346, "grad_norm": 0.19217003400101632, "learning_rate": 3.7325784568237267e-06, "loss": 0.0456, "step": 13980 }, { "epoch": 2.235350471020278, "grad_norm": 0.2142815186061357, "learning_rate": 3.7036683529735616e-06, "loss": 0.0438, "step": 14000 }, { "epoch": 2.238543828836021, "grad_norm": 0.16980952681099654, "learning_rate": 3.6748451771093386e-06, "loss": 0.0456, "step": 14020 }, { "epoch": 2.2417371866517644, "grad_norm": 0.20792979968816608, "learning_rate": 3.6461093271676216e-06, "loss": 0.045, "step": 14040 }, { "epoch": 2.2449305444675076, "grad_norm": 0.19749481308114683, "learning_rate": 3.6174611998793486e-06, "loss": 0.0455, "step": 14060 }, { "epoch": 2.248123902283251, "grad_norm": 0.208757882997406, "learning_rate": 3.5889011907643523e-06, "loss": 0.0468, "step": 14080 }, { "epoch": 2.251317260098994, "grad_norm": 0.18603971145921822, "learning_rate": 3.5604296941258854e-06, "loss": 0.0456, "step": 14100 }, { "epoch": 2.2545106179147374, "grad_norm": 0.24232186850665094, "learning_rate": 3.532047103045185e-06, "loss": 0.0442, "step": 14120 }, { "epoch": 2.2577039757304806, "grad_norm": 0.24810029826855062, "learning_rate": 3.503753809376059e-06, "loss": 0.0463, "step": 14140 }, { "epoch": 2.260897333546224, "grad_norm": 0.23406287255675895, "learning_rate": 3.475550203739452e-06, "loss": 0.0451, "step": 14160 }, { "epoch": 2.264090691361967, "grad_norm": 0.17282967387502232, "learning_rate": 3.4474366755180644e-06, "loss": 0.0453, "step": 14180 }, { "epoch": 2.2672840491777104, "grad_norm": 0.21126534883401732, "learning_rate": 3.419413612850976e-06, "loss": 0.0461, "step": 14200 }, { "epoch": 2.2704774069934537, "grad_norm": 0.16104640464566056, "learning_rate": 3.391481402628297e-06, "loss": 0.0476, "step": 14220 }, { "epoch": 2.273670764809197, "grad_norm": 0.21435527733602905, "learning_rate": 3.363640430485804e-06, "loss": 0.0446, "step": 14240 }, { "epoch": 2.27686412262494, "grad_norm": 0.18548507359762656, "learning_rate": 3.3358910807996325e-06, "loss": 0.0451, "step": 14260 }, { "epoch": 2.2800574804406835, "grad_norm": 0.19423383437023095, "learning_rate": 3.3082337366809704e-06, "loss": 0.0448, "step": 14280 }, { "epoch": 2.2832508382564267, "grad_norm": 0.17237074664312235, "learning_rate": 3.2806687799707647e-06, "loss": 0.0459, "step": 14300 }, { "epoch": 2.28644419607217, "grad_norm": 0.22791506612179063, "learning_rate": 3.253196591234443e-06, "loss": 0.0449, "step": 14320 }, { "epoch": 2.2896375538879132, "grad_norm": 0.18890323777751128, "learning_rate": 3.2258175497566678e-06, "loss": 0.0449, "step": 14340 }, { "epoch": 2.2928309117036565, "grad_norm": 0.22098418299523961, "learning_rate": 3.198532033536107e-06, "loss": 0.0437, "step": 14360 }, { "epoch": 2.2960242695193998, "grad_norm": 0.22834203263219127, "learning_rate": 3.1713404192801945e-06, "loss": 0.0462, "step": 14380 }, { "epoch": 2.299217627335143, "grad_norm": 0.19033969048906568, "learning_rate": 3.144243082399947e-06, "loss": 0.0454, "step": 14400 }, { "epoch": 2.3024109851508863, "grad_norm": 0.1772642418355086, "learning_rate": 3.1172403970047725e-06, "loss": 0.0441, "step": 14420 }, { "epoch": 2.3056043429666295, "grad_norm": 0.2048657544909403, "learning_rate": 3.0903327358973168e-06, "loss": 0.0446, "step": 14440 }, { "epoch": 2.308797700782373, "grad_norm": 0.18540450076918674, "learning_rate": 3.0635204705682976e-06, "loss": 0.0451, "step": 14460 }, { "epoch": 2.311991058598116, "grad_norm": 0.18445665460036134, "learning_rate": 3.0368039711913867e-06, "loss": 0.0459, "step": 14480 }, { "epoch": 2.3151844164138593, "grad_norm": 0.22336940402363192, "learning_rate": 3.0101836066181033e-06, "loss": 0.0455, "step": 14500 }, { "epoch": 2.3183777742296026, "grad_norm": 0.16285692399794796, "learning_rate": 2.983659744372721e-06, "loss": 0.045, "step": 14520 }, { "epoch": 2.321571132045346, "grad_norm": 0.19697000745739243, "learning_rate": 2.9572327506471775e-06, "loss": 0.0454, "step": 14540 }, { "epoch": 2.324764489861089, "grad_norm": 0.1950278510185452, "learning_rate": 2.9309029902960395e-06, "loss": 0.0452, "step": 14560 }, { "epoch": 2.3279578476768323, "grad_norm": 0.1926073736357789, "learning_rate": 2.9046708268314494e-06, "loss": 0.0455, "step": 14580 }, { "epoch": 2.3311512054925756, "grad_norm": 0.5787988360468825, "learning_rate": 2.8785366224181265e-06, "loss": 0.047, "step": 14600 }, { "epoch": 2.334344563308319, "grad_norm": 0.19178497872154512, "learning_rate": 2.8525007378683433e-06, "loss": 0.0441, "step": 14620 }, { "epoch": 2.337537921124062, "grad_norm": 0.20463851817417028, "learning_rate": 2.8265635326369557e-06, "loss": 0.0443, "step": 14640 }, { "epoch": 2.3407312789398054, "grad_norm": 0.18832526122080892, "learning_rate": 2.8007253648164502e-06, "loss": 0.0447, "step": 14660 }, { "epoch": 2.3439246367555486, "grad_norm": 0.25535504048141416, "learning_rate": 2.7749865911319786e-06, "loss": 0.0462, "step": 14680 }, { "epoch": 2.347117994571292, "grad_norm": 0.2783926831983617, "learning_rate": 2.74934756693645e-06, "loss": 0.0461, "step": 14700 }, { "epoch": 2.350311352387035, "grad_norm": 0.1799001156488928, "learning_rate": 2.7238086462056125e-06, "loss": 0.0451, "step": 14720 }, { "epoch": 2.3535047102027784, "grad_norm": 0.22749744937087824, "learning_rate": 2.6983701815331844e-06, "loss": 0.0449, "step": 14740 }, { "epoch": 2.3566980680185217, "grad_norm": 0.192235427214562, "learning_rate": 2.6730325241259605e-06, "loss": 0.0447, "step": 14760 }, { "epoch": 2.359891425834265, "grad_norm": 0.1779393552771597, "learning_rate": 2.647796023798991e-06, "loss": 0.0455, "step": 14780 }, { "epoch": 2.3630847836500077, "grad_norm": 0.17636063193070986, "learning_rate": 2.6226610289707235e-06, "loss": 0.0453, "step": 14800 }, { "epoch": 2.3662781414657514, "grad_norm": 0.17751151289004394, "learning_rate": 2.5976278866582226e-06, "loss": 0.0439, "step": 14820 }, { "epoch": 2.3694714992814943, "grad_norm": 0.1612714192997329, "learning_rate": 2.5726969424723514e-06, "loss": 0.0451, "step": 14840 }, { "epoch": 2.372664857097238, "grad_norm": 0.19257379967637422, "learning_rate": 2.5478685406130143e-06, "loss": 0.0535, "step": 14860 }, { "epoch": 2.3758582149129808, "grad_norm": 0.18593345377491236, "learning_rate": 2.5231430238644106e-06, "loss": 0.045, "step": 14880 }, { "epoch": 2.3790515727287245, "grad_norm": 0.19051880160399431, "learning_rate": 2.4985207335902863e-06, "loss": 0.0451, "step": 14900 }, { "epoch": 2.3822449305444673, "grad_norm": 0.18531119849649635, "learning_rate": 2.4740020097292318e-06, "loss": 0.0426, "step": 14920 }, { "epoch": 2.385438288360211, "grad_norm": 0.23011458580940014, "learning_rate": 2.4495871907899816e-06, "loss": 0.0456, "step": 14940 }, { "epoch": 2.388631646175954, "grad_norm": 0.22814782369226178, "learning_rate": 2.425276613846755e-06, "loss": 0.0458, "step": 14960 }, { "epoch": 2.3918250039916975, "grad_norm": 0.18964633782059312, "learning_rate": 2.401070614534585e-06, "loss": 0.0445, "step": 14980 }, { "epoch": 2.3950183618074403, "grad_norm": 0.18585844070460122, "learning_rate": 2.3769695270446903e-06, "loss": 0.0433, "step": 15000 }, { "epoch": 2.398211719623184, "grad_norm": 0.2173023589979796, "learning_rate": 2.352973684119868e-06, "loss": 0.0452, "step": 15020 }, { "epoch": 2.401405077438927, "grad_norm": 0.1888223260670983, "learning_rate": 2.329083417049899e-06, "loss": 0.0453, "step": 15040 }, { "epoch": 2.40459843525467, "grad_norm": 0.2000345304946633, "learning_rate": 2.3052990556669587e-06, "loss": 0.0443, "step": 15060 }, { "epoch": 2.4077917930704134, "grad_norm": 0.350402818921811, "learning_rate": 2.2816209283410815e-06, "loss": 0.0446, "step": 15080 }, { "epoch": 2.4109851508861566, "grad_norm": 0.17540258992531277, "learning_rate": 2.258049361975616e-06, "loss": 0.0448, "step": 15100 }, { "epoch": 2.4141785087019, "grad_norm": 0.2240022668610996, "learning_rate": 2.234584682002726e-06, "loss": 0.0436, "step": 15120 }, { "epoch": 2.417371866517643, "grad_norm": 0.19377910419185784, "learning_rate": 2.211227212378877e-06, "loss": 0.0449, "step": 15140 }, { "epoch": 2.4205652243333864, "grad_norm": 0.18307979574559963, "learning_rate": 2.1879772755803763e-06, "loss": 0.0437, "step": 15160 }, { "epoch": 2.4237585821491296, "grad_norm": 0.18479960232316164, "learning_rate": 2.1648351925989253e-06, "loss": 0.0469, "step": 15180 }, { "epoch": 2.426951939964873, "grad_norm": 0.19121025995799099, "learning_rate": 2.1418012829371735e-06, "loss": 0.0438, "step": 15200 }, { "epoch": 2.430145297780616, "grad_norm": 0.19858616833926596, "learning_rate": 2.1188758646043206e-06, "loss": 0.044, "step": 15220 }, { "epoch": 2.4333386555963594, "grad_norm": 0.18772227683807235, "learning_rate": 2.0960592541117143e-06, "loss": 0.0452, "step": 15240 }, { "epoch": 2.4365320134121027, "grad_norm": 0.1743929147084694, "learning_rate": 2.0733517664684944e-06, "loss": 0.0438, "step": 15260 }, { "epoch": 2.439725371227846, "grad_norm": 0.18605377215327853, "learning_rate": 2.050753715177236e-06, "loss": 0.0464, "step": 15280 }, { "epoch": 2.442918729043589, "grad_norm": 0.19099944392969617, "learning_rate": 2.0282654122296154e-06, "loss": 0.0434, "step": 15300 }, { "epoch": 2.4461120868593325, "grad_norm": 0.19579885958359836, "learning_rate": 2.0058871681021087e-06, "loss": 0.0433, "step": 15320 }, { "epoch": 2.4493054446750757, "grad_norm": 0.2037719797424841, "learning_rate": 1.983619291751716e-06, "loss": 0.0445, "step": 15340 }, { "epoch": 2.452498802490819, "grad_norm": 0.2288507482341902, "learning_rate": 1.961462090611673e-06, "loss": 0.0445, "step": 15360 }, { "epoch": 2.4556921603065622, "grad_norm": 0.18192991033918157, "learning_rate": 1.9394158705872244e-06, "loss": 0.0453, "step": 15380 }, { "epoch": 2.4588855181223055, "grad_norm": 0.2180936188857526, "learning_rate": 1.9174809360513935e-06, "loss": 0.045, "step": 15400 }, { "epoch": 2.4620788759380487, "grad_norm": 0.1894861914106852, "learning_rate": 1.8956575898407847e-06, "loss": 0.0464, "step": 15420 }, { "epoch": 2.465272233753792, "grad_norm": 0.2021847245639915, "learning_rate": 1.8739461332513953e-06, "loss": 0.0459, "step": 15440 }, { "epoch": 2.4684655915695353, "grad_norm": 0.1992201840351267, "learning_rate": 1.85234686603446e-06, "loss": 0.044, "step": 15460 }, { "epoch": 2.4716589493852785, "grad_norm": 0.18202769181733872, "learning_rate": 1.8308600863923164e-06, "loss": 0.0464, "step": 15480 }, { "epoch": 2.474852307201022, "grad_norm": 0.17956705043459079, "learning_rate": 1.8094860909742795e-06, "loss": 0.0457, "step": 15500 }, { "epoch": 2.478045665016765, "grad_norm": 0.1780847660838803, "learning_rate": 1.78822517487255e-06, "loss": 0.044, "step": 15520 }, { "epoch": 2.4812390228325083, "grad_norm": 0.19200813107543122, "learning_rate": 1.7670776316181427e-06, "loss": 0.0432, "step": 15540 }, { "epoch": 2.4844323806482516, "grad_norm": 0.2516917996505797, "learning_rate": 1.746043753176836e-06, "loss": 0.0448, "step": 15560 }, { "epoch": 2.487625738463995, "grad_norm": 0.17194174394098138, "learning_rate": 1.7251238299451301e-06, "loss": 0.0449, "step": 15580 }, { "epoch": 2.490819096279738, "grad_norm": 0.17011442140145003, "learning_rate": 1.7043181507462448e-06, "loss": 0.0457, "step": 15600 }, { "epoch": 2.4940124540954813, "grad_norm": 0.17376564573157416, "learning_rate": 1.6836270028261326e-06, "loss": 0.0446, "step": 15620 }, { "epoch": 2.4972058119112246, "grad_norm": 0.2600424543600025, "learning_rate": 1.66305067184952e-06, "loss": 0.0435, "step": 15640 }, { "epoch": 2.500399169726968, "grad_norm": 0.1728773334170149, "learning_rate": 1.6425894418959433e-06, "loss": 0.0444, "step": 15660 }, { "epoch": 2.503592527542711, "grad_norm": 0.2117397902480935, "learning_rate": 1.6222435954558435e-06, "loss": 0.0424, "step": 15680 }, { "epoch": 2.5067858853584544, "grad_norm": 0.20379918000728395, "learning_rate": 1.6020134134266674e-06, "loss": 0.0449, "step": 15700 }, { "epoch": 2.5099792431741976, "grad_norm": 0.3110350981628874, "learning_rate": 1.5818991751089762e-06, "loss": 0.0434, "step": 15720 }, { "epoch": 2.513172600989941, "grad_norm": 0.18429144606858047, "learning_rate": 1.5619011582025988e-06, "loss": 0.0439, "step": 15740 }, { "epoch": 2.516365958805684, "grad_norm": 0.1756584956115843, "learning_rate": 1.5420196388027963e-06, "loss": 0.0423, "step": 15760 }, { "epoch": 2.5195593166214274, "grad_norm": 0.18747969624165203, "learning_rate": 1.5222548913964508e-06, "loss": 0.0432, "step": 15780 }, { "epoch": 2.5227526744371707, "grad_norm": 0.17351521964113906, "learning_rate": 1.5026071888582771e-06, "loss": 0.0428, "step": 15800 }, { "epoch": 2.525946032252914, "grad_norm": 0.1763855716931325, "learning_rate": 1.4830768024470487e-06, "loss": 0.0437, "step": 15820 }, { "epoch": 2.529139390068657, "grad_norm": 0.19172367578038851, "learning_rate": 1.4636640018018556e-06, "loss": 0.0436, "step": 15840 }, { "epoch": 2.5323327478844004, "grad_norm": 0.18955098367053075, "learning_rate": 1.4443690549383904e-06, "loss": 0.0422, "step": 15860 }, { "epoch": 2.5355261057001437, "grad_norm": 0.2062297852474484, "learning_rate": 1.4251922282452356e-06, "loss": 0.0423, "step": 15880 }, { "epoch": 2.538719463515887, "grad_norm": 0.184016665131291, "learning_rate": 1.4061337864801916e-06, "loss": 0.0441, "step": 15900 }, { "epoch": 2.54191282133163, "grad_norm": 0.21880976113017805, "learning_rate": 1.3871939927666189e-06, "loss": 0.046, "step": 15920 }, { "epoch": 2.5451061791473735, "grad_norm": 0.17335074095350983, "learning_rate": 1.3683731085898144e-06, "loss": 0.0441, "step": 15940 }, { "epoch": 2.5482995369631167, "grad_norm": 0.19234479041549446, "learning_rate": 1.349671393793388e-06, "loss": 0.0427, "step": 15960 }, { "epoch": 2.55149289477886, "grad_norm": 0.18631232012636342, "learning_rate": 1.3310891065756814e-06, "loss": 0.0435, "step": 15980 }, { "epoch": 2.5546862525946032, "grad_norm": 0.19243767802224285, "learning_rate": 1.3126265034862084e-06, "loss": 0.0441, "step": 16000 }, { "epoch": 2.5578796104103465, "grad_norm": 0.22553668043830372, "learning_rate": 1.2942838394221002e-06, "loss": 0.0438, "step": 16020 }, { "epoch": 2.5610729682260898, "grad_norm": 0.2414806098978672, "learning_rate": 1.2760613676246037e-06, "loss": 0.0455, "step": 16040 }, { "epoch": 2.564266326041833, "grad_norm": 0.17562297042382372, "learning_rate": 1.2579593396755652e-06, "loss": 0.0437, "step": 16060 }, { "epoch": 2.5674596838575763, "grad_norm": 0.1714929007989254, "learning_rate": 1.2399780054939758e-06, "loss": 0.0435, "step": 16080 }, { "epoch": 2.5706530416733195, "grad_norm": 0.18944429187488632, "learning_rate": 1.2221176133325097e-06, "loss": 0.0432, "step": 16100 }, { "epoch": 2.573846399489063, "grad_norm": 0.18830587754770226, "learning_rate": 1.2043784097740951e-06, "loss": 0.044, "step": 16120 }, { "epoch": 2.577039757304806, "grad_norm": 0.20515213794452525, "learning_rate": 1.1867606397285191e-06, "loss": 0.0444, "step": 16140 }, { "epoch": 2.5802331151205493, "grad_norm": 0.2068320912840683, "learning_rate": 1.1692645464290441e-06, "loss": 0.0443, "step": 16160 }, { "epoch": 2.5834264729362926, "grad_norm": 0.2065451583149461, "learning_rate": 1.151890371429042e-06, "loss": 0.0447, "step": 16180 }, { "epoch": 2.586619830752036, "grad_norm": 0.20955876801496184, "learning_rate": 1.1346383545986629e-06, "loss": 0.043, "step": 16200 }, { "epoch": 2.589813188567779, "grad_norm": 0.18475336946843543, "learning_rate": 1.117508734121535e-06, "loss": 0.0439, "step": 16220 }, { "epoch": 2.5930065463835223, "grad_norm": 0.19250755490602636, "learning_rate": 1.1005017464914568e-06, "loss": 0.0431, "step": 16240 }, { "epoch": 2.5961999041992656, "grad_norm": 0.2138444193531275, "learning_rate": 1.0836176265091448e-06, "loss": 0.0447, "step": 16260 }, { "epoch": 2.599393262015009, "grad_norm": 0.19283181561318452, "learning_rate": 1.0668566072789876e-06, "loss": 0.0434, "step": 16280 }, { "epoch": 2.602586619830752, "grad_norm": 0.19258136254237682, "learning_rate": 1.05021892020583e-06, "loss": 0.0452, "step": 16300 }, { "epoch": 2.6057799776464954, "grad_norm": 0.239296573931001, "learning_rate": 1.0337047949917777e-06, "loss": 0.0432, "step": 16320 }, { "epoch": 2.6089733354622386, "grad_norm": 0.18442185794546465, "learning_rate": 1.0173144596330231e-06, "loss": 0.0439, "step": 16340 }, { "epoch": 2.612166693277982, "grad_norm": 0.17759720874685755, "learning_rate": 1.0010481404166972e-06, "loss": 0.0434, "step": 16360 }, { "epoch": 2.615360051093725, "grad_norm": 0.1999834786965281, "learning_rate": 9.849060619177553e-07, "loss": 0.0446, "step": 16380 }, { "epoch": 2.6185534089094684, "grad_norm": 0.21313365667220596, "learning_rate": 9.688884469958604e-07, "loss": 0.0434, "step": 16400 }, { "epoch": 2.6217467667252117, "grad_norm": 0.19320209419752543, "learning_rate": 9.5299551679232e-07, "loss": 0.0445, "step": 16420 }, { "epoch": 2.624940124540955, "grad_norm": 0.17847623577962735, "learning_rate": 9.372274907270251e-07, "loss": 0.0437, "step": 16440 }, { "epoch": 2.628133482356698, "grad_norm": 0.23166885515187532, "learning_rate": 9.215845864954287e-07, "loss": 0.0419, "step": 16460 }, { "epoch": 2.6313268401724415, "grad_norm": 0.18325681984081477, "learning_rate": 9.060670200655286e-07, "loss": 0.0439, "step": 16480 }, { "epoch": 2.6345201979881847, "grad_norm": 0.20540975477642068, "learning_rate": 8.906750056748947e-07, "loss": 0.0448, "step": 16500 }, { "epoch": 2.637713555803928, "grad_norm": 0.1786617783763284, "learning_rate": 8.754087558277113e-07, "loss": 0.0444, "step": 16520 }, { "epoch": 2.6409069136196712, "grad_norm": 0.1901267431080617, "learning_rate": 8.602684812918416e-07, "loss": 0.0438, "step": 16540 }, { "epoch": 2.6441002714354145, "grad_norm": 0.18259614623005302, "learning_rate": 8.452543910959121e-07, "loss": 0.0432, "step": 16560 }, { "epoch": 2.6472936292511577, "grad_norm": 0.18713135077039142, "learning_rate": 8.303666925264331e-07, "loss": 0.0437, "step": 16580 }, { "epoch": 2.650486987066901, "grad_norm": 0.1801858452235725, "learning_rate": 8.156055911249394e-07, "loss": 0.0448, "step": 16600 }, { "epoch": 2.6536803448826443, "grad_norm": 0.17771380124624228, "learning_rate": 8.00971290685143e-07, "loss": 0.0445, "step": 16620 }, { "epoch": 2.6568737026983875, "grad_norm": 0.22250062270982698, "learning_rate": 7.864639932501294e-07, "loss": 0.0427, "step": 16640 }, { "epoch": 2.6600670605141303, "grad_norm": 0.20866465188062733, "learning_rate": 7.720838991095602e-07, "loss": 0.0427, "step": 16660 }, { "epoch": 2.663260418329874, "grad_norm": 0.2055356708135395, "learning_rate": 7.578312067969162e-07, "loss": 0.043, "step": 16680 }, { "epoch": 2.666453776145617, "grad_norm": 0.20698005060615937, "learning_rate": 7.437061130867473e-07, "loss": 0.0442, "step": 16700 }, { "epoch": 2.6696471339613606, "grad_norm": 0.20876117607511466, "learning_rate": 7.297088129919616e-07, "loss": 0.0498, "step": 16720 }, { "epoch": 2.6728404917771034, "grad_norm": 0.24032862358776724, "learning_rate": 7.158394997611329e-07, "loss": 0.0429, "step": 16740 }, { "epoch": 2.676033849592847, "grad_norm": 0.20969273760927634, "learning_rate": 7.020983648758318e-07, "loss": 0.0447, "step": 16760 }, { "epoch": 2.67922720740859, "grad_norm": 0.2174374325052259, "learning_rate": 6.884855980479777e-07, "loss": 0.0452, "step": 16780 }, { "epoch": 2.6824205652243336, "grad_norm": 0.18004577133887417, "learning_rate": 6.750013872172301e-07, "loss": 0.0438, "step": 16800 }, { "epoch": 2.6856139230400764, "grad_norm": 0.2035569950209219, "learning_rate": 6.616459185483793e-07, "loss": 0.0438, "step": 16820 }, { "epoch": 2.68880728085582, "grad_norm": 0.20132465630515528, "learning_rate": 6.484193764287938e-07, "loss": 0.0445, "step": 16840 }, { "epoch": 2.692000638671563, "grad_norm": 0.1712570311869676, "learning_rate": 6.353219434658587e-07, "loss": 0.0432, "step": 16860 }, { "epoch": 2.6951939964873066, "grad_norm": 0.19144286472815933, "learning_rate": 6.223538004844587e-07, "loss": 0.0426, "step": 16880 }, { "epoch": 2.6983873543030494, "grad_norm": 0.1761969500556086, "learning_rate": 6.095151265244937e-07, "loss": 0.0436, "step": 16900 }, { "epoch": 2.701580712118793, "grad_norm": 0.18412941719997428, "learning_rate": 5.968060988383884e-07, "loss": 0.0419, "step": 16920 }, { "epoch": 2.704774069934536, "grad_norm": 0.2088468477123862, "learning_rate": 5.842268928886563e-07, "loss": 0.0435, "step": 16940 }, { "epoch": 2.7079674277502797, "grad_norm": 0.21087568774149862, "learning_rate": 5.717776823454746e-07, "loss": 0.0434, "step": 16960 }, { "epoch": 2.7111607855660225, "grad_norm": 0.20533012449268137, "learning_rate": 5.594586390842915e-07, "loss": 0.0436, "step": 16980 }, { "epoch": 2.714354143381766, "grad_norm": 0.23130477787372275, "learning_rate": 5.472699331834408e-07, "loss": 0.0434, "step": 17000 }, { "epoch": 2.717547501197509, "grad_norm": 0.19246797825033052, "learning_rate": 5.352117329218065e-07, "loss": 0.0443, "step": 17020 }, { "epoch": 2.7207408590132527, "grad_norm": 0.19825650332574749, "learning_rate": 5.23284204776493e-07, "loss": 0.0432, "step": 17040 }, { "epoch": 2.7239342168289955, "grad_norm": 0.19435989820475502, "learning_rate": 5.1148751342053e-07, "loss": 0.0437, "step": 17060 }, { "epoch": 2.727127574644739, "grad_norm": 0.17105286427984273, "learning_rate": 4.998218217205941e-07, "loss": 0.0431, "step": 17080 }, { "epoch": 2.730320932460482, "grad_norm": 0.2076555517606956, "learning_rate": 4.882872907347657e-07, "loss": 0.0441, "step": 17100 }, { "epoch": 2.7335142902762257, "grad_norm": 0.17467573768445724, "learning_rate": 4.768840797103014e-07, "loss": 0.0426, "step": 17120 }, { "epoch": 2.7367076480919685, "grad_norm": 0.23656714472082974, "learning_rate": 4.6561234608143993e-07, "loss": 0.0442, "step": 17140 }, { "epoch": 2.739901005907712, "grad_norm": 0.1991265479506836, "learning_rate": 4.544722454672223e-07, "loss": 0.0443, "step": 17160 }, { "epoch": 2.743094363723455, "grad_norm": 0.16764542580219924, "learning_rate": 4.434639316693479e-07, "loss": 0.0441, "step": 17180 }, { "epoch": 2.7462877215391983, "grad_norm": 0.18540914909816514, "learning_rate": 4.3258755667005104e-07, "loss": 0.0427, "step": 17200 }, { "epoch": 2.7494810793549416, "grad_norm": 0.16756011986354746, "learning_rate": 4.218432706300013e-07, "loss": 0.0442, "step": 17220 }, { "epoch": 2.752674437170685, "grad_norm": 0.19477880662403543, "learning_rate": 4.1123122188623024e-07, "loss": 0.0419, "step": 17240 }, { "epoch": 2.755867794986428, "grad_norm": 0.16692137735923454, "learning_rate": 4.0075155695008193e-07, "loss": 0.0439, "step": 17260 }, { "epoch": 2.7590611528021713, "grad_norm": 0.27371092487152754, "learning_rate": 3.904044205051938e-07, "loss": 0.0415, "step": 17280 }, { "epoch": 2.7622545106179146, "grad_norm": 0.1730044575542229, "learning_rate": 3.801899554055011e-07, "loss": 0.0434, "step": 17300 }, { "epoch": 2.765447868433658, "grad_norm": 0.2957249889697754, "learning_rate": 3.7010830267325546e-07, "loss": 0.0432, "step": 17320 }, { "epoch": 2.768641226249401, "grad_norm": 0.20211132503418788, "learning_rate": 3.601596014970843e-07, "loss": 0.0448, "step": 17340 }, { "epoch": 2.7718345840651444, "grad_norm": 0.2192148080396869, "learning_rate": 3.5034398923007195e-07, "loss": 0.0429, "step": 17360 }, { "epoch": 2.7750279418808876, "grad_norm": 0.19416701667619607, "learning_rate": 3.40661601387855e-07, "loss": 0.0442, "step": 17380 }, { "epoch": 2.778221299696631, "grad_norm": 0.2194341949029401, "learning_rate": 3.311125716467578e-07, "loss": 0.0451, "step": 17400 }, { "epoch": 2.781414657512374, "grad_norm": 0.23997919053006325, "learning_rate": 3.216970318419488e-07, "loss": 0.0433, "step": 17420 }, { "epoch": 2.7846080153281174, "grad_norm": 0.20048137685529088, "learning_rate": 3.1241511196561045e-07, "loss": 0.0436, "step": 17440 }, { "epoch": 2.7878013731438607, "grad_norm": 0.18418386343058352, "learning_rate": 3.0326694016515555e-07, "loss": 0.0431, "step": 17460 }, { "epoch": 2.790994730959604, "grad_norm": 0.18647531186123847, "learning_rate": 2.9425264274144937e-07, "loss": 0.0441, "step": 17480 }, { "epoch": 2.794188088775347, "grad_norm": 0.18103520276457064, "learning_rate": 2.8537234414707573e-07, "loss": 0.0424, "step": 17500 }, { "epoch": 2.7973814465910904, "grad_norm": 0.175838788085868, "learning_rate": 2.766261669846071e-07, "loss": 0.0428, "step": 17520 }, { "epoch": 2.8005748044068337, "grad_norm": 0.18597288140297774, "learning_rate": 2.680142320049195e-07, "loss": 0.0461, "step": 17540 }, { "epoch": 2.803768162222577, "grad_norm": 0.19306825995055335, "learning_rate": 2.5953665810552586e-07, "loss": 0.0432, "step": 17560 }, { "epoch": 2.8069615200383202, "grad_norm": 0.19244074182083917, "learning_rate": 2.5119356232892965e-07, "loss": 0.0447, "step": 17580 }, { "epoch": 2.8101548778540635, "grad_norm": 0.20041935845397732, "learning_rate": 2.4298505986101397e-07, "loss": 0.0417, "step": 17600 }, { "epoch": 2.8133482356698067, "grad_norm": 0.1897352035064278, "learning_rate": 2.3491126402944597e-07, "loss": 0.0447, "step": 17620 }, { "epoch": 2.81654159348555, "grad_norm": 0.1859749113332233, "learning_rate": 2.269722863021162e-07, "loss": 0.0441, "step": 17640 }, { "epoch": 2.8197349513012933, "grad_norm": 0.18154530556190202, "learning_rate": 2.191682362856018e-07, "loss": 0.0449, "step": 17660 }, { "epoch": 2.8229283091170365, "grad_norm": 0.19576462753720822, "learning_rate": 2.1149922172364557e-07, "loss": 0.043, "step": 17680 }, { "epoch": 2.8261216669327798, "grad_norm": 0.19317600637380156, "learning_rate": 2.0396534849567384e-07, "loss": 0.0435, "step": 17700 }, { "epoch": 2.829315024748523, "grad_norm": 0.18270539963789229, "learning_rate": 1.9656672061533876e-07, "loss": 0.0448, "step": 17720 }, { "epoch": 2.8325083825642663, "grad_norm": 0.25190362174641373, "learning_rate": 1.8930344022907055e-07, "loss": 0.0433, "step": 17740 }, { "epoch": 2.8357017403800096, "grad_norm": 0.19271629305777457, "learning_rate": 1.8217560761467744e-07, "loss": 0.0442, "step": 17760 }, { "epoch": 2.838895098195753, "grad_norm": 0.6386981477198299, "learning_rate": 1.7518332117995695e-07, "loss": 0.0431, "step": 17780 }, { "epoch": 2.842088456011496, "grad_norm": 0.20346250081845807, "learning_rate": 1.6832667746134236e-07, "loss": 0.0422, "step": 17800 }, { "epoch": 2.8452818138272393, "grad_norm": 0.17777460027714007, "learning_rate": 1.6160577112255827e-07, "loss": 0.0425, "step": 17820 }, { "epoch": 2.8484751716429826, "grad_norm": 0.255413137859645, "learning_rate": 1.5502069495332616e-07, "loss": 0.0435, "step": 17840 }, { "epoch": 2.851668529458726, "grad_norm": 0.19607428087584267, "learning_rate": 1.4857153986807649e-07, "loss": 0.0418, "step": 17860 }, { "epoch": 2.854861887274469, "grad_norm": 0.1780772888139799, "learning_rate": 1.4225839490469628e-07, "loss": 0.0427, "step": 17880 }, { "epoch": 2.8580552450902124, "grad_norm": 0.21241047060680943, "learning_rate": 1.3608134722329803e-07, "loss": 0.0437, "step": 17900 }, { "epoch": 2.8612486029059556, "grad_norm": 0.19239115510673255, "learning_rate": 1.3004048210501718e-07, "loss": 0.0434, "step": 17920 }, { "epoch": 2.864441960721699, "grad_norm": 0.18795522932841213, "learning_rate": 1.2413588295083656e-07, "loss": 0.0431, "step": 17940 }, { "epoch": 2.867635318537442, "grad_norm": 0.18585931164828967, "learning_rate": 1.183676312804305e-07, "loss": 0.0442, "step": 17960 }, { "epoch": 2.8708286763531854, "grad_norm": 0.18075501501439709, "learning_rate": 1.1273580673104245e-07, "loss": 0.0444, "step": 17980 }, { "epoch": 2.8740220341689287, "grad_norm": 0.19563735408076433, "learning_rate": 1.072404870563859e-07, "loss": 0.0447, "step": 18000 }, { "epoch": 2.877215391984672, "grad_norm": 0.19825850897569677, "learning_rate": 1.0188174812557073e-07, "loss": 0.0439, "step": 18020 }, { "epoch": 2.880408749800415, "grad_norm": 0.17410835997084562, "learning_rate": 9.665966392205295e-08, "loss": 0.0446, "step": 18040 }, { "epoch": 2.8836021076161584, "grad_norm": 0.17894750425194603, "learning_rate": 9.157430654261778e-08, "loss": 0.0444, "step": 18060 }, { "epoch": 2.8867954654319017, "grad_norm": 0.1932898053763739, "learning_rate": 8.662574619637931e-08, "loss": 0.043, "step": 18080 }, { "epoch": 2.889988823247645, "grad_norm": 0.19451425195215136, "learning_rate": 8.18140512038157e-08, "loss": 0.0428, "step": 18100 }, { "epoch": 2.893182181063388, "grad_norm": 0.18451759369547344, "learning_rate": 7.713928799582215e-08, "loss": 0.0443, "step": 18120 }, { "epoch": 2.8963755388791315, "grad_norm": 0.21235909068408473, "learning_rate": 7.260152111279839e-08, "loss": 0.0443, "step": 18140 }, { "epoch": 2.8995688966948747, "grad_norm": 0.18028750928095402, "learning_rate": 6.82008132037515e-08, "loss": 0.0425, "step": 18160 }, { "epoch": 2.902762254510618, "grad_norm": 0.1865997727595832, "learning_rate": 6.393722502543665e-08, "loss": 0.045, "step": 18180 }, { "epoch": 2.9059556123263612, "grad_norm": 0.18553943543624984, "learning_rate": 5.981081544151446e-08, "loss": 0.0428, "step": 18200 }, { "epoch": 2.9091489701421045, "grad_norm": 0.19032355954882516, "learning_rate": 5.5821641421741625e-08, "loss": 0.0443, "step": 18220 }, { "epoch": 2.9123423279578478, "grad_norm": 0.18084808651831624, "learning_rate": 5.196975804117932e-08, "loss": 0.0435, "step": 18240 }, { "epoch": 2.915535685773591, "grad_norm": 0.21753074838538441, "learning_rate": 4.825521847944048e-08, "loss": 0.0418, "step": 18260 }, { "epoch": 2.9187290435893343, "grad_norm": 0.1883119176872824, "learning_rate": 4.467807401994706e-08, "loss": 0.0426, "step": 18280 }, { "epoch": 2.9219224014050775, "grad_norm": 0.17894355455146954, "learning_rate": 4.123837404922726e-08, "loss": 0.0429, "step": 18300 }, { "epoch": 2.925115759220821, "grad_norm": 0.20477512942702414, "learning_rate": 3.7936166056233845e-08, "loss": 0.0421, "step": 18320 }, { "epoch": 2.928309117036564, "grad_norm": 0.17982986336579576, "learning_rate": 3.4771495631686914e-08, "loss": 0.0433, "step": 18340 }, { "epoch": 2.9315024748523073, "grad_norm": 0.19778942398473365, "learning_rate": 3.174440646744326e-08, "loss": 0.0434, "step": 18360 }, { "epoch": 2.9346958326680506, "grad_norm": 0.1840797815880338, "learning_rate": 2.8854940355895756e-08, "loss": 0.0422, "step": 18380 }, { "epoch": 2.937889190483794, "grad_norm": 0.20492139151779767, "learning_rate": 2.6103137189394945e-08, "loss": 0.0433, "step": 18400 }, { "epoch": 2.941082548299537, "grad_norm": 0.18649980327789625, "learning_rate": 2.3489034959698342e-08, "loss": 0.0423, "step": 18420 }, { "epoch": 2.9442759061152803, "grad_norm": 0.18710560587786274, "learning_rate": 2.1012669757446423e-08, "loss": 0.0447, "step": 18440 }, { "epoch": 2.9474692639310236, "grad_norm": 0.1950435200815635, "learning_rate": 1.8674075771665246e-08, "loss": 0.0441, "step": 18460 }, { "epoch": 2.950662621746767, "grad_norm": 0.23718279280034166, "learning_rate": 1.647328528929126e-08, "loss": 0.0443, "step": 18480 }, { "epoch": 2.95385597956251, "grad_norm": 0.1828813035697597, "learning_rate": 1.441032869472725e-08, "loss": 0.0434, "step": 18500 }, { "epoch": 2.9570493373782534, "grad_norm": 0.18330846523906766, "learning_rate": 1.2485234469425955e-08, "loss": 0.0447, "step": 18520 }, { "epoch": 2.9602426951939966, "grad_norm": 0.19409145822202675, "learning_rate": 1.0698029191491543e-08, "loss": 0.0424, "step": 18540 }, { "epoch": 2.96343605300974, "grad_norm": 0.19044949836984276, "learning_rate": 9.048737535317654e-09, "loss": 0.0421, "step": 18560 }, { "epoch": 2.966629410825483, "grad_norm": 0.24441457630679606, "learning_rate": 7.5373822712399e-09, "loss": 0.0429, "step": 18580 }, { "epoch": 2.9698227686412264, "grad_norm": 0.17238090085354812, "learning_rate": 6.163984265230571e-09, "loss": 0.0436, "step": 18600 }, { "epoch": 2.9730161264569697, "grad_norm": 0.17724705729907833, "learning_rate": 4.928562478603294e-09, "loss": 0.0438, "step": 18620 }, { "epoch": 2.9762094842727125, "grad_norm": 0.17813664105789478, "learning_rate": 3.831133967754363e-09, "loss": 0.0443, "step": 18640 }, { "epoch": 2.979402842088456, "grad_norm": 0.19807197965691153, "learning_rate": 2.8717138839262638e-09, "loss": 0.0423, "step": 18660 }, { "epoch": 2.982596199904199, "grad_norm": 0.20792602007313574, "learning_rate": 2.050315473000053e-09, "loss": 0.0437, "step": 18680 }, { "epoch": 2.9857895577199427, "grad_norm": 0.18039436629311245, "learning_rate": 1.3669500753099586e-09, "loss": 0.0449, "step": 18700 }, { "epoch": 2.9889829155356855, "grad_norm": 0.16777557295223433, "learning_rate": 8.216271254901653e-10, "loss": 0.0433, "step": 18720 }, { "epoch": 2.992176273351429, "grad_norm": 0.21329586917274732, "learning_rate": 4.1435415233936903e-10, "loss": 0.0437, "step": 18740 }, { "epoch": 2.995369631167172, "grad_norm": 0.17382850175198178, "learning_rate": 1.451367787230762e-10, "loss": 0.0434, "step": 18760 }, { "epoch": 2.9985629889829157, "grad_norm": 0.19059012257580193, "learning_rate": 1.3978721492557968e-11, "loss": 0.0444, "step": 18780 } ], "logging_steps": 20, "max_steps": 18789, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6401980526886912.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }