{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08267797093601252, "eval_steps": 10000, "global_step": 20001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.133691862207516e-06, "grad_norm": 71.12261039322922, "learning_rate": 8.264462809917355e-09, "loss": 7.6413, "step": 1 }, { "epoch": 4.1336918622075155e-05, "grad_norm": 67.75271488374239, "learning_rate": 8.264462809917357e-08, "loss": 7.6837, "step": 10 }, { "epoch": 8.267383724415031e-05, "grad_norm": 68.70872130093693, "learning_rate": 1.6528925619834713e-07, "loss": 7.6852, "step": 20 }, { "epoch": 0.00012401075586622546, "grad_norm": 62.98947446015298, "learning_rate": 2.4793388429752067e-07, "loss": 7.6376, "step": 30 }, { "epoch": 0.00016534767448830062, "grad_norm": 54.98248131444704, "learning_rate": 3.3057851239669426e-07, "loss": 7.4517, "step": 40 }, { "epoch": 0.00020668459311037578, "grad_norm": 42.7313362524158, "learning_rate": 4.132231404958678e-07, "loss": 7.1676, "step": 50 }, { "epoch": 0.0002480215117324509, "grad_norm": 38.42250005234289, "learning_rate": 4.958677685950413e-07, "loss": 6.7673, "step": 60 }, { "epoch": 0.0002893584303545261, "grad_norm": 26.17988841617355, "learning_rate": 5.78512396694215e-07, "loss": 6.1421, "step": 70 }, { "epoch": 0.00033069534897660124, "grad_norm": 20.709699960482357, "learning_rate": 6.611570247933885e-07, "loss": 5.7812, "step": 80 }, { "epoch": 0.0003720322675986764, "grad_norm": 18.83787744791924, "learning_rate": 7.438016528925621e-07, "loss": 5.4004, "step": 90 }, { "epoch": 0.00041336918622075157, "grad_norm": 19.12503087164104, "learning_rate": 8.264462809917356e-07, "loss": 5.0367, "step": 100 }, { "epoch": 0.0004547061048428267, "grad_norm": 14.34978763505406, "learning_rate": 9.090909090909091e-07, "loss": 4.7081, "step": 110 }, { "epoch": 0.0004960430234649018, "grad_norm": 13.492660464865253, "learning_rate": 9.917355371900827e-07, "loss": 4.5718, "step": 120 }, { "epoch": 0.000537379942086977, "grad_norm": 14.420928714044905, "learning_rate": 1.0743801652892562e-06, "loss": 4.4218, "step": 130 }, { "epoch": 0.0005787168607090522, "grad_norm": 12.495448321891903, "learning_rate": 1.15702479338843e-06, "loss": 4.166, "step": 140 }, { "epoch": 0.0006200537793311273, "grad_norm": 9.801331301009261, "learning_rate": 1.2396694214876035e-06, "loss": 4.0983, "step": 150 }, { "epoch": 0.0006613906979532025, "grad_norm": 11.207849241885746, "learning_rate": 1.322314049586777e-06, "loss": 3.8569, "step": 160 }, { "epoch": 0.0007027276165752776, "grad_norm": 9.679145248918651, "learning_rate": 1.4049586776859506e-06, "loss": 3.8265, "step": 170 }, { "epoch": 0.0007440645351973528, "grad_norm": 11.74286989793758, "learning_rate": 1.4876033057851241e-06, "loss": 3.7614, "step": 180 }, { "epoch": 0.0007854014538194279, "grad_norm": 12.757634031418831, "learning_rate": 1.5702479338842977e-06, "loss": 3.7239, "step": 190 }, { "epoch": 0.0008267383724415031, "grad_norm": 10.441179098409538, "learning_rate": 1.6528925619834712e-06, "loss": 3.5734, "step": 200 }, { "epoch": 0.0008680752910635782, "grad_norm": 9.812592859538713, "learning_rate": 1.7355371900826448e-06, "loss": 3.567, "step": 210 }, { "epoch": 0.0009094122096856533, "grad_norm": 13.93844427924458, "learning_rate": 1.8181818181818183e-06, "loss": 3.4882, "step": 220 }, { "epoch": 0.0009507491283077286, "grad_norm": 10.098507514325146, "learning_rate": 1.900826446280992e-06, "loss": 3.4818, "step": 230 }, { "epoch": 0.0009920860469298037, "grad_norm": 9.58855528934084, "learning_rate": 1.9834710743801654e-06, "loss": 3.4219, "step": 240 }, { "epoch": 0.0010334229655518789, "grad_norm": 10.682635147410519, "learning_rate": 2.066115702479339e-06, "loss": 3.4406, "step": 250 }, { "epoch": 0.001074759884173954, "grad_norm": 8.223333896011393, "learning_rate": 2.1487603305785124e-06, "loss": 3.3402, "step": 260 }, { "epoch": 0.001116096802796029, "grad_norm": 8.984512948693363, "learning_rate": 2.231404958677686e-06, "loss": 3.3059, "step": 270 }, { "epoch": 0.0011574337214181043, "grad_norm": 9.341193028741104, "learning_rate": 2.31404958677686e-06, "loss": 3.1823, "step": 280 }, { "epoch": 0.0011987706400401795, "grad_norm": 10.3056509420508, "learning_rate": 2.3966942148760335e-06, "loss": 3.2817, "step": 290 }, { "epoch": 0.0012401075586622545, "grad_norm": 6.768688690699956, "learning_rate": 2.479338842975207e-06, "loss": 3.1747, "step": 300 }, { "epoch": 0.0012814444772843298, "grad_norm": 6.809665393368199, "learning_rate": 2.56198347107438e-06, "loss": 3.1865, "step": 310 }, { "epoch": 0.001322781395906405, "grad_norm": 6.996784607657393, "learning_rate": 2.644628099173554e-06, "loss": 3.1331, "step": 320 }, { "epoch": 0.0013641183145284802, "grad_norm": 9.752811655395744, "learning_rate": 2.7272727272727272e-06, "loss": 3.1656, "step": 330 }, { "epoch": 0.0014054552331505552, "grad_norm": 8.381106520516422, "learning_rate": 2.809917355371901e-06, "loss": 3.1045, "step": 340 }, { "epoch": 0.0014467921517726304, "grad_norm": 9.529557732528744, "learning_rate": 2.8925619834710743e-06, "loss": 3.0388, "step": 350 }, { "epoch": 0.0014881290703947056, "grad_norm": 7.141802881599084, "learning_rate": 2.9752066115702483e-06, "loss": 3.1119, "step": 360 }, { "epoch": 0.0015294659890167806, "grad_norm": 8.784456968597924, "learning_rate": 3.0578512396694214e-06, "loss": 2.9281, "step": 370 }, { "epoch": 0.0015708029076388558, "grad_norm": 7.19486645716449, "learning_rate": 3.1404958677685953e-06, "loss": 2.9817, "step": 380 }, { "epoch": 0.001612139826260931, "grad_norm": 8.694417122337162, "learning_rate": 3.2231404958677685e-06, "loss": 3.0107, "step": 390 }, { "epoch": 0.0016534767448830063, "grad_norm": 8.323745687899638, "learning_rate": 3.3057851239669424e-06, "loss": 3.0367, "step": 400 }, { "epoch": 0.0016948136635050813, "grad_norm": 7.289283064256704, "learning_rate": 3.388429752066116e-06, "loss": 2.9071, "step": 410 }, { "epoch": 0.0017361505821271565, "grad_norm": 8.436424184415285, "learning_rate": 3.4710743801652895e-06, "loss": 2.8761, "step": 420 }, { "epoch": 0.0017774875007492317, "grad_norm": 8.284395953073583, "learning_rate": 3.553719008264463e-06, "loss": 2.8742, "step": 430 }, { "epoch": 0.0018188244193713067, "grad_norm": 7.765622556565918, "learning_rate": 3.6363636363636366e-06, "loss": 2.9028, "step": 440 }, { "epoch": 0.001860161337993382, "grad_norm": 8.06677494412333, "learning_rate": 3.71900826446281e-06, "loss": 2.885, "step": 450 }, { "epoch": 0.0019014982566154571, "grad_norm": 6.847325183046608, "learning_rate": 3.801652892561984e-06, "loss": 2.8491, "step": 460 }, { "epoch": 0.0019428351752375323, "grad_norm": 6.363354378607742, "learning_rate": 3.884297520661157e-06, "loss": 2.8626, "step": 470 }, { "epoch": 0.0019841720938596073, "grad_norm": 6.752545926601506, "learning_rate": 3.966942148760331e-06, "loss": 2.8759, "step": 480 }, { "epoch": 0.0020255090124816828, "grad_norm": 6.408269470963144, "learning_rate": 4.049586776859504e-06, "loss": 2.8132, "step": 490 }, { "epoch": 0.0020668459311037578, "grad_norm": 10.174350278228932, "learning_rate": 4.132231404958678e-06, "loss": 2.7726, "step": 500 }, { "epoch": 0.0021081828497258328, "grad_norm": 7.3668788002558045, "learning_rate": 4.214876033057851e-06, "loss": 2.781, "step": 510 }, { "epoch": 0.002149519768347908, "grad_norm": 6.065779568736328, "learning_rate": 4.297520661157025e-06, "loss": 2.7566, "step": 520 }, { "epoch": 0.002190856686969983, "grad_norm": 7.249820467027506, "learning_rate": 4.3801652892561984e-06, "loss": 2.7745, "step": 530 }, { "epoch": 0.002232193605592058, "grad_norm": 6.6147066822580305, "learning_rate": 4.462809917355372e-06, "loss": 2.7287, "step": 540 }, { "epoch": 0.0022735305242141336, "grad_norm": 6.730411628057589, "learning_rate": 4.5454545454545455e-06, "loss": 2.7263, "step": 550 }, { "epoch": 0.0023148674428362086, "grad_norm": 5.90842778355055, "learning_rate": 4.62809917355372e-06, "loss": 2.7113, "step": 560 }, { "epoch": 0.0023562043614582836, "grad_norm": 5.290690112865889, "learning_rate": 4.710743801652893e-06, "loss": 2.7407, "step": 570 }, { "epoch": 0.002397541280080359, "grad_norm": 6.913494491726395, "learning_rate": 4.793388429752067e-06, "loss": 2.7089, "step": 580 }, { "epoch": 0.002438878198702434, "grad_norm": 6.999126848562726, "learning_rate": 4.87603305785124e-06, "loss": 2.7074, "step": 590 }, { "epoch": 0.002480215117324509, "grad_norm": 7.325247968940623, "learning_rate": 4.958677685950414e-06, "loss": 2.6561, "step": 600 }, { "epoch": 0.0025215520359465845, "grad_norm": 5.841708656820878, "learning_rate": 5.041322314049587e-06, "loss": 2.5882, "step": 610 }, { "epoch": 0.0025628889545686595, "grad_norm": 6.0353216317971725, "learning_rate": 5.12396694214876e-06, "loss": 2.6469, "step": 620 }, { "epoch": 0.002604225873190735, "grad_norm": 7.544181254798358, "learning_rate": 5.206611570247935e-06, "loss": 2.6267, "step": 630 }, { "epoch": 0.00264556279181281, "grad_norm": 6.608459353317291, "learning_rate": 5.289256198347108e-06, "loss": 2.5662, "step": 640 }, { "epoch": 0.002686899710434885, "grad_norm": 6.839416904552874, "learning_rate": 5.371900826446281e-06, "loss": 2.6888, "step": 650 }, { "epoch": 0.0027282366290569604, "grad_norm": 6.668329679339745, "learning_rate": 5.4545454545454545e-06, "loss": 2.5642, "step": 660 }, { "epoch": 0.0027695735476790354, "grad_norm": 7.132958283503685, "learning_rate": 5.537190082644629e-06, "loss": 2.5651, "step": 670 }, { "epoch": 0.0028109104663011104, "grad_norm": 6.277307177686086, "learning_rate": 5.619834710743802e-06, "loss": 2.5659, "step": 680 }, { "epoch": 0.002852247384923186, "grad_norm": 6.128689798291957, "learning_rate": 5.702479338842976e-06, "loss": 2.5826, "step": 690 }, { "epoch": 0.002893584303545261, "grad_norm": 6.5950769294424125, "learning_rate": 5.785123966942149e-06, "loss": 2.5845, "step": 700 }, { "epoch": 0.002934921222167336, "grad_norm": 6.419190212095196, "learning_rate": 5.867768595041323e-06, "loss": 2.5336, "step": 710 }, { "epoch": 0.0029762581407894112, "grad_norm": 8.4242870632546, "learning_rate": 5.9504132231404965e-06, "loss": 2.5085, "step": 720 }, { "epoch": 0.0030175950594114862, "grad_norm": 7.690590337257814, "learning_rate": 6.03305785123967e-06, "loss": 2.6229, "step": 730 }, { "epoch": 0.0030589319780335612, "grad_norm": 6.501607766316929, "learning_rate": 6.115702479338843e-06, "loss": 2.5214, "step": 740 }, { "epoch": 0.0031002688966556367, "grad_norm": 6.318891494759645, "learning_rate": 6.198347107438017e-06, "loss": 2.5001, "step": 750 }, { "epoch": 0.0031416058152777117, "grad_norm": 6.549087764929742, "learning_rate": 6.280991735537191e-06, "loss": 2.4692, "step": 760 }, { "epoch": 0.003182942733899787, "grad_norm": 6.139353182718512, "learning_rate": 6.363636363636364e-06, "loss": 2.4862, "step": 770 }, { "epoch": 0.003224279652521862, "grad_norm": 6.927572304151442, "learning_rate": 6.446280991735537e-06, "loss": 2.5114, "step": 780 }, { "epoch": 0.003265616571143937, "grad_norm": 6.193127375797419, "learning_rate": 6.528925619834712e-06, "loss": 2.5163, "step": 790 }, { "epoch": 0.0033069534897660125, "grad_norm": 6.624590490134376, "learning_rate": 6.611570247933885e-06, "loss": 2.4314, "step": 800 }, { "epoch": 0.0033482904083880875, "grad_norm": 6.821779483779539, "learning_rate": 6.694214876033058e-06, "loss": 2.5099, "step": 810 }, { "epoch": 0.0033896273270101625, "grad_norm": 7.545681159342933, "learning_rate": 6.776859504132232e-06, "loss": 2.431, "step": 820 }, { "epoch": 0.003430964245632238, "grad_norm": 7.4575512413535785, "learning_rate": 6.859504132231406e-06, "loss": 2.5166, "step": 830 }, { "epoch": 0.003472301164254313, "grad_norm": 5.330361729769768, "learning_rate": 6.942148760330579e-06, "loss": 2.4147, "step": 840 }, { "epoch": 0.003513638082876388, "grad_norm": 9.642946599111673, "learning_rate": 7.0247933884297525e-06, "loss": 2.416, "step": 850 }, { "epoch": 0.0035549750014984634, "grad_norm": 5.81243704909108, "learning_rate": 7.107438016528926e-06, "loss": 2.4712, "step": 860 }, { "epoch": 0.0035963119201205384, "grad_norm": 8.181150357128717, "learning_rate": 7.1900826446281005e-06, "loss": 2.4275, "step": 870 }, { "epoch": 0.0036376488387426134, "grad_norm": 6.783789830926592, "learning_rate": 7.272727272727273e-06, "loss": 2.4283, "step": 880 }, { "epoch": 0.003678985757364689, "grad_norm": 6.537482232422147, "learning_rate": 7.355371900826447e-06, "loss": 2.4064, "step": 890 }, { "epoch": 0.003720322675986764, "grad_norm": 5.51502652262759, "learning_rate": 7.43801652892562e-06, "loss": 2.3804, "step": 900 }, { "epoch": 0.0037616595946088393, "grad_norm": 5.643663413025215, "learning_rate": 7.520661157024795e-06, "loss": 2.4225, "step": 910 }, { "epoch": 0.0038029965132309143, "grad_norm": 5.698077553184173, "learning_rate": 7.603305785123968e-06, "loss": 2.3767, "step": 920 }, { "epoch": 0.0038443334318529893, "grad_norm": 7.7844289388382695, "learning_rate": 7.685950413223142e-06, "loss": 2.3515, "step": 930 }, { "epoch": 0.0038856703504750647, "grad_norm": 7.037093549799256, "learning_rate": 7.768595041322314e-06, "loss": 2.34, "step": 940 }, { "epoch": 0.00392700726909714, "grad_norm": 6.080619495201754, "learning_rate": 7.851239669421489e-06, "loss": 2.3174, "step": 950 }, { "epoch": 0.003968344187719215, "grad_norm": 5.880580728396556, "learning_rate": 7.933884297520661e-06, "loss": 2.3706, "step": 960 }, { "epoch": 0.00400968110634129, "grad_norm": 5.661072675024262, "learning_rate": 8.016528925619836e-06, "loss": 2.3481, "step": 970 }, { "epoch": 0.0040510180249633656, "grad_norm": 7.201611884034939, "learning_rate": 8.099173553719009e-06, "loss": 2.3667, "step": 980 }, { "epoch": 0.00409235494358544, "grad_norm": 6.275874199218544, "learning_rate": 8.181818181818183e-06, "loss": 2.3169, "step": 990 }, { "epoch": 0.0041336918622075156, "grad_norm": 5.267583094894874, "learning_rate": 8.264462809917356e-06, "loss": 2.3757, "step": 1000 }, { "epoch": 0.004175028780829591, "grad_norm": 5.377757889968936, "learning_rate": 8.34710743801653e-06, "loss": 2.3631, "step": 1010 }, { "epoch": 0.0042163656994516656, "grad_norm": 6.0201100161095225, "learning_rate": 8.429752066115703e-06, "loss": 2.2818, "step": 1020 }, { "epoch": 0.004257702618073741, "grad_norm": 6.579057670565248, "learning_rate": 8.512396694214877e-06, "loss": 2.3313, "step": 1030 }, { "epoch": 0.004299039536695816, "grad_norm": 7.27660719754988, "learning_rate": 8.59504132231405e-06, "loss": 2.3288, "step": 1040 }, { "epoch": 0.004340376455317891, "grad_norm": 6.144262724026651, "learning_rate": 8.677685950413224e-06, "loss": 2.2981, "step": 1050 }, { "epoch": 0.004381713373939966, "grad_norm": 5.672033241927713, "learning_rate": 8.760330578512397e-06, "loss": 2.3059, "step": 1060 }, { "epoch": 0.004423050292562042, "grad_norm": 6.00241597585073, "learning_rate": 8.842975206611571e-06, "loss": 2.389, "step": 1070 }, { "epoch": 0.004464387211184116, "grad_norm": 5.649027277167286, "learning_rate": 8.925619834710744e-06, "loss": 2.3371, "step": 1080 }, { "epoch": 0.004505724129806192, "grad_norm": 5.927371810838215, "learning_rate": 9.008264462809918e-06, "loss": 2.3133, "step": 1090 }, { "epoch": 0.004547061048428267, "grad_norm": 5.662885075294936, "learning_rate": 9.090909090909091e-06, "loss": 2.2779, "step": 1100 }, { "epoch": 0.004588397967050342, "grad_norm": 5.654705250045512, "learning_rate": 9.173553719008265e-06, "loss": 2.2234, "step": 1110 }, { "epoch": 0.004629734885672417, "grad_norm": 6.241923587474726, "learning_rate": 9.25619834710744e-06, "loss": 2.2626, "step": 1120 }, { "epoch": 0.004671071804294493, "grad_norm": 5.741893435201511, "learning_rate": 9.338842975206613e-06, "loss": 2.3012, "step": 1130 }, { "epoch": 0.004712408722916567, "grad_norm": 6.034507786920065, "learning_rate": 9.421487603305785e-06, "loss": 2.2682, "step": 1140 }, { "epoch": 0.004753745641538643, "grad_norm": 7.410349347874194, "learning_rate": 9.50413223140496e-06, "loss": 2.2796, "step": 1150 }, { "epoch": 0.004795082560160718, "grad_norm": 5.992102424922784, "learning_rate": 9.586776859504134e-06, "loss": 2.2112, "step": 1160 }, { "epoch": 0.004836419478782793, "grad_norm": 5.453311998034154, "learning_rate": 9.669421487603307e-06, "loss": 2.1744, "step": 1170 }, { "epoch": 0.004877756397404868, "grad_norm": 5.521988823358632, "learning_rate": 9.75206611570248e-06, "loss": 2.2984, "step": 1180 }, { "epoch": 0.004919093316026944, "grad_norm": 6.153893937530345, "learning_rate": 9.834710743801654e-06, "loss": 2.2443, "step": 1190 }, { "epoch": 0.004960430234649018, "grad_norm": 6.5507135206490315, "learning_rate": 9.917355371900828e-06, "loss": 2.245, "step": 1200 }, { "epoch": 0.005001767153271094, "grad_norm": 8.209761096607327, "learning_rate": 1e-05, "loss": 2.1959, "step": 1210 }, { "epoch": 0.005043104071893169, "grad_norm": 4.986264712575914, "learning_rate": 1.0082644628099174e-05, "loss": 2.1612, "step": 1220 }, { "epoch": 0.0050844409905152444, "grad_norm": 6.3381969120868895, "learning_rate": 1.0165289256198348e-05, "loss": 2.187, "step": 1230 }, { "epoch": 0.005125777909137319, "grad_norm": 5.750067641203542, "learning_rate": 1.024793388429752e-05, "loss": 2.2004, "step": 1240 }, { "epoch": 0.0051671148277593944, "grad_norm": 5.826539821613237, "learning_rate": 1.0330578512396693e-05, "loss": 2.1668, "step": 1250 }, { "epoch": 0.00520845174638147, "grad_norm": 6.296936925085496, "learning_rate": 1.041322314049587e-05, "loss": 2.1807, "step": 1260 }, { "epoch": 0.0052497886650035444, "grad_norm": 5.812866932063289, "learning_rate": 1.0495867768595042e-05, "loss": 2.209, "step": 1270 }, { "epoch": 0.00529112558362562, "grad_norm": 5.808144224407848, "learning_rate": 1.0578512396694216e-05, "loss": 2.1807, "step": 1280 }, { "epoch": 0.005332462502247695, "grad_norm": 7.460856083590218, "learning_rate": 1.0661157024793389e-05, "loss": 2.2229, "step": 1290 }, { "epoch": 0.00537379942086977, "grad_norm": 6.980089322389665, "learning_rate": 1.0743801652892562e-05, "loss": 2.175, "step": 1300 }, { "epoch": 0.005415136339491845, "grad_norm": 5.57740557557049, "learning_rate": 1.0826446280991736e-05, "loss": 2.159, "step": 1310 }, { "epoch": 0.005456473258113921, "grad_norm": 6.647266714783434, "learning_rate": 1.0909090909090909e-05, "loss": 2.1214, "step": 1320 }, { "epoch": 0.005497810176735995, "grad_norm": 6.128334205497799, "learning_rate": 1.0991735537190083e-05, "loss": 2.1792, "step": 1330 }, { "epoch": 0.005539147095358071, "grad_norm": 6.483094766646449, "learning_rate": 1.1074380165289258e-05, "loss": 2.2472, "step": 1340 }, { "epoch": 0.005580484013980146, "grad_norm": 5.359049945838656, "learning_rate": 1.1157024793388432e-05, "loss": 2.182, "step": 1350 }, { "epoch": 0.005621820932602221, "grad_norm": 6.6553587609192695, "learning_rate": 1.1239669421487605e-05, "loss": 2.1918, "step": 1360 }, { "epoch": 0.005663157851224296, "grad_norm": 6.105297642757683, "learning_rate": 1.1322314049586777e-05, "loss": 2.1221, "step": 1370 }, { "epoch": 0.005704494769846372, "grad_norm": 5.22407946250878, "learning_rate": 1.1404958677685952e-05, "loss": 2.0898, "step": 1380 }, { "epoch": 0.005745831688468446, "grad_norm": 5.695260375861287, "learning_rate": 1.1487603305785125e-05, "loss": 2.2, "step": 1390 }, { "epoch": 0.005787168607090522, "grad_norm": 5.834677547053157, "learning_rate": 1.1570247933884297e-05, "loss": 2.1191, "step": 1400 }, { "epoch": 0.005828505525712597, "grad_norm": 7.863484441598645, "learning_rate": 1.1652892561983472e-05, "loss": 2.1255, "step": 1410 }, { "epoch": 0.005869842444334672, "grad_norm": 5.295752440326079, "learning_rate": 1.1735537190082646e-05, "loss": 2.1535, "step": 1420 }, { "epoch": 0.005911179362956747, "grad_norm": 6.925687761354192, "learning_rate": 1.181818181818182e-05, "loss": 2.0618, "step": 1430 }, { "epoch": 0.0059525162815788225, "grad_norm": 4.89230568395151, "learning_rate": 1.1900826446280993e-05, "loss": 2.1745, "step": 1440 }, { "epoch": 0.005993853200200897, "grad_norm": 5.795044632849597, "learning_rate": 1.1983471074380166e-05, "loss": 2.1352, "step": 1450 }, { "epoch": 0.0060351901188229725, "grad_norm": 6.43513153980254, "learning_rate": 1.206611570247934e-05, "loss": 2.102, "step": 1460 }, { "epoch": 0.006076527037445048, "grad_norm": 6.46737354415826, "learning_rate": 1.2148760330578513e-05, "loss": 2.0934, "step": 1470 }, { "epoch": 0.0061178639560671225, "grad_norm": 6.202005592277405, "learning_rate": 1.2231404958677686e-05, "loss": 2.1482, "step": 1480 }, { "epoch": 0.006159200874689198, "grad_norm": 5.8071883971926725, "learning_rate": 1.231404958677686e-05, "loss": 2.0553, "step": 1490 }, { "epoch": 0.006200537793311273, "grad_norm": 6.2092050955251334, "learning_rate": 1.2396694214876034e-05, "loss": 2.0836, "step": 1500 }, { "epoch": 0.006241874711933349, "grad_norm": 4.486772138898485, "learning_rate": 1.2479338842975209e-05, "loss": 2.1622, "step": 1510 }, { "epoch": 0.006283211630555423, "grad_norm": 5.229981562060858, "learning_rate": 1.2561983471074381e-05, "loss": 2.0854, "step": 1520 }, { "epoch": 0.006324548549177499, "grad_norm": 5.604269574061805, "learning_rate": 1.2644628099173554e-05, "loss": 2.1604, "step": 1530 }, { "epoch": 0.006365885467799574, "grad_norm": 5.361170550183367, "learning_rate": 1.2727272727272728e-05, "loss": 2.063, "step": 1540 }, { "epoch": 0.006407222386421649, "grad_norm": 7.20018526125173, "learning_rate": 1.2809917355371901e-05, "loss": 2.0935, "step": 1550 }, { "epoch": 0.006448559305043724, "grad_norm": 5.379224740428811, "learning_rate": 1.2892561983471074e-05, "loss": 2.1291, "step": 1560 }, { "epoch": 0.0064898962236658, "grad_norm": 4.967695885199312, "learning_rate": 1.2975206611570248e-05, "loss": 2.102, "step": 1570 }, { "epoch": 0.006531233142287874, "grad_norm": 6.039676530986522, "learning_rate": 1.3057851239669424e-05, "loss": 2.1134, "step": 1580 }, { "epoch": 0.00657257006090995, "grad_norm": 5.459189111144024, "learning_rate": 1.3140495867768597e-05, "loss": 2.0302, "step": 1590 }, { "epoch": 0.006613906979532025, "grad_norm": 6.143839950859222, "learning_rate": 1.322314049586777e-05, "loss": 2.0978, "step": 1600 }, { "epoch": 0.0066552438981541, "grad_norm": 5.6756704061902825, "learning_rate": 1.3305785123966944e-05, "loss": 2.1592, "step": 1610 }, { "epoch": 0.006696580816776175, "grad_norm": 4.994981957965064, "learning_rate": 1.3388429752066117e-05, "loss": 2.1035, "step": 1620 }, { "epoch": 0.0067379177353982505, "grad_norm": 5.853251459350967, "learning_rate": 1.347107438016529e-05, "loss": 2.0701, "step": 1630 }, { "epoch": 0.006779254654020325, "grad_norm": 5.4681573607696965, "learning_rate": 1.3553719008264464e-05, "loss": 2.089, "step": 1640 }, { "epoch": 0.0068205915726424005, "grad_norm": 5.848581304068256, "learning_rate": 1.3636363636363637e-05, "loss": 2.0819, "step": 1650 }, { "epoch": 0.006861928491264476, "grad_norm": 5.481243900559041, "learning_rate": 1.3719008264462813e-05, "loss": 2.0284, "step": 1660 }, { "epoch": 0.0069032654098865505, "grad_norm": 5.699959566604993, "learning_rate": 1.3801652892561985e-05, "loss": 2.0622, "step": 1670 }, { "epoch": 0.006944602328508626, "grad_norm": 4.996648526388665, "learning_rate": 1.3884297520661158e-05, "loss": 2.0071, "step": 1680 }, { "epoch": 0.006985939247130701, "grad_norm": 5.126923483054136, "learning_rate": 1.3966942148760332e-05, "loss": 2.0631, "step": 1690 }, { "epoch": 0.007027276165752776, "grad_norm": 5.883552104251492, "learning_rate": 1.4049586776859505e-05, "loss": 2.0791, "step": 1700 }, { "epoch": 0.007068613084374851, "grad_norm": 4.929514966855435, "learning_rate": 1.4132231404958678e-05, "loss": 2.0364, "step": 1710 }, { "epoch": 0.007109950002996927, "grad_norm": 5.301129760644346, "learning_rate": 1.4214876033057852e-05, "loss": 2.0373, "step": 1720 }, { "epoch": 0.007151286921619001, "grad_norm": 5.523739748516145, "learning_rate": 1.4297520661157025e-05, "loss": 2.1079, "step": 1730 }, { "epoch": 0.007192623840241077, "grad_norm": 5.7887756838227755, "learning_rate": 1.4380165289256201e-05, "loss": 2.049, "step": 1740 }, { "epoch": 0.007233960758863152, "grad_norm": 5.2452853088604865, "learning_rate": 1.4462809917355374e-05, "loss": 2.0686, "step": 1750 }, { "epoch": 0.007275297677485227, "grad_norm": 4.454384214370969, "learning_rate": 1.4545454545454546e-05, "loss": 2.0124, "step": 1760 }, { "epoch": 0.007316634596107302, "grad_norm": 6.397338503442304, "learning_rate": 1.462809917355372e-05, "loss": 2.0152, "step": 1770 }, { "epoch": 0.007357971514729378, "grad_norm": 6.554144037150873, "learning_rate": 1.4710743801652893e-05, "loss": 1.9976, "step": 1780 }, { "epoch": 0.007399308433351453, "grad_norm": 4.973940426595748, "learning_rate": 1.4793388429752066e-05, "loss": 2.0085, "step": 1790 }, { "epoch": 0.007440645351973528, "grad_norm": 5.776375204519037, "learning_rate": 1.487603305785124e-05, "loss": 2.0339, "step": 1800 }, { "epoch": 0.007481982270595603, "grad_norm": 5.472367097758556, "learning_rate": 1.4958677685950413e-05, "loss": 2.0016, "step": 1810 }, { "epoch": 0.0075233191892176785, "grad_norm": 4.850880114898939, "learning_rate": 1.504132231404959e-05, "loss": 1.9952, "step": 1820 }, { "epoch": 0.007564656107839753, "grad_norm": 4.825492061262016, "learning_rate": 1.5123966942148762e-05, "loss": 2.0149, "step": 1830 }, { "epoch": 0.0076059930264618285, "grad_norm": 6.317700924322252, "learning_rate": 1.5206611570247936e-05, "loss": 1.9765, "step": 1840 }, { "epoch": 0.007647329945083904, "grad_norm": 5.831048263887902, "learning_rate": 1.528925619834711e-05, "loss": 1.9669, "step": 1850 }, { "epoch": 0.0076886668637059785, "grad_norm": 5.190457786334756, "learning_rate": 1.5371900826446283e-05, "loss": 2.0342, "step": 1860 }, { "epoch": 0.007730003782328054, "grad_norm": 5.752029895196757, "learning_rate": 1.5454545454545454e-05, "loss": 2.0606, "step": 1870 }, { "epoch": 0.007771340700950129, "grad_norm": 5.005855197604682, "learning_rate": 1.553719008264463e-05, "loss": 2.0764, "step": 1880 }, { "epoch": 0.007812677619572205, "grad_norm": 5.362161895494138, "learning_rate": 1.5619834710743803e-05, "loss": 2.0373, "step": 1890 }, { "epoch": 0.00785401453819428, "grad_norm": 5.589239650428267, "learning_rate": 1.5702479338842978e-05, "loss": 2.0536, "step": 1900 }, { "epoch": 0.007895351456816354, "grad_norm": 5.38085836484136, "learning_rate": 1.5785123966942152e-05, "loss": 2.0357, "step": 1910 }, { "epoch": 0.00793668837543843, "grad_norm": 6.4123494555744065, "learning_rate": 1.5867768595041323e-05, "loss": 1.9992, "step": 1920 }, { "epoch": 0.007978025294060505, "grad_norm": 5.369699763052158, "learning_rate": 1.5950413223140497e-05, "loss": 1.9645, "step": 1930 }, { "epoch": 0.00801936221268258, "grad_norm": 6.655726980448801, "learning_rate": 1.6033057851239672e-05, "loss": 2.0023, "step": 1940 }, { "epoch": 0.008060699131304656, "grad_norm": 5.150395460216213, "learning_rate": 1.6115702479338843e-05, "loss": 1.9953, "step": 1950 }, { "epoch": 0.008102036049926731, "grad_norm": 5.534796132616727, "learning_rate": 1.6198347107438017e-05, "loss": 1.964, "step": 1960 }, { "epoch": 0.008143372968548805, "grad_norm": 5.0714233165065075, "learning_rate": 1.628099173553719e-05, "loss": 1.9942, "step": 1970 }, { "epoch": 0.00818470988717088, "grad_norm": 5.370096628339807, "learning_rate": 1.6363636363636366e-05, "loss": 1.9938, "step": 1980 }, { "epoch": 0.008226046805792956, "grad_norm": 4.816680798680657, "learning_rate": 1.644628099173554e-05, "loss": 1.9023, "step": 1990 }, { "epoch": 0.008267383724415031, "grad_norm": 5.910326143029371, "learning_rate": 1.652892561983471e-05, "loss": 1.9366, "step": 2000 }, { "epoch": 0.008308720643037107, "grad_norm": 5.364682793090204, "learning_rate": 1.6611570247933886e-05, "loss": 1.9835, "step": 2010 }, { "epoch": 0.008350057561659182, "grad_norm": 6.171717096992393, "learning_rate": 1.669421487603306e-05, "loss": 1.9641, "step": 2020 }, { "epoch": 0.008391394480281256, "grad_norm": 4.794750763380389, "learning_rate": 1.677685950413223e-05, "loss": 1.9405, "step": 2030 }, { "epoch": 0.008432731398903331, "grad_norm": 6.3363242673070745, "learning_rate": 1.6859504132231405e-05, "loss": 1.9717, "step": 2040 }, { "epoch": 0.008474068317525407, "grad_norm": 5.10756576497978, "learning_rate": 1.694214876033058e-05, "loss": 1.9312, "step": 2050 }, { "epoch": 0.008515405236147482, "grad_norm": 5.5429121513722945, "learning_rate": 1.7024793388429754e-05, "loss": 1.9692, "step": 2060 }, { "epoch": 0.008556742154769557, "grad_norm": 5.053921879606705, "learning_rate": 1.710743801652893e-05, "loss": 1.9187, "step": 2070 }, { "epoch": 0.008598079073391633, "grad_norm": 5.246682645264326, "learning_rate": 1.71900826446281e-05, "loss": 2.0086, "step": 2080 }, { "epoch": 0.008639415992013707, "grad_norm": 4.651358563124329, "learning_rate": 1.7272727272727274e-05, "loss": 1.9676, "step": 2090 }, { "epoch": 0.008680752910635782, "grad_norm": 5.254574557184252, "learning_rate": 1.735537190082645e-05, "loss": 1.9193, "step": 2100 }, { "epoch": 0.008722089829257857, "grad_norm": 5.5559516380514316, "learning_rate": 1.743801652892562e-05, "loss": 1.9123, "step": 2110 }, { "epoch": 0.008763426747879933, "grad_norm": 5.714609535718523, "learning_rate": 1.7520661157024794e-05, "loss": 1.9831, "step": 2120 }, { "epoch": 0.008804763666502008, "grad_norm": 4.664121459414757, "learning_rate": 1.7603305785123968e-05, "loss": 1.9606, "step": 2130 }, { "epoch": 0.008846100585124084, "grad_norm": 4.9060858638182685, "learning_rate": 1.7685950413223143e-05, "loss": 1.9535, "step": 2140 }, { "epoch": 0.008887437503746157, "grad_norm": 4.997171967559315, "learning_rate": 1.7768595041322317e-05, "loss": 1.9243, "step": 2150 }, { "epoch": 0.008928774422368233, "grad_norm": 4.60645777188567, "learning_rate": 1.7851239669421488e-05, "loss": 1.9291, "step": 2160 }, { "epoch": 0.008970111340990308, "grad_norm": 4.2131519608354004, "learning_rate": 1.7933884297520662e-05, "loss": 1.9105, "step": 2170 }, { "epoch": 0.009011448259612384, "grad_norm": 5.51444703850531, "learning_rate": 1.8016528925619837e-05, "loss": 1.9502, "step": 2180 }, { "epoch": 0.00905278517823446, "grad_norm": 5.2003808089855825, "learning_rate": 1.809917355371901e-05, "loss": 1.9436, "step": 2190 }, { "epoch": 0.009094122096856535, "grad_norm": 4.240179682087964, "learning_rate": 1.8181818181818182e-05, "loss": 1.9194, "step": 2200 }, { "epoch": 0.00913545901547861, "grad_norm": 4.582501074244312, "learning_rate": 1.8264462809917356e-05, "loss": 1.9145, "step": 2210 }, { "epoch": 0.009176795934100684, "grad_norm": 5.362083861786352, "learning_rate": 1.834710743801653e-05, "loss": 1.9165, "step": 2220 }, { "epoch": 0.009218132852722759, "grad_norm": 5.06281875114174, "learning_rate": 1.8429752066115705e-05, "loss": 1.977, "step": 2230 }, { "epoch": 0.009259469771344835, "grad_norm": 4.661496047656461, "learning_rate": 1.851239669421488e-05, "loss": 1.8892, "step": 2240 }, { "epoch": 0.00930080668996691, "grad_norm": 4.735532406310298, "learning_rate": 1.859504132231405e-05, "loss": 1.8689, "step": 2250 }, { "epoch": 0.009342143608588985, "grad_norm": 4.445771479719063, "learning_rate": 1.8677685950413225e-05, "loss": 1.9301, "step": 2260 }, { "epoch": 0.00938348052721106, "grad_norm": 5.100109904664726, "learning_rate": 1.87603305785124e-05, "loss": 1.9309, "step": 2270 }, { "epoch": 0.009424817445833135, "grad_norm": 6.115469535323335, "learning_rate": 1.884297520661157e-05, "loss": 1.918, "step": 2280 }, { "epoch": 0.00946615436445521, "grad_norm": 4.890019742506766, "learning_rate": 1.8925619834710745e-05, "loss": 1.945, "step": 2290 }, { "epoch": 0.009507491283077285, "grad_norm": 5.023798525711054, "learning_rate": 1.900826446280992e-05, "loss": 1.9004, "step": 2300 }, { "epoch": 0.00954882820169936, "grad_norm": 5.652810624249754, "learning_rate": 1.9090909090909094e-05, "loss": 1.9339, "step": 2310 }, { "epoch": 0.009590165120321436, "grad_norm": 6.04847384963266, "learning_rate": 1.9173553719008268e-05, "loss": 1.8687, "step": 2320 }, { "epoch": 0.009631502038943512, "grad_norm": 5.332733823425359, "learning_rate": 1.925619834710744e-05, "loss": 1.8938, "step": 2330 }, { "epoch": 0.009672838957565585, "grad_norm": 4.814331448709232, "learning_rate": 1.9338842975206613e-05, "loss": 1.8752, "step": 2340 }, { "epoch": 0.00971417587618766, "grad_norm": 5.558977499560294, "learning_rate": 1.9421487603305788e-05, "loss": 1.8835, "step": 2350 }, { "epoch": 0.009755512794809736, "grad_norm": 5.554940177569548, "learning_rate": 1.950413223140496e-05, "loss": 1.857, "step": 2360 }, { "epoch": 0.009796849713431812, "grad_norm": 4.970502489086558, "learning_rate": 1.9586776859504133e-05, "loss": 1.8553, "step": 2370 }, { "epoch": 0.009838186632053887, "grad_norm": 4.044099915606779, "learning_rate": 1.9669421487603307e-05, "loss": 1.924, "step": 2380 }, { "epoch": 0.009879523550675963, "grad_norm": 4.880726953238654, "learning_rate": 1.9752066115702482e-05, "loss": 1.9785, "step": 2390 }, { "epoch": 0.009920860469298036, "grad_norm": 5.457077789861094, "learning_rate": 1.9834710743801656e-05, "loss": 1.8585, "step": 2400 }, { "epoch": 0.009962197387920112, "grad_norm": 4.608586390817221, "learning_rate": 1.9917355371900827e-05, "loss": 1.8861, "step": 2410 }, { "epoch": 0.010003534306542187, "grad_norm": 4.5178969512670335, "learning_rate": 2e-05, "loss": 1.8936, "step": 2420 }, { "epoch": 0.010044871225164263, "grad_norm": 5.722004352525454, "learning_rate": 1.999999991396395e-05, "loss": 1.8616, "step": 2430 }, { "epoch": 0.010086208143786338, "grad_norm": 4.99862696301366, "learning_rate": 1.9999999655855794e-05, "loss": 1.8865, "step": 2440 }, { "epoch": 0.010127545062408413, "grad_norm": 5.204994732642035, "learning_rate": 1.9999999225675543e-05, "loss": 1.8602, "step": 2450 }, { "epoch": 0.010168881981030489, "grad_norm": 4.1143956012846505, "learning_rate": 1.9999998623423198e-05, "loss": 1.9101, "step": 2460 }, { "epoch": 0.010210218899652563, "grad_norm": 5.535771463118041, "learning_rate": 1.9999997849098773e-05, "loss": 1.8596, "step": 2470 }, { "epoch": 0.010251555818274638, "grad_norm": 5.020430211409416, "learning_rate": 1.999999690270228e-05, "loss": 1.8393, "step": 2480 }, { "epoch": 0.010292892736896713, "grad_norm": 5.571737674116448, "learning_rate": 1.999999578423374e-05, "loss": 1.8987, "step": 2490 }, { "epoch": 0.010334229655518789, "grad_norm": 4.336614412280944, "learning_rate": 1.9999994493693165e-05, "loss": 1.9194, "step": 2500 }, { "epoch": 0.010375566574140864, "grad_norm": 4.815853586635344, "learning_rate": 1.999999303108058e-05, "loss": 1.8332, "step": 2510 }, { "epoch": 0.01041690349276294, "grad_norm": 4.559920874208704, "learning_rate": 1.9999991396396014e-05, "loss": 1.8818, "step": 2520 }, { "epoch": 0.010458240411385013, "grad_norm": 5.23388810362715, "learning_rate": 1.9999989589639487e-05, "loss": 1.8302, "step": 2530 }, { "epoch": 0.010499577330007089, "grad_norm": 5.000301577463503, "learning_rate": 1.999998761081104e-05, "loss": 1.8657, "step": 2540 }, { "epoch": 0.010540914248629164, "grad_norm": 4.448533864801871, "learning_rate": 1.9999985459910698e-05, "loss": 1.8762, "step": 2550 }, { "epoch": 0.01058225116725124, "grad_norm": 4.715035035883112, "learning_rate": 1.9999983136938504e-05, "loss": 1.8511, "step": 2560 }, { "epoch": 0.010623588085873315, "grad_norm": 4.025529484549816, "learning_rate": 1.9999980641894497e-05, "loss": 1.8458, "step": 2570 }, { "epoch": 0.01066492500449539, "grad_norm": 4.754000032727581, "learning_rate": 1.9999977974778715e-05, "loss": 1.8714, "step": 2580 }, { "epoch": 0.010706261923117464, "grad_norm": 4.930978688660729, "learning_rate": 1.999997513559121e-05, "loss": 1.8656, "step": 2590 }, { "epoch": 0.01074759884173954, "grad_norm": 4.4132729377261475, "learning_rate": 1.9999972124332028e-05, "loss": 1.9383, "step": 2600 }, { "epoch": 0.010788935760361615, "grad_norm": 4.4540551253199325, "learning_rate": 1.9999968941001225e-05, "loss": 1.8426, "step": 2610 }, { "epoch": 0.01083027267898369, "grad_norm": 4.797250042059473, "learning_rate": 1.999996558559885e-05, "loss": 1.8634, "step": 2620 }, { "epoch": 0.010871609597605766, "grad_norm": 5.456931710111963, "learning_rate": 1.999996205812496e-05, "loss": 1.825, "step": 2630 }, { "epoch": 0.010912946516227841, "grad_norm": 4.377649523138056, "learning_rate": 1.999995835857962e-05, "loss": 1.8545, "step": 2640 }, { "epoch": 0.010954283434849915, "grad_norm": 4.5317328844732145, "learning_rate": 1.9999954486962893e-05, "loss": 1.8774, "step": 2650 }, { "epoch": 0.01099562035347199, "grad_norm": 4.709498283906347, "learning_rate": 1.9999950443274847e-05, "loss": 1.8083, "step": 2660 }, { "epoch": 0.011036957272094066, "grad_norm": 4.592327748002219, "learning_rate": 1.9999946227515547e-05, "loss": 1.792, "step": 2670 }, { "epoch": 0.011078294190716141, "grad_norm": 5.036724935294618, "learning_rate": 1.999994183968507e-05, "loss": 1.8779, "step": 2680 }, { "epoch": 0.011119631109338217, "grad_norm": 6.694409030503598, "learning_rate": 1.999993727978349e-05, "loss": 1.8573, "step": 2690 }, { "epoch": 0.011160968027960292, "grad_norm": 5.018981836140064, "learning_rate": 1.9999932547810883e-05, "loss": 1.8726, "step": 2700 }, { "epoch": 0.011202304946582366, "grad_norm": 4.451864521641474, "learning_rate": 1.9999927643767332e-05, "loss": 1.8193, "step": 2710 }, { "epoch": 0.011243641865204441, "grad_norm": 5.036890897058994, "learning_rate": 1.999992256765292e-05, "loss": 1.8594, "step": 2720 }, { "epoch": 0.011284978783826517, "grad_norm": 4.7545987502372755, "learning_rate": 1.999991731946774e-05, "loss": 1.9158, "step": 2730 }, { "epoch": 0.011326315702448592, "grad_norm": 3.9156550800432783, "learning_rate": 1.999991189921188e-05, "loss": 1.8166, "step": 2740 }, { "epoch": 0.011367652621070668, "grad_norm": 4.622686377530181, "learning_rate": 1.999990630688543e-05, "loss": 1.8426, "step": 2750 }, { "epoch": 0.011408989539692743, "grad_norm": 4.176720366120709, "learning_rate": 1.9999900542488487e-05, "loss": 1.8701, "step": 2760 }, { "epoch": 0.011450326458314819, "grad_norm": 4.588055146989058, "learning_rate": 1.999989460602115e-05, "loss": 1.8474, "step": 2770 }, { "epoch": 0.011491663376936892, "grad_norm": 4.7632605353618604, "learning_rate": 1.9999888497483523e-05, "loss": 1.7611, "step": 2780 }, { "epoch": 0.011533000295558968, "grad_norm": 5.168047411415939, "learning_rate": 1.9999882216875714e-05, "loss": 1.8297, "step": 2790 }, { "epoch": 0.011574337214181043, "grad_norm": 5.6032261833368215, "learning_rate": 1.9999875764197824e-05, "loss": 1.8273, "step": 2800 }, { "epoch": 0.011615674132803119, "grad_norm": 4.836606306201456, "learning_rate": 1.9999869139449965e-05, "loss": 1.8067, "step": 2810 }, { "epoch": 0.011657011051425194, "grad_norm": 5.371156408385522, "learning_rate": 1.9999862342632258e-05, "loss": 1.7726, "step": 2820 }, { "epoch": 0.01169834797004727, "grad_norm": 4.715562111195242, "learning_rate": 1.9999855373744813e-05, "loss": 1.8257, "step": 2830 }, { "epoch": 0.011739684888669343, "grad_norm": 4.672226047314074, "learning_rate": 1.9999848232787753e-05, "loss": 1.807, "step": 2840 }, { "epoch": 0.011781021807291419, "grad_norm": 5.305211095175868, "learning_rate": 1.9999840919761202e-05, "loss": 1.8398, "step": 2850 }, { "epoch": 0.011822358725913494, "grad_norm": 4.973800529744849, "learning_rate": 1.9999833434665282e-05, "loss": 1.8028, "step": 2860 }, { "epoch": 0.01186369564453557, "grad_norm": 4.580336749750151, "learning_rate": 1.9999825777500127e-05, "loss": 1.7559, "step": 2870 }, { "epoch": 0.011905032563157645, "grad_norm": 5.203176556084249, "learning_rate": 1.999981794826586e-05, "loss": 1.8375, "step": 2880 }, { "epoch": 0.01194636948177972, "grad_norm": 5.810430258629928, "learning_rate": 1.9999809946962627e-05, "loss": 1.8126, "step": 2890 }, { "epoch": 0.011987706400401794, "grad_norm": 5.7480488342439955, "learning_rate": 1.9999801773590556e-05, "loss": 1.8228, "step": 2900 }, { "epoch": 0.01202904331902387, "grad_norm": 4.946108636349945, "learning_rate": 1.9999793428149793e-05, "loss": 1.7801, "step": 2910 }, { "epoch": 0.012070380237645945, "grad_norm": 4.686375909907021, "learning_rate": 1.9999784910640484e-05, "loss": 1.7595, "step": 2920 }, { "epoch": 0.01211171715626802, "grad_norm": 5.2352360374303135, "learning_rate": 1.9999776221062767e-05, "loss": 1.8413, "step": 2930 }, { "epoch": 0.012153054074890096, "grad_norm": 4.509401680547479, "learning_rate": 1.99997673594168e-05, "loss": 1.7973, "step": 2940 }, { "epoch": 0.012194390993512171, "grad_norm": 4.614511294927466, "learning_rate": 1.9999758325702728e-05, "loss": 1.8206, "step": 2950 }, { "epoch": 0.012235727912134245, "grad_norm": 6.185921445660834, "learning_rate": 1.9999749119920714e-05, "loss": 1.8462, "step": 2960 }, { "epoch": 0.01227706483075632, "grad_norm": 4.174924494562577, "learning_rate": 1.999973974207091e-05, "loss": 1.8036, "step": 2970 }, { "epoch": 0.012318401749378396, "grad_norm": 4.836665186633954, "learning_rate": 1.9999730192153483e-05, "loss": 1.8517, "step": 2980 }, { "epoch": 0.012359738668000471, "grad_norm": 5.384960643252126, "learning_rate": 1.999972047016859e-05, "loss": 1.8159, "step": 2990 }, { "epoch": 0.012401075586622547, "grad_norm": 5.021462883098841, "learning_rate": 1.9999710576116403e-05, "loss": 1.7985, "step": 3000 }, { "epoch": 0.012442412505244622, "grad_norm": 5.427243361920921, "learning_rate": 1.99997005099971e-05, "loss": 1.7765, "step": 3010 }, { "epoch": 0.012483749423866698, "grad_norm": 4.491526906719712, "learning_rate": 1.999969027181084e-05, "loss": 1.8183, "step": 3020 }, { "epoch": 0.012525086342488771, "grad_norm": 4.559161527860771, "learning_rate": 1.9999679861557804e-05, "loss": 1.724, "step": 3030 }, { "epoch": 0.012566423261110847, "grad_norm": 4.161439649907769, "learning_rate": 1.9999669279238173e-05, "loss": 1.7683, "step": 3040 }, { "epoch": 0.012607760179732922, "grad_norm": 4.56366674854018, "learning_rate": 1.999965852485213e-05, "loss": 1.786, "step": 3050 }, { "epoch": 0.012649097098354998, "grad_norm": 5.245429404102266, "learning_rate": 1.999964759839986e-05, "loss": 1.7822, "step": 3060 }, { "epoch": 0.012690434016977073, "grad_norm": 4.6630532976211825, "learning_rate": 1.9999636499881548e-05, "loss": 1.7466, "step": 3070 }, { "epoch": 0.012731770935599148, "grad_norm": 4.060133282127772, "learning_rate": 1.9999625229297385e-05, "loss": 1.795, "step": 3080 }, { "epoch": 0.012773107854221222, "grad_norm": 3.9035653210065644, "learning_rate": 1.9999613786647568e-05, "loss": 1.7644, "step": 3090 }, { "epoch": 0.012814444772843298, "grad_norm": 4.309261522039182, "learning_rate": 1.9999602171932292e-05, "loss": 1.7843, "step": 3100 }, { "epoch": 0.012855781691465373, "grad_norm": 4.849350440475919, "learning_rate": 1.999959038515176e-05, "loss": 1.7703, "step": 3110 }, { "epoch": 0.012897118610087448, "grad_norm": 4.275025549538348, "learning_rate": 1.999957842630617e-05, "loss": 1.7714, "step": 3120 }, { "epoch": 0.012938455528709524, "grad_norm": 5.173686081149218, "learning_rate": 1.9999566295395728e-05, "loss": 1.7638, "step": 3130 }, { "epoch": 0.0129797924473316, "grad_norm": 4.908518222034618, "learning_rate": 1.999955399242065e-05, "loss": 1.8008, "step": 3140 }, { "epoch": 0.013021129365953673, "grad_norm": 4.14921313541257, "learning_rate": 1.9999541517381137e-05, "loss": 1.7741, "step": 3150 }, { "epoch": 0.013062466284575748, "grad_norm": 4.543722393877187, "learning_rate": 1.9999528870277412e-05, "loss": 1.7949, "step": 3160 }, { "epoch": 0.013103803203197824, "grad_norm": 4.410976439510873, "learning_rate": 1.9999516051109688e-05, "loss": 1.7547, "step": 3170 }, { "epoch": 0.0131451401218199, "grad_norm": 5.705194883861194, "learning_rate": 1.9999503059878188e-05, "loss": 1.7513, "step": 3180 }, { "epoch": 0.013186477040441975, "grad_norm": 4.65186813408292, "learning_rate": 1.999948989658313e-05, "loss": 1.7664, "step": 3190 }, { "epoch": 0.01322781395906405, "grad_norm": 5.20074413082596, "learning_rate": 1.9999476561224754e-05, "loss": 1.7545, "step": 3200 }, { "epoch": 0.013269150877686124, "grad_norm": 3.9975643391331745, "learning_rate": 1.9999463053803275e-05, "loss": 1.7175, "step": 3210 }, { "epoch": 0.0133104877963082, "grad_norm": 4.798261065065511, "learning_rate": 1.9999449374318934e-05, "loss": 1.7464, "step": 3220 }, { "epoch": 0.013351824714930275, "grad_norm": 4.858469902498838, "learning_rate": 1.9999435522771963e-05, "loss": 1.7568, "step": 3230 }, { "epoch": 0.01339316163355235, "grad_norm": 4.236662694907985, "learning_rate": 1.99994214991626e-05, "loss": 1.7335, "step": 3240 }, { "epoch": 0.013434498552174426, "grad_norm": 4.6188062645939585, "learning_rate": 1.9999407303491085e-05, "loss": 1.7529, "step": 3250 }, { "epoch": 0.013475835470796501, "grad_norm": 4.77521754475989, "learning_rate": 1.9999392935757668e-05, "loss": 1.7734, "step": 3260 }, { "epoch": 0.013517172389418576, "grad_norm": 6.027108769658543, "learning_rate": 1.999937839596259e-05, "loss": 1.8136, "step": 3270 }, { "epoch": 0.01355850930804065, "grad_norm": 4.163761649197771, "learning_rate": 1.9999363684106105e-05, "loss": 1.7085, "step": 3280 }, { "epoch": 0.013599846226662726, "grad_norm": 3.916493440655603, "learning_rate": 1.9999348800188466e-05, "loss": 1.7815, "step": 3290 }, { "epoch": 0.013641183145284801, "grad_norm": 4.397530066361572, "learning_rate": 1.9999333744209924e-05, "loss": 1.7759, "step": 3300 }, { "epoch": 0.013682520063906876, "grad_norm": 4.839462185241853, "learning_rate": 1.9999318516170747e-05, "loss": 1.7548, "step": 3310 }, { "epoch": 0.013723856982528952, "grad_norm": 4.759012044819632, "learning_rate": 1.999930311607119e-05, "loss": 1.7815, "step": 3320 }, { "epoch": 0.013765193901151027, "grad_norm": 4.1799473470272295, "learning_rate": 1.9999287543911522e-05, "loss": 1.7907, "step": 3330 }, { "epoch": 0.013806530819773101, "grad_norm": 4.454633377063746, "learning_rate": 1.9999271799692006e-05, "loss": 1.7579, "step": 3340 }, { "epoch": 0.013847867738395176, "grad_norm": 4.997867503776301, "learning_rate": 1.999925588341292e-05, "loss": 1.7335, "step": 3350 }, { "epoch": 0.013889204657017252, "grad_norm": 4.345433706332678, "learning_rate": 1.999923979507453e-05, "loss": 1.7124, "step": 3360 }, { "epoch": 0.013930541575639327, "grad_norm": 4.531985231521044, "learning_rate": 1.999922353467712e-05, "loss": 1.758, "step": 3370 }, { "epoch": 0.013971878494261403, "grad_norm": 4.399801080955952, "learning_rate": 1.9999207102220962e-05, "loss": 1.7065, "step": 3380 }, { "epoch": 0.014013215412883478, "grad_norm": 5.059800651377326, "learning_rate": 1.999919049770635e-05, "loss": 1.693, "step": 3390 }, { "epoch": 0.014054552331505552, "grad_norm": 4.260000291303237, "learning_rate": 1.9999173721133557e-05, "loss": 1.7488, "step": 3400 }, { "epoch": 0.014095889250127627, "grad_norm": 4.5331171056345605, "learning_rate": 1.999915677250288e-05, "loss": 1.7046, "step": 3410 }, { "epoch": 0.014137226168749703, "grad_norm": 4.187185061547482, "learning_rate": 1.999913965181461e-05, "loss": 1.7013, "step": 3420 }, { "epoch": 0.014178563087371778, "grad_norm": 4.429191188349303, "learning_rate": 1.999912235906904e-05, "loss": 1.7127, "step": 3430 }, { "epoch": 0.014219900005993854, "grad_norm": 4.083346883074332, "learning_rate": 1.9999104894266466e-05, "loss": 1.7571, "step": 3440 }, { "epoch": 0.014261236924615929, "grad_norm": 4.424685185794806, "learning_rate": 1.999908725740719e-05, "loss": 1.7563, "step": 3450 }, { "epoch": 0.014302573843238003, "grad_norm": 4.285454928033791, "learning_rate": 1.9999069448491516e-05, "loss": 1.7547, "step": 3460 }, { "epoch": 0.014343910761860078, "grad_norm": 4.424718664433471, "learning_rate": 1.999905146751975e-05, "loss": 1.7374, "step": 3470 }, { "epoch": 0.014385247680482154, "grad_norm": 4.075077717271962, "learning_rate": 1.99990333144922e-05, "loss": 1.7661, "step": 3480 }, { "epoch": 0.014426584599104229, "grad_norm": 4.538474712661468, "learning_rate": 1.999901498940918e-05, "loss": 1.7118, "step": 3490 }, { "epoch": 0.014467921517726304, "grad_norm": 4.435775318213515, "learning_rate": 1.9998996492271007e-05, "loss": 1.7368, "step": 3500 }, { "epoch": 0.01450925843634838, "grad_norm": 4.545629750120307, "learning_rate": 1.9998977823077998e-05, "loss": 1.7335, "step": 3510 }, { "epoch": 0.014550595354970454, "grad_norm": 4.359608762821868, "learning_rate": 1.9998958981830473e-05, "loss": 1.7318, "step": 3520 }, { "epoch": 0.014591932273592529, "grad_norm": 4.453842525389737, "learning_rate": 1.9998939968528754e-05, "loss": 1.7499, "step": 3530 }, { "epoch": 0.014633269192214604, "grad_norm": 5.088846901725583, "learning_rate": 1.9998920783173172e-05, "loss": 1.7555, "step": 3540 }, { "epoch": 0.01467460611083668, "grad_norm": 4.1590343693668395, "learning_rate": 1.9998901425764057e-05, "loss": 1.7386, "step": 3550 }, { "epoch": 0.014715943029458755, "grad_norm": 4.181140524329235, "learning_rate": 1.9998881896301744e-05, "loss": 1.6455, "step": 3560 }, { "epoch": 0.01475727994808083, "grad_norm": 4.228896471972448, "learning_rate": 1.999886219478656e-05, "loss": 1.7282, "step": 3570 }, { "epoch": 0.014798616866702906, "grad_norm": 3.9899831004408526, "learning_rate": 1.9998842321218855e-05, "loss": 1.7201, "step": 3580 }, { "epoch": 0.01483995378532498, "grad_norm": 3.9178031007246408, "learning_rate": 1.9998822275598964e-05, "loss": 1.6812, "step": 3590 }, { "epoch": 0.014881290703947055, "grad_norm": 4.3808976089497484, "learning_rate": 1.9998802057927236e-05, "loss": 1.7175, "step": 3600 }, { "epoch": 0.01492262762256913, "grad_norm": 3.9780395209303197, "learning_rate": 1.9998781668204015e-05, "loss": 1.7351, "step": 3610 }, { "epoch": 0.014963964541191206, "grad_norm": 5.378812354806347, "learning_rate": 1.9998761106429655e-05, "loss": 1.7092, "step": 3620 }, { "epoch": 0.015005301459813282, "grad_norm": 3.9422939515447246, "learning_rate": 1.999874037260451e-05, "loss": 1.7261, "step": 3630 }, { "epoch": 0.015046638378435357, "grad_norm": 4.442422033504748, "learning_rate": 1.9998719466728934e-05, "loss": 1.7027, "step": 3640 }, { "epoch": 0.01508797529705743, "grad_norm": 4.102984271689072, "learning_rate": 1.9998698388803288e-05, "loss": 1.6741, "step": 3650 }, { "epoch": 0.015129312215679506, "grad_norm": 3.9608491048290615, "learning_rate": 1.9998677138827934e-05, "loss": 1.7542, "step": 3660 }, { "epoch": 0.015170649134301582, "grad_norm": 4.561629575046756, "learning_rate": 1.999865571680324e-05, "loss": 1.6785, "step": 3670 }, { "epoch": 0.015211986052923657, "grad_norm": 4.4640127715057885, "learning_rate": 1.9998634122729573e-05, "loss": 1.7, "step": 3680 }, { "epoch": 0.015253322971545732, "grad_norm": 3.8935864417828188, "learning_rate": 1.9998612356607303e-05, "loss": 1.6939, "step": 3690 }, { "epoch": 0.015294659890167808, "grad_norm": 5.011277234521909, "learning_rate": 1.9998590418436808e-05, "loss": 1.7019, "step": 3700 }, { "epoch": 0.015335996808789882, "grad_norm": 4.107354033282244, "learning_rate": 1.9998568308218465e-05, "loss": 1.6637, "step": 3710 }, { "epoch": 0.015377333727411957, "grad_norm": 5.3480918453617905, "learning_rate": 1.999854602595265e-05, "loss": 1.7322, "step": 3720 }, { "epoch": 0.015418670646034032, "grad_norm": 4.443648241332512, "learning_rate": 1.9998523571639752e-05, "loss": 1.6794, "step": 3730 }, { "epoch": 0.015460007564656108, "grad_norm": 3.4677507775480025, "learning_rate": 1.999850094528015e-05, "loss": 1.6943, "step": 3740 }, { "epoch": 0.015501344483278183, "grad_norm": 4.306434811794374, "learning_rate": 1.9998478146874244e-05, "loss": 1.6996, "step": 3750 }, { "epoch": 0.015542681401900259, "grad_norm": 5.783322294479809, "learning_rate": 1.9998455176422423e-05, "loss": 1.7071, "step": 3760 }, { "epoch": 0.015584018320522332, "grad_norm": 5.907422561947855, "learning_rate": 1.999843203392507e-05, "loss": 1.7736, "step": 3770 }, { "epoch": 0.01562535523914441, "grad_norm": 4.114299347318918, "learning_rate": 1.9998408719382602e-05, "loss": 1.7068, "step": 3780 }, { "epoch": 0.015666692157766483, "grad_norm": 4.897826252389082, "learning_rate": 1.999838523279541e-05, "loss": 1.6542, "step": 3790 }, { "epoch": 0.01570802907638856, "grad_norm": 4.387711122090114, "learning_rate": 1.9998361574163897e-05, "loss": 1.7202, "step": 3800 }, { "epoch": 0.015749365995010634, "grad_norm": 4.249935651772863, "learning_rate": 1.999833774348847e-05, "loss": 1.6871, "step": 3810 }, { "epoch": 0.015790702913632708, "grad_norm": 4.961734747800958, "learning_rate": 1.9998313740769547e-05, "loss": 1.7012, "step": 3820 }, { "epoch": 0.015832039832254785, "grad_norm": 4.247988360660198, "learning_rate": 1.9998289566007535e-05, "loss": 1.684, "step": 3830 }, { "epoch": 0.01587337675087686, "grad_norm": 5.434305269506113, "learning_rate": 1.999826521920285e-05, "loss": 1.7673, "step": 3840 }, { "epoch": 0.015914713669498936, "grad_norm": 4.617133742171007, "learning_rate": 1.999824070035591e-05, "loss": 1.6622, "step": 3850 }, { "epoch": 0.01595605058812101, "grad_norm": 3.70746479523289, "learning_rate": 1.9998216009467136e-05, "loss": 1.6647, "step": 3860 }, { "epoch": 0.015997387506743083, "grad_norm": 4.604026510578186, "learning_rate": 1.999819114653696e-05, "loss": 1.6772, "step": 3870 }, { "epoch": 0.01603872442536516, "grad_norm": 3.8606213125382642, "learning_rate": 1.9998166111565804e-05, "loss": 1.694, "step": 3880 }, { "epoch": 0.016080061343987234, "grad_norm": 5.0244652420608045, "learning_rate": 1.99981409045541e-05, "loss": 1.7797, "step": 3890 }, { "epoch": 0.01612139826260931, "grad_norm": 4.707739461519922, "learning_rate": 1.999811552550228e-05, "loss": 1.7159, "step": 3900 }, { "epoch": 0.016162735181231385, "grad_norm": 3.9677147576335043, "learning_rate": 1.9998089974410782e-05, "loss": 1.6708, "step": 3910 }, { "epoch": 0.016204072099853462, "grad_norm": 4.311084704937728, "learning_rate": 1.9998064251280048e-05, "loss": 1.7109, "step": 3920 }, { "epoch": 0.016245409018475536, "grad_norm": 3.9457174661249534, "learning_rate": 1.999803835611052e-05, "loss": 1.6713, "step": 3930 }, { "epoch": 0.01628674593709761, "grad_norm": 3.947531059176682, "learning_rate": 1.999801228890264e-05, "loss": 1.6796, "step": 3940 }, { "epoch": 0.016328082855719687, "grad_norm": 4.14663907999712, "learning_rate": 1.9997986049656858e-05, "loss": 1.6452, "step": 3950 }, { "epoch": 0.01636941977434176, "grad_norm": 3.897276226226099, "learning_rate": 1.9997959638373626e-05, "loss": 1.6507, "step": 3960 }, { "epoch": 0.016410756692963838, "grad_norm": 3.778326978683171, "learning_rate": 1.9997933055053402e-05, "loss": 1.7378, "step": 3970 }, { "epoch": 0.01645209361158591, "grad_norm": 4.014730222130603, "learning_rate": 1.9997906299696635e-05, "loss": 1.6651, "step": 3980 }, { "epoch": 0.016493430530207985, "grad_norm": 3.8164751076978223, "learning_rate": 1.9997879372303797e-05, "loss": 1.7007, "step": 3990 }, { "epoch": 0.016534767448830062, "grad_norm": 3.922371704332535, "learning_rate": 1.999785227287534e-05, "loss": 1.7161, "step": 4000 }, { "epoch": 0.016576104367452136, "grad_norm": 3.934785675300376, "learning_rate": 1.9997825001411738e-05, "loss": 1.6704, "step": 4010 }, { "epoch": 0.016617441286074213, "grad_norm": 4.564033996587743, "learning_rate": 1.9997797557913455e-05, "loss": 1.6918, "step": 4020 }, { "epoch": 0.016658778204696287, "grad_norm": 4.4245567390274365, "learning_rate": 1.9997769942380968e-05, "loss": 1.7143, "step": 4030 }, { "epoch": 0.016700115123318364, "grad_norm": 3.8624198473379874, "learning_rate": 1.9997742154814744e-05, "loss": 1.7298, "step": 4040 }, { "epoch": 0.016741452041940438, "grad_norm": 4.010446146589402, "learning_rate": 1.9997714195215275e-05, "loss": 1.6851, "step": 4050 }, { "epoch": 0.01678278896056251, "grad_norm": 4.139527737935189, "learning_rate": 1.9997686063583028e-05, "loss": 1.6597, "step": 4060 }, { "epoch": 0.01682412587918459, "grad_norm": 3.617422879629344, "learning_rate": 1.9997657759918498e-05, "loss": 1.7078, "step": 4070 }, { "epoch": 0.016865462797806662, "grad_norm": 4.492323213426353, "learning_rate": 1.9997629284222165e-05, "loss": 1.6521, "step": 4080 }, { "epoch": 0.01690679971642874, "grad_norm": 5.007903819964739, "learning_rate": 1.999760063649452e-05, "loss": 1.6694, "step": 4090 }, { "epoch": 0.016948136635050813, "grad_norm": 4.960862868620129, "learning_rate": 1.999757181673606e-05, "loss": 1.68, "step": 4100 }, { "epoch": 0.01698947355367289, "grad_norm": 5.878432740559922, "learning_rate": 1.9997542824947276e-05, "loss": 1.6736, "step": 4110 }, { "epoch": 0.017030810472294964, "grad_norm": 4.440326929426054, "learning_rate": 1.999751366112867e-05, "loss": 1.6335, "step": 4120 }, { "epoch": 0.017072147390917038, "grad_norm": 4.263618522816504, "learning_rate": 1.999748432528074e-05, "loss": 1.7186, "step": 4130 }, { "epoch": 0.017113484309539115, "grad_norm": 4.292363992231819, "learning_rate": 1.9997454817403996e-05, "loss": 1.6416, "step": 4140 }, { "epoch": 0.01715482122816119, "grad_norm": 4.013314862106662, "learning_rate": 1.9997425137498944e-05, "loss": 1.723, "step": 4150 }, { "epoch": 0.017196158146783266, "grad_norm": 4.07382683143937, "learning_rate": 1.999739528556609e-05, "loss": 1.6604, "step": 4160 }, { "epoch": 0.01723749506540534, "grad_norm": 4.533516304139438, "learning_rate": 1.9997365261605957e-05, "loss": 1.6683, "step": 4170 }, { "epoch": 0.017278831984027413, "grad_norm": 5.114666733039835, "learning_rate": 1.999733506561905e-05, "loss": 1.6925, "step": 4180 }, { "epoch": 0.01732016890264949, "grad_norm": 3.895641699630939, "learning_rate": 1.99973046976059e-05, "loss": 1.6743, "step": 4190 }, { "epoch": 0.017361505821271564, "grad_norm": 3.9125805892169465, "learning_rate": 1.9997274157567025e-05, "loss": 1.6823, "step": 4200 }, { "epoch": 0.01740284273989364, "grad_norm": 4.530982763817902, "learning_rate": 1.999724344550295e-05, "loss": 1.666, "step": 4210 }, { "epoch": 0.017444179658515715, "grad_norm": 4.806928145874966, "learning_rate": 1.9997212561414198e-05, "loss": 1.7254, "step": 4220 }, { "epoch": 0.017485516577137792, "grad_norm": 3.9697720655534483, "learning_rate": 1.999718150530131e-05, "loss": 1.6241, "step": 4230 }, { "epoch": 0.017526853495759866, "grad_norm": 4.257480914158059, "learning_rate": 1.9997150277164815e-05, "loss": 1.6346, "step": 4240 }, { "epoch": 0.01756819041438194, "grad_norm": 3.799531767116148, "learning_rate": 1.999711887700525e-05, "loss": 1.6296, "step": 4250 }, { "epoch": 0.017609527333004017, "grad_norm": 3.802902072405634, "learning_rate": 1.999708730482316e-05, "loss": 1.6296, "step": 4260 }, { "epoch": 0.01765086425162609, "grad_norm": 5.118064089629252, "learning_rate": 1.9997055560619082e-05, "loss": 1.643, "step": 4270 }, { "epoch": 0.017692201170248167, "grad_norm": 4.227158901611068, "learning_rate": 1.9997023644393567e-05, "loss": 1.6698, "step": 4280 }, { "epoch": 0.01773353808887024, "grad_norm": 4.238927562799819, "learning_rate": 1.9996991556147166e-05, "loss": 1.653, "step": 4290 }, { "epoch": 0.017774875007492315, "grad_norm": 4.204830304370112, "learning_rate": 1.9996959295880423e-05, "loss": 1.6844, "step": 4300 }, { "epoch": 0.017816211926114392, "grad_norm": 4.097133417277415, "learning_rate": 1.99969268635939e-05, "loss": 1.6212, "step": 4310 }, { "epoch": 0.017857548844736466, "grad_norm": 4.65335395814053, "learning_rate": 1.999689425928815e-05, "loss": 1.6882, "step": 4320 }, { "epoch": 0.017898885763358543, "grad_norm": 4.112571966210029, "learning_rate": 1.999686148296374e-05, "loss": 1.6929, "step": 4330 }, { "epoch": 0.017940222681980617, "grad_norm": 5.088602258322444, "learning_rate": 1.999682853462123e-05, "loss": 1.6648, "step": 4340 }, { "epoch": 0.017981559600602694, "grad_norm": 3.9480572889086147, "learning_rate": 1.9996795414261186e-05, "loss": 1.5896, "step": 4350 }, { "epoch": 0.018022896519224767, "grad_norm": 4.8104711694243, "learning_rate": 1.9996762121884186e-05, "loss": 1.6709, "step": 4360 }, { "epoch": 0.01806423343784684, "grad_norm": 5.388396623467715, "learning_rate": 1.999672865749079e-05, "loss": 1.6716, "step": 4370 }, { "epoch": 0.01810557035646892, "grad_norm": 4.279793170082693, "learning_rate": 1.9996695021081584e-05, "loss": 1.632, "step": 4380 }, { "epoch": 0.018146907275090992, "grad_norm": 4.624743597271427, "learning_rate": 1.999666121265714e-05, "loss": 1.6054, "step": 4390 }, { "epoch": 0.01818824419371307, "grad_norm": 4.133320200289432, "learning_rate": 1.9996627232218048e-05, "loss": 1.6418, "step": 4400 }, { "epoch": 0.018229581112335143, "grad_norm": 4.0963463496824986, "learning_rate": 1.9996593079764884e-05, "loss": 1.6683, "step": 4410 }, { "epoch": 0.01827091803095722, "grad_norm": 4.03547359932741, "learning_rate": 1.9996558755298238e-05, "loss": 1.5996, "step": 4420 }, { "epoch": 0.018312254949579294, "grad_norm": 4.156363997210419, "learning_rate": 1.9996524258818706e-05, "loss": 1.6471, "step": 4430 }, { "epoch": 0.018353591868201367, "grad_norm": 4.075479637959615, "learning_rate": 1.9996489590326874e-05, "loss": 1.5989, "step": 4440 }, { "epoch": 0.018394928786823445, "grad_norm": 4.63601174765512, "learning_rate": 1.9996454749823345e-05, "loss": 1.6642, "step": 4450 }, { "epoch": 0.018436265705445518, "grad_norm": 3.760851338042477, "learning_rate": 1.9996419737308715e-05, "loss": 1.6579, "step": 4460 }, { "epoch": 0.018477602624067595, "grad_norm": 3.979536768168784, "learning_rate": 1.9996384552783588e-05, "loss": 1.6006, "step": 4470 }, { "epoch": 0.01851893954268967, "grad_norm": 4.246767902971398, "learning_rate": 1.9996349196248563e-05, "loss": 1.6715, "step": 4480 }, { "epoch": 0.018560276461311743, "grad_norm": 4.26779353731614, "learning_rate": 1.999631366770426e-05, "loss": 1.6859, "step": 4490 }, { "epoch": 0.01860161337993382, "grad_norm": 4.049582440808523, "learning_rate": 1.9996277967151283e-05, "loss": 1.6882, "step": 4500 }, { "epoch": 0.018642950298555894, "grad_norm": 4.066185344313316, "learning_rate": 1.9996242094590248e-05, "loss": 1.6601, "step": 4510 }, { "epoch": 0.01868428721717797, "grad_norm": 3.7309702600230494, "learning_rate": 1.9996206050021768e-05, "loss": 1.6453, "step": 4520 }, { "epoch": 0.018725624135800045, "grad_norm": 4.307728435051617, "learning_rate": 1.9996169833446473e-05, "loss": 1.6728, "step": 4530 }, { "epoch": 0.01876696105442212, "grad_norm": 3.892468749865279, "learning_rate": 1.9996133444864974e-05, "loss": 1.6996, "step": 4540 }, { "epoch": 0.018808297973044195, "grad_norm": 4.172694653615993, "learning_rate": 1.999609688427791e-05, "loss": 1.6519, "step": 4550 }, { "epoch": 0.01884963489166627, "grad_norm": 4.211392128772361, "learning_rate": 1.9996060151685895e-05, "loss": 1.6096, "step": 4560 }, { "epoch": 0.018890971810288346, "grad_norm": 4.728429773380645, "learning_rate": 1.9996023247089576e-05, "loss": 1.6217, "step": 4570 }, { "epoch": 0.01893230872891042, "grad_norm": 3.7603074265755745, "learning_rate": 1.999598617048958e-05, "loss": 1.617, "step": 4580 }, { "epoch": 0.018973645647532497, "grad_norm": 4.5264911357846165, "learning_rate": 1.9995948921886547e-05, "loss": 1.6009, "step": 4590 }, { "epoch": 0.01901498256615457, "grad_norm": 4.285402551531064, "learning_rate": 1.999591150128112e-05, "loss": 1.6666, "step": 4600 }, { "epoch": 0.019056319484776648, "grad_norm": 4.528562163332608, "learning_rate": 1.9995873908673936e-05, "loss": 1.6967, "step": 4610 }, { "epoch": 0.01909765640339872, "grad_norm": 4.331142545150304, "learning_rate": 1.999583614406565e-05, "loss": 1.6387, "step": 4620 }, { "epoch": 0.019138993322020795, "grad_norm": 4.277497333006759, "learning_rate": 1.9995798207456906e-05, "loss": 1.6407, "step": 4630 }, { "epoch": 0.019180330240642873, "grad_norm": 4.236531733677237, "learning_rate": 1.999576009884836e-05, "loss": 1.6528, "step": 4640 }, { "epoch": 0.019221667159264946, "grad_norm": 4.1527404087837825, "learning_rate": 1.9995721818240664e-05, "loss": 1.6386, "step": 4650 }, { "epoch": 0.019263004077887023, "grad_norm": 4.21734134516066, "learning_rate": 1.999568336563448e-05, "loss": 1.6531, "step": 4660 }, { "epoch": 0.019304340996509097, "grad_norm": 4.010277949791672, "learning_rate": 1.999564474103047e-05, "loss": 1.6125, "step": 4670 }, { "epoch": 0.01934567791513117, "grad_norm": 4.974363400314765, "learning_rate": 1.99956059444293e-05, "loss": 1.6562, "step": 4680 }, { "epoch": 0.019387014833753248, "grad_norm": 3.461845715262989, "learning_rate": 1.999556697583163e-05, "loss": 1.6715, "step": 4690 }, { "epoch": 0.01942835175237532, "grad_norm": 4.501289760535044, "learning_rate": 1.999552783523814e-05, "loss": 1.6276, "step": 4700 }, { "epoch": 0.0194696886709974, "grad_norm": 3.980526992455661, "learning_rate": 1.99954885226495e-05, "loss": 1.6512, "step": 4710 }, { "epoch": 0.019511025589619473, "grad_norm": 4.754361998561602, "learning_rate": 1.9995449038066385e-05, "loss": 1.6563, "step": 4720 }, { "epoch": 0.01955236250824155, "grad_norm": 3.962924389993788, "learning_rate": 1.9995409381489473e-05, "loss": 1.5921, "step": 4730 }, { "epoch": 0.019593699426863623, "grad_norm": 4.230038259640959, "learning_rate": 1.999536955291945e-05, "loss": 1.6266, "step": 4740 }, { "epoch": 0.019635036345485697, "grad_norm": 3.4637303252434863, "learning_rate": 1.9995329552356996e-05, "loss": 1.5613, "step": 4750 }, { "epoch": 0.019676373264107774, "grad_norm": 4.180047059414082, "learning_rate": 1.999528937980281e-05, "loss": 1.6358, "step": 4760 }, { "epoch": 0.019717710182729848, "grad_norm": 4.407688478601427, "learning_rate": 1.9995249035257572e-05, "loss": 1.6276, "step": 4770 }, { "epoch": 0.019759047101351925, "grad_norm": 5.682179019619738, "learning_rate": 1.999520851872198e-05, "loss": 1.6339, "step": 4780 }, { "epoch": 0.019800384019974, "grad_norm": 5.80296950656401, "learning_rate": 1.9995167830196732e-05, "loss": 1.6735, "step": 4790 }, { "epoch": 0.019841720938596073, "grad_norm": 4.788010741660107, "learning_rate": 1.999512696968253e-05, "loss": 1.6183, "step": 4800 }, { "epoch": 0.01988305785721815, "grad_norm": 3.2823877198029683, "learning_rate": 1.9995085937180075e-05, "loss": 1.6314, "step": 4810 }, { "epoch": 0.019924394775840223, "grad_norm": 4.513204723991569, "learning_rate": 1.9995044732690074e-05, "loss": 1.6558, "step": 4820 }, { "epoch": 0.0199657316944623, "grad_norm": 3.710887033971277, "learning_rate": 1.999500335621323e-05, "loss": 1.6339, "step": 4830 }, { "epoch": 0.020007068613084374, "grad_norm": 3.914180149814728, "learning_rate": 1.9994961807750264e-05, "loss": 1.6263, "step": 4840 }, { "epoch": 0.02004840553170645, "grad_norm": 4.149254446951243, "learning_rate": 1.999492008730189e-05, "loss": 1.6276, "step": 4850 }, { "epoch": 0.020089742450328525, "grad_norm": 3.8520876610172756, "learning_rate": 1.9994878194868817e-05, "loss": 1.6168, "step": 4860 }, { "epoch": 0.0201310793689506, "grad_norm": 4.315135033151227, "learning_rate": 1.9994836130451777e-05, "loss": 1.6799, "step": 4870 }, { "epoch": 0.020172416287572676, "grad_norm": 4.299172694880712, "learning_rate": 1.9994793894051483e-05, "loss": 1.6094, "step": 4880 }, { "epoch": 0.02021375320619475, "grad_norm": 3.9099719074716974, "learning_rate": 1.999475148566867e-05, "loss": 1.6002, "step": 4890 }, { "epoch": 0.020255090124816827, "grad_norm": 3.621204913700773, "learning_rate": 1.9994708905304066e-05, "loss": 1.627, "step": 4900 }, { "epoch": 0.0202964270434389, "grad_norm": 4.002608239997497, "learning_rate": 1.9994666152958403e-05, "loss": 1.6377, "step": 4910 }, { "epoch": 0.020337763962060978, "grad_norm": 3.509839578650558, "learning_rate": 1.9994623228632413e-05, "loss": 1.6498, "step": 4920 }, { "epoch": 0.02037910088068305, "grad_norm": 3.948041169756955, "learning_rate": 1.9994580132326843e-05, "loss": 1.6605, "step": 4930 }, { "epoch": 0.020420437799305125, "grad_norm": 3.7588684802290713, "learning_rate": 1.9994536864042428e-05, "loss": 1.6845, "step": 4940 }, { "epoch": 0.020461774717927202, "grad_norm": 4.867688920782023, "learning_rate": 1.999449342377991e-05, "loss": 1.5575, "step": 4950 }, { "epoch": 0.020503111636549276, "grad_norm": 4.235921275935457, "learning_rate": 1.9994449811540044e-05, "loss": 1.6329, "step": 4960 }, { "epoch": 0.020544448555171353, "grad_norm": 5.353004787701509, "learning_rate": 1.9994406027323578e-05, "loss": 1.5961, "step": 4970 }, { "epoch": 0.020585785473793427, "grad_norm": 4.49092979482084, "learning_rate": 1.999436207113126e-05, "loss": 1.6152, "step": 4980 }, { "epoch": 0.0206271223924155, "grad_norm": 4.786632872232947, "learning_rate": 1.9994317942963856e-05, "loss": 1.5889, "step": 4990 }, { "epoch": 0.020668459311037578, "grad_norm": 3.7616100197105324, "learning_rate": 1.999427364282212e-05, "loss": 1.6428, "step": 5000 }, { "epoch": 0.02070979622965965, "grad_norm": 4.922026489251745, "learning_rate": 1.999422917070681e-05, "loss": 1.6404, "step": 5010 }, { "epoch": 0.02075113314828173, "grad_norm": 4.51143708824428, "learning_rate": 1.9994184526618698e-05, "loss": 1.6532, "step": 5020 }, { "epoch": 0.020792470066903802, "grad_norm": 4.104589032058005, "learning_rate": 1.999413971055855e-05, "loss": 1.6071, "step": 5030 }, { "epoch": 0.02083380698552588, "grad_norm": 4.89262784656072, "learning_rate": 1.999409472252714e-05, "loss": 1.6516, "step": 5040 }, { "epoch": 0.020875143904147953, "grad_norm": 3.6347037714340122, "learning_rate": 1.9994049562525235e-05, "loss": 1.5681, "step": 5050 }, { "epoch": 0.020916480822770027, "grad_norm": 3.986687295644655, "learning_rate": 1.9994004230553616e-05, "loss": 1.6061, "step": 5060 }, { "epoch": 0.020957817741392104, "grad_norm": 5.1196884550128825, "learning_rate": 1.999395872661307e-05, "loss": 1.646, "step": 5070 }, { "epoch": 0.020999154660014178, "grad_norm": 4.073313251564883, "learning_rate": 1.9993913050704362e-05, "loss": 1.5632, "step": 5080 }, { "epoch": 0.021040491578636255, "grad_norm": 3.773829349198683, "learning_rate": 1.99938672028283e-05, "loss": 1.596, "step": 5090 }, { "epoch": 0.02108182849725833, "grad_norm": 5.707286361857388, "learning_rate": 1.9993821182985655e-05, "loss": 1.587, "step": 5100 }, { "epoch": 0.021123165415880402, "grad_norm": 4.135913165404502, "learning_rate": 1.9993774991177227e-05, "loss": 1.6229, "step": 5110 }, { "epoch": 0.02116450233450248, "grad_norm": 4.538213401615244, "learning_rate": 1.9993728627403814e-05, "loss": 1.5913, "step": 5120 }, { "epoch": 0.021205839253124553, "grad_norm": 4.103580788767663, "learning_rate": 1.9993682091666206e-05, "loss": 1.6532, "step": 5130 }, { "epoch": 0.02124717617174663, "grad_norm": 3.6711472807654064, "learning_rate": 1.9993635383965205e-05, "loss": 1.5746, "step": 5140 }, { "epoch": 0.021288513090368704, "grad_norm": 5.277279072305559, "learning_rate": 1.9993588504301623e-05, "loss": 1.597, "step": 5150 }, { "epoch": 0.02132985000899078, "grad_norm": 3.646653216373581, "learning_rate": 1.9993541452676257e-05, "loss": 1.6045, "step": 5160 }, { "epoch": 0.021371186927612855, "grad_norm": 4.454553625669168, "learning_rate": 1.999349422908992e-05, "loss": 1.6168, "step": 5170 }, { "epoch": 0.02141252384623493, "grad_norm": 4.408940295701244, "learning_rate": 1.999344683354343e-05, "loss": 1.5688, "step": 5180 }, { "epoch": 0.021453860764857006, "grad_norm": 4.30626191840598, "learning_rate": 1.9993399266037593e-05, "loss": 1.5743, "step": 5190 }, { "epoch": 0.02149519768347908, "grad_norm": 3.674456985901954, "learning_rate": 1.999335152657323e-05, "loss": 1.5872, "step": 5200 }, { "epoch": 0.021536534602101157, "grad_norm": 3.641790233464658, "learning_rate": 1.9993303615151168e-05, "loss": 1.5612, "step": 5210 }, { "epoch": 0.02157787152072323, "grad_norm": 4.165728119210956, "learning_rate": 1.9993255531772225e-05, "loss": 1.59, "step": 5220 }, { "epoch": 0.021619208439345308, "grad_norm": 3.8319777859342246, "learning_rate": 1.9993207276437235e-05, "loss": 1.5912, "step": 5230 }, { "epoch": 0.02166054535796738, "grad_norm": 3.9855756729463168, "learning_rate": 1.999315884914702e-05, "loss": 1.58, "step": 5240 }, { "epoch": 0.021701882276589455, "grad_norm": 3.8011477722676807, "learning_rate": 1.999311024990242e-05, "loss": 1.6003, "step": 5250 }, { "epoch": 0.021743219195211532, "grad_norm": 3.985198206647649, "learning_rate": 1.9993061478704275e-05, "loss": 1.5986, "step": 5260 }, { "epoch": 0.021784556113833606, "grad_norm": 3.9838081605823636, "learning_rate": 1.9993012535553412e-05, "loss": 1.6166, "step": 5270 }, { "epoch": 0.021825893032455683, "grad_norm": 3.9996617755784043, "learning_rate": 1.999296342045068e-05, "loss": 1.5792, "step": 5280 }, { "epoch": 0.021867229951077757, "grad_norm": 5.892962480457768, "learning_rate": 1.9992914133396926e-05, "loss": 1.6053, "step": 5290 }, { "epoch": 0.02190856686969983, "grad_norm": 4.427789486632826, "learning_rate": 1.9992864674392994e-05, "loss": 1.6374, "step": 5300 }, { "epoch": 0.021949903788321908, "grad_norm": 4.488482688049845, "learning_rate": 1.9992815043439736e-05, "loss": 1.6198, "step": 5310 }, { "epoch": 0.02199124070694398, "grad_norm": 3.9697984164903035, "learning_rate": 1.999276524053801e-05, "loss": 1.6112, "step": 5320 }, { "epoch": 0.02203257762556606, "grad_norm": 4.708237856178089, "learning_rate": 1.9992715265688666e-05, "loss": 1.569, "step": 5330 }, { "epoch": 0.022073914544188132, "grad_norm": 4.180089792931872, "learning_rate": 1.999266511889257e-05, "loss": 1.564, "step": 5340 }, { "epoch": 0.02211525146281021, "grad_norm": 4.540705844431402, "learning_rate": 1.9992614800150582e-05, "loss": 1.6062, "step": 5350 }, { "epoch": 0.022156588381432283, "grad_norm": 3.6164199548569256, "learning_rate": 1.999256430946357e-05, "loss": 1.614, "step": 5360 }, { "epoch": 0.022197925300054357, "grad_norm": 3.815681996528154, "learning_rate": 1.9992513646832398e-05, "loss": 1.5836, "step": 5370 }, { "epoch": 0.022239262218676434, "grad_norm": 4.806439757203068, "learning_rate": 1.9992462812257943e-05, "loss": 1.6162, "step": 5380 }, { "epoch": 0.022280599137298508, "grad_norm": 4.354139965343947, "learning_rate": 1.999241180574108e-05, "loss": 1.5888, "step": 5390 }, { "epoch": 0.022321936055920585, "grad_norm": 4.126817858976234, "learning_rate": 1.999236062728268e-05, "loss": 1.5879, "step": 5400 }, { "epoch": 0.02236327297454266, "grad_norm": 4.47607737943672, "learning_rate": 1.9992309276883632e-05, "loss": 1.6099, "step": 5410 }, { "epoch": 0.022404609893164732, "grad_norm": 5.610066619695038, "learning_rate": 1.9992257754544814e-05, "loss": 1.593, "step": 5420 }, { "epoch": 0.02244594681178681, "grad_norm": 4.2928973652861675, "learning_rate": 1.9992206060267114e-05, "loss": 1.5793, "step": 5430 }, { "epoch": 0.022487283730408883, "grad_norm": 3.8921859700664325, "learning_rate": 1.9992154194051422e-05, "loss": 1.608, "step": 5440 }, { "epoch": 0.02252862064903096, "grad_norm": 3.677731550454947, "learning_rate": 1.999210215589863e-05, "loss": 1.6151, "step": 5450 }, { "epoch": 0.022569957567653034, "grad_norm": 4.200629201423265, "learning_rate": 1.9992049945809632e-05, "loss": 1.6246, "step": 5460 }, { "epoch": 0.02261129448627511, "grad_norm": 4.064480908765512, "learning_rate": 1.9991997563785332e-05, "loss": 1.5607, "step": 5470 }, { "epoch": 0.022652631404897185, "grad_norm": 3.5486537855524176, "learning_rate": 1.9991945009826623e-05, "loss": 1.5906, "step": 5480 }, { "epoch": 0.02269396832351926, "grad_norm": 4.0698465101707, "learning_rate": 1.9991892283934415e-05, "loss": 1.5864, "step": 5490 }, { "epoch": 0.022735305242141336, "grad_norm": 3.698399389749536, "learning_rate": 1.9991839386109615e-05, "loss": 1.593, "step": 5500 }, { "epoch": 0.02277664216076341, "grad_norm": 4.854255782396672, "learning_rate": 1.9991786316353134e-05, "loss": 1.5961, "step": 5510 }, { "epoch": 0.022817979079385486, "grad_norm": 3.5841353274799244, "learning_rate": 1.9991733074665884e-05, "loss": 1.5638, "step": 5520 }, { "epoch": 0.02285931599800756, "grad_norm": 4.188646894537988, "learning_rate": 1.9991679661048774e-05, "loss": 1.5605, "step": 5530 }, { "epoch": 0.022900652916629637, "grad_norm": 3.646293980881599, "learning_rate": 1.9991626075502736e-05, "loss": 1.5672, "step": 5540 }, { "epoch": 0.02294198983525171, "grad_norm": 3.513345408175718, "learning_rate": 1.999157231802868e-05, "loss": 1.5228, "step": 5550 }, { "epoch": 0.022983326753873785, "grad_norm": 4.22409759900443, "learning_rate": 1.999151838862754e-05, "loss": 1.5742, "step": 5560 }, { "epoch": 0.023024663672495862, "grad_norm": 3.9606510772786674, "learning_rate": 1.999146428730024e-05, "loss": 1.5898, "step": 5570 }, { "epoch": 0.023066000591117936, "grad_norm": 4.723833314885466, "learning_rate": 1.9991410014047713e-05, "loss": 1.6293, "step": 5580 }, { "epoch": 0.023107337509740013, "grad_norm": 3.79738622812003, "learning_rate": 1.999135556887089e-05, "loss": 1.5347, "step": 5590 }, { "epoch": 0.023148674428362086, "grad_norm": 3.5876021705924277, "learning_rate": 1.9991300951770712e-05, "loss": 1.5639, "step": 5600 }, { "epoch": 0.02319001134698416, "grad_norm": 4.466727344043237, "learning_rate": 1.9991246162748116e-05, "loss": 1.5821, "step": 5610 }, { "epoch": 0.023231348265606237, "grad_norm": 4.027485882579859, "learning_rate": 1.999119120180404e-05, "loss": 1.5641, "step": 5620 }, { "epoch": 0.02327268518422831, "grad_norm": 4.698907728867797, "learning_rate": 1.9991136068939436e-05, "loss": 1.5717, "step": 5630 }, { "epoch": 0.023314022102850388, "grad_norm": 3.9675562129009534, "learning_rate": 1.9991080764155254e-05, "loss": 1.5984, "step": 5640 }, { "epoch": 0.023355359021472462, "grad_norm": 4.469330328433558, "learning_rate": 1.9991025287452442e-05, "loss": 1.5836, "step": 5650 }, { "epoch": 0.02339669594009454, "grad_norm": 4.315359559691392, "learning_rate": 1.9990969638831955e-05, "loss": 1.5456, "step": 5660 }, { "epoch": 0.023438032858716613, "grad_norm": 3.67958327218992, "learning_rate": 1.9990913818294753e-05, "loss": 1.6191, "step": 5670 }, { "epoch": 0.023479369777338686, "grad_norm": 4.491956126894857, "learning_rate": 1.9990857825841793e-05, "loss": 1.5808, "step": 5680 }, { "epoch": 0.023520706695960764, "grad_norm": 3.79674314266457, "learning_rate": 1.999080166147404e-05, "loss": 1.5183, "step": 5690 }, { "epoch": 0.023562043614582837, "grad_norm": 4.5968252548890245, "learning_rate": 1.999074532519246e-05, "loss": 1.5757, "step": 5700 }, { "epoch": 0.023603380533204914, "grad_norm": 3.5990834672231284, "learning_rate": 1.9990688816998025e-05, "loss": 1.6086, "step": 5710 }, { "epoch": 0.023644717451826988, "grad_norm": 4.344410017466151, "learning_rate": 1.99906321368917e-05, "loss": 1.6113, "step": 5720 }, { "epoch": 0.023686054370449065, "grad_norm": 3.7938891603257603, "learning_rate": 1.9990575284874473e-05, "loss": 1.6365, "step": 5730 }, { "epoch": 0.02372739128907114, "grad_norm": 3.562057149121525, "learning_rate": 1.999051826094731e-05, "loss": 1.5485, "step": 5740 }, { "epoch": 0.023768728207693213, "grad_norm": 4.081479989742111, "learning_rate": 1.99904610651112e-05, "loss": 1.5689, "step": 5750 }, { "epoch": 0.02381006512631529, "grad_norm": 3.759485760858795, "learning_rate": 1.999040369736712e-05, "loss": 1.564, "step": 5760 }, { "epoch": 0.023851402044937364, "grad_norm": 4.032363621919849, "learning_rate": 1.9990346157716064e-05, "loss": 1.6025, "step": 5770 }, { "epoch": 0.02389273896355944, "grad_norm": 3.6432323322843403, "learning_rate": 1.999028844615902e-05, "loss": 1.5271, "step": 5780 }, { "epoch": 0.023934075882181514, "grad_norm": 3.802770545609017, "learning_rate": 1.9990230562696983e-05, "loss": 1.5967, "step": 5790 }, { "epoch": 0.023975412800803588, "grad_norm": 3.795072573463222, "learning_rate": 1.9990172507330943e-05, "loss": 1.5247, "step": 5800 }, { "epoch": 0.024016749719425665, "grad_norm": 4.366382080210575, "learning_rate": 1.99901142800619e-05, "loss": 1.5781, "step": 5810 }, { "epoch": 0.02405808663804774, "grad_norm": 3.9097914526353605, "learning_rate": 1.9990055880890864e-05, "loss": 1.6034, "step": 5820 }, { "epoch": 0.024099423556669816, "grad_norm": 4.123926255872013, "learning_rate": 1.9989997309818833e-05, "loss": 1.5464, "step": 5830 }, { "epoch": 0.02414076047529189, "grad_norm": 4.446493191993532, "learning_rate": 1.9989938566846812e-05, "loss": 1.5586, "step": 5840 }, { "epoch": 0.024182097393913967, "grad_norm": 3.7337639849714233, "learning_rate": 1.998987965197582e-05, "loss": 1.5479, "step": 5850 }, { "epoch": 0.02422343431253604, "grad_norm": 4.7444952313768525, "learning_rate": 1.9989820565206865e-05, "loss": 1.5808, "step": 5860 }, { "epoch": 0.024264771231158114, "grad_norm": 4.247725775065283, "learning_rate": 1.9989761306540966e-05, "loss": 1.523, "step": 5870 }, { "epoch": 0.02430610814978019, "grad_norm": 3.995186643530754, "learning_rate": 1.998970187597914e-05, "loss": 1.5785, "step": 5880 }, { "epoch": 0.024347445068402265, "grad_norm": 4.816092056889684, "learning_rate": 1.9989642273522416e-05, "loss": 1.5746, "step": 5890 }, { "epoch": 0.024388781987024342, "grad_norm": 4.290367502884436, "learning_rate": 1.9989582499171813e-05, "loss": 1.6119, "step": 5900 }, { "epoch": 0.024430118905646416, "grad_norm": 3.5513668937922236, "learning_rate": 1.9989522552928365e-05, "loss": 1.5162, "step": 5910 }, { "epoch": 0.02447145582426849, "grad_norm": 3.6198772954827665, "learning_rate": 1.9989462434793096e-05, "loss": 1.5323, "step": 5920 }, { "epoch": 0.024512792742890567, "grad_norm": 3.852832040089439, "learning_rate": 1.9989402144767046e-05, "loss": 1.5311, "step": 5930 }, { "epoch": 0.02455412966151264, "grad_norm": 3.9519174433535325, "learning_rate": 1.9989341682851254e-05, "loss": 1.5429, "step": 5940 }, { "epoch": 0.024595466580134718, "grad_norm": 4.87353052847372, "learning_rate": 1.9989281049046755e-05, "loss": 1.6002, "step": 5950 }, { "epoch": 0.02463680349875679, "grad_norm": 3.3803857370087225, "learning_rate": 1.9989220243354595e-05, "loss": 1.5793, "step": 5960 }, { "epoch": 0.02467814041737887, "grad_norm": 3.8766963938819075, "learning_rate": 1.998915926577582e-05, "loss": 1.533, "step": 5970 }, { "epoch": 0.024719477336000942, "grad_norm": 4.09044180490663, "learning_rate": 1.998909811631148e-05, "loss": 1.565, "step": 5980 }, { "epoch": 0.024760814254623016, "grad_norm": 4.537575506546124, "learning_rate": 1.998903679496263e-05, "loss": 1.5523, "step": 5990 }, { "epoch": 0.024802151173245093, "grad_norm": 3.4086201638465803, "learning_rate": 1.9988975301730317e-05, "loss": 1.5467, "step": 6000 }, { "epoch": 0.024843488091867167, "grad_norm": 3.7512592579174244, "learning_rate": 1.9988913636615608e-05, "loss": 1.6148, "step": 6010 }, { "epoch": 0.024884825010489244, "grad_norm": 3.9606857815229035, "learning_rate": 1.9988851799619557e-05, "loss": 1.5529, "step": 6020 }, { "epoch": 0.024926161929111318, "grad_norm": 4.87271131926588, "learning_rate": 1.9988789790743235e-05, "loss": 1.624, "step": 6030 }, { "epoch": 0.024967498847733395, "grad_norm": 4.562111872082575, "learning_rate": 1.9988727609987705e-05, "loss": 1.5954, "step": 6040 }, { "epoch": 0.02500883576635547, "grad_norm": 4.160920766227917, "learning_rate": 1.9988665257354035e-05, "loss": 1.5745, "step": 6050 }, { "epoch": 0.025050172684977542, "grad_norm": 3.7976329240225284, "learning_rate": 1.9988602732843296e-05, "loss": 1.539, "step": 6060 }, { "epoch": 0.02509150960359962, "grad_norm": 3.9348324977710347, "learning_rate": 1.9988540036456575e-05, "loss": 1.5802, "step": 6070 }, { "epoch": 0.025132846522221693, "grad_norm": 3.3649859713246313, "learning_rate": 1.998847716819494e-05, "loss": 1.5349, "step": 6080 }, { "epoch": 0.02517418344084377, "grad_norm": 5.035730829505278, "learning_rate": 1.998841412805948e-05, "loss": 1.5522, "step": 6090 }, { "epoch": 0.025215520359465844, "grad_norm": 4.38089533529463, "learning_rate": 1.9988350916051272e-05, "loss": 1.5696, "step": 6100 }, { "epoch": 0.025256857278087918, "grad_norm": 4.0458062619048185, "learning_rate": 1.9988287532171408e-05, "loss": 1.582, "step": 6110 }, { "epoch": 0.025298194196709995, "grad_norm": 5.197316936196237, "learning_rate": 1.9988223976420983e-05, "loss": 1.5685, "step": 6120 }, { "epoch": 0.02533953111533207, "grad_norm": 3.701848060366763, "learning_rate": 1.998816024880108e-05, "loss": 1.568, "step": 6130 }, { "epoch": 0.025380868033954146, "grad_norm": 4.576812131388496, "learning_rate": 1.9988096349312808e-05, "loss": 1.5925, "step": 6140 }, { "epoch": 0.02542220495257622, "grad_norm": 3.626416937979281, "learning_rate": 1.998803227795726e-05, "loss": 1.6026, "step": 6150 }, { "epoch": 0.025463541871198297, "grad_norm": 3.7415000301009016, "learning_rate": 1.9987968034735535e-05, "loss": 1.5632, "step": 6160 }, { "epoch": 0.02550487878982037, "grad_norm": 4.093809033114078, "learning_rate": 1.9987903619648745e-05, "loss": 1.5442, "step": 6170 }, { "epoch": 0.025546215708442444, "grad_norm": 3.782350165490308, "learning_rate": 1.9987839032697995e-05, "loss": 1.5423, "step": 6180 }, { "epoch": 0.02558755262706452, "grad_norm": 3.1897173529667695, "learning_rate": 1.9987774273884398e-05, "loss": 1.5332, "step": 6190 }, { "epoch": 0.025628889545686595, "grad_norm": 3.9224918301369276, "learning_rate": 1.9987709343209066e-05, "loss": 1.5133, "step": 6200 }, { "epoch": 0.025670226464308672, "grad_norm": 3.830850927059349, "learning_rate": 1.9987644240673118e-05, "loss": 1.555, "step": 6210 }, { "epoch": 0.025711563382930746, "grad_norm": 4.0209145103807575, "learning_rate": 1.9987578966277678e-05, "loss": 1.5114, "step": 6220 }, { "epoch": 0.02575290030155282, "grad_norm": 3.936778500441379, "learning_rate": 1.998751352002386e-05, "loss": 1.5257, "step": 6230 }, { "epoch": 0.025794237220174897, "grad_norm": 3.7293623909553313, "learning_rate": 1.9987447901912794e-05, "loss": 1.5694, "step": 6240 }, { "epoch": 0.02583557413879697, "grad_norm": 3.5818490617695575, "learning_rate": 1.9987382111945614e-05, "loss": 1.5531, "step": 6250 }, { "epoch": 0.025876911057419048, "grad_norm": 3.6430367540575928, "learning_rate": 1.998731615012345e-05, "loss": 1.5434, "step": 6260 }, { "epoch": 0.02591824797604112, "grad_norm": 4.589782090699277, "learning_rate": 1.998725001644743e-05, "loss": 1.557, "step": 6270 }, { "epoch": 0.0259595848946632, "grad_norm": 4.283265289643292, "learning_rate": 1.99871837109187e-05, "loss": 1.5624, "step": 6280 }, { "epoch": 0.026000921813285272, "grad_norm": 4.946779268912546, "learning_rate": 1.99871172335384e-05, "loss": 1.5124, "step": 6290 }, { "epoch": 0.026042258731907346, "grad_norm": 3.5038967204373694, "learning_rate": 1.998705058430767e-05, "loss": 1.5683, "step": 6300 }, { "epoch": 0.026083595650529423, "grad_norm": 3.8770191196485886, "learning_rate": 1.998698376322766e-05, "loss": 1.5435, "step": 6310 }, { "epoch": 0.026124932569151497, "grad_norm": 4.439160488934939, "learning_rate": 1.998691677029952e-05, "loss": 1.5295, "step": 6320 }, { "epoch": 0.026166269487773574, "grad_norm": 4.092904098781107, "learning_rate": 1.99868496055244e-05, "loss": 1.55, "step": 6330 }, { "epoch": 0.026207606406395648, "grad_norm": 4.303539009198583, "learning_rate": 1.9986782268903457e-05, "loss": 1.5484, "step": 6340 }, { "epoch": 0.026248943325017725, "grad_norm": 3.9078194955949916, "learning_rate": 1.9986714760437853e-05, "loss": 1.5827, "step": 6350 }, { "epoch": 0.0262902802436398, "grad_norm": 4.342380780259694, "learning_rate": 1.9986647080128746e-05, "loss": 1.557, "step": 6360 }, { "epoch": 0.026331617162261872, "grad_norm": 3.985596918279314, "learning_rate": 1.99865792279773e-05, "loss": 1.578, "step": 6370 }, { "epoch": 0.02637295408088395, "grad_norm": 3.653720676962278, "learning_rate": 1.9986511203984683e-05, "loss": 1.5668, "step": 6380 }, { "epoch": 0.026414290999506023, "grad_norm": 3.751487721843964, "learning_rate": 1.998644300815207e-05, "loss": 1.5305, "step": 6390 }, { "epoch": 0.0264556279181281, "grad_norm": 4.009883655861525, "learning_rate": 1.9986374640480627e-05, "loss": 1.5495, "step": 6400 }, { "epoch": 0.026496964836750174, "grad_norm": 3.8266120037819396, "learning_rate": 1.9986306100971533e-05, "loss": 1.5255, "step": 6410 }, { "epoch": 0.026538301755372248, "grad_norm": 3.6427176903376384, "learning_rate": 1.9986237389625974e-05, "loss": 1.5525, "step": 6420 }, { "epoch": 0.026579638673994325, "grad_norm": 4.099974786255079, "learning_rate": 1.998616850644512e-05, "loss": 1.5424, "step": 6430 }, { "epoch": 0.0266209755926164, "grad_norm": 3.8198383445190793, "learning_rate": 1.998609945143017e-05, "loss": 1.567, "step": 6440 }, { "epoch": 0.026662312511238476, "grad_norm": 4.473728987789235, "learning_rate": 1.9986030224582302e-05, "loss": 1.4823, "step": 6450 }, { "epoch": 0.02670364942986055, "grad_norm": 4.534257895704366, "learning_rate": 1.998596082590271e-05, "loss": 1.5712, "step": 6460 }, { "epoch": 0.026744986348482627, "grad_norm": 3.644766820996961, "learning_rate": 1.998589125539259e-05, "loss": 1.4863, "step": 6470 }, { "epoch": 0.0267863232671047, "grad_norm": 3.5999094373664526, "learning_rate": 1.9985821513053137e-05, "loss": 1.5326, "step": 6480 }, { "epoch": 0.026827660185726774, "grad_norm": 4.1425167904001565, "learning_rate": 1.9985751598885552e-05, "loss": 1.5378, "step": 6490 }, { "epoch": 0.02686899710434885, "grad_norm": 4.493755521920297, "learning_rate": 1.998568151289104e-05, "loss": 1.559, "step": 6500 }, { "epoch": 0.026910334022970925, "grad_norm": 3.8756107746153896, "learning_rate": 1.9985611255070806e-05, "loss": 1.556, "step": 6510 }, { "epoch": 0.026951670941593002, "grad_norm": 4.038084156630909, "learning_rate": 1.9985540825426055e-05, "loss": 1.5645, "step": 6520 }, { "epoch": 0.026993007860215076, "grad_norm": 3.5809330138239392, "learning_rate": 1.9985470223958e-05, "loss": 1.5548, "step": 6530 }, { "epoch": 0.027034344778837153, "grad_norm": 3.8676960332502963, "learning_rate": 1.998539945066786e-05, "loss": 1.5276, "step": 6540 }, { "epoch": 0.027075681697459227, "grad_norm": 3.4449297103173593, "learning_rate": 1.9985328505556852e-05, "loss": 1.5651, "step": 6550 }, { "epoch": 0.0271170186160813, "grad_norm": 4.288962654330007, "learning_rate": 1.9985257388626196e-05, "loss": 1.4996, "step": 6560 }, { "epoch": 0.027158355534703377, "grad_norm": 4.128501380292175, "learning_rate": 1.9985186099877112e-05, "loss": 1.5419, "step": 6570 }, { "epoch": 0.02719969245332545, "grad_norm": 4.575002926290647, "learning_rate": 1.998511463931083e-05, "loss": 1.5515, "step": 6580 }, { "epoch": 0.02724102937194753, "grad_norm": 3.821068699345836, "learning_rate": 1.998504300692858e-05, "loss": 1.5233, "step": 6590 }, { "epoch": 0.027282366290569602, "grad_norm": 3.2424897263365016, "learning_rate": 1.9984971202731596e-05, "loss": 1.5479, "step": 6600 }, { "epoch": 0.027323703209191676, "grad_norm": 4.834170577555974, "learning_rate": 1.9984899226721107e-05, "loss": 1.5502, "step": 6610 }, { "epoch": 0.027365040127813753, "grad_norm": 3.764169873278093, "learning_rate": 1.998482707889836e-05, "loss": 1.5891, "step": 6620 }, { "epoch": 0.027406377046435827, "grad_norm": 3.414782018354158, "learning_rate": 1.998475475926459e-05, "loss": 1.5159, "step": 6630 }, { "epoch": 0.027447713965057904, "grad_norm": 4.666184759190313, "learning_rate": 1.9984682267821046e-05, "loss": 1.5628, "step": 6640 }, { "epoch": 0.027489050883679977, "grad_norm": 3.5574578914802943, "learning_rate": 1.998460960456897e-05, "loss": 1.5315, "step": 6650 }, { "epoch": 0.027530387802302055, "grad_norm": 4.817382615755327, "learning_rate": 1.9984536769509615e-05, "loss": 1.5081, "step": 6660 }, { "epoch": 0.02757172472092413, "grad_norm": 4.458867619168575, "learning_rate": 1.998446376264424e-05, "loss": 1.5099, "step": 6670 }, { "epoch": 0.027613061639546202, "grad_norm": 4.929644851290023, "learning_rate": 1.9984390583974093e-05, "loss": 1.5122, "step": 6680 }, { "epoch": 0.02765439855816828, "grad_norm": 4.625479741961043, "learning_rate": 1.9984317233500435e-05, "loss": 1.5516, "step": 6690 }, { "epoch": 0.027695735476790353, "grad_norm": 4.116997057727521, "learning_rate": 1.9984243711224535e-05, "loss": 1.5376, "step": 6700 }, { "epoch": 0.02773707239541243, "grad_norm": 3.5829728345047314, "learning_rate": 1.998417001714765e-05, "loss": 1.5175, "step": 6710 }, { "epoch": 0.027778409314034504, "grad_norm": 3.9734979789101996, "learning_rate": 1.9984096151271048e-05, "loss": 1.4871, "step": 6720 }, { "epoch": 0.027819746232656577, "grad_norm": 3.430905345503222, "learning_rate": 1.9984022113596003e-05, "loss": 1.5413, "step": 6730 }, { "epoch": 0.027861083151278655, "grad_norm": 4.2373775185116465, "learning_rate": 1.998394790412379e-05, "loss": 1.508, "step": 6740 }, { "epoch": 0.027902420069900728, "grad_norm": 3.973156898260741, "learning_rate": 1.9983873522855684e-05, "loss": 1.5283, "step": 6750 }, { "epoch": 0.027943756988522805, "grad_norm": 3.714285928818181, "learning_rate": 1.9983798969792966e-05, "loss": 1.5362, "step": 6760 }, { "epoch": 0.02798509390714488, "grad_norm": 4.251009623472971, "learning_rate": 1.9983724244936916e-05, "loss": 1.5282, "step": 6770 }, { "epoch": 0.028026430825766956, "grad_norm": 3.886447728722872, "learning_rate": 1.9983649348288825e-05, "loss": 1.5719, "step": 6780 }, { "epoch": 0.02806776774438903, "grad_norm": 3.5572049346515757, "learning_rate": 1.9983574279849977e-05, "loss": 1.5302, "step": 6790 }, { "epoch": 0.028109104663011104, "grad_norm": 3.688224659646708, "learning_rate": 1.9983499039621667e-05, "loss": 1.5132, "step": 6800 }, { "epoch": 0.02815044158163318, "grad_norm": 3.986436014630721, "learning_rate": 1.998342362760519e-05, "loss": 1.5045, "step": 6810 }, { "epoch": 0.028191778500255255, "grad_norm": 4.042321521286428, "learning_rate": 1.998334804380184e-05, "loss": 1.52, "step": 6820 }, { "epoch": 0.02823311541887733, "grad_norm": 3.936708692378881, "learning_rate": 1.9983272288212917e-05, "loss": 1.5208, "step": 6830 }, { "epoch": 0.028274452337499405, "grad_norm": 4.692602969518199, "learning_rate": 1.998319636083973e-05, "loss": 1.5667, "step": 6840 }, { "epoch": 0.028315789256121483, "grad_norm": 3.9313878236010598, "learning_rate": 1.9983120261683582e-05, "loss": 1.5831, "step": 6850 }, { "epoch": 0.028357126174743556, "grad_norm": 3.551615629888668, "learning_rate": 1.9983043990745784e-05, "loss": 1.5308, "step": 6860 }, { "epoch": 0.02839846309336563, "grad_norm": 4.6846872186437905, "learning_rate": 1.9982967548027645e-05, "loss": 1.4921, "step": 6870 }, { "epoch": 0.028439800011987707, "grad_norm": 4.130277420309701, "learning_rate": 1.9982890933530482e-05, "loss": 1.4943, "step": 6880 }, { "epoch": 0.02848113693060978, "grad_norm": 4.84212625045545, "learning_rate": 1.9982814147255617e-05, "loss": 1.5353, "step": 6890 }, { "epoch": 0.028522473849231858, "grad_norm": 4.484005829649726, "learning_rate": 1.9982737189204367e-05, "loss": 1.5051, "step": 6900 }, { "epoch": 0.02856381076785393, "grad_norm": 4.60797848842926, "learning_rate": 1.998266005937806e-05, "loss": 1.5816, "step": 6910 }, { "epoch": 0.028605147686476005, "grad_norm": 4.6416638061065685, "learning_rate": 1.998258275777802e-05, "loss": 1.4904, "step": 6920 }, { "epoch": 0.028646484605098083, "grad_norm": 3.477343383643086, "learning_rate": 1.9982505284405574e-05, "loss": 1.4904, "step": 6930 }, { "epoch": 0.028687821523720156, "grad_norm": 3.6088868644511427, "learning_rate": 1.9982427639262065e-05, "loss": 1.5314, "step": 6940 }, { "epoch": 0.028729158442342233, "grad_norm": 3.5925096362200333, "learning_rate": 1.9982349822348816e-05, "loss": 1.5714, "step": 6950 }, { "epoch": 0.028770495360964307, "grad_norm": 3.2379164637124855, "learning_rate": 1.9982271833667178e-05, "loss": 1.538, "step": 6960 }, { "epoch": 0.028811832279586384, "grad_norm": 4.70836437147594, "learning_rate": 1.9982193673218487e-05, "loss": 1.5221, "step": 6970 }, { "epoch": 0.028853169198208458, "grad_norm": 4.481706228912118, "learning_rate": 1.9982115341004088e-05, "loss": 1.5313, "step": 6980 }, { "epoch": 0.02889450611683053, "grad_norm": 4.687318727733607, "learning_rate": 1.9982036837025332e-05, "loss": 1.5051, "step": 6990 }, { "epoch": 0.02893584303545261, "grad_norm": 4.598444922768698, "learning_rate": 1.998195816128357e-05, "loss": 1.4934, "step": 7000 }, { "epoch": 0.028977179954074683, "grad_norm": 3.9891444926266555, "learning_rate": 1.9981879313780145e-05, "loss": 1.5511, "step": 7010 }, { "epoch": 0.02901851687269676, "grad_norm": 4.416153022094333, "learning_rate": 1.998180029451643e-05, "loss": 1.5097, "step": 7020 }, { "epoch": 0.029059853791318833, "grad_norm": 3.798723984502823, "learning_rate": 1.9981721103493775e-05, "loss": 1.4997, "step": 7030 }, { "epoch": 0.029101190709940907, "grad_norm": 3.8640966990883223, "learning_rate": 1.9981641740713545e-05, "loss": 1.54, "step": 7040 }, { "epoch": 0.029142527628562984, "grad_norm": 3.9923946800397436, "learning_rate": 1.9981562206177104e-05, "loss": 1.5511, "step": 7050 }, { "epoch": 0.029183864547185058, "grad_norm": 3.451062899533455, "learning_rate": 1.998148249988582e-05, "loss": 1.5094, "step": 7060 }, { "epoch": 0.029225201465807135, "grad_norm": 3.608872589804411, "learning_rate": 1.998140262184107e-05, "loss": 1.5162, "step": 7070 }, { "epoch": 0.02926653838442921, "grad_norm": 4.033124276978576, "learning_rate": 1.998132257204422e-05, "loss": 1.4959, "step": 7080 }, { "epoch": 0.029307875303051286, "grad_norm": 3.6841192145575397, "learning_rate": 1.9981242350496656e-05, "loss": 1.5223, "step": 7090 }, { "epoch": 0.02934921222167336, "grad_norm": 3.5915932028439226, "learning_rate": 1.9981161957199754e-05, "loss": 1.5257, "step": 7100 }, { "epoch": 0.029390549140295433, "grad_norm": 3.870756947521686, "learning_rate": 1.9981081392154898e-05, "loss": 1.4904, "step": 7110 }, { "epoch": 0.02943188605891751, "grad_norm": 4.22816037060308, "learning_rate": 1.9981000655363473e-05, "loss": 1.4982, "step": 7120 }, { "epoch": 0.029473222977539584, "grad_norm": 3.808092113157194, "learning_rate": 1.9980919746826872e-05, "loss": 1.519, "step": 7130 }, { "epoch": 0.02951455989616166, "grad_norm": 3.709300764256846, "learning_rate": 1.9980838666546483e-05, "loss": 1.5533, "step": 7140 }, { "epoch": 0.029555896814783735, "grad_norm": 3.4732849314384127, "learning_rate": 1.9980757414523704e-05, "loss": 1.5633, "step": 7150 }, { "epoch": 0.029597233733405812, "grad_norm": 3.713987413842907, "learning_rate": 1.998067599075993e-05, "loss": 1.5201, "step": 7160 }, { "epoch": 0.029638570652027886, "grad_norm": 4.113428203135297, "learning_rate": 1.9980594395256564e-05, "loss": 1.4594, "step": 7170 }, { "epoch": 0.02967990757064996, "grad_norm": 2.9935199155014285, "learning_rate": 1.9980512628015014e-05, "loss": 1.4986, "step": 7180 }, { "epoch": 0.029721244489272037, "grad_norm": 4.691195443226011, "learning_rate": 1.998043068903668e-05, "loss": 1.5357, "step": 7190 }, { "epoch": 0.02976258140789411, "grad_norm": 3.4891344421625647, "learning_rate": 1.9980348578322973e-05, "loss": 1.5306, "step": 7200 }, { "epoch": 0.029803918326516188, "grad_norm": 5.006688866087417, "learning_rate": 1.9980266295875313e-05, "loss": 1.512, "step": 7210 }, { "epoch": 0.02984525524513826, "grad_norm": 4.053771198338998, "learning_rate": 1.9980183841695107e-05, "loss": 1.4794, "step": 7220 }, { "epoch": 0.029886592163760335, "grad_norm": 3.352224471660883, "learning_rate": 1.998010121578378e-05, "loss": 1.4914, "step": 7230 }, { "epoch": 0.029927929082382412, "grad_norm": 3.864235630776693, "learning_rate": 1.998001841814275e-05, "loss": 1.5253, "step": 7240 }, { "epoch": 0.029969266001004486, "grad_norm": 3.706315048662037, "learning_rate": 1.997993544877344e-05, "loss": 1.5509, "step": 7250 }, { "epoch": 0.030010602919626563, "grad_norm": 3.5465275464245676, "learning_rate": 1.9979852307677285e-05, "loss": 1.5605, "step": 7260 }, { "epoch": 0.030051939838248637, "grad_norm": 4.129168530718907, "learning_rate": 1.997976899485571e-05, "loss": 1.5204, "step": 7270 }, { "epoch": 0.030093276756870714, "grad_norm": 3.500344444775805, "learning_rate": 1.997968551031015e-05, "loss": 1.4853, "step": 7280 }, { "epoch": 0.030134613675492788, "grad_norm": 3.930695708227359, "learning_rate": 1.9979601854042044e-05, "loss": 1.5186, "step": 7290 }, { "epoch": 0.03017595059411486, "grad_norm": 3.8784171251524846, "learning_rate": 1.9979518026052826e-05, "loss": 1.5031, "step": 7300 }, { "epoch": 0.03021728751273694, "grad_norm": 4.264401345696811, "learning_rate": 1.997943402634394e-05, "loss": 1.5175, "step": 7310 }, { "epoch": 0.030258624431359012, "grad_norm": 3.924430485404916, "learning_rate": 1.9979349854916836e-05, "loss": 1.5415, "step": 7320 }, { "epoch": 0.03029996134998109, "grad_norm": 5.108018679538745, "learning_rate": 1.9979265511772958e-05, "loss": 1.4635, "step": 7330 }, { "epoch": 0.030341298268603163, "grad_norm": 3.5308579618903253, "learning_rate": 1.997918099691376e-05, "loss": 1.5076, "step": 7340 }, { "epoch": 0.03038263518722524, "grad_norm": 5.35750207475576, "learning_rate": 1.997909631034069e-05, "loss": 1.5426, "step": 7350 }, { "epoch": 0.030423972105847314, "grad_norm": 3.4906620060732645, "learning_rate": 1.9979011452055216e-05, "loss": 1.5012, "step": 7360 }, { "epoch": 0.030465309024469388, "grad_norm": 3.059087960421475, "learning_rate": 1.9978926422058788e-05, "loss": 1.5542, "step": 7370 }, { "epoch": 0.030506645943091465, "grad_norm": 3.9845949063681227, "learning_rate": 1.9978841220352875e-05, "loss": 1.546, "step": 7380 }, { "epoch": 0.03054798286171354, "grad_norm": 3.8610381846741815, "learning_rate": 1.9978755846938943e-05, "loss": 1.5437, "step": 7390 }, { "epoch": 0.030589319780335616, "grad_norm": 3.2351748039531154, "learning_rate": 1.9978670301818456e-05, "loss": 1.4819, "step": 7400 }, { "epoch": 0.03063065669895769, "grad_norm": 3.3661408607769823, "learning_rate": 1.997858458499289e-05, "loss": 1.4698, "step": 7410 }, { "epoch": 0.030671993617579763, "grad_norm": 3.882697263150638, "learning_rate": 1.997849869646372e-05, "loss": 1.5216, "step": 7420 }, { "epoch": 0.03071333053620184, "grad_norm": 3.574753801928485, "learning_rate": 1.9978412636232425e-05, "loss": 1.4803, "step": 7430 }, { "epoch": 0.030754667454823914, "grad_norm": 4.443698443398163, "learning_rate": 1.997832640430048e-05, "loss": 1.5006, "step": 7440 }, { "epoch": 0.03079600437344599, "grad_norm": 4.180214183413532, "learning_rate": 1.9978240000669377e-05, "loss": 1.4823, "step": 7450 }, { "epoch": 0.030837341292068065, "grad_norm": 4.271849788381411, "learning_rate": 1.9978153425340596e-05, "loss": 1.4888, "step": 7460 }, { "epoch": 0.030878678210690142, "grad_norm": 3.638691493855987, "learning_rate": 1.9978066678315634e-05, "loss": 1.5171, "step": 7470 }, { "epoch": 0.030920015129312216, "grad_norm": 4.31218852799806, "learning_rate": 1.9977979759595972e-05, "loss": 1.5313, "step": 7480 }, { "epoch": 0.03096135204793429, "grad_norm": 4.188985873406339, "learning_rate": 1.9977892669183115e-05, "loss": 1.5333, "step": 7490 }, { "epoch": 0.031002688966556367, "grad_norm": 4.091422365669455, "learning_rate": 1.9977805407078563e-05, "loss": 1.5104, "step": 7500 }, { "epoch": 0.03104402588517844, "grad_norm": 3.7904969438772995, "learning_rate": 1.997771797328381e-05, "loss": 1.4961, "step": 7510 }, { "epoch": 0.031085362803800518, "grad_norm": 4.393169555680857, "learning_rate": 1.9977630367800366e-05, "loss": 1.4876, "step": 7520 }, { "epoch": 0.03112669972242259, "grad_norm": 3.3377888267921163, "learning_rate": 1.9977542590629736e-05, "loss": 1.5107, "step": 7530 }, { "epoch": 0.031168036641044665, "grad_norm": 3.6979315229218512, "learning_rate": 1.9977454641773432e-05, "loss": 1.4984, "step": 7540 }, { "epoch": 0.031209373559666742, "grad_norm": 4.422235985099495, "learning_rate": 1.9977366521232966e-05, "loss": 1.5166, "step": 7550 }, { "epoch": 0.03125071047828882, "grad_norm": 4.357975123788466, "learning_rate": 1.9977278229009854e-05, "loss": 1.5133, "step": 7560 }, { "epoch": 0.03129204739691089, "grad_norm": 3.618227619392518, "learning_rate": 1.997718976510562e-05, "loss": 1.4872, "step": 7570 }, { "epoch": 0.03133338431553297, "grad_norm": 3.6903691829248175, "learning_rate": 1.9977101129521778e-05, "loss": 1.4968, "step": 7580 }, { "epoch": 0.031374721234155044, "grad_norm": 3.325924769769221, "learning_rate": 1.997701232225986e-05, "loss": 1.484, "step": 7590 }, { "epoch": 0.03141605815277712, "grad_norm": 3.244599139580768, "learning_rate": 1.9976923343321388e-05, "loss": 1.501, "step": 7600 }, { "epoch": 0.03145739507139919, "grad_norm": 3.9512458600612224, "learning_rate": 1.9976834192707898e-05, "loss": 1.5146, "step": 7610 }, { "epoch": 0.03149873199002127, "grad_norm": 6.319637912227645, "learning_rate": 1.9976744870420925e-05, "loss": 1.5232, "step": 7620 }, { "epoch": 0.031540068908643346, "grad_norm": 3.385728813673924, "learning_rate": 1.9976655376462003e-05, "loss": 1.4964, "step": 7630 }, { "epoch": 0.031581405827265416, "grad_norm": 3.5621843155206365, "learning_rate": 1.997656571083267e-05, "loss": 1.4948, "step": 7640 }, { "epoch": 0.03162274274588749, "grad_norm": 3.5127469264785334, "learning_rate": 1.9976475873534476e-05, "loss": 1.4788, "step": 7650 }, { "epoch": 0.03166407966450957, "grad_norm": 5.123261236833159, "learning_rate": 1.9976385864568958e-05, "loss": 1.4906, "step": 7660 }, { "epoch": 0.03170541658313164, "grad_norm": 3.0872059709849378, "learning_rate": 1.997629568393767e-05, "loss": 1.5013, "step": 7670 }, { "epoch": 0.03174675350175372, "grad_norm": 3.516937874543627, "learning_rate": 1.9976205331642165e-05, "loss": 1.4802, "step": 7680 }, { "epoch": 0.031788090420375795, "grad_norm": 3.7194230277933684, "learning_rate": 1.9976114807683996e-05, "loss": 1.4776, "step": 7690 }, { "epoch": 0.03182942733899787, "grad_norm": 3.6171028748264016, "learning_rate": 1.9976024112064718e-05, "loss": 1.4867, "step": 7700 }, { "epoch": 0.03187076425761994, "grad_norm": 4.74734081777917, "learning_rate": 1.9975933244785894e-05, "loss": 1.5321, "step": 7710 }, { "epoch": 0.03191210117624202, "grad_norm": 3.4998159594245912, "learning_rate": 1.997584220584909e-05, "loss": 1.4555, "step": 7720 }, { "epoch": 0.031953438094864096, "grad_norm": 4.0664157482197245, "learning_rate": 1.9975750995255865e-05, "loss": 1.4982, "step": 7730 }, { "epoch": 0.03199477501348617, "grad_norm": 4.448956391697098, "learning_rate": 1.9975659613007797e-05, "loss": 1.4877, "step": 7740 }, { "epoch": 0.032036111932108244, "grad_norm": 3.9116050665163056, "learning_rate": 1.9975568059106455e-05, "loss": 1.51, "step": 7750 }, { "epoch": 0.03207744885073032, "grad_norm": 3.8835007718258874, "learning_rate": 1.9975476333553416e-05, "loss": 1.5245, "step": 7760 }, { "epoch": 0.0321187857693524, "grad_norm": 3.8364373709477215, "learning_rate": 1.9975384436350254e-05, "loss": 1.467, "step": 7770 }, { "epoch": 0.03216012268797447, "grad_norm": 3.848935245597496, "learning_rate": 1.9975292367498556e-05, "loss": 1.4999, "step": 7780 }, { "epoch": 0.032201459606596546, "grad_norm": 3.309302500010883, "learning_rate": 1.99752001269999e-05, "loss": 1.513, "step": 7790 }, { "epoch": 0.03224279652521862, "grad_norm": 3.417545265677783, "learning_rate": 1.9975107714855875e-05, "loss": 1.5138, "step": 7800 }, { "epoch": 0.03228413344384069, "grad_norm": 4.218881911775881, "learning_rate": 1.9975015131068078e-05, "loss": 1.4763, "step": 7810 }, { "epoch": 0.03232547036246277, "grad_norm": 3.581778505125333, "learning_rate": 1.997492237563809e-05, "loss": 1.5195, "step": 7820 }, { "epoch": 0.03236680728108485, "grad_norm": 3.7604938020140226, "learning_rate": 1.997482944856752e-05, "loss": 1.4933, "step": 7830 }, { "epoch": 0.032408144199706924, "grad_norm": 4.37155325952897, "learning_rate": 1.997473634985796e-05, "loss": 1.4557, "step": 7840 }, { "epoch": 0.032449481118328995, "grad_norm": 3.3738873014096153, "learning_rate": 1.9974643079511008e-05, "loss": 1.5323, "step": 7850 }, { "epoch": 0.03249081803695107, "grad_norm": 4.587428592575524, "learning_rate": 1.9974549637528276e-05, "loss": 1.5129, "step": 7860 }, { "epoch": 0.03253215495557315, "grad_norm": 3.7141451702858252, "learning_rate": 1.997445602391137e-05, "loss": 1.5176, "step": 7870 }, { "epoch": 0.03257349187419522, "grad_norm": 3.585481088159076, "learning_rate": 1.9974362238661903e-05, "loss": 1.5109, "step": 7880 }, { "epoch": 0.032614828792817296, "grad_norm": 3.4290626560881936, "learning_rate": 1.9974268281781484e-05, "loss": 1.4477, "step": 7890 }, { "epoch": 0.032656165711439374, "grad_norm": 3.612891113663602, "learning_rate": 1.9974174153271728e-05, "loss": 1.4229, "step": 7900 }, { "epoch": 0.03269750263006145, "grad_norm": 3.487845093010647, "learning_rate": 1.9974079853134266e-05, "loss": 1.5321, "step": 7910 }, { "epoch": 0.03273883954868352, "grad_norm": 5.587764895917656, "learning_rate": 1.9973985381370707e-05, "loss": 1.4645, "step": 7920 }, { "epoch": 0.0327801764673056, "grad_norm": 3.5490448514590494, "learning_rate": 1.9973890737982684e-05, "loss": 1.5374, "step": 7930 }, { "epoch": 0.032821513385927675, "grad_norm": 3.94930169954439, "learning_rate": 1.9973795922971827e-05, "loss": 1.4661, "step": 7940 }, { "epoch": 0.032862850304549746, "grad_norm": 3.494031781745132, "learning_rate": 1.9973700936339763e-05, "loss": 1.4291, "step": 7950 }, { "epoch": 0.03290418722317182, "grad_norm": 3.0746272722997414, "learning_rate": 1.9973605778088126e-05, "loss": 1.5493, "step": 7960 }, { "epoch": 0.0329455241417939, "grad_norm": 3.6416998354027736, "learning_rate": 1.9973510448218558e-05, "loss": 1.4471, "step": 7970 }, { "epoch": 0.03298686106041597, "grad_norm": 3.8385152763033883, "learning_rate": 1.99734149467327e-05, "loss": 1.5182, "step": 7980 }, { "epoch": 0.03302819797903805, "grad_norm": 3.5463908529630985, "learning_rate": 1.9973319273632187e-05, "loss": 1.4848, "step": 7990 }, { "epoch": 0.033069534897660124, "grad_norm": 4.494540529736681, "learning_rate": 1.9973223428918677e-05, "loss": 1.4656, "step": 8000 }, { "epoch": 0.0331108718162822, "grad_norm": 4.138487839617815, "learning_rate": 1.997312741259381e-05, "loss": 1.4533, "step": 8010 }, { "epoch": 0.03315220873490427, "grad_norm": 3.2836571982439957, "learning_rate": 1.9973031224659238e-05, "loss": 1.4637, "step": 8020 }, { "epoch": 0.03319354565352635, "grad_norm": 4.101602166042781, "learning_rate": 1.9972934865116622e-05, "loss": 1.4656, "step": 8030 }, { "epoch": 0.033234882572148426, "grad_norm": 4.372499340047587, "learning_rate": 1.9972838333967615e-05, "loss": 1.5377, "step": 8040 }, { "epoch": 0.033276219490770496, "grad_norm": 3.984477694550147, "learning_rate": 1.997274163121388e-05, "loss": 1.5216, "step": 8050 }, { "epoch": 0.033317556409392574, "grad_norm": 3.484892612102319, "learning_rate": 1.9972644756857087e-05, "loss": 1.459, "step": 8060 }, { "epoch": 0.03335889332801465, "grad_norm": 3.473953984247227, "learning_rate": 1.9972547710898894e-05, "loss": 1.4889, "step": 8070 }, { "epoch": 0.03340023024663673, "grad_norm": 3.6982560350258384, "learning_rate": 1.9972450493340973e-05, "loss": 1.4529, "step": 8080 }, { "epoch": 0.0334415671652588, "grad_norm": 4.338255434805353, "learning_rate": 1.9972353104185e-05, "loss": 1.4906, "step": 8090 }, { "epoch": 0.033482904083880875, "grad_norm": 3.2105273217876897, "learning_rate": 1.9972255543432644e-05, "loss": 1.4846, "step": 8100 }, { "epoch": 0.03352424100250295, "grad_norm": 3.7063825752894037, "learning_rate": 1.997215781108559e-05, "loss": 1.4354, "step": 8110 }, { "epoch": 0.03356557792112502, "grad_norm": 3.606082322653538, "learning_rate": 1.997205990714552e-05, "loss": 1.5067, "step": 8120 }, { "epoch": 0.0336069148397471, "grad_norm": 3.8291971667898643, "learning_rate": 1.9971961831614116e-05, "loss": 1.4619, "step": 8130 }, { "epoch": 0.03364825175836918, "grad_norm": 3.331721665794406, "learning_rate": 1.997186358449307e-05, "loss": 1.4484, "step": 8140 }, { "epoch": 0.033689588676991254, "grad_norm": 4.417828154129124, "learning_rate": 1.9971765165784065e-05, "loss": 1.508, "step": 8150 }, { "epoch": 0.033730925595613324, "grad_norm": 3.782991542711988, "learning_rate": 1.9971666575488798e-05, "loss": 1.3925, "step": 8160 }, { "epoch": 0.0337722625142354, "grad_norm": 4.079542042860949, "learning_rate": 1.997156781360897e-05, "loss": 1.4996, "step": 8170 }, { "epoch": 0.03381359943285748, "grad_norm": 4.1626484536651995, "learning_rate": 1.9971468880146273e-05, "loss": 1.5178, "step": 8180 }, { "epoch": 0.03385493635147955, "grad_norm": 4.122058968257828, "learning_rate": 1.9971369775102417e-05, "loss": 1.4267, "step": 8190 }, { "epoch": 0.033896273270101626, "grad_norm": 4.017697912034419, "learning_rate": 1.9971270498479097e-05, "loss": 1.5129, "step": 8200 }, { "epoch": 0.0339376101887237, "grad_norm": 3.3575071407879977, "learning_rate": 1.997117105027803e-05, "loss": 1.4796, "step": 8210 }, { "epoch": 0.03397894710734578, "grad_norm": 3.9246891678069598, "learning_rate": 1.9971071430500924e-05, "loss": 1.5052, "step": 8220 }, { "epoch": 0.03402028402596785, "grad_norm": 4.075111279351767, "learning_rate": 1.9970971639149493e-05, "loss": 1.4606, "step": 8230 }, { "epoch": 0.03406162094458993, "grad_norm": 4.564662754708322, "learning_rate": 1.997087167622546e-05, "loss": 1.5111, "step": 8240 }, { "epoch": 0.034102957863212005, "grad_norm": 3.753297984489193, "learning_rate": 1.9970771541730536e-05, "loss": 1.4899, "step": 8250 }, { "epoch": 0.034144294781834075, "grad_norm": 3.913489388979073, "learning_rate": 1.997067123566645e-05, "loss": 1.4796, "step": 8260 }, { "epoch": 0.03418563170045615, "grad_norm": 3.440711906522703, "learning_rate": 1.9970570758034924e-05, "loss": 1.5184, "step": 8270 }, { "epoch": 0.03422696861907823, "grad_norm": 3.7595940155205363, "learning_rate": 1.997047010883769e-05, "loss": 1.4901, "step": 8280 }, { "epoch": 0.0342683055377003, "grad_norm": 4.387403962431217, "learning_rate": 1.9970369288076478e-05, "loss": 1.4553, "step": 8290 }, { "epoch": 0.03430964245632238, "grad_norm": 3.7034201439850594, "learning_rate": 1.9970268295753022e-05, "loss": 1.4534, "step": 8300 }, { "epoch": 0.034350979374944454, "grad_norm": 4.0052763549673225, "learning_rate": 1.9970167131869064e-05, "loss": 1.4539, "step": 8310 }, { "epoch": 0.03439231629356653, "grad_norm": 4.079349801665228, "learning_rate": 1.9970065796426342e-05, "loss": 1.4698, "step": 8320 }, { "epoch": 0.0344336532121886, "grad_norm": 3.9322937150085875, "learning_rate": 1.99699642894266e-05, "loss": 1.4318, "step": 8330 }, { "epoch": 0.03447499013081068, "grad_norm": 3.834658918473933, "learning_rate": 1.9969862610871586e-05, "loss": 1.4687, "step": 8340 }, { "epoch": 0.034516327049432756, "grad_norm": 3.3637972502778912, "learning_rate": 1.9969760760763045e-05, "loss": 1.4661, "step": 8350 }, { "epoch": 0.034557663968054826, "grad_norm": 3.5654841117273026, "learning_rate": 1.9969658739102733e-05, "loss": 1.4302, "step": 8360 }, { "epoch": 0.0345990008866769, "grad_norm": 3.7270409908212194, "learning_rate": 1.9969556545892405e-05, "loss": 1.4447, "step": 8370 }, { "epoch": 0.03464033780529898, "grad_norm": 3.914859995823126, "learning_rate": 1.996945418113382e-05, "loss": 1.4519, "step": 8380 }, { "epoch": 0.03468167472392106, "grad_norm": 4.5791406660012095, "learning_rate": 1.9969351644828742e-05, "loss": 1.5204, "step": 8390 }, { "epoch": 0.03472301164254313, "grad_norm": 3.6418617205879817, "learning_rate": 1.9969248936978932e-05, "loss": 1.4943, "step": 8400 }, { "epoch": 0.034764348561165205, "grad_norm": 3.4609359361113534, "learning_rate": 1.9969146057586156e-05, "loss": 1.4799, "step": 8410 }, { "epoch": 0.03480568547978728, "grad_norm": 3.9925889106780987, "learning_rate": 1.9969043006652186e-05, "loss": 1.4687, "step": 8420 }, { "epoch": 0.03484702239840935, "grad_norm": 3.2054804086428548, "learning_rate": 1.9968939784178794e-05, "loss": 1.4816, "step": 8430 }, { "epoch": 0.03488835931703143, "grad_norm": 3.349889124617882, "learning_rate": 1.996883639016776e-05, "loss": 1.4577, "step": 8440 }, { "epoch": 0.03492969623565351, "grad_norm": 3.5789857070884086, "learning_rate": 1.996873282462086e-05, "loss": 1.5172, "step": 8450 }, { "epoch": 0.034971033154275584, "grad_norm": 3.4989039985130272, "learning_rate": 1.9968629087539876e-05, "loss": 1.4852, "step": 8460 }, { "epoch": 0.035012370072897654, "grad_norm": 3.3811833684889154, "learning_rate": 1.9968525178926595e-05, "loss": 1.4594, "step": 8470 }, { "epoch": 0.03505370699151973, "grad_norm": 3.4385931194022223, "learning_rate": 1.9968421098782803e-05, "loss": 1.4595, "step": 8480 }, { "epoch": 0.03509504391014181, "grad_norm": 4.16413923561225, "learning_rate": 1.9968316847110292e-05, "loss": 1.4963, "step": 8490 }, { "epoch": 0.03513638082876388, "grad_norm": 4.289305042774894, "learning_rate": 1.9968212423910855e-05, "loss": 1.4551, "step": 8500 }, { "epoch": 0.035177717747385956, "grad_norm": 5.453654756696216, "learning_rate": 1.9968107829186287e-05, "loss": 1.4885, "step": 8510 }, { "epoch": 0.03521905466600803, "grad_norm": 3.6797118668666795, "learning_rate": 1.996800306293839e-05, "loss": 1.4984, "step": 8520 }, { "epoch": 0.03526039158463011, "grad_norm": 3.2371854818646995, "learning_rate": 1.9967898125168973e-05, "loss": 1.4481, "step": 8530 }, { "epoch": 0.03530172850325218, "grad_norm": 3.238508861502653, "learning_rate": 1.9967793015879828e-05, "loss": 1.4562, "step": 8540 }, { "epoch": 0.03534306542187426, "grad_norm": 3.5415115005606177, "learning_rate": 1.9967687735072776e-05, "loss": 1.476, "step": 8550 }, { "epoch": 0.035384402340496335, "grad_norm": 3.843042698193225, "learning_rate": 1.9967582282749622e-05, "loss": 1.4751, "step": 8560 }, { "epoch": 0.035425739259118405, "grad_norm": 3.5779391668735006, "learning_rate": 1.9967476658912184e-05, "loss": 1.4804, "step": 8570 }, { "epoch": 0.03546707617774048, "grad_norm": 4.949952686769368, "learning_rate": 1.9967370863562276e-05, "loss": 1.4245, "step": 8580 }, { "epoch": 0.03550841309636256, "grad_norm": 3.8134978579481924, "learning_rate": 1.996726489670172e-05, "loss": 1.494, "step": 8590 }, { "epoch": 0.03554975001498463, "grad_norm": 4.098567290916666, "learning_rate": 1.996715875833234e-05, "loss": 1.4339, "step": 8600 }, { "epoch": 0.03559108693360671, "grad_norm": 3.4443466301897074, "learning_rate": 1.9967052448455962e-05, "loss": 1.4808, "step": 8610 }, { "epoch": 0.035632423852228784, "grad_norm": 3.939242075931307, "learning_rate": 1.9966945967074416e-05, "loss": 1.4884, "step": 8620 }, { "epoch": 0.03567376077085086, "grad_norm": 3.3941498280577975, "learning_rate": 1.996683931418953e-05, "loss": 1.4635, "step": 8630 }, { "epoch": 0.03571509768947293, "grad_norm": 3.911248054251368, "learning_rate": 1.996673248980315e-05, "loss": 1.4785, "step": 8640 }, { "epoch": 0.03575643460809501, "grad_norm": 4.0383619484944155, "learning_rate": 1.99666254939171e-05, "loss": 1.4334, "step": 8650 }, { "epoch": 0.035797771526717086, "grad_norm": 3.21818266356431, "learning_rate": 1.996651832653323e-05, "loss": 1.5279, "step": 8660 }, { "epoch": 0.035839108445339156, "grad_norm": 4.068360221073268, "learning_rate": 1.9966410987653383e-05, "loss": 1.5073, "step": 8670 }, { "epoch": 0.03588044536396123, "grad_norm": 5.64416307388456, "learning_rate": 1.9966303477279404e-05, "loss": 1.4595, "step": 8680 }, { "epoch": 0.03592178228258331, "grad_norm": 4.456991706091006, "learning_rate": 1.9966195795413145e-05, "loss": 1.5152, "step": 8690 }, { "epoch": 0.03596311920120539, "grad_norm": 3.541237309488241, "learning_rate": 1.9966087942056457e-05, "loss": 1.4773, "step": 8700 }, { "epoch": 0.03600445611982746, "grad_norm": 3.5306668816992186, "learning_rate": 1.9965979917211196e-05, "loss": 1.4838, "step": 8710 }, { "epoch": 0.036045793038449535, "grad_norm": 4.051613339547623, "learning_rate": 1.9965871720879223e-05, "loss": 1.463, "step": 8720 }, { "epoch": 0.03608712995707161, "grad_norm": 3.6129954404785, "learning_rate": 1.9965763353062394e-05, "loss": 1.4479, "step": 8730 }, { "epoch": 0.03612846687569368, "grad_norm": 4.2384349637413825, "learning_rate": 1.9965654813762582e-05, "loss": 1.4928, "step": 8740 }, { "epoch": 0.03616980379431576, "grad_norm": 4.343148391315392, "learning_rate": 1.9965546102981652e-05, "loss": 1.4418, "step": 8750 }, { "epoch": 0.03621114071293784, "grad_norm": 3.9477945474327276, "learning_rate": 1.996543722072147e-05, "loss": 1.4417, "step": 8760 }, { "epoch": 0.036252477631559914, "grad_norm": 3.912481869555381, "learning_rate": 1.9965328166983916e-05, "loss": 1.4877, "step": 8770 }, { "epoch": 0.036293814550181984, "grad_norm": 4.391935734682612, "learning_rate": 1.9965218941770857e-05, "loss": 1.4335, "step": 8780 }, { "epoch": 0.03633515146880406, "grad_norm": 4.493537291846412, "learning_rate": 1.9965109545084185e-05, "loss": 1.4919, "step": 8790 }, { "epoch": 0.03637648838742614, "grad_norm": 2.93026955700472, "learning_rate": 1.9964999976925775e-05, "loss": 1.4304, "step": 8800 }, { "epoch": 0.03641782530604821, "grad_norm": 3.6053506467813032, "learning_rate": 1.9964890237297512e-05, "loss": 1.4635, "step": 8810 }, { "epoch": 0.036459162224670286, "grad_norm": 3.5234834011018648, "learning_rate": 1.9964780326201286e-05, "loss": 1.4981, "step": 8820 }, { "epoch": 0.03650049914329236, "grad_norm": 3.750450253620856, "learning_rate": 1.996467024363899e-05, "loss": 1.4627, "step": 8830 }, { "epoch": 0.03654183606191444, "grad_norm": 3.666723051780572, "learning_rate": 1.9964559989612516e-05, "loss": 1.4514, "step": 8840 }, { "epoch": 0.03658317298053651, "grad_norm": 3.3239044375214633, "learning_rate": 1.996444956412376e-05, "loss": 1.4972, "step": 8850 }, { "epoch": 0.03662450989915859, "grad_norm": 3.8599698199624064, "learning_rate": 1.9964338967174625e-05, "loss": 1.5057, "step": 8860 }, { "epoch": 0.036665846817780665, "grad_norm": 4.132699231086706, "learning_rate": 1.9964228198767012e-05, "loss": 1.4519, "step": 8870 }, { "epoch": 0.036707183736402735, "grad_norm": 3.0714085451165745, "learning_rate": 1.9964117258902828e-05, "loss": 1.434, "step": 8880 }, { "epoch": 0.03674852065502481, "grad_norm": 3.8796486291954904, "learning_rate": 1.9964006147583982e-05, "loss": 1.4505, "step": 8890 }, { "epoch": 0.03678985757364689, "grad_norm": 3.832002897416075, "learning_rate": 1.9963894864812383e-05, "loss": 1.4526, "step": 8900 }, { "epoch": 0.03683119449226896, "grad_norm": 4.887224283091199, "learning_rate": 1.9963783410589948e-05, "loss": 1.4644, "step": 8910 }, { "epoch": 0.036872531410891037, "grad_norm": 4.158724114940273, "learning_rate": 1.99636717849186e-05, "loss": 1.4417, "step": 8920 }, { "epoch": 0.036913868329513114, "grad_norm": 3.81771878130769, "learning_rate": 1.9963559987800253e-05, "loss": 1.508, "step": 8930 }, { "epoch": 0.03695520524813519, "grad_norm": 3.5553407292065207, "learning_rate": 1.9963448019236834e-05, "loss": 1.383, "step": 8940 }, { "epoch": 0.03699654216675726, "grad_norm": 5.03061141095772, "learning_rate": 1.9963335879230264e-05, "loss": 1.4293, "step": 8950 }, { "epoch": 0.03703787908537934, "grad_norm": 3.183233332739406, "learning_rate": 1.996322356778248e-05, "loss": 1.4355, "step": 8960 }, { "epoch": 0.037079216004001415, "grad_norm": 3.555732688914675, "learning_rate": 1.996311108489541e-05, "loss": 1.4338, "step": 8970 }, { "epoch": 0.037120552922623486, "grad_norm": 3.696220192021282, "learning_rate": 1.9962998430570994e-05, "loss": 1.4883, "step": 8980 }, { "epoch": 0.03716188984124556, "grad_norm": 4.796096029475931, "learning_rate": 1.9962885604811168e-05, "loss": 1.4901, "step": 8990 }, { "epoch": 0.03720322675986764, "grad_norm": 5.814203236754815, "learning_rate": 1.996277260761787e-05, "loss": 1.4053, "step": 9000 }, { "epoch": 0.03724456367848972, "grad_norm": 3.3287110492970795, "learning_rate": 1.996265943899305e-05, "loss": 1.4202, "step": 9010 }, { "epoch": 0.03728590059711179, "grad_norm": 3.877230681858091, "learning_rate": 1.996254609893865e-05, "loss": 1.4297, "step": 9020 }, { "epoch": 0.037327237515733865, "grad_norm": 3.48844533397734, "learning_rate": 1.9962432587456622e-05, "loss": 1.4652, "step": 9030 }, { "epoch": 0.03736857443435594, "grad_norm": 3.5520610987897943, "learning_rate": 1.9962318904548923e-05, "loss": 1.4807, "step": 9040 }, { "epoch": 0.03740991135297801, "grad_norm": 3.181838391240591, "learning_rate": 1.9962205050217504e-05, "loss": 1.4757, "step": 9050 }, { "epoch": 0.03745124827160009, "grad_norm": 3.7425531387998907, "learning_rate": 1.996209102446433e-05, "loss": 1.4331, "step": 9060 }, { "epoch": 0.037492585190222166, "grad_norm": 3.663633392520708, "learning_rate": 1.9961976827291358e-05, "loss": 1.4718, "step": 9070 }, { "epoch": 0.03753392210884424, "grad_norm": 4.833995454604731, "learning_rate": 1.9961862458700554e-05, "loss": 1.4217, "step": 9080 }, { "epoch": 0.037575259027466314, "grad_norm": 3.6290459016542216, "learning_rate": 1.9961747918693887e-05, "loss": 1.4848, "step": 9090 }, { "epoch": 0.03761659594608839, "grad_norm": 3.585806885070931, "learning_rate": 1.9961633207273325e-05, "loss": 1.4358, "step": 9100 }, { "epoch": 0.03765793286471047, "grad_norm": 3.4952003665857134, "learning_rate": 1.9961518324440847e-05, "loss": 1.3939, "step": 9110 }, { "epoch": 0.03769926978333254, "grad_norm": 3.279719203181294, "learning_rate": 1.9961403270198424e-05, "loss": 1.4808, "step": 9120 }, { "epoch": 0.037740606701954615, "grad_norm": 3.2692766545796528, "learning_rate": 1.9961288044548043e-05, "loss": 1.3822, "step": 9130 }, { "epoch": 0.03778194362057669, "grad_norm": 3.6490123739235623, "learning_rate": 1.996117264749168e-05, "loss": 1.4485, "step": 9140 }, { "epoch": 0.03782328053919877, "grad_norm": 4.464763724322134, "learning_rate": 1.996105707903132e-05, "loss": 1.4795, "step": 9150 }, { "epoch": 0.03786461745782084, "grad_norm": 3.529618572994803, "learning_rate": 1.9960941339168963e-05, "loss": 1.4452, "step": 9160 }, { "epoch": 0.03790595437644292, "grad_norm": 3.949852891089842, "learning_rate": 1.9960825427906587e-05, "loss": 1.4866, "step": 9170 }, { "epoch": 0.037947291295064994, "grad_norm": 6.3198129841396735, "learning_rate": 1.9960709345246192e-05, "loss": 1.4661, "step": 9180 }, { "epoch": 0.037988628213687065, "grad_norm": 4.3016466998403775, "learning_rate": 1.9960593091189776e-05, "loss": 1.4575, "step": 9190 }, { "epoch": 0.03802996513230914, "grad_norm": 3.218809542898574, "learning_rate": 1.996047666573934e-05, "loss": 1.4385, "step": 9200 }, { "epoch": 0.03807130205093122, "grad_norm": 3.507546904929844, "learning_rate": 1.9960360068896884e-05, "loss": 1.456, "step": 9210 }, { "epoch": 0.038112638969553296, "grad_norm": 3.2658287561416866, "learning_rate": 1.9960243300664418e-05, "loss": 1.4937, "step": 9220 }, { "epoch": 0.038153975888175366, "grad_norm": 3.9657257849748078, "learning_rate": 1.996012636104395e-05, "loss": 1.4743, "step": 9230 }, { "epoch": 0.03819531280679744, "grad_norm": 3.7419945345055865, "learning_rate": 1.996000925003749e-05, "loss": 1.4645, "step": 9240 }, { "epoch": 0.03823664972541952, "grad_norm": 3.717998186266208, "learning_rate": 1.9959891967647055e-05, "loss": 1.4304, "step": 9250 }, { "epoch": 0.03827798664404159, "grad_norm": 4.122611974230224, "learning_rate": 1.9959774513874666e-05, "loss": 1.4396, "step": 9260 }, { "epoch": 0.03831932356266367, "grad_norm": 4.081766829152903, "learning_rate": 1.9959656888722338e-05, "loss": 1.4296, "step": 9270 }, { "epoch": 0.038360660481285745, "grad_norm": 3.4327932618086807, "learning_rate": 1.99595390921921e-05, "loss": 1.479, "step": 9280 }, { "epoch": 0.038401997399907815, "grad_norm": 4.251866528393302, "learning_rate": 1.9959421124285976e-05, "loss": 1.4399, "step": 9290 }, { "epoch": 0.03844333431852989, "grad_norm": 4.132921022210262, "learning_rate": 1.9959302985006e-05, "loss": 1.4366, "step": 9300 }, { "epoch": 0.03848467123715197, "grad_norm": 4.791211851168452, "learning_rate": 1.9959184674354198e-05, "loss": 1.4838, "step": 9310 }, { "epoch": 0.03852600815577405, "grad_norm": 3.000007579210258, "learning_rate": 1.995906619233261e-05, "loss": 1.4541, "step": 9320 }, { "epoch": 0.03856734507439612, "grad_norm": 5.059959210643911, "learning_rate": 1.9958947538943278e-05, "loss": 1.5233, "step": 9330 }, { "epoch": 0.038608681993018194, "grad_norm": 3.20842711732194, "learning_rate": 1.9958828714188236e-05, "loss": 1.4718, "step": 9340 }, { "epoch": 0.03865001891164027, "grad_norm": 3.796018357701994, "learning_rate": 1.9958709718069532e-05, "loss": 1.4522, "step": 9350 }, { "epoch": 0.03869135583026234, "grad_norm": 3.9321479256125347, "learning_rate": 1.995859055058922e-05, "loss": 1.5065, "step": 9360 }, { "epoch": 0.03873269274888442, "grad_norm": 3.254019632954085, "learning_rate": 1.9958471211749342e-05, "loss": 1.4114, "step": 9370 }, { "epoch": 0.038774029667506496, "grad_norm": 3.3308294037697896, "learning_rate": 1.9958351701551953e-05, "loss": 1.4285, "step": 9380 }, { "epoch": 0.03881536658612857, "grad_norm": 4.08834871777043, "learning_rate": 1.9958232019999114e-05, "loss": 1.4295, "step": 9390 }, { "epoch": 0.03885670350475064, "grad_norm": 3.441069579264666, "learning_rate": 1.995811216709288e-05, "loss": 1.4472, "step": 9400 }, { "epoch": 0.03889804042337272, "grad_norm": 3.426532775633606, "learning_rate": 1.995799214283531e-05, "loss": 1.4566, "step": 9410 }, { "epoch": 0.0389393773419948, "grad_norm": 3.399689817601649, "learning_rate": 1.9957871947228476e-05, "loss": 1.4642, "step": 9420 }, { "epoch": 0.03898071426061687, "grad_norm": 3.388140389856613, "learning_rate": 1.995775158027445e-05, "loss": 1.4593, "step": 9430 }, { "epoch": 0.039022051179238945, "grad_norm": 3.325888034679041, "learning_rate": 1.9957631041975292e-05, "loss": 1.473, "step": 9440 }, { "epoch": 0.03906338809786102, "grad_norm": 5.25058506424265, "learning_rate": 1.995751033233308e-05, "loss": 1.4085, "step": 9450 }, { "epoch": 0.0391047250164831, "grad_norm": 3.8257776442726135, "learning_rate": 1.9957389451349898e-05, "loss": 1.4926, "step": 9460 }, { "epoch": 0.03914606193510517, "grad_norm": 3.9914355755037514, "learning_rate": 1.9957268399027815e-05, "loss": 1.4433, "step": 9470 }, { "epoch": 0.03918739885372725, "grad_norm": 4.259453650516103, "learning_rate": 1.9957147175368923e-05, "loss": 1.4435, "step": 9480 }, { "epoch": 0.039228735772349324, "grad_norm": 3.4057039561381974, "learning_rate": 1.99570257803753e-05, "loss": 1.4021, "step": 9490 }, { "epoch": 0.039270072690971394, "grad_norm": 3.9702568689341735, "learning_rate": 1.9956904214049044e-05, "loss": 1.3975, "step": 9500 }, { "epoch": 0.03931140960959347, "grad_norm": 4.162984306124767, "learning_rate": 1.995678247639224e-05, "loss": 1.4269, "step": 9510 }, { "epoch": 0.03935274652821555, "grad_norm": 3.4623660466543216, "learning_rate": 1.9956660567406984e-05, "loss": 1.4812, "step": 9520 }, { "epoch": 0.039394083446837626, "grad_norm": 3.9487634862208663, "learning_rate": 1.9956538487095375e-05, "loss": 1.3904, "step": 9530 }, { "epoch": 0.039435420365459696, "grad_norm": 3.940768474272943, "learning_rate": 1.9956416235459514e-05, "loss": 1.4627, "step": 9540 }, { "epoch": 0.03947675728408177, "grad_norm": 3.7240510688214488, "learning_rate": 1.9956293812501503e-05, "loss": 1.4714, "step": 9550 }, { "epoch": 0.03951809420270385, "grad_norm": 3.544248199002313, "learning_rate": 1.995617121822345e-05, "loss": 1.4418, "step": 9560 }, { "epoch": 0.03955943112132592, "grad_norm": 3.7941720521427453, "learning_rate": 1.9956048452627463e-05, "loss": 1.398, "step": 9570 }, { "epoch": 0.039600768039948, "grad_norm": 3.231769382614049, "learning_rate": 1.9955925515715656e-05, "loss": 1.4323, "step": 9580 }, { "epoch": 0.039642104958570075, "grad_norm": 3.4504677343753585, "learning_rate": 1.9955802407490144e-05, "loss": 1.4508, "step": 9590 }, { "epoch": 0.039683441877192145, "grad_norm": 4.608743387499926, "learning_rate": 1.9955679127953046e-05, "loss": 1.4849, "step": 9600 }, { "epoch": 0.03972477879581422, "grad_norm": 3.2583619571782223, "learning_rate": 1.995555567710648e-05, "loss": 1.4528, "step": 9610 }, { "epoch": 0.0397661157144363, "grad_norm": 3.592600847545303, "learning_rate": 1.9955432054952573e-05, "loss": 1.4222, "step": 9620 }, { "epoch": 0.03980745263305838, "grad_norm": 3.935340478064598, "learning_rate": 1.9955308261493457e-05, "loss": 1.4243, "step": 9630 }, { "epoch": 0.03984878955168045, "grad_norm": 3.7051161334020075, "learning_rate": 1.995518429673125e-05, "loss": 1.4487, "step": 9640 }, { "epoch": 0.039890126470302524, "grad_norm": 3.6647142900939977, "learning_rate": 1.9955060160668095e-05, "loss": 1.4458, "step": 9650 }, { "epoch": 0.0399314633889246, "grad_norm": 4.428497991354939, "learning_rate": 1.9954935853306124e-05, "loss": 1.4721, "step": 9660 }, { "epoch": 0.03997280030754667, "grad_norm": 3.2958564393103056, "learning_rate": 1.9954811374647474e-05, "loss": 1.4394, "step": 9670 }, { "epoch": 0.04001413722616875, "grad_norm": 3.10104196973718, "learning_rate": 1.9954686724694297e-05, "loss": 1.4361, "step": 9680 }, { "epoch": 0.040055474144790826, "grad_norm": 3.957938872776804, "learning_rate": 1.9954561903448727e-05, "loss": 1.4602, "step": 9690 }, { "epoch": 0.0400968110634129, "grad_norm": 3.760794840185392, "learning_rate": 1.9954436910912914e-05, "loss": 1.4285, "step": 9700 }, { "epoch": 0.04013814798203497, "grad_norm": 3.421396807117046, "learning_rate": 1.9954311747089012e-05, "loss": 1.4774, "step": 9710 }, { "epoch": 0.04017948490065705, "grad_norm": 3.91789094802535, "learning_rate": 1.9954186411979175e-05, "loss": 1.4021, "step": 9720 }, { "epoch": 0.04022082181927913, "grad_norm": 3.081464490088515, "learning_rate": 1.9954060905585556e-05, "loss": 1.4219, "step": 9730 }, { "epoch": 0.0402621587379012, "grad_norm": 3.3381107703512507, "learning_rate": 1.9953935227910316e-05, "loss": 1.4632, "step": 9740 }, { "epoch": 0.040303495656523275, "grad_norm": 3.8300980744875828, "learning_rate": 1.995380937895562e-05, "loss": 1.4322, "step": 9750 }, { "epoch": 0.04034483257514535, "grad_norm": 3.4534661824404633, "learning_rate": 1.995368335872363e-05, "loss": 1.4436, "step": 9760 }, { "epoch": 0.04038616949376743, "grad_norm": 3.983712880561037, "learning_rate": 1.995355716721652e-05, "loss": 1.4598, "step": 9770 }, { "epoch": 0.0404275064123895, "grad_norm": 3.840919795852268, "learning_rate": 1.995343080443645e-05, "loss": 1.4456, "step": 9780 }, { "epoch": 0.04046884333101158, "grad_norm": 3.901157368681076, "learning_rate": 1.9953304270385607e-05, "loss": 1.4525, "step": 9790 }, { "epoch": 0.040510180249633654, "grad_norm": 3.189808091891606, "learning_rate": 1.9953177565066163e-05, "loss": 1.4462, "step": 9800 }, { "epoch": 0.040551517168255724, "grad_norm": 4.025890961267514, "learning_rate": 1.9953050688480293e-05, "loss": 1.443, "step": 9810 }, { "epoch": 0.0405928540868778, "grad_norm": 3.3710964799433007, "learning_rate": 1.995292364063019e-05, "loss": 1.4262, "step": 9820 }, { "epoch": 0.04063419100549988, "grad_norm": 3.883950857165337, "learning_rate": 1.9952796421518034e-05, "loss": 1.4174, "step": 9830 }, { "epoch": 0.040675527924121956, "grad_norm": 3.474777308443348, "learning_rate": 1.995266903114602e-05, "loss": 1.4654, "step": 9840 }, { "epoch": 0.040716864842744026, "grad_norm": 3.116715119287666, "learning_rate": 1.995254146951633e-05, "loss": 1.3727, "step": 9850 }, { "epoch": 0.0407582017613661, "grad_norm": 3.9762493552410203, "learning_rate": 1.9952413736631165e-05, "loss": 1.4567, "step": 9860 }, { "epoch": 0.04079953867998818, "grad_norm": 2.9468825909120033, "learning_rate": 1.9952285832492726e-05, "loss": 1.4422, "step": 9870 }, { "epoch": 0.04084087559861025, "grad_norm": 3.5348361015353444, "learning_rate": 1.995215775710321e-05, "loss": 1.3795, "step": 9880 }, { "epoch": 0.04088221251723233, "grad_norm": 3.276927230678387, "learning_rate": 1.995202951046482e-05, "loss": 1.4218, "step": 9890 }, { "epoch": 0.040923549435854405, "grad_norm": 3.606741214717579, "learning_rate": 1.9951901092579763e-05, "loss": 1.4364, "step": 9900 }, { "epoch": 0.040964886354476475, "grad_norm": 4.3260733895333425, "learning_rate": 1.9951772503450252e-05, "loss": 1.4398, "step": 9910 }, { "epoch": 0.04100622327309855, "grad_norm": 3.1621905456493544, "learning_rate": 1.9951643743078496e-05, "loss": 1.4397, "step": 9920 }, { "epoch": 0.04104756019172063, "grad_norm": 3.4412582079657623, "learning_rate": 1.9951514811466713e-05, "loss": 1.4036, "step": 9930 }, { "epoch": 0.041088897110342706, "grad_norm": 4.873896899838307, "learning_rate": 1.995138570861712e-05, "loss": 1.4263, "step": 9940 }, { "epoch": 0.04113023402896478, "grad_norm": 4.331814069124075, "learning_rate": 1.9951256434531943e-05, "loss": 1.4817, "step": 9950 }, { "epoch": 0.041171570947586854, "grad_norm": 3.9349360259135926, "learning_rate": 1.9951126989213398e-05, "loss": 1.4483, "step": 9960 }, { "epoch": 0.04121290786620893, "grad_norm": 3.3613448968668864, "learning_rate": 1.995099737266372e-05, "loss": 1.4229, "step": 9970 }, { "epoch": 0.041254244784831, "grad_norm": 3.549433934959654, "learning_rate": 1.9950867584885132e-05, "loss": 1.4283, "step": 9980 }, { "epoch": 0.04129558170345308, "grad_norm": 3.5652364655273208, "learning_rate": 1.995073762587987e-05, "loss": 1.4642, "step": 9990 }, { "epoch": 0.041336918622075156, "grad_norm": 4.029695967481624, "learning_rate": 1.995060749565018e-05, "loss": 1.3657, "step": 10000 }, { "epoch": 0.041336918622075156, "eval_loss": 1.736175537109375, "eval_runtime": 393.8494, "eval_samples_per_second": 10.4, "eval_steps_per_second": 2.6, "step": 10000 }, { "epoch": 0.04137825554069723, "grad_norm": 3.414046152937389, "learning_rate": 1.9950477194198287e-05, "loss": 1.3957, "step": 10010 }, { "epoch": 0.0414195924593193, "grad_norm": 5.320606616740586, "learning_rate": 1.9950346721526443e-05, "loss": 1.4508, "step": 10020 }, { "epoch": 0.04146092937794138, "grad_norm": 3.9807925522216423, "learning_rate": 1.9950216077636886e-05, "loss": 1.3943, "step": 10030 }, { "epoch": 0.04150226629656346, "grad_norm": 3.501083066632413, "learning_rate": 1.9950085262531868e-05, "loss": 1.4352, "step": 10040 }, { "epoch": 0.04154360321518553, "grad_norm": 3.771268569637735, "learning_rate": 1.994995427621364e-05, "loss": 1.452, "step": 10050 }, { "epoch": 0.041584940133807605, "grad_norm": 3.8515101224909216, "learning_rate": 1.9949823118684454e-05, "loss": 1.4306, "step": 10060 }, { "epoch": 0.04162627705242968, "grad_norm": 3.7934745333554782, "learning_rate": 1.9949691789946567e-05, "loss": 1.4805, "step": 10070 }, { "epoch": 0.04166761397105176, "grad_norm": 3.396103842576295, "learning_rate": 1.9949560290002245e-05, "loss": 1.4516, "step": 10080 }, { "epoch": 0.04170895088967383, "grad_norm": 3.4268415637061085, "learning_rate": 1.994942861885374e-05, "loss": 1.4143, "step": 10090 }, { "epoch": 0.041750287808295906, "grad_norm": 3.4582505595292203, "learning_rate": 1.9949296776503324e-05, "loss": 1.3815, "step": 10100 }, { "epoch": 0.041791624726917984, "grad_norm": 3.5395990026077304, "learning_rate": 1.994916476295327e-05, "loss": 1.4449, "step": 10110 }, { "epoch": 0.041832961645540054, "grad_norm": 3.4229281128115403, "learning_rate": 1.9949032578205834e-05, "loss": 1.4526, "step": 10120 }, { "epoch": 0.04187429856416213, "grad_norm": 3.983206436567361, "learning_rate": 1.994890022226331e-05, "loss": 1.4463, "step": 10130 }, { "epoch": 0.04191563548278421, "grad_norm": 3.668734425155437, "learning_rate": 1.9948767695127964e-05, "loss": 1.419, "step": 10140 }, { "epoch": 0.041956972401406285, "grad_norm": 3.3634372280714517, "learning_rate": 1.9948634996802078e-05, "loss": 1.4329, "step": 10150 }, { "epoch": 0.041998309320028356, "grad_norm": 4.062775728402737, "learning_rate": 1.9948502127287936e-05, "loss": 1.4361, "step": 10160 }, { "epoch": 0.04203964623865043, "grad_norm": 3.4149660693597084, "learning_rate": 1.9948369086587823e-05, "loss": 1.4725, "step": 10170 }, { "epoch": 0.04208098315727251, "grad_norm": 3.6916128915527313, "learning_rate": 1.9948235874704035e-05, "loss": 1.4732, "step": 10180 }, { "epoch": 0.04212232007589458, "grad_norm": 3.9231999868924206, "learning_rate": 1.9948102491638853e-05, "loss": 1.4558, "step": 10190 }, { "epoch": 0.04216365699451666, "grad_norm": 4.846870150341976, "learning_rate": 1.9947968937394583e-05, "loss": 1.4455, "step": 10200 }, { "epoch": 0.042204993913138734, "grad_norm": 3.426175390236964, "learning_rate": 1.9947835211973517e-05, "loss": 1.3997, "step": 10210 }, { "epoch": 0.042246330831760805, "grad_norm": 3.7909997652306258, "learning_rate": 1.9947701315377954e-05, "loss": 1.4361, "step": 10220 }, { "epoch": 0.04228766775038288, "grad_norm": 3.535939765317278, "learning_rate": 1.9947567247610206e-05, "loss": 1.4449, "step": 10230 }, { "epoch": 0.04232900466900496, "grad_norm": 3.3731810089302523, "learning_rate": 1.9947433008672572e-05, "loss": 1.4193, "step": 10240 }, { "epoch": 0.042370341587627036, "grad_norm": 3.9292291070435077, "learning_rate": 1.9947298598567364e-05, "loss": 1.4657, "step": 10250 }, { "epoch": 0.042411678506249106, "grad_norm": 3.369066359531392, "learning_rate": 1.99471640172969e-05, "loss": 1.4509, "step": 10260 }, { "epoch": 0.042453015424871184, "grad_norm": 3.6668982318612495, "learning_rate": 1.994702926486349e-05, "loss": 1.3931, "step": 10270 }, { "epoch": 0.04249435234349326, "grad_norm": 3.2034209344506097, "learning_rate": 1.9946894341269453e-05, "loss": 1.4217, "step": 10280 }, { "epoch": 0.04253568926211533, "grad_norm": 4.400853617662863, "learning_rate": 1.9946759246517113e-05, "loss": 1.4544, "step": 10290 }, { "epoch": 0.04257702618073741, "grad_norm": 3.1712083272819473, "learning_rate": 1.9946623980608792e-05, "loss": 1.4813, "step": 10300 }, { "epoch": 0.042618363099359485, "grad_norm": 3.5677581184867395, "learning_rate": 1.994648854354682e-05, "loss": 1.4321, "step": 10310 }, { "epoch": 0.04265970001798156, "grad_norm": 3.38462741867337, "learning_rate": 1.9946352935333528e-05, "loss": 1.3907, "step": 10320 }, { "epoch": 0.04270103693660363, "grad_norm": 3.6690520985143054, "learning_rate": 1.994621715597125e-05, "loss": 1.453, "step": 10330 }, { "epoch": 0.04274237385522571, "grad_norm": 3.628541207308318, "learning_rate": 1.9946081205462315e-05, "loss": 1.4224, "step": 10340 }, { "epoch": 0.04278371077384779, "grad_norm": 3.573675637942579, "learning_rate": 1.994594508380907e-05, "loss": 1.4409, "step": 10350 }, { "epoch": 0.04282504769246986, "grad_norm": 3.9512735810988584, "learning_rate": 1.9945808791013857e-05, "loss": 1.4116, "step": 10360 }, { "epoch": 0.042866384611091934, "grad_norm": 3.2189804332946936, "learning_rate": 1.994567232707902e-05, "loss": 1.4239, "step": 10370 }, { "epoch": 0.04290772152971401, "grad_norm": 3.277452726822312, "learning_rate": 1.9945535692006903e-05, "loss": 1.419, "step": 10380 }, { "epoch": 0.04294905844833609, "grad_norm": 3.2409307004738594, "learning_rate": 1.994539888579986e-05, "loss": 1.412, "step": 10390 }, { "epoch": 0.04299039536695816, "grad_norm": 5.955091940094864, "learning_rate": 1.9945261908460248e-05, "loss": 1.4001, "step": 10400 }, { "epoch": 0.043031732285580236, "grad_norm": 3.626590483447886, "learning_rate": 1.9945124759990424e-05, "loss": 1.4598, "step": 10410 }, { "epoch": 0.04307306920420231, "grad_norm": 3.4065599382832197, "learning_rate": 1.9944987440392742e-05, "loss": 1.3991, "step": 10420 }, { "epoch": 0.043114406122824384, "grad_norm": 3.3228235524159824, "learning_rate": 1.994484994966957e-05, "loss": 1.4069, "step": 10430 }, { "epoch": 0.04315574304144646, "grad_norm": 4.069300982498486, "learning_rate": 1.9944712287823275e-05, "loss": 1.4376, "step": 10440 }, { "epoch": 0.04319707996006854, "grad_norm": 3.668990751455877, "learning_rate": 1.9944574454856216e-05, "loss": 1.4185, "step": 10450 }, { "epoch": 0.043238416878690615, "grad_norm": 3.189803317003545, "learning_rate": 1.9944436450770775e-05, "loss": 1.3998, "step": 10460 }, { "epoch": 0.043279753797312685, "grad_norm": 3.7817594340150924, "learning_rate": 1.9944298275569328e-05, "loss": 1.4494, "step": 10470 }, { "epoch": 0.04332109071593476, "grad_norm": 4.780904235096889, "learning_rate": 1.9944159929254245e-05, "loss": 1.4616, "step": 10480 }, { "epoch": 0.04336242763455684, "grad_norm": 4.010329780152807, "learning_rate": 1.9944021411827905e-05, "loss": 1.4532, "step": 10490 }, { "epoch": 0.04340376455317891, "grad_norm": 4.209509632753131, "learning_rate": 1.9943882723292704e-05, "loss": 1.4622, "step": 10500 }, { "epoch": 0.04344510147180099, "grad_norm": 3.228687583673167, "learning_rate": 1.9943743863651017e-05, "loss": 1.4053, "step": 10510 }, { "epoch": 0.043486438390423064, "grad_norm": 3.288729771085999, "learning_rate": 1.994360483290523e-05, "loss": 1.4192, "step": 10520 }, { "epoch": 0.043527775309045134, "grad_norm": 4.41078023777337, "learning_rate": 1.994346563105775e-05, "loss": 1.4098, "step": 10530 }, { "epoch": 0.04356911222766721, "grad_norm": 3.1815816594140487, "learning_rate": 1.9943326258110963e-05, "loss": 1.4676, "step": 10540 }, { "epoch": 0.04361044914628929, "grad_norm": 3.554730176042178, "learning_rate": 1.994318671406727e-05, "loss": 1.4262, "step": 10550 }, { "epoch": 0.043651786064911366, "grad_norm": 4.564103408690964, "learning_rate": 1.9943046998929073e-05, "loss": 1.4104, "step": 10560 }, { "epoch": 0.043693122983533436, "grad_norm": 3.5454961573863994, "learning_rate": 1.994290711269877e-05, "loss": 1.4235, "step": 10570 }, { "epoch": 0.04373445990215551, "grad_norm": 3.6248317766975857, "learning_rate": 1.9942767055378775e-05, "loss": 1.3733, "step": 10580 }, { "epoch": 0.04377579682077759, "grad_norm": 3.2489128741687123, "learning_rate": 1.9942626826971493e-05, "loss": 1.4456, "step": 10590 }, { "epoch": 0.04381713373939966, "grad_norm": 3.5799361135868057, "learning_rate": 1.994248642747934e-05, "loss": 1.4071, "step": 10600 }, { "epoch": 0.04385847065802174, "grad_norm": 3.4391607635624033, "learning_rate": 1.9942345856904727e-05, "loss": 1.388, "step": 10610 }, { "epoch": 0.043899807576643815, "grad_norm": 4.900926633402934, "learning_rate": 1.994220511525008e-05, "loss": 1.4214, "step": 10620 }, { "epoch": 0.04394114449526589, "grad_norm": 3.1998682814537807, "learning_rate": 1.994206420251782e-05, "loss": 1.4392, "step": 10630 }, { "epoch": 0.04398248141388796, "grad_norm": 3.512730762072939, "learning_rate": 1.9941923118710366e-05, "loss": 1.3833, "step": 10640 }, { "epoch": 0.04402381833251004, "grad_norm": 3.5959575986075354, "learning_rate": 1.9941781863830153e-05, "loss": 1.4666, "step": 10650 }, { "epoch": 0.04406515525113212, "grad_norm": 4.116993239444605, "learning_rate": 1.9941640437879603e-05, "loss": 1.417, "step": 10660 }, { "epoch": 0.04410649216975419, "grad_norm": 4.587080576933717, "learning_rate": 1.9941498840861153e-05, "loss": 1.3558, "step": 10670 }, { "epoch": 0.044147829088376264, "grad_norm": 3.923348655449712, "learning_rate": 1.9941357072777245e-05, "loss": 1.403, "step": 10680 }, { "epoch": 0.04418916600699834, "grad_norm": 3.3131343328753884, "learning_rate": 1.9941215133630312e-05, "loss": 1.414, "step": 10690 }, { "epoch": 0.04423050292562042, "grad_norm": 3.815180569497117, "learning_rate": 1.9941073023422796e-05, "loss": 1.4567, "step": 10700 }, { "epoch": 0.04427183984424249, "grad_norm": 3.0191885771803264, "learning_rate": 1.994093074215715e-05, "loss": 1.4257, "step": 10710 }, { "epoch": 0.044313176762864566, "grad_norm": 3.4376292652965494, "learning_rate": 1.994078828983581e-05, "loss": 1.4118, "step": 10720 }, { "epoch": 0.04435451368148664, "grad_norm": 3.5106899932837643, "learning_rate": 1.994064566646124e-05, "loss": 1.4159, "step": 10730 }, { "epoch": 0.04439585060010871, "grad_norm": 3.6846637686102413, "learning_rate": 1.9940502872035888e-05, "loss": 1.3948, "step": 10740 }, { "epoch": 0.04443718751873079, "grad_norm": 3.657265133747329, "learning_rate": 1.9940359906562207e-05, "loss": 1.4087, "step": 10750 }, { "epoch": 0.04447852443735287, "grad_norm": 4.430332521129557, "learning_rate": 1.9940216770042666e-05, "loss": 1.3989, "step": 10760 }, { "epoch": 0.044519861355974945, "grad_norm": 4.43254812995105, "learning_rate": 1.994007346247972e-05, "loss": 1.3781, "step": 10770 }, { "epoch": 0.044561198274597015, "grad_norm": 3.547905857131194, "learning_rate": 1.9939929983875837e-05, "loss": 1.443, "step": 10780 }, { "epoch": 0.04460253519321909, "grad_norm": 4.225199610421922, "learning_rate": 1.9939786334233492e-05, "loss": 1.3992, "step": 10790 }, { "epoch": 0.04464387211184117, "grad_norm": 3.2850031799014494, "learning_rate": 1.993964251355515e-05, "loss": 1.39, "step": 10800 }, { "epoch": 0.04468520903046324, "grad_norm": 3.576860893151518, "learning_rate": 1.993949852184329e-05, "loss": 1.4019, "step": 10810 }, { "epoch": 0.04472654594908532, "grad_norm": 4.14729049725031, "learning_rate": 1.9939354359100385e-05, "loss": 1.407, "step": 10820 }, { "epoch": 0.044767882867707394, "grad_norm": 3.6785935387585447, "learning_rate": 1.9939210025328915e-05, "loss": 1.4188, "step": 10830 }, { "epoch": 0.044809219786329464, "grad_norm": 3.475380301816819, "learning_rate": 1.993906552053137e-05, "loss": 1.4146, "step": 10840 }, { "epoch": 0.04485055670495154, "grad_norm": 3.3679721828323217, "learning_rate": 1.9938920844710235e-05, "loss": 1.4208, "step": 10850 }, { "epoch": 0.04489189362357362, "grad_norm": 3.679471702118622, "learning_rate": 1.9938775997867995e-05, "loss": 1.4209, "step": 10860 }, { "epoch": 0.044933230542195696, "grad_norm": 3.8980049289176377, "learning_rate": 1.9938630980007147e-05, "loss": 1.4121, "step": 10870 }, { "epoch": 0.044974567460817766, "grad_norm": 3.7079901840906713, "learning_rate": 1.9938485791130183e-05, "loss": 1.3969, "step": 10880 }, { "epoch": 0.04501590437943984, "grad_norm": 3.7675855531387996, "learning_rate": 1.9938340431239603e-05, "loss": 1.4012, "step": 10890 }, { "epoch": 0.04505724129806192, "grad_norm": 3.3894112723434127, "learning_rate": 1.9938194900337908e-05, "loss": 1.4184, "step": 10900 }, { "epoch": 0.04509857821668399, "grad_norm": 4.1568950335530825, "learning_rate": 1.9938049198427604e-05, "loss": 1.452, "step": 10910 }, { "epoch": 0.04513991513530607, "grad_norm": 3.630087506411177, "learning_rate": 1.9937903325511193e-05, "loss": 1.4657, "step": 10920 }, { "epoch": 0.045181252053928145, "grad_norm": 3.510575809020148, "learning_rate": 1.9937757281591187e-05, "loss": 1.4341, "step": 10930 }, { "epoch": 0.04522258897255022, "grad_norm": 3.309825385197255, "learning_rate": 1.9937611066670106e-05, "loss": 1.3789, "step": 10940 }, { "epoch": 0.04526392589117229, "grad_norm": 3.239522136091904, "learning_rate": 1.9937464680750454e-05, "loss": 1.4103, "step": 10950 }, { "epoch": 0.04530526280979437, "grad_norm": 4.673675936224972, "learning_rate": 1.9937318123834762e-05, "loss": 1.3989, "step": 10960 }, { "epoch": 0.04534659972841645, "grad_norm": 4.627358104306948, "learning_rate": 1.9937171395925544e-05, "loss": 1.4203, "step": 10970 }, { "epoch": 0.04538793664703852, "grad_norm": 3.311365466265083, "learning_rate": 1.9937024497025325e-05, "loss": 1.389, "step": 10980 }, { "epoch": 0.045429273565660594, "grad_norm": 4.134318195617502, "learning_rate": 1.9936877427136637e-05, "loss": 1.4224, "step": 10990 }, { "epoch": 0.04547061048428267, "grad_norm": 3.006959002241816, "learning_rate": 1.9936730186262007e-05, "loss": 1.3988, "step": 11000 }, { "epoch": 0.04551194740290475, "grad_norm": 4.01529741437254, "learning_rate": 1.993658277440397e-05, "loss": 1.4396, "step": 11010 }, { "epoch": 0.04555328432152682, "grad_norm": 3.2748525540941507, "learning_rate": 1.993643519156506e-05, "loss": 1.3921, "step": 11020 }, { "epoch": 0.045594621240148896, "grad_norm": 4.018973443549097, "learning_rate": 1.9936287437747822e-05, "loss": 1.3617, "step": 11030 }, { "epoch": 0.04563595815877097, "grad_norm": 3.462150874636636, "learning_rate": 1.993613951295479e-05, "loss": 1.4075, "step": 11040 }, { "epoch": 0.04567729507739304, "grad_norm": 3.6284928503493528, "learning_rate": 1.9935991417188523e-05, "loss": 1.3774, "step": 11050 }, { "epoch": 0.04571863199601512, "grad_norm": 4.331109900085688, "learning_rate": 1.9935843150451558e-05, "loss": 1.4156, "step": 11060 }, { "epoch": 0.0457599689146372, "grad_norm": 3.749722584209809, "learning_rate": 1.9935694712746448e-05, "loss": 1.4314, "step": 11070 }, { "epoch": 0.045801305833259275, "grad_norm": 3.1035349095523266, "learning_rate": 1.9935546104075746e-05, "loss": 1.4167, "step": 11080 }, { "epoch": 0.045842642751881345, "grad_norm": 4.134657657963317, "learning_rate": 1.9935397324442015e-05, "loss": 1.4377, "step": 11090 }, { "epoch": 0.04588397967050342, "grad_norm": 3.378268647534537, "learning_rate": 1.993524837384781e-05, "loss": 1.4201, "step": 11100 }, { "epoch": 0.0459253165891255, "grad_norm": 3.3061200097201615, "learning_rate": 1.9935099252295694e-05, "loss": 1.391, "step": 11110 }, { "epoch": 0.04596665350774757, "grad_norm": 3.6514603716731133, "learning_rate": 1.9934949959788237e-05, "loss": 1.4423, "step": 11120 }, { "epoch": 0.046007990426369647, "grad_norm": 3.2198262717397896, "learning_rate": 1.9934800496328006e-05, "loss": 1.4049, "step": 11130 }, { "epoch": 0.046049327344991724, "grad_norm": 3.1555815125987197, "learning_rate": 1.993465086191757e-05, "loss": 1.4418, "step": 11140 }, { "epoch": 0.0460906642636138, "grad_norm": 3.6391685610732476, "learning_rate": 1.993450105655951e-05, "loss": 1.3824, "step": 11150 }, { "epoch": 0.04613200118223587, "grad_norm": 3.426206028662225, "learning_rate": 1.9934351080256395e-05, "loss": 1.3837, "step": 11160 }, { "epoch": 0.04617333810085795, "grad_norm": 4.616699518929945, "learning_rate": 1.9934200933010816e-05, "loss": 1.3886, "step": 11170 }, { "epoch": 0.046214675019480025, "grad_norm": 3.919463255914606, "learning_rate": 1.993405061482535e-05, "loss": 1.418, "step": 11180 }, { "epoch": 0.046256011938102096, "grad_norm": 4.372063175345489, "learning_rate": 1.9933900125702582e-05, "loss": 1.3976, "step": 11190 }, { "epoch": 0.04629734885672417, "grad_norm": 4.040676043168612, "learning_rate": 1.9933749465645103e-05, "loss": 1.4122, "step": 11200 }, { "epoch": 0.04633868577534625, "grad_norm": 3.3730911260973815, "learning_rate": 1.9933598634655512e-05, "loss": 1.3707, "step": 11210 }, { "epoch": 0.04638002269396832, "grad_norm": 3.6851619767350066, "learning_rate": 1.9933447632736393e-05, "loss": 1.398, "step": 11220 }, { "epoch": 0.0464213596125904, "grad_norm": 4.001189833407079, "learning_rate": 1.9933296459890355e-05, "loss": 1.4071, "step": 11230 }, { "epoch": 0.046462696531212475, "grad_norm": 3.3898995076664757, "learning_rate": 1.993314511611999e-05, "loss": 1.408, "step": 11240 }, { "epoch": 0.04650403344983455, "grad_norm": 3.6996997277102146, "learning_rate": 1.9932993601427912e-05, "loss": 1.3975, "step": 11250 }, { "epoch": 0.04654537036845662, "grad_norm": 3.313365690401916, "learning_rate": 1.993284191581672e-05, "loss": 1.408, "step": 11260 }, { "epoch": 0.0465867072870787, "grad_norm": 4.633876867393197, "learning_rate": 1.993269005928903e-05, "loss": 1.396, "step": 11270 }, { "epoch": 0.046628044205700776, "grad_norm": 3.5037620526852304, "learning_rate": 1.993253803184745e-05, "loss": 1.4024, "step": 11280 }, { "epoch": 0.046669381124322847, "grad_norm": 3.081503642322238, "learning_rate": 1.9932385833494597e-05, "loss": 1.4109, "step": 11290 }, { "epoch": 0.046710718042944924, "grad_norm": 3.547058360569091, "learning_rate": 1.9932233464233092e-05, "loss": 1.3796, "step": 11300 }, { "epoch": 0.046752054961567, "grad_norm": 3.814887745242394, "learning_rate": 1.9932080924065556e-05, "loss": 1.4401, "step": 11310 }, { "epoch": 0.04679339188018908, "grad_norm": 3.5004316252867085, "learning_rate": 1.993192821299461e-05, "loss": 1.452, "step": 11320 }, { "epoch": 0.04683472879881115, "grad_norm": 3.449228750426351, "learning_rate": 1.993177533102289e-05, "loss": 1.3734, "step": 11330 }, { "epoch": 0.046876065717433225, "grad_norm": 3.2964308484381784, "learning_rate": 1.9931622278153024e-05, "loss": 1.4018, "step": 11340 }, { "epoch": 0.0469174026360553, "grad_norm": 2.8180078039607697, "learning_rate": 1.993146905438764e-05, "loss": 1.4081, "step": 11350 }, { "epoch": 0.04695873955467737, "grad_norm": 3.2236402061866545, "learning_rate": 1.9931315659729376e-05, "loss": 1.4534, "step": 11360 }, { "epoch": 0.04700007647329945, "grad_norm": 3.5082002531342473, "learning_rate": 1.9931162094180874e-05, "loss": 1.4173, "step": 11370 }, { "epoch": 0.04704141339192153, "grad_norm": 3.284436648082263, "learning_rate": 1.993100835774478e-05, "loss": 1.4092, "step": 11380 }, { "epoch": 0.047082750310543604, "grad_norm": 3.4816460403688314, "learning_rate": 1.9930854450423736e-05, "loss": 1.3913, "step": 11390 }, { "epoch": 0.047124087229165675, "grad_norm": 4.14476906280773, "learning_rate": 1.9930700372220387e-05, "loss": 1.3703, "step": 11400 }, { "epoch": 0.04716542414778775, "grad_norm": 4.375945137034217, "learning_rate": 1.993054612313739e-05, "loss": 1.4362, "step": 11410 }, { "epoch": 0.04720676106640983, "grad_norm": 4.277168672236525, "learning_rate": 1.993039170317739e-05, "loss": 1.479, "step": 11420 }, { "epoch": 0.0472480979850319, "grad_norm": 3.927053905964369, "learning_rate": 1.9930237112343056e-05, "loss": 1.3872, "step": 11430 }, { "epoch": 0.047289434903653976, "grad_norm": 3.607150421833661, "learning_rate": 1.9930082350637042e-05, "loss": 1.3891, "step": 11440 }, { "epoch": 0.04733077182227605, "grad_norm": 3.338415521714108, "learning_rate": 1.992992741806201e-05, "loss": 1.4211, "step": 11450 }, { "epoch": 0.04737210874089813, "grad_norm": 3.629104841971202, "learning_rate": 1.9929772314620627e-05, "loss": 1.3425, "step": 11460 }, { "epoch": 0.0474134456595202, "grad_norm": 3.099599716044088, "learning_rate": 1.9929617040315563e-05, "loss": 1.382, "step": 11470 }, { "epoch": 0.04745478257814228, "grad_norm": 3.7158273713517893, "learning_rate": 1.992946159514949e-05, "loss": 1.4361, "step": 11480 }, { "epoch": 0.047496119496764355, "grad_norm": 4.389528378421289, "learning_rate": 1.992930597912508e-05, "loss": 1.4435, "step": 11490 }, { "epoch": 0.047537456415386425, "grad_norm": 3.669631509338215, "learning_rate": 1.9929150192245016e-05, "loss": 1.4321, "step": 11500 }, { "epoch": 0.0475787933340085, "grad_norm": 3.4995914344254624, "learning_rate": 1.992899423451197e-05, "loss": 1.446, "step": 11510 }, { "epoch": 0.04762013025263058, "grad_norm": 3.3809354325443017, "learning_rate": 1.9928838105928635e-05, "loss": 1.3941, "step": 11520 }, { "epoch": 0.04766146717125265, "grad_norm": 3.1788524195701684, "learning_rate": 1.9928681806497693e-05, "loss": 1.4027, "step": 11530 }, { "epoch": 0.04770280408987473, "grad_norm": 3.368111520244943, "learning_rate": 1.9928525336221837e-05, "loss": 1.4038, "step": 11540 }, { "epoch": 0.047744141008496804, "grad_norm": 3.6045389016529636, "learning_rate": 1.992836869510375e-05, "loss": 1.4523, "step": 11550 }, { "epoch": 0.04778547792711888, "grad_norm": 3.859885628963866, "learning_rate": 1.9928211883146136e-05, "loss": 1.4307, "step": 11560 }, { "epoch": 0.04782681484574095, "grad_norm": 3.2925025525674756, "learning_rate": 1.9928054900351693e-05, "loss": 1.4473, "step": 11570 }, { "epoch": 0.04786815176436303, "grad_norm": 4.075772971569745, "learning_rate": 1.992789774672312e-05, "loss": 1.4384, "step": 11580 }, { "epoch": 0.047909488682985106, "grad_norm": 3.391288577095074, "learning_rate": 1.9927740422263117e-05, "loss": 1.4038, "step": 11590 }, { "epoch": 0.047950825601607176, "grad_norm": 3.4783586083297626, "learning_rate": 1.9927582926974402e-05, "loss": 1.3911, "step": 11600 }, { "epoch": 0.04799216252022925, "grad_norm": 3.468149363696469, "learning_rate": 1.9927425260859673e-05, "loss": 1.4123, "step": 11610 }, { "epoch": 0.04803349943885133, "grad_norm": 3.464512963986536, "learning_rate": 1.992726742392165e-05, "loss": 1.3973, "step": 11620 }, { "epoch": 0.04807483635747341, "grad_norm": 4.061746689943389, "learning_rate": 1.992710941616305e-05, "loss": 1.3841, "step": 11630 }, { "epoch": 0.04811617327609548, "grad_norm": 4.139608234706919, "learning_rate": 1.992695123758659e-05, "loss": 1.3787, "step": 11640 }, { "epoch": 0.048157510194717555, "grad_norm": 3.8447070353873025, "learning_rate": 1.992679288819499e-05, "loss": 1.3815, "step": 11650 }, { "epoch": 0.04819884711333963, "grad_norm": 3.6946628562082693, "learning_rate": 1.9926634367990973e-05, "loss": 1.3788, "step": 11660 }, { "epoch": 0.0482401840319617, "grad_norm": 3.3618354267056465, "learning_rate": 1.992647567697727e-05, "loss": 1.4063, "step": 11670 }, { "epoch": 0.04828152095058378, "grad_norm": 3.495643041214259, "learning_rate": 1.9926316815156617e-05, "loss": 1.4348, "step": 11680 }, { "epoch": 0.04832285786920586, "grad_norm": 3.552951920155812, "learning_rate": 1.9926157782531735e-05, "loss": 1.3604, "step": 11690 }, { "epoch": 0.048364194787827934, "grad_norm": 3.2600229354667025, "learning_rate": 1.9925998579105374e-05, "loss": 1.3395, "step": 11700 }, { "epoch": 0.048405531706450004, "grad_norm": 3.4189972893062635, "learning_rate": 1.9925839204880263e-05, "loss": 1.4291, "step": 11710 }, { "epoch": 0.04844686862507208, "grad_norm": 3.2696931259943494, "learning_rate": 1.9925679659859148e-05, "loss": 1.3748, "step": 11720 }, { "epoch": 0.04848820554369416, "grad_norm": 3.400522141333647, "learning_rate": 1.9925519944044772e-05, "loss": 1.4141, "step": 11730 }, { "epoch": 0.04852954246231623, "grad_norm": 3.347216625916271, "learning_rate": 1.9925360057439887e-05, "loss": 1.4062, "step": 11740 }, { "epoch": 0.048570879380938306, "grad_norm": 3.55815106093295, "learning_rate": 1.9925200000047248e-05, "loss": 1.4056, "step": 11750 }, { "epoch": 0.04861221629956038, "grad_norm": 4.75921473200291, "learning_rate": 1.99250397718696e-05, "loss": 1.418, "step": 11760 }, { "epoch": 0.04865355321818246, "grad_norm": 3.2697324056422588, "learning_rate": 1.9924879372909703e-05, "loss": 1.4015, "step": 11770 }, { "epoch": 0.04869489013680453, "grad_norm": 3.590517133814017, "learning_rate": 1.9924718803170324e-05, "loss": 1.3738, "step": 11780 }, { "epoch": 0.04873622705542661, "grad_norm": 3.579850493829701, "learning_rate": 1.9924558062654215e-05, "loss": 1.334, "step": 11790 }, { "epoch": 0.048777563974048685, "grad_norm": 3.552170187760315, "learning_rate": 1.9924397151364148e-05, "loss": 1.4169, "step": 11800 }, { "epoch": 0.048818900892670755, "grad_norm": 3.0842304871945037, "learning_rate": 1.992423606930289e-05, "loss": 1.3906, "step": 11810 }, { "epoch": 0.04886023781129283, "grad_norm": 3.379537547677814, "learning_rate": 1.9924074816473215e-05, "loss": 1.4351, "step": 11820 }, { "epoch": 0.04890157472991491, "grad_norm": 3.53218922925584, "learning_rate": 1.9923913392877896e-05, "loss": 1.4032, "step": 11830 }, { "epoch": 0.04894291164853698, "grad_norm": 5.016060492627983, "learning_rate": 1.992375179851971e-05, "loss": 1.3918, "step": 11840 }, { "epoch": 0.04898424856715906, "grad_norm": 3.540388411237351, "learning_rate": 1.9923590033401443e-05, "loss": 1.4196, "step": 11850 }, { "epoch": 0.049025585485781134, "grad_norm": 3.3104883369223765, "learning_rate": 1.9923428097525872e-05, "loss": 1.4141, "step": 11860 }, { "epoch": 0.04906692240440321, "grad_norm": 3.580413781862322, "learning_rate": 1.9923265990895785e-05, "loss": 1.4291, "step": 11870 }, { "epoch": 0.04910825932302528, "grad_norm": 4.057142008160253, "learning_rate": 1.9923103713513972e-05, "loss": 1.4193, "step": 11880 }, { "epoch": 0.04914959624164736, "grad_norm": 3.157870137031843, "learning_rate": 1.9922941265383226e-05, "loss": 1.3949, "step": 11890 }, { "epoch": 0.049190933160269436, "grad_norm": 3.7453792892755255, "learning_rate": 1.992277864650634e-05, "loss": 1.373, "step": 11900 }, { "epoch": 0.049232270078891506, "grad_norm": 4.17858858176331, "learning_rate": 1.992261585688611e-05, "loss": 1.3732, "step": 11910 }, { "epoch": 0.04927360699751358, "grad_norm": 3.3040489276601157, "learning_rate": 1.992245289652535e-05, "loss": 1.373, "step": 11920 }, { "epoch": 0.04931494391613566, "grad_norm": 3.0410499391716117, "learning_rate": 1.992228976542685e-05, "loss": 1.3839, "step": 11930 }, { "epoch": 0.04935628083475774, "grad_norm": 3.541114095436553, "learning_rate": 1.9922126463593422e-05, "loss": 1.4006, "step": 11940 }, { "epoch": 0.04939761775337981, "grad_norm": 3.9811902872742673, "learning_rate": 1.992196299102788e-05, "loss": 1.3546, "step": 11950 }, { "epoch": 0.049438954672001885, "grad_norm": 3.1780875587126705, "learning_rate": 1.9921799347733026e-05, "loss": 1.3693, "step": 11960 }, { "epoch": 0.04948029159062396, "grad_norm": 3.5586898768297415, "learning_rate": 1.9921635533711687e-05, "loss": 1.4215, "step": 11970 }, { "epoch": 0.04952162850924603, "grad_norm": 2.8477978920610565, "learning_rate": 1.9921471548966678e-05, "loss": 1.4256, "step": 11980 }, { "epoch": 0.04956296542786811, "grad_norm": 3.5714848436239275, "learning_rate": 1.9921307393500822e-05, "loss": 1.4358, "step": 11990 }, { "epoch": 0.04960430234649019, "grad_norm": 3.746696171382991, "learning_rate": 1.992114306731694e-05, "loss": 1.4556, "step": 12000 }, { "epoch": 0.049645639265112264, "grad_norm": 3.6562155551861353, "learning_rate": 1.992097857041786e-05, "loss": 1.404, "step": 12010 }, { "epoch": 0.049686976183734334, "grad_norm": 3.2699664906016803, "learning_rate": 1.9920813902806414e-05, "loss": 1.3946, "step": 12020 }, { "epoch": 0.04972831310235641, "grad_norm": 4.034724844068233, "learning_rate": 1.992064906448544e-05, "loss": 1.4321, "step": 12030 }, { "epoch": 0.04976965002097849, "grad_norm": 3.4822925081412497, "learning_rate": 1.9920484055457767e-05, "loss": 1.4105, "step": 12040 }, { "epoch": 0.04981098693960056, "grad_norm": 3.587853195823534, "learning_rate": 1.9920318875726238e-05, "loss": 1.3697, "step": 12050 }, { "epoch": 0.049852323858222636, "grad_norm": 3.205239993732016, "learning_rate": 1.9920153525293694e-05, "loss": 1.3979, "step": 12060 }, { "epoch": 0.04989366077684471, "grad_norm": 3.388133162074283, "learning_rate": 1.991998800416298e-05, "loss": 1.4023, "step": 12070 }, { "epoch": 0.04993499769546679, "grad_norm": 4.2719394165918985, "learning_rate": 1.9919822312336947e-05, "loss": 1.3956, "step": 12080 }, { "epoch": 0.04997633461408886, "grad_norm": 3.234674011512212, "learning_rate": 1.9919656449818444e-05, "loss": 1.4101, "step": 12090 }, { "epoch": 0.05001767153271094, "grad_norm": 3.4860946302341107, "learning_rate": 1.9919490416610327e-05, "loss": 1.3802, "step": 12100 }, { "epoch": 0.050059008451333015, "grad_norm": 3.3542639404969483, "learning_rate": 1.9919324212715448e-05, "loss": 1.3865, "step": 12110 }, { "epoch": 0.050100345369955085, "grad_norm": 3.7877647269232293, "learning_rate": 1.9919157838136668e-05, "loss": 1.4198, "step": 12120 }, { "epoch": 0.05014168228857716, "grad_norm": 3.6768245303830214, "learning_rate": 1.9918991292876857e-05, "loss": 1.392, "step": 12130 }, { "epoch": 0.05018301920719924, "grad_norm": 3.0504339083555054, "learning_rate": 1.9918824576938872e-05, "loss": 1.3943, "step": 12140 }, { "epoch": 0.05022435612582131, "grad_norm": 4.003605772511745, "learning_rate": 1.9918657690325586e-05, "loss": 1.3627, "step": 12150 }, { "epoch": 0.05026569304444339, "grad_norm": 4.04766902184383, "learning_rate": 1.9918490633039873e-05, "loss": 1.3867, "step": 12160 }, { "epoch": 0.050307029963065464, "grad_norm": 4.054619230075598, "learning_rate": 1.99183234050846e-05, "loss": 1.3558, "step": 12170 }, { "epoch": 0.05034836688168754, "grad_norm": 3.3575853249481873, "learning_rate": 1.9918156006462653e-05, "loss": 1.3863, "step": 12180 }, { "epoch": 0.05038970380030961, "grad_norm": 3.5267112119515582, "learning_rate": 1.9917988437176908e-05, "loss": 1.3705, "step": 12190 }, { "epoch": 0.05043104071893169, "grad_norm": 3.528333383483439, "learning_rate": 1.9917820697230247e-05, "loss": 1.4441, "step": 12200 }, { "epoch": 0.050472377637553766, "grad_norm": 3.233459986557015, "learning_rate": 1.991765278662556e-05, "loss": 1.3532, "step": 12210 }, { "epoch": 0.050513714556175836, "grad_norm": 3.4949233408933034, "learning_rate": 1.991748470536573e-05, "loss": 1.3658, "step": 12220 }, { "epoch": 0.05055505147479791, "grad_norm": 3.194405798297394, "learning_rate": 1.9917316453453657e-05, "loss": 1.397, "step": 12230 }, { "epoch": 0.05059638839341999, "grad_norm": 3.6606193297707046, "learning_rate": 1.9917148030892238e-05, "loss": 1.4072, "step": 12240 }, { "epoch": 0.05063772531204207, "grad_norm": 4.069430004373759, "learning_rate": 1.9916979437684362e-05, "loss": 1.4136, "step": 12250 }, { "epoch": 0.05067906223066414, "grad_norm": 3.1603885636318956, "learning_rate": 1.991681067383293e-05, "loss": 1.4097, "step": 12260 }, { "epoch": 0.050720399149286215, "grad_norm": 3.177326748991991, "learning_rate": 1.9916641739340857e-05, "loss": 1.4195, "step": 12270 }, { "epoch": 0.05076173606790829, "grad_norm": 3.918139727949541, "learning_rate": 1.991647263421104e-05, "loss": 1.3876, "step": 12280 }, { "epoch": 0.05080307298653036, "grad_norm": 3.9573038369404427, "learning_rate": 1.9916303358446392e-05, "loss": 1.3683, "step": 12290 }, { "epoch": 0.05084440990515244, "grad_norm": 4.275806482199394, "learning_rate": 1.9916133912049825e-05, "loss": 1.4204, "step": 12300 }, { "epoch": 0.050885746823774516, "grad_norm": 4.847335915383492, "learning_rate": 1.9915964295024254e-05, "loss": 1.4034, "step": 12310 }, { "epoch": 0.050927083742396594, "grad_norm": 3.273054866007932, "learning_rate": 1.99157945073726e-05, "loss": 1.3968, "step": 12320 }, { "epoch": 0.050968420661018664, "grad_norm": 3.7167200760785173, "learning_rate": 1.9915624549097784e-05, "loss": 1.3999, "step": 12330 }, { "epoch": 0.05100975757964074, "grad_norm": 3.4590376733691337, "learning_rate": 1.991545442020273e-05, "loss": 1.4122, "step": 12340 }, { "epoch": 0.05105109449826282, "grad_norm": 3.1847142656240255, "learning_rate": 1.9915284120690362e-05, "loss": 1.4239, "step": 12350 }, { "epoch": 0.05109243141688489, "grad_norm": 3.3467985786610353, "learning_rate": 1.991511365056362e-05, "loss": 1.3844, "step": 12360 }, { "epoch": 0.051133768335506966, "grad_norm": 4.094929119314472, "learning_rate": 1.9914943009825425e-05, "loss": 1.3577, "step": 12370 }, { "epoch": 0.05117510525412904, "grad_norm": 3.521925167773648, "learning_rate": 1.9914772198478723e-05, "loss": 1.3954, "step": 12380 }, { "epoch": 0.05121644217275112, "grad_norm": 3.3998170027309067, "learning_rate": 1.9914601216526446e-05, "loss": 1.4102, "step": 12390 }, { "epoch": 0.05125777909137319, "grad_norm": 3.6457154728502035, "learning_rate": 1.9914430063971542e-05, "loss": 1.3666, "step": 12400 }, { "epoch": 0.05129911600999527, "grad_norm": 3.133616222950282, "learning_rate": 1.9914258740816956e-05, "loss": 1.4071, "step": 12410 }, { "epoch": 0.051340452928617344, "grad_norm": 3.5165895133634133, "learning_rate": 1.9914087247065634e-05, "loss": 1.4127, "step": 12420 }, { "epoch": 0.051381789847239415, "grad_norm": 3.1317185826456795, "learning_rate": 1.991391558272052e-05, "loss": 1.3609, "step": 12430 }, { "epoch": 0.05142312676586149, "grad_norm": 3.7493626264431605, "learning_rate": 1.991374374778458e-05, "loss": 1.3832, "step": 12440 }, { "epoch": 0.05146446368448357, "grad_norm": 3.5662863496670196, "learning_rate": 1.991357174226076e-05, "loss": 1.3665, "step": 12450 }, { "epoch": 0.05150580060310564, "grad_norm": 3.1448919796113035, "learning_rate": 1.9913399566152033e-05, "loss": 1.3965, "step": 12460 }, { "epoch": 0.051547137521727716, "grad_norm": 3.3154750106704625, "learning_rate": 1.991322721946135e-05, "loss": 1.3873, "step": 12470 }, { "epoch": 0.051588474440349794, "grad_norm": 3.688390777094577, "learning_rate": 1.991305470219168e-05, "loss": 1.3224, "step": 12480 }, { "epoch": 0.05162981135897187, "grad_norm": 3.0270982188671907, "learning_rate": 1.9912882014345988e-05, "loss": 1.3551, "step": 12490 }, { "epoch": 0.05167114827759394, "grad_norm": 3.7441081256974136, "learning_rate": 1.9912709155927254e-05, "loss": 1.3945, "step": 12500 }, { "epoch": 0.05171248519621602, "grad_norm": 3.697861466581755, "learning_rate": 1.9912536126938446e-05, "loss": 1.3612, "step": 12510 }, { "epoch": 0.051753822114838095, "grad_norm": 3.523721321551833, "learning_rate": 1.9912362927382546e-05, "loss": 1.3747, "step": 12520 }, { "epoch": 0.051795159033460166, "grad_norm": 3.0061067921910727, "learning_rate": 1.9912189557262528e-05, "loss": 1.4086, "step": 12530 }, { "epoch": 0.05183649595208224, "grad_norm": 4.0409382024057425, "learning_rate": 1.991201601658138e-05, "loss": 1.3849, "step": 12540 }, { "epoch": 0.05187783287070432, "grad_norm": 3.7066972530256983, "learning_rate": 1.9911842305342085e-05, "loss": 1.3775, "step": 12550 }, { "epoch": 0.0519191697893264, "grad_norm": 3.8641850919990737, "learning_rate": 1.9911668423547635e-05, "loss": 1.4056, "step": 12560 }, { "epoch": 0.05196050670794847, "grad_norm": 2.8845765281029325, "learning_rate": 1.9911494371201023e-05, "loss": 1.3433, "step": 12570 }, { "epoch": 0.052001843626570544, "grad_norm": 2.9182840148796996, "learning_rate": 1.9911320148305235e-05, "loss": 1.4146, "step": 12580 }, { "epoch": 0.05204318054519262, "grad_norm": 3.471309021112678, "learning_rate": 1.991114575486328e-05, "loss": 1.3769, "step": 12590 }, { "epoch": 0.05208451746381469, "grad_norm": 3.132099407416561, "learning_rate": 1.9910971190878157e-05, "loss": 1.4006, "step": 12600 }, { "epoch": 0.05212585438243677, "grad_norm": 3.5078473659046554, "learning_rate": 1.9910796456352863e-05, "loss": 1.3608, "step": 12610 }, { "epoch": 0.052167191301058846, "grad_norm": 3.420611210950219, "learning_rate": 1.991062155129041e-05, "loss": 1.3477, "step": 12620 }, { "epoch": 0.05220852821968092, "grad_norm": 3.3602682043236425, "learning_rate": 1.991044647569381e-05, "loss": 1.3821, "step": 12630 }, { "epoch": 0.052249865138302994, "grad_norm": 3.3934199487204326, "learning_rate": 1.9910271229566067e-05, "loss": 1.3672, "step": 12640 }, { "epoch": 0.05229120205692507, "grad_norm": 3.3930743766477636, "learning_rate": 1.9910095812910205e-05, "loss": 1.3805, "step": 12650 }, { "epoch": 0.05233253897554715, "grad_norm": 3.803471056919821, "learning_rate": 1.9909920225729237e-05, "loss": 1.357, "step": 12660 }, { "epoch": 0.05237387589416922, "grad_norm": 2.9908606514422766, "learning_rate": 1.990974446802619e-05, "loss": 1.4148, "step": 12670 }, { "epoch": 0.052415212812791295, "grad_norm": 3.472506553773665, "learning_rate": 1.990956853980408e-05, "loss": 1.4119, "step": 12680 }, { "epoch": 0.05245654973141337, "grad_norm": 3.652498665098648, "learning_rate": 1.9909392441065944e-05, "loss": 1.3896, "step": 12690 }, { "epoch": 0.05249788665003545, "grad_norm": 3.556198285804122, "learning_rate": 1.9909216171814802e-05, "loss": 1.3556, "step": 12700 }, { "epoch": 0.05253922356865752, "grad_norm": 3.3341578506950187, "learning_rate": 1.9909039732053695e-05, "loss": 1.3875, "step": 12710 }, { "epoch": 0.0525805604872796, "grad_norm": 3.623737574209396, "learning_rate": 1.9908863121785656e-05, "loss": 1.3699, "step": 12720 }, { "epoch": 0.052621897405901674, "grad_norm": 3.068120426816953, "learning_rate": 1.9908686341013723e-05, "loss": 1.3504, "step": 12730 }, { "epoch": 0.052663234324523744, "grad_norm": 3.5988757581859643, "learning_rate": 1.990850938974094e-05, "loss": 1.3506, "step": 12740 }, { "epoch": 0.05270457124314582, "grad_norm": 3.4850198824984724, "learning_rate": 1.990833226797035e-05, "loss": 1.3949, "step": 12750 }, { "epoch": 0.0527459081617679, "grad_norm": 3.3573178296822834, "learning_rate": 1.9908154975705e-05, "loss": 1.3766, "step": 12760 }, { "epoch": 0.052787245080389976, "grad_norm": 3.5288003708700186, "learning_rate": 1.990797751294795e-05, "loss": 1.3915, "step": 12770 }, { "epoch": 0.052828581999012046, "grad_norm": 3.0065181585529794, "learning_rate": 1.990779987970224e-05, "loss": 1.3943, "step": 12780 }, { "epoch": 0.05286991891763412, "grad_norm": 3.8902210517557787, "learning_rate": 1.9907622075970933e-05, "loss": 1.4339, "step": 12790 }, { "epoch": 0.0529112558362562, "grad_norm": 3.0754498963080317, "learning_rate": 1.990744410175709e-05, "loss": 1.3633, "step": 12800 }, { "epoch": 0.05295259275487827, "grad_norm": 3.373819633563616, "learning_rate": 1.990726595706377e-05, "loss": 1.3729, "step": 12810 }, { "epoch": 0.05299392967350035, "grad_norm": 3.2413750593238277, "learning_rate": 1.990708764189404e-05, "loss": 1.3611, "step": 12820 }, { "epoch": 0.053035266592122425, "grad_norm": 3.3175842583387287, "learning_rate": 1.990690915625097e-05, "loss": 1.4386, "step": 12830 }, { "epoch": 0.053076603510744495, "grad_norm": 4.421464949416987, "learning_rate": 1.9906730500137626e-05, "loss": 1.3825, "step": 12840 }, { "epoch": 0.05311794042936657, "grad_norm": 3.7375473757828312, "learning_rate": 1.9906551673557092e-05, "loss": 1.3584, "step": 12850 }, { "epoch": 0.05315927734798865, "grad_norm": 4.23504822699641, "learning_rate": 1.9906372676512435e-05, "loss": 1.3655, "step": 12860 }, { "epoch": 0.05320061426661073, "grad_norm": 3.674354440233681, "learning_rate": 1.9906193509006737e-05, "loss": 1.3652, "step": 12870 }, { "epoch": 0.0532419511852328, "grad_norm": 3.270440974926962, "learning_rate": 1.9906014171043085e-05, "loss": 1.408, "step": 12880 }, { "epoch": 0.053283288103854874, "grad_norm": 3.4328461661592007, "learning_rate": 1.9905834662624562e-05, "loss": 1.3881, "step": 12890 }, { "epoch": 0.05332462502247695, "grad_norm": 3.296815547244285, "learning_rate": 1.9905654983754255e-05, "loss": 1.3099, "step": 12900 }, { "epoch": 0.05336596194109902, "grad_norm": 3.182778307558256, "learning_rate": 1.9905475134435265e-05, "loss": 1.3887, "step": 12910 }, { "epoch": 0.0534072988597211, "grad_norm": 4.066051141098089, "learning_rate": 1.9905295114670674e-05, "loss": 1.3615, "step": 12920 }, { "epoch": 0.053448635778343176, "grad_norm": 3.62599590443558, "learning_rate": 1.9905114924463592e-05, "loss": 1.3461, "step": 12930 }, { "epoch": 0.05348997269696525, "grad_norm": 4.5391383467222735, "learning_rate": 1.9904934563817106e-05, "loss": 1.3543, "step": 12940 }, { "epoch": 0.05353130961558732, "grad_norm": 4.04368698393749, "learning_rate": 1.990475403273433e-05, "loss": 1.3712, "step": 12950 }, { "epoch": 0.0535726465342094, "grad_norm": 3.9811173894333836, "learning_rate": 1.9904573331218365e-05, "loss": 1.4334, "step": 12960 }, { "epoch": 0.05361398345283148, "grad_norm": 3.499953327542098, "learning_rate": 1.9904392459272326e-05, "loss": 1.3871, "step": 12970 }, { "epoch": 0.05365532037145355, "grad_norm": 3.226290735311431, "learning_rate": 1.9904211416899322e-05, "loss": 1.4122, "step": 12980 }, { "epoch": 0.053696657290075625, "grad_norm": 3.566091958099414, "learning_rate": 1.990403020410247e-05, "loss": 1.4075, "step": 12990 }, { "epoch": 0.0537379942086977, "grad_norm": 3.4558175186897513, "learning_rate": 1.990384882088488e-05, "loss": 1.4553, "step": 13000 }, { "epoch": 0.05377933112731978, "grad_norm": 3.238909449520725, "learning_rate": 1.9903667267249683e-05, "loss": 1.3791, "step": 13010 }, { "epoch": 0.05382066804594185, "grad_norm": 3.517722296765338, "learning_rate": 1.9903485543199995e-05, "loss": 1.3283, "step": 13020 }, { "epoch": 0.05386200496456393, "grad_norm": 3.42397575432932, "learning_rate": 1.9903303648738954e-05, "loss": 1.3335, "step": 13030 }, { "epoch": 0.053903341883186004, "grad_norm": 3.350334059229468, "learning_rate": 1.990312158386968e-05, "loss": 1.3806, "step": 13040 }, { "epoch": 0.053944678801808074, "grad_norm": 3.051548296138219, "learning_rate": 1.9902939348595307e-05, "loss": 1.3885, "step": 13050 }, { "epoch": 0.05398601572043015, "grad_norm": 3.276091159978694, "learning_rate": 1.9902756942918976e-05, "loss": 1.359, "step": 13060 }, { "epoch": 0.05402735263905223, "grad_norm": 3.5296387125760185, "learning_rate": 1.9902574366843824e-05, "loss": 1.3625, "step": 13070 }, { "epoch": 0.054068689557674306, "grad_norm": 3.567948297220875, "learning_rate": 1.990239162037299e-05, "loss": 1.351, "step": 13080 }, { "epoch": 0.054110026476296376, "grad_norm": 3.1640186718240266, "learning_rate": 1.9902208703509617e-05, "loss": 1.3458, "step": 13090 }, { "epoch": 0.05415136339491845, "grad_norm": 3.9025546167384495, "learning_rate": 1.9902025616256854e-05, "loss": 1.3588, "step": 13100 }, { "epoch": 0.05419270031354053, "grad_norm": 3.501658240766089, "learning_rate": 1.9901842358617854e-05, "loss": 1.3624, "step": 13110 }, { "epoch": 0.0542340372321626, "grad_norm": 3.923570308465845, "learning_rate": 1.9901658930595774e-05, "loss": 1.3294, "step": 13120 }, { "epoch": 0.05427537415078468, "grad_norm": 3.0232913372852406, "learning_rate": 1.990147533219376e-05, "loss": 1.3855, "step": 13130 }, { "epoch": 0.054316711069406755, "grad_norm": 3.670482250458697, "learning_rate": 1.9901291563414977e-05, "loss": 1.3337, "step": 13140 }, { "epoch": 0.054358047988028825, "grad_norm": 3.790854501668152, "learning_rate": 1.990110762426259e-05, "loss": 1.366, "step": 13150 }, { "epoch": 0.0543993849066509, "grad_norm": 3.1103384131591256, "learning_rate": 1.9900923514739758e-05, "loss": 1.3574, "step": 13160 }, { "epoch": 0.05444072182527298, "grad_norm": 3.2207958459794845, "learning_rate": 1.990073923484965e-05, "loss": 1.3675, "step": 13170 }, { "epoch": 0.05448205874389506, "grad_norm": 3.2588291421463023, "learning_rate": 1.990055478459544e-05, "loss": 1.3313, "step": 13180 }, { "epoch": 0.05452339566251713, "grad_norm": 2.9426904447180506, "learning_rate": 1.99003701639803e-05, "loss": 1.3995, "step": 13190 }, { "epoch": 0.054564732581139204, "grad_norm": 3.892827987664763, "learning_rate": 1.990018537300741e-05, "loss": 1.4035, "step": 13200 }, { "epoch": 0.05460606949976128, "grad_norm": 3.765962575470102, "learning_rate": 1.9900000411679946e-05, "loss": 1.3823, "step": 13210 }, { "epoch": 0.05464740641838335, "grad_norm": 3.031044142550962, "learning_rate": 1.9899815280001093e-05, "loss": 1.3907, "step": 13220 }, { "epoch": 0.05468874333700543, "grad_norm": 3.401074997651561, "learning_rate": 1.9899629977974033e-05, "loss": 1.3724, "step": 13230 }, { "epoch": 0.054730080255627506, "grad_norm": 3.4363592487014367, "learning_rate": 1.9899444505601957e-05, "loss": 1.4044, "step": 13240 }, { "epoch": 0.05477141717424958, "grad_norm": 3.4008170147404924, "learning_rate": 1.9899258862888055e-05, "loss": 1.4329, "step": 13250 }, { "epoch": 0.05481275409287165, "grad_norm": 3.3535448510349086, "learning_rate": 1.9899073049835526e-05, "loss": 1.3803, "step": 13260 }, { "epoch": 0.05485409101149373, "grad_norm": 3.6491506303085677, "learning_rate": 1.9898887066447564e-05, "loss": 1.4061, "step": 13270 }, { "epoch": 0.05489542793011581, "grad_norm": 3.2649595342568754, "learning_rate": 1.9898700912727365e-05, "loss": 1.3548, "step": 13280 }, { "epoch": 0.05493676484873788, "grad_norm": 3.260685808424658, "learning_rate": 1.9898514588678138e-05, "loss": 1.3798, "step": 13290 }, { "epoch": 0.054978101767359955, "grad_norm": 3.3068054059856964, "learning_rate": 1.989832809430309e-05, "loss": 1.3873, "step": 13300 }, { "epoch": 0.05501943868598203, "grad_norm": 3.3289477651913844, "learning_rate": 1.9898141429605428e-05, "loss": 1.42, "step": 13310 }, { "epoch": 0.05506077560460411, "grad_norm": 3.899358403289862, "learning_rate": 1.9897954594588366e-05, "loss": 1.3612, "step": 13320 }, { "epoch": 0.05510211252322618, "grad_norm": 3.4534257185508768, "learning_rate": 1.989776758925511e-05, "loss": 1.4139, "step": 13330 }, { "epoch": 0.05514344944184826, "grad_norm": 3.0896933369894555, "learning_rate": 1.9897580413608888e-05, "loss": 1.3455, "step": 13340 }, { "epoch": 0.055184786360470334, "grad_norm": 3.6880673723268895, "learning_rate": 1.9897393067652916e-05, "loss": 1.3553, "step": 13350 }, { "epoch": 0.055226123279092404, "grad_norm": 4.959844713915171, "learning_rate": 1.989720555139042e-05, "loss": 1.3744, "step": 13360 }, { "epoch": 0.05526746019771448, "grad_norm": 3.7512130358105535, "learning_rate": 1.9897017864824623e-05, "loss": 1.3967, "step": 13370 }, { "epoch": 0.05530879711633656, "grad_norm": 3.438280531829743, "learning_rate": 1.989683000795876e-05, "loss": 1.3199, "step": 13380 }, { "epoch": 0.055350134034958635, "grad_norm": 3.191658504407269, "learning_rate": 1.989664198079606e-05, "loss": 1.3843, "step": 13390 }, { "epoch": 0.055391470953580706, "grad_norm": 3.246301259794481, "learning_rate": 1.989645378333976e-05, "loss": 1.3823, "step": 13400 }, { "epoch": 0.05543280787220278, "grad_norm": 3.209808944542668, "learning_rate": 1.9896265415593096e-05, "loss": 1.4023, "step": 13410 }, { "epoch": 0.05547414479082486, "grad_norm": 3.8876392074458055, "learning_rate": 1.989607687755931e-05, "loss": 1.4021, "step": 13420 }, { "epoch": 0.05551548170944693, "grad_norm": 4.6622959130132635, "learning_rate": 1.9895888169241643e-05, "loss": 1.3941, "step": 13430 }, { "epoch": 0.05555681862806901, "grad_norm": 3.2829449802312243, "learning_rate": 1.989569929064335e-05, "loss": 1.3914, "step": 13440 }, { "epoch": 0.055598155546691085, "grad_norm": 2.975464909103463, "learning_rate": 1.989551024176768e-05, "loss": 1.3393, "step": 13450 }, { "epoch": 0.055639492465313155, "grad_norm": 3.3616145226127374, "learning_rate": 1.9895321022617877e-05, "loss": 1.3691, "step": 13460 }, { "epoch": 0.05568082938393523, "grad_norm": 3.551441103147202, "learning_rate": 1.9895131633197206e-05, "loss": 1.3748, "step": 13470 }, { "epoch": 0.05572216630255731, "grad_norm": 3.1368088044777838, "learning_rate": 1.9894942073508924e-05, "loss": 1.3341, "step": 13480 }, { "epoch": 0.055763503221179386, "grad_norm": 2.8747175172948722, "learning_rate": 1.989475234355629e-05, "loss": 1.3951, "step": 13490 }, { "epoch": 0.055804840139801457, "grad_norm": 3.0419841938845975, "learning_rate": 1.989456244334257e-05, "loss": 1.3325, "step": 13500 }, { "epoch": 0.055846177058423534, "grad_norm": 3.6672894427510947, "learning_rate": 1.9894372372871036e-05, "loss": 1.3847, "step": 13510 }, { "epoch": 0.05588751397704561, "grad_norm": 3.3319466434724325, "learning_rate": 1.989418213214495e-05, "loss": 1.4007, "step": 13520 }, { "epoch": 0.05592885089566768, "grad_norm": 2.756361075189523, "learning_rate": 1.9893991721167593e-05, "loss": 1.3962, "step": 13530 }, { "epoch": 0.05597018781428976, "grad_norm": 3.2588019424168384, "learning_rate": 1.989380113994224e-05, "loss": 1.409, "step": 13540 }, { "epoch": 0.056011524732911835, "grad_norm": 3.992120970935661, "learning_rate": 1.9893610388472162e-05, "loss": 1.3642, "step": 13550 }, { "epoch": 0.05605286165153391, "grad_norm": 3.0454737775385885, "learning_rate": 1.9893419466760653e-05, "loss": 1.3696, "step": 13560 }, { "epoch": 0.05609419857015598, "grad_norm": 3.1960156109377507, "learning_rate": 1.9893228374810993e-05, "loss": 1.3611, "step": 13570 }, { "epoch": 0.05613553548877806, "grad_norm": 3.3282542329613496, "learning_rate": 1.989303711262647e-05, "loss": 1.3541, "step": 13580 }, { "epoch": 0.05617687240740014, "grad_norm": 4.271766406501802, "learning_rate": 1.9892845680210374e-05, "loss": 1.3033, "step": 13590 }, { "epoch": 0.05621820932602221, "grad_norm": 3.4542215892482964, "learning_rate": 1.9892654077566003e-05, "loss": 1.3853, "step": 13600 }, { "epoch": 0.056259546244644285, "grad_norm": 5.10198450683926, "learning_rate": 1.9892462304696653e-05, "loss": 1.3758, "step": 13610 }, { "epoch": 0.05630088316326636, "grad_norm": 4.67424032198832, "learning_rate": 1.989227036160562e-05, "loss": 1.3395, "step": 13620 }, { "epoch": 0.05634222008188844, "grad_norm": 3.574141696384299, "learning_rate": 1.989207824829621e-05, "loss": 1.3953, "step": 13630 }, { "epoch": 0.05638355700051051, "grad_norm": 3.4219942324444244, "learning_rate": 1.989188596477173e-05, "loss": 1.3493, "step": 13640 }, { "epoch": 0.056424893919132586, "grad_norm": 3.135032092895971, "learning_rate": 1.9891693511035484e-05, "loss": 1.4203, "step": 13650 }, { "epoch": 0.05646623083775466, "grad_norm": 3.0856331845063387, "learning_rate": 1.989150088709079e-05, "loss": 1.2854, "step": 13660 }, { "epoch": 0.056507567756376734, "grad_norm": 3.9267068067232, "learning_rate": 1.9891308092940953e-05, "loss": 1.3701, "step": 13670 }, { "epoch": 0.05654890467499881, "grad_norm": 3.8306681224418395, "learning_rate": 1.98911151285893e-05, "loss": 1.4076, "step": 13680 }, { "epoch": 0.05659024159362089, "grad_norm": 3.306837169963007, "learning_rate": 1.9890921994039148e-05, "loss": 1.3873, "step": 13690 }, { "epoch": 0.056631578512242965, "grad_norm": 3.2434462335337297, "learning_rate": 1.989072868929382e-05, "loss": 1.3531, "step": 13700 }, { "epoch": 0.056672915430865035, "grad_norm": 3.3740272090711856, "learning_rate": 1.989053521435664e-05, "loss": 1.3772, "step": 13710 }, { "epoch": 0.05671425234948711, "grad_norm": 3.141984063033404, "learning_rate": 1.989034156923094e-05, "loss": 1.3983, "step": 13720 }, { "epoch": 0.05675558926810919, "grad_norm": 2.9680517660305847, "learning_rate": 1.989014775392005e-05, "loss": 1.3651, "step": 13730 }, { "epoch": 0.05679692618673126, "grad_norm": 3.418890084499366, "learning_rate": 1.9889953768427313e-05, "loss": 1.4157, "step": 13740 }, { "epoch": 0.05683826310535334, "grad_norm": 3.7254805946590706, "learning_rate": 1.9889759612756053e-05, "loss": 1.3979, "step": 13750 }, { "epoch": 0.056879600023975414, "grad_norm": 3.617034367391942, "learning_rate": 1.9889565286909623e-05, "loss": 1.3549, "step": 13760 }, { "epoch": 0.056920936942597485, "grad_norm": 3.8592922160592646, "learning_rate": 1.9889370790891364e-05, "loss": 1.4008, "step": 13770 }, { "epoch": 0.05696227386121956, "grad_norm": 3.510616141867297, "learning_rate": 1.9889176124704616e-05, "loss": 1.4071, "step": 13780 }, { "epoch": 0.05700361077984164, "grad_norm": 3.5434621547794105, "learning_rate": 1.9888981288352736e-05, "loss": 1.3782, "step": 13790 }, { "epoch": 0.057044947698463716, "grad_norm": 3.0056956686627117, "learning_rate": 1.988878628183907e-05, "loss": 1.352, "step": 13800 }, { "epoch": 0.057086284617085786, "grad_norm": 2.876862066794774, "learning_rate": 1.9888591105166984e-05, "loss": 1.3451, "step": 13810 }, { "epoch": 0.05712762153570786, "grad_norm": 3.6994718345443776, "learning_rate": 1.9888395758339823e-05, "loss": 1.3711, "step": 13820 }, { "epoch": 0.05716895845432994, "grad_norm": 3.7019952550478052, "learning_rate": 1.988820024136096e-05, "loss": 1.3542, "step": 13830 }, { "epoch": 0.05721029537295201, "grad_norm": 3.149511937310367, "learning_rate": 1.9888004554233757e-05, "loss": 1.3498, "step": 13840 }, { "epoch": 0.05725163229157409, "grad_norm": 3.3687656057902298, "learning_rate": 1.9887808696961574e-05, "loss": 1.3759, "step": 13850 }, { "epoch": 0.057292969210196165, "grad_norm": 3.286956265957372, "learning_rate": 1.988761266954779e-05, "loss": 1.3416, "step": 13860 }, { "epoch": 0.05733430612881824, "grad_norm": 3.8546233434442025, "learning_rate": 1.988741647199577e-05, "loss": 1.3601, "step": 13870 }, { "epoch": 0.05737564304744031, "grad_norm": 4.043137550642432, "learning_rate": 1.98872201043089e-05, "loss": 1.4019, "step": 13880 }, { "epoch": 0.05741697996606239, "grad_norm": 4.6033780589634805, "learning_rate": 1.988702356649055e-05, "loss": 1.406, "step": 13890 }, { "epoch": 0.05745831688468447, "grad_norm": 3.4533366026357135, "learning_rate": 1.9886826858544103e-05, "loss": 1.3579, "step": 13900 }, { "epoch": 0.05749965380330654, "grad_norm": 4.2918223423561725, "learning_rate": 1.9886629980472945e-05, "loss": 1.3238, "step": 13910 }, { "epoch": 0.057540990721928614, "grad_norm": 3.1786603013459667, "learning_rate": 1.988643293228047e-05, "loss": 1.3815, "step": 13920 }, { "epoch": 0.05758232764055069, "grad_norm": 4.187787674938507, "learning_rate": 1.988623571397006e-05, "loss": 1.3129, "step": 13930 }, { "epoch": 0.05762366455917277, "grad_norm": 3.2066247160025956, "learning_rate": 1.9886038325545112e-05, "loss": 1.3604, "step": 13940 }, { "epoch": 0.05766500147779484, "grad_norm": 4.137189558470061, "learning_rate": 1.9885840767009023e-05, "loss": 1.3683, "step": 13950 }, { "epoch": 0.057706338396416916, "grad_norm": 3.21825868230931, "learning_rate": 1.988564303836519e-05, "loss": 1.3521, "step": 13960 }, { "epoch": 0.05774767531503899, "grad_norm": 3.5331562264396097, "learning_rate": 1.9885445139617018e-05, "loss": 1.4079, "step": 13970 }, { "epoch": 0.05778901223366106, "grad_norm": 3.1970430005062607, "learning_rate": 1.9885247070767915e-05, "loss": 1.3688, "step": 13980 }, { "epoch": 0.05783034915228314, "grad_norm": 4.27476021372676, "learning_rate": 1.988504883182128e-05, "loss": 1.364, "step": 13990 }, { "epoch": 0.05787168607090522, "grad_norm": 3.2156024105612278, "learning_rate": 1.9884850422780534e-05, "loss": 1.3814, "step": 14000 }, { "epoch": 0.057913022989527295, "grad_norm": 3.6998253526421565, "learning_rate": 1.9884651843649083e-05, "loss": 1.3698, "step": 14010 }, { "epoch": 0.057954359908149365, "grad_norm": 4.503662250473274, "learning_rate": 1.988445309443035e-05, "loss": 1.3564, "step": 14020 }, { "epoch": 0.05799569682677144, "grad_norm": 3.762384040491392, "learning_rate": 1.9884254175127754e-05, "loss": 1.4119, "step": 14030 }, { "epoch": 0.05803703374539352, "grad_norm": 3.285301388684364, "learning_rate": 1.9884055085744713e-05, "loss": 1.3501, "step": 14040 }, { "epoch": 0.05807837066401559, "grad_norm": 4.336832986530797, "learning_rate": 1.9883855826284656e-05, "loss": 1.3662, "step": 14050 }, { "epoch": 0.05811970758263767, "grad_norm": 3.4574445488885734, "learning_rate": 1.9883656396751016e-05, "loss": 1.3127, "step": 14060 }, { "epoch": 0.058161044501259744, "grad_norm": 3.4098914920099883, "learning_rate": 1.988345679714722e-05, "loss": 1.391, "step": 14070 }, { "epoch": 0.058202381419881814, "grad_norm": 3.6614081585424603, "learning_rate": 1.98832570274767e-05, "loss": 1.3659, "step": 14080 }, { "epoch": 0.05824371833850389, "grad_norm": 3.276861233677139, "learning_rate": 1.98830570877429e-05, "loss": 1.3559, "step": 14090 }, { "epoch": 0.05828505525712597, "grad_norm": 3.4536947240708997, "learning_rate": 1.9882856977949257e-05, "loss": 1.3779, "step": 14100 }, { "epoch": 0.058326392175748046, "grad_norm": 3.242988394736396, "learning_rate": 1.9882656698099213e-05, "loss": 1.3353, "step": 14110 }, { "epoch": 0.058367729094370116, "grad_norm": 3.033285353432935, "learning_rate": 1.9882456248196216e-05, "loss": 1.3831, "step": 14120 }, { "epoch": 0.05840906601299219, "grad_norm": 2.9840093106321173, "learning_rate": 1.9882255628243715e-05, "loss": 1.399, "step": 14130 }, { "epoch": 0.05845040293161427, "grad_norm": 3.424871961043564, "learning_rate": 1.9882054838245158e-05, "loss": 1.3774, "step": 14140 }, { "epoch": 0.05849173985023634, "grad_norm": 4.206811562034524, "learning_rate": 1.988185387820401e-05, "loss": 1.3768, "step": 14150 }, { "epoch": 0.05853307676885842, "grad_norm": 3.167196829826696, "learning_rate": 1.9881652748123723e-05, "loss": 1.3118, "step": 14160 }, { "epoch": 0.058574413687480495, "grad_norm": 3.2647863004270583, "learning_rate": 1.9881451448007752e-05, "loss": 1.359, "step": 14170 }, { "epoch": 0.05861575060610257, "grad_norm": 3.2249069204445506, "learning_rate": 1.988124997785957e-05, "loss": 1.3594, "step": 14180 }, { "epoch": 0.05865708752472464, "grad_norm": 3.125833926204158, "learning_rate": 1.9881048337682644e-05, "loss": 1.3729, "step": 14190 }, { "epoch": 0.05869842444334672, "grad_norm": 3.124857727130482, "learning_rate": 1.9880846527480434e-05, "loss": 1.3968, "step": 14200 }, { "epoch": 0.0587397613619688, "grad_norm": 3.5324983045866416, "learning_rate": 1.988064454725642e-05, "loss": 1.3744, "step": 14210 }, { "epoch": 0.05878109828059087, "grad_norm": 3.534173243273415, "learning_rate": 1.9880442397014082e-05, "loss": 1.3858, "step": 14220 }, { "epoch": 0.058822435199212944, "grad_norm": 4.522930564227188, "learning_rate": 1.9880240076756885e-05, "loss": 1.3365, "step": 14230 }, { "epoch": 0.05886377211783502, "grad_norm": 2.911847199383502, "learning_rate": 1.9880037586488324e-05, "loss": 1.3629, "step": 14240 }, { "epoch": 0.0589051090364571, "grad_norm": 3.467653251925633, "learning_rate": 1.9879834926211875e-05, "loss": 1.3839, "step": 14250 }, { "epoch": 0.05894644595507917, "grad_norm": 3.7166470032659618, "learning_rate": 1.9879632095931024e-05, "loss": 1.3358, "step": 14260 }, { "epoch": 0.058987782873701246, "grad_norm": 3.7077022813203193, "learning_rate": 1.987942909564927e-05, "loss": 1.3474, "step": 14270 }, { "epoch": 0.05902911979232332, "grad_norm": 3.1733151465540916, "learning_rate": 1.9879225925370094e-05, "loss": 1.3881, "step": 14280 }, { "epoch": 0.05907045671094539, "grad_norm": 3.0349885556128378, "learning_rate": 1.9879022585097005e-05, "loss": 1.3686, "step": 14290 }, { "epoch": 0.05911179362956747, "grad_norm": 3.2620109103700363, "learning_rate": 1.9878819074833493e-05, "loss": 1.3588, "step": 14300 }, { "epoch": 0.05915313054818955, "grad_norm": 3.7133352574339873, "learning_rate": 1.9878615394583062e-05, "loss": 1.34, "step": 14310 }, { "epoch": 0.059194467466811625, "grad_norm": 3.1624289931149114, "learning_rate": 1.987841154434922e-05, "loss": 1.3334, "step": 14320 }, { "epoch": 0.059235804385433695, "grad_norm": 3.7047396553657643, "learning_rate": 1.9878207524135468e-05, "loss": 1.3375, "step": 14330 }, { "epoch": 0.05927714130405577, "grad_norm": 3.3119519535402, "learning_rate": 1.9878003333945325e-05, "loss": 1.3537, "step": 14340 }, { "epoch": 0.05931847822267785, "grad_norm": 3.3812187643072105, "learning_rate": 1.98777989737823e-05, "loss": 1.3374, "step": 14350 }, { "epoch": 0.05935981514129992, "grad_norm": 3.299892572730851, "learning_rate": 1.9877594443649902e-05, "loss": 1.3704, "step": 14360 }, { "epoch": 0.059401152059922, "grad_norm": 4.081245562156265, "learning_rate": 1.9877389743551668e-05, "loss": 1.3498, "step": 14370 }, { "epoch": 0.059442488978544074, "grad_norm": 3.350158600172311, "learning_rate": 1.9877184873491102e-05, "loss": 1.3449, "step": 14380 }, { "epoch": 0.05948382589716615, "grad_norm": 3.714541324538004, "learning_rate": 1.9876979833471742e-05, "loss": 1.3874, "step": 14390 }, { "epoch": 0.05952516281578822, "grad_norm": 3.274100022702722, "learning_rate": 1.9876774623497112e-05, "loss": 1.3582, "step": 14400 }, { "epoch": 0.0595664997344103, "grad_norm": 2.9329187414523425, "learning_rate": 1.9876569243570742e-05, "loss": 1.3901, "step": 14410 }, { "epoch": 0.059607836653032376, "grad_norm": 3.73192299633757, "learning_rate": 1.9876363693696166e-05, "loss": 1.3898, "step": 14420 }, { "epoch": 0.059649173571654446, "grad_norm": 3.2578219560804196, "learning_rate": 1.987615797387692e-05, "loss": 1.371, "step": 14430 }, { "epoch": 0.05969051049027652, "grad_norm": 3.2376808673712807, "learning_rate": 1.9875952084116548e-05, "loss": 1.336, "step": 14440 }, { "epoch": 0.0597318474088986, "grad_norm": 3.5969021652399253, "learning_rate": 1.987574602441859e-05, "loss": 1.3862, "step": 14450 }, { "epoch": 0.05977318432752067, "grad_norm": 3.1882930317103844, "learning_rate": 1.9875539794786593e-05, "loss": 1.3734, "step": 14460 }, { "epoch": 0.05981452124614275, "grad_norm": 3.147605709223492, "learning_rate": 1.9875333395224102e-05, "loss": 1.3739, "step": 14470 }, { "epoch": 0.059855858164764825, "grad_norm": 3.4276761969558645, "learning_rate": 1.9875126825734673e-05, "loss": 1.3301, "step": 14480 }, { "epoch": 0.0598971950833869, "grad_norm": 3.4226682382169997, "learning_rate": 1.987492008632186e-05, "loss": 1.3747, "step": 14490 }, { "epoch": 0.05993853200200897, "grad_norm": 3.471596558825063, "learning_rate": 1.987471317698922e-05, "loss": 1.3349, "step": 14500 }, { "epoch": 0.05997986892063105, "grad_norm": 3.676413482025792, "learning_rate": 1.9874506097740308e-05, "loss": 1.3963, "step": 14510 }, { "epoch": 0.060021205839253126, "grad_norm": 3.2543359739910267, "learning_rate": 1.9874298848578696e-05, "loss": 1.3334, "step": 14520 }, { "epoch": 0.0600625427578752, "grad_norm": 3.2372261281319377, "learning_rate": 1.9874091429507943e-05, "loss": 1.3367, "step": 14530 }, { "epoch": 0.060103879676497274, "grad_norm": 3.3746909228055397, "learning_rate": 1.987388384053162e-05, "loss": 1.376, "step": 14540 }, { "epoch": 0.06014521659511935, "grad_norm": 2.921092227637486, "learning_rate": 1.9873676081653302e-05, "loss": 1.3715, "step": 14550 }, { "epoch": 0.06018655351374143, "grad_norm": 2.9872735994422417, "learning_rate": 1.9873468152876563e-05, "loss": 1.3457, "step": 14560 }, { "epoch": 0.0602278904323635, "grad_norm": 3.713230272104003, "learning_rate": 1.9873260054204978e-05, "loss": 1.328, "step": 14570 }, { "epoch": 0.060269227350985576, "grad_norm": 3.3242217719152496, "learning_rate": 1.9873051785642134e-05, "loss": 1.3433, "step": 14580 }, { "epoch": 0.06031056426960765, "grad_norm": 3.3594706223759143, "learning_rate": 1.9872843347191607e-05, "loss": 1.4027, "step": 14590 }, { "epoch": 0.06035190118822972, "grad_norm": 3.1573056170670846, "learning_rate": 1.9872634738856987e-05, "loss": 1.3798, "step": 14600 }, { "epoch": 0.0603932381068518, "grad_norm": 3.076626392806199, "learning_rate": 1.9872425960641863e-05, "loss": 1.3581, "step": 14610 }, { "epoch": 0.06043457502547388, "grad_norm": 3.468364476739333, "learning_rate": 1.987221701254983e-05, "loss": 1.3675, "step": 14620 }, { "epoch": 0.060475911944095954, "grad_norm": 4.079855852909671, "learning_rate": 1.987200789458448e-05, "loss": 1.3197, "step": 14630 }, { "epoch": 0.060517248862718025, "grad_norm": 4.960368430326063, "learning_rate": 1.9871798606749415e-05, "loss": 1.4018, "step": 14640 }, { "epoch": 0.0605585857813401, "grad_norm": 3.1263141804205272, "learning_rate": 1.9871589149048232e-05, "loss": 1.4034, "step": 14650 }, { "epoch": 0.06059992269996218, "grad_norm": 3.0030045708337876, "learning_rate": 1.9871379521484538e-05, "loss": 1.314, "step": 14660 }, { "epoch": 0.06064125961858425, "grad_norm": 4.171309905380893, "learning_rate": 1.987116972406194e-05, "loss": 1.3454, "step": 14670 }, { "epoch": 0.060682596537206326, "grad_norm": 3.678722232531344, "learning_rate": 1.9870959756784044e-05, "loss": 1.3644, "step": 14680 }, { "epoch": 0.060723933455828404, "grad_norm": 3.5114052948471164, "learning_rate": 1.987074961965447e-05, "loss": 1.3446, "step": 14690 }, { "epoch": 0.06076527037445048, "grad_norm": 3.3802619453865637, "learning_rate": 1.987053931267683e-05, "loss": 1.3603, "step": 14700 }, { "epoch": 0.06080660729307255, "grad_norm": 3.239888006176567, "learning_rate": 1.9870328835854743e-05, "loss": 1.3263, "step": 14710 }, { "epoch": 0.06084794421169463, "grad_norm": 2.7923482855439716, "learning_rate": 1.9870118189191833e-05, "loss": 1.3532, "step": 14720 }, { "epoch": 0.060889281130316705, "grad_norm": 3.672374432583175, "learning_rate": 1.9869907372691715e-05, "loss": 1.3749, "step": 14730 }, { "epoch": 0.060930618048938776, "grad_norm": 3.1371007241226017, "learning_rate": 1.9869696386358032e-05, "loss": 1.3529, "step": 14740 }, { "epoch": 0.06097195496756085, "grad_norm": 3.5064852286924837, "learning_rate": 1.9869485230194403e-05, "loss": 1.3664, "step": 14750 }, { "epoch": 0.06101329188618293, "grad_norm": 3.9587666778347073, "learning_rate": 1.9869273904204465e-05, "loss": 1.3847, "step": 14760 }, { "epoch": 0.061054628804805, "grad_norm": 3.2129303491061973, "learning_rate": 1.9869062408391855e-05, "loss": 1.3625, "step": 14770 }, { "epoch": 0.06109596572342708, "grad_norm": 3.1860777109810465, "learning_rate": 1.9868850742760212e-05, "loss": 1.3062, "step": 14780 }, { "epoch": 0.061137302642049154, "grad_norm": 3.47772048599133, "learning_rate": 1.9868638907313174e-05, "loss": 1.3487, "step": 14790 }, { "epoch": 0.06117863956067123, "grad_norm": 3.323868913803053, "learning_rate": 1.9868426902054394e-05, "loss": 1.3304, "step": 14800 }, { "epoch": 0.0612199764792933, "grad_norm": 3.4796197385612735, "learning_rate": 1.9868214726987513e-05, "loss": 1.3143, "step": 14810 }, { "epoch": 0.06126131339791538, "grad_norm": 3.8413485871243402, "learning_rate": 1.9868002382116186e-05, "loss": 1.3451, "step": 14820 }, { "epoch": 0.061302650316537456, "grad_norm": 3.3376163822500278, "learning_rate": 1.9867789867444066e-05, "loss": 1.3486, "step": 14830 }, { "epoch": 0.061343987235159526, "grad_norm": 3.3554967862457543, "learning_rate": 1.9867577182974807e-05, "loss": 1.3447, "step": 14840 }, { "epoch": 0.061385324153781604, "grad_norm": 3.0553156261560757, "learning_rate": 1.9867364328712074e-05, "loss": 1.3436, "step": 14850 }, { "epoch": 0.06142666107240368, "grad_norm": 3.9386923791793027, "learning_rate": 1.9867151304659527e-05, "loss": 1.3719, "step": 14860 }, { "epoch": 0.06146799799102576, "grad_norm": 3.915463974315291, "learning_rate": 1.986693811082083e-05, "loss": 1.328, "step": 14870 }, { "epoch": 0.06150933490964783, "grad_norm": 2.800179512440139, "learning_rate": 1.986672474719965e-05, "loss": 1.322, "step": 14880 }, { "epoch": 0.061550671828269905, "grad_norm": 3.3411218270323664, "learning_rate": 1.9866511213799665e-05, "loss": 1.3899, "step": 14890 }, { "epoch": 0.06159200874689198, "grad_norm": 3.323705843953213, "learning_rate": 1.9866297510624544e-05, "loss": 1.3615, "step": 14900 }, { "epoch": 0.06163334566551405, "grad_norm": 4.754040048447529, "learning_rate": 1.9866083637677963e-05, "loss": 1.3726, "step": 14910 }, { "epoch": 0.06167468258413613, "grad_norm": 3.45837680678418, "learning_rate": 1.9865869594963607e-05, "loss": 1.3519, "step": 14920 }, { "epoch": 0.06171601950275821, "grad_norm": 3.183067903396876, "learning_rate": 1.986565538248516e-05, "loss": 1.3584, "step": 14930 }, { "epoch": 0.061757356421380284, "grad_norm": 3.8419649238240656, "learning_rate": 1.98654410002463e-05, "loss": 1.3698, "step": 14940 }, { "epoch": 0.061798693340002354, "grad_norm": 3.3977765733638168, "learning_rate": 1.9865226448250725e-05, "loss": 1.3702, "step": 14950 }, { "epoch": 0.06184003025862443, "grad_norm": 3.5010199408218305, "learning_rate": 1.9865011726502118e-05, "loss": 1.3515, "step": 14960 }, { "epoch": 0.06188136717724651, "grad_norm": 3.521969897290798, "learning_rate": 1.9864796835004184e-05, "loss": 1.3562, "step": 14970 }, { "epoch": 0.06192270409586858, "grad_norm": 3.198260891262558, "learning_rate": 1.986458177376061e-05, "loss": 1.3265, "step": 14980 }, { "epoch": 0.061964041014490656, "grad_norm": 4.203288617287408, "learning_rate": 1.9864366542775104e-05, "loss": 1.3445, "step": 14990 }, { "epoch": 0.06200537793311273, "grad_norm": 3.0231024395080204, "learning_rate": 1.9864151142051367e-05, "loss": 1.3437, "step": 15000 }, { "epoch": 0.06204671485173481, "grad_norm": 3.375301699368925, "learning_rate": 1.9863935571593104e-05, "loss": 1.3587, "step": 15010 }, { "epoch": 0.06208805177035688, "grad_norm": 3.3942212627441433, "learning_rate": 1.986371983140403e-05, "loss": 1.3574, "step": 15020 }, { "epoch": 0.06212938868897896, "grad_norm": 2.8229502868870906, "learning_rate": 1.986350392148785e-05, "loss": 1.3252, "step": 15030 }, { "epoch": 0.062170725607601035, "grad_norm": 3.531378883228978, "learning_rate": 1.9863287841848283e-05, "loss": 1.3284, "step": 15040 }, { "epoch": 0.062212062526223105, "grad_norm": 3.4111136482320057, "learning_rate": 1.986307159248905e-05, "loss": 1.3503, "step": 15050 }, { "epoch": 0.06225339944484518, "grad_norm": 3.1924697026460525, "learning_rate": 1.9862855173413864e-05, "loss": 1.3316, "step": 15060 }, { "epoch": 0.06229473636346726, "grad_norm": 3.673294835187914, "learning_rate": 1.9862638584626456e-05, "loss": 1.378, "step": 15070 }, { "epoch": 0.06233607328208933, "grad_norm": 2.944502490931273, "learning_rate": 1.9862421826130548e-05, "loss": 1.3505, "step": 15080 }, { "epoch": 0.06237741020071141, "grad_norm": 3.166457773127537, "learning_rate": 1.9862204897929875e-05, "loss": 1.3274, "step": 15090 }, { "epoch": 0.062418747119333484, "grad_norm": 3.539140440652841, "learning_rate": 1.9861987800028167e-05, "loss": 1.3373, "step": 15100 }, { "epoch": 0.06246008403795556, "grad_norm": 4.111976580182342, "learning_rate": 1.986177053242916e-05, "loss": 1.3795, "step": 15110 }, { "epoch": 0.06250142095657764, "grad_norm": 3.518730561598167, "learning_rate": 1.986155309513659e-05, "loss": 1.3284, "step": 15120 }, { "epoch": 0.06254275787519971, "grad_norm": 3.391425624844218, "learning_rate": 1.9861335488154206e-05, "loss": 1.3587, "step": 15130 }, { "epoch": 0.06258409479382178, "grad_norm": 3.4961442083301097, "learning_rate": 1.9861117711485743e-05, "loss": 1.399, "step": 15140 }, { "epoch": 0.06262543171244386, "grad_norm": 2.7302506739470784, "learning_rate": 1.9860899765134953e-05, "loss": 1.3654, "step": 15150 }, { "epoch": 0.06266676863106593, "grad_norm": 4.411982690984872, "learning_rate": 1.9860681649105585e-05, "loss": 1.3409, "step": 15160 }, { "epoch": 0.062708105549688, "grad_norm": 3.671422586435062, "learning_rate": 1.9860463363401393e-05, "loss": 1.3629, "step": 15170 }, { "epoch": 0.06274944246831009, "grad_norm": 3.599702261798689, "learning_rate": 1.9860244908026133e-05, "loss": 1.3464, "step": 15180 }, { "epoch": 0.06279077938693216, "grad_norm": 3.6962151021925598, "learning_rate": 1.9860026282983568e-05, "loss": 1.362, "step": 15190 }, { "epoch": 0.06283211630555424, "grad_norm": 3.5740776610173284, "learning_rate": 1.9859807488277453e-05, "loss": 1.3657, "step": 15200 }, { "epoch": 0.06287345322417631, "grad_norm": 3.437476924912017, "learning_rate": 1.9859588523911554e-05, "loss": 1.3384, "step": 15210 }, { "epoch": 0.06291479014279838, "grad_norm": 3.2699157716406373, "learning_rate": 1.9859369389889642e-05, "loss": 1.3658, "step": 15220 }, { "epoch": 0.06295612706142047, "grad_norm": 3.123305029392182, "learning_rate": 1.9859150086215487e-05, "loss": 1.352, "step": 15230 }, { "epoch": 0.06299746398004254, "grad_norm": 3.2830928690839354, "learning_rate": 1.985893061289286e-05, "loss": 1.3799, "step": 15240 }, { "epoch": 0.06303880089866461, "grad_norm": 3.664715842390073, "learning_rate": 1.9858710969925547e-05, "loss": 1.3669, "step": 15250 }, { "epoch": 0.06308013781728669, "grad_norm": 3.2013150011378952, "learning_rate": 1.985849115731731e-05, "loss": 1.3403, "step": 15260 }, { "epoch": 0.06312147473590876, "grad_norm": 3.1107330520735643, "learning_rate": 1.9858271175071946e-05, "loss": 1.348, "step": 15270 }, { "epoch": 0.06316281165453083, "grad_norm": 3.188091096811774, "learning_rate": 1.9858051023193234e-05, "loss": 1.3219, "step": 15280 }, { "epoch": 0.06320414857315292, "grad_norm": 3.1313626953852705, "learning_rate": 1.9857830701684967e-05, "loss": 1.3622, "step": 15290 }, { "epoch": 0.06324548549177499, "grad_norm": 3.1702519850910127, "learning_rate": 1.985761021055093e-05, "loss": 1.3546, "step": 15300 }, { "epoch": 0.06328682241039706, "grad_norm": 3.252596544143132, "learning_rate": 1.9857389549794917e-05, "loss": 1.2552, "step": 15310 }, { "epoch": 0.06332815932901914, "grad_norm": 3.0313555049861374, "learning_rate": 1.985716871942073e-05, "loss": 1.4033, "step": 15320 }, { "epoch": 0.06336949624764121, "grad_norm": 3.4443767622891115, "learning_rate": 1.985694771943217e-05, "loss": 1.3893, "step": 15330 }, { "epoch": 0.06341083316626328, "grad_norm": 3.616984395155513, "learning_rate": 1.9856726549833034e-05, "loss": 1.3499, "step": 15340 }, { "epoch": 0.06345217008488536, "grad_norm": 3.50828559499885, "learning_rate": 1.985650521062713e-05, "loss": 1.3298, "step": 15350 }, { "epoch": 0.06349350700350744, "grad_norm": 2.8467914617151244, "learning_rate": 1.9856283701818268e-05, "loss": 1.3307, "step": 15360 }, { "epoch": 0.06353484392212952, "grad_norm": 4.069010272907543, "learning_rate": 1.9856062023410257e-05, "loss": 1.3341, "step": 15370 }, { "epoch": 0.06357618084075159, "grad_norm": 3.554716298604244, "learning_rate": 1.985584017540691e-05, "loss": 1.3366, "step": 15380 }, { "epoch": 0.06361751775937366, "grad_norm": 3.2437195929255123, "learning_rate": 1.985561815781205e-05, "loss": 1.3241, "step": 15390 }, { "epoch": 0.06365885467799574, "grad_norm": 3.0814014462588495, "learning_rate": 1.9855395970629497e-05, "loss": 1.3086, "step": 15400 }, { "epoch": 0.06370019159661781, "grad_norm": 2.845186766341529, "learning_rate": 1.985517361386307e-05, "loss": 1.3302, "step": 15410 }, { "epoch": 0.06374152851523988, "grad_norm": 3.594783956612939, "learning_rate": 1.9854951087516598e-05, "loss": 1.3374, "step": 15420 }, { "epoch": 0.06378286543386197, "grad_norm": 2.8854977362042815, "learning_rate": 1.9854728391593904e-05, "loss": 1.3326, "step": 15430 }, { "epoch": 0.06382420235248404, "grad_norm": 3.5259935395974784, "learning_rate": 1.985450552609883e-05, "loss": 1.3081, "step": 15440 }, { "epoch": 0.06386553927110611, "grad_norm": 3.245306593778846, "learning_rate": 1.9854282491035203e-05, "loss": 1.3746, "step": 15450 }, { "epoch": 0.06390687618972819, "grad_norm": 4.071308549266746, "learning_rate": 1.9854059286406866e-05, "loss": 1.3783, "step": 15460 }, { "epoch": 0.06394821310835026, "grad_norm": 3.247822958885961, "learning_rate": 1.9853835912217657e-05, "loss": 1.3411, "step": 15470 }, { "epoch": 0.06398955002697233, "grad_norm": 3.35088921885468, "learning_rate": 1.9853612368471416e-05, "loss": 1.3769, "step": 15480 }, { "epoch": 0.06403088694559442, "grad_norm": 3.311416405062516, "learning_rate": 1.9853388655171998e-05, "loss": 1.3546, "step": 15490 }, { "epoch": 0.06407222386421649, "grad_norm": 3.423751956404864, "learning_rate": 1.985316477232325e-05, "loss": 1.3082, "step": 15500 }, { "epoch": 0.06411356078283857, "grad_norm": 3.5994672319763072, "learning_rate": 1.9852940719929017e-05, "loss": 1.308, "step": 15510 }, { "epoch": 0.06415489770146064, "grad_norm": 6.092109341719527, "learning_rate": 1.9852716497993164e-05, "loss": 1.3333, "step": 15520 }, { "epoch": 0.06419623462008271, "grad_norm": 3.4870247799249574, "learning_rate": 1.985249210651954e-05, "loss": 1.3755, "step": 15530 }, { "epoch": 0.0642375715387048, "grad_norm": 3.1557901036816687, "learning_rate": 1.9852267545512016e-05, "loss": 1.3237, "step": 15540 }, { "epoch": 0.06427890845732687, "grad_norm": 2.9168599988216655, "learning_rate": 1.9852042814974448e-05, "loss": 1.3333, "step": 15550 }, { "epoch": 0.06432024537594894, "grad_norm": 3.658350430782456, "learning_rate": 1.9851817914910707e-05, "loss": 1.3157, "step": 15560 }, { "epoch": 0.06436158229457102, "grad_norm": 2.98123289160538, "learning_rate": 1.9851592845324664e-05, "loss": 1.3461, "step": 15570 }, { "epoch": 0.06440291921319309, "grad_norm": 3.2851248740018133, "learning_rate": 1.9851367606220187e-05, "loss": 1.3592, "step": 15580 }, { "epoch": 0.06444425613181516, "grad_norm": 2.874060228764119, "learning_rate": 1.9851142197601157e-05, "loss": 1.3179, "step": 15590 }, { "epoch": 0.06448559305043725, "grad_norm": 3.3595179685654286, "learning_rate": 1.985091661947145e-05, "loss": 1.358, "step": 15600 }, { "epoch": 0.06452692996905932, "grad_norm": 4.343402000280312, "learning_rate": 1.9850690871834945e-05, "loss": 1.3387, "step": 15610 }, { "epoch": 0.06456826688768139, "grad_norm": 4.4028919030557345, "learning_rate": 1.985046495469553e-05, "loss": 1.3405, "step": 15620 }, { "epoch": 0.06460960380630347, "grad_norm": 3.3683021821213077, "learning_rate": 1.9850238868057097e-05, "loss": 1.3164, "step": 15630 }, { "epoch": 0.06465094072492554, "grad_norm": 3.223018835339703, "learning_rate": 1.9850012611923527e-05, "loss": 1.2937, "step": 15640 }, { "epoch": 0.06469227764354761, "grad_norm": 3.6479375571543584, "learning_rate": 1.984978618629872e-05, "loss": 1.3703, "step": 15650 }, { "epoch": 0.0647336145621697, "grad_norm": 3.0694724692776107, "learning_rate": 1.9849559591186566e-05, "loss": 1.3239, "step": 15660 }, { "epoch": 0.06477495148079176, "grad_norm": 3.595788633474915, "learning_rate": 1.984933282659097e-05, "loss": 1.2858, "step": 15670 }, { "epoch": 0.06481628839941385, "grad_norm": 2.996464746206387, "learning_rate": 1.984910589251583e-05, "loss": 1.3677, "step": 15680 }, { "epoch": 0.06485762531803592, "grad_norm": 2.978010213666374, "learning_rate": 1.9848878788965053e-05, "loss": 1.3612, "step": 15690 }, { "epoch": 0.06489896223665799, "grad_norm": 4.107355827588679, "learning_rate": 1.9848651515942545e-05, "loss": 1.3473, "step": 15700 }, { "epoch": 0.06494029915528007, "grad_norm": 3.405862899284087, "learning_rate": 1.984842407345222e-05, "loss": 1.329, "step": 15710 }, { "epoch": 0.06498163607390214, "grad_norm": 3.56806926046013, "learning_rate": 1.984819646149799e-05, "loss": 1.3547, "step": 15720 }, { "epoch": 0.06502297299252421, "grad_norm": 2.9244416633025234, "learning_rate": 1.984796868008377e-05, "loss": 1.3451, "step": 15730 }, { "epoch": 0.0650643099111463, "grad_norm": 3.6605360363216133, "learning_rate": 1.984774072921348e-05, "loss": 1.3121, "step": 15740 }, { "epoch": 0.06510564682976837, "grad_norm": 3.857082230167186, "learning_rate": 1.9847512608891046e-05, "loss": 1.3546, "step": 15750 }, { "epoch": 0.06514698374839044, "grad_norm": 3.20861184076794, "learning_rate": 1.9847284319120386e-05, "loss": 1.3384, "step": 15760 }, { "epoch": 0.06518832066701252, "grad_norm": 4.374471400346774, "learning_rate": 1.9847055859905434e-05, "loss": 1.3603, "step": 15770 }, { "epoch": 0.06522965758563459, "grad_norm": 3.558128348565767, "learning_rate": 1.984682723125012e-05, "loss": 1.3307, "step": 15780 }, { "epoch": 0.06527099450425666, "grad_norm": 3.740975960278226, "learning_rate": 1.984659843315838e-05, "loss": 1.3565, "step": 15790 }, { "epoch": 0.06531233142287875, "grad_norm": 3.0884764960813254, "learning_rate": 1.9846369465634146e-05, "loss": 1.3371, "step": 15800 }, { "epoch": 0.06535366834150082, "grad_norm": 3.0640927344766236, "learning_rate": 1.9846140328681363e-05, "loss": 1.3075, "step": 15810 }, { "epoch": 0.0653950052601229, "grad_norm": 3.6774626803339285, "learning_rate": 1.9845911022303973e-05, "loss": 1.3647, "step": 15820 }, { "epoch": 0.06543634217874497, "grad_norm": 2.9365431211187065, "learning_rate": 1.9845681546505915e-05, "loss": 1.3086, "step": 15830 }, { "epoch": 0.06547767909736704, "grad_norm": 4.305431264385432, "learning_rate": 1.9845451901291145e-05, "loss": 1.3348, "step": 15840 }, { "epoch": 0.06551901601598913, "grad_norm": 3.032533703820296, "learning_rate": 1.9845222086663615e-05, "loss": 1.3527, "step": 15850 }, { "epoch": 0.0655603529346112, "grad_norm": 3.3387798006802782, "learning_rate": 1.9844992102627273e-05, "loss": 1.3249, "step": 15860 }, { "epoch": 0.06560168985323327, "grad_norm": 3.3127539852292363, "learning_rate": 1.9844761949186083e-05, "loss": 1.3323, "step": 15870 }, { "epoch": 0.06564302677185535, "grad_norm": 3.4862694527591307, "learning_rate": 1.9844531626344003e-05, "loss": 1.3224, "step": 15880 }, { "epoch": 0.06568436369047742, "grad_norm": 3.215035735991411, "learning_rate": 1.9844301134104996e-05, "loss": 1.349, "step": 15890 }, { "epoch": 0.06572570060909949, "grad_norm": 3.331575129362213, "learning_rate": 1.9844070472473026e-05, "loss": 1.3297, "step": 15900 }, { "epoch": 0.06576703752772158, "grad_norm": 3.008247289759144, "learning_rate": 1.9843839641452062e-05, "loss": 1.368, "step": 15910 }, { "epoch": 0.06580837444634365, "grad_norm": 3.314210218559566, "learning_rate": 1.984360864104608e-05, "loss": 1.3318, "step": 15920 }, { "epoch": 0.06584971136496572, "grad_norm": 4.460695903393647, "learning_rate": 1.9843377471259056e-05, "loss": 1.363, "step": 15930 }, { "epoch": 0.0658910482835878, "grad_norm": 3.591421453277731, "learning_rate": 1.984314613209496e-05, "loss": 1.3428, "step": 15940 }, { "epoch": 0.06593238520220987, "grad_norm": 3.708262991759124, "learning_rate": 1.984291462355778e-05, "loss": 1.3564, "step": 15950 }, { "epoch": 0.06597372212083194, "grad_norm": 3.1432924399561903, "learning_rate": 1.9842682945651495e-05, "loss": 1.3455, "step": 15960 }, { "epoch": 0.06601505903945402, "grad_norm": 3.1161735074970216, "learning_rate": 1.9842451098380096e-05, "loss": 1.3514, "step": 15970 }, { "epoch": 0.0660563959580761, "grad_norm": 2.8632026794139875, "learning_rate": 1.984221908174757e-05, "loss": 1.3446, "step": 15980 }, { "epoch": 0.06609773287669818, "grad_norm": 3.934735081002441, "learning_rate": 1.9841986895757907e-05, "loss": 1.3298, "step": 15990 }, { "epoch": 0.06613906979532025, "grad_norm": 4.324584533198296, "learning_rate": 1.9841754540415102e-05, "loss": 1.3537, "step": 16000 }, { "epoch": 0.06618040671394232, "grad_norm": 3.6119231745645166, "learning_rate": 1.9841522015723164e-05, "loss": 1.3343, "step": 16010 }, { "epoch": 0.0662217436325644, "grad_norm": 3.1193540137010887, "learning_rate": 1.984128932168608e-05, "loss": 1.3752, "step": 16020 }, { "epoch": 0.06626308055118647, "grad_norm": 3.1351192552066762, "learning_rate": 1.984105645830786e-05, "loss": 1.3289, "step": 16030 }, { "epoch": 0.06630441746980854, "grad_norm": 3.5610679476787737, "learning_rate": 1.9840823425592512e-05, "loss": 1.3543, "step": 16040 }, { "epoch": 0.06634575438843063, "grad_norm": 3.5677181048737943, "learning_rate": 1.984059022354404e-05, "loss": 1.3483, "step": 16050 }, { "epoch": 0.0663870913070527, "grad_norm": 3.6239317747912385, "learning_rate": 1.9840356852166465e-05, "loss": 1.3511, "step": 16060 }, { "epoch": 0.06642842822567477, "grad_norm": 3.7952366513012636, "learning_rate": 1.9840123311463803e-05, "loss": 1.33, "step": 16070 }, { "epoch": 0.06646976514429685, "grad_norm": 3.345456868567261, "learning_rate": 1.9839889601440064e-05, "loss": 1.3226, "step": 16080 }, { "epoch": 0.06651110206291892, "grad_norm": 3.0646963347861194, "learning_rate": 1.9839655722099277e-05, "loss": 1.3142, "step": 16090 }, { "epoch": 0.06655243898154099, "grad_norm": 3.1796454557601956, "learning_rate": 1.9839421673445457e-05, "loss": 1.3363, "step": 16100 }, { "epoch": 0.06659377590016308, "grad_norm": 2.7604488860103453, "learning_rate": 1.9839187455482646e-05, "loss": 1.3453, "step": 16110 }, { "epoch": 0.06663511281878515, "grad_norm": 3.1473856534090885, "learning_rate": 1.9838953068214862e-05, "loss": 1.3146, "step": 16120 }, { "epoch": 0.06667644973740723, "grad_norm": 3.323185903362043, "learning_rate": 1.983871851164614e-05, "loss": 1.3013, "step": 16130 }, { "epoch": 0.0667177866560293, "grad_norm": 3.0139283592638315, "learning_rate": 1.9838483785780522e-05, "loss": 1.3761, "step": 16140 }, { "epoch": 0.06675912357465137, "grad_norm": 3.2332451066710535, "learning_rate": 1.9838248890622043e-05, "loss": 1.341, "step": 16150 }, { "epoch": 0.06680046049327346, "grad_norm": 3.0517814224449826, "learning_rate": 1.9838013826174745e-05, "loss": 1.3003, "step": 16160 }, { "epoch": 0.06684179741189553, "grad_norm": 4.164899927708623, "learning_rate": 1.983777859244267e-05, "loss": 1.3247, "step": 16170 }, { "epoch": 0.0668831343305176, "grad_norm": 2.992166731140292, "learning_rate": 1.983754318942987e-05, "loss": 1.2762, "step": 16180 }, { "epoch": 0.06692447124913968, "grad_norm": 3.774796705592006, "learning_rate": 1.98373076171404e-05, "loss": 1.3652, "step": 16190 }, { "epoch": 0.06696580816776175, "grad_norm": 3.6727461962589296, "learning_rate": 1.98370718755783e-05, "loss": 1.3433, "step": 16200 }, { "epoch": 0.06700714508638382, "grad_norm": 2.759830849179559, "learning_rate": 1.983683596474764e-05, "loss": 1.2917, "step": 16210 }, { "epoch": 0.0670484820050059, "grad_norm": 4.142227423223113, "learning_rate": 1.983659988465247e-05, "loss": 1.3287, "step": 16220 }, { "epoch": 0.06708981892362798, "grad_norm": 3.1361138389830305, "learning_rate": 1.9836363635296856e-05, "loss": 1.3526, "step": 16230 }, { "epoch": 0.06713115584225005, "grad_norm": 4.684718067129147, "learning_rate": 1.9836127216684864e-05, "loss": 1.3398, "step": 16240 }, { "epoch": 0.06717249276087213, "grad_norm": 3.9305271840952325, "learning_rate": 1.9835890628820564e-05, "loss": 1.3061, "step": 16250 }, { "epoch": 0.0672138296794942, "grad_norm": 3.6945496877183754, "learning_rate": 1.983565387170802e-05, "loss": 1.3046, "step": 16260 }, { "epoch": 0.06725516659811627, "grad_norm": 2.913410222912495, "learning_rate": 1.983541694535131e-05, "loss": 1.3439, "step": 16270 }, { "epoch": 0.06729650351673835, "grad_norm": 3.4482083464660143, "learning_rate": 1.9835179849754517e-05, "loss": 1.3282, "step": 16280 }, { "epoch": 0.06733784043536042, "grad_norm": 4.311927057182653, "learning_rate": 1.983494258492171e-05, "loss": 1.3156, "step": 16290 }, { "epoch": 0.06737917735398251, "grad_norm": 3.5718375129378015, "learning_rate": 1.9834705150856973e-05, "loss": 1.3088, "step": 16300 }, { "epoch": 0.06742051427260458, "grad_norm": 3.6069557479034704, "learning_rate": 1.98344675475644e-05, "loss": 1.3259, "step": 16310 }, { "epoch": 0.06746185119122665, "grad_norm": 3.2551197753818393, "learning_rate": 1.9834229775048076e-05, "loss": 1.3389, "step": 16320 }, { "epoch": 0.06750318810984873, "grad_norm": 3.4034457092581536, "learning_rate": 1.9833991833312086e-05, "loss": 1.3396, "step": 16330 }, { "epoch": 0.0675445250284708, "grad_norm": 2.9661916406960858, "learning_rate": 1.9833753722360534e-05, "loss": 1.2989, "step": 16340 }, { "epoch": 0.06758586194709287, "grad_norm": 5.279255386120152, "learning_rate": 1.983351544219751e-05, "loss": 1.3283, "step": 16350 }, { "epoch": 0.06762719886571496, "grad_norm": 3.8221920564477028, "learning_rate": 1.9833276992827117e-05, "loss": 1.2918, "step": 16360 }, { "epoch": 0.06766853578433703, "grad_norm": 3.2255728423614163, "learning_rate": 1.9833038374253456e-05, "loss": 1.327, "step": 16370 }, { "epoch": 0.0677098727029591, "grad_norm": 3.2355832791038464, "learning_rate": 1.9832799586480637e-05, "loss": 1.3204, "step": 16380 }, { "epoch": 0.06775120962158118, "grad_norm": 3.2467585154210155, "learning_rate": 1.9832560629512767e-05, "loss": 1.3338, "step": 16390 }, { "epoch": 0.06779254654020325, "grad_norm": 4.988429472619141, "learning_rate": 1.9832321503353954e-05, "loss": 1.3876, "step": 16400 }, { "epoch": 0.06783388345882532, "grad_norm": 3.446832489576954, "learning_rate": 1.9832082208008317e-05, "loss": 1.3233, "step": 16410 }, { "epoch": 0.0678752203774474, "grad_norm": 3.605129460226094, "learning_rate": 1.9831842743479975e-05, "loss": 1.3386, "step": 16420 }, { "epoch": 0.06791655729606948, "grad_norm": 4.275587631874799, "learning_rate": 1.9831603109773044e-05, "loss": 1.3613, "step": 16430 }, { "epoch": 0.06795789421469156, "grad_norm": 4.1009269376864665, "learning_rate": 1.983136330689165e-05, "loss": 1.3662, "step": 16440 }, { "epoch": 0.06799923113331363, "grad_norm": 2.9883770968491565, "learning_rate": 1.983112333483992e-05, "loss": 1.3304, "step": 16450 }, { "epoch": 0.0680405680519357, "grad_norm": 2.9575768154010085, "learning_rate": 1.983088319362198e-05, "loss": 1.3075, "step": 16460 }, { "epoch": 0.06808190497055779, "grad_norm": 3.1295830557654147, "learning_rate": 1.9830642883241967e-05, "loss": 1.3311, "step": 16470 }, { "epoch": 0.06812324188917986, "grad_norm": 3.6574005023751774, "learning_rate": 1.9830402403704008e-05, "loss": 1.2925, "step": 16480 }, { "epoch": 0.06816457880780193, "grad_norm": 3.080968609055009, "learning_rate": 1.9830161755012255e-05, "loss": 1.3156, "step": 16490 }, { "epoch": 0.06820591572642401, "grad_norm": 3.0330618488937553, "learning_rate": 1.9829920937170835e-05, "loss": 1.3314, "step": 16500 }, { "epoch": 0.06824725264504608, "grad_norm": 3.3116009543610665, "learning_rate": 1.9829679950183895e-05, "loss": 1.3034, "step": 16510 }, { "epoch": 0.06828858956366815, "grad_norm": 3.783202270094679, "learning_rate": 1.9829438794055584e-05, "loss": 1.3313, "step": 16520 }, { "epoch": 0.06832992648229023, "grad_norm": 3.969686559597743, "learning_rate": 1.9829197468790054e-05, "loss": 1.2911, "step": 16530 }, { "epoch": 0.0683712634009123, "grad_norm": 3.580580495241376, "learning_rate": 1.9828955974391455e-05, "loss": 1.2912, "step": 16540 }, { "epoch": 0.06841260031953438, "grad_norm": 2.9992747255679033, "learning_rate": 1.982871431086394e-05, "loss": 1.302, "step": 16550 }, { "epoch": 0.06845393723815646, "grad_norm": 3.5773285600866256, "learning_rate": 1.9828472478211673e-05, "loss": 1.334, "step": 16560 }, { "epoch": 0.06849527415677853, "grad_norm": 3.647217700504523, "learning_rate": 1.982823047643881e-05, "loss": 1.3137, "step": 16570 }, { "epoch": 0.0685366110754006, "grad_norm": 3.7929582820880428, "learning_rate": 1.982798830554952e-05, "loss": 1.3598, "step": 16580 }, { "epoch": 0.06857794799402268, "grad_norm": 3.3921261114157817, "learning_rate": 1.982774596554796e-05, "loss": 1.3159, "step": 16590 }, { "epoch": 0.06861928491264475, "grad_norm": 3.948788035775027, "learning_rate": 1.9827503456438314e-05, "loss": 1.3487, "step": 16600 }, { "epoch": 0.06866062183126684, "grad_norm": 3.4434803120058595, "learning_rate": 1.9827260778224744e-05, "loss": 1.3611, "step": 16610 }, { "epoch": 0.06870195874988891, "grad_norm": 3.345318165402726, "learning_rate": 1.9827017930911433e-05, "loss": 1.3214, "step": 16620 }, { "epoch": 0.06874329566851098, "grad_norm": 3.3238354847739675, "learning_rate": 1.9826774914502554e-05, "loss": 1.3415, "step": 16630 }, { "epoch": 0.06878463258713306, "grad_norm": 3.3341319261133773, "learning_rate": 1.9826531729002293e-05, "loss": 1.2814, "step": 16640 }, { "epoch": 0.06882596950575513, "grad_norm": 3.2557833923669937, "learning_rate": 1.982628837441483e-05, "loss": 1.3692, "step": 16650 }, { "epoch": 0.0688673064243772, "grad_norm": 2.7035167555145545, "learning_rate": 1.9826044850744358e-05, "loss": 1.3045, "step": 16660 }, { "epoch": 0.06890864334299929, "grad_norm": 3.2563919103242167, "learning_rate": 1.9825801157995065e-05, "loss": 1.2807, "step": 16670 }, { "epoch": 0.06894998026162136, "grad_norm": 3.369904458766467, "learning_rate": 1.9825557296171143e-05, "loss": 1.2897, "step": 16680 }, { "epoch": 0.06899131718024343, "grad_norm": 3.064586151511127, "learning_rate": 1.982531326527679e-05, "loss": 1.3332, "step": 16690 }, { "epoch": 0.06903265409886551, "grad_norm": 3.0263447019462753, "learning_rate": 1.9825069065316204e-05, "loss": 1.2825, "step": 16700 }, { "epoch": 0.06907399101748758, "grad_norm": 4.065735171694178, "learning_rate": 1.9824824696293584e-05, "loss": 1.2698, "step": 16710 }, { "epoch": 0.06911532793610965, "grad_norm": 3.4683529768298262, "learning_rate": 1.9824580158213142e-05, "loss": 1.3135, "step": 16720 }, { "epoch": 0.06915666485473174, "grad_norm": 2.955425396507506, "learning_rate": 1.9824335451079083e-05, "loss": 1.3571, "step": 16730 }, { "epoch": 0.0691980017733538, "grad_norm": 3.099433278967918, "learning_rate": 1.982409057489561e-05, "loss": 1.3273, "step": 16740 }, { "epoch": 0.06923933869197589, "grad_norm": 2.8197838698402093, "learning_rate": 1.982384552966695e-05, "loss": 1.3158, "step": 16750 }, { "epoch": 0.06928067561059796, "grad_norm": 3.422610473722703, "learning_rate": 1.982360031539731e-05, "loss": 1.3072, "step": 16760 }, { "epoch": 0.06932201252922003, "grad_norm": 3.654024548878873, "learning_rate": 1.9823354932090913e-05, "loss": 1.3174, "step": 16770 }, { "epoch": 0.06936334944784212, "grad_norm": 2.69963783380391, "learning_rate": 1.982310937975198e-05, "loss": 1.301, "step": 16780 }, { "epoch": 0.06940468636646419, "grad_norm": 3.119208859314543, "learning_rate": 1.9822863658384736e-05, "loss": 1.2915, "step": 16790 }, { "epoch": 0.06944602328508626, "grad_norm": 3.983735003830943, "learning_rate": 1.982261776799341e-05, "loss": 1.3029, "step": 16800 }, { "epoch": 0.06948736020370834, "grad_norm": 2.8833424065520292, "learning_rate": 1.9822371708582236e-05, "loss": 1.309, "step": 16810 }, { "epoch": 0.06952869712233041, "grad_norm": 3.5651047449269138, "learning_rate": 1.9822125480155442e-05, "loss": 1.3408, "step": 16820 }, { "epoch": 0.06957003404095248, "grad_norm": 2.9084817767691344, "learning_rate": 1.982187908271727e-05, "loss": 1.3217, "step": 16830 }, { "epoch": 0.06961137095957456, "grad_norm": 3.38644936958733, "learning_rate": 1.982163251627196e-05, "loss": 1.3094, "step": 16840 }, { "epoch": 0.06965270787819663, "grad_norm": 3.7040548837026384, "learning_rate": 1.9821385780823748e-05, "loss": 1.2973, "step": 16850 }, { "epoch": 0.0696940447968187, "grad_norm": 3.1936456120357435, "learning_rate": 1.982113887637689e-05, "loss": 1.3362, "step": 16860 }, { "epoch": 0.06973538171544079, "grad_norm": 3.676850292318552, "learning_rate": 1.9820891802935623e-05, "loss": 1.2947, "step": 16870 }, { "epoch": 0.06977671863406286, "grad_norm": 3.484703646839946, "learning_rate": 1.9820644560504207e-05, "loss": 1.2488, "step": 16880 }, { "epoch": 0.06981805555268493, "grad_norm": 3.4336203726483654, "learning_rate": 1.9820397149086892e-05, "loss": 1.3372, "step": 16890 }, { "epoch": 0.06985939247130701, "grad_norm": 3.4895422854739437, "learning_rate": 1.9820149568687937e-05, "loss": 1.3434, "step": 16900 }, { "epoch": 0.06990072938992908, "grad_norm": 3.5294671703855545, "learning_rate": 1.98199018193116e-05, "loss": 1.3299, "step": 16910 }, { "epoch": 0.06994206630855117, "grad_norm": 4.136918538506862, "learning_rate": 1.9819653900962153e-05, "loss": 1.3082, "step": 16920 }, { "epoch": 0.06998340322717324, "grad_norm": 3.1337022113010975, "learning_rate": 1.981940581364385e-05, "loss": 1.3131, "step": 16930 }, { "epoch": 0.07002474014579531, "grad_norm": 3.3461906954820115, "learning_rate": 1.9819157557360965e-05, "loss": 1.3533, "step": 16940 }, { "epoch": 0.07006607706441739, "grad_norm": 2.945652884357033, "learning_rate": 1.981890913211777e-05, "loss": 1.333, "step": 16950 }, { "epoch": 0.07010741398303946, "grad_norm": 3.8269320260288375, "learning_rate": 1.981866053791854e-05, "loss": 1.3021, "step": 16960 }, { "epoch": 0.07014875090166153, "grad_norm": 3.373539442185096, "learning_rate": 1.9818411774767555e-05, "loss": 1.3621, "step": 16970 }, { "epoch": 0.07019008782028362, "grad_norm": 3.1789158671874724, "learning_rate": 1.9818162842669087e-05, "loss": 1.3357, "step": 16980 }, { "epoch": 0.07023142473890569, "grad_norm": 3.379334004765994, "learning_rate": 1.981791374162743e-05, "loss": 1.3328, "step": 16990 }, { "epoch": 0.07027276165752776, "grad_norm": 3.3263838295875794, "learning_rate": 1.981766447164686e-05, "loss": 1.3177, "step": 17000 }, { "epoch": 0.07031409857614984, "grad_norm": 3.4846748393089744, "learning_rate": 1.9817415032731676e-05, "loss": 1.3088, "step": 17010 }, { "epoch": 0.07035543549477191, "grad_norm": 3.1166768241545526, "learning_rate": 1.9817165424886165e-05, "loss": 1.3168, "step": 17020 }, { "epoch": 0.07039677241339398, "grad_norm": 3.880062668262267, "learning_rate": 1.9816915648114623e-05, "loss": 1.3071, "step": 17030 }, { "epoch": 0.07043810933201607, "grad_norm": 3.511807529042466, "learning_rate": 1.9816665702421344e-05, "loss": 1.3409, "step": 17040 }, { "epoch": 0.07047944625063814, "grad_norm": 3.1993990782018256, "learning_rate": 1.9816415587810636e-05, "loss": 1.2918, "step": 17050 }, { "epoch": 0.07052078316926022, "grad_norm": 2.9930648253290943, "learning_rate": 1.98161653042868e-05, "loss": 1.2926, "step": 17060 }, { "epoch": 0.07056212008788229, "grad_norm": 3.0399828063698116, "learning_rate": 1.981591485185414e-05, "loss": 1.3183, "step": 17070 }, { "epoch": 0.07060345700650436, "grad_norm": 3.233401985951665, "learning_rate": 1.981566423051697e-05, "loss": 1.3262, "step": 17080 }, { "epoch": 0.07064479392512645, "grad_norm": 2.891852783093535, "learning_rate": 1.9815413440279597e-05, "loss": 1.2882, "step": 17090 }, { "epoch": 0.07068613084374852, "grad_norm": 2.861582978848217, "learning_rate": 1.9815162481146345e-05, "loss": 1.3417, "step": 17100 }, { "epoch": 0.07072746776237059, "grad_norm": 3.657647746254238, "learning_rate": 1.981491135312152e-05, "loss": 1.3239, "step": 17110 }, { "epoch": 0.07076880468099267, "grad_norm": 3.422931095645904, "learning_rate": 1.9814660056209454e-05, "loss": 1.3471, "step": 17120 }, { "epoch": 0.07081014159961474, "grad_norm": 3.233056855771981, "learning_rate": 1.9814408590414466e-05, "loss": 1.342, "step": 17130 }, { "epoch": 0.07085147851823681, "grad_norm": 2.8978002673834706, "learning_rate": 1.9814156955740885e-05, "loss": 1.3526, "step": 17140 }, { "epoch": 0.0708928154368589, "grad_norm": 3.1044166179845227, "learning_rate": 1.981390515219304e-05, "loss": 1.3338, "step": 17150 }, { "epoch": 0.07093415235548096, "grad_norm": 3.4443852171387963, "learning_rate": 1.9813653179775263e-05, "loss": 1.2798, "step": 17160 }, { "epoch": 0.07097548927410303, "grad_norm": 3.4197392236572925, "learning_rate": 1.9813401038491893e-05, "loss": 1.3278, "step": 17170 }, { "epoch": 0.07101682619272512, "grad_norm": 2.8656903185698868, "learning_rate": 1.9813148728347263e-05, "loss": 1.3044, "step": 17180 }, { "epoch": 0.07105816311134719, "grad_norm": 3.4951853787951457, "learning_rate": 1.981289624934572e-05, "loss": 1.3731, "step": 17190 }, { "epoch": 0.07109950002996926, "grad_norm": 3.0021246003594646, "learning_rate": 1.981264360149161e-05, "loss": 1.2953, "step": 17200 }, { "epoch": 0.07114083694859134, "grad_norm": 3.078524102988392, "learning_rate": 1.981239078478927e-05, "loss": 1.3536, "step": 17210 }, { "epoch": 0.07118217386721341, "grad_norm": 3.6935463858453805, "learning_rate": 1.981213779924306e-05, "loss": 1.3386, "step": 17220 }, { "epoch": 0.0712235107858355, "grad_norm": 3.2508773899525036, "learning_rate": 1.9811884644857332e-05, "loss": 1.2929, "step": 17230 }, { "epoch": 0.07126484770445757, "grad_norm": 4.11272659457257, "learning_rate": 1.9811631321636438e-05, "loss": 1.3376, "step": 17240 }, { "epoch": 0.07130618462307964, "grad_norm": 3.3031386708100734, "learning_rate": 1.9811377829584738e-05, "loss": 1.3078, "step": 17250 }, { "epoch": 0.07134752154170172, "grad_norm": 3.721567213304264, "learning_rate": 1.9811124168706598e-05, "loss": 1.3135, "step": 17260 }, { "epoch": 0.07138885846032379, "grad_norm": 3.3077256345455712, "learning_rate": 1.981087033900638e-05, "loss": 1.2727, "step": 17270 }, { "epoch": 0.07143019537894586, "grad_norm": 3.1365945025331716, "learning_rate": 1.9810616340488448e-05, "loss": 1.3082, "step": 17280 }, { "epoch": 0.07147153229756795, "grad_norm": 3.0095750827782095, "learning_rate": 1.981036217315718e-05, "loss": 1.3422, "step": 17290 }, { "epoch": 0.07151286921619002, "grad_norm": 3.879458456856346, "learning_rate": 1.9810107837016943e-05, "loss": 1.3444, "step": 17300 }, { "epoch": 0.07155420613481209, "grad_norm": 3.5419514306232145, "learning_rate": 1.9809853332072118e-05, "loss": 1.3495, "step": 17310 }, { "epoch": 0.07159554305343417, "grad_norm": 3.309317975252579, "learning_rate": 1.9809598658327084e-05, "loss": 1.3249, "step": 17320 }, { "epoch": 0.07163687997205624, "grad_norm": 3.0972043541501484, "learning_rate": 1.9809343815786218e-05, "loss": 1.3289, "step": 17330 }, { "epoch": 0.07167821689067831, "grad_norm": 3.278786808325931, "learning_rate": 1.9809088804453913e-05, "loss": 1.2998, "step": 17340 }, { "epoch": 0.0717195538093004, "grad_norm": 4.238835899003122, "learning_rate": 1.9808833624334547e-05, "loss": 1.3311, "step": 17350 }, { "epoch": 0.07176089072792247, "grad_norm": 2.92277331992324, "learning_rate": 1.980857827543252e-05, "loss": 1.2915, "step": 17360 }, { "epoch": 0.07180222764654455, "grad_norm": 3.376501582542177, "learning_rate": 1.9808322757752227e-05, "loss": 1.3324, "step": 17370 }, { "epoch": 0.07184356456516662, "grad_norm": 2.9710666955071754, "learning_rate": 1.9808067071298057e-05, "loss": 1.28, "step": 17380 }, { "epoch": 0.07188490148378869, "grad_norm": 3.651318543567687, "learning_rate": 1.9807811216074412e-05, "loss": 1.2827, "step": 17390 }, { "epoch": 0.07192623840241077, "grad_norm": 4.002270792783564, "learning_rate": 1.9807555192085697e-05, "loss": 1.2869, "step": 17400 }, { "epoch": 0.07196757532103285, "grad_norm": 3.7494249341770027, "learning_rate": 1.9807298999336316e-05, "loss": 1.368, "step": 17410 }, { "epoch": 0.07200891223965492, "grad_norm": 3.533387599209301, "learning_rate": 1.9807042637830677e-05, "loss": 1.3093, "step": 17420 }, { "epoch": 0.072050249158277, "grad_norm": 3.3204434550557096, "learning_rate": 1.980678610757319e-05, "loss": 1.3557, "step": 17430 }, { "epoch": 0.07209158607689907, "grad_norm": 3.1416247569905935, "learning_rate": 1.9806529408568274e-05, "loss": 1.2784, "step": 17440 }, { "epoch": 0.07213292299552114, "grad_norm": 3.117796981445201, "learning_rate": 1.980627254082034e-05, "loss": 1.2882, "step": 17450 }, { "epoch": 0.07217425991414322, "grad_norm": 3.13339952835746, "learning_rate": 1.9806015504333812e-05, "loss": 1.3144, "step": 17460 }, { "epoch": 0.0722155968327653, "grad_norm": 3.7609938207052074, "learning_rate": 1.9805758299113115e-05, "loss": 1.329, "step": 17470 }, { "epoch": 0.07225693375138736, "grad_norm": 3.1345386262954533, "learning_rate": 1.980550092516267e-05, "loss": 1.3049, "step": 17480 }, { "epoch": 0.07229827067000945, "grad_norm": 3.7469777473015573, "learning_rate": 1.98052433824869e-05, "loss": 1.3096, "step": 17490 }, { "epoch": 0.07233960758863152, "grad_norm": 2.9933582431757073, "learning_rate": 1.9804985671090252e-05, "loss": 1.3274, "step": 17500 }, { "epoch": 0.07238094450725359, "grad_norm": 3.182564552885168, "learning_rate": 1.980472779097715e-05, "loss": 1.2891, "step": 17510 }, { "epoch": 0.07242228142587567, "grad_norm": 3.8813847623233744, "learning_rate": 1.9804469742152035e-05, "loss": 1.2941, "step": 17520 }, { "epoch": 0.07246361834449774, "grad_norm": 3.388309470760254, "learning_rate": 1.9804211524619345e-05, "loss": 1.3129, "step": 17530 }, { "epoch": 0.07250495526311983, "grad_norm": 3.0370861904609305, "learning_rate": 1.9803953138383523e-05, "loss": 1.2816, "step": 17540 }, { "epoch": 0.0725462921817419, "grad_norm": 3.255739551897295, "learning_rate": 1.980369458344902e-05, "loss": 1.3048, "step": 17550 }, { "epoch": 0.07258762910036397, "grad_norm": 3.2197652995956325, "learning_rate": 1.9803435859820278e-05, "loss": 1.3401, "step": 17560 }, { "epoch": 0.07262896601898605, "grad_norm": 3.144102345108867, "learning_rate": 1.9803176967501752e-05, "loss": 1.3093, "step": 17570 }, { "epoch": 0.07267030293760812, "grad_norm": 3.255959441623639, "learning_rate": 1.98029179064979e-05, "loss": 1.3107, "step": 17580 }, { "epoch": 0.07271163985623019, "grad_norm": 3.300836279803978, "learning_rate": 1.9802658676813177e-05, "loss": 1.2793, "step": 17590 }, { "epoch": 0.07275297677485228, "grad_norm": 3.1982961633057307, "learning_rate": 1.980239927845204e-05, "loss": 1.293, "step": 17600 }, { "epoch": 0.07279431369347435, "grad_norm": 2.7735162621143066, "learning_rate": 1.980213971141896e-05, "loss": 1.2829, "step": 17610 }, { "epoch": 0.07283565061209642, "grad_norm": 3.1127853241409524, "learning_rate": 1.9801879975718397e-05, "loss": 1.3038, "step": 17620 }, { "epoch": 0.0728769875307185, "grad_norm": 4.4882400102517, "learning_rate": 1.9801620071354823e-05, "loss": 1.3252, "step": 17630 }, { "epoch": 0.07291832444934057, "grad_norm": 3.0527302986554474, "learning_rate": 1.980135999833271e-05, "loss": 1.3324, "step": 17640 }, { "epoch": 0.07295966136796264, "grad_norm": 2.6769380820391637, "learning_rate": 1.9801099756656534e-05, "loss": 1.3472, "step": 17650 }, { "epoch": 0.07300099828658473, "grad_norm": 2.8240010897597796, "learning_rate": 1.980083934633077e-05, "loss": 1.3072, "step": 17660 }, { "epoch": 0.0730423352052068, "grad_norm": 3.0678813341238387, "learning_rate": 1.9800578767359905e-05, "loss": 1.3385, "step": 17670 }, { "epoch": 0.07308367212382888, "grad_norm": 3.9303518801684056, "learning_rate": 1.9800318019748414e-05, "loss": 1.3024, "step": 17680 }, { "epoch": 0.07312500904245095, "grad_norm": 3.4138877073872367, "learning_rate": 1.980005710350079e-05, "loss": 1.3219, "step": 17690 }, { "epoch": 0.07316634596107302, "grad_norm": 2.9612831417650574, "learning_rate": 1.9799796018621523e-05, "loss": 1.2972, "step": 17700 }, { "epoch": 0.0732076828796951, "grad_norm": 3.5999340523181167, "learning_rate": 1.9799534765115106e-05, "loss": 1.2879, "step": 17710 }, { "epoch": 0.07324901979831717, "grad_norm": 3.243493407552293, "learning_rate": 1.9799273342986027e-05, "loss": 1.3312, "step": 17720 }, { "epoch": 0.07329035671693925, "grad_norm": 3.1251098781902678, "learning_rate": 1.979901175223879e-05, "loss": 1.283, "step": 17730 }, { "epoch": 0.07333169363556133, "grad_norm": 3.016302925303861, "learning_rate": 1.97987499928779e-05, "loss": 1.2653, "step": 17740 }, { "epoch": 0.0733730305541834, "grad_norm": 3.0676270506907373, "learning_rate": 1.9798488064907854e-05, "loss": 1.3463, "step": 17750 }, { "epoch": 0.07341436747280547, "grad_norm": 3.33093266064901, "learning_rate": 1.9798225968333162e-05, "loss": 1.2925, "step": 17760 }, { "epoch": 0.07345570439142755, "grad_norm": 2.9945442082263707, "learning_rate": 1.9797963703158338e-05, "loss": 1.3073, "step": 17770 }, { "epoch": 0.07349704131004962, "grad_norm": 2.7596863662002145, "learning_rate": 1.9797701269387886e-05, "loss": 1.3014, "step": 17780 }, { "epoch": 0.0735383782286717, "grad_norm": 3.533092084537244, "learning_rate": 1.979743866702633e-05, "loss": 1.2791, "step": 17790 }, { "epoch": 0.07357971514729378, "grad_norm": 4.416928923354257, "learning_rate": 1.9797175896078183e-05, "loss": 1.3187, "step": 17800 }, { "epoch": 0.07362105206591585, "grad_norm": 3.4889626149350983, "learning_rate": 1.9796912956547968e-05, "loss": 1.3279, "step": 17810 }, { "epoch": 0.07366238898453792, "grad_norm": 2.8612849638284636, "learning_rate": 1.979664984844021e-05, "loss": 1.286, "step": 17820 }, { "epoch": 0.07370372590316, "grad_norm": 3.48443371509397, "learning_rate": 1.9796386571759437e-05, "loss": 1.3143, "step": 17830 }, { "epoch": 0.07374506282178207, "grad_norm": 3.744181699697567, "learning_rate": 1.979612312651018e-05, "loss": 1.3133, "step": 17840 }, { "epoch": 0.07378639974040416, "grad_norm": 3.0686519126186056, "learning_rate": 1.9795859512696974e-05, "loss": 1.3136, "step": 17850 }, { "epoch": 0.07382773665902623, "grad_norm": 3.067495520118401, "learning_rate": 1.9795595730324347e-05, "loss": 1.3026, "step": 17860 }, { "epoch": 0.0738690735776483, "grad_norm": 3.39693600568867, "learning_rate": 1.9795331779396846e-05, "loss": 1.3045, "step": 17870 }, { "epoch": 0.07391041049627038, "grad_norm": 3.593857826641827, "learning_rate": 1.9795067659919008e-05, "loss": 1.3278, "step": 17880 }, { "epoch": 0.07395174741489245, "grad_norm": 3.5924096708123985, "learning_rate": 1.9794803371895383e-05, "loss": 1.2578, "step": 17890 }, { "epoch": 0.07399308433351452, "grad_norm": 3.1548306467737546, "learning_rate": 1.9794538915330514e-05, "loss": 1.3145, "step": 17900 }, { "epoch": 0.0740344212521366, "grad_norm": 3.3874308529909736, "learning_rate": 1.979427429022895e-05, "loss": 1.2635, "step": 17910 }, { "epoch": 0.07407575817075868, "grad_norm": 3.574121943476516, "learning_rate": 1.979400949659525e-05, "loss": 1.3333, "step": 17920 }, { "epoch": 0.07411709508938075, "grad_norm": 3.277207731814223, "learning_rate": 1.9793744534433968e-05, "loss": 1.3438, "step": 17930 }, { "epoch": 0.07415843200800283, "grad_norm": 3.8968392073501152, "learning_rate": 1.979347940374966e-05, "loss": 1.313, "step": 17940 }, { "epoch": 0.0741997689266249, "grad_norm": 3.083444751622717, "learning_rate": 1.9793214104546895e-05, "loss": 1.3146, "step": 17950 }, { "epoch": 0.07424110584524697, "grad_norm": 3.2007909625237208, "learning_rate": 1.9792948636830235e-05, "loss": 1.2767, "step": 17960 }, { "epoch": 0.07428244276386906, "grad_norm": 3.475285388439305, "learning_rate": 1.979268300060424e-05, "loss": 1.3113, "step": 17970 }, { "epoch": 0.07432377968249113, "grad_norm": 3.0728058276568064, "learning_rate": 1.9792417195873496e-05, "loss": 1.2625, "step": 17980 }, { "epoch": 0.07436511660111321, "grad_norm": 3.1462512846060595, "learning_rate": 1.9792151222642565e-05, "loss": 1.2669, "step": 17990 }, { "epoch": 0.07440645351973528, "grad_norm": 3.451097468244895, "learning_rate": 1.9791885080916026e-05, "loss": 1.3536, "step": 18000 }, { "epoch": 0.07444779043835735, "grad_norm": 2.9123675277858805, "learning_rate": 1.979161877069846e-05, "loss": 1.2859, "step": 18010 }, { "epoch": 0.07448912735697943, "grad_norm": 3.6048473916322905, "learning_rate": 1.9791352291994453e-05, "loss": 1.2868, "step": 18020 }, { "epoch": 0.0745304642756015, "grad_norm": 2.877236521609129, "learning_rate": 1.9791085644808588e-05, "loss": 1.3201, "step": 18030 }, { "epoch": 0.07457180119422357, "grad_norm": 3.346518920497629, "learning_rate": 1.9790818829145447e-05, "loss": 1.2914, "step": 18040 }, { "epoch": 0.07461313811284566, "grad_norm": 3.5346578512252416, "learning_rate": 1.979055184500963e-05, "loss": 1.2668, "step": 18050 }, { "epoch": 0.07465447503146773, "grad_norm": 3.044418176627761, "learning_rate": 1.9790284692405723e-05, "loss": 1.2722, "step": 18060 }, { "epoch": 0.0746958119500898, "grad_norm": 3.4743539555195206, "learning_rate": 1.979001737133833e-05, "loss": 1.3093, "step": 18070 }, { "epoch": 0.07473714886871188, "grad_norm": 3.152139034733342, "learning_rate": 1.978974988181205e-05, "loss": 1.3282, "step": 18080 }, { "epoch": 0.07477848578733395, "grad_norm": 3.844127465592703, "learning_rate": 1.978948222383148e-05, "loss": 1.3, "step": 18090 }, { "epoch": 0.07481982270595602, "grad_norm": 3.312821234732469, "learning_rate": 1.9789214397401233e-05, "loss": 1.3007, "step": 18100 }, { "epoch": 0.07486115962457811, "grad_norm": 2.587430778686902, "learning_rate": 1.978894640252591e-05, "loss": 1.2712, "step": 18110 }, { "epoch": 0.07490249654320018, "grad_norm": 3.6471232765625867, "learning_rate": 1.978867823921013e-05, "loss": 1.3255, "step": 18120 }, { "epoch": 0.07494383346182225, "grad_norm": 3.4191203190660966, "learning_rate": 1.9788409907458502e-05, "loss": 1.2588, "step": 18130 }, { "epoch": 0.07498517038044433, "grad_norm": 3.3115429776293466, "learning_rate": 1.9788141407275643e-05, "loss": 1.2923, "step": 18140 }, { "epoch": 0.0750265072990664, "grad_norm": 3.360103375582472, "learning_rate": 1.9787872738666182e-05, "loss": 1.3274, "step": 18150 }, { "epoch": 0.07506784421768849, "grad_norm": 4.4310427169860205, "learning_rate": 1.978760390163473e-05, "loss": 1.3237, "step": 18160 }, { "epoch": 0.07510918113631056, "grad_norm": 2.794577806503562, "learning_rate": 1.9787334896185916e-05, "loss": 1.3095, "step": 18170 }, { "epoch": 0.07515051805493263, "grad_norm": 3.3585888834525384, "learning_rate": 1.9787065722324374e-05, "loss": 1.3199, "step": 18180 }, { "epoch": 0.07519185497355471, "grad_norm": 3.060067450654041, "learning_rate": 1.9786796380054733e-05, "loss": 1.2532, "step": 18190 }, { "epoch": 0.07523319189217678, "grad_norm": 3.124345955834219, "learning_rate": 1.978652686938163e-05, "loss": 1.2875, "step": 18200 }, { "epoch": 0.07527452881079885, "grad_norm": 3.6658880497970254, "learning_rate": 1.9786257190309695e-05, "loss": 1.301, "step": 18210 }, { "epoch": 0.07531586572942094, "grad_norm": 3.7099075204316088, "learning_rate": 1.9785987342843573e-05, "loss": 1.343, "step": 18220 }, { "epoch": 0.075357202648043, "grad_norm": 3.26000011118403, "learning_rate": 1.9785717326987914e-05, "loss": 1.2782, "step": 18230 }, { "epoch": 0.07539853956666508, "grad_norm": 3.133092069419479, "learning_rate": 1.978544714274735e-05, "loss": 1.3026, "step": 18240 }, { "epoch": 0.07543987648528716, "grad_norm": 3.1564032343316453, "learning_rate": 1.9785176790126542e-05, "loss": 1.3207, "step": 18250 }, { "epoch": 0.07548121340390923, "grad_norm": 3.284115672721678, "learning_rate": 1.9784906269130137e-05, "loss": 1.3117, "step": 18260 }, { "epoch": 0.0755225503225313, "grad_norm": 3.1339318140995167, "learning_rate": 1.9784635579762793e-05, "loss": 1.305, "step": 18270 }, { "epoch": 0.07556388724115339, "grad_norm": 4.379220492841367, "learning_rate": 1.9784364722029165e-05, "loss": 1.3408, "step": 18280 }, { "epoch": 0.07560522415977546, "grad_norm": 3.9318661880011945, "learning_rate": 1.978409369593391e-05, "loss": 1.3053, "step": 18290 }, { "epoch": 0.07564656107839754, "grad_norm": 3.3417710306838253, "learning_rate": 1.97838225014817e-05, "loss": 1.2781, "step": 18300 }, { "epoch": 0.07568789799701961, "grad_norm": 3.54702678079395, "learning_rate": 1.9783551138677197e-05, "loss": 1.2998, "step": 18310 }, { "epoch": 0.07572923491564168, "grad_norm": 3.4741727106428946, "learning_rate": 1.978327960752507e-05, "loss": 1.3003, "step": 18320 }, { "epoch": 0.07577057183426376, "grad_norm": 3.2924386529485563, "learning_rate": 1.9783007908029995e-05, "loss": 1.3095, "step": 18330 }, { "epoch": 0.07581190875288583, "grad_norm": 3.516437725795232, "learning_rate": 1.978273604019664e-05, "loss": 1.3088, "step": 18340 }, { "epoch": 0.0758532456715079, "grad_norm": 3.5342954273687193, "learning_rate": 1.9782464004029692e-05, "loss": 1.3035, "step": 18350 }, { "epoch": 0.07589458259012999, "grad_norm": 3.3786994777959523, "learning_rate": 1.9782191799533824e-05, "loss": 1.315, "step": 18360 }, { "epoch": 0.07593591950875206, "grad_norm": 3.16392973516022, "learning_rate": 1.9781919426713725e-05, "loss": 1.3363, "step": 18370 }, { "epoch": 0.07597725642737413, "grad_norm": 3.5807733324507693, "learning_rate": 1.9781646885574078e-05, "loss": 1.313, "step": 18380 }, { "epoch": 0.07601859334599621, "grad_norm": 2.911047627936505, "learning_rate": 1.978137417611958e-05, "loss": 1.2943, "step": 18390 }, { "epoch": 0.07605993026461828, "grad_norm": 3.519244637721149, "learning_rate": 1.9781101298354913e-05, "loss": 1.3013, "step": 18400 }, { "epoch": 0.07610126718324035, "grad_norm": 2.80355546975897, "learning_rate": 1.9780828252284778e-05, "loss": 1.319, "step": 18410 }, { "epoch": 0.07614260410186244, "grad_norm": 3.272390505391555, "learning_rate": 1.9780555037913874e-05, "loss": 1.3139, "step": 18420 }, { "epoch": 0.07618394102048451, "grad_norm": 4.373312173753661, "learning_rate": 1.9780281655246903e-05, "loss": 1.332, "step": 18430 }, { "epoch": 0.07622527793910659, "grad_norm": 3.4593509090352828, "learning_rate": 1.9780008104288566e-05, "loss": 1.3275, "step": 18440 }, { "epoch": 0.07626661485772866, "grad_norm": 3.988419562725868, "learning_rate": 1.9779734385043572e-05, "loss": 1.2876, "step": 18450 }, { "epoch": 0.07630795177635073, "grad_norm": 3.1901407057893127, "learning_rate": 1.9779460497516633e-05, "loss": 1.3077, "step": 18460 }, { "epoch": 0.07634928869497282, "grad_norm": 3.392493661451085, "learning_rate": 1.9779186441712456e-05, "loss": 1.2853, "step": 18470 }, { "epoch": 0.07639062561359489, "grad_norm": 3.2188126520970126, "learning_rate": 1.9778912217635762e-05, "loss": 1.3166, "step": 18480 }, { "epoch": 0.07643196253221696, "grad_norm": 3.207853073897323, "learning_rate": 1.9778637825291267e-05, "loss": 1.2522, "step": 18490 }, { "epoch": 0.07647329945083904, "grad_norm": 3.614534499504545, "learning_rate": 1.9778363264683694e-05, "loss": 1.2857, "step": 18500 }, { "epoch": 0.07651463636946111, "grad_norm": 3.1137575271670634, "learning_rate": 1.9778088535817765e-05, "loss": 1.322, "step": 18510 }, { "epoch": 0.07655597328808318, "grad_norm": 3.155733653317413, "learning_rate": 1.977781363869821e-05, "loss": 1.2775, "step": 18520 }, { "epoch": 0.07659731020670527, "grad_norm": 4.553902503352338, "learning_rate": 1.9777538573329757e-05, "loss": 1.3102, "step": 18530 }, { "epoch": 0.07663864712532734, "grad_norm": 3.421736022180327, "learning_rate": 1.9777263339717143e-05, "loss": 1.3101, "step": 18540 }, { "epoch": 0.0766799840439494, "grad_norm": 3.0688727373371245, "learning_rate": 1.97769879378651e-05, "loss": 1.3116, "step": 18550 }, { "epoch": 0.07672132096257149, "grad_norm": 3.7016289513150173, "learning_rate": 1.977671236777837e-05, "loss": 1.3266, "step": 18560 }, { "epoch": 0.07676265788119356, "grad_norm": 2.918955840628811, "learning_rate": 1.977643662946169e-05, "loss": 1.335, "step": 18570 }, { "epoch": 0.07680399479981563, "grad_norm": 2.710819215083447, "learning_rate": 1.9776160722919808e-05, "loss": 1.3241, "step": 18580 }, { "epoch": 0.07684533171843771, "grad_norm": 4.690835156163024, "learning_rate": 1.9775884648157473e-05, "loss": 1.3112, "step": 18590 }, { "epoch": 0.07688666863705979, "grad_norm": 2.8651231551210685, "learning_rate": 1.9775608405179433e-05, "loss": 1.2753, "step": 18600 }, { "epoch": 0.07692800555568187, "grad_norm": 3.440907539318232, "learning_rate": 1.9775331993990445e-05, "loss": 1.3065, "step": 18610 }, { "epoch": 0.07696934247430394, "grad_norm": 3.284791911253238, "learning_rate": 1.977505541459526e-05, "loss": 1.2491, "step": 18620 }, { "epoch": 0.07701067939292601, "grad_norm": 3.9127411714280815, "learning_rate": 1.977477866699864e-05, "loss": 1.3486, "step": 18630 }, { "epoch": 0.0770520163115481, "grad_norm": 3.6832173528899292, "learning_rate": 1.9774501751205343e-05, "loss": 1.26, "step": 18640 }, { "epoch": 0.07709335323017016, "grad_norm": 3.037263148140778, "learning_rate": 1.9774224667220145e-05, "loss": 1.3066, "step": 18650 }, { "epoch": 0.07713469014879223, "grad_norm": 3.9474254309978978, "learning_rate": 1.97739474150478e-05, "loss": 1.3526, "step": 18660 }, { "epoch": 0.07717602706741432, "grad_norm": 3.2632360352472087, "learning_rate": 1.977366999469309e-05, "loss": 1.242, "step": 18670 }, { "epoch": 0.07721736398603639, "grad_norm": 3.398609819663782, "learning_rate": 1.977339240616078e-05, "loss": 1.3083, "step": 18680 }, { "epoch": 0.07725870090465846, "grad_norm": 2.8558120813505274, "learning_rate": 1.977311464945565e-05, "loss": 1.3002, "step": 18690 }, { "epoch": 0.07730003782328054, "grad_norm": 3.3599778107706553, "learning_rate": 1.9772836724582483e-05, "loss": 1.3299, "step": 18700 }, { "epoch": 0.07734137474190261, "grad_norm": 3.9628018743002658, "learning_rate": 1.9772558631546054e-05, "loss": 1.3115, "step": 18710 }, { "epoch": 0.07738271166052468, "grad_norm": 3.239048590526076, "learning_rate": 1.9772280370351155e-05, "loss": 1.2683, "step": 18720 }, { "epoch": 0.07742404857914677, "grad_norm": 3.0722778687848558, "learning_rate": 1.977200194100257e-05, "loss": 1.3484, "step": 18730 }, { "epoch": 0.07746538549776884, "grad_norm": 3.1378953738889637, "learning_rate": 1.9771723343505093e-05, "loss": 1.353, "step": 18740 }, { "epoch": 0.07750672241639092, "grad_norm": 2.9849102736288113, "learning_rate": 1.9771444577863517e-05, "loss": 1.3318, "step": 18750 }, { "epoch": 0.07754805933501299, "grad_norm": 3.006210181091503, "learning_rate": 1.9771165644082636e-05, "loss": 1.3095, "step": 18760 }, { "epoch": 0.07758939625363506, "grad_norm": 2.8083464465567074, "learning_rate": 1.9770886542167252e-05, "loss": 1.2896, "step": 18770 }, { "epoch": 0.07763073317225715, "grad_norm": 3.262789038521565, "learning_rate": 1.9770607272122168e-05, "loss": 1.3333, "step": 18780 }, { "epoch": 0.07767207009087922, "grad_norm": 3.421189746770822, "learning_rate": 1.9770327833952187e-05, "loss": 1.28, "step": 18790 }, { "epoch": 0.07771340700950129, "grad_norm": 3.006884991049078, "learning_rate": 1.977004822766212e-05, "loss": 1.2982, "step": 18800 }, { "epoch": 0.07775474392812337, "grad_norm": 2.8631010619173427, "learning_rate": 1.976976845325678e-05, "loss": 1.2924, "step": 18810 }, { "epoch": 0.07779608084674544, "grad_norm": 3.4772737519626533, "learning_rate": 1.9769488510740974e-05, "loss": 1.2927, "step": 18820 }, { "epoch": 0.07783741776536751, "grad_norm": 2.7187723773853967, "learning_rate": 1.976920840011953e-05, "loss": 1.2868, "step": 18830 }, { "epoch": 0.0778787546839896, "grad_norm": 3.145525828551427, "learning_rate": 1.9768928121397253e-05, "loss": 1.2662, "step": 18840 }, { "epoch": 0.07792009160261167, "grad_norm": 3.6834084868042205, "learning_rate": 1.9768647674578978e-05, "loss": 1.2916, "step": 18850 }, { "epoch": 0.07796142852123374, "grad_norm": 3.2384861614856697, "learning_rate": 1.976836705966953e-05, "loss": 1.2719, "step": 18860 }, { "epoch": 0.07800276543985582, "grad_norm": 3.06441372291841, "learning_rate": 1.976808627667373e-05, "loss": 1.3137, "step": 18870 }, { "epoch": 0.07804410235847789, "grad_norm": 3.1960269763685565, "learning_rate": 1.9767805325596417e-05, "loss": 1.2943, "step": 18880 }, { "epoch": 0.07808543927709996, "grad_norm": 3.6639857210827778, "learning_rate": 1.976752420644242e-05, "loss": 1.2634, "step": 18890 }, { "epoch": 0.07812677619572204, "grad_norm": 2.8992259499910564, "learning_rate": 1.976724291921658e-05, "loss": 1.3205, "step": 18900 }, { "epoch": 0.07816811311434411, "grad_norm": 3.8410116582710963, "learning_rate": 1.9766961463923735e-05, "loss": 1.2778, "step": 18910 }, { "epoch": 0.0782094500329662, "grad_norm": 4.723075181703024, "learning_rate": 1.976667984056873e-05, "loss": 1.2998, "step": 18920 }, { "epoch": 0.07825078695158827, "grad_norm": 3.371508438100944, "learning_rate": 1.976639804915641e-05, "loss": 1.2512, "step": 18930 }, { "epoch": 0.07829212387021034, "grad_norm": 3.6639818461414677, "learning_rate": 1.976611608969162e-05, "loss": 1.2961, "step": 18940 }, { "epoch": 0.07833346078883242, "grad_norm": 2.8877225304005525, "learning_rate": 1.976583396217922e-05, "loss": 1.3345, "step": 18950 }, { "epoch": 0.0783747977074545, "grad_norm": 3.282148781915577, "learning_rate": 1.9765551666624062e-05, "loss": 1.3293, "step": 18960 }, { "epoch": 0.07841613462607656, "grad_norm": 3.1290209387742007, "learning_rate": 1.9765269203030996e-05, "loss": 1.3202, "step": 18970 }, { "epoch": 0.07845747154469865, "grad_norm": 2.9401948590630798, "learning_rate": 1.9764986571404892e-05, "loss": 1.2739, "step": 18980 }, { "epoch": 0.07849880846332072, "grad_norm": 2.726413528264204, "learning_rate": 1.9764703771750606e-05, "loss": 1.3417, "step": 18990 }, { "epoch": 0.07854014538194279, "grad_norm": 3.8708836373349693, "learning_rate": 1.976442080407301e-05, "loss": 1.2817, "step": 19000 }, { "epoch": 0.07858148230056487, "grad_norm": 2.9568442793661225, "learning_rate": 1.976413766837697e-05, "loss": 1.3157, "step": 19010 }, { "epoch": 0.07862281921918694, "grad_norm": 3.2014326193422895, "learning_rate": 1.9763854364667355e-05, "loss": 1.2734, "step": 19020 }, { "epoch": 0.07866415613780901, "grad_norm": 3.8142983049855204, "learning_rate": 1.9763570892949048e-05, "loss": 1.3268, "step": 19030 }, { "epoch": 0.0787054930564311, "grad_norm": 3.2687191971325555, "learning_rate": 1.976328725322692e-05, "loss": 1.2922, "step": 19040 }, { "epoch": 0.07874682997505317, "grad_norm": 3.8385310533023276, "learning_rate": 1.9763003445505854e-05, "loss": 1.274, "step": 19050 }, { "epoch": 0.07878816689367525, "grad_norm": 3.11002301324885, "learning_rate": 1.9762719469790736e-05, "loss": 1.3201, "step": 19060 }, { "epoch": 0.07882950381229732, "grad_norm": 3.1064094109546385, "learning_rate": 1.9762435326086446e-05, "loss": 1.2524, "step": 19070 }, { "epoch": 0.07887084073091939, "grad_norm": 2.8255870735206687, "learning_rate": 1.976215101439788e-05, "loss": 1.2982, "step": 19080 }, { "epoch": 0.07891217764954148, "grad_norm": 3.041236956663884, "learning_rate": 1.9761866534729926e-05, "loss": 1.2784, "step": 19090 }, { "epoch": 0.07895351456816355, "grad_norm": 3.0762050459561032, "learning_rate": 1.976158188708748e-05, "loss": 1.2615, "step": 19100 }, { "epoch": 0.07899485148678562, "grad_norm": 4.057630290231405, "learning_rate": 1.976129707147544e-05, "loss": 1.2956, "step": 19110 }, { "epoch": 0.0790361884054077, "grad_norm": 3.530641336009131, "learning_rate": 1.976101208789871e-05, "loss": 1.3015, "step": 19120 }, { "epoch": 0.07907752532402977, "grad_norm": 3.457139153653439, "learning_rate": 1.976072693636219e-05, "loss": 1.3007, "step": 19130 }, { "epoch": 0.07911886224265184, "grad_norm": 4.247767055366308, "learning_rate": 1.9760441616870785e-05, "loss": 1.3284, "step": 19140 }, { "epoch": 0.07916019916127393, "grad_norm": 3.4313340038672995, "learning_rate": 1.976015612942941e-05, "loss": 1.3064, "step": 19150 }, { "epoch": 0.079201536079896, "grad_norm": 3.908560338711229, "learning_rate": 1.9759870474042973e-05, "loss": 1.3116, "step": 19160 }, { "epoch": 0.07924287299851807, "grad_norm": 3.745765592110947, "learning_rate": 1.9759584650716395e-05, "loss": 1.3737, "step": 19170 }, { "epoch": 0.07928420991714015, "grad_norm": 3.078986622432472, "learning_rate": 1.9759298659454588e-05, "loss": 1.2788, "step": 19180 }, { "epoch": 0.07932554683576222, "grad_norm": 2.987786079650764, "learning_rate": 1.9759012500262474e-05, "loss": 1.2834, "step": 19190 }, { "epoch": 0.07936688375438429, "grad_norm": 3.1036456287250673, "learning_rate": 1.975872617314498e-05, "loss": 1.2831, "step": 19200 }, { "epoch": 0.07940822067300637, "grad_norm": 3.865690690229234, "learning_rate": 1.9758439678107033e-05, "loss": 1.2922, "step": 19210 }, { "epoch": 0.07944955759162844, "grad_norm": 3.3185016462394588, "learning_rate": 1.9758153015153553e-05, "loss": 1.3349, "step": 19220 }, { "epoch": 0.07949089451025053, "grad_norm": 2.971387000152931, "learning_rate": 1.975786618428949e-05, "loss": 1.2411, "step": 19230 }, { "epoch": 0.0795322314288726, "grad_norm": 3.1868412483457065, "learning_rate": 1.9757579185519766e-05, "loss": 1.3152, "step": 19240 }, { "epoch": 0.07957356834749467, "grad_norm": 3.573887753524877, "learning_rate": 1.9757292018849322e-05, "loss": 1.32, "step": 19250 }, { "epoch": 0.07961490526611675, "grad_norm": 2.6269840263774347, "learning_rate": 1.9757004684283107e-05, "loss": 1.3123, "step": 19260 }, { "epoch": 0.07965624218473882, "grad_norm": 3.1839995568616546, "learning_rate": 1.9756717181826054e-05, "loss": 1.3305, "step": 19270 }, { "epoch": 0.0796975791033609, "grad_norm": 3.5732944308856474, "learning_rate": 1.9756429511483117e-05, "loss": 1.298, "step": 19280 }, { "epoch": 0.07973891602198298, "grad_norm": 3.612052010659264, "learning_rate": 1.9756141673259247e-05, "loss": 1.2797, "step": 19290 }, { "epoch": 0.07978025294060505, "grad_norm": 3.129559274239735, "learning_rate": 1.9755853667159392e-05, "loss": 1.3242, "step": 19300 }, { "epoch": 0.07982158985922712, "grad_norm": 3.2804938014945915, "learning_rate": 1.9755565493188507e-05, "loss": 1.2882, "step": 19310 }, { "epoch": 0.0798629267778492, "grad_norm": 3.5207612414520444, "learning_rate": 1.9755277151351558e-05, "loss": 1.3292, "step": 19320 }, { "epoch": 0.07990426369647127, "grad_norm": 3.2901806629006356, "learning_rate": 1.9754988641653502e-05, "loss": 1.2829, "step": 19330 }, { "epoch": 0.07994560061509334, "grad_norm": 3.1031625175692876, "learning_rate": 1.97546999640993e-05, "loss": 1.2952, "step": 19340 }, { "epoch": 0.07998693753371543, "grad_norm": 3.1014549071869606, "learning_rate": 1.975441111869393e-05, "loss": 1.2609, "step": 19350 }, { "epoch": 0.0800282744523375, "grad_norm": 2.9394216851114217, "learning_rate": 1.975412210544235e-05, "loss": 1.2948, "step": 19360 }, { "epoch": 0.08006961137095958, "grad_norm": 3.0263675235321545, "learning_rate": 1.975383292434954e-05, "loss": 1.3069, "step": 19370 }, { "epoch": 0.08011094828958165, "grad_norm": 3.0848357434937124, "learning_rate": 1.9753543575420477e-05, "loss": 1.2747, "step": 19380 }, { "epoch": 0.08015228520820372, "grad_norm": 2.9964068973521774, "learning_rate": 1.9753254058660132e-05, "loss": 1.3225, "step": 19390 }, { "epoch": 0.0801936221268258, "grad_norm": 3.2536839847485224, "learning_rate": 1.9752964374073494e-05, "loss": 1.3448, "step": 19400 }, { "epoch": 0.08023495904544788, "grad_norm": 3.37952149059762, "learning_rate": 1.9752674521665546e-05, "loss": 1.2845, "step": 19410 }, { "epoch": 0.08027629596406995, "grad_norm": 3.0635418759023687, "learning_rate": 1.9752384501441276e-05, "loss": 1.3123, "step": 19420 }, { "epoch": 0.08031763288269203, "grad_norm": 3.298468467101726, "learning_rate": 1.9752094313405674e-05, "loss": 1.2986, "step": 19430 }, { "epoch": 0.0803589698013141, "grad_norm": 3.2473420562854507, "learning_rate": 1.9751803957563735e-05, "loss": 1.327, "step": 19440 }, { "epoch": 0.08040030671993617, "grad_norm": 3.0843171834880483, "learning_rate": 1.975151343392045e-05, "loss": 1.304, "step": 19450 }, { "epoch": 0.08044164363855826, "grad_norm": 3.0717834263814856, "learning_rate": 1.9751222742480823e-05, "loss": 1.3133, "step": 19460 }, { "epoch": 0.08048298055718033, "grad_norm": 2.9934271457981603, "learning_rate": 1.9750931883249852e-05, "loss": 1.2674, "step": 19470 }, { "epoch": 0.0805243174758024, "grad_norm": 3.1064293377071843, "learning_rate": 1.9750640856232548e-05, "loss": 1.2917, "step": 19480 }, { "epoch": 0.08056565439442448, "grad_norm": 2.980738543973593, "learning_rate": 1.975034966143391e-05, "loss": 1.2978, "step": 19490 }, { "epoch": 0.08060699131304655, "grad_norm": 3.383580434705728, "learning_rate": 1.975005829885896e-05, "loss": 1.2603, "step": 19500 }, { "epoch": 0.08064832823166862, "grad_norm": 3.2830909352425564, "learning_rate": 1.97497667685127e-05, "loss": 1.3043, "step": 19510 }, { "epoch": 0.0806896651502907, "grad_norm": 3.3969068546862036, "learning_rate": 1.9749475070400157e-05, "loss": 1.3338, "step": 19520 }, { "epoch": 0.08073100206891277, "grad_norm": 3.0880094364724773, "learning_rate": 1.974918320452634e-05, "loss": 1.2351, "step": 19530 }, { "epoch": 0.08077233898753486, "grad_norm": 3.4417279889999564, "learning_rate": 1.974889117089628e-05, "loss": 1.264, "step": 19540 }, { "epoch": 0.08081367590615693, "grad_norm": 2.904756528303193, "learning_rate": 1.9748598969514993e-05, "loss": 1.2647, "step": 19550 }, { "epoch": 0.080855012824779, "grad_norm": 2.7148091448555722, "learning_rate": 1.9748306600387516e-05, "loss": 1.2989, "step": 19560 }, { "epoch": 0.08089634974340108, "grad_norm": 3.651161572746769, "learning_rate": 1.9748014063518875e-05, "loss": 1.3161, "step": 19570 }, { "epoch": 0.08093768666202315, "grad_norm": 3.1157328327095866, "learning_rate": 1.9747721358914106e-05, "loss": 1.2713, "step": 19580 }, { "epoch": 0.08097902358064522, "grad_norm": 3.131352709584711, "learning_rate": 1.9747428486578243e-05, "loss": 1.2904, "step": 19590 }, { "epoch": 0.08102036049926731, "grad_norm": 2.942079614415163, "learning_rate": 1.9747135446516327e-05, "loss": 1.2857, "step": 19600 }, { "epoch": 0.08106169741788938, "grad_norm": 2.8838500418296955, "learning_rate": 1.9746842238733404e-05, "loss": 1.3162, "step": 19610 }, { "epoch": 0.08110303433651145, "grad_norm": 3.62285581087724, "learning_rate": 1.9746548863234512e-05, "loss": 1.3105, "step": 19620 }, { "epoch": 0.08114437125513353, "grad_norm": 2.8759608727278927, "learning_rate": 1.9746255320024702e-05, "loss": 1.2757, "step": 19630 }, { "epoch": 0.0811857081737556, "grad_norm": 3.2868321525430377, "learning_rate": 1.974596160910903e-05, "loss": 1.2808, "step": 19640 }, { "epoch": 0.08122704509237767, "grad_norm": 2.9255021662463, "learning_rate": 1.9745667730492543e-05, "loss": 1.2982, "step": 19650 }, { "epoch": 0.08126838201099976, "grad_norm": 3.6734905665007336, "learning_rate": 1.97453736841803e-05, "loss": 1.3469, "step": 19660 }, { "epoch": 0.08130971892962183, "grad_norm": 2.9210368580920294, "learning_rate": 1.974507947017736e-05, "loss": 1.2624, "step": 19670 }, { "epoch": 0.08135105584824391, "grad_norm": 3.5155014430588514, "learning_rate": 1.974478508848879e-05, "loss": 1.3032, "step": 19680 }, { "epoch": 0.08139239276686598, "grad_norm": 3.1972640975511966, "learning_rate": 1.9744490539119652e-05, "loss": 1.2663, "step": 19690 }, { "epoch": 0.08143372968548805, "grad_norm": 3.137196038716376, "learning_rate": 1.9744195822075016e-05, "loss": 1.2599, "step": 19700 }, { "epoch": 0.08147506660411014, "grad_norm": 3.429683582295254, "learning_rate": 1.974390093735995e-05, "loss": 1.3125, "step": 19710 }, { "epoch": 0.0815164035227322, "grad_norm": 3.4935153839614213, "learning_rate": 1.974360588497953e-05, "loss": 1.3021, "step": 19720 }, { "epoch": 0.08155774044135428, "grad_norm": 3.693677622986634, "learning_rate": 1.9743310664938836e-05, "loss": 1.3154, "step": 19730 }, { "epoch": 0.08159907735997636, "grad_norm": 3.1960119762782995, "learning_rate": 1.9743015277242942e-05, "loss": 1.2931, "step": 19740 }, { "epoch": 0.08164041427859843, "grad_norm": 3.362501999299266, "learning_rate": 1.9742719721896936e-05, "loss": 1.2977, "step": 19750 }, { "epoch": 0.0816817511972205, "grad_norm": 3.152254479818853, "learning_rate": 1.97424239989059e-05, "loss": 1.2571, "step": 19760 }, { "epoch": 0.08172308811584258, "grad_norm": 3.78958825416846, "learning_rate": 1.9742128108274926e-05, "loss": 1.2903, "step": 19770 }, { "epoch": 0.08176442503446466, "grad_norm": 3.2123145951877077, "learning_rate": 1.9741832050009102e-05, "loss": 1.282, "step": 19780 }, { "epoch": 0.08180576195308673, "grad_norm": 3.1167331240538076, "learning_rate": 1.9741535824113526e-05, "loss": 1.2552, "step": 19790 }, { "epoch": 0.08184709887170881, "grad_norm": 4.206288270940374, "learning_rate": 1.974123943059329e-05, "loss": 1.2597, "step": 19800 }, { "epoch": 0.08188843579033088, "grad_norm": 3.553992295271698, "learning_rate": 1.9740942869453504e-05, "loss": 1.2908, "step": 19810 }, { "epoch": 0.08192977270895295, "grad_norm": 3.521907001370512, "learning_rate": 1.974064614069926e-05, "loss": 1.297, "step": 19820 }, { "epoch": 0.08197110962757503, "grad_norm": 3.360470421890721, "learning_rate": 1.9740349244335665e-05, "loss": 1.2882, "step": 19830 }, { "epoch": 0.0820124465461971, "grad_norm": 2.9432002594333464, "learning_rate": 1.9740052180367836e-05, "loss": 1.252, "step": 19840 }, { "epoch": 0.08205378346481919, "grad_norm": 3.252199526734451, "learning_rate": 1.9739754948800874e-05, "loss": 1.2805, "step": 19850 }, { "epoch": 0.08209512038344126, "grad_norm": 3.1871490405890586, "learning_rate": 1.9739457549639905e-05, "loss": 1.2697, "step": 19860 }, { "epoch": 0.08213645730206333, "grad_norm": 3.1800803475250166, "learning_rate": 1.973915998289004e-05, "loss": 1.3342, "step": 19870 }, { "epoch": 0.08217779422068541, "grad_norm": 2.8783537685939478, "learning_rate": 1.9738862248556395e-05, "loss": 1.2471, "step": 19880 }, { "epoch": 0.08221913113930748, "grad_norm": 2.7100348444687246, "learning_rate": 1.9738564346644103e-05, "loss": 1.2827, "step": 19890 }, { "epoch": 0.08226046805792955, "grad_norm": 3.509524916127054, "learning_rate": 1.973826627715828e-05, "loss": 1.3025, "step": 19900 }, { "epoch": 0.08230180497655164, "grad_norm": 3.3725272601095795, "learning_rate": 1.9737968040104065e-05, "loss": 1.3234, "step": 19910 }, { "epoch": 0.08234314189517371, "grad_norm": 3.341475977428283, "learning_rate": 1.9737669635486585e-05, "loss": 1.3203, "step": 19920 }, { "epoch": 0.08238447881379578, "grad_norm": 3.185315976662971, "learning_rate": 1.9737371063310972e-05, "loss": 1.2828, "step": 19930 }, { "epoch": 0.08242581573241786, "grad_norm": 3.6059578502827945, "learning_rate": 1.9737072323582366e-05, "loss": 1.3272, "step": 19940 }, { "epoch": 0.08246715265103993, "grad_norm": 3.486164979637442, "learning_rate": 1.973677341630591e-05, "loss": 1.2619, "step": 19950 }, { "epoch": 0.082508489569662, "grad_norm": 3.3788628785271935, "learning_rate": 1.9736474341486742e-05, "loss": 1.2866, "step": 19960 }, { "epoch": 0.08254982648828409, "grad_norm": 3.116863275035675, "learning_rate": 1.973617509913001e-05, "loss": 1.291, "step": 19970 }, { "epoch": 0.08259116340690616, "grad_norm": 2.7964040776322796, "learning_rate": 1.973587568924087e-05, "loss": 1.2725, "step": 19980 }, { "epoch": 0.08263250032552824, "grad_norm": 2.797740269032973, "learning_rate": 1.9735576111824465e-05, "loss": 1.2742, "step": 19990 }, { "epoch": 0.08267383724415031, "grad_norm": 3.2432141472722162, "learning_rate": 1.9735276366885956e-05, "loss": 1.2947, "step": 20000 }, { "epoch": 0.08267383724415031, "eval_loss": 1.5713279247283936, "eval_runtime": 392.3898, "eval_samples_per_second": 10.439, "eval_steps_per_second": 2.61, "step": 20000 }, { "epoch": 0.08267797093601252, "step": 20001, "total_flos": 0.0, "train_loss": 6.551032936291113e-05, "train_runtime": 86.8457, "train_samples_per_second": 14738.777, "train_steps_per_second": 230.293 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }