diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,4910 +3,5141 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 3488, + "global_step": 3653, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0014334862385321102, - "grad_norm": 10.930955258982435, - "learning_rate": 2.865329512893983e-07, - "loss": 1.1122, + "epoch": 0.001368738023542294, + "grad_norm": 1.1110930744661907, + "learning_rate": 2.73224043715847e-07, + "loss": 1.71, "step": 5 }, { - "epoch": 0.0028669724770642203, - "grad_norm": 7.957126318923027, - "learning_rate": 5.730659025787966e-07, - "loss": 1.1033, + "epoch": 0.002737476047084588, + "grad_norm": 1.0450218593816867, + "learning_rate": 5.46448087431694e-07, + "loss": 1.7206, "step": 10 }, { - "epoch": 0.0043004587155963305, - "grad_norm": 6.252488126522345, - "learning_rate": 8.595988538681949e-07, - "loss": 1.1448, + "epoch": 0.004106214070626882, + "grad_norm": 1.2104539142499893, + "learning_rate": 8.196721311475409e-07, + "loss": 1.8131, "step": 15 }, { - "epoch": 0.005733944954128441, - "grad_norm": 6.589440813762702, - "learning_rate": 1.1461318051575932e-06, - "loss": 1.0284, + "epoch": 0.005474952094169176, + "grad_norm": 1.1014228621000113, + "learning_rate": 1.092896174863388e-06, + "loss": 1.7571, "step": 20 }, { - "epoch": 0.007167431192660551, - "grad_norm": 4.917441825534674, - "learning_rate": 1.4326647564469915e-06, - "loss": 1.0131, + "epoch": 0.00684369011771147, + "grad_norm": 1.0213348170440788, + "learning_rate": 1.3661202185792352e-06, + "loss": 1.7504, "step": 25 }, { - "epoch": 0.008600917431192661, - "grad_norm": 4.611821748972413, - "learning_rate": 1.7191977077363897e-06, - "loss": 1.0492, + "epoch": 0.008212428141253765, + "grad_norm": 0.907763844914787, + "learning_rate": 1.6393442622950819e-06, + "loss": 1.726, "step": 30 }, { - "epoch": 0.010034403669724771, - "grad_norm": 4.602326061969326, - "learning_rate": 2.005730659025788e-06, - "loss": 1.0249, + "epoch": 0.009581166164796057, + "grad_norm": 0.9051570945453739, + "learning_rate": 1.912568306010929e-06, + "loss": 1.7158, "step": 35 }, { - "epoch": 0.011467889908256881, - "grad_norm": 4.795975437768488, - "learning_rate": 2.2922636103151864e-06, - "loss": 0.9889, + "epoch": 0.010949904188338352, + "grad_norm": 0.7396508509648158, + "learning_rate": 2.185792349726776e-06, + "loss": 1.7692, "step": 40 }, { - "epoch": 0.012901376146788992, - "grad_norm": 4.881452695659322, - "learning_rate": 2.5787965616045845e-06, - "loss": 0.9654, + "epoch": 0.012318642211880646, + "grad_norm": 0.6316750205627214, + "learning_rate": 2.459016393442623e-06, + "loss": 1.7118, "step": 45 }, { - "epoch": 0.014334862385321102, - "grad_norm": 4.816747748013977, - "learning_rate": 2.865329512893983e-06, - "loss": 1.0479, + "epoch": 0.01368738023542294, + "grad_norm": 0.6071126407750962, + "learning_rate": 2.7322404371584705e-06, + "loss": 1.6533, "step": 50 }, { - "epoch": 0.01576834862385321, - "grad_norm": 4.672562758642394, - "learning_rate": 3.151862464183381e-06, - "loss": 0.9692, + "epoch": 0.015056118258965233, + "grad_norm": 0.6129917793535419, + "learning_rate": 3.0054644808743173e-06, + "loss": 1.6471, "step": 55 }, { - "epoch": 0.017201834862385322, - "grad_norm": 4.665632444621185, - "learning_rate": 3.4383954154727795e-06, - "loss": 1.0033, + "epoch": 0.01642485628250753, + "grad_norm": 0.6342687394025227, + "learning_rate": 3.2786885245901638e-06, + "loss": 1.6505, "step": 60 }, { - "epoch": 0.01863532110091743, - "grad_norm": 4.5487339829077795, - "learning_rate": 3.724928366762178e-06, - "loss": 1.0206, + "epoch": 0.017793594306049824, + "grad_norm": 0.5732154266113143, + "learning_rate": 3.551912568306011e-06, + "loss": 1.6261, "step": 65 }, { - "epoch": 0.020068807339449542, - "grad_norm": 4.72120821418095, - "learning_rate": 4.011461318051576e-06, - "loss": 1.0065, + "epoch": 0.019162332329592115, + "grad_norm": 0.5200300967595174, + "learning_rate": 3.825136612021858e-06, + "loss": 1.5861, "step": 70 }, { - "epoch": 0.02150229357798165, - "grad_norm": 4.488698654642803, - "learning_rate": 4.2979942693409744e-06, - "loss": 0.9634, + "epoch": 0.02053107035313441, + "grad_norm": 0.5356030837166187, + "learning_rate": 4.098360655737705e-06, + "loss": 1.6002, "step": 75 }, { - "epoch": 0.022935779816513763, - "grad_norm": 4.502540106177661, - "learning_rate": 4.584527220630373e-06, - "loss": 0.9633, + "epoch": 0.021899808376676703, + "grad_norm": 0.4893081141758957, + "learning_rate": 4.371584699453552e-06, + "loss": 1.5388, "step": 80 }, { - "epoch": 0.02436926605504587, - "grad_norm": 4.538891592130377, - "learning_rate": 4.871060171919771e-06, - "loss": 0.9148, + "epoch": 0.023268546400218998, + "grad_norm": 0.5042585034665032, + "learning_rate": 4.6448087431694e-06, + "loss": 1.5788, "step": 85 }, { - "epoch": 0.025802752293577983, - "grad_norm": 4.89489963864345, - "learning_rate": 5.157593123209169e-06, - "loss": 1.0303, + "epoch": 0.024637284423761292, + "grad_norm": 0.4756627065397568, + "learning_rate": 4.918032786885246e-06, + "loss": 1.5832, "step": 90 }, { - "epoch": 0.02723623853211009, - "grad_norm": 4.546009685259081, - "learning_rate": 5.444126074498568e-06, - "loss": 1.0174, + "epoch": 0.026006022447303587, + "grad_norm": 0.4712228645477175, + "learning_rate": 5.191256830601094e-06, + "loss": 1.5221, "step": 95 }, { - "epoch": 0.028669724770642203, - "grad_norm": 4.542941293141969, - "learning_rate": 5.730659025787966e-06, - "loss": 1.0251, + "epoch": 0.02737476047084588, + "grad_norm": 0.47712827241082845, + "learning_rate": 5.464480874316941e-06, + "loss": 1.5554, "step": 100 }, { - "epoch": 0.030103211009174312, - "grad_norm": 4.571406282018777, - "learning_rate": 6.017191977077364e-06, - "loss": 1.0376, + "epoch": 0.028743498494388176, + "grad_norm": 0.4747245775681728, + "learning_rate": 5.737704918032787e-06, + "loss": 1.5051, "step": 105 }, { - "epoch": 0.03153669724770642, - "grad_norm": 4.75986045778734, - "learning_rate": 6.303724928366762e-06, - "loss": 0.9827, + "epoch": 0.030112236517930466, + "grad_norm": 0.4619152125132191, + "learning_rate": 6.010928961748635e-06, + "loss": 1.5731, "step": 110 }, { - "epoch": 0.03297018348623853, - "grad_norm": 4.393838501543807, - "learning_rate": 6.590257879656161e-06, - "loss": 0.9533, + "epoch": 0.031480974541472764, + "grad_norm": 0.46656513563983254, + "learning_rate": 6.284153005464482e-06, + "loss": 1.5719, "step": 115 }, { - "epoch": 0.034403669724770644, - "grad_norm": 4.780066323607276, - "learning_rate": 6.876790830945559e-06, - "loss": 1.0018, + "epoch": 0.03284971256501506, + "grad_norm": 0.45441304688271805, + "learning_rate": 6.5573770491803276e-06, + "loss": 1.5359, "step": 120 }, { - "epoch": 0.03583715596330275, - "grad_norm": 4.414441537089264, - "learning_rate": 7.163323782234957e-06, - "loss": 1.0135, + "epoch": 0.03421845058855735, + "grad_norm": 0.4492251581811985, + "learning_rate": 6.830601092896175e-06, + "loss": 1.5254, "step": 125 }, { - "epoch": 0.03727064220183486, - "grad_norm": 4.301618750664585, - "learning_rate": 7.449856733524356e-06, - "loss": 1.0201, + "epoch": 0.03558718861209965, + "grad_norm": 0.4901256909260675, + "learning_rate": 7.103825136612022e-06, + "loss": 1.5531, "step": 130 }, { - "epoch": 0.03870412844036697, - "grad_norm": 4.3543481672910245, - "learning_rate": 7.736389684813753e-06, - "loss": 1.0236, + "epoch": 0.036955926635641935, + "grad_norm": 0.4521871797566739, + "learning_rate": 7.3770491803278695e-06, + "loss": 1.5203, "step": 135 }, { - "epoch": 0.040137614678899085, - "grad_norm": 4.31887452196869, - "learning_rate": 8.022922636103152e-06, - "loss": 1.0413, + "epoch": 0.03832466465918423, + "grad_norm": 0.46301007561132373, + "learning_rate": 7.650273224043716e-06, + "loss": 1.5597, "step": 140 }, { - "epoch": 0.04157110091743119, - "grad_norm": 4.19580797580846, - "learning_rate": 8.30945558739255e-06, - "loss": 0.9999, + "epoch": 0.039693402682726524, + "grad_norm": 0.4443455697452246, + "learning_rate": 7.923497267759564e-06, + "loss": 1.4751, "step": 145 }, { - "epoch": 0.0430045871559633, - "grad_norm": 4.2941554889344635, - "learning_rate": 8.595988538681949e-06, - "loss": 1.0824, + "epoch": 0.04106214070626882, + "grad_norm": 0.4529255518299648, + "learning_rate": 8.19672131147541e-06, + "loss": 1.5645, "step": 150 }, { - "epoch": 0.04443807339449541, - "grad_norm": 4.430699539319586, - "learning_rate": 8.882521489971347e-06, - "loss": 1.0545, + "epoch": 0.04243087872981111, + "grad_norm": 0.45575204993786755, + "learning_rate": 8.469945355191259e-06, + "loss": 1.5297, "step": 155 }, { - "epoch": 0.045871559633027525, - "grad_norm": 4.357465541547218, - "learning_rate": 9.169054441260746e-06, - "loss": 1.0662, + "epoch": 0.04379961675335341, + "grad_norm": 0.4619195990067295, + "learning_rate": 8.743169398907103e-06, + "loss": 1.4945, "step": 160 }, { - "epoch": 0.047305045871559634, - "grad_norm": 5.722224893724134, - "learning_rate": 9.455587392550144e-06, - "loss": 1.0197, + "epoch": 0.0451683547768957, + "grad_norm": 0.43620516601045234, + "learning_rate": 9.016393442622952e-06, + "loss": 1.5064, "step": 165 }, { - "epoch": 0.04873853211009174, - "grad_norm": 4.374513298911362, - "learning_rate": 9.742120343839543e-06, - "loss": 1.046, + "epoch": 0.046537092800437996, + "grad_norm": 0.48507057683395305, + "learning_rate": 9.2896174863388e-06, + "loss": 1.4873, "step": 170 }, { - "epoch": 0.05017201834862385, - "grad_norm": 4.593743802840811, - "learning_rate": 1.0028653295128941e-05, - "loss": 1.0739, + "epoch": 0.04790583082398029, + "grad_norm": 0.4591779126577447, + "learning_rate": 9.562841530054644e-06, + "loss": 1.4989, "step": 175 }, { - "epoch": 0.051605504587155966, - "grad_norm": 4.333415176606838, - "learning_rate": 1.0315186246418338e-05, - "loss": 1.0833, + "epoch": 0.049274568847522585, + "grad_norm": 0.4477837953889451, + "learning_rate": 9.836065573770493e-06, + "loss": 1.4755, "step": 180 }, { - "epoch": 0.053038990825688075, - "grad_norm": 4.426601251103325, - "learning_rate": 1.0601719197707738e-05, - "loss": 1.0799, + "epoch": 0.05064330687106488, + "grad_norm": 0.4580627295319435, + "learning_rate": 1.0109289617486339e-05, + "loss": 1.4545, "step": 185 }, { - "epoch": 0.05447247706422018, - "grad_norm": 4.556659147888879, - "learning_rate": 1.0888252148997137e-05, - "loss": 1.0405, + "epoch": 0.05201204489460717, + "grad_norm": 0.45313033315292356, + "learning_rate": 1.0382513661202187e-05, + "loss": 1.4985, "step": 190 }, { - "epoch": 0.05590596330275229, - "grad_norm": 4.3383547238785605, - "learning_rate": 1.1174785100286533e-05, - "loss": 1.0189, + "epoch": 0.05338078291814947, + "grad_norm": 0.42708720087605334, + "learning_rate": 1.0655737704918034e-05, + "loss": 1.4229, "step": 195 }, { - "epoch": 0.05733944954128441, - "grad_norm": 4.125983340413087, - "learning_rate": 1.1461318051575932e-05, - "loss": 1.0592, + "epoch": 0.05474952094169176, + "grad_norm": 0.4211311153329928, + "learning_rate": 1.0928961748633882e-05, + "loss": 1.4669, "step": 200 }, { - "epoch": 0.058772935779816515, - "grad_norm": 4.479422603616749, - "learning_rate": 1.1747851002865332e-05, - "loss": 1.0626, + "epoch": 0.05611825896523406, + "grad_norm": 0.4553582543296074, + "learning_rate": 1.1202185792349727e-05, + "loss": 1.4444, "step": 205 }, { - "epoch": 0.060206422018348624, - "grad_norm": 4.506035260293105, - "learning_rate": 1.2034383954154729e-05, - "loss": 1.0506, + "epoch": 0.05748699698877635, + "grad_norm": 0.43888054055275466, + "learning_rate": 1.1475409836065575e-05, + "loss": 1.4622, "step": 210 }, { - "epoch": 0.06163990825688073, - "grad_norm": 5.22525942718489, - "learning_rate": 1.2320916905444127e-05, - "loss": 1.0855, + "epoch": 0.058855735012318645, + "grad_norm": 0.43839040680573754, + "learning_rate": 1.1748633879781421e-05, + "loss": 1.3921, "step": 215 }, { - "epoch": 0.06307339449541284, - "grad_norm": 4.779500493076456, - "learning_rate": 1.2607449856733524e-05, - "loss": 1.0872, + "epoch": 0.06022447303586093, + "grad_norm": 0.442929256824184, + "learning_rate": 1.202185792349727e-05, + "loss": 1.4695, "step": 220 }, { - "epoch": 0.06450688073394495, - "grad_norm": 4.1419371246845555, - "learning_rate": 1.2893982808022924e-05, - "loss": 1.0158, + "epoch": 0.06159321105940323, + "grad_norm": 0.42979267539703714, + "learning_rate": 1.2295081967213116e-05, + "loss": 1.4444, "step": 225 }, { - "epoch": 0.06594036697247706, - "grad_norm": 4.55193247523381, - "learning_rate": 1.3180515759312323e-05, - "loss": 1.1341, + "epoch": 0.06296194908294553, + "grad_norm": 0.43942113048062365, + "learning_rate": 1.2568306010928964e-05, + "loss": 1.4581, "step": 230 }, { - "epoch": 0.06737385321100918, - "grad_norm": 4.821922591302919, - "learning_rate": 1.346704871060172e-05, - "loss": 1.0317, + "epoch": 0.06433068710648782, + "grad_norm": 0.43105155511527676, + "learning_rate": 1.284153005464481e-05, + "loss": 1.4335, "step": 235 }, { - "epoch": 0.06880733944954129, - "grad_norm": 5.7218678278363075, - "learning_rate": 1.3753581661891118e-05, - "loss": 1.0929, + "epoch": 0.06569942513003012, + "grad_norm": 0.4347334195115935, + "learning_rate": 1.3114754098360655e-05, + "loss": 1.5437, "step": 240 }, { - "epoch": 0.0702408256880734, - "grad_norm": 4.5897462960681885, - "learning_rate": 1.4040114613180518e-05, - "loss": 1.0707, + "epoch": 0.0670681631535724, + "grad_norm": 0.41558733251657665, + "learning_rate": 1.3387978142076503e-05, + "loss": 1.4246, "step": 245 }, { - "epoch": 0.0716743119266055, - "grad_norm": 4.9044704729529265, - "learning_rate": 1.4326647564469915e-05, - "loss": 1.0589, + "epoch": 0.0684369011771147, + "grad_norm": 0.4166574660442, + "learning_rate": 1.366120218579235e-05, + "loss": 1.4672, "step": 250 }, { - "epoch": 0.07310779816513761, - "grad_norm": 4.317942267526155, - "learning_rate": 1.4613180515759313e-05, - "loss": 1.0944, + "epoch": 0.069805639200657, + "grad_norm": 0.40343261677226544, + "learning_rate": 1.3934426229508198e-05, + "loss": 1.4695, "step": 255 }, { - "epoch": 0.07454128440366972, - "grad_norm": 4.019443011772793, - "learning_rate": 1.4899713467048712e-05, - "loss": 1.1014, + "epoch": 0.0711743772241993, + "grad_norm": 0.41945935612278784, + "learning_rate": 1.4207650273224044e-05, + "loss": 1.4173, "step": 260 }, { - "epoch": 0.07597477064220183, - "grad_norm": 5.05645456273739, - "learning_rate": 1.518624641833811e-05, - "loss": 1.0855, + "epoch": 0.07254311524774158, + "grad_norm": 0.4091410585881606, + "learning_rate": 1.4480874316939892e-05, + "loss": 1.3669, "step": 265 }, { - "epoch": 0.07740825688073394, - "grad_norm": 4.631991611940915, - "learning_rate": 1.5472779369627507e-05, - "loss": 1.1797, + "epoch": 0.07391185327128387, + "grad_norm": 0.40428264801464686, + "learning_rate": 1.4754098360655739e-05, + "loss": 1.4712, "step": 270 }, { - "epoch": 0.07884174311926606, - "grad_norm": 4.863391448983006, - "learning_rate": 1.5759312320916907e-05, - "loss": 1.1189, + "epoch": 0.07528059129482617, + "grad_norm": 0.4074017153449847, + "learning_rate": 1.5027322404371585e-05, + "loss": 1.4477, "step": 275 }, { - "epoch": 0.08027522935779817, - "grad_norm": 5.67171228826149, - "learning_rate": 1.6045845272206304e-05, - "loss": 1.0687, + "epoch": 0.07664932931836846, + "grad_norm": 0.41299926058270664, + "learning_rate": 1.5300546448087432e-05, + "loss": 1.4015, "step": 280 }, { - "epoch": 0.08170871559633028, - "grad_norm": 4.863402157719064, - "learning_rate": 1.6332378223495704e-05, - "loss": 1.1033, + "epoch": 0.07801806734191076, + "grad_norm": 0.39622800283246956, + "learning_rate": 1.5573770491803278e-05, + "loss": 1.443, "step": 285 }, { - "epoch": 0.08314220183486239, - "grad_norm": 4.660992837053409, - "learning_rate": 1.66189111747851e-05, - "loss": 1.1543, + "epoch": 0.07938680536545305, + "grad_norm": 0.4050370810855124, + "learning_rate": 1.5846994535519128e-05, + "loss": 1.4454, "step": 290 }, { - "epoch": 0.0845756880733945, - "grad_norm": 4.552069083735871, - "learning_rate": 1.69054441260745e-05, - "loss": 1.1464, + "epoch": 0.08075554338899535, + "grad_norm": 0.405591352468174, + "learning_rate": 1.6120218579234975e-05, + "loss": 1.4077, "step": 295 }, { - "epoch": 0.0860091743119266, - "grad_norm": 4.61726981167071, - "learning_rate": 1.7191977077363898e-05, - "loss": 1.1332, + "epoch": 0.08212428141253764, + "grad_norm": 0.4188674346008248, + "learning_rate": 1.639344262295082e-05, + "loss": 1.4503, "step": 300 }, { - "epoch": 0.08744266055045871, - "grad_norm": 4.9734394128477755, - "learning_rate": 1.7478510028653298e-05, - "loss": 1.0608, + "epoch": 0.08349301943607994, + "grad_norm": 0.4015518431467613, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.4027, "step": 305 }, { - "epoch": 0.08887614678899082, - "grad_norm": 4.080774915213187, - "learning_rate": 1.7765042979942695e-05, - "loss": 1.1737, + "epoch": 0.08486175745962223, + "grad_norm": 0.40825550305371383, + "learning_rate": 1.6939890710382517e-05, + "loss": 1.4021, "step": 310 }, { - "epoch": 0.09030963302752294, - "grad_norm": 4.9010757003007805, - "learning_rate": 1.805157593123209e-05, - "loss": 1.152, + "epoch": 0.08623049548316453, + "grad_norm": 0.4103687720629476, + "learning_rate": 1.721311475409836e-05, + "loss": 1.4304, "step": 315 }, { - "epoch": 0.09174311926605505, - "grad_norm": 4.348268075612745, - "learning_rate": 1.833810888252149e-05, - "loss": 1.1493, + "epoch": 0.08759923350670681, + "grad_norm": 0.4048205442390437, + "learning_rate": 1.7486338797814207e-05, + "loss": 1.4, "step": 320 }, { - "epoch": 0.09317660550458716, - "grad_norm": 5.525129625035676, - "learning_rate": 1.8624641833810892e-05, - "loss": 1.109, + "epoch": 0.08896797153024912, + "grad_norm": 0.41661892344943224, + "learning_rate": 1.7759562841530057e-05, + "loss": 1.4035, "step": 325 }, { - "epoch": 0.09461009174311927, - "grad_norm": 6.128793939259254, - "learning_rate": 1.891117478510029e-05, - "loss": 1.1883, + "epoch": 0.0903367095537914, + "grad_norm": 0.4308332237873975, + "learning_rate": 1.8032786885245903e-05, + "loss": 1.4302, "step": 330 }, { - "epoch": 0.09604357798165138, - "grad_norm": 4.553408862504849, - "learning_rate": 1.9197707736389685e-05, - "loss": 1.1294, + "epoch": 0.0917054475773337, + "grad_norm": 0.41012677210314796, + "learning_rate": 1.830601092896175e-05, + "loss": 1.3969, "step": 335 }, { - "epoch": 0.09747706422018348, - "grad_norm": 4.512129266800345, - "learning_rate": 1.9484240687679085e-05, - "loss": 1.2076, + "epoch": 0.09307418560087599, + "grad_norm": 0.40743633096989823, + "learning_rate": 1.85792349726776e-05, + "loss": 1.4425, "step": 340 }, { - "epoch": 0.0989105504587156, - "grad_norm": 4.294698339999475, - "learning_rate": 1.9770773638968482e-05, - "loss": 1.1455, + "epoch": 0.09444292362441829, + "grad_norm": 0.39899819661715447, + "learning_rate": 1.8852459016393446e-05, + "loss": 1.4303, "step": 345 }, { - "epoch": 0.1003440366972477, - "grad_norm": 4.649384755228939, - "learning_rate": 1.999999499173752e-05, - "loss": 1.142, + "epoch": 0.09581166164796058, + "grad_norm": 0.41870768132807307, + "learning_rate": 1.912568306010929e-05, + "loss": 1.4002, "step": 350 }, { - "epoch": 0.10177752293577981, - "grad_norm": 4.639825467109486, - "learning_rate": 1.999981970307739e-05, - "loss": 1.19, + "epoch": 0.09718039967150287, + "grad_norm": 0.40412869941796903, + "learning_rate": 1.939890710382514e-05, + "loss": 1.403, "step": 355 }, { - "epoch": 0.10321100917431193, - "grad_norm": 4.447236882707046, - "learning_rate": 1.999939400630968e-05, - "loss": 1.1753, + "epoch": 0.09854913769504517, + "grad_norm": 0.40478167238457397, + "learning_rate": 1.9672131147540985e-05, + "loss": 1.4227, "step": 360 }, { - "epoch": 0.10464449541284404, - "grad_norm": 4.092068716884874, - "learning_rate": 1.999871791209438e-05, - "loss": 1.1893, + "epoch": 0.09991787571858746, + "grad_norm": 0.41341003482817107, + "learning_rate": 1.994535519125683e-05, + "loss": 1.4302, "step": 365 }, { - "epoch": 0.10607798165137615, - "grad_norm": 4.398948638445328, - "learning_rate": 1.9997791437361734e-05, - "loss": 1.1555, + "epoch": 0.10128661374212976, + "grad_norm": 0.41658269655258945, + "learning_rate": 1.999992692147127e-05, + "loss": 1.445, "step": 370 }, { - "epoch": 0.10751146788990826, - "grad_norm": 4.701129620714216, - "learning_rate": 1.9996614605311848e-05, - "loss": 1.2011, + "epoch": 0.10265535176567205, + "grad_norm": 0.40746899077621546, + "learning_rate": 1.999963004177886e-05, + "loss": 1.458, "step": 375 }, { - "epoch": 0.10894495412844037, - "grad_norm": 4.821236633722439, - "learning_rate": 1.999518744541407e-05, - "loss": 1.1776, + "epoch": 0.10402408978921435, + "grad_norm": 0.3960893049170255, + "learning_rate": 1.9999104800289367e-05, + "loss": 1.4382, "step": 380 }, { - "epoch": 0.11037844036697247, - "grad_norm": 4.335100849608752, - "learning_rate": 1.9993509993406297e-05, - "loss": 1.1902, + "epoch": 0.10539282781275663, + "grad_norm": 0.3950983233516754, + "learning_rate": 1.9998351208997734e-05, + "loss": 1.4365, "step": 385 }, { - "epoch": 0.11181192660550458, - "grad_norm": 4.068825093466267, - "learning_rate": 1.9991582291294042e-05, - "loss": 1.1836, + "epoch": 0.10676156583629894, + "grad_norm": 0.38845890924227394, + "learning_rate": 1.9997369285113754e-05, + "loss": 1.3731, "step": 390 }, { - "epoch": 0.11324541284403669, - "grad_norm": 3.9520872546047143, - "learning_rate": 1.9989404387349393e-05, - "loss": 1.1439, + "epoch": 0.10813030385984122, + "grad_norm": 0.39526066204949717, + "learning_rate": 1.9996159051061638e-05, + "loss": 1.4111, "step": 395 }, { - "epoch": 0.11467889908256881, - "grad_norm": 4.582077126321336, - "learning_rate": 1.998697633610982e-05, - "loss": 1.2094, + "epoch": 0.10949904188338352, + "grad_norm": 0.4133763360642962, + "learning_rate": 1.9994720534479543e-05, + "loss": 1.4294, "step": 400 }, { - "epoch": 0.11611238532110092, - "grad_norm": 4.38687782745242, - "learning_rate": 1.998429819837679e-05, - "loss": 1.1866, + "epoch": 0.11086777990692581, + "grad_norm": 0.39671784657884285, + "learning_rate": 1.999305376821889e-05, + "loss": 1.3708, "step": 405 }, { - "epoch": 0.11754587155963303, - "grad_norm": 4.6771014427247275, - "learning_rate": 1.998137004121425e-05, - "loss": 1.1587, + "epoch": 0.11223651793046811, + "grad_norm": 0.3847825062400942, + "learning_rate": 1.999115879034368e-05, + "loss": 1.4638, "step": 410 }, { - "epoch": 0.11897935779816514, - "grad_norm": 4.218200613872475, - "learning_rate": 1.9978191937946955e-05, - "loss": 1.1411, + "epoch": 0.1136052559540104, + "grad_norm": 0.41459555705408424, + "learning_rate": 1.9989035644129553e-05, + "loss": 1.4105, "step": 415 }, { - "epoch": 0.12041284403669725, - "grad_norm": 4.509609936000089, - "learning_rate": 1.9974763968158614e-05, - "loss": 1.1772, + "epoch": 0.1149739939775527, + "grad_norm": 0.4270268157501032, + "learning_rate": 1.998668437806286e-05, + "loss": 1.4243, "step": 420 }, { - "epoch": 0.12184633027522936, - "grad_norm": 5.925174301278395, - "learning_rate": 1.9971086217689928e-05, - "loss": 1.2411, + "epoch": 0.11634273200109499, + "grad_norm": 0.42083549100480927, + "learning_rate": 1.998410504583952e-05, + "loss": 1.4285, "step": 425 }, { - "epoch": 0.12327981651376146, - "grad_norm": 4.022377669658578, - "learning_rate": 1.9967158778636405e-05, - "loss": 1.1887, + "epoch": 0.11771147002463729, + "grad_norm": 0.3929543512101535, + "learning_rate": 1.998129770636381e-05, + "loss": 1.3692, "step": 430 }, { - "epoch": 0.12471330275229357, - "grad_norm": 4.539584147379229, - "learning_rate": 1.996298174934608e-05, - "loss": 1.1717, + "epoch": 0.11908020804817958, + "grad_norm": 0.40163841769109054, + "learning_rate": 1.9978262423747003e-05, + "loss": 1.3842, "step": 435 }, { - "epoch": 0.12614678899082568, - "grad_norm": 9.545718854729863, - "learning_rate": 1.9958555234417035e-05, - "loss": 1.1879, + "epoch": 0.12044894607172187, + "grad_norm": 0.40750648380479343, + "learning_rate": 1.997499926730593e-05, + "loss": 1.4412, "step": 440 }, { - "epoch": 0.1275802752293578, - "grad_norm": 22.053598752620175, - "learning_rate": 1.995387934469479e-05, - "loss": 1.172, + "epoch": 0.12181768409526417, + "grad_norm": 0.4029670313800167, + "learning_rate": 1.9971508311561373e-05, + "loss": 1.4481, "step": 445 }, { - "epoch": 0.1290137614678899, - "grad_norm": 12.911881630750296, - "learning_rate": 1.994895419726953e-05, - "loss": 1.2034, + "epoch": 0.12318642211880645, + "grad_norm": 0.3934576991168861, + "learning_rate": 1.996778963623637e-05, + "loss": 1.3706, "step": 450 }, { - "epoch": 0.13044724770642202, - "grad_norm": 4.856805114224933, - "learning_rate": 1.9943779915473165e-05, - "loss": 1.1795, + "epoch": 0.12455516014234876, + "grad_norm": 0.4126991514447529, + "learning_rate": 1.9963843326254406e-05, + "loss": 1.4096, "step": 455 }, { - "epoch": 0.13188073394495411, - "grad_norm": 4.286105662410557, - "learning_rate": 1.9938356628876235e-05, - "loss": 1.1866, + "epoch": 0.12592389816589106, + "grad_norm": 0.400896785065692, + "learning_rate": 1.9959669471737456e-05, + "loss": 1.3284, "step": 460 }, { - "epoch": 0.13331422018348624, - "grad_norm": 4.2624287742900995, - "learning_rate": 1.9932684473284687e-05, - "loss": 1.1894, + "epoch": 0.12729263618943334, + "grad_norm": 0.4084884002554967, + "learning_rate": 1.9955268168003938e-05, + "loss": 1.4346, "step": 465 }, { - "epoch": 0.13474770642201836, - "grad_norm": 4.716264114725749, - "learning_rate": 1.9926763590736457e-05, - "loss": 1.2835, + "epoch": 0.12866137421297563, + "grad_norm": 0.4031311283565323, + "learning_rate": 1.9950639515566537e-05, + "loss": 1.3632, "step": 470 }, { - "epoch": 0.13618119266055045, - "grad_norm": 4.2462330499210115, - "learning_rate": 1.992059412949791e-05, - "loss": 1.2412, + "epoch": 0.13003011223651792, + "grad_norm": 0.4017916786924823, + "learning_rate": 1.99457836201299e-05, + "loss": 1.4083, "step": 475 }, { - "epoch": 0.13761467889908258, - "grad_norm": 4.391420347453553, - "learning_rate": 1.9914176244060158e-05, - "loss": 1.1516, + "epoch": 0.13139885026006023, + "grad_norm": 0.38801834482390074, + "learning_rate": 1.9940700592588228e-05, + "loss": 1.3823, "step": 480 }, { - "epoch": 0.13904816513761467, - "grad_norm": 75.66535297260735, - "learning_rate": 1.9907510095135142e-05, - "loss": 1.2722, + "epoch": 0.13276758828360252, + "grad_norm": 0.40725954819087723, + "learning_rate": 1.993539054902275e-05, + "loss": 1.369, "step": 485 }, { - "epoch": 0.1404816513761468, - "grad_norm": 5.310902667535963, - "learning_rate": 1.9900595849651645e-05, - "loss": 1.2421, + "epoch": 0.1341363263071448, + "grad_norm": 0.42191731863948084, + "learning_rate": 1.992985361069906e-05, + "loss": 1.3907, "step": 490 }, { - "epoch": 0.1419151376146789, - "grad_norm": 5.025394485398511, - "learning_rate": 1.9893433680751105e-05, - "loss": 1.2633, + "epoch": 0.1355050643306871, + "grad_norm": 0.3981980013765845, + "learning_rate": 1.9924089904064354e-05, + "loss": 1.4253, "step": 495 }, { - "epoch": 0.143348623853211, - "grad_norm": 5.520994924491881, - "learning_rate": 1.988602376778327e-05, - "loss": 1.189, + "epoch": 0.1368738023542294, + "grad_norm": 0.3968358847514907, + "learning_rate": 1.9918099560744545e-05, + "loss": 1.4478, "step": 500 }, { - "epoch": 0.14478211009174313, - "grad_norm": 4.515316969142533, - "learning_rate": 1.9878366296301713e-05, - "loss": 1.1607, + "epoch": 0.1382425403777717, + "grad_norm": 0.4020407861005804, + "learning_rate": 1.991188271754125e-05, + "loss": 1.4063, "step": 505 }, { - "epoch": 0.14621559633027523, - "grad_norm": 4.532857835025907, - "learning_rate": 1.9870461458059188e-05, - "loss": 1.2315, + "epoch": 0.139611278401314, + "grad_norm": 0.3983008409093381, + "learning_rate": 1.990543951642866e-05, + "loss": 1.3838, "step": 510 }, { - "epoch": 0.14764908256880735, - "grad_norm": 4.001206919662932, - "learning_rate": 1.9862309451002827e-05, - "loss": 1.1659, + "epoch": 0.14098001642485627, + "grad_norm": 0.38772247745474425, + "learning_rate": 1.9898770104550335e-05, + "loss": 1.4193, "step": 515 }, { - "epoch": 0.14908256880733944, - "grad_norm": 4.137717431705149, - "learning_rate": 1.9853910479269165e-05, - "loss": 1.1732, + "epoch": 0.1423487544483986, + "grad_norm": 0.4172192092889838, + "learning_rate": 1.9891874634215784e-05, + "loss": 1.3831, "step": 520 }, { - "epoch": 0.15051605504587157, - "grad_norm": 4.367905503345653, - "learning_rate": 1.9845264753179064e-05, - "loss": 1.2219, + "epoch": 0.14371749247194088, + "grad_norm": 0.39316534732966774, + "learning_rate": 1.9884753262897042e-05, + "loss": 1.3813, "step": 525 }, { - "epoch": 0.15194954128440366, - "grad_norm": 3.942691834220351, - "learning_rate": 1.9836372489232416e-05, - "loss": 1.1208, + "epoch": 0.14508623049548316, + "grad_norm": 0.40825756476362335, + "learning_rate": 1.9877406153225028e-05, + "loss": 1.4233, "step": 530 }, { - "epoch": 0.15338302752293578, - "grad_norm": 4.028647286087348, - "learning_rate": 1.982723391010273e-05, - "loss": 1.1499, + "epoch": 0.14645496851902545, + "grad_norm": 0.4303738354233706, + "learning_rate": 1.9869833472985882e-05, + "loss": 1.4267, "step": 535 }, { - "epoch": 0.15481651376146788, - "grad_norm": 4.262774154684462, - "learning_rate": 1.9817849244631575e-05, - "loss": 1.2052, + "epoch": 0.14782370654256774, + "grad_norm": 0.39536263390575654, + "learning_rate": 1.9862035395117075e-05, + "loss": 1.3688, "step": 540 }, { - "epoch": 0.15625, - "grad_norm": 4.0350996506196175, - "learning_rate": 1.9808218727822808e-05, - "loss": 1.1899, + "epoch": 0.14919244456611006, + "grad_norm": 0.39303998157704906, + "learning_rate": 1.9854012097703515e-05, + "loss": 1.4259, "step": 545 }, { - "epoch": 0.15768348623853212, - "grad_norm": 3.806300412200292, - "learning_rate": 1.979834260083673e-05, - "loss": 1.2099, + "epoch": 0.15056118258965234, + "grad_norm": 0.3939411519174778, + "learning_rate": 1.9845763763973433e-05, + "loss": 1.354, "step": 550 }, { - "epoch": 0.15911697247706422, - "grad_norm": 4.447162495747635, - "learning_rate": 1.9788221110984026e-05, - "loss": 1.2123, + "epoch": 0.15192992061319463, + "grad_norm": 0.39277763892363776, + "learning_rate": 1.9837290582294233e-05, + "loss": 1.3468, "step": 555 }, { - "epoch": 0.16055045871559634, - "grad_norm": 4.094827599966712, - "learning_rate": 1.977785451171958e-05, - "loss": 1.1411, + "epoch": 0.15329865863673692, + "grad_norm": 0.39121990838351217, + "learning_rate": 1.9828592746168172e-05, + "loss": 1.3508, "step": 560 }, { - "epoch": 0.16198394495412843, - "grad_norm": 4.6429132125696535, - "learning_rate": 1.9767243062636122e-05, - "loss": 1.1914, + "epoch": 0.15466739666027923, + "grad_norm": 0.4035101254769364, + "learning_rate": 1.981967045422795e-05, + "loss": 1.369, "step": 565 }, { - "epoch": 0.16341743119266056, - "grad_norm": 3.9726685572488227, - "learning_rate": 1.975638702945773e-05, - "loss": 1.1758, + "epoch": 0.15603613468382152, + "grad_norm": 0.38407623227911974, + "learning_rate": 1.9810523910232165e-05, + "loss": 1.4039, "step": 570 }, { - "epoch": 0.16485091743119265, - "grad_norm": 3.9582277412235367, - "learning_rate": 1.974528668403318e-05, - "loss": 1.1839, + "epoch": 0.1574048727073638, + "grad_norm": 0.3915340445821982, + "learning_rate": 1.9801153323060667e-05, + "loss": 1.3634, "step": 575 }, { - "epoch": 0.16628440366972477, - "grad_norm": 4.611199474059946, - "learning_rate": 1.973394230432913e-05, - "loss": 1.2155, + "epoch": 0.1587736107309061, + "grad_norm": 0.39241210575593644, + "learning_rate": 1.9791558906709787e-05, + "loss": 1.4257, "step": 580 }, { - "epoch": 0.16771788990825687, - "grad_norm": 3.964842824608755, - "learning_rate": 1.972235417442317e-05, - "loss": 1.168, + "epoch": 0.1601423487544484, + "grad_norm": 0.3946963973614525, + "learning_rate": 1.9781740880287444e-05, + "loss": 1.3136, "step": 585 }, { - "epoch": 0.169151376146789, - "grad_norm": 4.083964710145448, - "learning_rate": 1.9710522584496695e-05, - "loss": 1.2135, + "epoch": 0.1615110867779907, + "grad_norm": 0.39148042578465725, + "learning_rate": 1.9771699468008156e-05, + "loss": 1.3654, "step": 590 }, { - "epoch": 0.1705848623853211, - "grad_norm": 3.8905730852414155, - "learning_rate": 1.9698447830827655e-05, - "loss": 1.1893, + "epoch": 0.16287982480153299, + "grad_norm": 0.3895106088217061, + "learning_rate": 1.9761434899187893e-05, + "loss": 1.3762, "step": 595 }, { - "epoch": 0.1720183486238532, - "grad_norm": 4.399950123881084, - "learning_rate": 1.9686130215783124e-05, - "loss": 1.2244, + "epoch": 0.16424856282507527, + "grad_norm": 0.3997937267692177, + "learning_rate": 1.9750947408238872e-05, + "loss": 1.4064, "step": 600 }, { - "epoch": 0.17345183486238533, - "grad_norm": 4.3066835429404495, - "learning_rate": 1.967357004781173e-05, - "loss": 1.1532, + "epoch": 0.1656173008486176, + "grad_norm": 0.4002294154735745, + "learning_rate": 1.974023723466418e-05, + "loss": 1.3771, "step": 605 }, { - "epoch": 0.17488532110091742, - "grad_norm": 5.071390390632784, - "learning_rate": 1.9660767641435926e-05, - "loss": 1.2137, + "epoch": 0.16698603887215988, + "grad_norm": 0.40550620529076886, + "learning_rate": 1.9729304623052315e-05, + "loss": 1.4551, "step": 610 }, { - "epoch": 0.17631880733944955, - "grad_norm": 3.738266843064544, - "learning_rate": 1.964772331724414e-05, - "loss": 1.165, + "epoch": 0.16835477689570216, + "grad_norm": 0.40410760524836825, + "learning_rate": 1.9718149823071592e-05, + "loss": 1.3394, "step": 615 }, { - "epoch": 0.17775229357798164, - "grad_norm": 3.885352429025116, - "learning_rate": 1.9634437401882707e-05, - "loss": 1.1899, + "epoch": 0.16972351491924445, + "grad_norm": 0.40142195594131513, + "learning_rate": 1.970677308946446e-05, + "loss": 1.3872, "step": 620 }, { - "epoch": 0.17918577981651376, - "grad_norm": 3.653935761439161, - "learning_rate": 1.962091022804772e-05, - "loss": 1.1506, + "epoch": 0.17109225294278674, + "grad_norm": 0.4041136169142662, + "learning_rate": 1.9695174682041652e-05, + "loss": 1.3644, "step": 625 }, { - "epoch": 0.18061926605504589, - "grad_norm": 4.559227033021131, - "learning_rate": 1.960714213447668e-05, - "loss": 1.1645, + "epoch": 0.17246099096632905, + "grad_norm": 0.39436310577761385, + "learning_rate": 1.9683354865676298e-05, + "loss": 1.434, "step": 630 }, { - "epoch": 0.18205275229357798, - "grad_norm": 4.618925411728112, - "learning_rate": 1.959313346594004e-05, - "loss": 1.1802, + "epoch": 0.17382972898987134, + "grad_norm": 0.400517293758074, + "learning_rate": 1.9671313910297826e-05, + "loss": 1.3941, "step": 635 }, { - "epoch": 0.1834862385321101, - "grad_norm": 4.308837236054774, - "learning_rate": 1.9578884573232538e-05, - "loss": 1.2023, + "epoch": 0.17519846701341363, + "grad_norm": 0.4059408812570113, + "learning_rate": 1.9659052090885834e-05, + "loss": 1.3907, "step": 640 }, { - "epoch": 0.1849197247706422, - "grad_norm": 4.304427359268019, - "learning_rate": 1.9564395813164428e-05, - "loss": 1.2244, + "epoch": 0.17656720503695592, + "grad_norm": 0.3929001302562383, + "learning_rate": 1.9646569687463796e-05, + "loss": 1.351, "step": 645 }, { - "epoch": 0.18635321100917432, - "grad_norm": 3.486413458583046, - "learning_rate": 1.9549667548552557e-05, - "loss": 1.2111, + "epoch": 0.17793594306049823, + "grad_norm": 0.4090718300758474, + "learning_rate": 1.9633866985092655e-05, + "loss": 1.4102, "step": 650 }, { - "epoch": 0.1877866972477064, - "grad_norm": 3.7640427311235403, - "learning_rate": 1.9534700148211255e-05, - "loss": 1.1956, + "epoch": 0.17930468108404052, + "grad_norm": 0.40995200812054255, + "learning_rate": 1.9620944273864343e-05, + "loss": 1.3575, "step": 655 }, { - "epoch": 0.18922018348623854, - "grad_norm": 4.222766727318349, - "learning_rate": 1.9519493986943125e-05, - "loss": 1.1996, + "epoch": 0.1806734191075828, + "grad_norm": 0.40276642674265933, + "learning_rate": 1.960780184889514e-05, + "loss": 1.3941, "step": 660 }, { - "epoch": 0.19065366972477063, - "grad_norm": 3.7868143111703256, - "learning_rate": 1.9504049445529632e-05, - "loss": 1.2093, + "epoch": 0.1820421571311251, + "grad_norm": 0.400136397383414, + "learning_rate": 1.9594440010318924e-05, + "loss": 1.4171, "step": 665 }, { - "epoch": 0.19208715596330275, - "grad_norm": 3.579431474524795, - "learning_rate": 1.94883669107216e-05, - "loss": 1.1785, + "epoch": 0.1834108951546674, + "grad_norm": 0.40872717576906, + "learning_rate": 1.9580859063280326e-05, + "loss": 1.391, "step": 670 }, { - "epoch": 0.19352064220183487, - "grad_norm": 4.14899759337832, - "learning_rate": 1.9472446775229486e-05, - "loss": 1.2402, + "epoch": 0.1847796331782097, + "grad_norm": 0.39405384932557985, + "learning_rate": 1.956705931792777e-05, + "loss": 1.4042, "step": 675 }, { - "epoch": 0.19495412844036697, - "grad_norm": 3.7259985492864804, - "learning_rate": 1.9456289437713578e-05, - "loss": 1.1713, + "epoch": 0.18614837120175198, + "grad_norm": 0.396632571946212, + "learning_rate": 1.9553041089406387e-05, + "loss": 1.3598, "step": 680 }, { - "epoch": 0.1963876146788991, - "grad_norm": 4.525451494816849, - "learning_rate": 1.9439895302774007e-05, - "loss": 1.2179, + "epoch": 0.18751710922529427, + "grad_norm": 0.39535225715229844, + "learning_rate": 1.95388046978508e-05, + "loss": 1.3652, "step": 685 }, { - "epoch": 0.1978211009174312, - "grad_norm": 3.9639891914703185, - "learning_rate": 1.9423264780940602e-05, - "loss": 1.2163, + "epoch": 0.18888584724883659, + "grad_norm": 0.39747240968211434, + "learning_rate": 1.9524350468377828e-05, + "loss": 1.3857, "step": 690 }, { - "epoch": 0.1992545871559633, - "grad_norm": 3.8210369748297612, - "learning_rate": 1.940639828866262e-05, - "loss": 1.2462, + "epoch": 0.19025458527237887, + "grad_norm": 0.4045442697980633, + "learning_rate": 1.9509678731079074e-05, + "loss": 1.3724, "step": 695 }, { - "epoch": 0.2006880733944954, - "grad_norm": 4.499363334347953, - "learning_rate": 1.938929624829832e-05, - "loss": 1.2572, + "epoch": 0.19162332329592116, + "grad_norm": 0.39648112930634666, + "learning_rate": 1.949478982101336e-05, + "loss": 1.3642, "step": 700 }, { - "epoch": 0.20212155963302753, - "grad_norm": 32.95893389851377, - "learning_rate": 1.937195908810438e-05, - "loss": 1.2332, + "epoch": 0.19299206131946345, + "grad_norm": 0.3966559822195587, + "learning_rate": 1.947968407819909e-05, + "loss": 1.3704, "step": 705 }, { - "epoch": 0.20355504587155962, - "grad_norm": 4.056487188350767, - "learning_rate": 1.935438724222517e-05, - "loss": 1.1797, + "epoch": 0.19436079934300574, + "grad_norm": 0.3803799928500003, + "learning_rate": 1.9464361847606486e-05, + "loss": 1.3718, "step": 710 }, { - "epoch": 0.20498853211009174, - "grad_norm": 3.940310991182425, - "learning_rate": 1.93365811506819e-05, - "loss": 1.2315, + "epoch": 0.19572953736654805, + "grad_norm": 0.3920580621341347, + "learning_rate": 1.9448823479149705e-05, + "loss": 1.3994, "step": 715 }, { - "epoch": 0.20642201834862386, - "grad_norm": 3.92517292280482, - "learning_rate": 1.9318541259361573e-05, - "loss": 1.1824, + "epoch": 0.19709827539009034, + "grad_norm": 0.38159551727148144, + "learning_rate": 1.9433069327678847e-05, + "loss": 1.3539, "step": 720 }, { - "epoch": 0.20785550458715596, - "grad_norm": 4.109451546798357, - "learning_rate": 1.9300268020005832e-05, - "loss": 1.2375, + "epoch": 0.19846701341363263, + "grad_norm": 0.3979848267470623, + "learning_rate": 1.9417099752971858e-05, + "loss": 1.3824, "step": 725 }, { - "epoch": 0.20928899082568808, - "grad_norm": 4.168329616635684, - "learning_rate": 1.9281761890199666e-05, - "loss": 1.1812, + "epoch": 0.1998357514371749, + "grad_norm": 0.36228811525683013, + "learning_rate": 1.9400915119726305e-05, + "loss": 1.2942, "step": 730 }, { - "epoch": 0.21072247706422018, - "grad_norm": 3.8618338414153435, - "learning_rate": 1.9263023333359918e-05, - "loss": 1.1903, + "epoch": 0.20120448946071723, + "grad_norm": 0.40336839547536973, + "learning_rate": 1.938451579755106e-05, + "loss": 1.3305, "step": 735 }, { - "epoch": 0.2121559633027523, - "grad_norm": 4.070643005805358, - "learning_rate": 1.9244052818723706e-05, - "loss": 1.2539, + "epoch": 0.20257322748425952, + "grad_norm": 0.4093706132368206, + "learning_rate": 1.9367902160957843e-05, + "loss": 1.4047, "step": 740 }, { - "epoch": 0.2135894495412844, - "grad_norm": 3.9207051720216004, - "learning_rate": 1.9224850821336664e-05, - "loss": 1.2375, + "epoch": 0.2039419655078018, + "grad_norm": 0.4012678664539993, + "learning_rate": 1.9351074589352684e-05, + "loss": 1.34, "step": 745 }, { - "epoch": 0.21502293577981652, - "grad_norm": 4.14495928553583, - "learning_rate": 1.920541782204104e-05, - "loss": 1.253, + "epoch": 0.2053107035313441, + "grad_norm": 0.3982341789573293, + "learning_rate": 1.933403346702725e-05, + "loss": 1.3518, "step": 750 }, { - "epoch": 0.21645642201834864, - "grad_norm": 4.210904171053472, - "learning_rate": 1.918575430746367e-05, - "loss": 1.1987, + "epoch": 0.2066794415548864, + "grad_norm": 0.3895851509088621, + "learning_rate": 1.931677918315007e-05, + "loss": 1.3698, "step": 755 }, { - "epoch": 0.21788990825688073, - "grad_norm": 3.6674431622452883, - "learning_rate": 1.9165860770003774e-05, - "loss": 1.2427, + "epoch": 0.2080481795784287, + "grad_norm": 0.40873783451123424, + "learning_rate": 1.9299312131757645e-05, + "loss": 1.3768, "step": 760 }, { - "epoch": 0.21932339449541285, - "grad_norm": 3.7503035449224127, - "learning_rate": 1.914573770782065e-05, - "loss": 1.1564, + "epoch": 0.20941691760197098, + "grad_norm": 0.38863227885235524, + "learning_rate": 1.928163271174546e-05, + "loss": 1.368, "step": 765 }, { - "epoch": 0.22075688073394495, - "grad_norm": 4.206820549333469, - "learning_rate": 1.9125385624821162e-05, - "loss": 1.2026, + "epoch": 0.21078565562551327, + "grad_norm": 0.40218624042286666, + "learning_rate": 1.9263741326858866e-05, + "loss": 1.4002, "step": 770 }, { - "epoch": 0.22219036697247707, - "grad_norm": 4.079050524248544, - "learning_rate": 1.9104805030647164e-05, - "loss": 1.2521, + "epoch": 0.21215439364905558, + "grad_norm": 0.38750285278794777, + "learning_rate": 1.9245638385683857e-05, + "loss": 1.3808, "step": 775 }, { - "epoch": 0.22362385321100917, - "grad_norm": 3.8439532493920585, - "learning_rate": 1.908399644066272e-05, - "loss": 1.2247, + "epoch": 0.21352313167259787, + "grad_norm": 0.39332958062090156, + "learning_rate": 1.9227324301637747e-05, + "loss": 1.3991, "step": 780 }, { - "epoch": 0.2250573394495413, - "grad_norm": 4.419153034660572, - "learning_rate": 1.906296037594117e-05, - "loss": 1.2827, + "epoch": 0.21489186969614016, + "grad_norm": 0.39759945858946194, + "learning_rate": 1.9208799492959723e-05, + "loss": 1.3765, "step": 785 }, { - "epoch": 0.22649082568807338, - "grad_norm": 4.802038277224569, - "learning_rate": 1.904169736325215e-05, - "loss": 1.262, + "epoch": 0.21626060771968245, + "grad_norm": 0.3779639266645249, + "learning_rate": 1.9190064382701296e-05, + "loss": 1.378, "step": 790 }, { - "epoch": 0.2279243119266055, - "grad_norm": 4.284188903637777, - "learning_rate": 1.9020207935048317e-05, - "loss": 1.1532, + "epoch": 0.21762934574322473, + "grad_norm": 0.3980685187213535, + "learning_rate": 1.917111939871664e-05, + "loss": 1.3559, "step": 795 }, { - "epoch": 0.22935779816513763, - "grad_norm": 3.639715866595117, - "learning_rate": 1.8998492629452087e-05, - "loss": 1.2381, + "epoch": 0.21899808376676705, + "grad_norm": 0.3955990129038042, + "learning_rate": 1.915196497365282e-05, + "loss": 1.3688, "step": 800 }, { - "epoch": 0.23079128440366972, - "grad_norm": 3.741005239883841, - "learning_rate": 1.8976551990242122e-05, - "loss": 1.216, + "epoch": 0.22036682179030934, + "grad_norm": 0.3814772041353902, + "learning_rate": 1.9132601544939914e-05, + "loss": 1.3106, "step": 805 }, { - "epoch": 0.23222477064220184, - "grad_norm": 3.6139511602383276, - "learning_rate": 1.895438656683972e-05, - "loss": 1.2473, + "epoch": 0.22173555981385162, + "grad_norm": 0.4033095650227253, + "learning_rate": 1.9113029554781014e-05, + "loss": 1.3389, "step": 810 }, { - "epoch": 0.23365825688073394, - "grad_norm": 3.8300619382359846, - "learning_rate": 1.8931996914295065e-05, - "loss": 1.1882, + "epoch": 0.2231042978373939, + "grad_norm": 0.3934860150762494, + "learning_rate": 1.9093249450142144e-05, + "loss": 1.3343, "step": 815 }, { - "epoch": 0.23509174311926606, - "grad_norm": 3.943889343109655, - "learning_rate": 1.8909383593273317e-05, - "loss": 1.2444, + "epoch": 0.22447303586093623, + "grad_norm": 0.4008166941226203, + "learning_rate": 1.907326168274204e-05, + "loss": 1.3627, "step": 820 }, { - "epoch": 0.23652522935779816, - "grad_norm": 4.115538048586424, - "learning_rate": 1.8886547170040575e-05, - "loss": 1.233, + "epoch": 0.2258417738844785, + "grad_norm": 0.38863913805386574, + "learning_rate": 1.905306670904184e-05, + "loss": 1.3978, "step": 825 }, { - "epoch": 0.23795871559633028, - "grad_norm": 5.787154468595948, - "learning_rate": 1.8863488216449702e-05, - "loss": 1.2236, + "epoch": 0.2272105119080208, + "grad_norm": 0.39887884072448215, + "learning_rate": 1.9032664990234648e-05, + "loss": 1.3548, "step": 830 }, { - "epoch": 0.23939220183486237, - "grad_norm": 3.778052893153268, - "learning_rate": 1.8840207309926003e-05, - "loss": 1.2286, + "epoch": 0.2285792499315631, + "grad_norm": 0.3942528539065832, + "learning_rate": 1.9012056992235025e-05, + "loss": 1.3589, "step": 835 }, { - "epoch": 0.2408256880733945, - "grad_norm": 3.435004162216508, - "learning_rate": 1.881670503345277e-05, - "loss": 1.1868, + "epoch": 0.2299479879551054, + "grad_norm": 0.4003093485695292, + "learning_rate": 1.899124318566832e-05, + "loss": 1.4008, "step": 840 }, { - "epoch": 0.24225917431192662, - "grad_norm": 5.261326157501783, - "learning_rate": 1.879298197555666e-05, - "loss": 1.2486, + "epoch": 0.2313167259786477, + "grad_norm": 0.38851656838431264, + "learning_rate": 1.897022404585996e-05, + "loss": 1.3396, "step": 845 }, { - "epoch": 0.2436926605504587, - "grad_norm": 3.651324240819765, - "learning_rate": 1.8769038730292993e-05, - "loss": 1.2383, + "epoch": 0.23268546400218998, + "grad_norm": 0.39282411404685663, + "learning_rate": 1.894900005282454e-05, + "loss": 1.3118, "step": 850 }, { - "epoch": 0.24512614678899083, - "grad_norm": 4.834151575695704, - "learning_rate": 1.8744875897230853e-05, - "loss": 1.2592, + "epoch": 0.23405420202573227, + "grad_norm": 0.3995600743628729, + "learning_rate": 1.892757169125492e-05, + "loss": 1.3261, "step": 855 }, { - "epoch": 0.24655963302752293, - "grad_norm": 3.9673796319727606, - "learning_rate": 1.872049408143808e-05, - "loss": 1.2135, + "epoch": 0.23542294004927458, + "grad_norm": 0.40585943733175783, + "learning_rate": 1.8905939450511117e-05, + "loss": 1.2896, "step": 860 }, { - "epoch": 0.24799311926605505, - "grad_norm": 3.9495356227550293, - "learning_rate": 1.869589389346611e-05, - "loss": 1.2443, + "epoch": 0.23679167807281687, + "grad_norm": 0.38676778157652897, + "learning_rate": 1.888410382460915e-05, + "loss": 1.3769, "step": 865 }, { - "epoch": 0.24942660550458715, - "grad_norm": 3.8171283870857917, - "learning_rate": 1.8671075949334713e-05, - "loss": 1.1502, + "epoch": 0.23816041609635916, + "grad_norm": 0.4006135132967053, + "learning_rate": 1.8862065312209735e-05, + "loss": 1.3744, "step": 870 }, { - "epoch": 0.25086009174311924, - "grad_norm": 3.594654406802359, - "learning_rate": 1.8646040870516526e-05, - "loss": 1.1831, + "epoch": 0.23952915411990144, + "grad_norm": 0.38183538010624907, + "learning_rate": 1.8839824416606932e-05, + "loss": 1.3994, "step": 875 }, { - "epoch": 0.25229357798165136, - "grad_norm": 3.5772784131514337, - "learning_rate": 1.862078928392153e-05, - "loss": 1.2116, + "epoch": 0.24089789214344373, + "grad_norm": 0.40012172515979694, + "learning_rate": 1.8817381645716613e-05, + "loss": 1.3736, "step": 880 }, { - "epoch": 0.2537270642201835, - "grad_norm": 4.016497035159711, - "learning_rate": 1.8595321821881322e-05, - "loss": 1.2156, + "epoch": 0.24226663016698605, + "grad_norm": 0.4036320450863606, + "learning_rate": 1.879473751206489e-05, + "loss": 1.3679, "step": 885 }, { - "epoch": 0.2551605504587156, - "grad_norm": 3.6584048014823622, - "learning_rate": 1.8569639122133304e-05, - "loss": 1.1687, + "epoch": 0.24363536819052833, + "grad_norm": 0.39250548017399456, + "learning_rate": 1.8771892532776406e-05, + "loss": 1.352, "step": 890 }, { - "epoch": 0.25659403669724773, - "grad_norm": 4.092581816708313, - "learning_rate": 1.8543741827804685e-05, - "loss": 1.2433, + "epoch": 0.24500410621407062, + "grad_norm": 0.384618237208868, + "learning_rate": 1.8748847229562504e-05, + "loss": 1.3415, "step": 895 }, { - "epoch": 0.2580275229357798, - "grad_norm": 10.965320915487338, - "learning_rate": 1.8517630587396413e-05, - "loss": 1.2365, + "epoch": 0.2463728442376129, + "grad_norm": 0.3847231065307316, + "learning_rate": 1.8725602128709348e-05, + "loss": 1.436, "step": 900 }, { - "epoch": 0.2594610091743119, - "grad_norm": 3.744425931996878, - "learning_rate": 1.8491306054766907e-05, - "loss": 1.2315, + "epoch": 0.24774158226115522, + "grad_norm": 0.39329176880661687, + "learning_rate": 1.8702157761065877e-05, + "loss": 1.344, "step": 905 }, { - "epoch": 0.26089449541284404, - "grad_norm": 4.446159019549649, - "learning_rate": 1.8464768889115684e-05, - "loss": 1.165, + "epoch": 0.2491103202846975, + "grad_norm": 0.4202514383981607, + "learning_rate": 1.8678514662031688e-05, + "loss": 1.3384, "step": 910 }, { - "epoch": 0.26232798165137616, - "grad_norm": 3.4122914752905227, - "learning_rate": 1.8438019754966877e-05, - "loss": 1.1715, + "epoch": 0.2504790583082398, + "grad_norm": 0.3866724615350219, + "learning_rate": 1.8654673371544815e-05, + "loss": 1.3374, "step": 915 }, { - "epoch": 0.26376146788990823, - "grad_norm": 3.681090000606899, - "learning_rate": 1.841105932215256e-05, - "loss": 1.2197, + "epoch": 0.2518477963317821, + "grad_norm": 0.4109333525553728, + "learning_rate": 1.8630634434069397e-05, + "loss": 1.3949, "step": 920 }, { - "epoch": 0.26519495412844035, - "grad_norm": 3.723633399780894, - "learning_rate": 1.838388826579601e-05, - "loss": 1.2308, + "epoch": 0.2532165343553244, + "grad_norm": 0.3782421450027903, + "learning_rate": 1.860639839858324e-05, + "loss": 1.3162, "step": 925 }, { - "epoch": 0.2666284403669725, - "grad_norm": 3.79765489569063, - "learning_rate": 1.835650726629477e-05, - "loss": 1.2388, + "epoch": 0.2545852723788667, + "grad_norm": 0.3909532902784873, + "learning_rate": 1.8581965818565278e-05, + "loss": 1.3829, "step": 930 }, { - "epoch": 0.2680619266055046, - "grad_norm": 3.636387647388923, - "learning_rate": 1.8328917009303634e-05, - "loss": 1.2296, + "epoch": 0.255954010402409, + "grad_norm": 0.4051170609748319, + "learning_rate": 1.855733725198295e-05, + "loss": 1.3462, "step": 935 }, { - "epoch": 0.2694954128440367, - "grad_norm": 3.696962989830932, - "learning_rate": 1.830111818571745e-05, - "loss": 1.2048, + "epoch": 0.25732274842595126, + "grad_norm": 0.39349552836368845, + "learning_rate": 1.8532513261279433e-05, + "loss": 1.4015, "step": 940 }, { - "epoch": 0.2709288990825688, - "grad_norm": 3.896582872078181, - "learning_rate": 1.8273111491653867e-05, - "loss": 1.2522, + "epoch": 0.2586914864494936, + "grad_norm": 0.39347934180028793, + "learning_rate": 1.8507494413360808e-05, + "loss": 1.3367, "step": 945 }, { - "epoch": 0.2723623853211009, - "grad_norm": 3.7244583842983454, - "learning_rate": 1.824489762843584e-05, - "loss": 1.1896, + "epoch": 0.26006022447303584, + "grad_norm": 0.4158855876544813, + "learning_rate": 1.848228127958312e-05, + "loss": 1.353, "step": 950 }, { - "epoch": 0.27379587155963303, - "grad_norm": 3.66702244239725, - "learning_rate": 1.821647730257413e-05, - "loss": 1.1777, + "epoch": 0.26142896249657815, + "grad_norm": 0.3793682443730692, + "learning_rate": 1.8456874435739337e-05, + "loss": 1.3398, "step": 955 }, { - "epoch": 0.27522935779816515, - "grad_norm": 6.778664388382065, - "learning_rate": 1.818785122574956e-05, - "loss": 1.2239, + "epoch": 0.26279770052012047, + "grad_norm": 0.38047885709566037, + "learning_rate": 1.843127446204616e-05, + "loss": 1.329, "step": 960 }, { - "epoch": 0.2766628440366973, - "grad_norm": 3.5798872160414095, - "learning_rate": 1.8159020114795226e-05, - "loss": 1.1532, + "epoch": 0.26416643854366273, + "grad_norm": 0.3991275596683987, + "learning_rate": 1.8405481943130827e-05, + "loss": 1.3115, "step": 965 }, { - "epoch": 0.27809633027522934, - "grad_norm": 4.080407231530825, - "learning_rate": 1.8129984691678547e-05, - "loss": 1.2046, + "epoch": 0.26553517656720504, + "grad_norm": 0.389649193690539, + "learning_rate": 1.8379497468017726e-05, + "loss": 1.3746, "step": 970 }, { - "epoch": 0.27952981651376146, - "grad_norm": 3.9179230686635234, - "learning_rate": 1.8100745683483168e-05, - "loss": 1.254, + "epoch": 0.2669039145907473, + "grad_norm": 0.40470694231468296, + "learning_rate": 1.8353321630114952e-05, + "loss": 1.3354, "step": 975 }, { - "epoch": 0.2809633027522936, - "grad_norm": 4.97899730667325, - "learning_rate": 1.807130382239075e-05, - "loss": 1.2084, + "epoch": 0.2682726526142896, + "grad_norm": 0.40734933283960667, + "learning_rate": 1.832695502720076e-05, + "loss": 1.3295, "step": 980 }, { - "epoch": 0.2823967889908257, - "grad_norm": 3.8642747207026242, - "learning_rate": 1.8041659845662663e-05, - "loss": 1.2014, + "epoch": 0.26964139063783193, + "grad_norm": 0.39323434740751423, + "learning_rate": 1.8300398261409912e-05, + "loss": 1.3069, "step": 985 }, { - "epoch": 0.2838302752293578, - "grad_norm": 3.4483181116099964, - "learning_rate": 1.8011814495621506e-05, - "loss": 1.1435, + "epoch": 0.2710101286613742, + "grad_norm": 0.3837841492160018, + "learning_rate": 1.8273651939219914e-05, + "loss": 1.3543, "step": 990 }, { - "epoch": 0.2852637614678899, - "grad_norm": 3.8042720182477434, - "learning_rate": 1.798176851963251e-05, - "loss": 1.2047, + "epoch": 0.2723788666849165, + "grad_norm": 0.38795485386053835, + "learning_rate": 1.8246716671437186e-05, + "loss": 1.3798, "step": 995 }, { - "epoch": 0.286697247706422, - "grad_norm": 3.483685408627929, - "learning_rate": 1.7951522670084847e-05, - "loss": 1.171, + "epoch": 0.2737476047084588, + "grad_norm": 0.38509661706339016, + "learning_rate": 1.8219593073183106e-05, + "loss": 1.3604, "step": 1000 }, { - "epoch": 0.28813073394495414, - "grad_norm": 3.487052716444198, - "learning_rate": 1.792107770437276e-05, - "loss": 1.2396, + "epoch": 0.2751163427320011, + "grad_norm": 0.3882331224756399, + "learning_rate": 1.8192281763879946e-05, + "loss": 1.3417, "step": 1005 }, { - "epoch": 0.28956422018348627, - "grad_norm": 3.553005650055509, - "learning_rate": 1.789043438487662e-05, - "loss": 1.1149, + "epoch": 0.2764850807555434, + "grad_norm": 0.3842715356979672, + "learning_rate": 1.816478336723675e-05, + "loss": 1.4061, "step": 1010 }, { - "epoch": 0.29099770642201833, - "grad_norm": 3.5693421017081306, - "learning_rate": 1.785959347894383e-05, - "loss": 1.2387, + "epoch": 0.27785381877908566, + "grad_norm": 0.4013303335538485, + "learning_rate": 1.8137098511235084e-05, + "loss": 1.3509, "step": 1015 }, { - "epoch": 0.29243119266055045, - "grad_norm": 3.535948224388696, - "learning_rate": 1.7828555758869602e-05, - "loss": 1.232, + "epoch": 0.279222556802628, + "grad_norm": 0.389250409772092, + "learning_rate": 1.810922782811468e-05, + "loss": 1.3371, "step": 1020 }, { - "epoch": 0.2938646788990826, - "grad_norm": 3.522215138188648, - "learning_rate": 1.7797322001877625e-05, - "loss": 1.2004, + "epoch": 0.2805912948261703, + "grad_norm": 0.3891214912082097, + "learning_rate": 1.808117195435901e-05, + "loss": 1.3756, "step": 1025 }, { - "epoch": 0.2952981651376147, - "grad_norm": 3.9401114869933362, - "learning_rate": 1.7765892990100593e-05, - "loss": 1.1954, + "epoch": 0.28196003284971255, + "grad_norm": 0.39295322317665116, + "learning_rate": 1.805293153068076e-05, + "loss": 1.3525, "step": 1030 }, { - "epoch": 0.29673165137614677, - "grad_norm": 3.9452874322873424, - "learning_rate": 1.773426951056064e-05, - "loss": 1.1752, + "epoch": 0.28332877087325486, + "grad_norm": 0.38744159952014173, + "learning_rate": 1.802450720200718e-05, + "loss": 1.3508, "step": 1035 }, { - "epoch": 0.2981651376146789, - "grad_norm": 3.77900474922427, - "learning_rate": 1.7702452355149606e-05, - "loss": 1.2023, + "epoch": 0.2846975088967972, + "grad_norm": 0.3971694811346686, + "learning_rate": 1.7995899617465357e-05, + "loss": 1.3242, "step": 1040 }, { - "epoch": 0.299598623853211, - "grad_norm": 4.194810233115317, - "learning_rate": 1.7670442320609226e-05, - "loss": 1.1762, + "epoch": 0.28606624692033944, + "grad_norm": 0.3892979964335795, + "learning_rate": 1.7967109430367406e-05, + "loss": 1.2919, "step": 1045 }, { - "epoch": 0.30103211009174313, - "grad_norm": 3.6317516711250524, - "learning_rate": 1.7638240208511162e-05, - "loss": 1.2036, + "epoch": 0.28743498494388176, + "grad_norm": 0.40651116778504365, + "learning_rate": 1.793813729819553e-05, + "loss": 1.4047, "step": 1050 }, { - "epoch": 0.30246559633027525, - "grad_norm": 3.613998915069475, - "learning_rate": 1.760584682523696e-05, - "loss": 1.1794, + "epoch": 0.288803722967424, + "grad_norm": 0.39365601735181455, + "learning_rate": 1.7908983882587038e-05, + "loss": 1.3622, "step": 1055 }, { - "epoch": 0.3038990825688073, - "grad_norm": 3.168795755449482, - "learning_rate": 1.7573262981957814e-05, - "loss": 1.1812, + "epoch": 0.29017246099096633, + "grad_norm": 0.38324138023204, + "learning_rate": 1.787964984931919e-05, + "loss": 1.3773, "step": 1060 }, { - "epoch": 0.30533256880733944, - "grad_norm": 3.883990610964808, - "learning_rate": 1.7540489494614294e-05, - "loss": 1.2273, + "epoch": 0.29154119901450865, + "grad_norm": 0.3887104304366927, + "learning_rate": 1.7850135868294023e-05, + "loss": 1.3973, "step": 1065 }, { - "epoch": 0.30676605504587157, - "grad_norm": 4.068560981758666, - "learning_rate": 1.7507527183895893e-05, - "loss": 1.2297, + "epoch": 0.2929099370380509, + "grad_norm": 0.4087532681641359, + "learning_rate": 1.782044261352305e-05, + "loss": 1.3246, "step": 1070 }, { - "epoch": 0.3081995412844037, - "grad_norm": 3.812795569574441, - "learning_rate": 1.747437687522047e-05, - "loss": 1.2037, + "epoch": 0.2942786750615932, + "grad_norm": 0.39381567774906023, + "learning_rate": 1.7790570763111864e-05, + "loss": 1.3683, "step": 1075 }, { - "epoch": 0.30963302752293576, - "grad_norm": 3.7886870743963077, - "learning_rate": 1.744103939871361e-05, - "loss": 1.2235, + "epoch": 0.2956474130851355, + "grad_norm": 0.38720936830053604, + "learning_rate": 1.7760520999244638e-05, + "loss": 1.375, "step": 1080 }, { - "epoch": 0.3110665137614679, - "grad_norm": 3.3064001495116555, - "learning_rate": 1.7407515589187793e-05, - "loss": 1.213, + "epoch": 0.2970161511086778, + "grad_norm": 0.38699812271583683, + "learning_rate": 1.7730294008168578e-05, + "loss": 1.403, "step": 1085 }, { - "epoch": 0.3125, - "grad_norm": 3.999062074927365, - "learning_rate": 1.7373806286121532e-05, - "loss": 1.2586, + "epoch": 0.2983848891322201, + "grad_norm": 0.3938155567988771, + "learning_rate": 1.7699890480178216e-05, + "loss": 1.3567, "step": 1090 }, { - "epoch": 0.3139334862385321, - "grad_norm": 4.371331122546795, - "learning_rate": 1.7339912333638322e-05, - "loss": 1.1731, + "epoch": 0.29975362715576237, + "grad_norm": 0.4051470823072413, + "learning_rate": 1.766931110959967e-05, + "loss": 1.4228, "step": 1095 }, { - "epoch": 0.31536697247706424, - "grad_norm": 3.8167405663507, - "learning_rate": 1.730583458048552e-05, - "loss": 1.2098, + "epoch": 0.3011223651793047, + "grad_norm": 0.4013404711196811, + "learning_rate": 1.763855659477478e-05, + "loss": 1.3689, "step": 1100 }, { - "epoch": 0.3168004587155963, - "grad_norm": 3.398825843337014, - "learning_rate": 1.727157388001307e-05, - "loss": 1.187, + "epoch": 0.302491103202847, + "grad_norm": 0.4239789885244797, + "learning_rate": 1.7607627638045156e-05, + "loss": 1.3988, "step": 1105 }, { - "epoch": 0.31823394495412843, - "grad_norm": 3.323984794217734, - "learning_rate": 1.723713109015217e-05, - "loss": 1.229, + "epoch": 0.30385984122638926, + "grad_norm": 0.4005302271519552, + "learning_rate": 1.7576524945736137e-05, + "loss": 1.3368, "step": 1110 }, { - "epoch": 0.31966743119266056, - "grad_norm": 3.388696780319304, - "learning_rate": 1.720250707339374e-05, - "loss": 1.1616, + "epoch": 0.3052285792499316, + "grad_norm": 0.3900827670562639, + "learning_rate": 1.754524922814068e-05, + "loss": 1.3633, "step": 1115 }, { - "epoch": 0.3211009174311927, - "grad_norm": 3.7284978608483934, - "learning_rate": 1.7167702696766877e-05, - "loss": 1.173, + "epoch": 0.30659731727347384, + "grad_norm": 0.39004513777273, + "learning_rate": 1.751380119950311e-05, + "loss": 1.4024, "step": 1120 }, { - "epoch": 0.32253440366972475, - "grad_norm": 4.037999124478957, - "learning_rate": 1.7132718831817093e-05, - "loss": 1.1695, + "epoch": 0.30796605529701615, + "grad_norm": 0.3955440868581025, + "learning_rate": 1.7482181578002837e-05, + "loss": 1.3667, "step": 1125 }, { - "epoch": 0.32396788990825687, - "grad_norm": 3.2060752585491086, - "learning_rate": 1.7097556354584526e-05, - "loss": 1.1464, + "epoch": 0.30933479332055847, + "grad_norm": 0.41261816602720663, + "learning_rate": 1.745039108573793e-05, + "loss": 1.357, "step": 1130 }, { - "epoch": 0.325401376146789, - "grad_norm": 3.72775795176933, - "learning_rate": 1.7062216145581997e-05, - "loss": 1.1237, + "epoch": 0.3107035313441007, + "grad_norm": 0.37732770567847945, + "learning_rate": 1.7418430448708644e-05, + "loss": 1.3337, "step": 1135 }, { - "epoch": 0.3268348623853211, - "grad_norm": 3.783961044208216, - "learning_rate": 1.7026699089772937e-05, - "loss": 1.1899, + "epoch": 0.31207226936764304, + "grad_norm": 0.3764972884033891, + "learning_rate": 1.738630039680083e-05, + "loss": 1.3556, "step": 1140 }, { - "epoch": 0.32826834862385323, - "grad_norm": 3.5936905948051323, - "learning_rate": 1.699100607654926e-05, - "loss": 1.176, + "epoch": 0.3134410073911853, + "grad_norm": 0.40764805847336166, + "learning_rate": 1.7354001663769278e-05, + "loss": 1.3679, "step": 1145 }, { - "epoch": 0.3297018348623853, - "grad_norm": 3.5296084880895156, - "learning_rate": 1.6955137999709075e-05, - "loss": 1.1445, + "epoch": 0.3148097454147276, + "grad_norm": 0.3794101029629702, + "learning_rate": 1.7321534987220942e-05, + "loss": 1.3176, "step": 1150 }, { - "epoch": 0.3311353211009174, - "grad_norm": 3.278850491610551, - "learning_rate": 1.6919095757434288e-05, - "loss": 1.2269, + "epoch": 0.31617848343826993, + "grad_norm": 0.40248164169870704, + "learning_rate": 1.728890110859812e-05, + "loss": 1.3458, "step": 1155 }, { - "epoch": 0.33256880733944955, - "grad_norm": 3.6404322855144247, - "learning_rate": 1.6882880252268156e-05, - "loss": 1.1836, + "epoch": 0.3175472214618122, + "grad_norm": 0.39012890957455026, + "learning_rate": 1.7256100773161492e-05, + "loss": 1.296, "step": 1160 }, { - "epoch": 0.33400229357798167, - "grad_norm": 3.437933045156408, - "learning_rate": 1.6846492391092625e-05, - "loss": 1.2295, + "epoch": 0.3189159594853545, + "grad_norm": 0.39856381680118136, + "learning_rate": 1.7223134729973134e-05, + "loss": 1.3614, "step": 1165 }, { - "epoch": 0.33543577981651373, - "grad_norm": 3.5611595737623167, - "learning_rate": 1.680993308510568e-05, - "loss": 1.1862, + "epoch": 0.3202846975088968, + "grad_norm": 0.40493526767721577, + "learning_rate": 1.7190003731879375e-05, + "loss": 1.3533, "step": 1170 }, { - "epoch": 0.33686926605504586, - "grad_norm": 4.312054532006292, - "learning_rate": 1.6773203249798482e-05, - "loss": 1.1602, + "epoch": 0.3216534355324391, + "grad_norm": 0.38868285184435786, + "learning_rate": 1.715670853549364e-05, + "loss": 1.3377, "step": 1175 }, { - "epoch": 0.338302752293578, - "grad_norm": 3.598611274354031, - "learning_rate": 1.6736303804932475e-05, - "loss": 1.1825, + "epoch": 0.3230221735559814, + "grad_norm": 0.3840977803535061, + "learning_rate": 1.7123249901179142e-05, + "loss": 1.3753, "step": 1180 }, { - "epoch": 0.3397362385321101, - "grad_norm": 3.779933724951351, - "learning_rate": 1.6699235674516334e-05, - "loss": 1.1847, + "epoch": 0.32439091157952366, + "grad_norm": 0.37510550805682613, + "learning_rate": 1.708962859303154e-05, + "loss": 1.3557, "step": 1185 }, { - "epoch": 0.3411697247706422, - "grad_norm": 3.590038112136874, - "learning_rate": 1.666199978678283e-05, - "loss": 1.1984, + "epoch": 0.32575964960306597, + "grad_norm": 0.39070855119805253, + "learning_rate": 1.7055845378861476e-05, + "loss": 1.3584, "step": 1190 }, { - "epoch": 0.3426032110091743, - "grad_norm": 4.110223116861788, - "learning_rate": 1.6624597074165597e-05, - "loss": 1.2468, + "epoch": 0.3271283876266083, + "grad_norm": 0.39021160718672115, + "learning_rate": 1.7021901030177036e-05, + "loss": 1.3399, "step": 1195 }, { - "epoch": 0.3440366972477064, - "grad_norm": 3.603557931052651, - "learning_rate": 1.6587028473275772e-05, - "loss": 1.1962, + "epoch": 0.32849712565015055, + "grad_norm": 0.3983342506003189, + "learning_rate": 1.698779632216615e-05, + "loss": 1.3965, "step": 1200 }, { - "epoch": 0.34547018348623854, - "grad_norm": 3.209591671498054, - "learning_rate": 1.6549294924878532e-05, - "loss": 1.1714, + "epoch": 0.32986586367369286, + "grad_norm": 0.38690562360273995, + "learning_rate": 1.6953532033678874e-05, + "loss": 1.4221, "step": 1205 }, { - "epoch": 0.34690366972477066, - "grad_norm": 3.6243010245754337, - "learning_rate": 1.651139737386957e-05, - "loss": 1.1733, + "epoch": 0.3312346016972352, + "grad_norm": 0.39029540911089444, + "learning_rate": 1.69191089472096e-05, + "loss": 1.3201, "step": 1210 }, { - "epoch": 0.3483371559633027, - "grad_norm": 3.9078886802263697, - "learning_rate": 1.6473336769251388e-05, - "loss": 1.1715, + "epoch": 0.33260333972077744, + "grad_norm": 0.40512131600219015, + "learning_rate": 1.688452784887921e-05, + "loss": 1.3495, "step": 1215 }, { - "epoch": 0.34977064220183485, - "grad_norm": 3.6344742284976306, - "learning_rate": 1.6435114064109575e-05, - "loss": 1.1244, + "epoch": 0.33397207774431975, + "grad_norm": 0.3804586676740124, + "learning_rate": 1.684978952841709e-05, + "loss": 1.3471, "step": 1220 }, { - "epoch": 0.35120412844036697, - "grad_norm": 3.6952536271050964, - "learning_rate": 1.6396730215588913e-05, - "loss": 1.2089, + "epoch": 0.335340815767862, + "grad_norm": 0.38428741373037245, + "learning_rate": 1.681489477914312e-05, + "loss": 1.3196, "step": 1225 }, { - "epoch": 0.3526376146788991, - "grad_norm": 3.429886933846167, - "learning_rate": 1.6358186184869417e-05, - "loss": 1.1892, + "epoch": 0.3367095537914043, + "grad_norm": 0.384047691017285, + "learning_rate": 1.677984439794954e-05, + "loss": 1.3303, "step": 1230 }, { - "epoch": 0.3540711009174312, - "grad_norm": 3.093455767598654, - "learning_rate": 1.631948293714227e-05, - "loss": 1.2106, + "epoch": 0.33807829181494664, + "grad_norm": 0.3839900646299871, + "learning_rate": 1.6744639185282784e-05, + "loss": 1.2792, "step": 1235 }, { - "epoch": 0.3555045871559633, - "grad_norm": 4.038152481713509, - "learning_rate": 1.6280621441585647e-05, - "loss": 1.1697, + "epoch": 0.3394470298384889, + "grad_norm": 0.3884042208952324, + "learning_rate": 1.670927994512514e-05, + "loss": 1.3223, "step": 1240 }, { - "epoch": 0.3569380733944954, - "grad_norm": 3.6999557746923095, - "learning_rate": 1.6241602671340448e-05, - "loss": 1.2376, + "epoch": 0.3408157678620312, + "grad_norm": 0.3911385295369844, + "learning_rate": 1.667376748497646e-05, + "loss": 1.3546, "step": 1245 }, { - "epoch": 0.3583715596330275, - "grad_norm": 3.6041781324392868, - "learning_rate": 1.6202427603485933e-05, - "loss": 1.1327, + "epoch": 0.3421845058855735, + "grad_norm": 0.3807410219808285, + "learning_rate": 1.6638102615835658e-05, + "loss": 1.3148, "step": 1250 }, { - "epoch": 0.35980504587155965, - "grad_norm": 3.4145472213007473, - "learning_rate": 1.6163097219015245e-05, - "loss": 1.1372, + "epoch": 0.3435532439091158, + "grad_norm": 0.39423887853753, + "learning_rate": 1.6602286152182236e-05, + "loss": 1.361, "step": 1255 }, { - "epoch": 0.36123853211009177, - "grad_norm": 3.429021637188865, - "learning_rate": 1.6123612502810865e-05, - "loss": 1.1646, + "epoch": 0.3449219819326581, + "grad_norm": 0.39187255483162625, + "learning_rate": 1.6566318911957647e-05, + "loss": 1.339, "step": 1260 }, { - "epoch": 0.36267201834862384, - "grad_norm": 3.6122171856293708, - "learning_rate": 1.6083974443619922e-05, - "loss": 1.2031, + "epoch": 0.34629071995620037, + "grad_norm": 0.38859511955741677, + "learning_rate": 1.6530201716546647e-05, + "loss": 1.3556, "step": 1265 }, { - "epoch": 0.36410550458715596, - "grad_norm": 3.2252592769247617, - "learning_rate": 1.6044184034029445e-05, - "loss": 1.1669, + "epoch": 0.3476594579797427, + "grad_norm": 0.3993643770992966, + "learning_rate": 1.649393539075851e-05, + "loss": 1.3476, "step": 1270 }, { - "epoch": 0.3655389908256881, - "grad_norm": 3.513524941132868, - "learning_rate": 1.6004242270441523e-05, - "loss": 1.1871, + "epoch": 0.349028196003285, + "grad_norm": 0.39035176617866785, + "learning_rate": 1.6457520762808217e-05, + "loss": 1.3228, "step": 1275 }, { - "epoch": 0.3669724770642202, - "grad_norm": 3.5159805187675115, - "learning_rate": 1.596415015304833e-05, - "loss": 1.1851, + "epoch": 0.35039693402682726, + "grad_norm": 0.3830707286890397, + "learning_rate": 1.6420958664297514e-05, + "loss": 1.3094, "step": 1280 }, { - "epoch": 0.36840596330275227, - "grad_norm": 3.3054440445349837, - "learning_rate": 1.5923908685807087e-05, - "loss": 1.1074, + "epoch": 0.35176567205036957, + "grad_norm": 0.36321444690348953, + "learning_rate": 1.638424993019595e-05, + "loss": 1.3853, "step": 1285 }, { - "epoch": 0.3698394495412844, - "grad_norm": 3.6946632308744136, - "learning_rate": 1.588351887641494e-05, - "loss": 1.1596, + "epoch": 0.35313441007391183, + "grad_norm": 0.371835147945527, + "learning_rate": 1.634739539882178e-05, + "loss": 1.3477, "step": 1290 }, { - "epoch": 0.3712729357798165, - "grad_norm": 3.3924561467755785, - "learning_rate": 1.5842981736283686e-05, - "loss": 1.2262, + "epoch": 0.35450314809745415, + "grad_norm": 0.3906744323143432, + "learning_rate": 1.6310395911822848e-05, + "loss": 1.3149, "step": 1295 }, { - "epoch": 0.37270642201834864, - "grad_norm": 3.623083608828368, - "learning_rate": 1.5802298280514487e-05, - "loss": 1.1823, + "epoch": 0.35587188612099646, + "grad_norm": 0.37408867437468385, + "learning_rate": 1.6273252314157352e-05, + "loss": 1.3135, "step": 1300 }, { - "epoch": 0.37413990825688076, - "grad_norm": 3.0148932024272437, - "learning_rate": 1.5761469527872427e-05, - "loss": 1.1513, + "epoch": 0.3572406241445387, + "grad_norm": 0.37403440281560435, + "learning_rate": 1.6235965454074535e-05, + "loss": 1.2301, "step": 1305 }, { - "epoch": 0.3755733944954128, - "grad_norm": 10.198288104380973, - "learning_rate": 1.572049650076101e-05, - "loss": 1.1498, + "epoch": 0.35860936216808104, + "grad_norm": 0.3867708656040568, + "learning_rate": 1.619853618309535e-05, + "loss": 1.3716, "step": 1310 }, { - "epoch": 0.37700688073394495, - "grad_norm": 3.623884808969302, - "learning_rate": 1.5679380225196546e-05, - "loss": 1.1857, + "epoch": 0.3599781001916233, + "grad_norm": 0.3885328796144267, + "learning_rate": 1.6160965355992966e-05, + "loss": 1.366, "step": 1315 }, { - "epoch": 0.37844036697247707, - "grad_norm": 21.814389727799938, - "learning_rate": 1.5638121730782486e-05, - "loss": 1.1748, + "epoch": 0.3613468382151656, + "grad_norm": 0.403719948973195, + "learning_rate": 1.6123253830773293e-05, + "loss": 1.3661, "step": 1320 }, { - "epoch": 0.3798738532110092, - "grad_norm": 3.7586731538464684, - "learning_rate": 1.5596722050683598e-05, - "loss": 1.2024, + "epoch": 0.3627155762387079, + "grad_norm": 0.3827196192323514, + "learning_rate": 1.6085402468655356e-05, + "loss": 1.3567, "step": 1325 }, { - "epoch": 0.38130733944954126, - "grad_norm": 3.914930197959616, - "learning_rate": 1.555518222160013e-05, - "loss": 1.1873, + "epoch": 0.3640843142622502, + "grad_norm": 0.3968500270407259, + "learning_rate": 1.6047412134051645e-05, + "loss": 1.4044, "step": 1330 }, { - "epoch": 0.3827408256880734, - "grad_norm": 3.645421580899915, - "learning_rate": 1.551350328374184e-05, - "loss": 1.1252, + "epoch": 0.3654530522857925, + "grad_norm": 0.38847711656237954, + "learning_rate": 1.6009283694548365e-05, + "loss": 1.3591, "step": 1335 }, { - "epoch": 0.3841743119266055, - "grad_norm": 3.1973889011870376, - "learning_rate": 1.5471686280801933e-05, - "loss": 1.1839, + "epoch": 0.3668217903093348, + "grad_norm": 0.4036707276972636, + "learning_rate": 1.5971018020885623e-05, + "loss": 1.3916, "step": 1340 }, { - "epoch": 0.3856077981651376, - "grad_norm": 3.5501087765790915, - "learning_rate": 1.5429732259930955e-05, - "loss": 1.1232, + "epoch": 0.3681905283328771, + "grad_norm": 0.39526777473833846, + "learning_rate": 1.593261598693755e-05, + "loss": 1.3478, "step": 1345 }, { - "epoch": 0.38704128440366975, - "grad_norm": 3.3946063622500016, - "learning_rate": 1.538764227171054e-05, - "loss": 1.1768, + "epoch": 0.3695592663564194, + "grad_norm": 0.3837685918142903, + "learning_rate": 1.5894078469692343e-05, + "loss": 1.3604, "step": 1350 }, { - "epoch": 0.3884747706422018, - "grad_norm": 3.3089407247339975, - "learning_rate": 1.5345417370127123e-05, - "loss": 1.1697, + "epoch": 0.37092800437996165, + "grad_norm": 0.4184389090754904, + "learning_rate": 1.585540634923223e-05, + "loss": 1.4042, "step": 1355 }, { - "epoch": 0.38990825688073394, - "grad_norm": 4.435494305269373, - "learning_rate": 1.5303058612545534e-05, - "loss": 1.1301, + "epoch": 0.37229674240350397, + "grad_norm": 0.4132734637825316, + "learning_rate": 1.5816600508713372e-05, + "loss": 1.3901, "step": 1360 }, { - "epoch": 0.39134174311926606, - "grad_norm": 3.1427532195841343, - "learning_rate": 1.5260567059682535e-05, - "loss": 1.1857, + "epoch": 0.3736654804270463, + "grad_norm": 0.3876929955451876, + "learning_rate": 1.5777661834345708e-05, + "loss": 1.3296, "step": 1365 }, { - "epoch": 0.3927752293577982, - "grad_norm": 3.5070367063445684, - "learning_rate": 1.521794377558024e-05, - "loss": 1.1294, + "epoch": 0.37503421845058854, + "grad_norm": 0.39294627819783745, + "learning_rate": 1.57385912153727e-05, + "loss": 1.3581, "step": 1370 }, { - "epoch": 0.39420871559633025, - "grad_norm": 3.6077040833104896, - "learning_rate": 1.5175189827579489e-05, - "loss": 1.1469, + "epoch": 0.37640295647413086, + "grad_norm": 0.3875865415567979, + "learning_rate": 1.5699389544051028e-05, + "loss": 1.3167, "step": 1375 }, { - "epoch": 0.3956422018348624, - "grad_norm": 3.401651617393664, - "learning_rate": 1.5132306286293096e-05, - "loss": 1.1383, + "epoch": 0.37777169449767317, + "grad_norm": 0.3863898154888909, + "learning_rate": 1.566005771563023e-05, + "loss": 1.3694, "step": 1380 }, { - "epoch": 0.3970756880733945, - "grad_norm": 3.3826428424139925, - "learning_rate": 1.5089294225579077e-05, - "loss": 1.1887, + "epoch": 0.37914043252121543, + "grad_norm": 0.38577432820095847, + "learning_rate": 1.5620596628332242e-05, + "loss": 1.323, "step": 1385 }, { - "epoch": 0.3985091743119266, - "grad_norm": 3.2830968866458345, - "learning_rate": 1.5046154722513718e-05, - "loss": 1.15, + "epoch": 0.38050917054475775, + "grad_norm": 0.3971392209603939, + "learning_rate": 1.5581007183330877e-05, + "loss": 1.3432, "step": 1390 }, { - "epoch": 0.39994266055045874, - "grad_norm": 3.2832815737637113, - "learning_rate": 1.5002888857364624e-05, - "loss": 1.1811, + "epoch": 0.3818779085683, + "grad_norm": 0.40311344444707337, + "learning_rate": 1.554129028473127e-05, + "loss": 1.3802, "step": 1395 }, { - "epoch": 0.4013761467889908, - "grad_norm": 3.222829419424945, - "learning_rate": 1.4959497713563677e-05, - "loss": 1.1696, + "epoch": 0.3832466465918423, + "grad_norm": 0.40527858159368374, + "learning_rate": 1.5501446839549207e-05, + "loss": 1.3445, "step": 1400 }, { - "epoch": 0.40280963302752293, - "grad_norm": 3.739181624579295, - "learning_rate": 1.4915982377679885e-05, - "loss": 1.1978, + "epoch": 0.38461538461538464, + "grad_norm": 0.4025758901452214, + "learning_rate": 1.5461477757690424e-05, + "loss": 1.3321, "step": 1405 }, { - "epoch": 0.40424311926605505, - "grad_norm": 3.383948957829039, - "learning_rate": 1.4872343939392189e-05, - "loss": 1.1724, + "epoch": 0.3859841226389269, + "grad_norm": 0.3855393398790419, + "learning_rate": 1.542138395192983e-05, + "loss": 1.3405, "step": 1410 }, { - "epoch": 0.4056766055045872, - "grad_norm": 3.7382791698251427, - "learning_rate": 1.482858349146216e-05, - "loss": 1.1471, + "epoch": 0.3873528606624692, + "grad_norm": 0.38944762738859545, + "learning_rate": 1.538116633789065e-05, + "loss": 1.3289, "step": 1415 }, { - "epoch": 0.40711009174311924, - "grad_norm": 3.510878161789182, - "learning_rate": 1.4784702129706655e-05, - "loss": 1.1337, + "epoch": 0.38872159868601147, + "grad_norm": 0.4028684971128393, + "learning_rate": 1.5340825834023526e-05, + "loss": 1.3798, "step": 1420 }, { - "epoch": 0.40854357798165136, - "grad_norm": 3.505598802015759, - "learning_rate": 1.474070095297036e-05, - "loss": 1.187, + "epoch": 0.3900903367095538, + "grad_norm": 0.37654892707807947, + "learning_rate": 1.530036336158553e-05, + "loss": 1.3432, "step": 1425 }, { - "epoch": 0.4099770642201835, - "grad_norm": 3.2069715268799204, - "learning_rate": 1.469658106309828e-05, - "loss": 1.1136, + "epoch": 0.3914590747330961, + "grad_norm": 0.3986727790830818, + "learning_rate": 1.5259779844619152e-05, + "loss": 1.3422, "step": 1430 }, { - "epoch": 0.4114105504587156, - "grad_norm": 4.1812266615604345, - "learning_rate": 1.465234356490815e-05, - "loss": 1.1718, + "epoch": 0.39282781275663836, + "grad_norm": 0.4029735444716236, + "learning_rate": 1.5219076209931159e-05, + "loss": 1.3136, "step": 1435 }, { - "epoch": 0.41284403669724773, - "grad_norm": 3.3985858466934338, - "learning_rate": 1.4607989566162761e-05, - "loss": 1.1414, + "epoch": 0.3941965507801807, + "grad_norm": 0.41012078556001347, + "learning_rate": 1.5178253387071458e-05, + "loss": 1.4002, "step": 1440 }, { - "epoch": 0.4142775229357798, - "grad_norm": 3.6381472550431093, - "learning_rate": 1.4563520177542226e-05, - "loss": 1.2166, + "epoch": 0.395565288803723, + "grad_norm": 0.40771281882367433, + "learning_rate": 1.5137312308311857e-05, + "loss": 1.3684, "step": 1445 }, { - "epoch": 0.4157110091743119, - "grad_norm": 3.268767608874384, - "learning_rate": 1.451893651261617e-05, - "loss": 1.1695, + "epoch": 0.39693402682726525, + "grad_norm": 0.38445172076552614, + "learning_rate": 1.5096253908624778e-05, + "loss": 1.3137, "step": 1450 }, { - "epoch": 0.41714449541284404, - "grad_norm": 4.519674348990799, - "learning_rate": 1.4474239687815838e-05, - "loss": 1.189, + "epoch": 0.39830276485080757, + "grad_norm": 0.3869608798791883, + "learning_rate": 1.5055079125661908e-05, + "loss": 1.2812, "step": 1455 }, { - "epoch": 0.41857798165137616, - "grad_norm": 3.594106437257434, - "learning_rate": 1.4429430822406138e-05, - "loss": 1.1956, + "epoch": 0.3996715028743498, + "grad_norm": 0.3987871366347409, + "learning_rate": 1.5013788899732775e-05, + "loss": 1.3394, "step": 1460 }, { - "epoch": 0.42001146788990823, - "grad_norm": 3.838114634468203, - "learning_rate": 1.4384511038457624e-05, - "loss": 1.1348, + "epoch": 0.40104024089789214, + "grad_norm": 0.3970490183809672, + "learning_rate": 1.4972384173783284e-05, + "loss": 1.3544, "step": 1465 }, { - "epoch": 0.42144495412844035, - "grad_norm": 3.3721934098468083, - "learning_rate": 1.4339481460818385e-05, - "loss": 1.135, + "epoch": 0.40240897892143446, + "grad_norm": 0.3925074770480503, + "learning_rate": 1.493086589337418e-05, + "loss": 1.3294, "step": 1470 }, { - "epoch": 0.4228784403669725, - "grad_norm": 3.3513743473211512, - "learning_rate": 1.429434321708588e-05, - "loss": 1.1443, + "epoch": 0.4037777169449767, + "grad_norm": 0.40973551864547075, + "learning_rate": 1.4889235006659448e-05, + "loss": 1.3675, "step": 1475 }, { - "epoch": 0.4243119266055046, - "grad_norm": 3.3279635877417304, - "learning_rate": 1.4249097437578712e-05, - "loss": 1.1574, + "epoch": 0.40514645496851903, + "grad_norm": 0.3812235133361238, + "learning_rate": 1.484749246436468e-05, + "loss": 1.3288, "step": 1480 }, { - "epoch": 0.4257454128440367, - "grad_norm": 3.5340814143703896, - "learning_rate": 1.4203745255308306e-05, - "loss": 1.1607, + "epoch": 0.40651519299206135, + "grad_norm": 0.3827389411340829, + "learning_rate": 1.4805639219765337e-05, + "loss": 1.4128, "step": 1485 }, { - "epoch": 0.4271788990825688, - "grad_norm": 3.3296679892148995, - "learning_rate": 1.4158287805950557e-05, - "loss": 1.199, + "epoch": 0.4078839310156036, + "grad_norm": 0.37343264715636293, + "learning_rate": 1.476367622866499e-05, + "loss": 1.3419, "step": 1490 }, { - "epoch": 0.4286123853211009, - "grad_norm": 3.401879333323161, - "learning_rate": 1.411272622781737e-05, - "loss": 1.1364, + "epoch": 0.4092526690391459, + "grad_norm": 0.4116326320287375, + "learning_rate": 1.4721604449373505e-05, + "loss": 1.297, "step": 1495 }, { - "epoch": 0.43004587155963303, - "grad_norm": 3.131916232702695, - "learning_rate": 1.4067061661828176e-05, - "loss": 1.1472, + "epoch": 0.4106214070626882, + "grad_norm": 0.38955441577160277, + "learning_rate": 1.4679424842685137e-05, + "loss": 1.3138, "step": 1500 }, { - "epoch": 0.43147935779816515, - "grad_norm": 3.543349954734912, - "learning_rate": 1.4021295251481347e-05, - "loss": 1.1813, + "epoch": 0.4119901450862305, + "grad_norm": 0.38444568811838686, + "learning_rate": 1.4637138371856601e-05, + "loss": 1.3284, "step": 1505 }, { - "epoch": 0.4329128440366973, - "grad_norm": 3.309506087624542, - "learning_rate": 1.3975428142825562e-05, - "loss": 1.1581, + "epoch": 0.4133588831097728, + "grad_norm": 0.3938144538178601, + "learning_rate": 1.4594746002585072e-05, + "loss": 1.3498, "step": 1510 }, { - "epoch": 0.43434633027522934, - "grad_norm": 3.9281940551258097, - "learning_rate": 1.392946148443112e-05, - "loss": 1.1808, + "epoch": 0.41472762113331507, + "grad_norm": 0.39696555787686244, + "learning_rate": 1.4552248702986127e-05, + "loss": 1.3524, "step": 1515 }, { - "epoch": 0.43577981651376146, - "grad_norm": 3.8214812468244346, - "learning_rate": 1.3883396427361169e-05, - "loss": 1.1616, + "epoch": 0.4160963591568574, + "grad_norm": 0.3802714110333375, + "learning_rate": 1.4509647443571643e-05, + "loss": 1.288, "step": 1520 }, { - "epoch": 0.4372133027522936, - "grad_norm": 3.997270424750147, - "learning_rate": 1.383723412514288e-05, - "loss": 1.1113, + "epoch": 0.41746509718039965, + "grad_norm": 0.38532238774246486, + "learning_rate": 1.446694319722763e-05, + "loss": 1.3656, "step": 1525 }, { - "epoch": 0.4386467889908257, - "grad_norm": 3.437008178425516, - "learning_rate": 1.3790975733738576e-05, - "loss": 1.1713, + "epoch": 0.41883383520394196, + "grad_norm": 0.3866193343216763, + "learning_rate": 1.4424136939192009e-05, + "loss": 1.3696, "step": 1530 }, { - "epoch": 0.4400802752293578, - "grad_norm": 3.3811402901395984, - "learning_rate": 1.3744622411516758e-05, - "loss": 1.1347, + "epoch": 0.4202025732274843, + "grad_norm": 0.37501111401637777, + "learning_rate": 1.4381229647032346e-05, + "loss": 1.3512, "step": 1535 }, { - "epoch": 0.4415137614678899, - "grad_norm": 3.3962664452333993, - "learning_rate": 1.3698175319223133e-05, - "loss": 1.1347, + "epoch": 0.42157131125102654, + "grad_norm": 0.3700012749063157, + "learning_rate": 1.4338222300623533e-05, + "loss": 1.3092, "step": 1540 }, { - "epoch": 0.442947247706422, - "grad_norm": 3.6162635705687127, - "learning_rate": 1.3651635619951509e-05, - "loss": 1.1574, + "epoch": 0.42294004927456885, + "grad_norm": 0.38806617391675347, + "learning_rate": 1.4295115882125393e-05, + "loss": 1.3471, "step": 1545 }, { - "epoch": 0.44438073394495414, - "grad_norm": 3.502705575568886, - "learning_rate": 1.360500447911471e-05, - "loss": 1.0691, + "epoch": 0.42430878729811117, + "grad_norm": 0.39116631927262135, + "learning_rate": 1.4251911375960261e-05, + "loss": 1.4043, "step": 1550 }, { - "epoch": 0.44581422018348627, - "grad_norm": 3.473102068128584, - "learning_rate": 1.3558283064415357e-05, - "loss": 1.1018, + "epoch": 0.4256775253216534, + "grad_norm": 0.3907489868883888, + "learning_rate": 1.4208609768790513e-05, + "loss": 1.3476, "step": 1555 }, { - "epoch": 0.44724770642201833, - "grad_norm": 3.5587976176438656, - "learning_rate": 1.3511472545816648e-05, - "loss": 1.1293, + "epoch": 0.42704626334519574, + "grad_norm": 0.40188011896979126, + "learning_rate": 1.4165212049496013e-05, + "loss": 1.3398, "step": 1560 }, { - "epoch": 0.44868119266055045, - "grad_norm": 3.613085860404909, - "learning_rate": 1.3464574095513058e-05, - "loss": 1.1685, + "epoch": 0.428415001368738, + "grad_norm": 0.39583117671628015, + "learning_rate": 1.4121719209151545e-05, + "loss": 1.3827, "step": 1565 }, { - "epoch": 0.4501146788990826, - "grad_norm": 4.06622018524846, - "learning_rate": 1.3417588887900978e-05, - "loss": 1.1067, + "epoch": 0.4297837393922803, + "grad_norm": 0.3948036088686505, + "learning_rate": 1.4078132241004174e-05, + "loss": 1.3509, "step": 1570 }, { - "epoch": 0.4515481651376147, - "grad_norm": 3.4649254770088698, - "learning_rate": 1.3370518099549315e-05, - "loss": 1.1026, + "epoch": 0.43115247741582263, + "grad_norm": 0.3844018510427473, + "learning_rate": 1.4034452140450561e-05, + "loss": 1.3619, "step": 1575 }, { - "epoch": 0.45298165137614677, - "grad_norm": 2.952840725725241, - "learning_rate": 1.3323362909170018e-05, - "loss": 1.1283, + "epoch": 0.4325212154393649, + "grad_norm": 0.40654356409937986, + "learning_rate": 1.3990679905014235e-05, + "loss": 1.2983, "step": 1580 }, { - "epoch": 0.4544151376146789, - "grad_norm": 3.253136251920175, - "learning_rate": 1.3276124497588585e-05, - "loss": 1.2414, + "epoch": 0.4338899534629072, + "grad_norm": 0.401731234750035, + "learning_rate": 1.3946816534322815e-05, + "loss": 1.3573, "step": 1585 }, { - "epoch": 0.455848623853211, - "grad_norm": 3.162846986211554, - "learning_rate": 1.3228804047714462e-05, - "loss": 1.1169, + "epoch": 0.43525869148644947, + "grad_norm": 0.40604771069832035, + "learning_rate": 1.3902863030085176e-05, + "loss": 1.333, "step": 1590 }, { - "epoch": 0.45728211009174313, - "grad_norm": 3.3272179350821505, - "learning_rate": 1.3181402744511446e-05, - "loss": 1.1756, + "epoch": 0.4366274295099918, + "grad_norm": 0.3821888564061559, + "learning_rate": 1.3858820396068572e-05, + "loss": 1.3062, "step": 1595 }, { - "epoch": 0.45871559633027525, - "grad_norm": 3.1731644889698183, - "learning_rate": 1.3133921774968001e-05, - "loss": 1.1305, + "epoch": 0.4379961675335341, + "grad_norm": 0.3992586017031061, + "learning_rate": 1.3814689638075725e-05, + "loss": 1.3671, "step": 1600 }, { - "epoch": 0.4601490825688073, - "grad_norm": 3.1249681327250904, - "learning_rate": 1.3086362328067536e-05, - "loss": 1.1603, + "epoch": 0.43936490555707636, + "grad_norm": 0.4018110425948897, + "learning_rate": 1.3770471763921833e-05, + "loss": 1.3709, "step": 1605 }, { - "epoch": 0.46158256880733944, - "grad_norm": 3.376924777276381, - "learning_rate": 1.3038725594758632e-05, - "loss": 1.118, + "epoch": 0.4407336435806187, + "grad_norm": 0.387994622633552, + "learning_rate": 1.372616778341158e-05, + "loss": 1.3138, "step": 1610 }, { - "epoch": 0.46301605504587157, - "grad_norm": 3.193633263846461, - "learning_rate": 1.2991012767925224e-05, - "loss": 1.1337, + "epoch": 0.442102381604161, + "grad_norm": 0.3974604574879118, + "learning_rate": 1.3681778708316054e-05, + "loss": 1.4087, "step": 1615 }, { - "epoch": 0.4644495412844037, - "grad_norm": 3.0745547065753915, - "learning_rate": 1.2943225042356714e-05, - "loss": 1.1252, + "epoch": 0.44347111962770325, + "grad_norm": 0.39844593969419934, + "learning_rate": 1.3637305552349656e-05, + "loss": 1.3599, "step": 1620 }, { - "epoch": 0.46588302752293576, - "grad_norm": 7.556185912909887, - "learning_rate": 1.2895363614718082e-05, - "loss": 1.1517, + "epoch": 0.44483985765124556, + "grad_norm": 0.3857194594528982, + "learning_rate": 1.3592749331146941e-05, + "loss": 1.2897, "step": 1625 }, { - "epoch": 0.4673165137614679, - "grad_norm": 3.3618721338615787, - "learning_rate": 1.2847429683519879e-05, - "loss": 1.1321, + "epoch": 0.4462085956747878, + "grad_norm": 0.3750305160936592, + "learning_rate": 1.3548111062239432e-05, + "loss": 1.2819, "step": 1630 }, { - "epoch": 0.46875, - "grad_norm": 3.1654091942580904, - "learning_rate": 1.2799424449088246e-05, - "loss": 1.1475, + "epoch": 0.44757733369833014, + "grad_norm": 0.4007647993356723, + "learning_rate": 1.350339176503237e-05, + "loss": 1.3194, "step": 1635 }, { - "epoch": 0.4701834862385321, - "grad_norm": 3.3736619152907386, - "learning_rate": 1.2751349113534856e-05, - "loss": 1.1297, + "epoch": 0.44894607172187245, + "grad_norm": 0.4013887994126227, + "learning_rate": 1.3458592460781446e-05, + "loss": 1.3199, "step": 1640 }, { - "epoch": 0.47161697247706424, - "grad_norm": 3.711148861718645, - "learning_rate": 1.2703204880726788e-05, - "loss": 1.1654, + "epoch": 0.4503148097454147, + "grad_norm": 0.3981164766269796, + "learning_rate": 1.341371417256947e-05, + "loss": 1.3464, "step": 1645 }, { - "epoch": 0.4730504587155963, - "grad_norm": 3.0399296608693946, - "learning_rate": 1.2654992956256397e-05, - "loss": 1.1072, + "epoch": 0.451683547768957, + "grad_norm": 0.39477092552678616, + "learning_rate": 1.3368757925283015e-05, + "loss": 1.3529, "step": 1650 }, { - "epoch": 0.47448394495412843, - "grad_norm": 3.299482801946242, - "learning_rate": 1.2606714547411138e-05, - "loss": 1.1338, + "epoch": 0.45305228579249934, + "grad_norm": 0.3896314211042654, + "learning_rate": 1.3323724745589007e-05, + "loss": 1.3881, "step": 1655 }, { - "epoch": 0.47591743119266056, - "grad_norm": 4.109998693762478, - "learning_rate": 1.2558370863143298e-05, - "loss": 1.1517, + "epoch": 0.4544210238160416, + "grad_norm": 0.3954835469300208, + "learning_rate": 1.3278615661911274e-05, + "loss": 1.3124, "step": 1660 }, { - "epoch": 0.4773509174311927, - "grad_norm": 3.6028647178766167, - "learning_rate": 1.250996311403976e-05, - "loss": 1.2193, + "epoch": 0.4557897618395839, + "grad_norm": 0.3708379387210574, + "learning_rate": 1.3233431704407072e-05, + "loss": 1.2866, "step": 1665 }, { - "epoch": 0.47878440366972475, - "grad_norm": 3.088463522622337, - "learning_rate": 1.246149251229166e-05, - "loss": 1.1481, + "epoch": 0.4571584998631262, + "grad_norm": 0.38510327105983166, + "learning_rate": 1.318817390494355e-05, + "loss": 1.2968, "step": 1670 }, { - "epoch": 0.48021788990825687, - "grad_norm": 3.1966754732870033, - "learning_rate": 1.2412960271664046e-05, - "loss": 1.0923, + "epoch": 0.4585272378866685, + "grad_norm": 0.38362958034859684, + "learning_rate": 1.3142843297074182e-05, + "loss": 1.3155, "step": 1675 }, { - "epoch": 0.481651376146789, - "grad_norm": 3.5520419027791044, - "learning_rate": 1.2364367607465483e-05, - "loss": 1.131, + "epoch": 0.4598959759102108, + "grad_norm": 0.4082500339960413, + "learning_rate": 1.3097440916015179e-05, + "loss": 1.3646, "step": 1680 }, { - "epoch": 0.4830848623853211, - "grad_norm": 3.1911366000417103, - "learning_rate": 1.2315715736517624e-05, - "loss": 1.1524, + "epoch": 0.46126471393375307, + "grad_norm": 0.3927808793669975, + "learning_rate": 1.3051967798621834e-05, + "loss": 1.3165, "step": 1685 }, { - "epoch": 0.48451834862385323, - "grad_norm": 3.1232221206590847, - "learning_rate": 1.2267005877124721e-05, - "loss": 1.1336, + "epoch": 0.4626334519572954, + "grad_norm": 0.4002328011893471, + "learning_rate": 1.300642498336484e-05, + "loss": 1.3287, "step": 1690 }, { - "epoch": 0.4859518348623853, - "grad_norm": 3.1919229672837606, - "learning_rate": 1.2218239249043143e-05, - "loss": 1.1163, + "epoch": 0.46400218998083764, + "grad_norm": 0.38096569725907414, + "learning_rate": 1.2960813510306599e-05, + "loss": 1.319, "step": 1695 }, { - "epoch": 0.4873853211009174, - "grad_norm": 3.6466393865332223, - "learning_rate": 1.2169417073450805e-05, - "loss": 1.1113, + "epoch": 0.46537092800437996, + "grad_norm": 0.38869356480860534, + "learning_rate": 1.2915134421077433e-05, + "loss": 1.3763, "step": 1700 }, { - "epoch": 0.48881880733944955, - "grad_norm": 3.275240485720321, - "learning_rate": 1.2120540572916617e-05, - "loss": 1.1516, + "epoch": 0.4667396660279223, + "grad_norm": 0.38290589898614796, + "learning_rate": 1.2869388758851828e-05, + "loss": 1.3204, "step": 1705 }, { - "epoch": 0.49025229357798167, - "grad_norm": 3.350383352712665, - "learning_rate": 1.2071610971369842e-05, - "loss": 1.1564, + "epoch": 0.46810840405146453, + "grad_norm": 0.41880465304512116, + "learning_rate": 1.2823577568324604e-05, + "loss": 1.3858, "step": 1710 }, { - "epoch": 0.49168577981651373, - "grad_norm": 3.5074589174762236, - "learning_rate": 1.2022629494069466e-05, - "loss": 1.1004, + "epoch": 0.46947714207500685, + "grad_norm": 0.4031243358335159, + "learning_rate": 1.2777701895687034e-05, + "loss": 1.3346, "step": 1715 }, { - "epoch": 0.49311926605504586, - "grad_norm": 3.4164962172597035, - "learning_rate": 1.1973597367573509e-05, - "loss": 1.0966, + "epoch": 0.47084588009854916, + "grad_norm": 0.3919138760842289, + "learning_rate": 1.2731762788602988e-05, + "loss": 1.4046, "step": 1720 }, { - "epoch": 0.494552752293578, - "grad_norm": 3.3927103725177825, - "learning_rate": 1.19245158197083e-05, - "loss": 1.1635, + "epoch": 0.4722146181220914, + "grad_norm": 0.40045145219664335, + "learning_rate": 1.2685761296184987e-05, + "loss": 1.3808, "step": 1725 }, { - "epoch": 0.4959862385321101, - "grad_norm": 2.9590270815185193, - "learning_rate": 1.1875386079537762e-05, - "loss": 1.1415, + "epoch": 0.47358335614563374, + "grad_norm": 0.38631426280799075, + "learning_rate": 1.2639698468970237e-05, + "loss": 1.3563, "step": 1730 }, { - "epoch": 0.4974197247706422, - "grad_norm": 3.257588158306314, - "learning_rate": 1.1826209377332593e-05, - "loss": 1.1326, + "epoch": 0.474952094169176, + "grad_norm": 0.3891224496732836, + "learning_rate": 1.259357535889666e-05, + "loss": 1.368, "step": 1735 }, { - "epoch": 0.4988532110091743, - "grad_norm": 3.3051541372419737, - "learning_rate": 1.1776986944539498e-05, - "loss": 1.1278, + "epoch": 0.4763208321927183, + "grad_norm": 0.40044455029632475, + "learning_rate": 1.2547393019278853e-05, + "loss": 1.3548, "step": 1740 }, { - "epoch": 0.5002866972477065, - "grad_norm": 3.523346784327648, - "learning_rate": 1.1727720013750319e-05, - "loss": 1.1099, + "epoch": 0.47768957021626063, + "grad_norm": 0.3899915581292829, + "learning_rate": 1.2501152504784044e-05, + "loss": 1.3291, "step": 1745 }, { - "epoch": 0.5017201834862385, - "grad_norm": 3.3893012078265015, - "learning_rate": 1.1678409818671192e-05, - "loss": 1.1546, + "epoch": 0.4790583082398029, + "grad_norm": 0.3869143385442828, + "learning_rate": 1.2454854871407993e-05, + "loss": 1.3473, "step": 1750 }, { - "epoch": 0.5031536697247706, - "grad_norm": 3.3226857594304713, - "learning_rate": 1.1629057594091639e-05, - "loss": 1.1061, + "epoch": 0.4804270462633452, + "grad_norm": 0.3824047072867315, + "learning_rate": 1.2408501176450898e-05, + "loss": 1.3196, "step": 1755 }, { - "epoch": 0.5045871559633027, - "grad_norm": 3.507323841567208, - "learning_rate": 1.1579664575853667e-05, - "loss": 1.1551, + "epoch": 0.48179578428688746, + "grad_norm": 0.4057108906200233, + "learning_rate": 1.2362092478493226e-05, + "loss": 1.3554, "step": 1760 }, { - "epoch": 0.5060206422018348, - "grad_norm": 3.1700645014865856, - "learning_rate": 1.1530232000820791e-05, - "loss": 1.1151, + "epoch": 0.4831645223104298, + "grad_norm": 0.3776742288321122, + "learning_rate": 1.2315629837371556e-05, + "loss": 1.3014, "step": 1765 }, { - "epoch": 0.507454128440367, - "grad_norm": 3.397843452612661, - "learning_rate": 1.1480761106847088e-05, - "loss": 1.0967, + "epoch": 0.4845332603339721, + "grad_norm": 0.3977510874849243, + "learning_rate": 1.2269114314154365e-05, + "loss": 1.3967, "step": 1770 }, { - "epoch": 0.5088876146788991, - "grad_norm": 3.296593763189619, - "learning_rate": 1.1431253132746187e-05, - "loss": 1.1229, + "epoch": 0.48590199835751435, + "grad_norm": 0.3697505648941807, + "learning_rate": 1.2222546971117797e-05, + "loss": 1.3171, "step": 1775 }, { - "epoch": 0.5103211009174312, - "grad_norm": 3.1860797359537574, - "learning_rate": 1.138170931826025e-05, - "loss": 1.1305, + "epoch": 0.48727073638105667, + "grad_norm": 0.40788370625821474, + "learning_rate": 1.2175928871721411e-05, + "loss": 1.3222, "step": 1780 }, { - "epoch": 0.5117545871559633, - "grad_norm": 3.2917554463603333, - "learning_rate": 1.133213090402893e-05, - "loss": 1.0911, + "epoch": 0.488639474404599, + "grad_norm": 0.39090111193092664, + "learning_rate": 1.2129261080583897e-05, + "loss": 1.3676, "step": 1785 }, { - "epoch": 0.5131880733944955, - "grad_norm": 3.158139008213761, - "learning_rate": 1.1282519131558302e-05, - "loss": 1.1095, + "epoch": 0.49000821242814124, + "grad_norm": 0.3983751803246069, + "learning_rate": 1.2082544663458736e-05, + "loss": 1.3364, "step": 1790 }, { - "epoch": 0.5146215596330275, - "grad_norm": 3.22490301605298, - "learning_rate": 1.1232875243189765e-05, - "loss": 1.1695, + "epoch": 0.49137695045168356, + "grad_norm": 0.3848833412781912, + "learning_rate": 1.20357806872099e-05, + "loss": 1.2978, "step": 1795 }, { - "epoch": 0.5160550458715596, - "grad_norm": 3.586519697086409, - "learning_rate": 1.1183200482068949e-05, - "loss": 1.0989, + "epoch": 0.4927456884752258, + "grad_norm": 0.37897812252926655, + "learning_rate": 1.1988970219787467e-05, + "loss": 1.3333, "step": 1800 }, { - "epoch": 0.5174885321100917, - "grad_norm": 3.265891742392178, - "learning_rate": 1.1133496092114576e-05, - "loss": 1.0851, + "epoch": 0.49411442649876813, + "grad_norm": 0.3948100327011319, + "learning_rate": 1.1942114330203227e-05, + "loss": 1.3665, "step": 1805 }, { - "epoch": 0.5189220183486238, - "grad_norm": 3.1099004425125334, - "learning_rate": 1.1083763317987304e-05, - "loss": 1.1119, + "epoch": 0.49548316452231045, + "grad_norm": 0.3844309877664426, + "learning_rate": 1.1895214088506284e-05, + "loss": 1.3383, "step": 1810 }, { - "epoch": 0.520355504587156, - "grad_norm": 4.5404304007913305, - "learning_rate": 1.103400340505858e-05, - "loss": 1.1143, + "epoch": 0.4968519025458527, + "grad_norm": 0.3785571762413702, + "learning_rate": 1.1848270565758616e-05, + "loss": 1.2997, "step": 1815 }, { - "epoch": 0.5217889908256881, - "grad_norm": 3.4230865219558764, - "learning_rate": 1.0984217599379425e-05, - "loss": 1.1182, + "epoch": 0.498220640569395, + "grad_norm": 0.3873700740980167, + "learning_rate": 1.1801284834010596e-05, + "loss": 1.3332, "step": 1820 }, { - "epoch": 0.5232224770642202, - "grad_norm": 3.1427663664508456, - "learning_rate": 1.093440714764926e-05, - "loss": 1.1317, + "epoch": 0.49958937859293734, + "grad_norm": 0.3971195600650439, + "learning_rate": 1.1754257966276544e-05, + "loss": 1.3468, "step": 1825 }, { - "epoch": 0.5246559633027523, - "grad_norm": 2.9784943507834476, - "learning_rate": 1.088457329718467e-05, - "loss": 1.0364, + "epoch": 0.5009581166164796, + "grad_norm": 0.37924897770225724, + "learning_rate": 1.1707191036510189e-05, + "loss": 1.3152, "step": 1830 }, { - "epoch": 0.5260894495412844, - "grad_norm": 3.3656116327984806, - "learning_rate": 1.0834717295888168e-05, - "loss": 1.1432, + "epoch": 0.5023268546400219, + "grad_norm": 0.37889309262468557, + "learning_rate": 1.1660085119580165e-05, + "loss": 1.3074, "step": 1835 }, { - "epoch": 0.5275229357798165, - "grad_norm": 2.8639878500754774, - "learning_rate": 1.0784840392216961e-05, - "loss": 1.1421, + "epoch": 0.5036955926635642, + "grad_norm": 0.38526267448234497, + "learning_rate": 1.1612941291245456e-05, + "loss": 1.2863, "step": 1840 }, { - "epoch": 0.5289564220183486, - "grad_norm": 3.340151227567289, - "learning_rate": 1.0734943835151674e-05, - "loss": 1.0931, + "epoch": 0.5050643306871064, + "grad_norm": 0.4007327343298376, + "learning_rate": 1.1565760628130824e-05, + "loss": 1.348, "step": 1845 }, { - "epoch": 0.5303899082568807, - "grad_norm": 3.8035180968079993, - "learning_rate": 1.0685028874165075e-05, - "loss": 1.1179, + "epoch": 0.5064330687106487, + "grad_norm": 0.3917232580410847, + "learning_rate": 1.1518544207702238e-05, + "loss": 1.3193, "step": 1850 }, { - "epoch": 0.5318233944954128, - "grad_norm": 3.070748628370571, - "learning_rate": 1.0635096759190792e-05, - "loss": 1.123, + "epoch": 0.5078018067341911, + "grad_norm": 0.41154960813532804, + "learning_rate": 1.1471293108242251e-05, + "loss": 1.3921, "step": 1855 }, { - "epoch": 0.533256880733945, - "grad_norm": 3.226775333715127, - "learning_rate": 1.0585148740592013e-05, - "loss": 1.1159, + "epoch": 0.5091705447577334, + "grad_norm": 0.3808027664016428, + "learning_rate": 1.1424008408825383e-05, + "loss": 1.337, "step": 1860 }, { - "epoch": 0.5346903669724771, - "grad_norm": 3.236504164334084, - "learning_rate": 1.053518606913017e-05, - "loss": 1.1494, + "epoch": 0.5105392827812757, + "grad_norm": 0.38767901381709907, + "learning_rate": 1.1376691189293474e-05, + "loss": 1.3142, "step": 1865 }, { - "epoch": 0.5361238532110092, - "grad_norm": 3.2302791306950702, - "learning_rate": 1.048520999593362e-05, - "loss": 1.1444, + "epoch": 0.511908020804818, + "grad_norm": 0.39402951617560983, + "learning_rate": 1.1329342530231036e-05, + "loss": 1.358, "step": 1870 }, { - "epoch": 0.5375573394495413, - "grad_norm": 3.2832052841745916, - "learning_rate": 1.0435221772466318e-05, - "loss": 1.1469, + "epoch": 0.5132767588283602, + "grad_norm": 0.38538496638703307, + "learning_rate": 1.128196351294055e-05, + "loss": 1.2994, "step": 1875 }, { - "epoch": 0.5389908256880734, - "grad_norm": 3.2524360195076496, - "learning_rate": 1.0385222650496479e-05, - "loss": 1.1313, + "epoch": 0.5146454968519025, + "grad_norm": 0.38082093170114384, + "learning_rate": 1.1234555219417804e-05, + "loss": 1.3208, "step": 1880 }, { - "epoch": 0.5404243119266054, - "grad_norm": 2.919300243143409, - "learning_rate": 1.0335213882065225e-05, - "loss": 1.1112, + "epoch": 0.5160142348754448, + "grad_norm": 0.3748307863634792, + "learning_rate": 1.1187118732327167e-05, + "loss": 1.3506, "step": 1885 }, { - "epoch": 0.5418577981651376, - "grad_norm": 3.7356105416504546, - "learning_rate": 1.0285196719455242e-05, - "loss": 1.1113, + "epoch": 0.5173829728989872, + "grad_norm": 0.38705593276141625, + "learning_rate": 1.1139655134976855e-05, + "loss": 1.2906, "step": 1890 }, { - "epoch": 0.5432912844036697, - "grad_norm": 3.1163300953894306, - "learning_rate": 1.0235172415159418e-05, - "loss": 1.0671, + "epoch": 0.5187517109225295, + "grad_norm": 0.39888481423655353, + "learning_rate": 1.1092165511294206e-05, + "loss": 1.367, "step": 1895 }, { - "epoch": 0.5447247706422018, - "grad_norm": 3.6338243642002976, - "learning_rate": 1.0185142221849469e-05, - "loss": 1.1405, + "epoch": 0.5201204489460717, + "grad_norm": 0.3971141048112466, + "learning_rate": 1.104465094580093e-05, + "loss": 1.3405, "step": 1900 }, { - "epoch": 0.5461582568807339, - "grad_norm": 3.4265846748133613, - "learning_rate": 1.0135107392344594e-05, - "loss": 1.1052, + "epoch": 0.521489186969614, + "grad_norm": 0.3837313790185174, + "learning_rate": 1.0997112523588322e-05, + "loss": 1.3351, "step": 1905 }, { - "epoch": 0.5475917431192661, - "grad_norm": 3.6433669703963774, - "learning_rate": 1.0085069179580076e-05, - "loss": 1.0958, + "epoch": 0.5228579249931563, + "grad_norm": 0.39019433157562, + "learning_rate": 1.0949551330292502e-05, + "loss": 1.2482, "step": 1910 }, { - "epoch": 0.5490252293577982, - "grad_norm": 3.070116617818365, - "learning_rate": 1.0035028836575922e-05, - "loss": 1.0856, + "epoch": 0.5242266630166986, + "grad_norm": 0.39062012523465395, + "learning_rate": 1.090196845206961e-05, + "loss": 1.343, "step": 1915 }, { - "epoch": 0.5504587155963303, - "grad_norm": 3.3094892848191373, - "learning_rate": 9.984987616405486e-06, - "loss": 1.1775, + "epoch": 0.5255954010402409, + "grad_norm": 0.3948259084399907, + "learning_rate": 1.0854364975571004e-05, + "loss": 1.3001, "step": 1920 }, { - "epoch": 0.5518922018348624, - "grad_norm": 3.1253255381505465, - "learning_rate": 9.934946772164082e-06, - "loss": 1.0962, + "epoch": 0.5269641390637831, + "grad_norm": 0.3845620013681871, + "learning_rate": 1.0806741987918448e-05, + "loss": 1.3449, "step": 1925 }, { - "epoch": 0.5533256880733946, - "grad_norm": 3.2006466765888377, - "learning_rate": 9.884907556937619e-06, - "loss": 1.0732, + "epoch": 0.5283328770873255, + "grad_norm": 0.3976520791295208, + "learning_rate": 1.075910057667928e-05, + "loss": 1.3007, "step": 1930 }, { - "epoch": 0.5547591743119266, - "grad_norm": 3.1292256633487647, - "learning_rate": 9.834871223771204e-06, - "loss": 1.0887, + "epoch": 0.5297016151108678, + "grad_norm": 0.3817709747936168, + "learning_rate": 1.071144182984158e-05, + "loss": 1.3453, "step": 1935 }, { - "epoch": 0.5561926605504587, - "grad_norm": 3.1916225285491886, - "learning_rate": 9.78483902563778e-06, - "loss": 1.1239, + "epoch": 0.5310703531344101, + "grad_norm": 0.38542609340341144, + "learning_rate": 1.0663766835789327e-05, + "loss": 1.3187, "step": 1940 }, { - "epoch": 0.5576261467889908, - "grad_norm": 3.0332428019262623, - "learning_rate": 9.73481221540674e-06, - "loss": 1.1231, + "epoch": 0.5324390911579524, + "grad_norm": 0.3775171570445251, + "learning_rate": 1.0616076683277524e-05, + "loss": 1.3315, "step": 1945 }, { - "epoch": 0.5590596330275229, - "grad_norm": 3.045815826748736, - "learning_rate": 9.684792045812555e-06, - "loss": 1.1098, + "epoch": 0.5338078291814946, + "grad_norm": 0.40024298975169065, + "learning_rate": 1.056837246140736e-05, + "loss": 1.3772, "step": 1950 }, { - "epoch": 0.560493119266055, - "grad_norm": 3.0391892370593334, - "learning_rate": 9.634779769423412e-06, - "loss": 1.1044, + "epoch": 0.5351765672050369, + "grad_norm": 0.3882397104456436, + "learning_rate": 1.0520655259601325e-05, + "loss": 1.3284, "step": 1955 }, { - "epoch": 0.5619266055045872, - "grad_norm": 3.055569668736528, - "learning_rate": 9.584776638609841e-06, - "loss": 1.0894, + "epoch": 0.5365453052285792, + "grad_norm": 0.3892143818918897, + "learning_rate": 1.0472926167578323e-05, + "loss": 1.302, "step": 1960 }, { - "epoch": 0.5633600917431193, - "grad_norm": 3.5406439314630167, - "learning_rate": 9.534783905513355e-06, - "loss": 1.1109, + "epoch": 0.5379140432521216, + "grad_norm": 0.41282669532669847, + "learning_rate": 1.042518627532881e-05, + "loss": 1.3261, "step": 1965 }, { - "epoch": 0.5647935779816514, - "grad_norm": 3.345905555913882, - "learning_rate": 9.484802822015087e-06, - "loss": 1.1138, + "epoch": 0.5392827812756639, + "grad_norm": 0.37154889529358315, + "learning_rate": 1.0377436673089873e-05, + "loss": 1.3142, "step": 1970 }, { - "epoch": 0.5662270642201835, - "grad_norm": 3.296141105361695, - "learning_rate": 9.434834639704464e-06, - "loss": 1.1059, + "epoch": 0.5406515192992062, + "grad_norm": 0.4001478843028637, + "learning_rate": 1.0329678451320352e-05, + "loss": 1.3894, "step": 1975 }, { - "epoch": 0.5676605504587156, - "grad_norm": 2.8031036854561697, - "learning_rate": 9.384880609847838e-06, - "loss": 1.0806, + "epoch": 0.5420202573227484, + "grad_norm": 0.4082737912884333, + "learning_rate": 1.0281912700675937e-05, + "loss": 1.38, "step": 1980 }, { - "epoch": 0.5690940366972477, - "grad_norm": 3.0977315843393165, - "learning_rate": 9.33494198335717e-06, - "loss": 1.1364, + "epoch": 0.5433889953462907, + "grad_norm": 0.3976694377900842, + "learning_rate": 1.0234140511984246e-05, + "loss": 1.337, "step": 1985 }, { - "epoch": 0.5705275229357798, - "grad_norm": 3.3929218329827604, - "learning_rate": 9.285020010758706e-06, - "loss": 1.1644, + "epoch": 0.544757733369833, + "grad_norm": 0.4090490835246688, + "learning_rate": 1.0186362976219926e-05, + "loss": 1.3389, "step": 1990 }, { - "epoch": 0.5719610091743119, - "grad_norm": 3.2746802917151396, - "learning_rate": 9.235115942161656e-06, - "loss": 1.0867, + "epoch": 0.5461264713933753, + "grad_norm": 0.39123091240448754, + "learning_rate": 1.0138581184479737e-05, + "loss": 1.2726, "step": 1995 }, { - "epoch": 0.573394495412844, - "grad_norm": 3.4111204084118594, - "learning_rate": 9.18523102722688e-06, - "loss": 1.1372, + "epoch": 0.5474952094169176, + "grad_norm": 0.383808685203924, + "learning_rate": 1.0090796227957633e-05, + "loss": 1.3344, "step": 2000 }, { - "epoch": 0.5748279816513762, - "grad_norm": 3.0339025073036883, - "learning_rate": 9.135366515135617e-06, - "loss": 1.1156, + "epoch": 0.5488639474404599, + "grad_norm": 0.39282920042398406, + "learning_rate": 1.0043009197919836e-05, + "loss": 1.3188, "step": 2005 }, { - "epoch": 0.5762614678899083, - "grad_norm": 3.152200571534001, - "learning_rate": 9.0855236545582e-06, - "loss": 1.2007, + "epoch": 0.5502326854640022, + "grad_norm": 0.3877735838132022, + "learning_rate": 9.99522118567993e-06, + "loss": 1.3412, "step": 2010 }, { - "epoch": 0.5776949541284404, - "grad_norm": 5.672098040763039, - "learning_rate": 9.035703693622762e-06, - "loss": 1.1011, + "epoch": 0.5516014234875445, + "grad_norm": 0.3855977732261345, + "learning_rate": 9.947433282573926e-06, + "loss": 1.31, "step": 2015 }, { - "epoch": 0.5791284403669725, - "grad_norm": 3.0337007868840207, - "learning_rate": 8.985907879884011e-06, - "loss": 1.0502, + "epoch": 0.5529701615110868, + "grad_norm": 0.39182359203627265, + "learning_rate": 9.899646579935336e-06, + "loss": 1.3288, "step": 2020 }, { - "epoch": 0.5805619266055045, - "grad_norm": 3.397456497600358, - "learning_rate": 8.936137460291985e-06, - "loss": 1.0686, + "epoch": 0.5543388995346291, + "grad_norm": 0.37896193597955086, + "learning_rate": 9.85186216907027e-06, + "loss": 1.3222, "step": 2025 }, { - "epoch": 0.5819954128440367, - "grad_norm": 3.1557825825649983, - "learning_rate": 8.886393681160804e-06, - "loss": 1.1242, + "epoch": 0.5557076375581713, + "grad_norm": 0.39627324276578396, + "learning_rate": 9.80408114123249e-06, + "loss": 1.3659, "step": 2030 }, { - "epoch": 0.5834288990825688, - "grad_norm": 3.014868609795805, - "learning_rate": 8.836677788137488e-06, - "loss": 1.0823, + "epoch": 0.5570763755817136, + "grad_norm": 0.39331176053433153, + "learning_rate": 9.756304587598503e-06, + "loss": 1.4234, "step": 2035 }, { - "epoch": 0.5848623853211009, - "grad_norm": 3.1697976935253256, - "learning_rate": 8.78699102617076e-06, - "loss": 1.1477, + "epoch": 0.558445113605256, + "grad_norm": 0.38413569943245757, + "learning_rate": 9.708533599242643e-06, + "loss": 1.2809, "step": 2040 }, { - "epoch": 0.586295871559633, - "grad_norm": 3.060910090014108, - "learning_rate": 8.737334639479843e-06, - "loss": 1.1047, + "epoch": 0.5598138516287983, + "grad_norm": 0.378050582923272, + "learning_rate": 9.660769267112137e-06, + "loss": 1.3, "step": 2045 }, { - "epoch": 0.5877293577981652, - "grad_norm": 3.3975873769480525, - "learning_rate": 8.687709871523346e-06, - "loss": 1.1472, + "epoch": 0.5611825896523406, + "grad_norm": 0.3810287903511996, + "learning_rate": 9.61301268200222e-06, + "loss": 1.328, "step": 2050 }, { - "epoch": 0.5891628440366973, - "grad_norm": 3.103294993064734, - "learning_rate": 8.638117964968098e-06, - "loss": 1.1147, + "epoch": 0.5625513276758828, + "grad_norm": 0.3968883040797777, + "learning_rate": 9.5652649345312e-06, + "loss": 1.3861, "step": 2055 }, { - "epoch": 0.5905963302752294, - "grad_norm": 3.1038713521837176, - "learning_rate": 8.588560161658039e-06, - "loss": 1.1201, + "epoch": 0.5639200656994251, + "grad_norm": 0.381754630880566, + "learning_rate": 9.517527115115554e-06, + "loss": 1.2931, "step": 2060 }, { - "epoch": 0.5920298165137615, - "grad_norm": 3.579612587244007, - "learning_rate": 8.539037702583108e-06, - "loss": 1.2046, + "epoch": 0.5652888037229674, + "grad_norm": 0.3934475062551264, + "learning_rate": 9.46980031394504e-06, + "loss": 1.357, "step": 2065 }, { - "epoch": 0.5934633027522935, - "grad_norm": 3.1302516904022766, - "learning_rate": 8.489551827848197e-06, - "loss": 1.0237, + "epoch": 0.5666575417465097, + "grad_norm": 0.3764960791780743, + "learning_rate": 9.422085620957795e-06, + "loss": 1.3112, "step": 2070 }, { - "epoch": 0.5948967889908257, - "grad_norm": 3.24638481758472, - "learning_rate": 8.440103776642074e-06, - "loss": 1.0678, + "epoch": 0.568026279770052, + "grad_norm": 0.3819475902039454, + "learning_rate": 9.374384125815427e-06, + "loss": 1.317, "step": 2075 }, { - "epoch": 0.5963302752293578, - "grad_norm": 3.4237594408353083, - "learning_rate": 8.390694787206349e-06, - "loss": 1.1734, + "epoch": 0.5693950177935944, + "grad_norm": 0.39075935188345634, + "learning_rate": 9.326696917878159e-06, + "loss": 1.329, "step": 2080 }, { - "epoch": 0.5977637614678899, - "grad_norm": 3.0719379679166625, - "learning_rate": 8.341326096804489e-06, - "loss": 1.0969, + "epoch": 0.5707637558171366, + "grad_norm": 0.4115410325839856, + "learning_rate": 9.27902508617993e-06, + "loss": 1.3405, "step": 2085 }, { - "epoch": 0.599197247706422, - "grad_norm": 3.114117068959359, - "learning_rate": 8.291998941690821e-06, - "loss": 1.1036, + "epoch": 0.5721324938406789, + "grad_norm": 0.3929327529237431, + "learning_rate": 9.23136971940353e-06, + "loss": 1.2811, "step": 2090 }, { - "epoch": 0.6006307339449541, - "grad_norm": 3.058349370894333, - "learning_rate": 8.242714557079563e-06, - "loss": 1.0749, + "epoch": 0.5735012318642212, + "grad_norm": 0.38257057379328274, + "learning_rate": 9.183731905855746e-06, + "loss": 1.3234, "step": 2095 }, { - "epoch": 0.6020642201834863, - "grad_norm": 3.036934322182637, - "learning_rate": 8.193474177113918e-06, - "loss": 1.0583, + "epoch": 0.5748699698877635, + "grad_norm": 0.38674820150451145, + "learning_rate": 9.136112733442493e-06, + "loss": 1.3466, "step": 2100 }, { - "epoch": 0.6034977064220184, - "grad_norm": 3.13799906852602, - "learning_rate": 8.144279034835157e-06, - "loss": 1.1374, + "epoch": 0.5762387079113058, + "grad_norm": 0.39368922489348585, + "learning_rate": 9.088513289643982e-06, + "loss": 1.3149, "step": 2105 }, { - "epoch": 0.6049311926605505, - "grad_norm": 3.303135805293391, - "learning_rate": 8.095130362151737e-06, - "loss": 1.0465, + "epoch": 0.577607445934848, + "grad_norm": 0.38199335563674713, + "learning_rate": 9.040934661489885e-06, + "loss": 1.3576, "step": 2110 }, { - "epoch": 0.6063646788990825, - "grad_norm": 3.3978912865766686, - "learning_rate": 8.046029389808457e-06, - "loss": 1.055, + "epoch": 0.5789761839583903, + "grad_norm": 0.39323361965983067, + "learning_rate": 8.9933779355345e-06, + "loss": 1.3159, "step": 2115 }, { - "epoch": 0.6077981651376146, - "grad_norm": 3.47121454527395, - "learning_rate": 7.996977347355647e-06, - "loss": 1.1155, + "epoch": 0.5803449219819327, + "grad_norm": 0.39256654497554794, + "learning_rate": 8.945844197831945e-06, + "loss": 1.3623, "step": 2120 }, { - "epoch": 0.6092316513761468, - "grad_norm": 3.156947339229311, - "learning_rate": 7.947975463118361e-06, - "loss": 1.0363, + "epoch": 0.581713660005475, + "grad_norm": 0.3905410660434604, + "learning_rate": 8.898334533911362e-06, + "loss": 1.3074, "step": 2125 }, { - "epoch": 0.6106651376146789, - "grad_norm": 3.2797688368791955, - "learning_rate": 7.899024964165634e-06, - "loss": 1.1381, + "epoch": 0.5830823980290173, + "grad_norm": 0.3700729231212675, + "learning_rate": 8.850850028752108e-06, + "loss": 1.3436, "step": 2130 }, { - "epoch": 0.612098623853211, - "grad_norm": 2.9662302688805275, - "learning_rate": 7.850127076279747e-06, - "loss": 1.1032, + "epoch": 0.5844511360525595, + "grad_norm": 0.3701852614302805, + "learning_rate": 8.803391766758998e-06, + "loss": 1.2672, "step": 2135 }, { - "epoch": 0.6135321100917431, - "grad_norm": 3.203748291163447, - "learning_rate": 7.801283023925536e-06, - "loss": 1.0517, + "epoch": 0.5858198740761018, + "grad_norm": 0.3809658344783075, + "learning_rate": 8.755960831737529e-06, + "loss": 1.3638, "step": 2140 }, { - "epoch": 0.6149655963302753, - "grad_norm": 3.319287430110112, - "learning_rate": 7.752494030219724e-06, - "loss": 1.115, + "epoch": 0.5871886120996441, + "grad_norm": 0.3852501406589481, + "learning_rate": 8.708558306869125e-06, + "loss": 1.3389, "step": 2145 }, { - "epoch": 0.6163990825688074, - "grad_norm": 3.7703557404425823, - "learning_rate": 7.703761316900293e-06, - "loss": 1.0813, + "epoch": 0.5885573501231864, + "grad_norm": 0.4003501899309122, + "learning_rate": 8.661185274686418e-06, + "loss": 1.3228, "step": 2150 }, { - "epoch": 0.6178325688073395, - "grad_norm": 3.2303522091544727, - "learning_rate": 7.655086104295904e-06, - "loss": 1.0668, + "epoch": 0.5899260881467288, + "grad_norm": 0.38933438340786924, + "learning_rate": 8.613842817048503e-06, + "loss": 1.3627, "step": 2155 }, { - "epoch": 0.6192660550458715, - "grad_norm": 3.1818989111713676, - "learning_rate": 7.606469611295315e-06, - "loss": 1.1368, + "epoch": 0.591294826170271, + "grad_norm": 0.39608781513600605, + "learning_rate": 8.566532015116251e-06, + "loss": 1.3626, "step": 2160 }, { - "epoch": 0.6206995412844036, - "grad_norm": 3.0516455298261347, - "learning_rate": 7.5579130553168815e-06, - "loss": 1.1132, + "epoch": 0.5926635641938133, + "grad_norm": 0.37582004076749886, + "learning_rate": 8.51925394932761e-06, + "loss": 1.2766, "step": 2165 }, { - "epoch": 0.6221330275229358, - "grad_norm": 3.3307988467764087, - "learning_rate": 7.50941765227805e-06, - "loss": 1.0518, + "epoch": 0.5940323022173556, + "grad_norm": 0.39046439420533485, + "learning_rate": 8.47200969937293e-06, + "loss": 1.3342, "step": 2170 }, { - "epoch": 0.6235665137614679, - "grad_norm": 3.3783423662774363, - "learning_rate": 7.460984616564929e-06, - "loss": 1.1132, + "epoch": 0.5954010402408979, + "grad_norm": 0.3846853640345239, + "learning_rate": 8.424800344170308e-06, + "loss": 1.2973, "step": 2175 }, { - "epoch": 0.625, - "grad_norm": 3.634134930894941, - "learning_rate": 7.412615161001866e-06, - "loss": 1.0798, + "epoch": 0.5967697782644402, + "grad_norm": 0.3794031254673053, + "learning_rate": 8.37762696184095e-06, + "loss": 1.3288, "step": 2180 }, { - "epoch": 0.6264334862385321, - "grad_norm": 3.2417955659870796, - "learning_rate": 7.364310496821086e-06, - "loss": 1.0864, + "epoch": 0.5981385162879824, + "grad_norm": 0.38487683969290326, + "learning_rate": 8.330490629684547e-06, + "loss": 1.3374, "step": 2185 }, { - "epoch": 0.6278669724770642, - "grad_norm": 3.1150031782926395, - "learning_rate": 7.316071833632346e-06, - "loss": 1.1044, + "epoch": 0.5995072543115247, + "grad_norm": 0.38666671699682115, + "learning_rate": 8.28339242415468e-06, + "loss": 1.3202, "step": 2190 }, { - "epoch": 0.6293004587155964, - "grad_norm": 3.058907033640181, - "learning_rate": 7.2679003793926626e-06, - "loss": 1.1097, + "epoch": 0.600875992335067, + "grad_norm": 0.3788541652969095, + "learning_rate": 8.236333420834216e-06, + "loss": 1.2778, "step": 2195 }, { - "epoch": 0.6307339449541285, - "grad_norm": 3.2942538258713348, - "learning_rate": 7.2197973403760614e-06, - "loss": 1.1005, + "epoch": 0.6022447303586094, + "grad_norm": 0.4003559394681289, + "learning_rate": 8.189314694410781e-06, + "loss": 1.3349, "step": 2200 }, { - "epoch": 0.6321674311926605, - "grad_norm": 3.225308816069398, - "learning_rate": 7.171763921143346e-06, - "loss": 1.0627, + "epoch": 0.6036134683821517, + "grad_norm": 0.40672129713209243, + "learning_rate": 8.14233731865218e-06, + "loss": 1.3343, "step": 2205 }, { - "epoch": 0.6336009174311926, - "grad_norm": 3.0409935206984566, - "learning_rate": 7.123801324511972e-06, - "loss": 1.0937, + "epoch": 0.604982206405694, + "grad_norm": 0.392344465653996, + "learning_rate": 8.0954023663819e-06, + "loss": 1.2804, "step": 2210 }, { - "epoch": 0.6350344036697247, - "grad_norm": 9.664703329802007, - "learning_rate": 7.075910751525895e-06, - "loss": 1.1024, + "epoch": 0.6063509444292362, + "grad_norm": 0.396931681160349, + "learning_rate": 8.048510909454601e-06, + "loss": 1.3712, "step": 2215 }, { - "epoch": 0.6364678899082569, - "grad_norm": 3.057262418668561, - "learning_rate": 7.0280934014255195e-06, - "loss": 1.0857, + "epoch": 0.6077196824527785, + "grad_norm": 0.3882127247657791, + "learning_rate": 8.001664018731637e-06, + "loss": 1.3313, "step": 2220 }, { - "epoch": 0.637901376146789, - "grad_norm": 3.0509477926802395, - "learning_rate": 6.980350471617638e-06, - "loss": 1.0727, + "epoch": 0.6090884204763208, + "grad_norm": 0.3727086035483801, + "learning_rate": 7.954862764056602e-06, + "loss": 1.2715, "step": 2225 }, { - "epoch": 0.6393348623853211, - "grad_norm": 3.4647112333044743, - "learning_rate": 6.9326831576454835e-06, - "loss": 1.1128, + "epoch": 0.6104571584998632, + "grad_norm": 0.39519191088539024, + "learning_rate": 7.908108214230902e-06, + "loss": 1.3018, "step": 2230 }, { - "epoch": 0.6407683486238532, - "grad_norm": 8.615283468155432, - "learning_rate": 6.885092653158768e-06, - "loss": 1.1205, + "epoch": 0.6118258965234055, + "grad_norm": 0.39954515244878724, + "learning_rate": 7.861401436989337e-06, + "loss": 1.3697, "step": 2235 }, { - "epoch": 0.6422018348623854, - "grad_norm": 3.6518697308480927, - "learning_rate": 6.837580149883787e-06, - "loss": 1.0998, + "epoch": 0.6131946345469477, + "grad_norm": 0.3961862899698687, + "learning_rate": 7.814743498975731e-06, + "loss": 1.3651, "step": 2240 }, { - "epoch": 0.6436353211009175, - "grad_norm": 3.2292043015948493, - "learning_rate": 6.790146837593599e-06, - "loss": 1.0467, + "epoch": 0.61456337257049, + "grad_norm": 0.38565731038310297, + "learning_rate": 7.768135465718559e-06, + "loss": 1.3462, "step": 2245 }, { - "epoch": 0.6450688073394495, - "grad_norm": 3.060662379650319, - "learning_rate": 6.7427939040782175e-06, - "loss": 1.0426, + "epoch": 0.6159321105940323, + "grad_norm": 0.38796255525719986, + "learning_rate": 7.72157840160662e-06, + "loss": 1.3087, "step": 2250 }, { - "epoch": 0.6465022935779816, - "grad_norm": 3.3641577098688087, - "learning_rate": 6.695522535114866e-06, - "loss": 1.0968, + "epoch": 0.6173008486175746, + "grad_norm": 0.3947254708668997, + "learning_rate": 7.67507336986474e-06, + "loss": 1.262, "step": 2255 }, { - "epoch": 0.6479357798165137, - "grad_norm": 3.025310634435396, - "learning_rate": 6.64833391443829e-06, - "loss": 1.1066, + "epoch": 0.6186695866411169, + "grad_norm": 0.39167259958488637, + "learning_rate": 7.628621432529461e-06, + "loss": 1.3117, "step": 2260 }, { - "epoch": 0.6493692660550459, - "grad_norm": 3.3638409022750477, - "learning_rate": 6.601229223711123e-06, - "loss": 1.061, + "epoch": 0.6200383246646591, + "grad_norm": 0.39458361491505556, + "learning_rate": 7.582223650424825e-06, + "loss": 1.3067, "step": 2265 }, { - "epoch": 0.650802752293578, - "grad_norm": 3.0591413772292415, - "learning_rate": 6.554209642494267e-06, - "loss": 1.1305, + "epoch": 0.6214070626882015, + "grad_norm": 0.3948240515553033, + "learning_rate": 7.5358810831381225e-06, + "loss": 1.3177, "step": 2270 }, { - "epoch": 0.6522362385321101, - "grad_norm": 3.048895188924846, - "learning_rate": 6.507276348217393e-06, - "loss": 1.0676, + "epoch": 0.6227758007117438, + "grad_norm": 0.3814626352546649, + "learning_rate": 7.489594788995698e-06, + "loss": 1.3259, "step": 2275 }, { - "epoch": 0.6536697247706422, - "grad_norm": 3.1625182877065, - "learning_rate": 6.460430516149433e-06, - "loss": 0.9783, + "epoch": 0.6241445387352861, + "grad_norm": 0.39070667312634827, + "learning_rate": 7.443365825038793e-06, + "loss": 1.3212, "step": 2280 }, { - "epoch": 0.6551032110091743, - "grad_norm": 3.0971219975300093, - "learning_rate": 6.413673319369145e-06, - "loss": 1.0947, + "epoch": 0.6255132767588284, + "grad_norm": 0.38913877038236866, + "learning_rate": 7.397195246999391e-06, + "loss": 1.346, "step": 2285 }, { - "epoch": 0.6565366972477065, - "grad_norm": 3.0087113841047706, - "learning_rate": 6.36700592873576e-06, - "loss": 1.1115, + "epoch": 0.6268820147823706, + "grad_norm": 0.4070153012541291, + "learning_rate": 7.351084109276119e-06, + "loss": 1.3826, "step": 2290 }, { - "epoch": 0.6579701834862385, - "grad_norm": 3.240656660427129, - "learning_rate": 6.320429512859645e-06, - "loss": 1.0604, + "epoch": 0.6282507528059129, + "grad_norm": 0.36428744429147014, + "learning_rate": 7.305033464910164e-06, + "loss": 1.2659, "step": 2295 }, { - "epoch": 0.6594036697247706, - "grad_norm": 2.8631951161021423, - "learning_rate": 6.273945238073047e-06, - "loss": 1.0521, + "epoch": 0.6296194908294552, + "grad_norm": 0.3877290507395246, + "learning_rate": 7.25904436556122e-06, + "loss": 1.304, "step": 2300 }, { - "epoch": 0.6608371559633027, - "grad_norm": 3.372695675980184, - "learning_rate": 6.227554268400875e-06, - "loss": 1.1259, + "epoch": 0.6309882288529975, + "grad_norm": 0.3803920075270922, + "learning_rate": 7.21311786148348e-06, + "loss": 1.3333, "step": 2305 }, { - "epoch": 0.6622706422018348, - "grad_norm": 3.0645684579682353, - "learning_rate": 6.1812577655315695e-06, - "loss": 1.0146, + "epoch": 0.6323569668765399, + "grad_norm": 0.3822541348646928, + "learning_rate": 7.167255001501651e-06, + "loss": 1.3307, "step": 2310 }, { - "epoch": 0.663704128440367, - "grad_norm": 3.2992667272685456, - "learning_rate": 6.135056888788004e-06, - "loss": 1.0908, + "epoch": 0.6337257049000822, + "grad_norm": 0.3977624877197784, + "learning_rate": 7.121456832986988e-06, + "loss": 1.3329, "step": 2315 }, { - "epoch": 0.6651376146788991, - "grad_norm": 3.134342552266428, - "learning_rate": 6.088952795098442e-06, - "loss": 1.0408, + "epoch": 0.6350944429236244, + "grad_norm": 0.38990040940646387, + "learning_rate": 7.075724401833395e-06, + "loss": 1.3647, "step": 2320 }, { - "epoch": 0.6665711009174312, - "grad_norm": 3.195978033002455, - "learning_rate": 6.042946638967586e-06, - "loss": 1.0734, + "epoch": 0.6364631809471667, + "grad_norm": 0.39550975070062716, + "learning_rate": 7.030058752433526e-06, + "loss": 1.2715, "step": 2325 }, { - "epoch": 0.6680045871559633, - "grad_norm": 3.244082712886425, - "learning_rate": 5.997039572447658e-06, - "loss": 1.0466, + "epoch": 0.637831918970709, + "grad_norm": 0.3857148917491234, + "learning_rate": 6.984460927654937e-06, + "loss": 1.3191, "step": 2330 }, { - "epoch": 0.6694380733944955, - "grad_norm": 2.9373127993862105, - "learning_rate": 5.951232745109552e-06, - "loss": 1.0658, + "epoch": 0.6392006569942513, + "grad_norm": 0.38855941306596253, + "learning_rate": 6.938931968816275e-06, + "loss": 1.2979, "step": 2335 }, { - "epoch": 0.6708715596330275, - "grad_norm": 3.1038594855264967, - "learning_rate": 5.9055273040140374e-06, - "loss": 1.0916, + "epoch": 0.6405693950177936, + "grad_norm": 0.3892088605550479, + "learning_rate": 6.893472915663493e-06, + "loss": 1.3826, "step": 2340 }, { - "epoch": 0.6723050458715596, - "grad_norm": 3.1281256278361256, - "learning_rate": 5.859924393683056e-06, - "loss": 1.1042, + "epoch": 0.6419381330413358, + "grad_norm": 0.4024108620394257, + "learning_rate": 6.8480848063461035e-06, + "loss": 1.2943, "step": 2345 }, { - "epoch": 0.6737385321100917, - "grad_norm": 2.9165714439580825, - "learning_rate": 5.8144251560710415e-06, - "loss": 1.0735, + "epoch": 0.6433068710648782, + "grad_norm": 0.3976600247063458, + "learning_rate": 6.8027686773934765e-06, + "loss": 1.3241, "step": 2350 }, { - "epoch": 0.6751720183486238, - "grad_norm": 3.3690225312568955, - "learning_rate": 5.769030730536336e-06, - "loss": 1.0574, + "epoch": 0.6446756090884205, + "grad_norm": 0.38538373822690386, + "learning_rate": 6.7575255636911626e-06, + "loss": 1.3316, "step": 2355 }, { - "epoch": 0.676605504587156, - "grad_norm": 3.298580191163876, - "learning_rate": 5.723742253812658e-06, - "loss": 1.1132, + "epoch": 0.6460443471119628, + "grad_norm": 0.3961847974725629, + "learning_rate": 6.7123564984572596e-06, + "loss": 1.3481, "step": 2360 }, { - "epoch": 0.6780389908256881, - "grad_norm": 3.004930197106605, - "learning_rate": 5.678560859980621e-06, - "loss": 1.0691, + "epoch": 0.6474130851355051, + "grad_norm": 0.3739018408786073, + "learning_rate": 6.667262513218824e-06, + "loss": 1.2857, "step": 2365 }, { - "epoch": 0.6794724770642202, - "grad_norm": 3.1318112002832197, - "learning_rate": 5.633487680439362e-06, - "loss": 1.0305, + "epoch": 0.6487818231590473, + "grad_norm": 0.4058027362822848, + "learning_rate": 6.622244637788302e-06, + "loss": 1.3623, "step": 2370 }, { - "epoch": 0.6809059633027523, - "grad_norm": 3.139769183129407, - "learning_rate": 5.588523843878189e-06, - "loss": 1.0547, + "epoch": 0.6501505611825896, + "grad_norm": 0.3887614780397367, + "learning_rate": 6.577303900240023e-06, + "loss": 1.3466, "step": 2375 }, { - "epoch": 0.6823394495412844, - "grad_norm": 3.060531894281262, - "learning_rate": 5.543670476248327e-06, - "loss": 1.0354, + "epoch": 0.6515192992061319, + "grad_norm": 0.39583219449776136, + "learning_rate": 6.532441326886716e-06, + "loss": 1.3826, "step": 2380 }, { - "epoch": 0.6837729357798165, - "grad_norm": 3.1023960334424556, - "learning_rate": 5.498928700734713e-06, - "loss": 1.0565, + "epoch": 0.6528880372296743, + "grad_norm": 0.38600694323078566, + "learning_rate": 6.487657942256069e-06, + "loss": 1.3124, "step": 2385 }, { - "epoch": 0.6852064220183486, - "grad_norm": 3.1211535834048396, - "learning_rate": 5.454299637727885e-06, - "loss": 1.0662, + "epoch": 0.6542567752532166, + "grad_norm": 0.35581732986644143, + "learning_rate": 6.442954769067341e-06, + "loss": 1.3352, "step": 2390 }, { - "epoch": 0.6866399082568807, - "grad_norm": 2.99851195359665, - "learning_rate": 5.409784404795913e-06, - "loss": 1.0467, + "epoch": 0.6556255132767588, + "grad_norm": 0.3977530272775962, + "learning_rate": 6.398332828207996e-06, + "loss": 1.3226, "step": 2395 }, { - "epoch": 0.6880733944954128, - "grad_norm": 3.0166642308548606, - "learning_rate": 5.365384116656415e-06, - "loss": 1.1033, + "epoch": 0.6569942513003011, + "grad_norm": 0.3928175408666658, + "learning_rate": 6.3537931387103925e-06, + "loss": 1.3262, "step": 2400 }, { - "epoch": 0.689506880733945, - "grad_norm": 3.1825150095020724, - "learning_rate": 5.321099885148652e-06, - "loss": 1.0541, + "epoch": 0.6583629893238434, + "grad_norm": 0.3795840583774513, + "learning_rate": 6.309336717728516e-06, + "loss": 1.3203, "step": 2405 }, { - "epoch": 0.6909403669724771, - "grad_norm": 3.3490852800179174, - "learning_rate": 5.2769328192056824e-06, - "loss": 1.0624, + "epoch": 0.6597317273473857, + "grad_norm": 0.396679060826067, + "learning_rate": 6.264964580514745e-06, + "loss": 1.2842, "step": 2410 }, { - "epoch": 0.6923738532110092, - "grad_norm": 3.1423133524388698, - "learning_rate": 5.23288402482658e-06, - "loss": 1.0582, + "epoch": 0.661100465370928, + "grad_norm": 0.3699325537006795, + "learning_rate": 6.220677740396668e-06, + "loss": 1.2936, "step": 2415 }, { - "epoch": 0.6938073394495413, - "grad_norm": 3.1573558030480235, - "learning_rate": 5.18895460504876e-06, - "loss": 1.0325, + "epoch": 0.6624692033944704, + "grad_norm": 0.38098356916961845, + "learning_rate": 6.176477208753944e-06, + "loss": 1.276, "step": 2420 }, { - "epoch": 0.6952408256880734, - "grad_norm": 3.107763639677989, - "learning_rate": 5.145145659920348e-06, - "loss": 1.0017, + "epoch": 0.6638379414180126, + "grad_norm": 0.38873579028440075, + "learning_rate": 6.132363994995194e-06, + "loss": 1.2348, "step": 2425 }, { - "epoch": 0.6966743119266054, - "grad_norm": 3.3802618734734793, - "learning_rate": 5.101458286472618e-06, - "loss": 1.0718, + "epoch": 0.6652066794415549, + "grad_norm": 0.39405073870487267, + "learning_rate": 6.088339106534971e-06, + "loss": 1.2659, "step": 2430 }, { - "epoch": 0.6981077981651376, - "grad_norm": 2.918248752013308, - "learning_rate": 5.05789357869255e-06, - "loss": 1.0316, + "epoch": 0.6665754174650972, + "grad_norm": 0.3833620202872981, + "learning_rate": 6.044403548770735e-06, + "loss": 1.3103, "step": 2435 }, { - "epoch": 0.6995412844036697, - "grad_norm": 3.189799059727885, - "learning_rate": 5.01445262749542e-06, - "loss": 1.0851, + "epoch": 0.6679441554886395, + "grad_norm": 0.39527725366864974, + "learning_rate": 6.000558325059894e-06, + "loss": 1.3423, "step": 2440 }, { - "epoch": 0.7009747706422018, - "grad_norm": 3.2475620018827676, - "learning_rate": 4.9711365206974716e-06, - "loss": 1.0537, + "epoch": 0.6693128935121818, + "grad_norm": 0.38996615233504806, + "learning_rate": 5.956804436696904e-06, + "loss": 1.303, "step": 2445 }, { - "epoch": 0.7024082568807339, - "grad_norm": 3.0707000997716047, - "learning_rate": 4.927946342988699e-06, - "loss": 1.0271, + "epoch": 0.670681631535724, + "grad_norm": 0.3943489977366666, + "learning_rate": 5.9131428828903905e-06, + "loss": 1.3547, "step": 2450 }, { - "epoch": 0.7038417431192661, - "grad_norm": 2.9176475930747543, - "learning_rate": 4.884883175905671e-06, - "loss": 1.0535, + "epoch": 0.6720503695592663, + "grad_norm": 0.3899208460332646, + "learning_rate": 5.8695746607403285e-06, + "loss": 1.2969, "step": 2455 }, { - "epoch": 0.7052752293577982, - "grad_norm": 3.0577673743440923, - "learning_rate": 4.8419480978044395e-06, - "loss": 1.0848, + "epoch": 0.6734191075828087, + "grad_norm": 0.3940611248671062, + "learning_rate": 5.826100765215273e-06, + "loss": 1.318, "step": 2460 }, { - "epoch": 0.7067087155963303, - "grad_norm": 3.3735135945939136, - "learning_rate": 4.799142183833561e-06, - "loss": 1.0651, + "epoch": 0.674787845606351, + "grad_norm": 0.3930083338553073, + "learning_rate": 5.782722189129655e-06, + "loss": 1.2906, "step": 2465 }, { - "epoch": 0.7081422018348624, - "grad_norm": 3.170994136975781, - "learning_rate": 4.75646650590715e-06, - "loss": 1.0736, + "epoch": 0.6761565836298933, + "grad_norm": 0.39311675749157776, + "learning_rate": 5.739439923121077e-06, + "loss": 1.3569, "step": 2470 }, { - "epoch": 0.7095756880733946, - "grad_norm": 2.993299399999142, - "learning_rate": 4.713922132678055e-06, - "loss": 1.0415, + "epoch": 0.6775253216534355, + "grad_norm": 0.4114954929599597, + "learning_rate": 5.6962549556277134e-06, + "loss": 1.3773, "step": 2475 }, { - "epoch": 0.7110091743119266, - "grad_norm": 3.4225173298945117, - "learning_rate": 4.671510129511074e-06, - "loss": 1.0363, + "epoch": 0.6788940596769778, + "grad_norm": 0.38652001787654966, + "learning_rate": 5.653168272865724e-06, + "loss": 1.3008, "step": 2480 }, { - "epoch": 0.7124426605504587, - "grad_norm": 3.0344762068899365, - "learning_rate": 4.629231558456306e-06, - "loss": 1.0414, + "epoch": 0.6802627977005201, + "grad_norm": 0.3878760723325068, + "learning_rate": 5.6101808588067505e-06, + "loss": 1.2992, "step": 2485 }, { - "epoch": 0.7138761467889908, - "grad_norm": 3.1844264445491297, - "learning_rate": 4.587087478222539e-06, - "loss": 1.0714, + "epoch": 0.6816315357240624, + "grad_norm": 0.39884745029373364, + "learning_rate": 5.56729369515542e-06, + "loss": 1.3088, "step": 2490 }, { - "epoch": 0.7153096330275229, - "grad_norm": 3.09219275586818, - "learning_rate": 4.545078944150728e-06, - "loss": 1.017, + "epoch": 0.6830002737476047, + "grad_norm": 0.38961302860625757, + "learning_rate": 5.52450776132694e-06, + "loss": 1.2904, "step": 2495 }, { - "epoch": 0.716743119266055, - "grad_norm": 3.4728979799373976, - "learning_rate": 4.5032070081876e-06, - "loss": 1.0612, + "epoch": 0.684369011771147, + "grad_norm": 0.3970778873479824, + "learning_rate": 5.481824034424741e-06, + "loss": 1.3464, "step": 2500 }, { - "epoch": 0.7181766055045872, - "grad_norm": 3.063801560649391, - "learning_rate": 4.4614727188592835e-06, - "loss": 1.0798, + "epoch": 0.6857377497946893, + "grad_norm": 0.4014161218820373, + "learning_rate": 5.439243489218138e-06, + "loss": 1.3376, "step": 2505 }, { - "epoch": 0.7196100917431193, - "grad_norm": 3.823900363914515, - "learning_rate": 4.419877121245058e-06, - "loss": 1.0435, + "epoch": 0.6871064878182316, + "grad_norm": 0.3874962119825426, + "learning_rate": 5.396767098120087e-06, + "loss": 1.3078, "step": 2510 }, { - "epoch": 0.7210435779816514, - "grad_norm": 2.8618913314337107, - "learning_rate": 4.378421256951192e-06, - "loss": 1.0102, + "epoch": 0.6884752258417739, + "grad_norm": 0.37396226223807555, + "learning_rate": 5.354395831164982e-06, + "loss": 1.3097, "step": 2515 }, { - "epoch": 0.7224770642201835, - "grad_norm": 2.9378204976736466, - "learning_rate": 4.337106164084861e-06, - "loss": 1.0824, + "epoch": 0.6898439638653162, + "grad_norm": 0.3792883560955228, + "learning_rate": 5.312130655986485e-06, + "loss": 1.3158, "step": 2520 }, { - "epoch": 0.7239105504587156, - "grad_norm": 2.945681604419468, - "learning_rate": 4.295932877228132e-06, - "loss": 0.9985, + "epoch": 0.6912127018888585, + "grad_norm": 0.3798203691318429, + "learning_rate": 5.269972537795434e-06, + "loss": 1.3087, "step": 2525 }, { - "epoch": 0.7253440366972477, - "grad_norm": 3.106814303596456, - "learning_rate": 4.254902427412082e-06, - "loss": 1.0317, + "epoch": 0.6925814399124007, + "grad_norm": 0.38253235315284295, + "learning_rate": 5.227922439357823e-06, + "loss": 1.3198, "step": 2530 }, { - "epoch": 0.7267775229357798, - "grad_norm": 3.1281573814995363, - "learning_rate": 4.214015842090969e-06, - "loss": 1.0491, + "epoch": 0.693950177935943, + "grad_norm": 0.3817014501814054, + "learning_rate": 5.1859813209727775e-06, + "loss": 1.321, "step": 2535 }, { - "epoch": 0.7282110091743119, - "grad_norm": 2.795954987429199, - "learning_rate": 4.173274145116491e-06, - "loss": 0.9885, + "epoch": 0.6953189159594854, + "grad_norm": 0.3950376035714224, + "learning_rate": 5.144150140450643e-06, + "loss": 1.3024, "step": 2540 }, { - "epoch": 0.729644495412844, - "grad_norm": 3.3191476249085565, - "learning_rate": 4.1326783567121685e-06, - "loss": 1.0517, + "epoch": 0.6966876539830277, + "grad_norm": 0.4084629435066235, + "learning_rate": 5.102429853091128e-06, + "loss": 1.3437, "step": 2545 }, { - "epoch": 0.7310779816513762, - "grad_norm": 3.07441160825076, - "learning_rate": 4.092229493447788e-06, - "loss": 1.0512, + "epoch": 0.69805639200657, + "grad_norm": 0.3808185831587325, + "learning_rate": 5.060821411661459e-06, + "loss": 1.3264, "step": 2550 }, { - "epoch": 0.7325114678899083, - "grad_norm": 3.3162975946352824, - "learning_rate": 4.051928568213942e-06, - "loss": 1.0476, + "epoch": 0.6994251300301122, + "grad_norm": 0.3937638284493571, + "learning_rate": 5.019325766374625e-06, + "loss": 1.314, "step": 2555 }, { - "epoch": 0.7339449541284404, - "grad_norm": 3.222715903509193, - "learning_rate": 4.0117765901966635e-06, - "loss": 1.0401, + "epoch": 0.7007938680536545, + "grad_norm": 0.3920386340433558, + "learning_rate": 4.977943864867712e-06, + "loss": 1.3271, "step": 2560 }, { - "epoch": 0.7353784403669725, - "grad_norm": 3.1385505256509014, - "learning_rate": 3.9717745648521646e-06, - "loss": 1.0243, + "epoch": 0.7021626060771968, + "grad_norm": 0.3898729624487372, + "learning_rate": 4.936676652180215e-06, + "loss": 1.3467, "step": 2565 }, { - "epoch": 0.7368119266055045, - "grad_norm": 3.5703129773472506, - "learning_rate": 3.931923493881659e-06, - "loss": 1.0319, + "epoch": 0.7035313441007391, + "grad_norm": 0.39902316286587364, + "learning_rate": 4.89552507073248e-06, + "loss": 1.3102, "step": 2570 }, { - "epoch": 0.7382454128440367, - "grad_norm": 3.0801290385363393, - "learning_rate": 3.892224375206256e-06, - "loss": 1.0382, + "epoch": 0.7049000821242815, + "grad_norm": 0.3673934948431135, + "learning_rate": 4.854490060304192e-06, + "loss": 1.2612, "step": 2575 }, { - "epoch": 0.7396788990825688, - "grad_norm": 3.2917177730741676, - "learning_rate": 3.8526782029420005e-06, - "loss": 1.1191, + "epoch": 0.7062688201478237, + "grad_norm": 0.3922497888421968, + "learning_rate": 4.813572558012892e-06, + "loss": 1.329, "step": 2580 }, { - "epoch": 0.7411123853211009, - "grad_norm": 3.1425801332889827, - "learning_rate": 3.8132859673749688e-06, - "loss": 1.0481, + "epoch": 0.707637558171366, + "grad_norm": 0.3860080742782407, + "learning_rate": 4.772773498292579e-06, + "loss": 1.3332, "step": 2585 }, { - "epoch": 0.742545871559633, - "grad_norm": 3.1741701966057447, - "learning_rate": 3.774048654936454e-06, - "loss": 1.0072, + "epoch": 0.7090062961949083, + "grad_norm": 0.3695087849805836, + "learning_rate": 4.732093812872391e-06, + "loss": 1.2697, "step": 2590 }, { - "epoch": 0.7439793577981652, - "grad_norm": 3.4923814958157466, - "learning_rate": 3.7349672481782894e-06, - "loss": 1.01, + "epoch": 0.7103750342184506, + "grad_norm": 0.4012769440751328, + "learning_rate": 4.691534430755302e-06, + "loss": 1.3213, "step": 2595 }, { - "epoch": 0.7454128440366973, - "grad_norm": 3.121310877940095, - "learning_rate": 3.6960427257482343e-06, - "loss": 1.0804, + "epoch": 0.7117437722419929, + "grad_norm": 0.38657730746893026, + "learning_rate": 4.651096278196916e-06, + "loss": 1.3665, "step": 2600 }, { - "epoch": 0.7468463302752294, - "grad_norm": 3.1172405011336908, - "learning_rate": 3.657276062365457e-06, - "loss": 1.0313, + "epoch": 0.7131125102655351, + "grad_norm": 0.38560469814966725, + "learning_rate": 4.610780278684315e-06, + "loss": 1.3369, "step": 2605 }, { - "epoch": 0.7482798165137615, - "grad_norm": 3.0906434983571223, - "learning_rate": 3.618668228796143e-06, - "loss": 1.0378, + "epoch": 0.7144812482890774, + "grad_norm": 0.39542221048440374, + "learning_rate": 4.570587352914977e-06, + "loss": 1.3048, "step": 2610 }, { - "epoch": 0.7497133027522935, - "grad_norm": 3.1863927645425725, - "learning_rate": 3.580220191829178e-06, - "loss": 0.9929, + "epoch": 0.7158499863126198, + "grad_norm": 0.3910002548641959, + "learning_rate": 4.530518418775734e-06, + "loss": 1.3247, "step": 2615 }, { - "epoch": 0.7511467889908257, - "grad_norm": 3.0251252557702437, - "learning_rate": 3.5419329142519433e-06, - "loss": 1.0814, + "epoch": 0.7172187243361621, + "grad_norm": 0.3825608407076146, + "learning_rate": 4.490574391321814e-06, + "loss": 1.2967, "step": 2620 }, { - "epoch": 0.7525802752293578, - "grad_norm": 4.943706664249104, - "learning_rate": 3.5038073548261888e-06, - "loss": 1.0629, + "epoch": 0.7185874623597044, + "grad_norm": 0.4028460389272689, + "learning_rate": 4.450756182755963e-06, + "loss": 1.3671, "step": 2625 }, { - "epoch": 0.7540137614678899, - "grad_norm": 3.16322658334476, - "learning_rate": 3.46584446826405e-06, - "loss": 0.9881, + "epoch": 0.7199562003832466, + "grad_norm": 0.40485919090431155, + "learning_rate": 4.411064702407585e-06, + "loss": 1.3556, "step": 2630 }, { - "epoch": 0.755447247706422, - "grad_norm": 3.125095249964922, - "learning_rate": 3.428045205204125e-06, - "loss": 0.9923, + "epoch": 0.7213249384067889, + "grad_norm": 0.39021669719654245, + "learning_rate": 4.371500856711988e-06, + "loss": 1.3774, "step": 2635 }, { - "epoch": 0.7568807339449541, - "grad_norm": 2.973224447074909, - "learning_rate": 3.3904105121876764e-06, - "loss": 1.1052, + "epoch": 0.7226936764303312, + "grad_norm": 0.3958279714364836, + "learning_rate": 4.332065549189697e-06, + "loss": 1.2982, "step": 2640 }, { - "epoch": 0.7583142201834863, - "grad_norm": 2.996637163056179, - "learning_rate": 3.3529413316349145e-06, - "loss": 1.1049, + "epoch": 0.7240624144538735, + "grad_norm": 0.4060690524084766, + "learning_rate": 4.292759680425794e-06, + "loss": 1.3767, "step": 2645 }, { - "epoch": 0.7597477064220184, - "grad_norm": 2.935944096948497, - "learning_rate": 3.3156386018214193e-06, - "loss": 1.0712, + "epoch": 0.7254311524774159, + "grad_norm": 0.40926421108177974, + "learning_rate": 4.253584148049369e-06, + "loss": 1.339, "step": 2650 }, { - "epoch": 0.7611811926605505, - "grad_norm": 3.038494065615691, - "learning_rate": 3.2785032568546304e-06, - "loss": 1.0472, + "epoch": 0.7267998905009582, + "grad_norm": 0.37375698755072967, + "learning_rate": 4.214539846713024e-06, + "loss": 1.2961, "step": 2655 }, { - "epoch": 0.7626146788990825, - "grad_norm": 3.2871929054536397, - "learning_rate": 3.2415362266504626e-06, - "loss": 1.0342, + "epoch": 0.7281686285245004, + "grad_norm": 0.3968852233360893, + "learning_rate": 4.175627668072425e-06, + "loss": 1.2962, "step": 2660 }, { - "epoch": 0.7640481651376146, - "grad_norm": 3.098314519979228, - "learning_rate": 3.2047384369100065e-06, - "loss": 1.0903, + "epoch": 0.7295373665480427, + "grad_norm": 0.4073445114415598, + "learning_rate": 4.136848500765948e-06, + "loss": 1.4056, "step": 2665 }, { - "epoch": 0.7654816513761468, - "grad_norm": 2.9569161427598787, - "learning_rate": 3.16811080909637e-06, - "loss": 1.0007, + "epoch": 0.730906104571585, + "grad_norm": 0.3950383171619063, + "learning_rate": 4.098203230394399e-06, + "loss": 1.3192, "step": 2670 }, { - "epoch": 0.7669151376146789, - "grad_norm": 3.355325001454353, - "learning_rate": 3.1316542604115853e-06, - "loss": 1.0525, + "epoch": 0.7322748425951273, + "grad_norm": 0.3986322017273241, + "learning_rate": 4.059692739500761e-06, + "loss": 1.3485, "step": 2675 }, { - "epoch": 0.768348623853211, - "grad_norm": 3.7866942452127113, - "learning_rate": 3.095369703773652e-06, - "loss": 1.0607, + "epoch": 0.7336435806186696, + "grad_norm": 0.3848930065792978, + "learning_rate": 4.02131790755006e-06, + "loss": 1.378, "step": 2680 }, { - "epoch": 0.7697821100917431, - "grad_norm": 2.8549205403217024, - "learning_rate": 3.0592580477936606e-06, - "loss": 1.0612, + "epoch": 0.7350123186422118, + "grad_norm": 0.39139666434515563, + "learning_rate": 3.983079610909283e-06, + "loss": 1.3334, "step": 2685 }, { - "epoch": 0.7712155963302753, - "grad_norm": 3.1911721995721547, - "learning_rate": 3.0233201967530647e-06, - "loss": 1.0756, + "epoch": 0.7363810566657542, + "grad_norm": 0.38858454704969564, + "learning_rate": 3.944978722827347e-06, + "loss": 1.3235, "step": 2690 }, { - "epoch": 0.7726490825688074, - "grad_norm": 3.254594926845702, - "learning_rate": 2.987557050581017e-06, - "loss": 1.0928, + "epoch": 0.7377497946892965, + "grad_norm": 0.390122324459421, + "learning_rate": 3.907016113415166e-06, + "loss": 1.3583, "step": 2695 }, { - "epoch": 0.7740825688073395, - "grad_norm": 3.029915420885045, - "learning_rate": 2.9519695048318353e-06, - "loss": 0.9834, + "epoch": 0.7391185327128388, + "grad_norm": 0.3919502066927978, + "learning_rate": 3.869192649625792e-06, + "loss": 1.3092, "step": 2700 }, { - "epoch": 0.7755160550458715, - "grad_norm": 3.7010803603107614, - "learning_rate": 2.9165584506625864e-06, - "loss": 1.0616, + "epoch": 0.7404872707363811, + "grad_norm": 0.39777490757009926, + "learning_rate": 3.831509195234598e-06, + "loss": 1.3358, "step": 2705 }, { - "epoch": 0.7769495412844036, - "grad_norm": 3.0973140862278155, - "learning_rate": 2.8813247748107665e-06, - "loss": 1.0493, + "epoch": 0.7418560087599233, + "grad_norm": 0.39353868348346743, + "learning_rate": 3.793966610819545e-06, + "loss": 1.3515, "step": 2710 }, { - "epoch": 0.7783830275229358, - "grad_norm": 3.3576787529696217, - "learning_rate": 2.8462693595720938e-06, - "loss": 1.0743, + "epoch": 0.7432247467834656, + "grad_norm": 0.39342447998988617, + "learning_rate": 3.756565753741569e-06, + "loss": 1.2885, "step": 2715 }, { - "epoch": 0.7798165137614679, - "grad_norm": 3.074976481415647, - "learning_rate": 2.8113930827784076e-06, - "loss": 1.0157, + "epoch": 0.7445934848070079, + "grad_norm": 0.37781062124467246, + "learning_rate": 3.7193074781249585e-06, + "loss": 1.276, "step": 2720 }, { - "epoch": 0.78125, - "grad_norm": 3.0753397191274945, - "learning_rate": 2.776696817775707e-06, - "loss": 1.0266, + "epoch": 0.7459622228305502, + "grad_norm": 0.38397833073923665, + "learning_rate": 3.6821926348378666e-06, + "loss": 1.3031, "step": 2725 }, { - "epoch": 0.7826834862385321, - "grad_norm": 3.09366006809429, - "learning_rate": 2.7421814334022624e-06, - "loss": 1.0243, + "epoch": 0.7473309608540926, + "grad_norm": 0.3993776914034358, + "learning_rate": 3.6452220714728883e-06, + "loss": 1.3719, "step": 2730 }, { - "epoch": 0.7841169724770642, - "grad_norm": 3.6168913804023326, - "learning_rate": 2.7078477939668625e-06, - "loss": 1.0652, + "epoch": 0.7486996988776348, + "grad_norm": 0.387591612397714, + "learning_rate": 3.608396632327684e-06, + "loss": 1.3006, "step": 2735 }, { - "epoch": 0.7855504587155964, - "grad_norm": 4.050361789957049, - "learning_rate": 2.673696759227177e-06, - "loss": 1.0636, + "epoch": 0.7500684369011771, + "grad_norm": 0.4059102325008591, + "learning_rate": 3.5717171583857115e-06, + "loss": 1.3689, "step": 2740 }, { - "epoch": 0.7869839449541285, - "grad_norm": 3.249151536222312, - "learning_rate": 2.639729184368226e-06, - "loss": 1.0643, + "epoch": 0.7514371749247194, + "grad_norm": 0.3772375625162727, + "learning_rate": 3.5351844872970233e-06, + "loss": 1.3091, "step": 2745 }, { - "epoch": 0.7884174311926605, - "grad_norm": 2.9287260819204692, - "learning_rate": 2.6059459199809545e-06, - "loss": 1.0028, + "epoch": 0.7528059129482617, + "grad_norm": 0.3853940006094772, + "learning_rate": 3.498799453359124e-06, + "loss": 1.3454, "step": 2750 }, { - "epoch": 0.7898509174311926, - "grad_norm": 3.0465964857735663, - "learning_rate": 2.5723478120409474e-06, - "loss": 1.0421, + "epoch": 0.754174650971804, + "grad_norm": 0.3771144405512011, + "learning_rate": 3.462562887497927e-06, + "loss": 1.2998, "step": 2755 }, { - "epoch": 0.7912844036697247, - "grad_norm": 3.153016800252537, - "learning_rate": 2.5389357018872405e-06, - "loss": 1.0201, + "epoch": 0.7555433889953463, + "grad_norm": 0.39604150906533137, + "learning_rate": 3.4264756172487813e-06, + "loss": 1.3129, "step": 2760 }, { - "epoch": 0.7927178899082569, - "grad_norm": 3.115248902197113, - "learning_rate": 2.505710426201239e-06, - "loss": 1.1122, + "epoch": 0.7569121270188885, + "grad_norm": 0.3843657920576998, + "learning_rate": 3.390538466737564e-06, + "loss": 1.2981, "step": 2765 }, { - "epoch": 0.794151376146789, - "grad_norm": 3.456526732392788, - "learning_rate": 2.4726728169857885e-06, - "loss": 0.9736, + "epoch": 0.7582808650424309, + "grad_norm": 0.39238834291334906, + "learning_rate": 3.3547522566618593e-06, + "loss": 1.2697, "step": 2770 }, { - "epoch": 0.7955848623853211, - "grad_norm": 3.078706890825068, - "learning_rate": 2.439823701544328e-06, - "loss": 1.0208, + "epoch": 0.7596496030659732, + "grad_norm": 0.4030855832137369, + "learning_rate": 3.319117804272236e-06, + "loss": 1.3152, "step": 2775 }, { - "epoch": 0.7970183486238532, - "grad_norm": 3.0852747181382485, - "learning_rate": 2.407163902460167e-06, - "loss": 1.0112, + "epoch": 0.7610183410895155, + "grad_norm": 0.38853914068753576, + "learning_rate": 3.283635923353553e-06, + "loss": 1.3125, "step": 2780 }, { - "epoch": 0.7984518348623854, - "grad_norm": 2.9795351626224202, - "learning_rate": 2.3746942375758986e-06, - "loss": 1.0495, + "epoch": 0.7623870791130578, + "grad_norm": 0.39575307616938343, + "learning_rate": 3.248307424206395e-06, + "loss": 1.4215, "step": 2785 }, { - "epoch": 0.7998853211009175, - "grad_norm": 3.662965002906388, - "learning_rate": 2.3424155199729206e-06, - "loss": 1.0088, + "epoch": 0.7637558171366, + "grad_norm": 0.3903008557955737, + "learning_rate": 3.2131331136285717e-06, + "loss": 1.3034, "step": 2790 }, { - "epoch": 0.8013188073394495, - "grad_norm": 2.9979503759811146, - "learning_rate": 2.310328557951065e-06, - "loss": 0.9978, + "epoch": 0.7651245551601423, + "grad_norm": 0.38760587663102863, + "learning_rate": 3.1781137948966754e-06, + "loss": 1.291, "step": 2795 }, { - "epoch": 0.8027522935779816, - "grad_norm": 3.1993479502657856, - "learning_rate": 2.2784341550083577e-06, - "loss": 1.08, + "epoch": 0.7664932931836846, + "grad_norm": 0.3917779069091076, + "learning_rate": 3.1432502677477494e-06, + "loss": 1.3318, "step": 2800 }, { - "epoch": 0.8041857798165137, - "grad_norm": 2.9645752085086716, - "learning_rate": 2.2467331098209098e-06, - "loss": 1.0019, + "epoch": 0.767862031207227, + "grad_norm": 0.3978532288818969, + "learning_rate": 3.108543328361017e-06, + "loss": 1.3074, "step": 2805 }, { - "epoch": 0.8056192660550459, - "grad_norm": 3.0604845129074856, - "learning_rate": 2.215226216222911e-06, - "loss": 0.9847, + "epoch": 0.7692307692307693, + "grad_norm": 0.3931552217963087, + "learning_rate": 3.0739937693397113e-06, + "loss": 1.3181, "step": 2810 }, { - "epoch": 0.807052752293578, - "grad_norm": 3.1249395082814235, - "learning_rate": 2.1839142631867396e-06, - "loss": 1.003, + "epoch": 0.7705995072543115, + "grad_norm": 0.38132685696899987, + "learning_rate": 3.0396023796929597e-06, + "loss": 1.3202, "step": 2815 }, { - "epoch": 0.8084862385321101, - "grad_norm": 3.5887794377789746, - "learning_rate": 2.1527980348032263e-06, - "loss": 1.0915, + "epoch": 0.7719682452778538, + "grad_norm": 0.3895674318092009, + "learning_rate": 3.0053699448177687e-06, + "loss": 1.2912, "step": 2820 }, { - "epoch": 0.8099197247706422, - "grad_norm": 2.9913673167068056, - "learning_rate": 2.121878310262008e-06, - "loss": 1.0346, + "epoch": 0.7733369833013961, + "grad_norm": 0.3923145835781509, + "learning_rate": 2.971297246481101e-06, + "loss": 1.2874, "step": 2825 }, { - "epoch": 0.8113532110091743, - "grad_norm": 3.076522728603463, - "learning_rate": 2.0911558638320117e-06, - "loss": 1.0254, + "epoch": 0.7747057213249384, + "grad_norm": 0.3964424700594031, + "learning_rate": 2.937385062802004e-06, + "loss": 1.3723, "step": 2830 }, { - "epoch": 0.8127866972477065, - "grad_norm": 3.4952190302417625, - "learning_rate": 2.0606314648420757e-06, - "loss": 1.0164, + "epoch": 0.7760744593484807, + "grad_norm": 0.39752354179970156, + "learning_rate": 2.9036341682338466e-06, + "loss": 1.3329, "step": 2835 }, { - "epoch": 0.8142201834862385, - "grad_norm": 2.987547482751348, - "learning_rate": 2.0303058776616847e-06, - "loss": 1.0241, + "epoch": 0.7774431973720229, + "grad_norm": 0.39323110376722525, + "learning_rate": 2.870045333546644e-06, + "loss": 1.2905, "step": 2840 }, { - "epoch": 0.8156536697247706, - "grad_norm": 3.0295895740792265, - "learning_rate": 2.0001798616818137e-06, - "loss": 0.9397, + "epoch": 0.7788119353955653, + "grad_norm": 0.3924911670687984, + "learning_rate": 2.8366193258094355e-06, + "loss": 1.292, "step": 2845 }, { - "epoch": 0.8170871559633027, - "grad_norm": 3.2124562930166376, - "learning_rate": 1.970254171295931e-06, - "loss": 1.0349, + "epoch": 0.7801806734191076, + "grad_norm": 0.372337059578975, + "learning_rate": 2.8033569083727797e-06, + "loss": 1.3368, "step": 2850 }, { - "epoch": 0.8185206422018348, - "grad_norm": 2.7811469673449403, - "learning_rate": 1.940529555881101e-06, - "loss": 1.0372, + "epoch": 0.7815494114426499, + "grad_norm": 0.386256983183401, + "learning_rate": 2.7702588408513276e-06, + "loss": 1.3236, "step": 2855 }, { - "epoch": 0.819954128440367, - "grad_norm": 3.0716049369726854, - "learning_rate": 1.9110067597792094e-06, - "loss": 1.0258, + "epoch": 0.7829181494661922, + "grad_norm": 0.4151085195929508, + "learning_rate": 2.7373258791064572e-06, + "loss": 1.3342, "step": 2860 }, { - "epoch": 0.8213876146788991, - "grad_norm": 3.31583577307639, - "learning_rate": 1.8816865222783354e-06, - "loss": 1.0659, + "epoch": 0.7842868874897345, + "grad_norm": 0.39410387314037304, + "learning_rate": 2.7045587752290224e-06, + "loss": 1.28, "step": 2865 }, { - "epoch": 0.8228211009174312, - "grad_norm": 3.3545193733702616, - "learning_rate": 1.8525695775942376e-06, - "loss": 0.997, + "epoch": 0.7856556255132767, + "grad_norm": 0.40070665770653285, + "learning_rate": 2.6719582775221862e-06, + "loss": 1.3336, "step": 2870 }, { - "epoch": 0.8242545871559633, - "grad_norm": 3.082738506583357, - "learning_rate": 1.8236566548519664e-06, - "loss": 1.0109, + "epoch": 0.787024363536819, + "grad_norm": 0.3842777541641571, + "learning_rate": 2.6395251304843137e-06, + "loss": 1.2757, "step": 2875 }, { - "epoch": 0.8256880733944955, - "grad_norm": 2.9272236463346326, - "learning_rate": 1.7949484780675941e-06, - "loss": 1.0096, + "epoch": 0.7883931015603614, + "grad_norm": 0.40580643578053277, + "learning_rate": 2.6072600747919773e-06, + "loss": 1.3196, "step": 2880 }, { - "epoch": 0.8271215596330275, - "grad_norm": 2.9288594245035937, - "learning_rate": 1.7664457661301103e-06, - "loss": 1.0126, + "epoch": 0.7897618395839037, + "grad_norm": 0.388179506322075, + "learning_rate": 2.575163847283053e-06, + "loss": 1.293, "step": 2885 }, { - "epoch": 0.8285550458715596, - "grad_norm": 2.9288774104923077, - "learning_rate": 1.7381492327834004e-06, - "loss": 1.0499, + "epoch": 0.791130577607446, + "grad_norm": 0.3732450173203748, + "learning_rate": 2.543237180939875e-06, + "loss": 1.2751, "step": 2890 }, { - "epoch": 0.8299885321100917, - "grad_norm": 2.7877742435190065, - "learning_rate": 1.7100595866083713e-06, - "loss": 1.0041, + "epoch": 0.7924993156309882, + "grad_norm": 0.38506878802774175, + "learning_rate": 2.5114808048725035e-06, + "loss": 1.3083, "step": 2895 }, { - "epoch": 0.8314220183486238, - "grad_norm": 3.0286412164914656, - "learning_rate": 1.6821775310052212e-06, - "loss": 0.9811, + "epoch": 0.7938680536545305, + "grad_norm": 0.4075860384902042, + "learning_rate": 2.479895444302086e-06, + "loss": 1.307, "step": 2900 }, { - "epoch": 0.832855504587156, - "grad_norm": 2.7532642945829537, - "learning_rate": 1.65450376417582e-06, - "loss": 1.0096, + "epoch": 0.7952367916780728, + "grad_norm": 0.39287683770161946, + "learning_rate": 2.4484818205442763e-06, + "loss": 1.3645, "step": 2905 }, { - "epoch": 0.8342889908256881, - "grad_norm": 2.9265157315568375, - "learning_rate": 1.6270389791062146e-06, - "loss": 1.0491, + "epoch": 0.7966055297016151, + "grad_norm": 0.3882005380231924, + "learning_rate": 2.417240650992767e-06, + "loss": 1.3512, "step": 2910 }, { - "epoch": 0.8357224770642202, - "grad_norm": 3.216652891176282, - "learning_rate": 1.5997838635492936e-06, - "loss": 1.0956, + "epoch": 0.7979742677251574, + "grad_norm": 0.3866431713970654, + "learning_rate": 2.3861726491029237e-06, + "loss": 1.2793, "step": 2915 }, { - "epoch": 0.8371559633027523, - "grad_norm": 3.242345149185627, - "learning_rate": 1.5727391000075542e-06, - "loss": 0.9864, + "epoch": 0.7993430057486997, + "grad_norm": 0.3798968085022952, + "learning_rate": 2.355278524375465e-06, + "loss": 1.2865, "step": 2920 }, { - "epoch": 0.8385894495412844, - "grad_norm": 3.2259601086687204, - "learning_rate": 1.5459053657160084e-06, - "loss": 1.1009, + "epoch": 0.800711743772242, + "grad_norm": 0.39453328832547074, + "learning_rate": 2.324558982340275e-06, + "loss": 1.2576, "step": 2925 }, { - "epoch": 0.8400229357798165, - "grad_norm": 2.744523780896053, - "learning_rate": 1.5192833326252377e-06, - "loss": 1.0163, + "epoch": 0.8020804817957843, + "grad_norm": 0.3764906658011197, + "learning_rate": 2.2940147245402944e-06, + "loss": 1.3167, "step": 2930 }, { - "epoch": 0.8414564220183486, - "grad_norm": 2.828910888875833, - "learning_rate": 1.4928736673845534e-06, - "loss": 1.0037, + "epoch": 0.8034492198193266, + "grad_norm": 0.3823354264503374, + "learning_rate": 2.2636464485154875e-06, + "loss": 1.334, "step": 2935 }, { - "epoch": 0.8428899082568807, - "grad_norm": 3.062048982707707, - "learning_rate": 1.4666770313253054e-06, - "loss": 1.0248, + "epoch": 0.8048179578428689, + "grad_norm": 0.3911098766218853, + "learning_rate": 2.23345484778692e-06, + "loss": 1.3856, "step": 2940 }, { - "epoch": 0.8443233944954128, - "grad_norm": 3.1169358681462174, - "learning_rate": 1.4406940804443303e-06, - "loss": 1.0094, + "epoch": 0.8061866958664111, + "grad_norm": 0.4024307217930752, + "learning_rate": 2.2034406118409178e-06, + "loss": 1.3704, "step": 2945 }, { - "epoch": 0.845756880733945, - "grad_norm": 3.045993870767794, - "learning_rate": 1.4149254653875167e-06, - "loss": 1.0523, + "epoch": 0.8075554338899534, + "grad_norm": 0.38312668558103635, + "learning_rate": 2.1736044261133305e-06, + "loss": 1.32, "step": 2950 }, { - "epoch": 0.8471903669724771, - "grad_norm": 3.0564501410586637, - "learning_rate": 1.389371831433507e-06, - "loss": 1.0095, + "epoch": 0.8089241719134957, + "grad_norm": 0.39944235808095113, + "learning_rate": 2.1439469719738615e-06, + "loss": 1.3348, "step": 2955 }, { - "epoch": 0.8486238532110092, - "grad_norm": 3.4108278621499917, - "learning_rate": 1.3640338184775526e-06, - "loss": 1.0366, + "epoch": 0.8102929099370381, + "grad_norm": 0.37362368240209753, + "learning_rate": 2.1144689267105213e-06, + "loss": 1.3058, "step": 2960 }, { - "epoch": 0.8500573394495413, - "grad_norm": 3.309723042860519, - "learning_rate": 1.3389120610154804e-06, - "loss": 1.015, + "epoch": 0.8116616479605804, + "grad_norm": 0.3977426993128578, + "learning_rate": 2.0851709635141526e-06, + "loss": 1.329, "step": 2965 }, { - "epoch": 0.8514908256880734, - "grad_norm": 3.40140275029476, - "learning_rate": 1.3140071881278106e-06, - "loss": 1.1159, + "epoch": 0.8130303859841227, + "grad_norm": 0.390250388707541, + "learning_rate": 2.0560537514630595e-06, + "loss": 1.3435, "step": 2970 }, { - "epoch": 0.8529243119266054, - "grad_norm": 3.0642718892804344, - "learning_rate": 1.2893198234639904e-06, - "loss": 0.983, + "epoch": 0.8143991240076649, + "grad_norm": 0.41070469591969644, + "learning_rate": 2.0271179555077357e-06, + "loss": 1.3172, "step": 2975 }, { - "epoch": 0.8543577981651376, - "grad_norm": 3.5196505954999315, - "learning_rate": 1.2648505852267956e-06, - "loss": 1.0422, + "epoch": 0.8157678620312072, + "grad_norm": 0.378348117809942, + "learning_rate": 1.998364236455661e-06, + "loss": 1.3027, "step": 2980 }, { - "epoch": 0.8557912844036697, - "grad_norm": 3.012388196153649, - "learning_rate": 1.240600086156839e-06, - "loss": 1.0161, + "epoch": 0.8171366000547495, + "grad_norm": 0.380190609171277, + "learning_rate": 1.969793250956221e-06, + "loss": 1.2577, "step": 2985 }, { - "epoch": 0.8572247706422018, - "grad_norm": 4.224630882517724, - "learning_rate": 1.2165689335172248e-06, - "loss": 1.0435, + "epoch": 0.8185053380782918, + "grad_norm": 0.38655656053220244, + "learning_rate": 1.9414056514857205e-06, + "loss": 1.3137, "step": 2990 }, { - "epoch": 0.8586582568807339, - "grad_norm": 3.3927019439454558, - "learning_rate": 1.1927577290783488e-06, - "loss": 1.0549, + "epoch": 0.8198740761018342, + "grad_norm": 0.38867016212920535, + "learning_rate": 1.913202086332463e-06, + "loss": 1.3597, "step": 2995 }, { - "epoch": 0.8600917431192661, - "grad_norm": 2.8680598337924965, - "learning_rate": 1.169167069102828e-06, - "loss": 1.062, + "epoch": 0.8212428141253764, + "grad_norm": 0.38641157633799067, + "learning_rate": 1.8851831995819569e-06, + "loss": 1.3184, "step": 3000 }, { - "epoch": 0.8615252293577982, - "grad_norm": 3.2446852778299453, - "learning_rate": 1.1457975443305625e-06, - "loss": 1.0439, + "epoch": 0.8226115521489187, + "grad_norm": 0.39081114681314566, + "learning_rate": 1.8573496311022133e-06, + "loss": 1.3219, "step": 3005 }, { - "epoch": 0.8629587155963303, - "grad_norm": 3.074025983059374, - "learning_rate": 1.1226497399639501e-06, - "loss": 1.0202, + "epoch": 0.823980290172461, + "grad_norm": 0.37733544335013924, + "learning_rate": 1.8297020165291158e-06, + "loss": 1.2927, "step": 3010 }, { - "epoch": 0.8643922018348624, - "grad_norm": 2.9805357814293423, - "learning_rate": 1.0997242356532335e-06, - "loss": 0.9824, + "epoch": 0.8253490281960033, + "grad_norm": 0.38748075262103254, + "learning_rate": 1.8022409872519197e-06, + "loss": 1.3184, "step": 3015 }, { - "epoch": 0.8658256880733946, - "grad_norm": 3.377450943710277, - "learning_rate": 1.0770216054819782e-06, - "loss": 1.0292, + "epoch": 0.8267177662195456, + "grad_norm": 0.40138106139790847, + "learning_rate": 1.7749671703988226e-06, + "loss": 1.3312, "step": 3020 }, { - "epoch": 0.8672591743119266, - "grad_norm": 3.203588510488362, - "learning_rate": 1.0545424179526963e-06, - "loss": 1.0008, + "epoch": 0.8280865042430878, + "grad_norm": 0.3876362883781664, + "learning_rate": 1.7478811888226555e-06, + "loss": 1.3101, "step": 3025 }, { - "epoch": 0.8686926605504587, - "grad_norm": 3.3838873878755438, - "learning_rate": 1.03228723597262e-06, - "loss": 1.0276, + "epoch": 0.8294552422666301, + "grad_norm": 0.3936459314036851, + "learning_rate": 1.7209836610866426e-06, + "loss": 1.3431, "step": 3030 }, { - "epoch": 0.8701261467889908, - "grad_norm": 3.0207979512808247, - "learning_rate": 1.0102566168395977e-06, - "loss": 1.0448, + "epoch": 0.8308239802901725, + "grad_norm": 0.38764243587502206, + "learning_rate": 1.694275201450284e-06, + "loss": 1.3386, "step": 3035 }, { - "epoch": 0.8715596330275229, - "grad_norm": 3.294936816848852, - "learning_rate": 9.884511122281427e-07, - "loss": 1.0123, + "epoch": 0.8321927183137148, + "grad_norm": 0.3908568598595201, + "learning_rate": 1.6677564198553332e-06, + "loss": 1.3342, "step": 3040 }, { - "epoch": 0.872993119266055, - "grad_norm": 3.0263492251087802, - "learning_rate": 9.668712681756087e-07, - "loss": 0.9943, + "epoch": 0.8335614563372571, + "grad_norm": 0.38588972696708185, + "learning_rate": 1.6414279219118568e-06, + "loss": 1.3527, "step": 3045 }, { - "epoch": 0.8744266055045872, - "grad_norm": 2.987557644263372, - "learning_rate": 9.455176250685338e-07, - "loss": 1.0392, + "epoch": 0.8349301943607993, + "grad_norm": 0.3907806499233958, + "learning_rate": 1.6152903088844051e-06, + "loss": 1.3104, "step": 3050 }, { - "epoch": 0.8758600917431193, - "grad_norm": 2.827866834943389, - "learning_rate": 9.243907176290945e-07, - "loss": 1.0009, + "epoch": 0.8362989323843416, + "grad_norm": 0.3825824287507951, + "learning_rate": 1.5893441776782947e-06, + "loss": 1.3062, "step": 3055 }, { - "epoch": 0.8772935779816514, - "grad_norm": 3.0479507652012106, - "learning_rate": 9.034910749017211e-07, - "loss": 1.0362, + "epoch": 0.8376676704078839, + "grad_norm": 0.3842728055099776, + "learning_rate": 1.5635901208259608e-06, + "loss": 1.3581, "step": 3060 }, { - "epoch": 0.8787270642201835, - "grad_norm": 3.166615479232945, - "learning_rate": 8.828192202398455e-07, - "loss": 1.0649, + "epoch": 0.8390364084314262, + "grad_norm": 0.3905623616859072, + "learning_rate": 1.5380287264734285e-06, + "loss": 1.3148, "step": 3065 }, { - "epoch": 0.8801605504587156, - "grad_norm": 3.086530733524286, - "learning_rate": 8.623756712928022e-07, - "loss": 1.0161, + "epoch": 0.8404051464549686, + "grad_norm": 0.39844460827488454, + "learning_rate": 1.5126605783668945e-06, + "loss": 1.3074, "step": 3070 }, { - "epoch": 0.8815940366972477, - "grad_norm": 3.2339942958362995, - "learning_rate": 8.421609399928621e-07, - "loss": 1.0274, + "epoch": 0.8417738844785108, + "grad_norm": 0.39217288885390583, + "learning_rate": 1.4874862558393787e-06, + "loss": 1.3171, "step": 3075 }, { - "epoch": 0.8830275229357798, - "grad_norm": 3.1624079222447845, - "learning_rate": 8.221755325424152e-07, - "loss": 0.963, + "epoch": 0.8431426225020531, + "grad_norm": 0.38303308206260517, + "learning_rate": 1.462506333797501e-06, + "loss": 1.2985, "step": 3080 }, { - "epoch": 0.8844610091743119, - "grad_norm": 6.67242001014344, - "learning_rate": 8.024199494012863e-07, - "loss": 0.994, + "epoch": 0.8445113605255954, + "grad_norm": 0.37178133903556354, + "learning_rate": 1.4377213827083602e-06, + "loss": 1.278, "step": 3085 }, { - "epoch": 0.885894495412844, - "grad_norm": 3.0370868865276424, - "learning_rate": 7.828946852742148e-07, - "loss": 1.0551, + "epoch": 0.8458800985491377, + "grad_norm": 0.383873936282195, + "learning_rate": 1.413131968586491e-06, + "loss": 1.2989, "step": 3090 }, { - "epoch": 0.8873279816513762, - "grad_norm": 3.1653796046419185, - "learning_rate": 7.636002290984634e-07, - "loss": 1.0085, + "epoch": 0.84724883657268, + "grad_norm": 0.3877611566994199, + "learning_rate": 1.3887386529809454e-06, + "loss": 1.3543, "step": 3095 }, { - "epoch": 0.8887614678899083, - "grad_norm": 3.018418033888261, - "learning_rate": 7.445370640315642e-07, - "loss": 1.0017, + "epoch": 0.8486175745962223, + "grad_norm": 0.3844140889489012, + "learning_rate": 1.364541992962476e-06, + "loss": 1.345, "step": 3100 }, { - "epoch": 0.8901949541284404, - "grad_norm": 3.543389563066827, - "learning_rate": 7.257056674392359e-07, - "loss": 1.002, + "epoch": 0.8499863126197645, + "grad_norm": 0.3791779816963577, + "learning_rate": 1.3405425411108008e-06, + "loss": 1.3202, "step": 3105 }, { - "epoch": 0.8916284403669725, - "grad_norm": 3.1380320671176194, - "learning_rate": 7.071065108834197e-07, - "loss": 1.0311, + "epoch": 0.8513550506433069, + "grad_norm": 0.37802806800903216, + "learning_rate": 1.3167408455019903e-06, + "loss": 1.3, "step": 3110 }, { - "epoch": 0.8930619266055045, - "grad_norm": 3.107368067479966, - "learning_rate": 6.887400601104688e-07, - "loss": 0.9969, + "epoch": 0.8527237886668492, + "grad_norm": 0.38421255363489804, + "learning_rate": 1.2931374496959548e-06, + "loss": 1.3033, "step": 3115 }, { - "epoch": 0.8944954128440367, - "grad_norm": 2.86265063408354, - "learning_rate": 6.706067750394951e-07, - "loss": 1.0004, + "epoch": 0.8540925266903915, + "grad_norm": 0.3890245703911996, + "learning_rate": 1.2697328927240238e-06, + "loss": 1.3155, "step": 3120 }, { - "epoch": 0.8959288990825688, - "grad_norm": 2.921057543794976, - "learning_rate": 6.527071097508475e-07, - "loss": 1.0568, + "epoch": 0.8554612647139338, + "grad_norm": 0.38965532355017113, + "learning_rate": 1.2465277090766381e-06, + "loss": 1.3408, "step": 3125 }, { - "epoch": 0.8973623853211009, - "grad_norm": 3.413457128089623, - "learning_rate": 6.350415124747378e-07, - "loss": 1.0966, + "epoch": 0.856830002737476, + "grad_norm": 0.38847581936098935, + "learning_rate": 1.2235224286911495e-06, + "loss": 1.3619, "step": 3130 }, { - "epoch": 0.898795871559633, - "grad_norm": 3.2754484907337913, - "learning_rate": 6.176104255800175e-07, - "loss": 1.0012, + "epoch": 0.8581987407610183, + "grad_norm": 0.3901578768967818, + "learning_rate": 1.2007175769397117e-06, + "loss": 1.3714, "step": 3135 }, { - "epoch": 0.9002293577981652, - "grad_norm": 3.096817656277008, - "learning_rate": 6.004142855631068e-07, - "loss": 1.0409, + "epoch": 0.8595674787845606, + "grad_norm": 0.38170228292380465, + "learning_rate": 1.178113674617285e-06, + "loss": 1.3144, "step": 3140 }, { - "epoch": 0.9016628440366973, - "grad_norm": 3.2107227855247737, - "learning_rate": 5.834535230370586e-07, - "loss": 1.022, + "epoch": 0.860936216808103, + "grad_norm": 0.3852269233134472, + "learning_rate": 1.1557112379297385e-06, + "loss": 1.3542, "step": 3145 }, { - "epoch": 0.9030963302752294, - "grad_norm": 3.087915467643749, - "learning_rate": 5.66728562720772e-07, - "loss": 1.1023, + "epoch": 0.8623049548316453, + "grad_norm": 0.385512160462019, + "learning_rate": 1.1335107784820741e-06, + "loss": 1.3556, "step": 3150 }, { - "epoch": 0.9045298165137615, - "grad_norm": 3.1987341877696034, - "learning_rate": 5.502398234283657e-07, - "loss": 1.004, + "epoch": 0.8636736928551875, + "grad_norm": 0.39758307605071247, + "learning_rate": 1.1115128032667288e-06, + "loss": 1.2992, "step": 3155 }, { - "epoch": 0.9059633027522935, - "grad_norm": 2.8182259878196527, - "learning_rate": 5.339877180586872e-07, - "loss": 1.0388, + "epoch": 0.8650424308787298, + "grad_norm": 0.3702170793773961, + "learning_rate": 1.0897178146520014e-06, + "loss": 1.3861, "step": 3160 }, { - "epoch": 0.9073967889908257, - "grad_norm": 3.060289512500466, - "learning_rate": 5.179726535849649e-07, - "loss": 1.0461, + "epoch": 0.8664111689022721, + "grad_norm": 0.3862887179246134, + "learning_rate": 1.0681263103705853e-06, + "loss": 1.3317, "step": 3165 }, { - "epoch": 0.9088302752293578, - "grad_norm": 3.1698304516587794, - "learning_rate": 5.0219503104463e-07, - "loss": 1.0247, + "epoch": 0.8677799069258144, + "grad_norm": 0.3979382501364706, + "learning_rate": 1.0467387835081944e-06, + "loss": 1.351, "step": 3170 }, { - "epoch": 0.9102637614678899, - "grad_norm": 3.130302466725309, - "learning_rate": 4.866552455292673e-07, - "loss": 1.0121, + "epoch": 0.8691486449493567, + "grad_norm": 0.3853783562962658, + "learning_rate": 1.0255557224923018e-06, + "loss": 1.3474, "step": 3175 }, { - "epoch": 0.911697247706422, - "grad_norm": 3.42566263140553, - "learning_rate": 4.713536861747181e-07, - "loss": 1.0241, + "epoch": 0.8705173829728989, + "grad_norm": 0.3921947417156507, + "learning_rate": 1.004577611080998e-06, + "loss": 1.3162, "step": 3180 }, { - "epoch": 0.9131307339449541, - "grad_norm": 2.9858737991061974, - "learning_rate": 4.5629073615134466e-07, - "loss": 1.0616, + "epoch": 0.8718861209964412, + "grad_norm": 0.38229199053031476, + "learning_rate": 9.838049283519258e-07, + "loss": 1.3265, "step": 3185 }, { - "epoch": 0.9145642201834863, - "grad_norm": 3.285076013359202, - "learning_rate": 4.414667726544308e-07, - "loss": 0.9933, + "epoch": 0.8732548590199836, + "grad_norm": 0.38057187484097027, + "learning_rate": 9.63238148691351e-07, + "loss": 1.3087, "step": 3190 }, { - "epoch": 0.9159977064220184, - "grad_norm": 3.2338711561101006, - "learning_rate": 4.2688216689472984e-07, - "loss": 1.0764, + "epoch": 0.8746235970435259, + "grad_norm": 0.3942222628186198, + "learning_rate": 9.42877741783328e-07, + "loss": 1.366, "step": 3195 }, { - "epoch": 0.9174311926605505, - "grad_norm": 3.1747891589130766, - "learning_rate": 4.125372840891817e-07, - "loss": 1.0288, + "epoch": 0.8759923350670682, + "grad_norm": 0.38514993886291565, + "learning_rate": 9.227241725989699e-07, + "loss": 1.3212, "step": 3200 }, { - "epoch": 0.9188646788990825, - "grad_norm": 3.4369303388998604, - "learning_rate": 3.984324834517583e-07, - "loss": 1.0151, + "epoch": 0.8773610730906105, + "grad_norm": 0.3836381492014198, + "learning_rate": 9.027779013858284e-07, + "loss": 1.2787, "step": 3205 }, { - "epoch": 0.9202981651376146, - "grad_norm": 2.9486157404109927, - "learning_rate": 3.845681181844718e-07, - "loss": 1.0339, + "epoch": 0.8787298111141527, + "grad_norm": 0.38734970197899504, + "learning_rate": 8.830393836573947e-07, + "loss": 1.3387, "step": 3210 }, { - "epoch": 0.9217316513761468, - "grad_norm": 3.026266781773246, - "learning_rate": 3.7094453546852706e-07, - "loss": 1.0372, + "epoch": 0.880098549137695, + "grad_norm": 0.3817865137800505, + "learning_rate": 8.635090701826799e-07, + "loss": 1.3753, "step": 3215 }, { - "epoch": 0.9231651376146789, - "grad_norm": 3.6005808302403715, - "learning_rate": 3.575620764556331e-07, - "loss": 1.0675, + "epoch": 0.8814672871612373, + "grad_norm": 0.38200788300985267, + "learning_rate": 8.441874069759337e-07, + "loss": 1.2776, "step": 3220 }, { - "epoch": 0.924598623853211, - "grad_norm": 3.065531017189199, - "learning_rate": 3.4442107625945577e-07, - "loss": 0.9969, + "epoch": 0.8828360251847797, + "grad_norm": 0.37300363369723033, + "learning_rate": 8.250748352864546e-07, + "loss": 1.317, "step": 3225 }, { - "epoch": 0.9260321100917431, - "grad_norm": 3.101619885450492, - "learning_rate": 3.3152186394722506e-07, - "loss": 1.004, + "epoch": 0.884204763208322, + "grad_norm": 0.39737199946658025, + "learning_rate": 8.061717915885103e-07, + "loss": 1.3048, "step": 3230 }, { - "epoch": 0.9274655963302753, - "grad_norm": 3.069142098455305, - "learning_rate": 3.188647625315011e-07, - "loss": 1.0277, + "epoch": 0.8855735012318642, + "grad_norm": 0.3771135467998678, + "learning_rate": 7.874787075713742e-07, + "loss": 1.2507, "step": 3235 }, { - "epoch": 0.9288990825688074, - "grad_norm": 2.981358314660114, - "learning_rate": 3.064500889620792e-07, - "loss": 1.012, + "epoch": 0.8869422392554065, + "grad_norm": 0.37723109683472905, + "learning_rate": 7.689960101294691e-07, + "loss": 1.3081, "step": 3240 }, { - "epoch": 0.9303325688073395, - "grad_norm": 3.146209185290999, - "learning_rate": 2.9427815411805616e-07, - "loss": 1.0409, + "epoch": 0.8883109772789488, + "grad_norm": 0.3871833961992763, + "learning_rate": 7.507241213526073e-07, + "loss": 1.3122, "step": 3245 }, { - "epoch": 0.9317660550458715, - "grad_norm": 2.9139523212945, - "learning_rate": 2.823492628000435e-07, - "loss": 1.0108, + "epoch": 0.8896797153024911, + "grad_norm": 0.3774090621861952, + "learning_rate": 7.326634585163617e-07, + "loss": 1.3243, "step": 3250 }, { - "epoch": 0.9331995412844036, - "grad_norm": 3.2427768967167068, - "learning_rate": 2.7066371372253873e-07, - "loss": 1.0033, + "epoch": 0.8910484533260334, + "grad_norm": 0.39061075194420053, + "learning_rate": 7.148144340725371e-07, + "loss": 1.3123, "step": 3255 }, { - "epoch": 0.9346330275229358, - "grad_norm": 3.2929773628001073, - "learning_rate": 2.5922179950643833e-07, - "loss": 1.053, + "epoch": 0.8924171913495756, + "grad_norm": 0.3844913895203542, + "learning_rate": 6.971774556397415e-07, + "loss": 1.3238, "step": 3260 }, { - "epoch": 0.9360665137614679, - "grad_norm": 3.0857731418567473, - "learning_rate": 2.480238066717178e-07, - "loss": 1.0159, + "epoch": 0.893785929373118, + "grad_norm": 0.3954658749141559, + "learning_rate": 6.797529259940827e-07, + "loss": 1.3421, "step": 3265 }, { - "epoch": 0.9375, - "grad_norm": 2.913155618655732, - "learning_rate": 2.370700156302541e-07, - "loss": 0.9997, + "epoch": 0.8951546673966603, + "grad_norm": 0.38848733821104914, + "learning_rate": 6.625412430599765e-07, + "loss": 1.3485, "step": 3270 }, { - "epoch": 0.9389334862385321, - "grad_norm": 2.9236470377703134, - "learning_rate": 2.2636070067879933e-07, - "loss": 1.0315, + "epoch": 0.8965234054202026, + "grad_norm": 0.38275187348582623, + "learning_rate": 6.455427999010466e-07, + "loss": 1.3343, "step": 3275 }, { - "epoch": 0.9403669724770642, - "grad_norm": 3.1616081864710113, - "learning_rate": 2.1589612999211697e-07, - "loss": 1.0573, + "epoch": 0.8978921434437449, + "grad_norm": 0.3946581641011151, + "learning_rate": 6.287579847111569e-07, + "loss": 1.3362, "step": 3280 }, { - "epoch": 0.9418004587155964, - "grad_norm": 3.2061935710058305, - "learning_rate": 2.056765656162685e-07, - "loss": 1.0344, + "epoch": 0.8992608814672871, + "grad_norm": 0.3931302904815353, + "learning_rate": 6.121871808055479e-07, + "loss": 1.3095, "step": 3285 }, { - "epoch": 0.9432339449541285, - "grad_norm": 3.1445007223648087, - "learning_rate": 1.9570226346204312e-07, - "loss": 1.0502, + "epoch": 0.9006296194908294, + "grad_norm": 0.39874206377284055, + "learning_rate": 5.958307666120733e-07, + "loss": 1.2925, "step": 3290 }, { - "epoch": 0.9446674311926605, - "grad_norm": 3.0523955190740226, - "learning_rate": 1.8597347329855742e-07, - "loss": 1.0277, + "epoch": 0.9019983575143717, + "grad_norm": 0.38856468686205425, + "learning_rate": 5.796891156625639e-07, + "loss": 1.2878, "step": 3295 }, { - "epoch": 0.9461009174311926, - "grad_norm": 3.10853151235872, - "learning_rate": 1.764904387469979e-07, - "loss": 0.9951, + "epoch": 0.903367095537914, + "grad_norm": 0.3830311422743617, + "learning_rate": 5.637625965843041e-07, + "loss": 1.3247, "step": 3300 }, { - "epoch": 0.9475344036697247, - "grad_norm": 2.8610073341168327, - "learning_rate": 1.672533972745194e-07, - "loss": 1.0133, + "epoch": 0.9047358335614564, + "grad_norm": 0.39286283244773124, + "learning_rate": 5.480515730915992e-07, + "loss": 1.2902, "step": 3305 }, { - "epoch": 0.9489678899082569, - "grad_norm": 2.9659296802362154, - "learning_rate": 1.5826258018829866e-07, - "loss": 1.0146, + "epoch": 0.9061045715849987, + "grad_norm": 0.37864973007608094, + "learning_rate": 5.325564039774777e-07, + "loss": 1.308, "step": 3310 }, { - "epoch": 0.950401376146789, - "grad_norm": 3.0101090249896187, - "learning_rate": 1.4951821262974563e-07, - "loss": 1.0064, + "epoch": 0.9074733096085409, + "grad_norm": 0.3932055048918823, + "learning_rate": 5.172774431054995e-07, + "loss": 1.3245, "step": 3315 }, { - "epoch": 0.9518348623853211, - "grad_norm": 2.873431144530414, - "learning_rate": 1.4102051356886027e-07, - "loss": 0.9988, + "epoch": 0.9088420476320832, + "grad_norm": 0.3861464572512248, + "learning_rate": 5.022150394016701e-07, + "loss": 1.3345, "step": 3320 }, { - "epoch": 0.9532683486238532, - "grad_norm": 2.861730520518137, - "learning_rate": 1.3276969579875453e-07, - "loss": 1.007, + "epoch": 0.9102107856556255, + "grad_norm": 0.3849211658260205, + "learning_rate": 4.873695368464693e-07, + "loss": 1.3522, "step": 3325 }, { - "epoch": 0.9547018348623854, - "grad_norm": 3.1539277667342964, - "learning_rate": 1.24765965930318e-07, - "loss": 1.017, + "epoch": 0.9115795236791678, + "grad_norm": 0.3850036124623499, + "learning_rate": 4.72741274467009e-07, + "loss": 1.3205, "step": 3330 }, { - "epoch": 0.9561353211009175, - "grad_norm": 3.182662519707628, - "learning_rate": 1.1700952438705171e-07, - "loss": 1.0008, + "epoch": 0.9129482617027102, + "grad_norm": 0.3858389082786252, + "learning_rate": 4.5833058632927417e-07, + "loss": 1.2984, "step": 3335 }, { - "epoch": 0.9575688073394495, - "grad_norm": 3.078518777565793, - "learning_rate": 1.0950056540004029e-07, - "loss": 1.0649, + "epoch": 0.9143169997262524, + "grad_norm": 0.3974200110164499, + "learning_rate": 4.441378015305031e-07, + "loss": 1.2907, "step": 3340 }, { - "epoch": 0.9590022935779816, - "grad_norm": 2.8736168807619507, - "learning_rate": 1.0223927700309667e-07, - "loss": 1.0177, + "epoch": 0.9156857377497947, + "grad_norm": 0.3885026941845055, + "learning_rate": 4.3016324419167365e-07, + "loss": 1.3571, "step": 3345 }, { - "epoch": 0.9604357798165137, - "grad_norm": 3.380358687506272, - "learning_rate": 9.522584102804599e-08, - "loss": 0.9955, + "epoch": 0.917054475773337, + "grad_norm": 0.391701334822345, + "learning_rate": 4.164072334500935e-07, + "loss": 1.2946, "step": 3350 }, { - "epoch": 0.9618692660550459, - "grad_norm": 3.209744698526145, - "learning_rate": 8.846043310017927e-08, - "loss": 1.0051, + "epoch": 0.9184232137968793, + "grad_norm": 0.371298260058105, + "learning_rate": 4.028700834521193e-07, + "loss": 1.2734, "step": 3355 }, { - "epoch": 0.963302752293578, - "grad_norm": 3.135993664577884, - "learning_rate": 8.194322263385013e-08, - "loss": 1.0096, + "epoch": 0.9197919518204216, + "grad_norm": 0.39257123679148415, + "learning_rate": 3.8955210334597595e-07, + "loss": 1.3792, "step": 3360 }, { - "epoch": 0.9647362385321101, - "grad_norm": 2.9702419226658128, - "learning_rate": 7.567437282823386e-08, - "loss": 0.9581, + "epoch": 0.9211606898439638, + "grad_norm": 0.37739382436504915, + "learning_rate": 3.764535972747052e-07, + "loss": 1.3182, "step": 3365 }, { - "epoch": 0.9661697247706422, - "grad_norm": 2.9902064692936365, - "learning_rate": 6.96540406632451e-08, - "loss": 1.0254, + "epoch": 0.9225294278675061, + "grad_norm": 0.4040354445639437, + "learning_rate": 3.6357486436921164e-07, + "loss": 1.3149, "step": 3370 }, { - "epoch": 0.9676032110091743, - "grad_norm": 3.136047137814621, - "learning_rate": 6.388237689559762e-08, - "loss": 1.0386, + "epoch": 0.9238981658910485, + "grad_norm": 0.3885892463097669, + "learning_rate": 3.5091619874143446e-07, + "loss": 1.3612, "step": 3375 }, { - "epoch": 0.9690366972477065, - "grad_norm": 3.209855131586944, - "learning_rate": 5.8359526055038476e-08, - "loss": 1.0493, + "epoch": 0.9252669039145908, + "grad_norm": 0.39478973344601664, + "learning_rate": 3.3847788947763194e-07, + "loss": 1.3338, "step": 3380 }, { - "epoch": 0.9704701834862385, - "grad_norm": 3.2935877965995966, - "learning_rate": 5.3085626440724235e-08, - "loss": 1.0391, + "epoch": 0.9266356419381331, + "grad_norm": 0.4002415191432085, + "learning_rate": 3.2626022063177997e-07, + "loss": 1.3854, "step": 3385 }, { - "epoch": 0.9719036697247706, - "grad_norm": 3.406107022140581, - "learning_rate": 4.8060810117757096e-08, - "loss": 1.0244, + "epoch": 0.9280043799616753, + "grad_norm": 0.3871640152525417, + "learning_rate": 3.142634712190795e-07, + "loss": 1.2663, "step": 3390 }, { - "epoch": 0.9733371559633027, - "grad_norm": 3.3748108058363377, - "learning_rate": 4.3285202913881944e-08, - "loss": 0.9787, + "epoch": 0.9293731179852176, + "grad_norm": 0.3844217189319342, + "learning_rate": 3.0248791520959387e-07, + "loss": 1.3304, "step": 3395 }, { - "epoch": 0.9747706422018348, - "grad_norm": 3.0832449198208978, - "learning_rate": 3.875892441633e-08, - "loss": 0.9951, + "epoch": 0.9307418560087599, + "grad_norm": 0.40010784379733844, + "learning_rate": 2.909338215219859e-07, + "loss": 1.3458, "step": 3400 }, { - "epoch": 0.976204128440367, - "grad_norm": 3.186455532049428, - "learning_rate": 3.4482087968829014e-08, - "loss": 1.0315, + "epoch": 0.9321105940323022, + "grad_norm": 0.38274676057072776, + "learning_rate": 2.7960145401737415e-07, + "loss": 1.2606, "step": 3405 }, { - "epoch": 0.9776376146788991, - "grad_norm": 2.9104798470989883, - "learning_rate": 3.045480066876105e-08, - "loss": 1.0092, + "epoch": 0.9334793320558445, + "grad_norm": 0.39275920250834023, + "learning_rate": 2.6849107149331756e-07, + "loss": 1.2825, "step": 3410 }, { - "epoch": 0.9790711009174312, - "grad_norm": 3.037035090176394, - "learning_rate": 2.667716336448356e-08, - "loss": 0.9587, + "epoch": 0.9348480700793868, + "grad_norm": 0.3947633255379109, + "learning_rate": 2.576029276778924e-07, + "loss": 1.3441, "step": 3415 }, { - "epoch": 0.9805045871559633, - "grad_norm": 2.9918210850993785, - "learning_rate": 2.3149270652803592e-08, - "loss": 0.9973, + "epoch": 0.9362168081029291, + "grad_norm": 0.41407416506322803, + "learning_rate": 2.4693727122390597e-07, + "loss": 1.371, "step": 3420 }, { - "epoch": 0.9819380733944955, - "grad_norm": 3.183820306323636, - "learning_rate": 1.9871210876607484e-08, - "loss": 1.011, + "epoch": 0.9375855461264714, + "grad_norm": 0.37625661449174036, + "learning_rate": 2.3649434570321984e-07, + "loss": 1.2862, "step": 3425 }, { - "epoch": 0.9833715596330275, - "grad_norm": 3.214411554963273, - "learning_rate": 1.6843066122649297e-08, - "loss": 0.9913, + "epoch": 0.9389542841500137, + "grad_norm": 0.37104631834710733, + "learning_rate": 2.2627438960117876e-07, + "loss": 1.2833, "step": 3430 }, { - "epoch": 0.9848050458715596, - "grad_norm": 3.5216487010248088, - "learning_rate": 1.4064912219496907e-08, - "loss": 1.0442, + "epoch": 0.940323022173556, + "grad_norm": 0.3868409112001441, + "learning_rate": 2.1627763631117182e-07, + "loss": 1.3551, "step": 3435 }, { - "epoch": 0.9862385321100917, - "grad_norm": 3.1877265670501598, - "learning_rate": 1.1536818735630172e-08, - "loss": 1.0318, + "epoch": 0.9416917601970983, + "grad_norm": 0.4061569745095073, + "learning_rate": 2.0650431412930104e-07, + "loss": 1.3273, "step": 3440 }, { - "epoch": 0.9876720183486238, - "grad_norm": 3.087482942801695, - "learning_rate": 9.25884897770013e-09, - "loss": 1.0607, + "epoch": 0.9430604982206405, + "grad_norm": 0.3923199404039606, + "learning_rate": 1.969546462491634e-07, + "loss": 1.3093, "step": 3445 }, { - "epoch": 0.989105504587156, - "grad_norm": 3.059255678756277, - "learning_rate": 7.231059988945799e-09, - "loss": 1.0148, + "epoch": 0.9444292362441828, + "grad_norm": 0.40035298460473323, + "learning_rate": 1.876288507567592e-07, + "loss": 1.2859, "step": 3450 }, { - "epoch": 0.9905389908256881, - "grad_norm": 3.239412598410617, - "learning_rate": 5.4535025477642224e-09, - "loss": 1.0422, + "epoch": 0.9457979742677252, + "grad_norm": 0.39270377743419116, + "learning_rate": 1.785271406255107e-07, + "loss": 1.3086, "step": 3455 }, { - "epoch": 0.9919724770642202, - "grad_norm": 3.144154368562322, - "learning_rate": 3.926221166434818e-09, - "loss": 1.0226, + "epoch": 0.9471667122912675, + "grad_norm": 0.391272234134139, + "learning_rate": 1.6964972371139588e-07, + "loss": 1.3324, "step": 3460 }, { - "epoch": 0.9934059633027523, - "grad_norm": 3.4686686147707877, - "learning_rate": 2.6492540900135976e-09, - "loss": 1.0359, + "epoch": 0.9485354503148098, + "grad_norm": 0.3971408419550116, + "learning_rate": 1.609968027482012e-07, + "loss": 1.3241, "step": 3465 }, { - "epoch": 0.9948394495412844, - "grad_norm": 2.9818281052270432, - "learning_rate": 1.6226332953661605e-09, - "loss": 0.988, + "epoch": 0.949904188338352, + "grad_norm": 0.397310674294107, + "learning_rate": 1.5256857534289626e-07, + "loss": 1.344, "step": 3470 }, { - "epoch": 0.9962729357798165, - "grad_norm": 2.940056641516156, - "learning_rate": 8.46384490373886e-10, - "loss": 1.0299, + "epoch": 0.9512729263618943, + "grad_norm": 0.37737586107823284, + "learning_rate": 1.443652339711199e-07, + "loss": 1.3227, "step": 3475 }, { - "epoch": 0.9977064220183486, - "grad_norm": 3.0373605695085883, - "learning_rate": 3.2052711328778297e-10, - "loss": 0.9876, + "epoch": 0.9526416643854366, + "grad_norm": 0.3953334287452581, + "learning_rate": 1.3638696597277678e-07, + "loss": 1.3323, "step": 3480 }, { - "epoch": 0.9991399082568807, - "grad_norm": 3.0988136193131335, - "learning_rate": 4.5074332237771134e-11, - "loss": 1.0197, + "epoch": 0.9540104024089789, + "grad_norm": 0.39727863290285664, + "learning_rate": 1.2863395354777097e-07, + "loss": 1.2965, "step": 3485 }, + { + "epoch": 0.9553791404325213, + "grad_norm": 0.3915026545665889, + "learning_rate": 1.211063737518392e-07, + "loss": 1.2945, + "step": 3490 + }, + { + "epoch": 0.9567478784560635, + "grad_norm": 0.3963237143375255, + "learning_rate": 1.1380439849250414e-07, + "loss": 1.3079, + "step": 3495 + }, + { + "epoch": 0.9581166164796058, + "grad_norm": 0.38628531323230814, + "learning_rate": 1.0672819452515526e-07, + "loss": 1.347, + "step": 3500 + }, + { + "epoch": 0.9594853545031481, + "grad_norm": 0.39250735459020125, + "learning_rate": 9.987792344923753e-08, + "loss": 1.3292, + "step": 3505 + }, + { + "epoch": 0.9608540925266904, + "grad_norm": 0.39214485753203543, + "learning_rate": 9.32537417045576e-08, + "loss": 1.2703, + "step": 3510 + }, + { + "epoch": 0.9622228305502327, + "grad_norm": 0.38156832143297204, + "learning_rate": 8.685580056771781e-08, + "loss": 1.3404, + "step": 3515 + }, + { + "epoch": 0.9635915685737749, + "grad_norm": 0.3907257773875925, + "learning_rate": 8.0684246148659e-08, + "loss": 1.2681, + "step": 3520 + }, + { + "epoch": 0.9649603065973172, + "grad_norm": 0.4194193748219591, + "learning_rate": 7.473921938731865e-08, + "loss": 1.382, + "step": 3525 + }, + { + "epoch": 0.9663290446208596, + "grad_norm": 0.3858101635490962, + "learning_rate": 6.902085605042019e-08, + "loss": 1.3671, + "step": 3530 + }, + { + "epoch": 0.9676977826444019, + "grad_norm": 0.3813102160301827, + "learning_rate": 6.352928672836767e-08, + "loss": 1.3013, + "step": 3535 + }, + { + "epoch": 0.9690665206679442, + "grad_norm": 0.38117598807083153, + "learning_rate": 5.82646368322648e-08, + "loss": 1.3406, + "step": 3540 + }, + { + "epoch": 0.9704352586914865, + "grad_norm": 0.3869856138402919, + "learning_rate": 5.3227026591049505e-08, + "loss": 1.3311, + "step": 3545 + }, + { + "epoch": 0.9718039967150287, + "grad_norm": 0.3866858707608126, + "learning_rate": 4.841657104875275e-08, + "loss": 1.3593, + "step": 3550 + }, + { + "epoch": 0.973172734738571, + "grad_norm": 0.3866568363141188, + "learning_rate": 4.3833380061865104e-08, + "loss": 1.3318, + "step": 3555 + }, + { + "epoch": 0.9745414727621133, + "grad_norm": 0.39553603732869785, + "learning_rate": 3.947755829683097e-08, + "loss": 1.3403, + "step": 3560 + }, + { + "epoch": 0.9759102107856557, + "grad_norm": 0.3990993940103786, + "learning_rate": 3.5349205227660496e-08, + "loss": 1.3812, + "step": 3565 + }, + { + "epoch": 0.977278948809198, + "grad_norm": 0.3925614431869845, + "learning_rate": 3.144841513365249e-08, + "loss": 1.3025, + "step": 3570 + }, + { + "epoch": 0.9786476868327402, + "grad_norm": 0.3883373657835178, + "learning_rate": 2.7775277097247255e-08, + "loss": 1.313, + "step": 3575 + }, + { + "epoch": 0.9800164248562825, + "grad_norm": 0.38572501548529475, + "learning_rate": 2.4329875001989356e-08, + "loss": 1.3058, + "step": 3580 + }, + { + "epoch": 0.9813851628798248, + "grad_norm": 0.3838975342711645, + "learning_rate": 2.1112287530609122e-08, + "loss": 1.3165, + "step": 3585 + }, + { + "epoch": 0.9827539009033671, + "grad_norm": 0.3918735272830008, + "learning_rate": 1.812258816323187e-08, + "loss": 1.3388, + "step": 3590 + }, + { + "epoch": 0.9841226389269094, + "grad_norm": 0.38644635827545804, + "learning_rate": 1.5360845175695916e-08, + "loss": 1.3378, + "step": 3595 + }, + { + "epoch": 0.9854913769504516, + "grad_norm": 0.4046028492421605, + "learning_rate": 1.2827121637992712e-08, + "loss": 1.3104, + "step": 3600 + }, + { + "epoch": 0.986860114973994, + "grad_norm": 0.37588594630122674, + "learning_rate": 1.0521475412830218e-08, + "loss": 1.3345, + "step": 3605 + }, + { + "epoch": 0.9882288529975363, + "grad_norm": 0.39556552792879707, + "learning_rate": 8.44395915430729e-09, + "loss": 1.3184, + "step": 3610 + }, + { + "epoch": 0.9895975910210786, + "grad_norm": 0.3745912876489376, + "learning_rate": 6.5946203067135395e-09, + "loss": 1.3036, + "step": 3615 + }, + { + "epoch": 0.9909663290446209, + "grad_norm": 0.39112472329549625, + "learning_rate": 4.9735011034457434e-09, + "loss": 1.2682, + "step": 3620 + }, + { + "epoch": 0.9923350670681631, + "grad_norm": 0.380307545731869, + "learning_rate": 3.580638566043071e-09, + "loss": 1.2837, + "step": 3625 + }, + { + "epoch": 0.9937038050917054, + "grad_norm": 0.3955743424810463, + "learning_rate": 2.416064503342197e-09, + "loss": 1.3092, + "step": 3630 + }, + { + "epoch": 0.9950725431152477, + "grad_norm": 0.3853245568354318, + "learning_rate": 1.4798055107489996e-09, + "loss": 1.3025, + "step": 3635 + }, + { + "epoch": 0.99644128113879, + "grad_norm": 0.39071516823719027, + "learning_rate": 7.718829696334862e-10, + "loss": 1.3332, + "step": 3640 + }, + { + "epoch": 0.9978100191623324, + "grad_norm": 0.38346889130554324, + "learning_rate": 2.9231304683907667e-10, + "loss": 1.3369, + "step": 3645 + }, + { + "epoch": 0.9991787571858747, + "grad_norm": 0.4139215786504167, + "learning_rate": 4.1106694317338826e-11, + "loss": 1.3518, + "step": 3650 + }, { "epoch": 1.0, - "eval_loss": 1.0320301055908203, - "eval_runtime": 4528.3515, - "eval_samples_per_second": 18.469, - "eval_steps_per_second": 1.155, - "step": 3488 + "eval_loss": 1.320330262184143, + "eval_runtime": 951.5767, + "eval_samples_per_second": 92.065, + "eval_steps_per_second": 5.755, + "step": 3653 }, { "epoch": 1.0, - "step": 3488, - "total_flos": 182579059752960.0, - "train_loss": 0.025541797006895784, - "train_runtime": 4827.4873, - "train_samples_per_second": 11.557, - "train_steps_per_second": 0.723 + "step": 3653, + "total_flos": 66190143651840.0, + "train_loss": 1.3573657579367173, + "train_runtime": 8062.8425, + "train_samples_per_second": 7.249, + "train_steps_per_second": 0.453 } ], "logging_steps": 5, - "max_steps": 3488, + "max_steps": 3653, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, @@ -4922,7 +5153,7 @@ "attributes": {} } }, - "total_flos": 182579059752960.0, + "total_flos": 66190143651840.0, "train_batch_size": 4, "trial_name": null, "trial_params": null