{ "best_metric": 0.039273809641599655, "best_model_checkpoint": "./data/EuroSAT_output/checkpoint-14345", "epoch": 5.0, "eval_steps": 500, "global_step": 14345, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003485535029627048, "grad_norm": 2.583953380584717, "learning_rate": 1.9986057859881492e-05, "loss": 2.3035, "step": 10 }, { "epoch": 0.006971070059254096, "grad_norm": 3.04152774810791, "learning_rate": 1.9972115719762987e-05, "loss": 2.2087, "step": 20 }, { "epoch": 0.010456605088881143, "grad_norm": 2.454313039779663, "learning_rate": 1.9958173579644477e-05, "loss": 2.179, "step": 30 }, { "epoch": 0.013942140118508192, "grad_norm": 2.8994503021240234, "learning_rate": 1.9944231439525968e-05, "loss": 2.1153, "step": 40 }, { "epoch": 0.01742767514813524, "grad_norm": 2.569394111633301, "learning_rate": 1.9930289299407462e-05, "loss": 2.0797, "step": 50 }, { "epoch": 0.020913210177762286, "grad_norm": 2.860028028488159, "learning_rate": 1.9916347159288953e-05, "loss": 1.9448, "step": 60 }, { "epoch": 0.024398745207389334, "grad_norm": 3.2729475498199463, "learning_rate": 1.9902405019170444e-05, "loss": 1.8354, "step": 70 }, { "epoch": 0.027884280237016383, "grad_norm": 3.2359659671783447, "learning_rate": 1.9888462879051935e-05, "loss": 1.8398, "step": 80 }, { "epoch": 0.03136981526664343, "grad_norm": 3.4146273136138916, "learning_rate": 1.9874520738933426e-05, "loss": 1.7559, "step": 90 }, { "epoch": 0.03485535029627048, "grad_norm": 3.6460695266723633, "learning_rate": 1.986057859881492e-05, "loss": 1.7249, "step": 100 }, { "epoch": 0.03834088532589752, "grad_norm": 3.8710196018218994, "learning_rate": 1.984663645869641e-05, "loss": 1.6526, "step": 110 }, { "epoch": 0.04182642035552457, "grad_norm": 3.0699777603149414, "learning_rate": 1.98326943185779e-05, "loss": 1.4757, "step": 120 }, { "epoch": 0.04531195538515162, "grad_norm": 3.8974356651306152, "learning_rate": 1.9818752178459396e-05, "loss": 1.5169, "step": 130 }, { "epoch": 0.04879749041477867, "grad_norm": 4.513424873352051, "learning_rate": 1.9804810038340887e-05, "loss": 1.4487, "step": 140 }, { "epoch": 0.05228302544440572, "grad_norm": 3.8483262062072754, "learning_rate": 1.9790867898222377e-05, "loss": 1.4174, "step": 150 }, { "epoch": 0.055768560474032766, "grad_norm": 3.938114643096924, "learning_rate": 1.977692575810387e-05, "loss": 1.2989, "step": 160 }, { "epoch": 0.059254095503659815, "grad_norm": 4.86665678024292, "learning_rate": 1.9762983617985362e-05, "loss": 1.3303, "step": 170 }, { "epoch": 0.06273963053328686, "grad_norm": 2.8868749141693115, "learning_rate": 1.9749041477866853e-05, "loss": 1.2444, "step": 180 }, { "epoch": 0.06622516556291391, "grad_norm": 2.838812828063965, "learning_rate": 1.9735099337748347e-05, "loss": 1.0791, "step": 190 }, { "epoch": 0.06971070059254096, "grad_norm": 3.111001968383789, "learning_rate": 1.9721157197629838e-05, "loss": 1.1166, "step": 200 }, { "epoch": 0.073196235622168, "grad_norm": 4.862301826477051, "learning_rate": 1.970721505751133e-05, "loss": 1.053, "step": 210 }, { "epoch": 0.07668177065179504, "grad_norm": 3.7676494121551514, "learning_rate": 1.969327291739282e-05, "loss": 1.0223, "step": 220 }, { "epoch": 0.0801673056814221, "grad_norm": 4.827447414398193, "learning_rate": 1.9679330777274314e-05, "loss": 1.0277, "step": 230 }, { "epoch": 0.08365284071104914, "grad_norm": 2.027529716491699, "learning_rate": 1.9665388637155805e-05, "loss": 0.8187, "step": 240 }, { "epoch": 0.08713837574067619, "grad_norm": 5.508512020111084, "learning_rate": 1.9651446497037296e-05, "loss": 0.9514, "step": 250 }, { "epoch": 0.09062391077030324, "grad_norm": 4.481660842895508, "learning_rate": 1.963750435691879e-05, "loss": 0.9503, "step": 260 }, { "epoch": 0.09410944579993029, "grad_norm": 2.511476993560791, "learning_rate": 1.962356221680028e-05, "loss": 0.9108, "step": 270 }, { "epoch": 0.09759498082955734, "grad_norm": 5.287069320678711, "learning_rate": 1.960962007668177e-05, "loss": 0.8772, "step": 280 }, { "epoch": 0.10108051585918439, "grad_norm": 3.6010959148406982, "learning_rate": 1.9595677936563266e-05, "loss": 0.793, "step": 290 }, { "epoch": 0.10456605088881143, "grad_norm": 4.5751776695251465, "learning_rate": 1.9581735796444757e-05, "loss": 0.7934, "step": 300 }, { "epoch": 0.10805158591843848, "grad_norm": 2.073023796081543, "learning_rate": 1.9567793656326247e-05, "loss": 0.7081, "step": 310 }, { "epoch": 0.11153712094806553, "grad_norm": 5.257015228271484, "learning_rate": 1.955385151620774e-05, "loss": 0.7023, "step": 320 }, { "epoch": 0.11502265597769258, "grad_norm": 5.037456035614014, "learning_rate": 1.9539909376089232e-05, "loss": 0.705, "step": 330 }, { "epoch": 0.11850819100731963, "grad_norm": 2.995671510696411, "learning_rate": 1.9525967235970723e-05, "loss": 0.654, "step": 340 }, { "epoch": 0.12199372603694666, "grad_norm": 4.856020450592041, "learning_rate": 1.9512025095852214e-05, "loss": 0.6384, "step": 350 }, { "epoch": 0.12547926106657373, "grad_norm": 4.719180107116699, "learning_rate": 1.9498082955733705e-05, "loss": 0.6655, "step": 360 }, { "epoch": 0.12896479609620076, "grad_norm": 7.83165979385376, "learning_rate": 1.94841408156152e-05, "loss": 0.5098, "step": 370 }, { "epoch": 0.13245033112582782, "grad_norm": 3.3765158653259277, "learning_rate": 1.947019867549669e-05, "loss": 0.6758, "step": 380 }, { "epoch": 0.13593586615545486, "grad_norm": 4.360907554626465, "learning_rate": 1.945625653537818e-05, "loss": 0.6017, "step": 390 }, { "epoch": 0.13942140118508192, "grad_norm": 3.162569522857666, "learning_rate": 1.9442314395259675e-05, "loss": 0.6965, "step": 400 }, { "epoch": 0.14290693621470896, "grad_norm": 5.929446697235107, "learning_rate": 1.9428372255141166e-05, "loss": 0.6226, "step": 410 }, { "epoch": 0.146392471244336, "grad_norm": 2.6196141242980957, "learning_rate": 1.9414430115022657e-05, "loss": 0.5276, "step": 420 }, { "epoch": 0.14987800627396305, "grad_norm": 4.234184265136719, "learning_rate": 1.940048797490415e-05, "loss": 0.7085, "step": 430 }, { "epoch": 0.1533635413035901, "grad_norm": 3.8775856494903564, "learning_rate": 1.938654583478564e-05, "loss": 0.7142, "step": 440 }, { "epoch": 0.15684907633321715, "grad_norm": 2.744117498397827, "learning_rate": 1.9372603694667132e-05, "loss": 0.606, "step": 450 }, { "epoch": 0.1603346113628442, "grad_norm": 4.7066969871521, "learning_rate": 1.9358661554548627e-05, "loss": 0.5027, "step": 460 }, { "epoch": 0.16382014639247125, "grad_norm": 6.463034152984619, "learning_rate": 1.9344719414430117e-05, "loss": 0.5223, "step": 470 }, { "epoch": 0.16730568142209828, "grad_norm": 9.271512985229492, "learning_rate": 1.9330777274311608e-05, "loss": 0.6037, "step": 480 }, { "epoch": 0.17079121645172535, "grad_norm": 14.535713195800781, "learning_rate": 1.93168351341931e-05, "loss": 0.5479, "step": 490 }, { "epoch": 0.17427675148135238, "grad_norm": 8.693368911743164, "learning_rate": 1.930289299407459e-05, "loss": 0.6084, "step": 500 }, { "epoch": 0.17776228651097944, "grad_norm": 5.175466537475586, "learning_rate": 1.9288950853956084e-05, "loss": 0.6587, "step": 510 }, { "epoch": 0.18124782154060648, "grad_norm": 2.11287522315979, "learning_rate": 1.9275008713837575e-05, "loss": 0.49, "step": 520 }, { "epoch": 0.18473335657023354, "grad_norm": 2.2307288646698, "learning_rate": 1.9261066573719066e-05, "loss": 0.6132, "step": 530 }, { "epoch": 0.18821889159986058, "grad_norm": 11.079753875732422, "learning_rate": 1.924712443360056e-05, "loss": 0.572, "step": 540 }, { "epoch": 0.19170442662948764, "grad_norm": 2.487985610961914, "learning_rate": 1.923318229348205e-05, "loss": 0.3897, "step": 550 }, { "epoch": 0.19518996165911467, "grad_norm": 3.330732822418213, "learning_rate": 1.921924015336354e-05, "loss": 0.3884, "step": 560 }, { "epoch": 0.1986754966887417, "grad_norm": 9.687525749206543, "learning_rate": 1.9205298013245036e-05, "loss": 0.4155, "step": 570 }, { "epoch": 0.20216103171836877, "grad_norm": 5.554359436035156, "learning_rate": 1.9191355873126526e-05, "loss": 0.5435, "step": 580 }, { "epoch": 0.2056465667479958, "grad_norm": 2.3810067176818848, "learning_rate": 1.917741373300802e-05, "loss": 0.4487, "step": 590 }, { "epoch": 0.20913210177762287, "grad_norm": 2.6795718669891357, "learning_rate": 1.916347159288951e-05, "loss": 0.5523, "step": 600 }, { "epoch": 0.2126176368072499, "grad_norm": 11.402462005615234, "learning_rate": 1.9149529452771002e-05, "loss": 0.6798, "step": 610 }, { "epoch": 0.21610317183687697, "grad_norm": 4.038872241973877, "learning_rate": 1.9135587312652493e-05, "loss": 0.3321, "step": 620 }, { "epoch": 0.219588706866504, "grad_norm": 0.8252215385437012, "learning_rate": 1.9121645172533984e-05, "loss": 0.4267, "step": 630 }, { "epoch": 0.22307424189613106, "grad_norm": 3.365762233734131, "learning_rate": 1.9107703032415478e-05, "loss": 0.5648, "step": 640 }, { "epoch": 0.2265597769257581, "grad_norm": 3.1780312061309814, "learning_rate": 1.909376089229697e-05, "loss": 0.4703, "step": 650 }, { "epoch": 0.23004531195538516, "grad_norm": 1.3519302606582642, "learning_rate": 1.907981875217846e-05, "loss": 0.4887, "step": 660 }, { "epoch": 0.2335308469850122, "grad_norm": 2.861978054046631, "learning_rate": 1.9065876612059954e-05, "loss": 0.5227, "step": 670 }, { "epoch": 0.23701638201463926, "grad_norm": 9.564464569091797, "learning_rate": 1.9051934471941445e-05, "loss": 0.3791, "step": 680 }, { "epoch": 0.2405019170442663, "grad_norm": 6.82305383682251, "learning_rate": 1.9037992331822936e-05, "loss": 0.5024, "step": 690 }, { "epoch": 0.24398745207389333, "grad_norm": 0.8181214332580566, "learning_rate": 1.902405019170443e-05, "loss": 0.4091, "step": 700 }, { "epoch": 0.2474729871035204, "grad_norm": 11.525835990905762, "learning_rate": 1.901010805158592e-05, "loss": 0.3477, "step": 710 }, { "epoch": 0.25095852213314745, "grad_norm": 2.1437933444976807, "learning_rate": 1.899616591146741e-05, "loss": 0.3997, "step": 720 }, { "epoch": 0.25444405716277446, "grad_norm": 14.871402740478516, "learning_rate": 1.8982223771348906e-05, "loss": 0.4577, "step": 730 }, { "epoch": 0.2579295921924015, "grad_norm": 1.370771050453186, "learning_rate": 1.8968281631230396e-05, "loss": 0.2808, "step": 740 }, { "epoch": 0.2614151272220286, "grad_norm": 14.198845863342285, "learning_rate": 1.8954339491111887e-05, "loss": 0.4222, "step": 750 }, { "epoch": 0.26490066225165565, "grad_norm": 0.7197427749633789, "learning_rate": 1.8940397350993378e-05, "loss": 0.3704, "step": 760 }, { "epoch": 0.26838619728128266, "grad_norm": 8.696059226989746, "learning_rate": 1.892645521087487e-05, "loss": 0.3291, "step": 770 }, { "epoch": 0.2718717323109097, "grad_norm": 0.5741353034973145, "learning_rate": 1.8912513070756363e-05, "loss": 0.2361, "step": 780 }, { "epoch": 0.2753572673405368, "grad_norm": 3.235818386077881, "learning_rate": 1.8898570930637854e-05, "loss": 0.4824, "step": 790 }, { "epoch": 0.27884280237016384, "grad_norm": 0.8503827452659607, "learning_rate": 1.8884628790519345e-05, "loss": 0.3376, "step": 800 }, { "epoch": 0.28232833739979085, "grad_norm": 1.6630173921585083, "learning_rate": 1.887068665040084e-05, "loss": 0.3883, "step": 810 }, { "epoch": 0.2858138724294179, "grad_norm": 10.400406837463379, "learning_rate": 1.885674451028233e-05, "loss": 0.4094, "step": 820 }, { "epoch": 0.289299407459045, "grad_norm": 1.3901340961456299, "learning_rate": 1.884280237016382e-05, "loss": 0.5164, "step": 830 }, { "epoch": 0.292784942488672, "grad_norm": 5.530642509460449, "learning_rate": 1.8828860230045315e-05, "loss": 0.3125, "step": 840 }, { "epoch": 0.29627047751829905, "grad_norm": 6.685891151428223, "learning_rate": 1.8814918089926806e-05, "loss": 0.3943, "step": 850 }, { "epoch": 0.2997560125479261, "grad_norm": 15.618851661682129, "learning_rate": 1.8800975949808296e-05, "loss": 0.3619, "step": 860 }, { "epoch": 0.30324154757755317, "grad_norm": 9.351723670959473, "learning_rate": 1.878703380968979e-05, "loss": 0.4237, "step": 870 }, { "epoch": 0.3067270826071802, "grad_norm": 3.25188946723938, "learning_rate": 1.877309166957128e-05, "loss": 0.3106, "step": 880 }, { "epoch": 0.31021261763680724, "grad_norm": 3.0205211639404297, "learning_rate": 1.8759149529452772e-05, "loss": 0.3932, "step": 890 }, { "epoch": 0.3136981526664343, "grad_norm": 0.8111696243286133, "learning_rate": 1.8745207389334263e-05, "loss": 0.3497, "step": 900 }, { "epoch": 0.31718368769606137, "grad_norm": 1.2595362663269043, "learning_rate": 1.8731265249215754e-05, "loss": 0.4002, "step": 910 }, { "epoch": 0.3206692227256884, "grad_norm": 0.6795431971549988, "learning_rate": 1.8717323109097248e-05, "loss": 0.3498, "step": 920 }, { "epoch": 0.32415475775531544, "grad_norm": 1.0169798135757446, "learning_rate": 1.870338096897874e-05, "loss": 0.2839, "step": 930 }, { "epoch": 0.3276402927849425, "grad_norm": 9.539275169372559, "learning_rate": 1.868943882886023e-05, "loss": 0.3325, "step": 940 }, { "epoch": 0.33112582781456956, "grad_norm": 8.384815216064453, "learning_rate": 1.8675496688741724e-05, "loss": 0.3253, "step": 950 }, { "epoch": 0.33461136284419657, "grad_norm": 4.2581281661987305, "learning_rate": 1.8661554548623215e-05, "loss": 0.3612, "step": 960 }, { "epoch": 0.33809689787382363, "grad_norm": 1.6423273086547852, "learning_rate": 1.8647612408504706e-05, "loss": 0.3876, "step": 970 }, { "epoch": 0.3415824329034507, "grad_norm": 3.153898000717163, "learning_rate": 1.86336702683862e-05, "loss": 0.4319, "step": 980 }, { "epoch": 0.3450679679330777, "grad_norm": 0.48236197233200073, "learning_rate": 1.861972812826769e-05, "loss": 0.1993, "step": 990 }, { "epoch": 0.34855350296270476, "grad_norm": 0.4741288423538208, "learning_rate": 1.8605785988149185e-05, "loss": 0.3256, "step": 1000 }, { "epoch": 0.3520390379923318, "grad_norm": 10.781346321105957, "learning_rate": 1.8591843848030676e-05, "loss": 0.4149, "step": 1010 }, { "epoch": 0.3555245730219589, "grad_norm": 0.4621415436267853, "learning_rate": 1.8577901707912166e-05, "loss": 0.3206, "step": 1020 }, { "epoch": 0.3590101080515859, "grad_norm": 6.963232040405273, "learning_rate": 1.8563959567793657e-05, "loss": 0.3789, "step": 1030 }, { "epoch": 0.36249564308121296, "grad_norm": 6.472696781158447, "learning_rate": 1.8550017427675148e-05, "loss": 0.2999, "step": 1040 }, { "epoch": 0.36598117811084, "grad_norm": 8.689462661743164, "learning_rate": 1.8536075287556642e-05, "loss": 0.2934, "step": 1050 }, { "epoch": 0.3694667131404671, "grad_norm": 0.33935561776161194, "learning_rate": 1.8522133147438133e-05, "loss": 0.2774, "step": 1060 }, { "epoch": 0.3729522481700941, "grad_norm": 10.953472137451172, "learning_rate": 1.8508191007319624e-05, "loss": 0.3193, "step": 1070 }, { "epoch": 0.37643778319972115, "grad_norm": 1.503441333770752, "learning_rate": 1.8494248867201118e-05, "loss": 0.3257, "step": 1080 }, { "epoch": 0.3799233182293482, "grad_norm": 0.4625272750854492, "learning_rate": 1.848030672708261e-05, "loss": 0.2634, "step": 1090 }, { "epoch": 0.3834088532589753, "grad_norm": 11.566400527954102, "learning_rate": 1.84663645869641e-05, "loss": 0.2718, "step": 1100 }, { "epoch": 0.3868943882886023, "grad_norm": 12.47866439819336, "learning_rate": 1.8452422446845594e-05, "loss": 0.3186, "step": 1110 }, { "epoch": 0.39037992331822935, "grad_norm": 8.613419532775879, "learning_rate": 1.8438480306727085e-05, "loss": 0.3122, "step": 1120 }, { "epoch": 0.3938654583478564, "grad_norm": 0.35111886262893677, "learning_rate": 1.8424538166608576e-05, "loss": 0.2495, "step": 1130 }, { "epoch": 0.3973509933774834, "grad_norm": 0.34094560146331787, "learning_rate": 1.841059602649007e-05, "loss": 0.2337, "step": 1140 }, { "epoch": 0.4008365284071105, "grad_norm": 4.428099155426025, "learning_rate": 1.839665388637156e-05, "loss": 0.1903, "step": 1150 }, { "epoch": 0.40432206343673754, "grad_norm": 0.3854966461658478, "learning_rate": 1.838271174625305e-05, "loss": 0.3, "step": 1160 }, { "epoch": 0.4078075984663646, "grad_norm": 38.84294509887695, "learning_rate": 1.8368769606134542e-05, "loss": 0.2134, "step": 1170 }, { "epoch": 0.4112931334959916, "grad_norm": 1.8577314615249634, "learning_rate": 1.8354827466016033e-05, "loss": 0.2276, "step": 1180 }, { "epoch": 0.4147786685256187, "grad_norm": 0.9145134091377258, "learning_rate": 1.8340885325897527e-05, "loss": 0.2989, "step": 1190 }, { "epoch": 0.41826420355524574, "grad_norm": 13.17152214050293, "learning_rate": 1.8326943185779018e-05, "loss": 0.2938, "step": 1200 }, { "epoch": 0.4217497385848728, "grad_norm": 0.34265998005867004, "learning_rate": 1.831300104566051e-05, "loss": 0.2257, "step": 1210 }, { "epoch": 0.4252352736144998, "grad_norm": 10.514459609985352, "learning_rate": 1.8299058905542003e-05, "loss": 0.382, "step": 1220 }, { "epoch": 0.42872080864412687, "grad_norm": 16.24938201904297, "learning_rate": 1.8285116765423494e-05, "loss": 0.3638, "step": 1230 }, { "epoch": 0.43220634367375393, "grad_norm": 2.323594331741333, "learning_rate": 1.8271174625304985e-05, "loss": 0.3058, "step": 1240 }, { "epoch": 0.43569187870338094, "grad_norm": 0.8403987288475037, "learning_rate": 1.825723248518648e-05, "loss": 0.1748, "step": 1250 }, { "epoch": 0.439177413733008, "grad_norm": 0.3924444913864136, "learning_rate": 1.824329034506797e-05, "loss": 0.2051, "step": 1260 }, { "epoch": 0.44266294876263507, "grad_norm": 1.6590919494628906, "learning_rate": 1.822934820494946e-05, "loss": 0.1713, "step": 1270 }, { "epoch": 0.44614848379226213, "grad_norm": 10.351805686950684, "learning_rate": 1.8215406064830955e-05, "loss": 0.2202, "step": 1280 }, { "epoch": 0.44963401882188914, "grad_norm": 13.177727699279785, "learning_rate": 1.8201463924712445e-05, "loss": 0.2005, "step": 1290 }, { "epoch": 0.4531195538515162, "grad_norm": 3.9433481693267822, "learning_rate": 1.8187521784593936e-05, "loss": 0.2948, "step": 1300 }, { "epoch": 0.45660508888114326, "grad_norm": 0.3618289530277252, "learning_rate": 1.8173579644475427e-05, "loss": 0.148, "step": 1310 }, { "epoch": 0.4600906239107703, "grad_norm": 10.462213516235352, "learning_rate": 1.8159637504356918e-05, "loss": 0.2468, "step": 1320 }, { "epoch": 0.46357615894039733, "grad_norm": 0.270712286233902, "learning_rate": 1.8145695364238412e-05, "loss": 0.2283, "step": 1330 }, { "epoch": 0.4670616939700244, "grad_norm": 0.30840155482292175, "learning_rate": 1.8131753224119903e-05, "loss": 0.1058, "step": 1340 }, { "epoch": 0.47054722899965146, "grad_norm": 7.213718414306641, "learning_rate": 1.8117811084001394e-05, "loss": 0.2548, "step": 1350 }, { "epoch": 0.4740327640292785, "grad_norm": 4.371262550354004, "learning_rate": 1.8103868943882888e-05, "loss": 0.2346, "step": 1360 }, { "epoch": 0.4775182990589055, "grad_norm": 5.258789539337158, "learning_rate": 1.808992680376438e-05, "loss": 0.3612, "step": 1370 }, { "epoch": 0.4810038340885326, "grad_norm": 0.29574891924858093, "learning_rate": 1.8075984663645873e-05, "loss": 0.2976, "step": 1380 }, { "epoch": 0.48448936911815965, "grad_norm": 0.7350137233734131, "learning_rate": 1.8062042523527364e-05, "loss": 0.1397, "step": 1390 }, { "epoch": 0.48797490414778666, "grad_norm": 0.22566929459571838, "learning_rate": 1.8048100383408855e-05, "loss": 0.2482, "step": 1400 }, { "epoch": 0.4914604391774137, "grad_norm": 4.66607141494751, "learning_rate": 1.803415824329035e-05, "loss": 0.2287, "step": 1410 }, { "epoch": 0.4949459742070408, "grad_norm": 1.6220591068267822, "learning_rate": 1.802021610317184e-05, "loss": 0.2499, "step": 1420 }, { "epoch": 0.49843150923666785, "grad_norm": 9.882462501525879, "learning_rate": 1.800627396305333e-05, "loss": 0.1941, "step": 1430 }, { "epoch": 0.5019170442662949, "grad_norm": 9.781709671020508, "learning_rate": 1.799233182293482e-05, "loss": 0.2008, "step": 1440 }, { "epoch": 0.5054025792959219, "grad_norm": 20.839242935180664, "learning_rate": 1.7978389682816312e-05, "loss": 0.4964, "step": 1450 }, { "epoch": 0.5088881143255489, "grad_norm": 14.371743202209473, "learning_rate": 1.7964447542697806e-05, "loss": 0.3222, "step": 1460 }, { "epoch": 0.512373649355176, "grad_norm": 0.32212069630622864, "learning_rate": 1.7950505402579297e-05, "loss": 0.2794, "step": 1470 }, { "epoch": 0.515859184384803, "grad_norm": 0.4930780827999115, "learning_rate": 1.7936563262460788e-05, "loss": 0.0921, "step": 1480 }, { "epoch": 0.5193447194144302, "grad_norm": 2.0369691848754883, "learning_rate": 1.7922621122342282e-05, "loss": 0.2109, "step": 1490 }, { "epoch": 0.5228302544440572, "grad_norm": 1.3328112363815308, "learning_rate": 1.7908678982223773e-05, "loss": 0.2685, "step": 1500 }, { "epoch": 0.5263157894736842, "grad_norm": 1.5157321691513062, "learning_rate": 1.7894736842105264e-05, "loss": 0.343, "step": 1510 }, { "epoch": 0.5298013245033113, "grad_norm": 24.59377670288086, "learning_rate": 1.7880794701986758e-05, "loss": 0.257, "step": 1520 }, { "epoch": 0.5332868595329383, "grad_norm": 0.25167328119277954, "learning_rate": 1.786685256186825e-05, "loss": 0.2256, "step": 1530 }, { "epoch": 0.5367723945625653, "grad_norm": 0.31769871711730957, "learning_rate": 1.785291042174974e-05, "loss": 0.2461, "step": 1540 }, { "epoch": 0.5402579295921924, "grad_norm": 4.568899631500244, "learning_rate": 1.7838968281631234e-05, "loss": 0.1961, "step": 1550 }, { "epoch": 0.5437434646218194, "grad_norm": 3.1680665016174316, "learning_rate": 1.7825026141512725e-05, "loss": 0.2128, "step": 1560 }, { "epoch": 0.5472289996514464, "grad_norm": 4.516887187957764, "learning_rate": 1.7811084001394215e-05, "loss": 0.2493, "step": 1570 }, { "epoch": 0.5507145346810736, "grad_norm": 9.322068214416504, "learning_rate": 1.7797141861275706e-05, "loss": 0.1234, "step": 1580 }, { "epoch": 0.5542000697107006, "grad_norm": 0.5057626366615295, "learning_rate": 1.7783199721157197e-05, "loss": 0.1955, "step": 1590 }, { "epoch": 0.5576856047403277, "grad_norm": 3.936608076095581, "learning_rate": 1.776925758103869e-05, "loss": 0.4223, "step": 1600 }, { "epoch": 0.5611711397699547, "grad_norm": 1.4608641862869263, "learning_rate": 1.7755315440920182e-05, "loss": 0.2389, "step": 1610 }, { "epoch": 0.5646566747995817, "grad_norm": 0.2644886076450348, "learning_rate": 1.7741373300801673e-05, "loss": 0.2196, "step": 1620 }, { "epoch": 0.5681422098292088, "grad_norm": 8.523164749145508, "learning_rate": 1.7727431160683167e-05, "loss": 0.4851, "step": 1630 }, { "epoch": 0.5716277448588358, "grad_norm": 0.24561679363250732, "learning_rate": 1.7713489020564658e-05, "loss": 0.1291, "step": 1640 }, { "epoch": 0.5751132798884628, "grad_norm": 3.906648874282837, "learning_rate": 1.769954688044615e-05, "loss": 0.1317, "step": 1650 }, { "epoch": 0.57859881491809, "grad_norm": 12.78685474395752, "learning_rate": 1.7685604740327643e-05, "loss": 0.2986, "step": 1660 }, { "epoch": 0.582084349947717, "grad_norm": 0.1987033486366272, "learning_rate": 1.7671662600209134e-05, "loss": 0.165, "step": 1670 }, { "epoch": 0.585569884977344, "grad_norm": 29.639982223510742, "learning_rate": 1.7657720460090625e-05, "loss": 0.1937, "step": 1680 }, { "epoch": 0.5890554200069711, "grad_norm": 3.289419174194336, "learning_rate": 1.764377831997212e-05, "loss": 0.0802, "step": 1690 }, { "epoch": 0.5925409550365981, "grad_norm": 0.2210395336151123, "learning_rate": 1.762983617985361e-05, "loss": 0.2471, "step": 1700 }, { "epoch": 0.5960264900662252, "grad_norm": 23.282852172851562, "learning_rate": 1.76158940397351e-05, "loss": 0.2112, "step": 1710 }, { "epoch": 0.5995120250958522, "grad_norm": 0.22058001160621643, "learning_rate": 1.760195189961659e-05, "loss": 0.0758, "step": 1720 }, { "epoch": 0.6029975601254792, "grad_norm": 7.556725025177002, "learning_rate": 1.7588009759498082e-05, "loss": 0.1733, "step": 1730 }, { "epoch": 0.6064830951551063, "grad_norm": 13.092087745666504, "learning_rate": 1.7574067619379576e-05, "loss": 0.118, "step": 1740 }, { "epoch": 0.6099686301847334, "grad_norm": 2.1389198303222656, "learning_rate": 1.7560125479261067e-05, "loss": 0.1971, "step": 1750 }, { "epoch": 0.6134541652143604, "grad_norm": 2.126339912414551, "learning_rate": 1.754618333914256e-05, "loss": 0.3391, "step": 1760 }, { "epoch": 0.6169397002439875, "grad_norm": 0.7699007987976074, "learning_rate": 1.7532241199024052e-05, "loss": 0.299, "step": 1770 }, { "epoch": 0.6204252352736145, "grad_norm": 0.1900128424167633, "learning_rate": 1.7518299058905543e-05, "loss": 0.1021, "step": 1780 }, { "epoch": 0.6239107703032416, "grad_norm": 20.255779266357422, "learning_rate": 1.7504356918787037e-05, "loss": 0.158, "step": 1790 }, { "epoch": 0.6273963053328686, "grad_norm": 0.18125827610492706, "learning_rate": 1.7490414778668528e-05, "loss": 0.2431, "step": 1800 }, { "epoch": 0.6308818403624956, "grad_norm": 0.5560604929924011, "learning_rate": 1.747647263855002e-05, "loss": 0.281, "step": 1810 }, { "epoch": 0.6343673753921227, "grad_norm": 0.6502192616462708, "learning_rate": 1.7462530498431513e-05, "loss": 0.1709, "step": 1820 }, { "epoch": 0.6378529104217497, "grad_norm": 3.676496982574463, "learning_rate": 1.7448588358313004e-05, "loss": 0.3918, "step": 1830 }, { "epoch": 0.6413384454513767, "grad_norm": 8.176740646362305, "learning_rate": 1.7434646218194494e-05, "loss": 0.1222, "step": 1840 }, { "epoch": 0.6448239804810039, "grad_norm": 1.6892282962799072, "learning_rate": 1.7420704078075985e-05, "loss": 0.3701, "step": 1850 }, { "epoch": 0.6483095155106309, "grad_norm": 3.25469708442688, "learning_rate": 1.7406761937957476e-05, "loss": 0.2383, "step": 1860 }, { "epoch": 0.6517950505402579, "grad_norm": 1.433481216430664, "learning_rate": 1.739281979783897e-05, "loss": 0.1936, "step": 1870 }, { "epoch": 0.655280585569885, "grad_norm": 0.20236587524414062, "learning_rate": 1.737887765772046e-05, "loss": 0.4097, "step": 1880 }, { "epoch": 0.658766120599512, "grad_norm": 3.3700122833251953, "learning_rate": 1.7364935517601952e-05, "loss": 0.2351, "step": 1890 }, { "epoch": 0.6622516556291391, "grad_norm": 10.565374374389648, "learning_rate": 1.7350993377483446e-05, "loss": 0.3335, "step": 1900 }, { "epoch": 0.6657371906587661, "grad_norm": 0.19991923868656158, "learning_rate": 1.7337051237364937e-05, "loss": 0.4573, "step": 1910 }, { "epoch": 0.6692227256883931, "grad_norm": 2.1597394943237305, "learning_rate": 1.7323109097246428e-05, "loss": 0.1396, "step": 1920 }, { "epoch": 0.6727082607180203, "grad_norm": 23.80716896057129, "learning_rate": 1.7309166957127922e-05, "loss": 0.2434, "step": 1930 }, { "epoch": 0.6761937957476473, "grad_norm": 0.18005388975143433, "learning_rate": 1.7295224817009413e-05, "loss": 0.2398, "step": 1940 }, { "epoch": 0.6796793307772743, "grad_norm": 0.24591070413589478, "learning_rate": 1.7281282676890904e-05, "loss": 0.1673, "step": 1950 }, { "epoch": 0.6831648658069014, "grad_norm": 0.35577160120010376, "learning_rate": 1.7267340536772398e-05, "loss": 0.1134, "step": 1960 }, { "epoch": 0.6866504008365284, "grad_norm": 0.14578354358673096, "learning_rate": 1.725339839665389e-05, "loss": 0.1394, "step": 1970 }, { "epoch": 0.6901359358661554, "grad_norm": 0.15032856166362762, "learning_rate": 1.723945625653538e-05, "loss": 0.2795, "step": 1980 }, { "epoch": 0.6936214708957825, "grad_norm": 0.15948134660720825, "learning_rate": 1.722551411641687e-05, "loss": 0.1617, "step": 1990 }, { "epoch": 0.6971070059254095, "grad_norm": 1.1098570823669434, "learning_rate": 1.721157197629836e-05, "loss": 0.1358, "step": 2000 }, { "epoch": 0.7005925409550366, "grad_norm": 14.691607475280762, "learning_rate": 1.7197629836179855e-05, "loss": 0.2749, "step": 2010 }, { "epoch": 0.7040780759846637, "grad_norm": 9.642597198486328, "learning_rate": 1.7183687696061346e-05, "loss": 0.2413, "step": 2020 }, { "epoch": 0.7075636110142907, "grad_norm": 0.25982531905174255, "learning_rate": 1.7169745555942837e-05, "loss": 0.2381, "step": 2030 }, { "epoch": 0.7110491460439178, "grad_norm": 11.684551239013672, "learning_rate": 1.715580341582433e-05, "loss": 0.2735, "step": 2040 }, { "epoch": 0.7145346810735448, "grad_norm": 13.072251319885254, "learning_rate": 1.7141861275705822e-05, "loss": 0.1853, "step": 2050 }, { "epoch": 0.7180202161031718, "grad_norm": 4.4599504470825195, "learning_rate": 1.7127919135587313e-05, "loss": 0.0791, "step": 2060 }, { "epoch": 0.7215057511327989, "grad_norm": 11.390928268432617, "learning_rate": 1.7113976995468807e-05, "loss": 0.363, "step": 2070 }, { "epoch": 0.7249912861624259, "grad_norm": 24.80426788330078, "learning_rate": 1.7100034855350298e-05, "loss": 0.1271, "step": 2080 }, { "epoch": 0.7284768211920529, "grad_norm": 4.6373724937438965, "learning_rate": 1.708609271523179e-05, "loss": 0.2046, "step": 2090 }, { "epoch": 0.73196235622168, "grad_norm": 18.688011169433594, "learning_rate": 1.7072150575113283e-05, "loss": 0.3764, "step": 2100 }, { "epoch": 0.735447891251307, "grad_norm": 0.1868332177400589, "learning_rate": 1.7058208434994774e-05, "loss": 0.2338, "step": 2110 }, { "epoch": 0.7389334262809342, "grad_norm": 0.9303812980651855, "learning_rate": 1.7044266294876264e-05, "loss": 0.1955, "step": 2120 }, { "epoch": 0.7424189613105612, "grad_norm": 20.024219512939453, "learning_rate": 1.7030324154757755e-05, "loss": 0.2066, "step": 2130 }, { "epoch": 0.7459044963401882, "grad_norm": 0.41837865114212036, "learning_rate": 1.701638201463925e-05, "loss": 0.43, "step": 2140 }, { "epoch": 0.7493900313698153, "grad_norm": 0.2002245932817459, "learning_rate": 1.700243987452074e-05, "loss": 0.2395, "step": 2150 }, { "epoch": 0.7528755663994423, "grad_norm": 16.88410186767578, "learning_rate": 1.698849773440223e-05, "loss": 0.3061, "step": 2160 }, { "epoch": 0.7563611014290693, "grad_norm": 13.094812393188477, "learning_rate": 1.6974555594283725e-05, "loss": 0.2037, "step": 2170 }, { "epoch": 0.7598466364586964, "grad_norm": 0.489964097738266, "learning_rate": 1.6960613454165216e-05, "loss": 0.1176, "step": 2180 }, { "epoch": 0.7633321714883234, "grad_norm": 0.36262962222099304, "learning_rate": 1.6946671314046707e-05, "loss": 0.1996, "step": 2190 }, { "epoch": 0.7668177065179506, "grad_norm": 13.340808868408203, "learning_rate": 1.69327291739282e-05, "loss": 0.3046, "step": 2200 }, { "epoch": 0.7703032415475776, "grad_norm": 4.607888221740723, "learning_rate": 1.6918787033809692e-05, "loss": 0.2871, "step": 2210 }, { "epoch": 0.7737887765772046, "grad_norm": 0.15974846482276917, "learning_rate": 1.6904844893691183e-05, "loss": 0.2577, "step": 2220 }, { "epoch": 0.7772743116068317, "grad_norm": 0.5607108473777771, "learning_rate": 1.6890902753572677e-05, "loss": 0.1506, "step": 2230 }, { "epoch": 0.7807598466364587, "grad_norm": 0.22983232140541077, "learning_rate": 1.6876960613454168e-05, "loss": 0.0976, "step": 2240 }, { "epoch": 0.7842453816660857, "grad_norm": 0.15347087383270264, "learning_rate": 1.686301847333566e-05, "loss": 0.1101, "step": 2250 }, { "epoch": 0.7877309166957128, "grad_norm": 0.15394413471221924, "learning_rate": 1.684907633321715e-05, "loss": 0.2706, "step": 2260 }, { "epoch": 0.7912164517253398, "grad_norm": 16.60938262939453, "learning_rate": 1.683513419309864e-05, "loss": 0.3779, "step": 2270 }, { "epoch": 0.7947019867549668, "grad_norm": 19.347156524658203, "learning_rate": 1.6821192052980134e-05, "loss": 0.2697, "step": 2280 }, { "epoch": 0.798187521784594, "grad_norm": 18.826566696166992, "learning_rate": 1.6807249912861625e-05, "loss": 0.1772, "step": 2290 }, { "epoch": 0.801673056814221, "grad_norm": 6.404747009277344, "learning_rate": 1.6793307772743116e-05, "loss": 0.1967, "step": 2300 }, { "epoch": 0.8051585918438481, "grad_norm": 2.8832345008850098, "learning_rate": 1.677936563262461e-05, "loss": 0.2426, "step": 2310 }, { "epoch": 0.8086441268734751, "grad_norm": 8.400115966796875, "learning_rate": 1.67654234925061e-05, "loss": 0.1401, "step": 2320 }, { "epoch": 0.8121296619031021, "grad_norm": 3.5098154544830322, "learning_rate": 1.6751481352387592e-05, "loss": 0.1521, "step": 2330 }, { "epoch": 0.8156151969327292, "grad_norm": 7.560535907745361, "learning_rate": 1.6737539212269086e-05, "loss": 0.2052, "step": 2340 }, { "epoch": 0.8191007319623562, "grad_norm": 20.31093978881836, "learning_rate": 1.6723597072150577e-05, "loss": 0.2422, "step": 2350 }, { "epoch": 0.8225862669919832, "grad_norm": 0.14886406064033508, "learning_rate": 1.6709654932032068e-05, "loss": 0.3423, "step": 2360 }, { "epoch": 0.8260718020216103, "grad_norm": 0.16942627727985382, "learning_rate": 1.6695712791913562e-05, "loss": 0.1422, "step": 2370 }, { "epoch": 0.8295573370512374, "grad_norm": 2.075434446334839, "learning_rate": 1.6681770651795053e-05, "loss": 0.2055, "step": 2380 }, { "epoch": 0.8330428720808644, "grad_norm": 12.085251808166504, "learning_rate": 1.6667828511676544e-05, "loss": 0.1778, "step": 2390 }, { "epoch": 0.8365284071104915, "grad_norm": 9.108433723449707, "learning_rate": 1.6653886371558034e-05, "loss": 0.2636, "step": 2400 }, { "epoch": 0.8400139421401185, "grad_norm": 0.22255125641822815, "learning_rate": 1.6639944231439525e-05, "loss": 0.1163, "step": 2410 }, { "epoch": 0.8434994771697456, "grad_norm": 0.34382200241088867, "learning_rate": 1.662600209132102e-05, "loss": 0.3501, "step": 2420 }, { "epoch": 0.8469850121993726, "grad_norm": 3.8618369102478027, "learning_rate": 1.661205995120251e-05, "loss": 0.3013, "step": 2430 }, { "epoch": 0.8504705472289996, "grad_norm": 3.9153378009796143, "learning_rate": 1.6598117811084e-05, "loss": 0.2558, "step": 2440 }, { "epoch": 0.8539560822586267, "grad_norm": 6.622893810272217, "learning_rate": 1.6584175670965495e-05, "loss": 0.1465, "step": 2450 }, { "epoch": 0.8574416172882537, "grad_norm": 0.25614631175994873, "learning_rate": 1.6570233530846986e-05, "loss": 0.3136, "step": 2460 }, { "epoch": 0.8609271523178808, "grad_norm": 1.1627339124679565, "learning_rate": 1.6556291390728477e-05, "loss": 0.1084, "step": 2470 }, { "epoch": 0.8644126873475079, "grad_norm": 3.3835268020629883, "learning_rate": 1.654234925060997e-05, "loss": 0.4099, "step": 2480 }, { "epoch": 0.8678982223771349, "grad_norm": 15.521218299865723, "learning_rate": 1.6528407110491462e-05, "loss": 0.1028, "step": 2490 }, { "epoch": 0.8713837574067619, "grad_norm": 0.2716628313064575, "learning_rate": 1.6514464970372953e-05, "loss": 0.1371, "step": 2500 }, { "epoch": 0.874869292436389, "grad_norm": 0.11692991107702255, "learning_rate": 1.6500522830254447e-05, "loss": 0.2051, "step": 2510 }, { "epoch": 0.878354827466016, "grad_norm": 5.96846866607666, "learning_rate": 1.6486580690135938e-05, "loss": 0.1758, "step": 2520 }, { "epoch": 0.8818403624956431, "grad_norm": 3.1128947734832764, "learning_rate": 1.647263855001743e-05, "loss": 0.3391, "step": 2530 }, { "epoch": 0.8853258975252701, "grad_norm": 0.7964233756065369, "learning_rate": 1.645869640989892e-05, "loss": 0.3088, "step": 2540 }, { "epoch": 0.8888114325548971, "grad_norm": 0.2802188992500305, "learning_rate": 1.6444754269780413e-05, "loss": 0.2082, "step": 2550 }, { "epoch": 0.8922969675845243, "grad_norm": 0.3661327362060547, "learning_rate": 1.6430812129661904e-05, "loss": 0.231, "step": 2560 }, { "epoch": 0.8957825026141513, "grad_norm": 6.788111686706543, "learning_rate": 1.6416869989543395e-05, "loss": 0.183, "step": 2570 }, { "epoch": 0.8992680376437783, "grad_norm": 17.159854888916016, "learning_rate": 1.640292784942489e-05, "loss": 0.2526, "step": 2580 }, { "epoch": 0.9027535726734054, "grad_norm": 23.243045806884766, "learning_rate": 1.638898570930638e-05, "loss": 0.2203, "step": 2590 }, { "epoch": 0.9062391077030324, "grad_norm": 0.10591837018728256, "learning_rate": 1.637504356918787e-05, "loss": 0.2705, "step": 2600 }, { "epoch": 0.9097246427326595, "grad_norm": 8.592724800109863, "learning_rate": 1.6361101429069365e-05, "loss": 0.1686, "step": 2610 }, { "epoch": 0.9132101777622865, "grad_norm": 0.5851829051971436, "learning_rate": 1.6347159288950856e-05, "loss": 0.1452, "step": 2620 }, { "epoch": 0.9166957127919135, "grad_norm": 0.5782294869422913, "learning_rate": 1.6333217148832347e-05, "loss": 0.1843, "step": 2630 }, { "epoch": 0.9201812478215406, "grad_norm": 27.61756134033203, "learning_rate": 1.631927500871384e-05, "loss": 0.1058, "step": 2640 }, { "epoch": 0.9236667828511677, "grad_norm": 0.1436556726694107, "learning_rate": 1.6305332868595332e-05, "loss": 0.1947, "step": 2650 }, { "epoch": 0.9271523178807947, "grad_norm": 0.11741270869970322, "learning_rate": 1.6291390728476823e-05, "loss": 0.2313, "step": 2660 }, { "epoch": 0.9306378529104218, "grad_norm": 6.36224889755249, "learning_rate": 1.6277448588358313e-05, "loss": 0.291, "step": 2670 }, { "epoch": 0.9341233879400488, "grad_norm": 0.2954885959625244, "learning_rate": 1.6263506448239804e-05, "loss": 0.3318, "step": 2680 }, { "epoch": 0.9376089229696758, "grad_norm": 13.240195274353027, "learning_rate": 1.62495643081213e-05, "loss": 0.3679, "step": 2690 }, { "epoch": 0.9410944579993029, "grad_norm": 24.815946578979492, "learning_rate": 1.623562216800279e-05, "loss": 0.3279, "step": 2700 }, { "epoch": 0.9445799930289299, "grad_norm": 0.11210108548402786, "learning_rate": 1.622168002788428e-05, "loss": 0.2753, "step": 2710 }, { "epoch": 0.948065528058557, "grad_norm": 3.065356969833374, "learning_rate": 1.6207737887765774e-05, "loss": 0.1844, "step": 2720 }, { "epoch": 0.951551063088184, "grad_norm": 1.7035739421844482, "learning_rate": 1.6193795747647265e-05, "loss": 0.2426, "step": 2730 }, { "epoch": 0.955036598117811, "grad_norm": 23.689104080200195, "learning_rate": 1.6179853607528756e-05, "loss": 0.3324, "step": 2740 }, { "epoch": 0.9585221331474382, "grad_norm": 8.64907169342041, "learning_rate": 1.616591146741025e-05, "loss": 0.1349, "step": 2750 }, { "epoch": 0.9620076681770652, "grad_norm": 0.21376356482505798, "learning_rate": 1.615196932729174e-05, "loss": 0.1536, "step": 2760 }, { "epoch": 0.9654932032066922, "grad_norm": 21.229475021362305, "learning_rate": 1.6138027187173232e-05, "loss": 0.3432, "step": 2770 }, { "epoch": 0.9689787382363193, "grad_norm": 0.1763678640127182, "learning_rate": 1.6124085047054726e-05, "loss": 0.121, "step": 2780 }, { "epoch": 0.9724642732659463, "grad_norm": 0.35627982020378113, "learning_rate": 1.6110142906936217e-05, "loss": 0.2114, "step": 2790 }, { "epoch": 0.9759498082955733, "grad_norm": 0.675436794757843, "learning_rate": 1.6096200766817708e-05, "loss": 0.2136, "step": 2800 }, { "epoch": 0.9794353433252004, "grad_norm": 0.7453556656837463, "learning_rate": 1.60822586266992e-05, "loss": 0.1687, "step": 2810 }, { "epoch": 0.9829208783548274, "grad_norm": 1.4391916990280151, "learning_rate": 1.606831648658069e-05, "loss": 0.1633, "step": 2820 }, { "epoch": 0.9864064133844546, "grad_norm": 0.10471702367067337, "learning_rate": 1.6054374346462183e-05, "loss": 0.1795, "step": 2830 }, { "epoch": 0.9898919484140816, "grad_norm": 2.418611764907837, "learning_rate": 1.6040432206343674e-05, "loss": 0.1899, "step": 2840 }, { "epoch": 0.9933774834437086, "grad_norm": 6.218674659729004, "learning_rate": 1.6026490066225165e-05, "loss": 0.1613, "step": 2850 }, { "epoch": 0.9968630184733357, "grad_norm": 23.257627487182617, "learning_rate": 1.601254792610666e-05, "loss": 0.1829, "step": 2860 }, { "epoch": 1.0, "eval_accuracy": 0.9686419753086419, "eval_loss": 0.131916344165802, "eval_runtime": 20.8607, "eval_samples_per_second": 194.145, "eval_steps_per_second": 24.304, "step": 2869 }, { "epoch": 1.0003485535029628, "grad_norm": 13.868711471557617, "learning_rate": 1.599860578598815e-05, "loss": 0.3279, "step": 2870 }, { "epoch": 1.0038340885325898, "grad_norm": 0.0889001339673996, "learning_rate": 1.598466364586964e-05, "loss": 0.1179, "step": 2880 }, { "epoch": 1.0073196235622168, "grad_norm": 7.238311767578125, "learning_rate": 1.5970721505751135e-05, "loss": 0.1339, "step": 2890 }, { "epoch": 1.0108051585918438, "grad_norm": 9.563148498535156, "learning_rate": 1.5956779365632626e-05, "loss": 0.2212, "step": 2900 }, { "epoch": 1.0142906936214708, "grad_norm": 10.648134231567383, "learning_rate": 1.5942837225514117e-05, "loss": 0.232, "step": 2910 }, { "epoch": 1.0177762286510978, "grad_norm": 0.2760317325592041, "learning_rate": 1.592889508539561e-05, "loss": 0.1669, "step": 2920 }, { "epoch": 1.021261763680725, "grad_norm": 0.15449553728103638, "learning_rate": 1.5914952945277102e-05, "loss": 0.2226, "step": 2930 }, { "epoch": 1.024747298710352, "grad_norm": 0.09482862800359726, "learning_rate": 1.5901010805158593e-05, "loss": 0.1987, "step": 2940 }, { "epoch": 1.028232833739979, "grad_norm": 1.185239553451538, "learning_rate": 1.5887068665040083e-05, "loss": 0.2861, "step": 2950 }, { "epoch": 1.031718368769606, "grad_norm": 0.9025907516479492, "learning_rate": 1.5873126524921578e-05, "loss": 0.1047, "step": 2960 }, { "epoch": 1.035203903799233, "grad_norm": 21.41613006591797, "learning_rate": 1.585918438480307e-05, "loss": 0.2394, "step": 2970 }, { "epoch": 1.0386894388288603, "grad_norm": 25.966737747192383, "learning_rate": 1.584524224468456e-05, "loss": 0.1353, "step": 2980 }, { "epoch": 1.0421749738584873, "grad_norm": 0.21152754127979279, "learning_rate": 1.5831300104566053e-05, "loss": 0.3361, "step": 2990 }, { "epoch": 1.0456605088881143, "grad_norm": 0.37001797556877136, "learning_rate": 1.5817357964447544e-05, "loss": 0.2875, "step": 3000 }, { "epoch": 1.0491460439177414, "grad_norm": 0.16642731428146362, "learning_rate": 1.5803415824329035e-05, "loss": 0.2697, "step": 3010 }, { "epoch": 1.0526315789473684, "grad_norm": 0.22467225790023804, "learning_rate": 1.578947368421053e-05, "loss": 0.1819, "step": 3020 }, { "epoch": 1.0561171139769954, "grad_norm": 0.6170486807823181, "learning_rate": 1.577553154409202e-05, "loss": 0.0742, "step": 3030 }, { "epoch": 1.0596026490066226, "grad_norm": 7.1253862380981445, "learning_rate": 1.576158940397351e-05, "loss": 0.2141, "step": 3040 }, { "epoch": 1.0630881840362496, "grad_norm": 0.6570123434066772, "learning_rate": 1.5747647263855005e-05, "loss": 0.1353, "step": 3050 }, { "epoch": 1.0665737190658766, "grad_norm": 1.323370337486267, "learning_rate": 1.5733705123736496e-05, "loss": 0.2516, "step": 3060 }, { "epoch": 1.0700592540955036, "grad_norm": 0.3584739863872528, "learning_rate": 1.5719762983617987e-05, "loss": 0.1877, "step": 3070 }, { "epoch": 1.0735447891251306, "grad_norm": 26.268049240112305, "learning_rate": 1.5705820843499477e-05, "loss": 0.1618, "step": 3080 }, { "epoch": 1.0770303241547579, "grad_norm": 1.0968873500823975, "learning_rate": 1.5691878703380968e-05, "loss": 0.2546, "step": 3090 }, { "epoch": 1.0805158591843849, "grad_norm": 0.08166905492544174, "learning_rate": 1.5677936563262463e-05, "loss": 0.1949, "step": 3100 }, { "epoch": 1.0840013942140119, "grad_norm": 0.12049412727355957, "learning_rate": 1.5663994423143953e-05, "loss": 0.2507, "step": 3110 }, { "epoch": 1.0874869292436389, "grad_norm": 14.46548080444336, "learning_rate": 1.5650052283025444e-05, "loss": 0.1774, "step": 3120 }, { "epoch": 1.0909724642732659, "grad_norm": 3.6209402084350586, "learning_rate": 1.563611014290694e-05, "loss": 0.162, "step": 3130 }, { "epoch": 1.094457999302893, "grad_norm": 0.8061731457710266, "learning_rate": 1.562216800278843e-05, "loss": 0.117, "step": 3140 }, { "epoch": 1.0979435343325201, "grad_norm": 0.17544050514698029, "learning_rate": 1.560822586266992e-05, "loss": 0.1057, "step": 3150 }, { "epoch": 1.1014290693621471, "grad_norm": 0.06735656410455704, "learning_rate": 1.5594283722551414e-05, "loss": 0.2237, "step": 3160 }, { "epoch": 1.1049146043917741, "grad_norm": 12.56591510772705, "learning_rate": 1.5580341582432905e-05, "loss": 0.1499, "step": 3170 }, { "epoch": 1.1084001394214011, "grad_norm": 1.4435107707977295, "learning_rate": 1.5566399442314396e-05, "loss": 0.3691, "step": 3180 }, { "epoch": 1.1118856744510281, "grad_norm": 0.12776677310466766, "learning_rate": 1.555245730219589e-05, "loss": 0.1918, "step": 3190 }, { "epoch": 1.1153712094806554, "grad_norm": 0.09562991559505463, "learning_rate": 1.553851516207738e-05, "loss": 0.4167, "step": 3200 }, { "epoch": 1.1188567445102824, "grad_norm": 1.6694416999816895, "learning_rate": 1.552457302195887e-05, "loss": 0.1662, "step": 3210 }, { "epoch": 1.1223422795399094, "grad_norm": 16.271982192993164, "learning_rate": 1.5510630881840362e-05, "loss": 0.238, "step": 3220 }, { "epoch": 1.1258278145695364, "grad_norm": 10.23219108581543, "learning_rate": 1.5496688741721853e-05, "loss": 0.2752, "step": 3230 }, { "epoch": 1.1293133495991634, "grad_norm": 2.7908616065979004, "learning_rate": 1.5482746601603347e-05, "loss": 0.058, "step": 3240 }, { "epoch": 1.1327988846287904, "grad_norm": 0.17575271427631378, "learning_rate": 1.5468804461484838e-05, "loss": 0.2244, "step": 3250 }, { "epoch": 1.1362844196584176, "grad_norm": 1.6930599212646484, "learning_rate": 1.545486232136633e-05, "loss": 0.2232, "step": 3260 }, { "epoch": 1.1397699546880447, "grad_norm": 1.9705103635787964, "learning_rate": 1.5440920181247823e-05, "loss": 0.1543, "step": 3270 }, { "epoch": 1.1432554897176717, "grad_norm": 11.21917724609375, "learning_rate": 1.5426978041129314e-05, "loss": 0.3028, "step": 3280 }, { "epoch": 1.1467410247472987, "grad_norm": 4.643821716308594, "learning_rate": 1.5413035901010805e-05, "loss": 0.303, "step": 3290 }, { "epoch": 1.1502265597769257, "grad_norm": 19.168472290039062, "learning_rate": 1.53990937608923e-05, "loss": 0.2017, "step": 3300 }, { "epoch": 1.153712094806553, "grad_norm": 0.08442903310060501, "learning_rate": 1.538515162077379e-05, "loss": 0.0612, "step": 3310 }, { "epoch": 1.15719762983618, "grad_norm": 0.07782625406980515, "learning_rate": 1.5371209480655284e-05, "loss": 0.1009, "step": 3320 }, { "epoch": 1.160683164865807, "grad_norm": 17.379108428955078, "learning_rate": 1.5357267340536775e-05, "loss": 0.205, "step": 3330 }, { "epoch": 1.164168699895434, "grad_norm": 0.07793194055557251, "learning_rate": 1.5343325200418266e-05, "loss": 0.261, "step": 3340 }, { "epoch": 1.167654234925061, "grad_norm": 0.10120818763971329, "learning_rate": 1.5329383060299757e-05, "loss": 0.1319, "step": 3350 }, { "epoch": 1.1711397699546882, "grad_norm": 13.039447784423828, "learning_rate": 1.5315440920181247e-05, "loss": 0.2302, "step": 3360 }, { "epoch": 1.1746253049843152, "grad_norm": 0.0781761035323143, "learning_rate": 1.530149878006274e-05, "loss": 0.1687, "step": 3370 }, { "epoch": 1.1781108400139422, "grad_norm": 5.389220237731934, "learning_rate": 1.5287556639944232e-05, "loss": 0.1653, "step": 3380 }, { "epoch": 1.1815963750435692, "grad_norm": 16.386268615722656, "learning_rate": 1.5273614499825723e-05, "loss": 0.2369, "step": 3390 }, { "epoch": 1.1850819100731962, "grad_norm": 0.0885128378868103, "learning_rate": 1.5259672359707217e-05, "loss": 0.1952, "step": 3400 }, { "epoch": 1.1885674451028232, "grad_norm": 0.17366395890712738, "learning_rate": 1.5245730219588708e-05, "loss": 0.2546, "step": 3410 }, { "epoch": 1.1920529801324504, "grad_norm": 0.10797934234142303, "learning_rate": 1.52317880794702e-05, "loss": 0.1879, "step": 3420 }, { "epoch": 1.1955385151620774, "grad_norm": 5.789648532867432, "learning_rate": 1.5217845939351692e-05, "loss": 0.2316, "step": 3430 }, { "epoch": 1.1990240501917044, "grad_norm": 0.19041916728019714, "learning_rate": 1.5203903799233184e-05, "loss": 0.3066, "step": 3440 }, { "epoch": 1.2025095852213314, "grad_norm": 0.12138816714286804, "learning_rate": 1.5189961659114677e-05, "loss": 0.0508, "step": 3450 }, { "epoch": 1.2059951202509585, "grad_norm": 0.9047268629074097, "learning_rate": 1.5176019518996167e-05, "loss": 0.2256, "step": 3460 }, { "epoch": 1.2094806552805855, "grad_norm": 15.816200256347656, "learning_rate": 1.516207737887766e-05, "loss": 0.2817, "step": 3470 }, { "epoch": 1.2129661903102127, "grad_norm": 0.109500952064991, "learning_rate": 1.5148135238759152e-05, "loss": 0.0286, "step": 3480 }, { "epoch": 1.2164517253398397, "grad_norm": 1.6199902296066284, "learning_rate": 1.5134193098640642e-05, "loss": 0.124, "step": 3490 }, { "epoch": 1.2199372603694667, "grad_norm": 0.5839070677757263, "learning_rate": 1.5120250958522134e-05, "loss": 0.1435, "step": 3500 }, { "epoch": 1.2234227953990937, "grad_norm": 0.088262178003788, "learning_rate": 1.5106308818403625e-05, "loss": 0.1329, "step": 3510 }, { "epoch": 1.2269083304287207, "grad_norm": 19.781274795532227, "learning_rate": 1.5092366678285117e-05, "loss": 0.105, "step": 3520 }, { "epoch": 1.230393865458348, "grad_norm": 0.21191132068634033, "learning_rate": 1.507842453816661e-05, "loss": 0.2493, "step": 3530 }, { "epoch": 1.233879400487975, "grad_norm": 12.934003829956055, "learning_rate": 1.50644823980481e-05, "loss": 0.185, "step": 3540 }, { "epoch": 1.237364935517602, "grad_norm": 8.322224617004395, "learning_rate": 1.5050540257929593e-05, "loss": 0.2953, "step": 3550 }, { "epoch": 1.240850470547229, "grad_norm": 0.5193243622779846, "learning_rate": 1.5036598117811086e-05, "loss": 0.2215, "step": 3560 }, { "epoch": 1.244336005576856, "grad_norm": 12.667407989501953, "learning_rate": 1.5022655977692577e-05, "loss": 0.1893, "step": 3570 }, { "epoch": 1.2478215406064832, "grad_norm": 0.07419061660766602, "learning_rate": 1.5008713837574069e-05, "loss": 0.1897, "step": 3580 }, { "epoch": 1.2513070756361102, "grad_norm": 14.40754222869873, "learning_rate": 1.4994771697455562e-05, "loss": 0.4052, "step": 3590 }, { "epoch": 1.2547926106657372, "grad_norm": 3.1872828006744385, "learning_rate": 1.4980829557337054e-05, "loss": 0.1727, "step": 3600 }, { "epoch": 1.2582781456953642, "grad_norm": 28.886524200439453, "learning_rate": 1.4966887417218545e-05, "loss": 0.5003, "step": 3610 }, { "epoch": 1.2617636807249912, "grad_norm": 0.11840394884347916, "learning_rate": 1.4952945277100037e-05, "loss": 0.1676, "step": 3620 }, { "epoch": 1.2652492157546185, "grad_norm": 13.540458679199219, "learning_rate": 1.4939003136981527e-05, "loss": 0.2395, "step": 3630 }, { "epoch": 1.2687347507842452, "grad_norm": 0.23518075048923492, "learning_rate": 1.4925060996863019e-05, "loss": 0.3769, "step": 3640 }, { "epoch": 1.2722202858138725, "grad_norm": 0.5583080053329468, "learning_rate": 1.4911118856744512e-05, "loss": 0.2068, "step": 3650 }, { "epoch": 1.2757058208434995, "grad_norm": 3.620421886444092, "learning_rate": 1.4897176716626002e-05, "loss": 0.2443, "step": 3660 }, { "epoch": 1.2791913558731265, "grad_norm": 14.638289451599121, "learning_rate": 1.4883234576507495e-05, "loss": 0.0804, "step": 3670 }, { "epoch": 1.2826768909027535, "grad_norm": 0.17217248678207397, "learning_rate": 1.4869292436388987e-05, "loss": 0.1282, "step": 3680 }, { "epoch": 1.2861624259323805, "grad_norm": 0.0552227608859539, "learning_rate": 1.4855350296270478e-05, "loss": 0.0728, "step": 3690 }, { "epoch": 1.2896479609620077, "grad_norm": 0.18412978947162628, "learning_rate": 1.484140815615197e-05, "loss": 0.3107, "step": 3700 }, { "epoch": 1.2931334959916347, "grad_norm": 1.4496484994888306, "learning_rate": 1.4827466016033463e-05, "loss": 0.2406, "step": 3710 }, { "epoch": 1.2966190310212617, "grad_norm": 16.918373107910156, "learning_rate": 1.4813523875914954e-05, "loss": 0.3134, "step": 3720 }, { "epoch": 1.3001045660508888, "grad_norm": 5.249573230743408, "learning_rate": 1.4799581735796447e-05, "loss": 0.1284, "step": 3730 }, { "epoch": 1.3035901010805158, "grad_norm": 19.409761428833008, "learning_rate": 1.4785639595677939e-05, "loss": 0.0873, "step": 3740 }, { "epoch": 1.307075636110143, "grad_norm": 0.08998361974954605, "learning_rate": 1.477169745555943e-05, "loss": 0.2249, "step": 3750 }, { "epoch": 1.31056117113977, "grad_norm": 4.676770210266113, "learning_rate": 1.475775531544092e-05, "loss": 0.2761, "step": 3760 }, { "epoch": 1.314046706169397, "grad_norm": 4.773134708404541, "learning_rate": 1.4743813175322411e-05, "loss": 0.1475, "step": 3770 }, { "epoch": 1.317532241199024, "grad_norm": 23.77858543395996, "learning_rate": 1.4729871035203904e-05, "loss": 0.1701, "step": 3780 }, { "epoch": 1.321017776228651, "grad_norm": 6.024458408355713, "learning_rate": 1.4715928895085396e-05, "loss": 0.245, "step": 3790 }, { "epoch": 1.3245033112582782, "grad_norm": 26.570066452026367, "learning_rate": 1.4701986754966889e-05, "loss": 0.2245, "step": 3800 }, { "epoch": 1.3279888462879053, "grad_norm": 0.06447375565767288, "learning_rate": 1.468804461484838e-05, "loss": 0.101, "step": 3810 }, { "epoch": 1.3314743813175323, "grad_norm": 11.744895935058594, "learning_rate": 1.4674102474729872e-05, "loss": 0.2009, "step": 3820 }, { "epoch": 1.3349599163471593, "grad_norm": 0.07097235321998596, "learning_rate": 1.4660160334611365e-05, "loss": 0.0224, "step": 3830 }, { "epoch": 1.3384454513767863, "grad_norm": 0.11342521011829376, "learning_rate": 1.4646218194492856e-05, "loss": 0.0885, "step": 3840 }, { "epoch": 1.3419309864064135, "grad_norm": 0.06430939584970474, "learning_rate": 1.4632276054374348e-05, "loss": 0.2388, "step": 3850 }, { "epoch": 1.3454165214360405, "grad_norm": 3.096365451812744, "learning_rate": 1.461833391425584e-05, "loss": 0.1814, "step": 3860 }, { "epoch": 1.3489020564656675, "grad_norm": 0.2073626071214676, "learning_rate": 1.4604391774137331e-05, "loss": 0.1304, "step": 3870 }, { "epoch": 1.3523875914952945, "grad_norm": 0.8284794688224792, "learning_rate": 1.4590449634018824e-05, "loss": 0.2057, "step": 3880 }, { "epoch": 1.3558731265249215, "grad_norm": 23.129026412963867, "learning_rate": 1.4576507493900316e-05, "loss": 0.175, "step": 3890 }, { "epoch": 1.3593586615545485, "grad_norm": 0.11513429880142212, "learning_rate": 1.4562565353781806e-05, "loss": 0.2399, "step": 3900 }, { "epoch": 1.3628441965841755, "grad_norm": 5.663066864013672, "learning_rate": 1.4548623213663298e-05, "loss": 0.1074, "step": 3910 }, { "epoch": 1.3663297316138028, "grad_norm": 21.3768310546875, "learning_rate": 1.4534681073544789e-05, "loss": 0.1827, "step": 3920 }, { "epoch": 1.3698152666434298, "grad_norm": 9.777505874633789, "learning_rate": 1.4520738933426281e-05, "loss": 0.2197, "step": 3930 }, { "epoch": 1.3733008016730568, "grad_norm": 0.8618378043174744, "learning_rate": 1.4506796793307774e-05, "loss": 0.2736, "step": 3940 }, { "epoch": 1.3767863367026838, "grad_norm": 23.52062225341797, "learning_rate": 1.4492854653189265e-05, "loss": 0.2393, "step": 3950 }, { "epoch": 1.3802718717323108, "grad_norm": 7.636297225952148, "learning_rate": 1.4478912513070757e-05, "loss": 0.16, "step": 3960 }, { "epoch": 1.383757406761938, "grad_norm": 3.086662530899048, "learning_rate": 1.446497037295225e-05, "loss": 0.2637, "step": 3970 }, { "epoch": 1.387242941791565, "grad_norm": 6.053905487060547, "learning_rate": 1.4451028232833742e-05, "loss": 0.1277, "step": 3980 }, { "epoch": 1.390728476821192, "grad_norm": 0.09486166387796402, "learning_rate": 1.4437086092715233e-05, "loss": 0.0652, "step": 3990 }, { "epoch": 1.394214011850819, "grad_norm": 9.314697265625, "learning_rate": 1.4423143952596726e-05, "loss": 0.194, "step": 4000 }, { "epoch": 1.397699546880446, "grad_norm": 5.727607727050781, "learning_rate": 1.4409201812478218e-05, "loss": 0.1755, "step": 4010 }, { "epoch": 1.4011850819100733, "grad_norm": 10.165273666381836, "learning_rate": 1.4395259672359709e-05, "loss": 0.3493, "step": 4020 }, { "epoch": 1.4046706169397003, "grad_norm": 0.11753875017166138, "learning_rate": 1.4381317532241201e-05, "loss": 0.1553, "step": 4030 }, { "epoch": 1.4081561519693273, "grad_norm": 0.0666135773062706, "learning_rate": 1.436737539212269e-05, "loss": 0.3826, "step": 4040 }, { "epoch": 1.4116416869989543, "grad_norm": 0.06710106134414673, "learning_rate": 1.4353433252004183e-05, "loss": 0.1867, "step": 4050 }, { "epoch": 1.4151272220285813, "grad_norm": 18.661643981933594, "learning_rate": 1.4339491111885676e-05, "loss": 0.2184, "step": 4060 }, { "epoch": 1.4186127570582086, "grad_norm": 0.0907972976565361, "learning_rate": 1.4325548971767166e-05, "loss": 0.2187, "step": 4070 }, { "epoch": 1.4220982920878356, "grad_norm": 1.6616415977478027, "learning_rate": 1.4311606831648659e-05, "loss": 0.1063, "step": 4080 }, { "epoch": 1.4255838271174626, "grad_norm": 0.11328552663326263, "learning_rate": 1.4297664691530151e-05, "loss": 0.1416, "step": 4090 }, { "epoch": 1.4290693621470896, "grad_norm": 0.36197254061698914, "learning_rate": 1.4283722551411642e-05, "loss": 0.3247, "step": 4100 }, { "epoch": 1.4325548971767166, "grad_norm": 2.550544500350952, "learning_rate": 1.4269780411293135e-05, "loss": 0.2398, "step": 4110 }, { "epoch": 1.4360404322063438, "grad_norm": 0.25212812423706055, "learning_rate": 1.4255838271174627e-05, "loss": 0.0808, "step": 4120 }, { "epoch": 1.4395259672359706, "grad_norm": 0.5252062678337097, "learning_rate": 1.4241896131056118e-05, "loss": 0.0474, "step": 4130 }, { "epoch": 1.4430115022655978, "grad_norm": 1.4656167030334473, "learning_rate": 1.422795399093761e-05, "loss": 0.2964, "step": 4140 }, { "epoch": 1.4464970372952248, "grad_norm": 0.06279879063367844, "learning_rate": 1.4214011850819103e-05, "loss": 0.2259, "step": 4150 }, { "epoch": 1.4499825723248518, "grad_norm": 0.2818046510219574, "learning_rate": 1.4200069710700594e-05, "loss": 0.2609, "step": 4160 }, { "epoch": 1.4534681073544788, "grad_norm": 3.4003350734710693, "learning_rate": 1.4186127570582085e-05, "loss": 0.1227, "step": 4170 }, { "epoch": 1.4569536423841059, "grad_norm": 0.0695640966296196, "learning_rate": 1.4172185430463577e-05, "loss": 0.0424, "step": 4180 }, { "epoch": 1.460439177413733, "grad_norm": 0.06560744345188141, "learning_rate": 1.4158243290345068e-05, "loss": 0.2265, "step": 4190 }, { "epoch": 1.46392471244336, "grad_norm": 0.46215468645095825, "learning_rate": 1.414430115022656e-05, "loss": 0.1508, "step": 4200 }, { "epoch": 1.467410247472987, "grad_norm": 0.21034209430217743, "learning_rate": 1.4130359010108053e-05, "loss": 0.1693, "step": 4210 }, { "epoch": 1.470895782502614, "grad_norm": 24.144786834716797, "learning_rate": 1.4116416869989544e-05, "loss": 0.1907, "step": 4220 }, { "epoch": 1.474381317532241, "grad_norm": 0.0716620460152626, "learning_rate": 1.4102474729871036e-05, "loss": 0.1234, "step": 4230 }, { "epoch": 1.4778668525618683, "grad_norm": 5.373108863830566, "learning_rate": 1.4088532589752529e-05, "loss": 0.0568, "step": 4240 }, { "epoch": 1.4813523875914953, "grad_norm": 0.06202000752091408, "learning_rate": 1.407459044963402e-05, "loss": 0.2224, "step": 4250 }, { "epoch": 1.4848379226211224, "grad_norm": 0.06813399493694305, "learning_rate": 1.4060648309515512e-05, "loss": 0.2171, "step": 4260 }, { "epoch": 1.4883234576507494, "grad_norm": 0.05181474983692169, "learning_rate": 1.4046706169397005e-05, "loss": 0.1021, "step": 4270 }, { "epoch": 1.4918089926803764, "grad_norm": 1.736260175704956, "learning_rate": 1.4032764029278496e-05, "loss": 0.0671, "step": 4280 }, { "epoch": 1.4952945277100036, "grad_norm": 0.14703615009784698, "learning_rate": 1.4018821889159988e-05, "loss": 0.073, "step": 4290 }, { "epoch": 1.4987800627396306, "grad_norm": 0.08682423084974289, "learning_rate": 1.400487974904148e-05, "loss": 0.1046, "step": 4300 }, { "epoch": 1.5022655977692576, "grad_norm": 0.08216214179992676, "learning_rate": 1.399093760892297e-05, "loss": 0.1728, "step": 4310 }, { "epoch": 1.5057511327988846, "grad_norm": 4.310842037200928, "learning_rate": 1.3976995468804462e-05, "loss": 0.0201, "step": 4320 }, { "epoch": 1.5092366678285116, "grad_norm": 0.6964895129203796, "learning_rate": 1.3963053328685953e-05, "loss": 0.2253, "step": 4330 }, { "epoch": 1.5127222028581389, "grad_norm": 0.06760009378194809, "learning_rate": 1.3949111188567445e-05, "loss": 0.0558, "step": 4340 }, { "epoch": 1.5162077378877656, "grad_norm": 1.6589277982711792, "learning_rate": 1.3935169048448938e-05, "loss": 0.286, "step": 4350 }, { "epoch": 1.5196932729173929, "grad_norm": 2.4069876670837402, "learning_rate": 1.3921226908330429e-05, "loss": 0.2241, "step": 4360 }, { "epoch": 1.5231788079470199, "grad_norm": 0.12570062279701233, "learning_rate": 1.3907284768211921e-05, "loss": 0.1356, "step": 4370 }, { "epoch": 1.5266643429766469, "grad_norm": 0.06179194152355194, "learning_rate": 1.3893342628093414e-05, "loss": 0.1527, "step": 4380 }, { "epoch": 1.5301498780062741, "grad_norm": 0.05693870410323143, "learning_rate": 1.3879400487974906e-05, "loss": 0.1209, "step": 4390 }, { "epoch": 1.533635413035901, "grad_norm": 9.671772003173828, "learning_rate": 1.3865458347856397e-05, "loss": 0.1119, "step": 4400 }, { "epoch": 1.5371209480655281, "grad_norm": 0.3705396354198456, "learning_rate": 1.385151620773789e-05, "loss": 0.1369, "step": 4410 }, { "epoch": 1.5406064830951551, "grad_norm": 0.1401708871126175, "learning_rate": 1.3837574067619382e-05, "loss": 0.1132, "step": 4420 }, { "epoch": 1.5440920181247821, "grad_norm": 2.265974521636963, "learning_rate": 1.3823631927500873e-05, "loss": 0.3808, "step": 4430 }, { "epoch": 1.5475775531544091, "grad_norm": 3.8402411937713623, "learning_rate": 1.3809689787382366e-05, "loss": 0.2986, "step": 4440 }, { "epoch": 1.5510630881840362, "grad_norm": 1.6255053281784058, "learning_rate": 1.3795747647263855e-05, "loss": 0.042, "step": 4450 }, { "epoch": 1.5545486232136634, "grad_norm": 0.05299604684114456, "learning_rate": 1.3781805507145347e-05, "loss": 0.0497, "step": 4460 }, { "epoch": 1.5580341582432904, "grad_norm": 0.05339618772268295, "learning_rate": 1.376786336702684e-05, "loss": 0.1915, "step": 4470 }, { "epoch": 1.5615196932729174, "grad_norm": 30.932308197021484, "learning_rate": 1.375392122690833e-05, "loss": 0.1334, "step": 4480 }, { "epoch": 1.5650052283025444, "grad_norm": 18.293027877807617, "learning_rate": 1.3739979086789823e-05, "loss": 0.2085, "step": 4490 }, { "epoch": 1.5684907633321714, "grad_norm": 5.334358215332031, "learning_rate": 1.3726036946671315e-05, "loss": 0.1781, "step": 4500 }, { "epoch": 1.5719762983617986, "grad_norm": 0.09247801452875137, "learning_rate": 1.3712094806552806e-05, "loss": 0.1584, "step": 4510 }, { "epoch": 1.5754618333914254, "grad_norm": 0.10990609228610992, "learning_rate": 1.3698152666434299e-05, "loss": 0.0539, "step": 4520 }, { "epoch": 1.5789473684210527, "grad_norm": 0.12785302102565765, "learning_rate": 1.3684210526315791e-05, "loss": 0.2398, "step": 4530 }, { "epoch": 1.5824329034506797, "grad_norm": 0.24163594841957092, "learning_rate": 1.3670268386197282e-05, "loss": 0.227, "step": 4540 }, { "epoch": 1.5859184384803067, "grad_norm": 0.06425183266401291, "learning_rate": 1.3656326246078775e-05, "loss": 0.0495, "step": 4550 }, { "epoch": 1.589403973509934, "grad_norm": 15.79089641571045, "learning_rate": 1.3642384105960267e-05, "loss": 0.1335, "step": 4560 }, { "epoch": 1.5928895085395607, "grad_norm": 8.039728164672852, "learning_rate": 1.362844196584176e-05, "loss": 0.1999, "step": 4570 }, { "epoch": 1.596375043569188, "grad_norm": 0.10453764349222183, "learning_rate": 1.3614499825723249e-05, "loss": 0.0428, "step": 4580 }, { "epoch": 1.599860578598815, "grad_norm": 0.09684242308139801, "learning_rate": 1.3600557685604741e-05, "loss": 0.0348, "step": 4590 }, { "epoch": 1.603346113628442, "grad_norm": 2.0049784183502197, "learning_rate": 1.3586615545486232e-05, "loss": 0.0227, "step": 4600 }, { "epoch": 1.6068316486580692, "grad_norm": 0.05621238425374031, "learning_rate": 1.3572673405367725e-05, "loss": 0.2134, "step": 4610 }, { "epoch": 1.610317183687696, "grad_norm": 24.956762313842773, "learning_rate": 1.3558731265249217e-05, "loss": 0.0569, "step": 4620 }, { "epoch": 1.6138027187173232, "grad_norm": 0.04639355093240738, "learning_rate": 1.3544789125130708e-05, "loss": 0.1955, "step": 4630 }, { "epoch": 1.6172882537469502, "grad_norm": 21.884952545166016, "learning_rate": 1.35308469850122e-05, "loss": 0.2995, "step": 4640 }, { "epoch": 1.6207737887765772, "grad_norm": 0.07225365191698074, "learning_rate": 1.3516904844893693e-05, "loss": 0.1419, "step": 4650 }, { "epoch": 1.6242593238062044, "grad_norm": 0.06399868428707123, "learning_rate": 1.3502962704775184e-05, "loss": 0.0805, "step": 4660 }, { "epoch": 1.6277448588358312, "grad_norm": 0.06368258595466614, "learning_rate": 1.3489020564656676e-05, "loss": 0.3106, "step": 4670 }, { "epoch": 1.6312303938654584, "grad_norm": 0.08223626762628555, "learning_rate": 1.3475078424538169e-05, "loss": 0.1631, "step": 4680 }, { "epoch": 1.6347159288950854, "grad_norm": 6.698259353637695, "learning_rate": 1.346113628441966e-05, "loss": 0.2371, "step": 4690 }, { "epoch": 1.6382014639247124, "grad_norm": 0.10331364721059799, "learning_rate": 1.3447194144301152e-05, "loss": 0.1428, "step": 4700 }, { "epoch": 1.6416869989543394, "grad_norm": 0.18476131558418274, "learning_rate": 1.3433252004182645e-05, "loss": 0.1106, "step": 4710 }, { "epoch": 1.6451725339839665, "grad_norm": 5.083899021148682, "learning_rate": 1.3419309864064134e-05, "loss": 0.1615, "step": 4720 }, { "epoch": 1.6486580690135937, "grad_norm": 0.05285824462771416, "learning_rate": 1.3405367723945626e-05, "loss": 0.368, "step": 4730 }, { "epoch": 1.6521436040432205, "grad_norm": 4.834890365600586, "learning_rate": 1.3391425583827117e-05, "loss": 0.2775, "step": 4740 }, { "epoch": 1.6556291390728477, "grad_norm": 0.3817296624183655, "learning_rate": 1.337748344370861e-05, "loss": 0.0175, "step": 4750 }, { "epoch": 1.6591146741024747, "grad_norm": 21.72384262084961, "learning_rate": 1.3363541303590102e-05, "loss": 0.181, "step": 4760 }, { "epoch": 1.6626002091321017, "grad_norm": 6.993847846984863, "learning_rate": 1.3349599163471595e-05, "loss": 0.1962, "step": 4770 }, { "epoch": 1.666085744161729, "grad_norm": 0.06561506539583206, "learning_rate": 1.3335657023353085e-05, "loss": 0.1334, "step": 4780 }, { "epoch": 1.6695712791913557, "grad_norm": 10.643206596374512, "learning_rate": 1.3321714883234578e-05, "loss": 0.183, "step": 4790 }, { "epoch": 1.673056814220983, "grad_norm": 10.353646278381348, "learning_rate": 1.330777274311607e-05, "loss": 0.2462, "step": 4800 }, { "epoch": 1.67654234925061, "grad_norm": 14.645241737365723, "learning_rate": 1.3293830602997561e-05, "loss": 0.1927, "step": 4810 }, { "epoch": 1.680027884280237, "grad_norm": 0.07404288649559021, "learning_rate": 1.3279888462879054e-05, "loss": 0.1398, "step": 4820 }, { "epoch": 1.6835134193098642, "grad_norm": 11.214003562927246, "learning_rate": 1.3265946322760546e-05, "loss": 0.1837, "step": 4830 }, { "epoch": 1.686998954339491, "grad_norm": 0.11196375638246536, "learning_rate": 1.3252004182642037e-05, "loss": 0.1121, "step": 4840 }, { "epoch": 1.6904844893691182, "grad_norm": 9.329215049743652, "learning_rate": 1.3238062042523528e-05, "loss": 0.1331, "step": 4850 }, { "epoch": 1.6939700243987452, "grad_norm": 0.13003626465797424, "learning_rate": 1.3224119902405019e-05, "loss": 0.1499, "step": 4860 }, { "epoch": 1.6974555594283722, "grad_norm": 0.055021703243255615, "learning_rate": 1.3210177762286511e-05, "loss": 0.1088, "step": 4870 }, { "epoch": 1.7009410944579995, "grad_norm": 0.05173968896269798, "learning_rate": 1.3196235622168004e-05, "loss": 0.0519, "step": 4880 }, { "epoch": 1.7044266294876262, "grad_norm": 1.1130493879318237, "learning_rate": 1.3182293482049495e-05, "loss": 0.215, "step": 4890 }, { "epoch": 1.7079121645172535, "grad_norm": 36.07246017456055, "learning_rate": 1.3168351341930987e-05, "loss": 0.26, "step": 4900 }, { "epoch": 1.7113976995468805, "grad_norm": 0.09109174460172653, "learning_rate": 1.315440920181248e-05, "loss": 0.018, "step": 4910 }, { "epoch": 1.7148832345765075, "grad_norm": 2.289961814880371, "learning_rate": 1.314046706169397e-05, "loss": 0.1378, "step": 4920 }, { "epoch": 1.7183687696061345, "grad_norm": 0.04753530025482178, "learning_rate": 1.3126524921575463e-05, "loss": 0.1949, "step": 4930 }, { "epoch": 1.7218543046357615, "grad_norm": 6.459181785583496, "learning_rate": 1.3112582781456955e-05, "loss": 0.288, "step": 4940 }, { "epoch": 1.7253398396653887, "grad_norm": 11.19622802734375, "learning_rate": 1.3098640641338448e-05, "loss": 0.1554, "step": 4950 }, { "epoch": 1.7288253746950157, "grad_norm": 15.069147109985352, "learning_rate": 1.3084698501219939e-05, "loss": 0.2112, "step": 4960 }, { "epoch": 1.7323109097246427, "grad_norm": 17.245620727539062, "learning_rate": 1.3070756361101431e-05, "loss": 0.2637, "step": 4970 }, { "epoch": 1.7357964447542698, "grad_norm": 25.261966705322266, "learning_rate": 1.3056814220982924e-05, "loss": 0.1142, "step": 4980 }, { "epoch": 1.7392819797838968, "grad_norm": 0.08086353540420532, "learning_rate": 1.3042872080864413e-05, "loss": 0.2082, "step": 4990 }, { "epoch": 1.742767514813524, "grad_norm": 33.71073532104492, "learning_rate": 1.3028929940745905e-05, "loss": 0.0617, "step": 5000 }, { "epoch": 1.7462530498431508, "grad_norm": 8.686712265014648, "learning_rate": 1.3014987800627396e-05, "loss": 0.1367, "step": 5010 }, { "epoch": 1.749738584872778, "grad_norm": 2.0827035903930664, "learning_rate": 1.3001045660508889e-05, "loss": 0.1099, "step": 5020 }, { "epoch": 1.753224119902405, "grad_norm": 7.724724769592285, "learning_rate": 1.2987103520390381e-05, "loss": 0.132, "step": 5030 }, { "epoch": 1.756709654932032, "grad_norm": 7.940341949462891, "learning_rate": 1.2973161380271872e-05, "loss": 0.3053, "step": 5040 }, { "epoch": 1.7601951899616592, "grad_norm": 0.10044675320386887, "learning_rate": 1.2959219240153364e-05, "loss": 0.1138, "step": 5050 }, { "epoch": 1.763680724991286, "grad_norm": 0.045176051557064056, "learning_rate": 1.2945277100034857e-05, "loss": 0.212, "step": 5060 }, { "epoch": 1.7671662600209133, "grad_norm": 3.353107452392578, "learning_rate": 1.2931334959916348e-05, "loss": 0.2066, "step": 5070 }, { "epoch": 1.7706517950505403, "grad_norm": 0.062912218272686, "learning_rate": 1.291739281979784e-05, "loss": 0.2011, "step": 5080 }, { "epoch": 1.7741373300801673, "grad_norm": 7.203383922576904, "learning_rate": 1.2903450679679333e-05, "loss": 0.4203, "step": 5090 }, { "epoch": 1.7776228651097945, "grad_norm": 0.4804973602294922, "learning_rate": 1.2889508539560824e-05, "loss": 0.2008, "step": 5100 }, { "epoch": 1.7811084001394213, "grad_norm": 0.10035104304552078, "learning_rate": 1.2875566399442316e-05, "loss": 0.1394, "step": 5110 }, { "epoch": 1.7845939351690485, "grad_norm": 18.202369689941406, "learning_rate": 1.2861624259323809e-05, "loss": 0.0973, "step": 5120 }, { "epoch": 1.7880794701986755, "grad_norm": 27.99166488647461, "learning_rate": 1.2847682119205298e-05, "loss": 0.3659, "step": 5130 }, { "epoch": 1.7915650052283025, "grad_norm": 5.776055812835693, "learning_rate": 1.283373997908679e-05, "loss": 0.0682, "step": 5140 }, { "epoch": 1.7950505402579295, "grad_norm": 6.58804178237915, "learning_rate": 1.2819797838968283e-05, "loss": 0.0869, "step": 5150 }, { "epoch": 1.7985360752875565, "grad_norm": 2.2123751640319824, "learning_rate": 1.2805855698849774e-05, "loss": 0.1033, "step": 5160 }, { "epoch": 1.8020216103171838, "grad_norm": 5.483887195587158, "learning_rate": 1.2791913558731266e-05, "loss": 0.21, "step": 5170 }, { "epoch": 1.8055071453468108, "grad_norm": 35.22694778442383, "learning_rate": 1.2777971418612759e-05, "loss": 0.0469, "step": 5180 }, { "epoch": 1.8089926803764378, "grad_norm": 16.363428115844727, "learning_rate": 1.276402927849425e-05, "loss": 0.2203, "step": 5190 }, { "epoch": 1.8124782154060648, "grad_norm": 3.4047915935516357, "learning_rate": 1.2750087138375742e-05, "loss": 0.0254, "step": 5200 }, { "epoch": 1.8159637504356918, "grad_norm": 0.04784770309925079, "learning_rate": 1.2736144998257234e-05, "loss": 0.1671, "step": 5210 }, { "epoch": 1.819449285465319, "grad_norm": 0.18466421961784363, "learning_rate": 1.2722202858138725e-05, "loss": 0.2333, "step": 5220 }, { "epoch": 1.8229348204949458, "grad_norm": 0.04754915460944176, "learning_rate": 1.2708260718020218e-05, "loss": 0.109, "step": 5230 }, { "epoch": 1.826420355524573, "grad_norm": 12.06218147277832, "learning_rate": 1.269431857790171e-05, "loss": 0.1115, "step": 5240 }, { "epoch": 1.8299058905542, "grad_norm": 12.35152816772461, "learning_rate": 1.2680376437783201e-05, "loss": 0.1294, "step": 5250 }, { "epoch": 1.833391425583827, "grad_norm": 0.5232785940170288, "learning_rate": 1.2666434297664692e-05, "loss": 0.1716, "step": 5260 }, { "epoch": 1.8368769606134543, "grad_norm": 0.065973199903965, "learning_rate": 1.2652492157546183e-05, "loss": 0.2806, "step": 5270 }, { "epoch": 1.840362495643081, "grad_norm": 0.05121095851063728, "learning_rate": 1.2638550017427675e-05, "loss": 0.3057, "step": 5280 }, { "epoch": 1.8438480306727083, "grad_norm": 0.04605868458747864, "learning_rate": 1.2624607877309168e-05, "loss": 0.182, "step": 5290 }, { "epoch": 1.8473335657023353, "grad_norm": 3.7501471042633057, "learning_rate": 1.2610665737190659e-05, "loss": 0.0176, "step": 5300 }, { "epoch": 1.8508191007319623, "grad_norm": 21.904052734375, "learning_rate": 1.2596723597072151e-05, "loss": 0.2681, "step": 5310 }, { "epoch": 1.8543046357615895, "grad_norm": 4.048603534698486, "learning_rate": 1.2582781456953644e-05, "loss": 0.1016, "step": 5320 }, { "epoch": 1.8577901707912163, "grad_norm": 17.176145553588867, "learning_rate": 1.2568839316835134e-05, "loss": 0.344, "step": 5330 }, { "epoch": 1.8612757058208436, "grad_norm": 0.05373512953519821, "learning_rate": 1.2554897176716627e-05, "loss": 0.0364, "step": 5340 }, { "epoch": 1.8647612408504706, "grad_norm": 0.4178902506828308, "learning_rate": 1.254095503659812e-05, "loss": 0.2752, "step": 5350 }, { "epoch": 1.8682467758800976, "grad_norm": 0.11804527789354324, "learning_rate": 1.2527012896479612e-05, "loss": 0.2173, "step": 5360 }, { "epoch": 1.8717323109097248, "grad_norm": 17.27764892578125, "learning_rate": 1.2513070756361103e-05, "loss": 0.1362, "step": 5370 }, { "epoch": 1.8752178459393516, "grad_norm": 20.017900466918945, "learning_rate": 1.2499128616242595e-05, "loss": 0.1478, "step": 5380 }, { "epoch": 1.8787033809689788, "grad_norm": 34.95002746582031, "learning_rate": 1.2485186476124088e-05, "loss": 0.1725, "step": 5390 }, { "epoch": 1.8821889159986058, "grad_norm": 0.23638759553432465, "learning_rate": 1.2471244336005577e-05, "loss": 0.0903, "step": 5400 }, { "epoch": 1.8856744510282328, "grad_norm": 8.639537811279297, "learning_rate": 1.245730219588707e-05, "loss": 0.1225, "step": 5410 }, { "epoch": 1.8891599860578598, "grad_norm": 0.10233496874570847, "learning_rate": 1.244336005576856e-05, "loss": 0.2227, "step": 5420 }, { "epoch": 1.8926455210874868, "grad_norm": 4.306946754455566, "learning_rate": 1.2429417915650053e-05, "loss": 0.1647, "step": 5430 }, { "epoch": 1.896131056117114, "grad_norm": 18.103878021240234, "learning_rate": 1.2415475775531545e-05, "loss": 0.231, "step": 5440 }, { "epoch": 1.8996165911467409, "grad_norm": 20.41852569580078, "learning_rate": 1.2401533635413036e-05, "loss": 0.2714, "step": 5450 }, { "epoch": 1.903102126176368, "grad_norm": 2.2409658432006836, "learning_rate": 1.2387591495294529e-05, "loss": 0.1807, "step": 5460 }, { "epoch": 1.906587661205995, "grad_norm": 0.10098310559988022, "learning_rate": 1.2373649355176021e-05, "loss": 0.2041, "step": 5470 }, { "epoch": 1.910073196235622, "grad_norm": 2.7435004711151123, "learning_rate": 1.2359707215057512e-05, "loss": 0.1135, "step": 5480 }, { "epoch": 1.9135587312652493, "grad_norm": 0.11639636754989624, "learning_rate": 1.2345765074939004e-05, "loss": 0.1849, "step": 5490 }, { "epoch": 1.9170442662948761, "grad_norm": 13.145666122436523, "learning_rate": 1.2331822934820497e-05, "loss": 0.2934, "step": 5500 }, { "epoch": 1.9205298013245033, "grad_norm": 8.368687629699707, "learning_rate": 1.2317880794701988e-05, "loss": 0.1166, "step": 5510 }, { "epoch": 1.9240153363541304, "grad_norm": 0.16210561990737915, "learning_rate": 1.230393865458348e-05, "loss": 0.133, "step": 5520 }, { "epoch": 1.9275008713837574, "grad_norm": 0.6329889893531799, "learning_rate": 1.2289996514464973e-05, "loss": 0.1613, "step": 5530 }, { "epoch": 1.9309864064133846, "grad_norm": 4.6153693199157715, "learning_rate": 1.2276054374346462e-05, "loss": 0.09, "step": 5540 }, { "epoch": 1.9344719414430114, "grad_norm": 0.1312878131866455, "learning_rate": 1.2262112234227954e-05, "loss": 0.1287, "step": 5550 }, { "epoch": 1.9379574764726386, "grad_norm": 0.05980315059423447, "learning_rate": 1.2248170094109447e-05, "loss": 0.0959, "step": 5560 }, { "epoch": 1.9414430115022656, "grad_norm": 19.431610107421875, "learning_rate": 1.2234227953990938e-05, "loss": 0.1181, "step": 5570 }, { "epoch": 1.9449285465318926, "grad_norm": 0.44517797231674194, "learning_rate": 1.222028581387243e-05, "loss": 0.0802, "step": 5580 }, { "epoch": 1.9484140815615199, "grad_norm": 0.04525260254740715, "learning_rate": 1.2206343673753923e-05, "loss": 0.2204, "step": 5590 }, { "epoch": 1.9518996165911466, "grad_norm": 0.1254422813653946, "learning_rate": 1.2192401533635414e-05, "loss": 0.0652, "step": 5600 }, { "epoch": 1.9553851516207739, "grad_norm": 1.6299126148223877, "learning_rate": 1.2178459393516906e-05, "loss": 0.2201, "step": 5610 }, { "epoch": 1.9588706866504009, "grad_norm": 27.747234344482422, "learning_rate": 1.2164517253398399e-05, "loss": 0.0627, "step": 5620 }, { "epoch": 1.9623562216800279, "grad_norm": 2.0174477100372314, "learning_rate": 1.215057511327989e-05, "loss": 0.2007, "step": 5630 }, { "epoch": 1.9658417567096549, "grad_norm": 7.153161525726318, "learning_rate": 1.2136632973161382e-05, "loss": 0.2419, "step": 5640 }, { "epoch": 1.969327291739282, "grad_norm": 0.46940529346466064, "learning_rate": 1.2122690833042874e-05, "loss": 0.0573, "step": 5650 }, { "epoch": 1.9728128267689091, "grad_norm": 4.334830284118652, "learning_rate": 1.2108748692924365e-05, "loss": 0.2491, "step": 5660 }, { "epoch": 1.9762983617985361, "grad_norm": 0.04316120222210884, "learning_rate": 1.2094806552805856e-05, "loss": 0.1859, "step": 5670 }, { "epoch": 1.9797838968281631, "grad_norm": 0.04128291830420494, "learning_rate": 1.2080864412687347e-05, "loss": 0.1061, "step": 5680 }, { "epoch": 1.9832694318577901, "grad_norm": 0.06560923904180527, "learning_rate": 1.206692227256884e-05, "loss": 0.054, "step": 5690 }, { "epoch": 1.9867549668874172, "grad_norm": 0.05319322645664215, "learning_rate": 1.2052980132450332e-05, "loss": 0.1104, "step": 5700 }, { "epoch": 1.9902405019170444, "grad_norm": 0.04641329124569893, "learning_rate": 1.2039037992331823e-05, "loss": 0.184, "step": 5710 }, { "epoch": 1.9937260369466712, "grad_norm": 6.287529468536377, "learning_rate": 1.2025095852213315e-05, "loss": 0.0912, "step": 5720 }, { "epoch": 1.9972115719762984, "grad_norm": 0.6517956852912903, "learning_rate": 1.2011153712094808e-05, "loss": 0.1706, "step": 5730 }, { "epoch": 2.0, "eval_accuracy": 0.9795061728395061, "eval_loss": 0.08464130759239197, "eval_runtime": 18.9519, "eval_samples_per_second": 213.699, "eval_steps_per_second": 26.752, "step": 5738 }, { "epoch": 2.0006971070059256, "grad_norm": 9.767017364501953, "learning_rate": 1.19972115719763e-05, "loss": 0.0665, "step": 5740 }, { "epoch": 2.0041826420355524, "grad_norm": 9.686302185058594, "learning_rate": 1.1983269431857791e-05, "loss": 0.3084, "step": 5750 }, { "epoch": 2.0076681770651796, "grad_norm": 34.309242248535156, "learning_rate": 1.1969327291739283e-05, "loss": 0.1616, "step": 5760 }, { "epoch": 2.0111537120948064, "grad_norm": 0.03740281984210014, "learning_rate": 1.1955385151620776e-05, "loss": 0.0595, "step": 5770 }, { "epoch": 2.0146392471244337, "grad_norm": 10.491089820861816, "learning_rate": 1.1941443011502267e-05, "loss": 0.2387, "step": 5780 }, { "epoch": 2.0181247821540604, "grad_norm": 3.4705872535705566, "learning_rate": 1.192750087138376e-05, "loss": 0.0817, "step": 5790 }, { "epoch": 2.0216103171836877, "grad_norm": 0.04854891821742058, "learning_rate": 1.1913558731265252e-05, "loss": 0.1739, "step": 5800 }, { "epoch": 2.025095852213315, "grad_norm": 3.5387609004974365, "learning_rate": 1.1899616591146741e-05, "loss": 0.3629, "step": 5810 }, { "epoch": 2.0285813872429417, "grad_norm": 0.45299866795539856, "learning_rate": 1.1885674451028233e-05, "loss": 0.2141, "step": 5820 }, { "epoch": 2.032066922272569, "grad_norm": 0.03833978250622749, "learning_rate": 1.1871732310909724e-05, "loss": 0.0408, "step": 5830 }, { "epoch": 2.0355524573021957, "grad_norm": 9.347251892089844, "learning_rate": 1.1857790170791217e-05, "loss": 0.2311, "step": 5840 }, { "epoch": 2.039037992331823, "grad_norm": 0.11472854763269424, "learning_rate": 1.184384803067271e-05, "loss": 0.0753, "step": 5850 }, { "epoch": 2.04252352736145, "grad_norm": 0.03732588514685631, "learning_rate": 1.18299058905542e-05, "loss": 0.0874, "step": 5860 }, { "epoch": 2.046009062391077, "grad_norm": 9.975323677062988, "learning_rate": 1.1815963750435693e-05, "loss": 0.1819, "step": 5870 }, { "epoch": 2.049494597420704, "grad_norm": 0.06995538622140884, "learning_rate": 1.1802021610317185e-05, "loss": 0.1342, "step": 5880 }, { "epoch": 2.052980132450331, "grad_norm": 12.45875072479248, "learning_rate": 1.1788079470198676e-05, "loss": 0.0675, "step": 5890 }, { "epoch": 2.056465667479958, "grad_norm": 15.665087699890137, "learning_rate": 1.1774137330080168e-05, "loss": 0.3064, "step": 5900 }, { "epoch": 2.0599512025095854, "grad_norm": 6.563640594482422, "learning_rate": 1.1760195189961661e-05, "loss": 0.115, "step": 5910 }, { "epoch": 2.063436737539212, "grad_norm": 0.057756856083869934, "learning_rate": 1.1746253049843153e-05, "loss": 0.1534, "step": 5920 }, { "epoch": 2.0669222725688394, "grad_norm": 1.6861991882324219, "learning_rate": 1.1732310909724644e-05, "loss": 0.0366, "step": 5930 }, { "epoch": 2.070407807598466, "grad_norm": 7.513923645019531, "learning_rate": 1.1718368769606137e-05, "loss": 0.3493, "step": 5940 }, { "epoch": 2.0738933426280934, "grad_norm": 0.05242444574832916, "learning_rate": 1.1704426629487626e-05, "loss": 0.117, "step": 5950 }, { "epoch": 2.0773788776577207, "grad_norm": 8.048820495605469, "learning_rate": 1.1690484489369118e-05, "loss": 0.1696, "step": 5960 }, { "epoch": 2.0808644126873475, "grad_norm": 6.724079608917236, "learning_rate": 1.1676542349250611e-05, "loss": 0.3178, "step": 5970 }, { "epoch": 2.0843499477169747, "grad_norm": 20.328981399536133, "learning_rate": 1.1662600209132102e-05, "loss": 0.0367, "step": 5980 }, { "epoch": 2.0878354827466015, "grad_norm": 0.051835183054208755, "learning_rate": 1.1648658069013594e-05, "loss": 0.1535, "step": 5990 }, { "epoch": 2.0913210177762287, "grad_norm": 1.784183382987976, "learning_rate": 1.1634715928895087e-05, "loss": 0.0967, "step": 6000 }, { "epoch": 2.0948065528058555, "grad_norm": 0.08854708820581436, "learning_rate": 1.1620773788776578e-05, "loss": 0.2557, "step": 6010 }, { "epoch": 2.0982920878354827, "grad_norm": 0.035098157823085785, "learning_rate": 1.160683164865807e-05, "loss": 0.0805, "step": 6020 }, { "epoch": 2.10177762286511, "grad_norm": 2.4362149238586426, "learning_rate": 1.1592889508539563e-05, "loss": 0.1533, "step": 6030 }, { "epoch": 2.1052631578947367, "grad_norm": 0.06450653076171875, "learning_rate": 1.1578947368421053e-05, "loss": 0.0535, "step": 6040 }, { "epoch": 2.108748692924364, "grad_norm": 0.1979568749666214, "learning_rate": 1.1565005228302546e-05, "loss": 0.1879, "step": 6050 }, { "epoch": 2.1122342279539907, "grad_norm": 0.039692219346761703, "learning_rate": 1.1551063088184038e-05, "loss": 0.2937, "step": 6060 }, { "epoch": 2.115719762983618, "grad_norm": 0.07281683385372162, "learning_rate": 1.153712094806553e-05, "loss": 0.2128, "step": 6070 }, { "epoch": 2.119205298013245, "grad_norm": 0.1039639413356781, "learning_rate": 1.152317880794702e-05, "loss": 0.0335, "step": 6080 }, { "epoch": 2.122690833042872, "grad_norm": 6.56930685043335, "learning_rate": 1.1509236667828511e-05, "loss": 0.1515, "step": 6090 }, { "epoch": 2.126176368072499, "grad_norm": 0.03588613122701645, "learning_rate": 1.1495294527710003e-05, "loss": 0.2306, "step": 6100 }, { "epoch": 2.129661903102126, "grad_norm": 11.106224060058594, "learning_rate": 1.1481352387591496e-05, "loss": 0.1734, "step": 6110 }, { "epoch": 2.1331474381317532, "grad_norm": 26.07666778564453, "learning_rate": 1.1467410247472988e-05, "loss": 0.2769, "step": 6120 }, { "epoch": 2.1366329731613805, "grad_norm": 10.249650955200195, "learning_rate": 1.145346810735448e-05, "loss": 0.1453, "step": 6130 }, { "epoch": 2.1401185081910072, "grad_norm": 2.9616682529449463, "learning_rate": 1.1439525967235972e-05, "loss": 0.1438, "step": 6140 }, { "epoch": 2.1436040432206345, "grad_norm": 0.1858883649110794, "learning_rate": 1.1425583827117464e-05, "loss": 0.0802, "step": 6150 }, { "epoch": 2.1470895782502613, "grad_norm": 0.5032067894935608, "learning_rate": 1.1411641686998955e-05, "loss": 0.1924, "step": 6160 }, { "epoch": 2.1505751132798885, "grad_norm": 0.04852207750082016, "learning_rate": 1.1397699546880448e-05, "loss": 0.0768, "step": 6170 }, { "epoch": 2.1540606483095157, "grad_norm": 0.9909989833831787, "learning_rate": 1.138375740676194e-05, "loss": 0.0138, "step": 6180 }, { "epoch": 2.1575461833391425, "grad_norm": 0.036640316247940063, "learning_rate": 1.1369815266643431e-05, "loss": 0.0621, "step": 6190 }, { "epoch": 2.1610317183687697, "grad_norm": 11.941831588745117, "learning_rate": 1.1355873126524923e-05, "loss": 0.1628, "step": 6200 }, { "epoch": 2.1645172533983965, "grad_norm": 0.07637592405080795, "learning_rate": 1.1341930986406416e-05, "loss": 0.1821, "step": 6210 }, { "epoch": 2.1680027884280237, "grad_norm": 0.07637934386730194, "learning_rate": 1.1327988846287905e-05, "loss": 0.1699, "step": 6220 }, { "epoch": 2.171488323457651, "grad_norm": 0.0320703499019146, "learning_rate": 1.1314046706169398e-05, "loss": 0.136, "step": 6230 }, { "epoch": 2.1749738584872778, "grad_norm": 0.06301452219486237, "learning_rate": 1.1300104566050888e-05, "loss": 0.1863, "step": 6240 }, { "epoch": 2.178459393516905, "grad_norm": 0.03842555731534958, "learning_rate": 1.128616242593238e-05, "loss": 0.0453, "step": 6250 }, { "epoch": 2.1819449285465318, "grad_norm": 0.03716614842414856, "learning_rate": 1.1272220285813873e-05, "loss": 0.1863, "step": 6260 }, { "epoch": 2.185430463576159, "grad_norm": 0.043157655745744705, "learning_rate": 1.1258278145695364e-05, "loss": 0.2721, "step": 6270 }, { "epoch": 2.188915998605786, "grad_norm": 0.07806668430566788, "learning_rate": 1.1244336005576857e-05, "loss": 0.1212, "step": 6280 }, { "epoch": 2.192401533635413, "grad_norm": 0.025718241930007935, "learning_rate": 1.123039386545835e-05, "loss": 0.0891, "step": 6290 }, { "epoch": 2.1958870686650402, "grad_norm": 1.4488409757614136, "learning_rate": 1.121645172533984e-05, "loss": 0.1036, "step": 6300 }, { "epoch": 2.199372603694667, "grad_norm": 0.10433078557252884, "learning_rate": 1.1202509585221333e-05, "loss": 0.2275, "step": 6310 }, { "epoch": 2.2028581387242943, "grad_norm": 0.049195412546396255, "learning_rate": 1.1188567445102825e-05, "loss": 0.01, "step": 6320 }, { "epoch": 2.206343673753921, "grad_norm": 1.253458857536316, "learning_rate": 1.1174625304984318e-05, "loss": 0.0472, "step": 6330 }, { "epoch": 2.2098292087835483, "grad_norm": 0.04194582253694534, "learning_rate": 1.1160683164865808e-05, "loss": 0.2199, "step": 6340 }, { "epoch": 2.2133147438131755, "grad_norm": 0.22297650575637817, "learning_rate": 1.11467410247473e-05, "loss": 0.0768, "step": 6350 }, { "epoch": 2.2168002788428023, "grad_norm": 0.03070775978267193, "learning_rate": 1.113279888462879e-05, "loss": 0.2436, "step": 6360 }, { "epoch": 2.2202858138724295, "grad_norm": 0.04577281326055527, "learning_rate": 1.1118856744510282e-05, "loss": 0.0474, "step": 6370 }, { "epoch": 2.2237713489020563, "grad_norm": 0.07843345403671265, "learning_rate": 1.1104914604391775e-05, "loss": 0.0127, "step": 6380 }, { "epoch": 2.2272568839316835, "grad_norm": 0.08817867934703827, "learning_rate": 1.1090972464273266e-05, "loss": 0.2568, "step": 6390 }, { "epoch": 2.2307424189613108, "grad_norm": 0.16615049540996552, "learning_rate": 1.1077030324154758e-05, "loss": 0.0813, "step": 6400 }, { "epoch": 2.2342279539909375, "grad_norm": 0.0826653391122818, "learning_rate": 1.106308818403625e-05, "loss": 0.0563, "step": 6410 }, { "epoch": 2.2377134890205648, "grad_norm": 0.05890136957168579, "learning_rate": 1.1049146043917742e-05, "loss": 0.1036, "step": 6420 }, { "epoch": 2.2411990240501916, "grad_norm": 0.02518044412136078, "learning_rate": 1.1035203903799234e-05, "loss": 0.3812, "step": 6430 }, { "epoch": 2.244684559079819, "grad_norm": 10.567590713500977, "learning_rate": 1.1021261763680727e-05, "loss": 0.1296, "step": 6440 }, { "epoch": 2.2481700941094456, "grad_norm": 34.6755485534668, "learning_rate": 1.1007319623562217e-05, "loss": 0.1012, "step": 6450 }, { "epoch": 2.251655629139073, "grad_norm": 15.21702766418457, "learning_rate": 1.099337748344371e-05, "loss": 0.104, "step": 6460 }, { "epoch": 2.2551411641687, "grad_norm": 0.03763876110315323, "learning_rate": 1.0979435343325202e-05, "loss": 0.1439, "step": 6470 }, { "epoch": 2.258626699198327, "grad_norm": 1.3450989723205566, "learning_rate": 1.0965493203206693e-05, "loss": 0.1072, "step": 6480 }, { "epoch": 2.262112234227954, "grad_norm": 0.040909543633461, "learning_rate": 1.0951551063088184e-05, "loss": 0.1536, "step": 6490 }, { "epoch": 2.265597769257581, "grad_norm": 11.683106422424316, "learning_rate": 1.0937608922969677e-05, "loss": 0.1513, "step": 6500 }, { "epoch": 2.269083304287208, "grad_norm": 17.460403442382812, "learning_rate": 1.0923666782851167e-05, "loss": 0.2438, "step": 6510 }, { "epoch": 2.2725688393168353, "grad_norm": 0.03460359200835228, "learning_rate": 1.090972464273266e-05, "loss": 0.0227, "step": 6520 }, { "epoch": 2.276054374346462, "grad_norm": 0.37530404329299927, "learning_rate": 1.0895782502614152e-05, "loss": 0.3024, "step": 6530 }, { "epoch": 2.2795399093760893, "grad_norm": 2.3782732486724854, "learning_rate": 1.0881840362495643e-05, "loss": 0.0873, "step": 6540 }, { "epoch": 2.283025444405716, "grad_norm": 0.10322298854589462, "learning_rate": 1.0867898222377136e-05, "loss": 0.0503, "step": 6550 }, { "epoch": 2.2865109794353433, "grad_norm": 0.08421849459409714, "learning_rate": 1.0853956082258628e-05, "loss": 0.2299, "step": 6560 }, { "epoch": 2.2899965144649705, "grad_norm": 0.02602728269994259, "learning_rate": 1.0840013942140119e-05, "loss": 0.0616, "step": 6570 }, { "epoch": 2.2934820494945973, "grad_norm": 0.14224806427955627, "learning_rate": 1.0826071802021612e-05, "loss": 0.1129, "step": 6580 }, { "epoch": 2.2969675845242246, "grad_norm": 0.02626832202076912, "learning_rate": 1.0812129661903104e-05, "loss": 0.1778, "step": 6590 }, { "epoch": 2.3004531195538513, "grad_norm": 0.044093526899814606, "learning_rate": 1.0798187521784595e-05, "loss": 0.3261, "step": 6600 }, { "epoch": 2.3039386545834786, "grad_norm": 10.756707191467285, "learning_rate": 1.0784245381666087e-05, "loss": 0.0347, "step": 6610 }, { "epoch": 2.307424189613106, "grad_norm": 21.886564254760742, "learning_rate": 1.077030324154758e-05, "loss": 0.0578, "step": 6620 }, { "epoch": 2.3109097246427326, "grad_norm": 0.18429026007652283, "learning_rate": 1.0756361101429069e-05, "loss": 0.2541, "step": 6630 }, { "epoch": 2.31439525967236, "grad_norm": 0.07464715838432312, "learning_rate": 1.0742418961310562e-05, "loss": 0.1547, "step": 6640 }, { "epoch": 2.3178807947019866, "grad_norm": 3.714336395263672, "learning_rate": 1.0728476821192052e-05, "loss": 0.2902, "step": 6650 }, { "epoch": 2.321366329731614, "grad_norm": 0.04718979448080063, "learning_rate": 1.0714534681073545e-05, "loss": 0.1189, "step": 6660 }, { "epoch": 2.324851864761241, "grad_norm": 0.9812682867050171, "learning_rate": 1.0700592540955037e-05, "loss": 0.2747, "step": 6670 }, { "epoch": 2.328337399790868, "grad_norm": 5.4444684982299805, "learning_rate": 1.0686650400836528e-05, "loss": 0.1446, "step": 6680 }, { "epoch": 2.331822934820495, "grad_norm": 59.1495475769043, "learning_rate": 1.067270826071802e-05, "loss": 0.0812, "step": 6690 }, { "epoch": 2.335308469850122, "grad_norm": 0.24086332321166992, "learning_rate": 1.0658766120599513e-05, "loss": 0.1042, "step": 6700 }, { "epoch": 2.338794004879749, "grad_norm": 0.045006413012742996, "learning_rate": 1.0644823980481006e-05, "loss": 0.0306, "step": 6710 }, { "epoch": 2.3422795399093763, "grad_norm": 0.03548438847064972, "learning_rate": 1.0630881840362497e-05, "loss": 0.0339, "step": 6720 }, { "epoch": 2.345765074939003, "grad_norm": 0.043661557137966156, "learning_rate": 1.0616939700243989e-05, "loss": 0.1046, "step": 6730 }, { "epoch": 2.3492506099686303, "grad_norm": 0.03901302441954613, "learning_rate": 1.0602997560125482e-05, "loss": 0.1696, "step": 6740 }, { "epoch": 2.352736144998257, "grad_norm": 31.03610610961914, "learning_rate": 1.0589055420006972e-05, "loss": 0.464, "step": 6750 }, { "epoch": 2.3562216800278843, "grad_norm": 1.4271265268325806, "learning_rate": 1.0575113279888465e-05, "loss": 0.0679, "step": 6760 }, { "epoch": 2.3597072150575116, "grad_norm": 15.831421852111816, "learning_rate": 1.0561171139769954e-05, "loss": 0.1207, "step": 6770 }, { "epoch": 2.3631927500871384, "grad_norm": 0.0829152911901474, "learning_rate": 1.0547228999651447e-05, "loss": 0.1525, "step": 6780 }, { "epoch": 2.3666782851167656, "grad_norm": 0.04117709770798683, "learning_rate": 1.0533286859532939e-05, "loss": 0.1828, "step": 6790 }, { "epoch": 2.3701638201463924, "grad_norm": 0.49705639481544495, "learning_rate": 1.051934471941443e-05, "loss": 0.2104, "step": 6800 }, { "epoch": 2.3736493551760196, "grad_norm": 24.12613296508789, "learning_rate": 1.0505402579295922e-05, "loss": 0.2642, "step": 6810 }, { "epoch": 2.3771348902056464, "grad_norm": 9.280982971191406, "learning_rate": 1.0491460439177415e-05, "loss": 0.1206, "step": 6820 }, { "epoch": 2.3806204252352736, "grad_norm": 0.5508664846420288, "learning_rate": 1.0477518299058906e-05, "loss": 0.1583, "step": 6830 }, { "epoch": 2.384105960264901, "grad_norm": 9.97512149810791, "learning_rate": 1.0463576158940398e-05, "loss": 0.112, "step": 6840 }, { "epoch": 2.3875914952945276, "grad_norm": 0.03401152417063713, "learning_rate": 1.044963401882189e-05, "loss": 0.0891, "step": 6850 }, { "epoch": 2.391077030324155, "grad_norm": 9.554537773132324, "learning_rate": 1.0435691878703382e-05, "loss": 0.1256, "step": 6860 }, { "epoch": 2.3945625653537816, "grad_norm": 10.09185791015625, "learning_rate": 1.0421749738584874e-05, "loss": 0.1756, "step": 6870 }, { "epoch": 2.398048100383409, "grad_norm": 0.05580438673496246, "learning_rate": 1.0407807598466367e-05, "loss": 0.3326, "step": 6880 }, { "epoch": 2.4015336354130357, "grad_norm": 0.7927899360656738, "learning_rate": 1.0393865458347859e-05, "loss": 0.2091, "step": 6890 }, { "epoch": 2.405019170442663, "grad_norm": 4.687802791595459, "learning_rate": 1.0379923318229348e-05, "loss": 0.1468, "step": 6900 }, { "epoch": 2.40850470547229, "grad_norm": 0.03195074200630188, "learning_rate": 1.036598117811084e-05, "loss": 0.0722, "step": 6910 }, { "epoch": 2.411990240501917, "grad_norm": 7.4297590255737305, "learning_rate": 1.0352039037992331e-05, "loss": 0.1048, "step": 6920 }, { "epoch": 2.415475775531544, "grad_norm": 0.04244573786854744, "learning_rate": 1.0338096897873824e-05, "loss": 0.1695, "step": 6930 }, { "epoch": 2.418961310561171, "grad_norm": 0.28662291169166565, "learning_rate": 1.0324154757755317e-05, "loss": 0.0572, "step": 6940 }, { "epoch": 2.422446845590798, "grad_norm": 8.817971229553223, "learning_rate": 1.0310212617636807e-05, "loss": 0.169, "step": 6950 }, { "epoch": 2.4259323806204254, "grad_norm": 0.06383698433637619, "learning_rate": 1.02962704775183e-05, "loss": 0.0905, "step": 6960 }, { "epoch": 2.429417915650052, "grad_norm": 0.18662381172180176, "learning_rate": 1.0282328337399792e-05, "loss": 0.0093, "step": 6970 }, { "epoch": 2.4329034506796794, "grad_norm": 23.655628204345703, "learning_rate": 1.0268386197281283e-05, "loss": 0.1657, "step": 6980 }, { "epoch": 2.436388985709306, "grad_norm": 19.459394454956055, "learning_rate": 1.0254444057162776e-05, "loss": 0.1519, "step": 6990 }, { "epoch": 2.4398745207389334, "grad_norm": 9.555098533630371, "learning_rate": 1.0240501917044268e-05, "loss": 0.1095, "step": 7000 }, { "epoch": 2.4433600557685606, "grad_norm": 0.10387060046195984, "learning_rate": 1.0226559776925759e-05, "loss": 0.1536, "step": 7010 }, { "epoch": 2.4468455907981874, "grad_norm": 0.0778062492609024, "learning_rate": 1.0212617636807251e-05, "loss": 0.0229, "step": 7020 }, { "epoch": 2.4503311258278146, "grad_norm": 15.077468872070312, "learning_rate": 1.0198675496688744e-05, "loss": 0.2126, "step": 7030 }, { "epoch": 2.4538166608574414, "grad_norm": 0.02709888108074665, "learning_rate": 1.0184733356570233e-05, "loss": 0.0632, "step": 7040 }, { "epoch": 2.4573021958870687, "grad_norm": 0.04719749838113785, "learning_rate": 1.0170791216451726e-05, "loss": 0.2281, "step": 7050 }, { "epoch": 2.460787730916696, "grad_norm": 12.81986141204834, "learning_rate": 1.0156849076333216e-05, "loss": 0.2326, "step": 7060 }, { "epoch": 2.4642732659463227, "grad_norm": 0.12026797980070114, "learning_rate": 1.0142906936214709e-05, "loss": 0.1655, "step": 7070 }, { "epoch": 2.46775880097595, "grad_norm": 0.03786276653409004, "learning_rate": 1.0128964796096201e-05, "loss": 0.1009, "step": 7080 }, { "epoch": 2.4712443360055767, "grad_norm": 0.12191443145275116, "learning_rate": 1.0115022655977694e-05, "loss": 0.1566, "step": 7090 }, { "epoch": 2.474729871035204, "grad_norm": 3.893367290496826, "learning_rate": 1.0101080515859185e-05, "loss": 0.152, "step": 7100 }, { "epoch": 2.478215406064831, "grad_norm": 0.02769341878592968, "learning_rate": 1.0087138375740677e-05, "loss": 0.1226, "step": 7110 }, { "epoch": 2.481700941094458, "grad_norm": 14.743918418884277, "learning_rate": 1.007319623562217e-05, "loss": 0.0629, "step": 7120 }, { "epoch": 2.485186476124085, "grad_norm": 0.024167189374566078, "learning_rate": 1.005925409550366e-05, "loss": 0.1118, "step": 7130 }, { "epoch": 2.488672011153712, "grad_norm": 0.027763212099671364, "learning_rate": 1.0045311955385153e-05, "loss": 0.1814, "step": 7140 }, { "epoch": 2.492157546183339, "grad_norm": 0.03643191605806351, "learning_rate": 1.0031369815266646e-05, "loss": 0.1689, "step": 7150 }, { "epoch": 2.4956430812129664, "grad_norm": 9.347540855407715, "learning_rate": 1.0017427675148136e-05, "loss": 0.1229, "step": 7160 }, { "epoch": 2.499128616242593, "grad_norm": 0.03700919449329376, "learning_rate": 1.0003485535029629e-05, "loss": 0.1254, "step": 7170 }, { "epoch": 2.5026141512722204, "grad_norm": 0.03421923145651817, "learning_rate": 9.98954339491112e-06, "loss": 0.086, "step": 7180 }, { "epoch": 2.506099686301847, "grad_norm": 9.573734283447266, "learning_rate": 9.975601254792612e-06, "loss": 0.2747, "step": 7190 }, { "epoch": 2.5095852213314744, "grad_norm": 0.4348721504211426, "learning_rate": 9.961659114674103e-06, "loss": 0.1462, "step": 7200 }, { "epoch": 2.5130707563611017, "grad_norm": 0.0570727176964283, "learning_rate": 9.947716974555594e-06, "loss": 0.3057, "step": 7210 }, { "epoch": 2.5165562913907285, "grad_norm": 8.233874320983887, "learning_rate": 9.933774834437086e-06, "loss": 0.1181, "step": 7220 }, { "epoch": 2.5200418264203557, "grad_norm": 0.04349507763981819, "learning_rate": 9.919832694318579e-06, "loss": 0.0152, "step": 7230 }, { "epoch": 2.5235273614499825, "grad_norm": 0.12035762518644333, "learning_rate": 9.90589055420007e-06, "loss": 0.2576, "step": 7240 }, { "epoch": 2.5270128964796097, "grad_norm": 1.1068867444992065, "learning_rate": 9.891948414081562e-06, "loss": 0.3256, "step": 7250 }, { "epoch": 2.530498431509237, "grad_norm": 0.03243900090456009, "learning_rate": 9.878006273963055e-06, "loss": 0.1656, "step": 7260 }, { "epoch": 2.5339839665388637, "grad_norm": 0.02715582214295864, "learning_rate": 9.864064133844546e-06, "loss": 0.053, "step": 7270 }, { "epoch": 2.5374695015684905, "grad_norm": 1.750795602798462, "learning_rate": 9.850121993726038e-06, "loss": 0.1305, "step": 7280 }, { "epoch": 2.5409550365981177, "grad_norm": 2.8734793663024902, "learning_rate": 9.836179853607529e-06, "loss": 0.019, "step": 7290 }, { "epoch": 2.544440571627745, "grad_norm": 0.04098232463002205, "learning_rate": 9.822237713489021e-06, "loss": 0.0792, "step": 7300 }, { "epoch": 2.547926106657372, "grad_norm": 0.94338059425354, "learning_rate": 9.808295573370514e-06, "loss": 0.0443, "step": 7310 }, { "epoch": 2.551411641686999, "grad_norm": 0.20982791483402252, "learning_rate": 9.794353433252005e-06, "loss": 0.0906, "step": 7320 }, { "epoch": 2.5548971767166258, "grad_norm": 15.172703742980957, "learning_rate": 9.780411293133497e-06, "loss": 0.3467, "step": 7330 }, { "epoch": 2.558382711746253, "grad_norm": 0.26785117387771606, "learning_rate": 9.76646915301499e-06, "loss": 0.0437, "step": 7340 }, { "epoch": 2.56186824677588, "grad_norm": 12.561800003051758, "learning_rate": 9.75252701289648e-06, "loss": 0.1028, "step": 7350 }, { "epoch": 2.565353781805507, "grad_norm": 20.95243263244629, "learning_rate": 9.738584872777971e-06, "loss": 0.1875, "step": 7360 }, { "epoch": 2.5688393168351342, "grad_norm": 0.034853193908929825, "learning_rate": 9.724642732659464e-06, "loss": 0.0107, "step": 7370 }, { "epoch": 2.572324851864761, "grad_norm": 3.5008201599121094, "learning_rate": 9.710700592540956e-06, "loss": 0.1182, "step": 7380 }, { "epoch": 2.5758103868943882, "grad_norm": 0.5372085571289062, "learning_rate": 9.696758452422447e-06, "loss": 0.0679, "step": 7390 }, { "epoch": 2.5792959219240155, "grad_norm": 0.05770620331168175, "learning_rate": 9.68281631230394e-06, "loss": 0.0422, "step": 7400 }, { "epoch": 2.5827814569536423, "grad_norm": 0.0414259098470211, "learning_rate": 9.668874172185432e-06, "loss": 0.0136, "step": 7410 }, { "epoch": 2.5862669919832695, "grad_norm": 0.03852393478155136, "learning_rate": 9.654932032066923e-06, "loss": 0.2012, "step": 7420 }, { "epoch": 2.5897525270128963, "grad_norm": 13.666277885437012, "learning_rate": 9.640989891948414e-06, "loss": 0.0595, "step": 7430 }, { "epoch": 2.5932380620425235, "grad_norm": 0.4676874279975891, "learning_rate": 9.627047751829906e-06, "loss": 0.2056, "step": 7440 }, { "epoch": 2.5967235970721507, "grad_norm": 2.7470552921295166, "learning_rate": 9.613105611711399e-06, "loss": 0.0593, "step": 7450 }, { "epoch": 2.6002091321017775, "grad_norm": 0.027122966945171356, "learning_rate": 9.59916347159289e-06, "loss": 0.1028, "step": 7460 }, { "epoch": 2.6036946671314047, "grad_norm": 0.020191051065921783, "learning_rate": 9.585221331474382e-06, "loss": 0.0137, "step": 7470 }, { "epoch": 2.6071802021610315, "grad_norm": 5.530102729797363, "learning_rate": 9.571279191355873e-06, "loss": 0.0952, "step": 7480 }, { "epoch": 2.6106657371906588, "grad_norm": 30.30436897277832, "learning_rate": 9.557337051237366e-06, "loss": 0.1527, "step": 7490 }, { "epoch": 2.614151272220286, "grad_norm": 0.10788305848836899, "learning_rate": 9.543394911118858e-06, "loss": 0.1038, "step": 7500 }, { "epoch": 2.6176368072499128, "grad_norm": 4.27375602722168, "learning_rate": 9.529452771000349e-06, "loss": 0.1816, "step": 7510 }, { "epoch": 2.62112234227954, "grad_norm": 0.06542309373617172, "learning_rate": 9.515510630881841e-06, "loss": 0.1523, "step": 7520 }, { "epoch": 2.624607877309167, "grad_norm": 13.549959182739258, "learning_rate": 9.501568490763334e-06, "loss": 0.208, "step": 7530 }, { "epoch": 2.628093412338794, "grad_norm": 15.343768119812012, "learning_rate": 9.487626350644825e-06, "loss": 0.2428, "step": 7540 }, { "epoch": 2.6315789473684212, "grad_norm": 0.05019600689411163, "learning_rate": 9.473684210526315e-06, "loss": 0.249, "step": 7550 }, { "epoch": 2.635064482398048, "grad_norm": 13.942766189575195, "learning_rate": 9.459742070407808e-06, "loss": 0.215, "step": 7560 }, { "epoch": 2.6385500174276753, "grad_norm": 0.032675858587026596, "learning_rate": 9.4457999302893e-06, "loss": 0.133, "step": 7570 }, { "epoch": 2.642035552457302, "grad_norm": 0.05308655649423599, "learning_rate": 9.431857790170791e-06, "loss": 0.218, "step": 7580 }, { "epoch": 2.6455210874869293, "grad_norm": 0.03882508724927902, "learning_rate": 9.417915650052284e-06, "loss": 0.1013, "step": 7590 }, { "epoch": 2.6490066225165565, "grad_norm": 0.08975423872470856, "learning_rate": 9.403973509933776e-06, "loss": 0.3583, "step": 7600 }, { "epoch": 2.6524921575461833, "grad_norm": 5.848064422607422, "learning_rate": 9.390031369815267e-06, "loss": 0.117, "step": 7610 }, { "epoch": 2.6559776925758105, "grad_norm": 9.604795455932617, "learning_rate": 9.376089229696758e-06, "loss": 0.2501, "step": 7620 }, { "epoch": 2.6594632276054373, "grad_norm": 0.13165131211280823, "learning_rate": 9.36214708957825e-06, "loss": 0.1344, "step": 7630 }, { "epoch": 2.6629487626350645, "grad_norm": 0.2645893692970276, "learning_rate": 9.348204949459743e-06, "loss": 0.0126, "step": 7640 }, { "epoch": 2.6664342976646918, "grad_norm": 0.19416283071041107, "learning_rate": 9.334262809341234e-06, "loss": 0.0361, "step": 7650 }, { "epoch": 2.6699198326943185, "grad_norm": 0.15585561096668243, "learning_rate": 9.320320669222726e-06, "loss": 0.0768, "step": 7660 }, { "epoch": 2.6734053677239458, "grad_norm": 0.03315681591629982, "learning_rate": 9.306378529104219e-06, "loss": 0.1021, "step": 7670 }, { "epoch": 2.6768909027535726, "grad_norm": 0.26877591013908386, "learning_rate": 9.292436388985711e-06, "loss": 0.1073, "step": 7680 }, { "epoch": 2.6803764377832, "grad_norm": 0.08659540116786957, "learning_rate": 9.278494248867202e-06, "loss": 0.1891, "step": 7690 }, { "epoch": 2.683861972812827, "grad_norm": 21.811100006103516, "learning_rate": 9.264552108748693e-06, "loss": 0.2154, "step": 7700 }, { "epoch": 2.687347507842454, "grad_norm": 0.033906009048223495, "learning_rate": 9.250609968630185e-06, "loss": 0.2231, "step": 7710 }, { "epoch": 2.690833042872081, "grad_norm": 0.03380822762846947, "learning_rate": 9.236667828511678e-06, "loss": 0.2515, "step": 7720 }, { "epoch": 2.694318577901708, "grad_norm": 0.03534507378935814, "learning_rate": 9.222725688393169e-06, "loss": 0.1303, "step": 7730 }, { "epoch": 2.697804112931335, "grad_norm": 0.03269047290086746, "learning_rate": 9.208783548274661e-06, "loss": 0.0908, "step": 7740 }, { "epoch": 2.7012896479609623, "grad_norm": 5.287631988525391, "learning_rate": 9.194841408156154e-06, "loss": 0.1089, "step": 7750 }, { "epoch": 2.704775182990589, "grad_norm": 0.03315020725131035, "learning_rate": 9.180899268037645e-06, "loss": 0.0785, "step": 7760 }, { "epoch": 2.708260718020216, "grad_norm": 0.04839996621012688, "learning_rate": 9.166957127919135e-06, "loss": 0.1913, "step": 7770 }, { "epoch": 2.711746253049843, "grad_norm": 17.74064826965332, "learning_rate": 9.153014987800628e-06, "loss": 0.3599, "step": 7780 }, { "epoch": 2.7152317880794703, "grad_norm": 0.03409993648529053, "learning_rate": 9.13907284768212e-06, "loss": 0.0807, "step": 7790 }, { "epoch": 2.718717323109097, "grad_norm": 15.558609008789062, "learning_rate": 9.125130707563611e-06, "loss": 0.2329, "step": 7800 }, { "epoch": 2.7222028581387243, "grad_norm": 0.7072465419769287, "learning_rate": 9.111188567445104e-06, "loss": 0.2571, "step": 7810 }, { "epoch": 2.725688393168351, "grad_norm": 0.0745309516787529, "learning_rate": 9.097246427326596e-06, "loss": 0.072, "step": 7820 }, { "epoch": 2.7291739281979783, "grad_norm": 0.05214143171906471, "learning_rate": 9.083304287208087e-06, "loss": 0.3442, "step": 7830 }, { "epoch": 2.7326594632276056, "grad_norm": 0.5700211524963379, "learning_rate": 9.069362147089578e-06, "loss": 0.1222, "step": 7840 }, { "epoch": 2.7361449982572323, "grad_norm": 0.032385822385549545, "learning_rate": 9.05542000697107e-06, "loss": 0.0603, "step": 7850 }, { "epoch": 2.7396305332868596, "grad_norm": 1.3482292890548706, "learning_rate": 9.041477866852563e-06, "loss": 0.018, "step": 7860 }, { "epoch": 2.7431160683164864, "grad_norm": 0.031450305134058, "learning_rate": 9.027535726734055e-06, "loss": 0.2208, "step": 7870 }, { "epoch": 2.7466016033461136, "grad_norm": 17.35287857055664, "learning_rate": 9.013593586615546e-06, "loss": 0.1161, "step": 7880 }, { "epoch": 2.750087138375741, "grad_norm": 0.044247403740882874, "learning_rate": 8.999651446497037e-06, "loss": 0.1503, "step": 7890 }, { "epoch": 2.7535726734053676, "grad_norm": 0.03640054911375046, "learning_rate": 8.98570930637853e-06, "loss": 0.1018, "step": 7900 }, { "epoch": 2.757058208434995, "grad_norm": 2.4188730716705322, "learning_rate": 8.971767166260022e-06, "loss": 0.2335, "step": 7910 }, { "epoch": 2.7605437434646216, "grad_norm": 1.5793354511260986, "learning_rate": 8.957825026141513e-06, "loss": 0.0794, "step": 7920 }, { "epoch": 2.764029278494249, "grad_norm": 7.847117900848389, "learning_rate": 8.943882886023005e-06, "loss": 0.1338, "step": 7930 }, { "epoch": 2.767514813523876, "grad_norm": 0.0489552803337574, "learning_rate": 8.929940745904498e-06, "loss": 0.2494, "step": 7940 }, { "epoch": 2.771000348553503, "grad_norm": 0.023390300571918488, "learning_rate": 8.915998605785989e-06, "loss": 0.2496, "step": 7950 }, { "epoch": 2.77448588358313, "grad_norm": 0.07795246690511703, "learning_rate": 8.90205646566748e-06, "loss": 0.1616, "step": 7960 }, { "epoch": 2.777971418612757, "grad_norm": 6.65225076675415, "learning_rate": 8.888114325548972e-06, "loss": 0.2364, "step": 7970 }, { "epoch": 2.781456953642384, "grad_norm": 1.1383771896362305, "learning_rate": 8.874172185430465e-06, "loss": 0.2328, "step": 7980 }, { "epoch": 2.7849424886720113, "grad_norm": 0.02891557849943638, "learning_rate": 8.860230045311955e-06, "loss": 0.0924, "step": 7990 }, { "epoch": 2.788428023701638, "grad_norm": 0.2302275449037552, "learning_rate": 8.846287905193448e-06, "loss": 0.1521, "step": 8000 }, { "epoch": 2.7919135587312653, "grad_norm": 0.022319387644529343, "learning_rate": 8.83234576507494e-06, "loss": 0.2744, "step": 8010 }, { "epoch": 2.795399093760892, "grad_norm": 10.700783729553223, "learning_rate": 8.818403624956431e-06, "loss": 0.113, "step": 8020 }, { "epoch": 2.7988846287905194, "grad_norm": 8.70779800415039, "learning_rate": 8.804461484837922e-06, "loss": 0.0871, "step": 8030 }, { "epoch": 2.8023701638201466, "grad_norm": 21.17486000061035, "learning_rate": 8.790519344719415e-06, "loss": 0.1093, "step": 8040 }, { "epoch": 2.8058556988497734, "grad_norm": 0.03342998027801514, "learning_rate": 8.776577204600907e-06, "loss": 0.0455, "step": 8050 }, { "epoch": 2.8093412338794006, "grad_norm": 0.0786479264497757, "learning_rate": 8.7626350644824e-06, "loss": 0.1769, "step": 8060 }, { "epoch": 2.8128267689090274, "grad_norm": 0.03312867507338524, "learning_rate": 8.74869292436389e-06, "loss": 0.0294, "step": 8070 }, { "epoch": 2.8163123039386546, "grad_norm": 0.3976738154888153, "learning_rate": 8.734750784245383e-06, "loss": 0.0484, "step": 8080 }, { "epoch": 2.819797838968282, "grad_norm": 0.04097994789481163, "learning_rate": 8.720808644126875e-06, "loss": 0.2901, "step": 8090 }, { "epoch": 2.8232833739979086, "grad_norm": 0.03706745803356171, "learning_rate": 8.706866504008366e-06, "loss": 0.1388, "step": 8100 }, { "epoch": 2.826768909027536, "grad_norm": 2.765982151031494, "learning_rate": 8.692924363889857e-06, "loss": 0.2213, "step": 8110 }, { "epoch": 2.8302544440571626, "grad_norm": 0.11387185752391815, "learning_rate": 8.67898222377135e-06, "loss": 0.3626, "step": 8120 }, { "epoch": 2.83373997908679, "grad_norm": 10.69524097442627, "learning_rate": 8.665040083652842e-06, "loss": 0.1166, "step": 8130 }, { "epoch": 2.837225514116417, "grad_norm": 0.020516468212008476, "learning_rate": 8.651097943534333e-06, "loss": 0.1093, "step": 8140 }, { "epoch": 2.840711049146044, "grad_norm": 0.24759964644908905, "learning_rate": 8.637155803415825e-06, "loss": 0.2226, "step": 8150 }, { "epoch": 2.844196584175671, "grad_norm": 0.030304880812764168, "learning_rate": 8.623213663297318e-06, "loss": 0.1346, "step": 8160 }, { "epoch": 2.847682119205298, "grad_norm": 2.6233885288238525, "learning_rate": 8.609271523178809e-06, "loss": 0.175, "step": 8170 }, { "epoch": 2.851167654234925, "grad_norm": 2.439249038696289, "learning_rate": 8.5953293830603e-06, "loss": 0.0595, "step": 8180 }, { "epoch": 2.8546531892645524, "grad_norm": 0.10265187174081802, "learning_rate": 8.581387242941792e-06, "loss": 0.0761, "step": 8190 }, { "epoch": 2.858138724294179, "grad_norm": 17.636390686035156, "learning_rate": 8.567445102823285e-06, "loss": 0.2557, "step": 8200 }, { "epoch": 2.861624259323806, "grad_norm": 0.16799131035804749, "learning_rate": 8.553502962704775e-06, "loss": 0.0318, "step": 8210 }, { "epoch": 2.865109794353433, "grad_norm": 0.3623441755771637, "learning_rate": 8.539560822586268e-06, "loss": 0.0818, "step": 8220 }, { "epoch": 2.8685953293830604, "grad_norm": 6.526904106140137, "learning_rate": 8.52561868246776e-06, "loss": 0.0823, "step": 8230 }, { "epoch": 2.8720808644126876, "grad_norm": 3.7460978031158447, "learning_rate": 8.511676542349251e-06, "loss": 0.2567, "step": 8240 }, { "epoch": 2.8755663994423144, "grad_norm": 0.03773918002843857, "learning_rate": 8.497734402230744e-06, "loss": 0.0277, "step": 8250 }, { "epoch": 2.879051934471941, "grad_norm": 0.03944707289338112, "learning_rate": 8.483792262112234e-06, "loss": 0.0547, "step": 8260 }, { "epoch": 2.8825374695015684, "grad_norm": 0.16772694885730743, "learning_rate": 8.469850121993727e-06, "loss": 0.0881, "step": 8270 }, { "epoch": 2.8860230045311956, "grad_norm": 7.099332809448242, "learning_rate": 8.45590798187522e-06, "loss": 0.1931, "step": 8280 }, { "epoch": 2.8895085395608224, "grad_norm": 1.7227249145507812, "learning_rate": 8.44196584175671e-06, "loss": 0.1587, "step": 8290 }, { "epoch": 2.8929940745904497, "grad_norm": 0.22683005034923553, "learning_rate": 8.428023701638201e-06, "loss": 0.0558, "step": 8300 }, { "epoch": 2.8964796096200764, "grad_norm": 0.030454624444246292, "learning_rate": 8.414081561519694e-06, "loss": 0.1197, "step": 8310 }, { "epoch": 2.8999651446497037, "grad_norm": 0.07277271151542664, "learning_rate": 8.400139421401186e-06, "loss": 0.0851, "step": 8320 }, { "epoch": 2.903450679679331, "grad_norm": 6.2670063972473145, "learning_rate": 8.386197281282677e-06, "loss": 0.0101, "step": 8330 }, { "epoch": 2.9069362147089577, "grad_norm": 25.918701171875, "learning_rate": 8.37225514116417e-06, "loss": 0.2227, "step": 8340 }, { "epoch": 2.910421749738585, "grad_norm": 0.02563529834151268, "learning_rate": 8.358313001045662e-06, "loss": 0.2458, "step": 8350 }, { "epoch": 2.9139072847682117, "grad_norm": 0.12701044976711273, "learning_rate": 8.344370860927153e-06, "loss": 0.0929, "step": 8360 }, { "epoch": 2.917392819797839, "grad_norm": 0.0416131317615509, "learning_rate": 8.330428720808644e-06, "loss": 0.1182, "step": 8370 }, { "epoch": 2.920878354827466, "grad_norm": 20.982830047607422, "learning_rate": 8.316486580690136e-06, "loss": 0.2912, "step": 8380 }, { "epoch": 2.924363889857093, "grad_norm": 0.04384233430027962, "learning_rate": 8.302544440571629e-06, "loss": 0.0101, "step": 8390 }, { "epoch": 2.92784942488672, "grad_norm": 0.35305264592170715, "learning_rate": 8.28860230045312e-06, "loss": 0.1257, "step": 8400 }, { "epoch": 2.931334959916347, "grad_norm": 0.6124236583709717, "learning_rate": 8.274660160334612e-06, "loss": 0.0244, "step": 8410 }, { "epoch": 2.934820494945974, "grad_norm": 21.486831665039062, "learning_rate": 8.260718020216104e-06, "loss": 0.1246, "step": 8420 }, { "epoch": 2.9383060299756014, "grad_norm": 0.04249065741896629, "learning_rate": 8.246775880097595e-06, "loss": 0.1105, "step": 8430 }, { "epoch": 2.941791565005228, "grad_norm": 22.740060806274414, "learning_rate": 8.232833739979088e-06, "loss": 0.3444, "step": 8440 }, { "epoch": 2.9452771000348554, "grad_norm": 0.03325342759490013, "learning_rate": 8.218891599860579e-06, "loss": 0.0593, "step": 8450 }, { "epoch": 2.948762635064482, "grad_norm": 0.06233534216880798, "learning_rate": 8.204949459742071e-06, "loss": 0.0664, "step": 8460 }, { "epoch": 2.9522481700941094, "grad_norm": 0.13591769337654114, "learning_rate": 8.191007319623564e-06, "loss": 0.2097, "step": 8470 }, { "epoch": 2.9557337051237367, "grad_norm": 0.034143611788749695, "learning_rate": 8.177065179505054e-06, "loss": 0.0226, "step": 8480 }, { "epoch": 2.9592192401533635, "grad_norm": 0.04051746428012848, "learning_rate": 8.163123039386547e-06, "loss": 0.1239, "step": 8490 }, { "epoch": 2.9627047751829907, "grad_norm": 0.17615370452404022, "learning_rate": 8.14918089926804e-06, "loss": 0.0881, "step": 8500 }, { "epoch": 2.9661903102126175, "grad_norm": 10.80210018157959, "learning_rate": 8.13523875914953e-06, "loss": 0.1604, "step": 8510 }, { "epoch": 2.9696758452422447, "grad_norm": 0.04547272250056267, "learning_rate": 8.121296619031021e-06, "loss": 0.1647, "step": 8520 }, { "epoch": 2.973161380271872, "grad_norm": 0.026660796254873276, "learning_rate": 8.107354478912514e-06, "loss": 0.0634, "step": 8530 }, { "epoch": 2.9766469153014987, "grad_norm": 0.02724742889404297, "learning_rate": 8.093412338794006e-06, "loss": 0.0997, "step": 8540 }, { "epoch": 2.980132450331126, "grad_norm": 0.02272176183760166, "learning_rate": 8.079470198675497e-06, "loss": 0.1778, "step": 8550 }, { "epoch": 2.9836179853607527, "grad_norm": 0.02710247039794922, "learning_rate": 8.06552805855699e-06, "loss": 0.1714, "step": 8560 }, { "epoch": 2.98710352039038, "grad_norm": 3.23300838470459, "learning_rate": 8.051585918438482e-06, "loss": 0.1514, "step": 8570 }, { "epoch": 2.990589055420007, "grad_norm": 19.48651123046875, "learning_rate": 8.037643778319973e-06, "loss": 0.101, "step": 8580 }, { "epoch": 2.994074590449634, "grad_norm": 15.483784675598145, "learning_rate": 8.023701638201464e-06, "loss": 0.072, "step": 8590 }, { "epoch": 2.997560125479261, "grad_norm": 13.562616348266602, "learning_rate": 8.009759498082956e-06, "loss": 0.0941, "step": 8600 }, { "epoch": 3.0, "eval_accuracy": 0.9861728395061728, "eval_loss": 0.05898062884807587, "eval_runtime": 18.9157, "eval_samples_per_second": 214.108, "eval_steps_per_second": 26.803, "step": 8607 }, { "epoch": 3.001045660508888, "grad_norm": 0.05343186855316162, "learning_rate": 7.995817357964449e-06, "loss": 0.1422, "step": 8610 }, { "epoch": 3.004531195538515, "grad_norm": 24.563711166381836, "learning_rate": 7.98187521784594e-06, "loss": 0.0213, "step": 8620 }, { "epoch": 3.008016730568142, "grad_norm": 0.026707723736763, "learning_rate": 7.967933077727432e-06, "loss": 0.031, "step": 8630 }, { "epoch": 3.0115022655977692, "grad_norm": 0.02573820762336254, "learning_rate": 7.953990937608924e-06, "loss": 0.0848, "step": 8640 }, { "epoch": 3.0149878006273965, "grad_norm": 0.020314253866672516, "learning_rate": 7.940048797490415e-06, "loss": 0.1811, "step": 8650 }, { "epoch": 3.0184733356570232, "grad_norm": 0.0360274612903595, "learning_rate": 7.926106657371908e-06, "loss": 0.0079, "step": 8660 }, { "epoch": 3.0219588706866505, "grad_norm": 0.34295937418937683, "learning_rate": 7.912164517253399e-06, "loss": 0.141, "step": 8670 }, { "epoch": 3.0254444057162773, "grad_norm": 0.383859783411026, "learning_rate": 7.898222377134891e-06, "loss": 0.1501, "step": 8680 }, { "epoch": 3.0289299407459045, "grad_norm": 0.06587964296340942, "learning_rate": 7.884280237016384e-06, "loss": 0.095, "step": 8690 }, { "epoch": 3.0324154757755317, "grad_norm": 0.041697971522808075, "learning_rate": 7.870338096897874e-06, "loss": 0.1642, "step": 8700 }, { "epoch": 3.0359010108051585, "grad_norm": 28.092723846435547, "learning_rate": 7.856395956779365e-06, "loss": 0.1282, "step": 8710 }, { "epoch": 3.0393865458347857, "grad_norm": 1.7253819704055786, "learning_rate": 7.842453816660858e-06, "loss": 0.0581, "step": 8720 }, { "epoch": 3.0428720808644125, "grad_norm": 1.214120626449585, "learning_rate": 7.82851167654235e-06, "loss": 0.0404, "step": 8730 }, { "epoch": 3.0463576158940397, "grad_norm": 18.530441284179688, "learning_rate": 7.814569536423841e-06, "loss": 0.1388, "step": 8740 }, { "epoch": 3.049843150923667, "grad_norm": 0.030280515551567078, "learning_rate": 7.800627396305334e-06, "loss": 0.0956, "step": 8750 }, { "epoch": 3.0533286859532938, "grad_norm": 0.02037186734378338, "learning_rate": 7.786685256186826e-06, "loss": 0.126, "step": 8760 }, { "epoch": 3.056814220982921, "grad_norm": 33.261940002441406, "learning_rate": 7.772743116068317e-06, "loss": 0.1255, "step": 8770 }, { "epoch": 3.060299756012548, "grad_norm": 22.391355514526367, "learning_rate": 7.758800975949808e-06, "loss": 0.1261, "step": 8780 }, { "epoch": 3.063785291042175, "grad_norm": 0.017767876386642456, "learning_rate": 7.7448588358313e-06, "loss": 0.1211, "step": 8790 }, { "epoch": 3.0672708260718022, "grad_norm": 5.44157600402832, "learning_rate": 7.730916695712793e-06, "loss": 0.1787, "step": 8800 }, { "epoch": 3.070756361101429, "grad_norm": 0.02152939699590206, "learning_rate": 7.716974555594284e-06, "loss": 0.0867, "step": 8810 }, { "epoch": 3.0742418961310563, "grad_norm": 29.05510711669922, "learning_rate": 7.703032415475776e-06, "loss": 0.2069, "step": 8820 }, { "epoch": 3.077727431160683, "grad_norm": 0.029673922806978226, "learning_rate": 7.689090275357269e-06, "loss": 0.1669, "step": 8830 }, { "epoch": 3.0812129661903103, "grad_norm": 0.02840520814061165, "learning_rate": 7.675148135238761e-06, "loss": 0.2332, "step": 8840 }, { "epoch": 3.084698501219937, "grad_norm": 0.2850615978240967, "learning_rate": 7.661205995120252e-06, "loss": 0.146, "step": 8850 }, { "epoch": 3.0881840362495643, "grad_norm": 3.1782281398773193, "learning_rate": 7.647263855001743e-06, "loss": 0.0449, "step": 8860 }, { "epoch": 3.0916695712791915, "grad_norm": 2.359084367752075, "learning_rate": 7.633321714883235e-06, "loss": 0.0291, "step": 8870 }, { "epoch": 3.0951551063088183, "grad_norm": 0.0349331870675087, "learning_rate": 7.619379574764727e-06, "loss": 0.1953, "step": 8880 }, { "epoch": 3.0986406413384455, "grad_norm": 0.03900326043367386, "learning_rate": 7.605437434646219e-06, "loss": 0.0101, "step": 8890 }, { "epoch": 3.1021261763680723, "grad_norm": 0.05492232367396355, "learning_rate": 7.591495294527711e-06, "loss": 0.024, "step": 8900 }, { "epoch": 3.1056117113976995, "grad_norm": 0.024204595014452934, "learning_rate": 7.577553154409203e-06, "loss": 0.2914, "step": 8910 }, { "epoch": 3.1090972464273268, "grad_norm": 0.21245832741260529, "learning_rate": 7.5636110142906935e-06, "loss": 0.1574, "step": 8920 }, { "epoch": 3.1125827814569536, "grad_norm": 4.7999587059021, "learning_rate": 7.549668874172186e-06, "loss": 0.1669, "step": 8930 }, { "epoch": 3.116068316486581, "grad_norm": 0.0458659753203392, "learning_rate": 7.535726734053678e-06, "loss": 0.2493, "step": 8940 }, { "epoch": 3.1195538515162076, "grad_norm": 0.025298912078142166, "learning_rate": 7.521784593935169e-06, "loss": 0.0541, "step": 8950 }, { "epoch": 3.123039386545835, "grad_norm": 1.2849235534667969, "learning_rate": 7.507842453816662e-06, "loss": 0.0486, "step": 8960 }, { "epoch": 3.126524921575462, "grad_norm": 0.03680930659174919, "learning_rate": 7.4939003136981535e-06, "loss": 0.1282, "step": 8970 }, { "epoch": 3.130010456605089, "grad_norm": 13.066097259521484, "learning_rate": 7.479958173579646e-06, "loss": 0.2534, "step": 8980 }, { "epoch": 3.133495991634716, "grad_norm": 0.02675449103116989, "learning_rate": 7.466016033461137e-06, "loss": 0.0103, "step": 8990 }, { "epoch": 3.136981526664343, "grad_norm": 0.038677338510751724, "learning_rate": 7.4520738933426285e-06, "loss": 0.0742, "step": 9000 }, { "epoch": 3.14046706169397, "grad_norm": 0.026775427162647247, "learning_rate": 7.43813175322412e-06, "loss": 0.0929, "step": 9010 }, { "epoch": 3.1439525967235973, "grad_norm": 0.04959922656416893, "learning_rate": 7.424189613105613e-06, "loss": 0.171, "step": 9020 }, { "epoch": 3.147438131753224, "grad_norm": 0.018528427928686142, "learning_rate": 7.410247472987104e-06, "loss": 0.1509, "step": 9030 }, { "epoch": 3.1509236667828513, "grad_norm": 0.02553379535675049, "learning_rate": 7.396305332868596e-06, "loss": 0.0405, "step": 9040 }, { "epoch": 3.154409201812478, "grad_norm": 0.045496754348278046, "learning_rate": 7.3823631927500885e-06, "loss": 0.0465, "step": 9050 }, { "epoch": 3.1578947368421053, "grad_norm": 18.267221450805664, "learning_rate": 7.368421052631579e-06, "loss": 0.1788, "step": 9060 }, { "epoch": 3.1613802718717325, "grad_norm": 0.037292297929525375, "learning_rate": 7.354478912513071e-06, "loss": 0.1418, "step": 9070 }, { "epoch": 3.1648658069013593, "grad_norm": 1.9102615118026733, "learning_rate": 7.3405367723945635e-06, "loss": 0.0699, "step": 9080 }, { "epoch": 3.1683513419309866, "grad_norm": 47.18280029296875, "learning_rate": 7.326594632276055e-06, "loss": 0.1691, "step": 9090 }, { "epoch": 3.1718368769606133, "grad_norm": 0.13229234516620636, "learning_rate": 7.312652492157547e-06, "loss": 0.1772, "step": 9100 }, { "epoch": 3.1753224119902406, "grad_norm": 2.6047537326812744, "learning_rate": 7.298710352039039e-06, "loss": 0.07, "step": 9110 }, { "epoch": 3.1788079470198674, "grad_norm": 0.02106287144124508, "learning_rate": 7.28476821192053e-06, "loss": 0.1055, "step": 9120 }, { "epoch": 3.1822934820494946, "grad_norm": 10.96900749206543, "learning_rate": 7.270826071802022e-06, "loss": 0.1797, "step": 9130 }, { "epoch": 3.185779017079122, "grad_norm": 0.1008637398481369, "learning_rate": 7.256883931683513e-06, "loss": 0.0178, "step": 9140 }, { "epoch": 3.1892645521087486, "grad_norm": 0.0249412152916193, "learning_rate": 7.242941791565006e-06, "loss": 0.3967, "step": 9150 }, { "epoch": 3.192750087138376, "grad_norm": 0.025104615837335587, "learning_rate": 7.228999651446498e-06, "loss": 0.0881, "step": 9160 }, { "epoch": 3.1962356221680026, "grad_norm": 0.02654813602566719, "learning_rate": 7.21505751132799e-06, "loss": 0.1262, "step": 9170 }, { "epoch": 3.19972115719763, "grad_norm": 0.03677063435316086, "learning_rate": 7.201115371209482e-06, "loss": 0.0244, "step": 9180 }, { "epoch": 3.203206692227257, "grad_norm": 4.220332145690918, "learning_rate": 7.187173231090973e-06, "loss": 0.1071, "step": 9190 }, { "epoch": 3.206692227256884, "grad_norm": 0.019408810883760452, "learning_rate": 7.173231090972464e-06, "loss": 0.1627, "step": 9200 }, { "epoch": 3.210177762286511, "grad_norm": 1.1298311948776245, "learning_rate": 7.159288950853957e-06, "loss": 0.1347, "step": 9210 }, { "epoch": 3.213663297316138, "grad_norm": 0.04483231157064438, "learning_rate": 7.145346810735448e-06, "loss": 0.2057, "step": 9220 }, { "epoch": 3.217148832345765, "grad_norm": 0.020672811195254326, "learning_rate": 7.13140467061694e-06, "loss": 0.0078, "step": 9230 }, { "epoch": 3.2206343673753923, "grad_norm": 0.022838251665234566, "learning_rate": 7.117462530498433e-06, "loss": 0.1516, "step": 9240 }, { "epoch": 3.224119902405019, "grad_norm": 0.037012387067079544, "learning_rate": 7.103520390379924e-06, "loss": 0.0569, "step": 9250 }, { "epoch": 3.2276054374346463, "grad_norm": 0.2272426038980484, "learning_rate": 7.089578250261415e-06, "loss": 0.0739, "step": 9260 }, { "epoch": 3.231090972464273, "grad_norm": 26.441532135009766, "learning_rate": 7.0756361101429076e-06, "loss": 0.0436, "step": 9270 }, { "epoch": 3.2345765074939004, "grad_norm": 19.132356643676758, "learning_rate": 7.061693970024399e-06, "loss": 0.181, "step": 9280 }, { "epoch": 3.238062042523527, "grad_norm": 0.4390275776386261, "learning_rate": 7.047751829905891e-06, "loss": 0.18, "step": 9290 }, { "epoch": 3.2415475775531544, "grad_norm": 0.013922056183218956, "learning_rate": 7.033809689787383e-06, "loss": 0.148, "step": 9300 }, { "epoch": 3.2450331125827816, "grad_norm": 0.01766609586775303, "learning_rate": 7.019867549668875e-06, "loss": 0.1277, "step": 9310 }, { "epoch": 3.2485186476124084, "grad_norm": 0.03631417080760002, "learning_rate": 7.005925409550367e-06, "loss": 0.0666, "step": 9320 }, { "epoch": 3.2520041826420356, "grad_norm": 0.022327765822410583, "learning_rate": 6.9919832694318575e-06, "loss": 0.1104, "step": 9330 }, { "epoch": 3.2554897176716624, "grad_norm": 28.965906143188477, "learning_rate": 6.97804112931335e-06, "loss": 0.1194, "step": 9340 }, { "epoch": 3.2589752527012896, "grad_norm": 0.016817284747958183, "learning_rate": 6.964098989194842e-06, "loss": 0.105, "step": 9350 }, { "epoch": 3.262460787730917, "grad_norm": 19.65138816833496, "learning_rate": 6.950156849076334e-06, "loss": 0.1736, "step": 9360 }, { "epoch": 3.2659463227605436, "grad_norm": 0.1805201917886734, "learning_rate": 6.936214708957826e-06, "loss": 0.2812, "step": 9370 }, { "epoch": 3.269431857790171, "grad_norm": 3.660529613494873, "learning_rate": 6.9222725688393175e-06, "loss": 0.1089, "step": 9380 }, { "epoch": 3.2729173928197977, "grad_norm": 18.988616943359375, "learning_rate": 6.90833042872081e-06, "loss": 0.2735, "step": 9390 }, { "epoch": 3.276402927849425, "grad_norm": 0.05193915590643883, "learning_rate": 6.894388288602301e-06, "loss": 0.3342, "step": 9400 }, { "epoch": 3.279888462879052, "grad_norm": 0.2501579225063324, "learning_rate": 6.8804461484837925e-06, "loss": 0.0079, "step": 9410 }, { "epoch": 3.283373997908679, "grad_norm": 0.035554543137550354, "learning_rate": 6.866504008365284e-06, "loss": 0.0192, "step": 9420 }, { "epoch": 3.286859532938306, "grad_norm": 0.020876824855804443, "learning_rate": 6.852561868246777e-06, "loss": 0.1817, "step": 9430 }, { "epoch": 3.290345067967933, "grad_norm": 3.084765672683716, "learning_rate": 6.838619728128268e-06, "loss": 0.044, "step": 9440 }, { "epoch": 3.29383060299756, "grad_norm": 18.532669067382812, "learning_rate": 6.82467758800976e-06, "loss": 0.272, "step": 9450 }, { "epoch": 3.2973161380271874, "grad_norm": 5.328271865844727, "learning_rate": 6.8107354478912525e-06, "loss": 0.1708, "step": 9460 }, { "epoch": 3.300801673056814, "grad_norm": 16.922470092773438, "learning_rate": 6.796793307772743e-06, "loss": 0.1577, "step": 9470 }, { "epoch": 3.3042872080864414, "grad_norm": 20.75090980529785, "learning_rate": 6.782851167654235e-06, "loss": 0.2414, "step": 9480 }, { "epoch": 3.307772743116068, "grad_norm": 0.03418176993727684, "learning_rate": 6.7689090275357275e-06, "loss": 0.1943, "step": 9490 }, { "epoch": 3.3112582781456954, "grad_norm": 0.4410526156425476, "learning_rate": 6.754966887417219e-06, "loss": 0.0739, "step": 9500 }, { "epoch": 3.3147438131753226, "grad_norm": 0.02994650788605213, "learning_rate": 6.741024747298711e-06, "loss": 0.187, "step": 9510 }, { "epoch": 3.3182293482049494, "grad_norm": 6.901801109313965, "learning_rate": 6.727082607180203e-06, "loss": 0.1074, "step": 9520 }, { "epoch": 3.3217148832345766, "grad_norm": 24.88802719116211, "learning_rate": 6.713140467061694e-06, "loss": 0.1535, "step": 9530 }, { "epoch": 3.3252004182642034, "grad_norm": 0.2463180273771286, "learning_rate": 6.699198326943186e-06, "loss": 0.1021, "step": 9540 }, { "epoch": 3.3286859532938307, "grad_norm": 0.09169216454029083, "learning_rate": 6.685256186824678e-06, "loss": 0.0811, "step": 9550 }, { "epoch": 3.332171488323458, "grad_norm": 2.2960045337677, "learning_rate": 6.67131404670617e-06, "loss": 0.1068, "step": 9560 }, { "epoch": 3.3356570233530847, "grad_norm": 0.021253207698464394, "learning_rate": 6.657371906587662e-06, "loss": 0.1687, "step": 9570 }, { "epoch": 3.339142558382712, "grad_norm": 28.67542266845703, "learning_rate": 6.643429766469154e-06, "loss": 0.1184, "step": 9580 }, { "epoch": 3.3426280934123387, "grad_norm": 0.02517896145582199, "learning_rate": 6.629487626350646e-06, "loss": 0.1933, "step": 9590 }, { "epoch": 3.346113628441966, "grad_norm": 0.04085422307252884, "learning_rate": 6.615545486232137e-06, "loss": 0.0423, "step": 9600 }, { "epoch": 3.3495991634715927, "grad_norm": 1.9252256155014038, "learning_rate": 6.601603346113628e-06, "loss": 0.1851, "step": 9610 }, { "epoch": 3.35308469850122, "grad_norm": 15.485836029052734, "learning_rate": 6.587661205995121e-06, "loss": 0.2882, "step": 9620 }, { "epoch": 3.356570233530847, "grad_norm": 7.36222505569458, "learning_rate": 6.5737190658766125e-06, "loss": 0.256, "step": 9630 }, { "epoch": 3.360055768560474, "grad_norm": 0.07581675052642822, "learning_rate": 6.559776925758104e-06, "loss": 0.1117, "step": 9640 }, { "epoch": 3.363541303590101, "grad_norm": 1.1792480945587158, "learning_rate": 6.545834785639597e-06, "loss": 0.2192, "step": 9650 }, { "epoch": 3.367026838619728, "grad_norm": 0.04260709509253502, "learning_rate": 6.531892645521088e-06, "loss": 0.0455, "step": 9660 }, { "epoch": 3.370512373649355, "grad_norm": 0.02434307336807251, "learning_rate": 6.517950505402579e-06, "loss": 0.0985, "step": 9670 }, { "epoch": 3.3739979086789824, "grad_norm": 80.342529296875, "learning_rate": 6.504008365284072e-06, "loss": 0.0361, "step": 9680 }, { "epoch": 3.377483443708609, "grad_norm": 0.01803007163107395, "learning_rate": 6.490066225165563e-06, "loss": 0.07, "step": 9690 }, { "epoch": 3.3809689787382364, "grad_norm": 0.06161894649267197, "learning_rate": 6.476124085047055e-06, "loss": 0.1747, "step": 9700 }, { "epoch": 3.384454513767863, "grad_norm": 0.02253262884914875, "learning_rate": 6.4621819449285475e-06, "loss": 0.0444, "step": 9710 }, { "epoch": 3.3879400487974904, "grad_norm": 0.027096986770629883, "learning_rate": 6.448239804810039e-06, "loss": 0.1927, "step": 9720 }, { "epoch": 3.3914255838271172, "grad_norm": 0.42232781648635864, "learning_rate": 6.434297664691531e-06, "loss": 0.1841, "step": 9730 }, { "epoch": 3.3949111188567445, "grad_norm": 0.01901787333190441, "learning_rate": 6.4203555245730224e-06, "loss": 0.135, "step": 9740 }, { "epoch": 3.3983966538863717, "grad_norm": 4.167501926422119, "learning_rate": 6.406413384454514e-06, "loss": 0.0409, "step": 9750 }, { "epoch": 3.4018821889159985, "grad_norm": 17.263132095336914, "learning_rate": 6.392471244336006e-06, "loss": 0.1366, "step": 9760 }, { "epoch": 3.4053677239456257, "grad_norm": 4.265865325927734, "learning_rate": 6.378529104217498e-06, "loss": 0.1339, "step": 9770 }, { "epoch": 3.4088532589752525, "grad_norm": 0.022587908431887627, "learning_rate": 6.36458696409899e-06, "loss": 0.1715, "step": 9780 }, { "epoch": 3.4123387940048797, "grad_norm": 15.038971900939941, "learning_rate": 6.350644823980482e-06, "loss": 0.1432, "step": 9790 }, { "epoch": 3.415824329034507, "grad_norm": 0.056241635233163834, "learning_rate": 6.336702683861974e-06, "loss": 0.093, "step": 9800 }, { "epoch": 3.4193098640641337, "grad_norm": 0.08624168485403061, "learning_rate": 6.322760543743465e-06, "loss": 0.139, "step": 9810 }, { "epoch": 3.422795399093761, "grad_norm": 0.025193965062499046, "learning_rate": 6.308818403624957e-06, "loss": 0.0069, "step": 9820 }, { "epoch": 3.4262809341233877, "grad_norm": 12.65317440032959, "learning_rate": 6.294876263506448e-06, "loss": 0.1008, "step": 9830 }, { "epoch": 3.429766469153015, "grad_norm": 3.4288766384124756, "learning_rate": 6.280934123387941e-06, "loss": 0.2026, "step": 9840 }, { "epoch": 3.433252004182642, "grad_norm": 0.01608997769653797, "learning_rate": 6.266991983269432e-06, "loss": 0.0358, "step": 9850 }, { "epoch": 3.436737539212269, "grad_norm": 9.135321617126465, "learning_rate": 6.253049843150925e-06, "loss": 0.1489, "step": 9860 }, { "epoch": 3.440223074241896, "grad_norm": 0.02817300148308277, "learning_rate": 6.239107703032417e-06, "loss": 0.0225, "step": 9870 }, { "epoch": 3.443708609271523, "grad_norm": 0.03426109254360199, "learning_rate": 6.225165562913907e-06, "loss": 0.0896, "step": 9880 }, { "epoch": 3.4471941443011502, "grad_norm": 0.029791146516799927, "learning_rate": 6.211223422795399e-06, "loss": 0.0733, "step": 9890 }, { "epoch": 3.4506796793307775, "grad_norm": 5.195836067199707, "learning_rate": 6.1972812826768916e-06, "loss": 0.0285, "step": 9900 }, { "epoch": 3.4541652143604042, "grad_norm": 0.046240124851465225, "learning_rate": 6.183339142558383e-06, "loss": 0.2035, "step": 9910 }, { "epoch": 3.4576507493900315, "grad_norm": 14.654541969299316, "learning_rate": 6.169397002439875e-06, "loss": 0.115, "step": 9920 }, { "epoch": 3.4611362844196583, "grad_norm": 0.05055411905050278, "learning_rate": 6.155454862321367e-06, "loss": 0.0856, "step": 9930 }, { "epoch": 3.4646218194492855, "grad_norm": 0.04881644248962402, "learning_rate": 6.141512722202858e-06, "loss": 0.1484, "step": 9940 }, { "epoch": 3.4681073544789127, "grad_norm": 0.08495481312274933, "learning_rate": 6.12757058208435e-06, "loss": 0.0758, "step": 9950 }, { "epoch": 3.4715928895085395, "grad_norm": 0.028945187106728554, "learning_rate": 6.113628441965842e-06, "loss": 0.0965, "step": 9960 }, { "epoch": 3.4750784245381667, "grad_norm": 8.535677909851074, "learning_rate": 6.099686301847334e-06, "loss": 0.2292, "step": 9970 }, { "epoch": 3.4785639595677935, "grad_norm": 7.189770221710205, "learning_rate": 6.085744161728826e-06, "loss": 0.155, "step": 9980 }, { "epoch": 3.4820494945974207, "grad_norm": 0.022676438093185425, "learning_rate": 6.071802021610318e-06, "loss": 0.1451, "step": 9990 }, { "epoch": 3.485535029627048, "grad_norm": 37.57761764526367, "learning_rate": 6.05785988149181e-06, "loss": 0.1904, "step": 10000 }, { "epoch": 3.4890205646566748, "grad_norm": 0.05265439301729202, "learning_rate": 6.043917741373301e-06, "loss": 0.0894, "step": 10010 }, { "epoch": 3.492506099686302, "grad_norm": 0.08444428443908691, "learning_rate": 6.029975601254792e-06, "loss": 0.1031, "step": 10020 }, { "epoch": 3.4959916347159288, "grad_norm": 0.3577669858932495, "learning_rate": 6.016033461136285e-06, "loss": 0.0539, "step": 10030 }, { "epoch": 3.499477169745556, "grad_norm": 0.026337046176195145, "learning_rate": 6.0020913210177765e-06, "loss": 0.1267, "step": 10040 }, { "epoch": 3.5029627047751832, "grad_norm": 4.85567569732666, "learning_rate": 5.988149180899269e-06, "loss": 0.0094, "step": 10050 }, { "epoch": 3.50644823980481, "grad_norm": 1.0068359375, "learning_rate": 5.974207040780761e-06, "loss": 0.1011, "step": 10060 }, { "epoch": 3.5099337748344372, "grad_norm": 0.03198888525366783, "learning_rate": 5.960264900662252e-06, "loss": 0.1203, "step": 10070 }, { "epoch": 3.513419309864064, "grad_norm": 0.08028540015220642, "learning_rate": 5.946322760543743e-06, "loss": 0.073, "step": 10080 }, { "epoch": 3.5169048448936913, "grad_norm": 6.395030975341797, "learning_rate": 5.932380620425236e-06, "loss": 0.1022, "step": 10090 }, { "epoch": 3.5203903799233185, "grad_norm": 0.9497456550598145, "learning_rate": 5.918438480306727e-06, "loss": 0.133, "step": 10100 }, { "epoch": 3.5238759149529453, "grad_norm": 0.02326410636305809, "learning_rate": 5.904496340188219e-06, "loss": 0.218, "step": 10110 }, { "epoch": 3.527361449982572, "grad_norm": 0.12932069599628448, "learning_rate": 5.8905542000697115e-06, "loss": 0.0913, "step": 10120 }, { "epoch": 3.5308469850121993, "grad_norm": 0.023471644148230553, "learning_rate": 5.876612059951203e-06, "loss": 0.1593, "step": 10130 }, { "epoch": 3.5343325200418265, "grad_norm": 11.825116157531738, "learning_rate": 5.862669919832696e-06, "loss": 0.0478, "step": 10140 }, { "epoch": 3.5378180550714533, "grad_norm": 4.660243034362793, "learning_rate": 5.8487277797141865e-06, "loss": 0.0862, "step": 10150 }, { "epoch": 3.5413035901010805, "grad_norm": 0.017631346359848976, "learning_rate": 5.834785639595678e-06, "loss": 0.072, "step": 10160 }, { "epoch": 3.5447891251307073, "grad_norm": 0.02127777598798275, "learning_rate": 5.82084349947717e-06, "loss": 0.0143, "step": 10170 }, { "epoch": 3.5482746601603345, "grad_norm": 12.403838157653809, "learning_rate": 5.806901359358662e-06, "loss": 0.194, "step": 10180 }, { "epoch": 3.5517601951899618, "grad_norm": 0.01695849932730198, "learning_rate": 5.792959219240154e-06, "loss": 0.0156, "step": 10190 }, { "epoch": 3.5552457302195886, "grad_norm": 0.06215062364935875, "learning_rate": 5.779017079121646e-06, "loss": 0.3112, "step": 10200 }, { "epoch": 3.558731265249216, "grad_norm": 0.015378969721496105, "learning_rate": 5.765074939003138e-06, "loss": 0.0328, "step": 10210 }, { "epoch": 3.5622168002788426, "grad_norm": 0.5997032523155212, "learning_rate": 5.751132798884629e-06, "loss": 0.0973, "step": 10220 }, { "epoch": 3.56570233530847, "grad_norm": 0.17242105305194855, "learning_rate": 5.737190658766121e-06, "loss": 0.0697, "step": 10230 }, { "epoch": 3.569187870338097, "grad_norm": 0.028457053005695343, "learning_rate": 5.723248518647613e-06, "loss": 0.2034, "step": 10240 }, { "epoch": 3.572673405367724, "grad_norm": 51.637733459472656, "learning_rate": 5.709306378529105e-06, "loss": 0.3527, "step": 10250 }, { "epoch": 3.576158940397351, "grad_norm": 0.02154484950006008, "learning_rate": 5.6953642384105965e-06, "loss": 0.0558, "step": 10260 }, { "epoch": 3.579644475426978, "grad_norm": 0.02288723737001419, "learning_rate": 5.681422098292089e-06, "loss": 0.1087, "step": 10270 }, { "epoch": 3.583130010456605, "grad_norm": 0.02297695353627205, "learning_rate": 5.667479958173581e-06, "loss": 0.0692, "step": 10280 }, { "epoch": 3.5866155454862323, "grad_norm": 0.0711924359202385, "learning_rate": 5.6535378180550715e-06, "loss": 0.065, "step": 10290 }, { "epoch": 3.590101080515859, "grad_norm": 17.65730857849121, "learning_rate": 5.639595677936563e-06, "loss": 0.2072, "step": 10300 }, { "epoch": 3.5935866155454863, "grad_norm": 6.287166118621826, "learning_rate": 5.625653537818056e-06, "loss": 0.3501, "step": 10310 }, { "epoch": 3.597072150575113, "grad_norm": 0.05497564375400543, "learning_rate": 5.611711397699547e-06, "loss": 0.1217, "step": 10320 }, { "epoch": 3.6005576856047403, "grad_norm": 0.2877632677555084, "learning_rate": 5.59776925758104e-06, "loss": 0.1098, "step": 10330 }, { "epoch": 3.6040432206343676, "grad_norm": 0.033131808042526245, "learning_rate": 5.5838271174625315e-06, "loss": 0.0095, "step": 10340 }, { "epoch": 3.6075287556639943, "grad_norm": 0.057142358273267746, "learning_rate": 5.569884977344022e-06, "loss": 0.1609, "step": 10350 }, { "epoch": 3.6110142906936216, "grad_norm": 3.4275271892547607, "learning_rate": 5.555942837225514e-06, "loss": 0.1044, "step": 10360 }, { "epoch": 3.6144998257232483, "grad_norm": 0.03326639533042908, "learning_rate": 5.5420006971070064e-06, "loss": 0.0441, "step": 10370 }, { "epoch": 3.6179853607528756, "grad_norm": 0.018506675958633423, "learning_rate": 5.528058556988498e-06, "loss": 0.1586, "step": 10380 }, { "epoch": 3.621470895782503, "grad_norm": 0.05169185996055603, "learning_rate": 5.51411641686999e-06, "loss": 0.0747, "step": 10390 }, { "epoch": 3.6249564308121296, "grad_norm": 0.017710169777274132, "learning_rate": 5.500174276751482e-06, "loss": 0.0171, "step": 10400 }, { "epoch": 3.628441965841757, "grad_norm": 20.366230010986328, "learning_rate": 5.486232136632974e-06, "loss": 0.4428, "step": 10410 }, { "epoch": 3.6319275008713836, "grad_norm": 0.12488202750682831, "learning_rate": 5.472289996514465e-06, "loss": 0.2049, "step": 10420 }, { "epoch": 3.635413035901011, "grad_norm": 0.027057647705078125, "learning_rate": 5.458347856395957e-06, "loss": 0.0281, "step": 10430 }, { "epoch": 3.638898570930638, "grad_norm": 0.2535144090652466, "learning_rate": 5.444405716277449e-06, "loss": 0.009, "step": 10440 }, { "epoch": 3.642384105960265, "grad_norm": 5.463354110717773, "learning_rate": 5.430463576158941e-06, "loss": 0.131, "step": 10450 }, { "epoch": 3.645869640989892, "grad_norm": 0.021357407793402672, "learning_rate": 5.416521436040433e-06, "loss": 0.407, "step": 10460 }, { "epoch": 3.649355176019519, "grad_norm": 30.37545394897461, "learning_rate": 5.402579295921925e-06, "loss": 0.231, "step": 10470 }, { "epoch": 3.652840711049146, "grad_norm": 0.027980081737041473, "learning_rate": 5.388637155803416e-06, "loss": 0.2154, "step": 10480 }, { "epoch": 3.6563262460787733, "grad_norm": 0.03117654100060463, "learning_rate": 5.374695015684907e-06, "loss": 0.1123, "step": 10490 }, { "epoch": 3.6598117811084, "grad_norm": 0.9009264707565308, "learning_rate": 5.3607528755664e-06, "loss": 0.1333, "step": 10500 }, { "epoch": 3.6632973161380273, "grad_norm": 5.609897613525391, "learning_rate": 5.346810735447891e-06, "loss": 0.1772, "step": 10510 }, { "epoch": 3.666782851167654, "grad_norm": 0.05884939059615135, "learning_rate": 5.332868595329384e-06, "loss": 0.0984, "step": 10520 }, { "epoch": 3.6702683861972814, "grad_norm": 1.9585541486740112, "learning_rate": 5.318926455210876e-06, "loss": 0.0611, "step": 10530 }, { "epoch": 3.6737539212269086, "grad_norm": 1.0173242092132568, "learning_rate": 5.304984315092367e-06, "loss": 0.2067, "step": 10540 }, { "epoch": 3.6772394562565354, "grad_norm": 0.15780754387378693, "learning_rate": 5.29104217497386e-06, "loss": 0.1282, "step": 10550 }, { "epoch": 3.680724991286162, "grad_norm": 27.38100814819336, "learning_rate": 5.2771000348553506e-06, "loss": 0.2614, "step": 10560 }, { "epoch": 3.6842105263157894, "grad_norm": 0.04858740046620369, "learning_rate": 5.263157894736842e-06, "loss": 0.197, "step": 10570 }, { "epoch": 3.6876960613454166, "grad_norm": 0.01936270296573639, "learning_rate": 5.249215754618334e-06, "loss": 0.1291, "step": 10580 }, { "epoch": 3.691181596375044, "grad_norm": 0.02714901603758335, "learning_rate": 5.235273614499826e-06, "loss": 0.1723, "step": 10590 }, { "epoch": 3.6946671314046706, "grad_norm": 0.02381339855492115, "learning_rate": 5.221331474381318e-06, "loss": 0.0174, "step": 10600 }, { "epoch": 3.6981526664342974, "grad_norm": 0.06992737948894501, "learning_rate": 5.20738933426281e-06, "loss": 0.0499, "step": 10610 }, { "epoch": 3.7016382014639246, "grad_norm": 0.4815942943096161, "learning_rate": 5.193447194144302e-06, "loss": 0.0657, "step": 10620 }, { "epoch": 3.705123736493552, "grad_norm": 4.941831588745117, "learning_rate": 5.179505054025793e-06, "loss": 0.1945, "step": 10630 }, { "epoch": 3.7086092715231787, "grad_norm": 4.8328166007995605, "learning_rate": 5.165562913907285e-06, "loss": 0.1758, "step": 10640 }, { "epoch": 3.712094806552806, "grad_norm": 0.24197635054588318, "learning_rate": 5.151620773788777e-06, "loss": 0.1795, "step": 10650 }, { "epoch": 3.7155803415824327, "grad_norm": 22.532445907592773, "learning_rate": 5.137678633670269e-06, "loss": 0.1157, "step": 10660 }, { "epoch": 3.71906587661206, "grad_norm": 0.19902735948562622, "learning_rate": 5.1237364935517605e-06, "loss": 0.0064, "step": 10670 }, { "epoch": 3.722551411641687, "grad_norm": 12.738110542297363, "learning_rate": 5.109794353433253e-06, "loss": 0.0219, "step": 10680 }, { "epoch": 3.726036946671314, "grad_norm": 0.07453146576881409, "learning_rate": 5.095852213314745e-06, "loss": 0.1586, "step": 10690 }, { "epoch": 3.729522481700941, "grad_norm": 0.022043762728571892, "learning_rate": 5.0819100731962355e-06, "loss": 0.0053, "step": 10700 }, { "epoch": 3.733008016730568, "grad_norm": 15.734837532043457, "learning_rate": 5.067967933077728e-06, "loss": 0.2529, "step": 10710 }, { "epoch": 3.736493551760195, "grad_norm": 0.5030373930931091, "learning_rate": 5.05402579295922e-06, "loss": 0.0513, "step": 10720 }, { "epoch": 3.7399790867898224, "grad_norm": 0.10042136162519455, "learning_rate": 5.040083652840711e-06, "loss": 0.2097, "step": 10730 }, { "epoch": 3.743464621819449, "grad_norm": 0.4963725507259369, "learning_rate": 5.026141512722204e-06, "loss": 0.0957, "step": 10740 }, { "epoch": 3.7469501568490764, "grad_norm": 8.767095565795898, "learning_rate": 5.0121993726036955e-06, "loss": 0.2483, "step": 10750 }, { "epoch": 3.750435691878703, "grad_norm": 12.400495529174805, "learning_rate": 4.998257232485187e-06, "loss": 0.1802, "step": 10760 }, { "epoch": 3.7539212269083304, "grad_norm": 5.502656936645508, "learning_rate": 4.984315092366679e-06, "loss": 0.1525, "step": 10770 }, { "epoch": 3.7574067619379576, "grad_norm": 0.01613186113536358, "learning_rate": 4.9703729522481705e-06, "loss": 0.1123, "step": 10780 }, { "epoch": 3.7608922969675844, "grad_norm": 0.01936766505241394, "learning_rate": 4.956430812129662e-06, "loss": 0.0666, "step": 10790 }, { "epoch": 3.7643778319972117, "grad_norm": 16.322816848754883, "learning_rate": 4.942488672011154e-06, "loss": 0.0653, "step": 10800 }, { "epoch": 3.7678633670268384, "grad_norm": 17.28783416748047, "learning_rate": 4.9285465318926455e-06, "loss": 0.0617, "step": 10810 }, { "epoch": 3.7713489020564657, "grad_norm": 0.042436350136995316, "learning_rate": 4.914604391774138e-06, "loss": 0.152, "step": 10820 }, { "epoch": 3.774834437086093, "grad_norm": 0.20540641248226166, "learning_rate": 4.90066225165563e-06, "loss": 0.0412, "step": 10830 }, { "epoch": 3.7783199721157197, "grad_norm": 0.211298406124115, "learning_rate": 4.886720111537121e-06, "loss": 0.0181, "step": 10840 }, { "epoch": 3.781805507145347, "grad_norm": 0.09502696245908737, "learning_rate": 4.872777971418613e-06, "loss": 0.0132, "step": 10850 }, { "epoch": 3.7852910421749737, "grad_norm": 0.03848971426486969, "learning_rate": 4.858835831300105e-06, "loss": 0.0608, "step": 10860 }, { "epoch": 3.788776577204601, "grad_norm": 21.687044143676758, "learning_rate": 4.844893691181597e-06, "loss": 0.1938, "step": 10870 }, { "epoch": 3.792262112234228, "grad_norm": 0.02101794257760048, "learning_rate": 4.830951551063088e-06, "loss": 0.0113, "step": 10880 }, { "epoch": 3.795747647263855, "grad_norm": 0.036022286862134933, "learning_rate": 4.8170094109445805e-06, "loss": 0.164, "step": 10890 }, { "epoch": 3.799233182293482, "grad_norm": 0.035032596439123154, "learning_rate": 4.803067270826072e-06, "loss": 0.0974, "step": 10900 }, { "epoch": 3.802718717323109, "grad_norm": 8.165773391723633, "learning_rate": 4.789125130707565e-06, "loss": 0.0294, "step": 10910 }, { "epoch": 3.806204252352736, "grad_norm": 0.01464312057942152, "learning_rate": 4.7751829905890555e-06, "loss": 0.1462, "step": 10920 }, { "epoch": 3.8096897873823634, "grad_norm": 0.02338375523686409, "learning_rate": 4.761240850470548e-06, "loss": 0.1984, "step": 10930 }, { "epoch": 3.81317532241199, "grad_norm": 0.021365277469158173, "learning_rate": 4.74729871035204e-06, "loss": 0.1061, "step": 10940 }, { "epoch": 3.8166608574416174, "grad_norm": 0.01990444026887417, "learning_rate": 4.733356570233531e-06, "loss": 0.0306, "step": 10950 }, { "epoch": 3.820146392471244, "grad_norm": 0.664445161819458, "learning_rate": 4.719414430115023e-06, "loss": 0.0078, "step": 10960 }, { "epoch": 3.8236319275008714, "grad_norm": 2.076838254928589, "learning_rate": 4.705472289996515e-06, "loss": 0.0971, "step": 10970 }, { "epoch": 3.8271174625304987, "grad_norm": 0.01907547004520893, "learning_rate": 4.691530149878007e-06, "loss": 0.0404, "step": 10980 }, { "epoch": 3.8306029975601255, "grad_norm": 0.4929560720920563, "learning_rate": 4.677588009759498e-06, "loss": 0.0403, "step": 10990 }, { "epoch": 3.8340885325897527, "grad_norm": 3.4360711574554443, "learning_rate": 4.6636458696409905e-06, "loss": 0.2111, "step": 11000 }, { "epoch": 3.8375740676193795, "grad_norm": 3.6092143058776855, "learning_rate": 4.649703729522482e-06, "loss": 0.2806, "step": 11010 }, { "epoch": 3.8410596026490067, "grad_norm": 0.01587533950805664, "learning_rate": 4.635761589403974e-06, "loss": 0.0803, "step": 11020 }, { "epoch": 3.844545137678634, "grad_norm": 17.90401840209961, "learning_rate": 4.6218194492854654e-06, "loss": 0.1311, "step": 11030 }, { "epoch": 3.8480306727082607, "grad_norm": 0.02171589806675911, "learning_rate": 4.607877309166958e-06, "loss": 0.0515, "step": 11040 }, { "epoch": 3.8515162077378875, "grad_norm": 0.36789968609809875, "learning_rate": 4.593935169048449e-06, "loss": 0.1818, "step": 11050 }, { "epoch": 3.8550017427675147, "grad_norm": 0.017076525837183, "learning_rate": 4.579993028929941e-06, "loss": 0.0441, "step": 11060 }, { "epoch": 3.858487277797142, "grad_norm": 0.03205644711852074, "learning_rate": 4.566050888811433e-06, "loss": 0.0985, "step": 11070 }, { "epoch": 3.861972812826769, "grad_norm": 0.031755849719047546, "learning_rate": 4.552108748692925e-06, "loss": 0.1107, "step": 11080 }, { "epoch": 3.865458347856396, "grad_norm": 0.020482953637838364, "learning_rate": 4.538166608574416e-06, "loss": 0.1496, "step": 11090 }, { "epoch": 3.8689438828860228, "grad_norm": 0.047831941395998, "learning_rate": 4.524224468455909e-06, "loss": 0.1769, "step": 11100 }, { "epoch": 3.87242941791565, "grad_norm": 0.02326442301273346, "learning_rate": 4.5102823283374004e-06, "loss": 0.1952, "step": 11110 }, { "epoch": 3.875914952945277, "grad_norm": 0.2660370469093323, "learning_rate": 4.496340188218892e-06, "loss": 0.0117, "step": 11120 }, { "epoch": 3.879400487974904, "grad_norm": 0.023432690650224686, "learning_rate": 4.482398048100384e-06, "loss": 0.1016, "step": 11130 }, { "epoch": 3.8828860230045312, "grad_norm": 11.599336624145508, "learning_rate": 4.468455907981875e-06, "loss": 0.0267, "step": 11140 }, { "epoch": 3.886371558034158, "grad_norm": 0.030066780745983124, "learning_rate": 4.454513767863368e-06, "loss": 0.0146, "step": 11150 }, { "epoch": 3.8898570930637852, "grad_norm": 0.30465713143348694, "learning_rate": 4.440571627744859e-06, "loss": 0.0186, "step": 11160 }, { "epoch": 3.8933426280934125, "grad_norm": 0.04410243034362793, "learning_rate": 4.426629487626351e-06, "loss": 0.0072, "step": 11170 }, { "epoch": 3.8968281631230393, "grad_norm": 39.044795989990234, "learning_rate": 4.412687347507843e-06, "loss": 0.1416, "step": 11180 }, { "epoch": 3.9003136981526665, "grad_norm": 28.497705459594727, "learning_rate": 4.3987452073893346e-06, "loss": 0.0732, "step": 11190 }, { "epoch": 3.9037992331822933, "grad_norm": 0.019765986129641533, "learning_rate": 4.384803067270826e-06, "loss": 0.0791, "step": 11200 }, { "epoch": 3.9072847682119205, "grad_norm": 0.32468003034591675, "learning_rate": 4.370860927152319e-06, "loss": 0.1067, "step": 11210 }, { "epoch": 3.9107703032415477, "grad_norm": 0.02749641239643097, "learning_rate": 4.3569187870338096e-06, "loss": 0.171, "step": 11220 }, { "epoch": 3.9142558382711745, "grad_norm": 12.454182624816895, "learning_rate": 4.342976646915302e-06, "loss": 0.2891, "step": 11230 }, { "epoch": 3.9177413733008017, "grad_norm": 0.03475377708673477, "learning_rate": 4.329034506796794e-06, "loss": 0.2393, "step": 11240 }, { "epoch": 3.9212269083304285, "grad_norm": 0.01971781998872757, "learning_rate": 4.315092366678285e-06, "loss": 0.1189, "step": 11250 }, { "epoch": 3.9247124433600558, "grad_norm": 0.7159892916679382, "learning_rate": 4.301150226559777e-06, "loss": 0.1387, "step": 11260 }, { "epoch": 3.928197978389683, "grad_norm": 10.798375129699707, "learning_rate": 4.287208086441269e-06, "loss": 0.2423, "step": 11270 }, { "epoch": 3.9316835134193098, "grad_norm": 1.2279161214828491, "learning_rate": 4.273265946322761e-06, "loss": 0.0822, "step": 11280 }, { "epoch": 3.935169048448937, "grad_norm": 0.023021556437015533, "learning_rate": 4.259323806204252e-06, "loss": 0.1275, "step": 11290 }, { "epoch": 3.938654583478564, "grad_norm": 0.041272278875112534, "learning_rate": 4.2453816660857445e-06, "loss": 0.0148, "step": 11300 }, { "epoch": 3.942140118508191, "grad_norm": 3.4936373233795166, "learning_rate": 4.231439525967236e-06, "loss": 0.1178, "step": 11310 }, { "epoch": 3.9456256535378182, "grad_norm": 0.033816587179899216, "learning_rate": 4.217497385848729e-06, "loss": 0.2568, "step": 11320 }, { "epoch": 3.949111188567445, "grad_norm": 0.018212752416729927, "learning_rate": 4.2035552457302195e-06, "loss": 0.0131, "step": 11330 }, { "epoch": 3.9525967235970723, "grad_norm": 10.551046371459961, "learning_rate": 4.189613105611712e-06, "loss": 0.1611, "step": 11340 }, { "epoch": 3.956082258626699, "grad_norm": 0.018220404163002968, "learning_rate": 4.175670965493204e-06, "loss": 0.223, "step": 11350 }, { "epoch": 3.9595677936563263, "grad_norm": 0.02707715705037117, "learning_rate": 4.161728825374695e-06, "loss": 0.0611, "step": 11360 }, { "epoch": 3.9630533286859535, "grad_norm": 6.247076511383057, "learning_rate": 4.147786685256187e-06, "loss": 0.2263, "step": 11370 }, { "epoch": 3.9665388637155803, "grad_norm": 0.013209417462348938, "learning_rate": 4.133844545137679e-06, "loss": 0.0905, "step": 11380 }, { "epoch": 3.9700243987452075, "grad_norm": 0.19896091520786285, "learning_rate": 4.119902405019171e-06, "loss": 0.1132, "step": 11390 }, { "epoch": 3.9735099337748343, "grad_norm": 0.01759384572505951, "learning_rate": 4.105960264900663e-06, "loss": 0.0424, "step": 11400 }, { "epoch": 3.9769954688044615, "grad_norm": 0.018540671095252037, "learning_rate": 4.0920181247821545e-06, "loss": 0.095, "step": 11410 }, { "epoch": 3.9804810038340888, "grad_norm": 0.018298756331205368, "learning_rate": 4.078075984663646e-06, "loss": 0.0239, "step": 11420 }, { "epoch": 3.9839665388637155, "grad_norm": 0.10425955057144165, "learning_rate": 4.064133844545138e-06, "loss": 0.0831, "step": 11430 }, { "epoch": 3.9874520738933428, "grad_norm": 0.29570281505584717, "learning_rate": 4.0501917044266295e-06, "loss": 0.0078, "step": 11440 }, { "epoch": 3.9909376089229696, "grad_norm": 10.413164138793945, "learning_rate": 4.036249564308122e-06, "loss": 0.0978, "step": 11450 }, { "epoch": 3.994423143952597, "grad_norm": 0.021612277254462242, "learning_rate": 4.022307424189613e-06, "loss": 0.0971, "step": 11460 }, { "epoch": 3.997908678982224, "grad_norm": 2.315714120864868, "learning_rate": 4.008365284071105e-06, "loss": 0.0977, "step": 11470 }, { "epoch": 4.0, "eval_accuracy": 0.9906172839506173, "eval_loss": 0.04471336677670479, "eval_runtime": 18.8269, "eval_samples_per_second": 215.118, "eval_steps_per_second": 26.93, "step": 11476 }, { "epoch": 4.001394214011851, "grad_norm": 0.014861468225717545, "learning_rate": 3.994423143952597e-06, "loss": 0.0664, "step": 11480 }, { "epoch": 4.004879749041478, "grad_norm": 0.18193350732326508, "learning_rate": 3.9804810038340895e-06, "loss": 0.1891, "step": 11490 }, { "epoch": 4.008365284071105, "grad_norm": 0.04888654500246048, "learning_rate": 3.96653886371558e-06, "loss": 0.054, "step": 11500 }, { "epoch": 4.011850819100732, "grad_norm": 0.022148391231894493, "learning_rate": 3.952596723597073e-06, "loss": 0.0184, "step": 11510 }, { "epoch": 4.015336354130359, "grad_norm": 3.5250911712646484, "learning_rate": 3.9386545834785645e-06, "loss": 0.0466, "step": 11520 }, { "epoch": 4.018821889159986, "grad_norm": 8.17469596862793, "learning_rate": 3.924712443360056e-06, "loss": 0.1563, "step": 11530 }, { "epoch": 4.022307424189613, "grad_norm": 0.12059949338436127, "learning_rate": 3.910770303241548e-06, "loss": 0.1135, "step": 11540 }, { "epoch": 4.02579295921924, "grad_norm": 0.23522257804870605, "learning_rate": 3.8968281631230395e-06, "loss": 0.1346, "step": 11550 }, { "epoch": 4.029278494248867, "grad_norm": 0.014988483861088753, "learning_rate": 3.882886023004532e-06, "loss": 0.0105, "step": 11560 }, { "epoch": 4.0327640292784945, "grad_norm": 0.3500690460205078, "learning_rate": 3.868943882886023e-06, "loss": 0.0901, "step": 11570 }, { "epoch": 4.036249564308121, "grad_norm": 0.01964680850505829, "learning_rate": 3.855001742767515e-06, "loss": 0.0047, "step": 11580 }, { "epoch": 4.039735099337748, "grad_norm": 0.01860162802040577, "learning_rate": 3.841059602649007e-06, "loss": 0.0216, "step": 11590 }, { "epoch": 4.043220634367375, "grad_norm": 31.001237869262695, "learning_rate": 3.827117462530499e-06, "loss": 0.1348, "step": 11600 }, { "epoch": 4.046706169397003, "grad_norm": 1.8247324228286743, "learning_rate": 3.8131753224119907e-06, "loss": 0.1087, "step": 11610 }, { "epoch": 4.05019170442663, "grad_norm": 0.9444401860237122, "learning_rate": 3.7992331822934824e-06, "loss": 0.1226, "step": 11620 }, { "epoch": 4.053677239456256, "grad_norm": 0.05249320715665817, "learning_rate": 3.785291042174974e-06, "loss": 0.1094, "step": 11630 }, { "epoch": 4.057162774485883, "grad_norm": 13.864377975463867, "learning_rate": 3.7713489020564657e-06, "loss": 0.1238, "step": 11640 }, { "epoch": 4.060648309515511, "grad_norm": 0.03523989021778107, "learning_rate": 3.7574067619379578e-06, "loss": 0.0571, "step": 11650 }, { "epoch": 4.064133844545138, "grad_norm": 22.960308074951172, "learning_rate": 3.74346462181945e-06, "loss": 0.2689, "step": 11660 }, { "epoch": 4.067619379574765, "grad_norm": 0.023842979222536087, "learning_rate": 3.729522481700941e-06, "loss": 0.1566, "step": 11670 }, { "epoch": 4.071104914604391, "grad_norm": 0.2230115532875061, "learning_rate": 3.715580341582433e-06, "loss": 0.0784, "step": 11680 }, { "epoch": 4.074590449634019, "grad_norm": 1.818585753440857, "learning_rate": 3.7016382014639253e-06, "loss": 0.1562, "step": 11690 }, { "epoch": 4.078075984663646, "grad_norm": 0.024207105860114098, "learning_rate": 3.6876960613454165e-06, "loss": 0.0896, "step": 11700 }, { "epoch": 4.081561519693273, "grad_norm": 8.846477508544922, "learning_rate": 3.6737539212269086e-06, "loss": 0.1073, "step": 11710 }, { "epoch": 4.0850470547229, "grad_norm": 4.648227214813232, "learning_rate": 3.6598117811084007e-06, "loss": 0.0199, "step": 11720 }, { "epoch": 4.088532589752527, "grad_norm": 0.014286634512245655, "learning_rate": 3.6458696409898923e-06, "loss": 0.0208, "step": 11730 }, { "epoch": 4.092018124782154, "grad_norm": 8.536304473876953, "learning_rate": 3.631927500871384e-06, "loss": 0.0569, "step": 11740 }, { "epoch": 4.095503659811781, "grad_norm": 0.018616320565342903, "learning_rate": 3.6179853607528757e-06, "loss": 0.0343, "step": 11750 }, { "epoch": 4.098989194841408, "grad_norm": 0.03940120339393616, "learning_rate": 3.6040432206343678e-06, "loss": 0.0799, "step": 11760 }, { "epoch": 4.102474729871036, "grad_norm": 0.017780063673853874, "learning_rate": 3.5901010805158594e-06, "loss": 0.0922, "step": 11770 }, { "epoch": 4.105960264900662, "grad_norm": 0.03300795704126358, "learning_rate": 3.576158940397351e-06, "loss": 0.0589, "step": 11780 }, { "epoch": 4.109445799930289, "grad_norm": 37.71131896972656, "learning_rate": 3.562216800278843e-06, "loss": 0.144, "step": 11790 }, { "epoch": 4.112931334959916, "grad_norm": 0.2066090852022171, "learning_rate": 3.5482746601603352e-06, "loss": 0.0068, "step": 11800 }, { "epoch": 4.116416869989544, "grad_norm": 0.02809598483145237, "learning_rate": 3.5343325200418265e-06, "loss": 0.2141, "step": 11810 }, { "epoch": 4.119902405019171, "grad_norm": 11.681623458862305, "learning_rate": 3.5203903799233186e-06, "loss": 0.1601, "step": 11820 }, { "epoch": 4.123387940048797, "grad_norm": 0.021636666730046272, "learning_rate": 3.5064482398048107e-06, "loss": 0.1021, "step": 11830 }, { "epoch": 4.126873475078424, "grad_norm": 0.01840069517493248, "learning_rate": 3.492506099686302e-06, "loss": 0.0054, "step": 11840 }, { "epoch": 4.130359010108052, "grad_norm": 14.279787063598633, "learning_rate": 3.478563959567794e-06, "loss": 0.4595, "step": 11850 }, { "epoch": 4.133844545137679, "grad_norm": 0.06327791512012482, "learning_rate": 3.464621819449286e-06, "loss": 0.0815, "step": 11860 }, { "epoch": 4.137330080167306, "grad_norm": 7.440787315368652, "learning_rate": 3.4506796793307773e-06, "loss": 0.0566, "step": 11870 }, { "epoch": 4.140815615196932, "grad_norm": 0.01611819863319397, "learning_rate": 3.4367375392122694e-06, "loss": 0.1947, "step": 11880 }, { "epoch": 4.14430115022656, "grad_norm": 0.07511651515960693, "learning_rate": 3.422795399093761e-06, "loss": 0.2238, "step": 11890 }, { "epoch": 4.147786685256187, "grad_norm": 0.05026530474424362, "learning_rate": 3.408853258975253e-06, "loss": 0.1576, "step": 11900 }, { "epoch": 4.151272220285814, "grad_norm": 12.329663276672363, "learning_rate": 3.394911118856745e-06, "loss": 0.0859, "step": 11910 }, { "epoch": 4.154757755315441, "grad_norm": 6.723945140838623, "learning_rate": 3.3809689787382365e-06, "loss": 0.1676, "step": 11920 }, { "epoch": 4.158243290345068, "grad_norm": 0.024456940591335297, "learning_rate": 3.3670268386197285e-06, "loss": 0.1249, "step": 11930 }, { "epoch": 4.161728825374695, "grad_norm": 0.017061809077858925, "learning_rate": 3.3530846985012198e-06, "loss": 0.1512, "step": 11940 }, { "epoch": 4.165214360404322, "grad_norm": 0.03508226200938225, "learning_rate": 3.339142558382712e-06, "loss": 0.1291, "step": 11950 }, { "epoch": 4.168699895433949, "grad_norm": 0.017415596172213554, "learning_rate": 3.325200418264204e-06, "loss": 0.1062, "step": 11960 }, { "epoch": 4.172185430463577, "grad_norm": 4.462165355682373, "learning_rate": 3.311258278145696e-06, "loss": 0.0761, "step": 11970 }, { "epoch": 4.175670965493203, "grad_norm": 0.017748460173606873, "learning_rate": 3.2973161380271873e-06, "loss": 0.1781, "step": 11980 }, { "epoch": 4.17915650052283, "grad_norm": 0.04285305365920067, "learning_rate": 3.2833739979086794e-06, "loss": 0.0114, "step": 11990 }, { "epoch": 4.182642035552457, "grad_norm": 13.827252388000488, "learning_rate": 3.2694318577901714e-06, "loss": 0.0203, "step": 12000 }, { "epoch": 4.186127570582085, "grad_norm": 0.01608210802078247, "learning_rate": 3.2554897176716627e-06, "loss": 0.065, "step": 12010 }, { "epoch": 4.189613105611711, "grad_norm": 22.72849464416504, "learning_rate": 3.2415475775531548e-06, "loss": 0.146, "step": 12020 }, { "epoch": 4.193098640641338, "grad_norm": 0.013584673404693604, "learning_rate": 3.2276054374346464e-06, "loss": 0.0754, "step": 12030 }, { "epoch": 4.196584175670965, "grad_norm": 0.02281971462070942, "learning_rate": 3.213663297316138e-06, "loss": 0.0626, "step": 12040 }, { "epoch": 4.200069710700593, "grad_norm": 1.6404436826705933, "learning_rate": 3.19972115719763e-06, "loss": 0.2863, "step": 12050 }, { "epoch": 4.20355524573022, "grad_norm": 6.147258281707764, "learning_rate": 3.185779017079122e-06, "loss": 0.1192, "step": 12060 }, { "epoch": 4.207040780759846, "grad_norm": 19.095373153686523, "learning_rate": 3.171836876960614e-06, "loss": 0.2516, "step": 12070 }, { "epoch": 4.2105263157894735, "grad_norm": 0.01998833194375038, "learning_rate": 3.157894736842105e-06, "loss": 0.0836, "step": 12080 }, { "epoch": 4.214011850819101, "grad_norm": 0.02509804256260395, "learning_rate": 3.1439525967235973e-06, "loss": 0.1586, "step": 12090 }, { "epoch": 4.217497385848728, "grad_norm": 0.02285916358232498, "learning_rate": 3.1300104566050893e-06, "loss": 0.1148, "step": 12100 }, { "epoch": 4.220982920878355, "grad_norm": 0.017659736797213554, "learning_rate": 3.1160683164865806e-06, "loss": 0.0536, "step": 12110 }, { "epoch": 4.2244684559079815, "grad_norm": 0.14031673967838287, "learning_rate": 3.1021261763680727e-06, "loss": 0.1591, "step": 12120 }, { "epoch": 4.227953990937609, "grad_norm": 0.03257942944765091, "learning_rate": 3.0881840362495647e-06, "loss": 0.0295, "step": 12130 }, { "epoch": 4.231439525967236, "grad_norm": 5.860164165496826, "learning_rate": 3.0742418961310564e-06, "loss": 0.0525, "step": 12140 }, { "epoch": 4.234925060996863, "grad_norm": 0.023333774879574776, "learning_rate": 3.060299756012548e-06, "loss": 0.1362, "step": 12150 }, { "epoch": 4.23841059602649, "grad_norm": 0.021630102768540382, "learning_rate": 3.04635761589404e-06, "loss": 0.306, "step": 12160 }, { "epoch": 4.241896131056117, "grad_norm": 0.9809282422065735, "learning_rate": 3.032415475775532e-06, "loss": 0.0831, "step": 12170 }, { "epoch": 4.245381666085744, "grad_norm": 0.17336943745613098, "learning_rate": 3.0184733356570235e-06, "loss": 0.0056, "step": 12180 }, { "epoch": 4.248867201115371, "grad_norm": 0.020092690363526344, "learning_rate": 3.0045311955385156e-06, "loss": 0.023, "step": 12190 }, { "epoch": 4.252352736144998, "grad_norm": 6.104828834533691, "learning_rate": 2.9905890554200072e-06, "loss": 0.1396, "step": 12200 }, { "epoch": 4.255838271174626, "grad_norm": 0.06953276693820953, "learning_rate": 2.9766469153014993e-06, "loss": 0.0521, "step": 12210 }, { "epoch": 4.259323806204252, "grad_norm": 0.06008859723806381, "learning_rate": 2.9627047751829905e-06, "loss": 0.0795, "step": 12220 }, { "epoch": 4.262809341233879, "grad_norm": 0.06809389591217041, "learning_rate": 2.9487626350644826e-06, "loss": 0.0766, "step": 12230 }, { "epoch": 4.2662948762635065, "grad_norm": 0.011282606981694698, "learning_rate": 2.9348204949459747e-06, "loss": 0.1127, "step": 12240 }, { "epoch": 4.269780411293134, "grad_norm": 0.01990368217229843, "learning_rate": 2.920878354827466e-06, "loss": 0.0512, "step": 12250 }, { "epoch": 4.273265946322761, "grad_norm": 2.730123996734619, "learning_rate": 2.906936214708958e-06, "loss": 0.133, "step": 12260 }, { "epoch": 4.276751481352387, "grad_norm": 0.018478725105524063, "learning_rate": 2.89299407459045e-06, "loss": 0.3299, "step": 12270 }, { "epoch": 4.2802370163820145, "grad_norm": 0.027384992688894272, "learning_rate": 2.8790519344719414e-06, "loss": 0.015, "step": 12280 }, { "epoch": 4.283722551411642, "grad_norm": 0.38204970955848694, "learning_rate": 2.8651097943534334e-06, "loss": 0.0074, "step": 12290 }, { "epoch": 4.287208086441269, "grad_norm": 6.249034881591797, "learning_rate": 2.8511676542349255e-06, "loss": 0.0899, "step": 12300 }, { "epoch": 4.290693621470896, "grad_norm": 2.7420356273651123, "learning_rate": 2.837225514116417e-06, "loss": 0.0179, "step": 12310 }, { "epoch": 4.2941791565005225, "grad_norm": 3.861992835998535, "learning_rate": 2.823283373997909e-06, "loss": 0.1254, "step": 12320 }, { "epoch": 4.29766469153015, "grad_norm": 1.4951390027999878, "learning_rate": 2.8093412338794005e-06, "loss": 0.0504, "step": 12330 }, { "epoch": 4.301150226559777, "grad_norm": 0.025097187608480453, "learning_rate": 2.7953990937608926e-06, "loss": 0.0815, "step": 12340 }, { "epoch": 4.304635761589404, "grad_norm": 0.020737938582897186, "learning_rate": 2.7814569536423843e-06, "loss": 0.0526, "step": 12350 }, { "epoch": 4.308121296619031, "grad_norm": 0.025950776413083076, "learning_rate": 2.767514813523876e-06, "loss": 0.0167, "step": 12360 }, { "epoch": 4.311606831648658, "grad_norm": 0.4815838634967804, "learning_rate": 2.753572673405368e-06, "loss": 0.0061, "step": 12370 }, { "epoch": 4.315092366678285, "grad_norm": 0.020343678072094917, "learning_rate": 2.73963053328686e-06, "loss": 0.1312, "step": 12380 }, { "epoch": 4.318577901707912, "grad_norm": 6.27520227432251, "learning_rate": 2.7256883931683513e-06, "loss": 0.1138, "step": 12390 }, { "epoch": 4.3220634367375395, "grad_norm": 19.571958541870117, "learning_rate": 2.7117462530498434e-06, "loss": 0.1826, "step": 12400 }, { "epoch": 4.325548971767166, "grad_norm": 0.015069302171468735, "learning_rate": 2.6978041129313355e-06, "loss": 0.0538, "step": 12410 }, { "epoch": 4.329034506796793, "grad_norm": 0.01998594030737877, "learning_rate": 2.6838619728128267e-06, "loss": 0.0339, "step": 12420 }, { "epoch": 4.33252004182642, "grad_norm": 0.017047762870788574, "learning_rate": 2.669919832694319e-06, "loss": 0.0475, "step": 12430 }, { "epoch": 4.3360055768560475, "grad_norm": 6.753593444824219, "learning_rate": 2.655977692575811e-06, "loss": 0.2493, "step": 12440 }, { "epoch": 4.339491111885675, "grad_norm": 0.022272732108831406, "learning_rate": 2.642035552457302e-06, "loss": 0.1162, "step": 12450 }, { "epoch": 4.342976646915302, "grad_norm": 0.01944439485669136, "learning_rate": 2.6280934123387942e-06, "loss": 0.2014, "step": 12460 }, { "epoch": 4.346462181944928, "grad_norm": 0.020634399726986885, "learning_rate": 2.614151272220286e-06, "loss": 0.0043, "step": 12470 }, { "epoch": 4.3499477169745555, "grad_norm": 0.12367178499698639, "learning_rate": 2.600209132101778e-06, "loss": 0.0248, "step": 12480 }, { "epoch": 4.353433252004183, "grad_norm": 6.508563995361328, "learning_rate": 2.5862669919832696e-06, "loss": 0.1527, "step": 12490 }, { "epoch": 4.35691878703381, "grad_norm": 0.014639653265476227, "learning_rate": 2.5723248518647613e-06, "loss": 0.0247, "step": 12500 }, { "epoch": 4.360404322063436, "grad_norm": 0.128875732421875, "learning_rate": 2.5583827117462534e-06, "loss": 0.1266, "step": 12510 }, { "epoch": 4.3638898570930635, "grad_norm": 0.028906451538205147, "learning_rate": 2.5444405716277446e-06, "loss": 0.1789, "step": 12520 }, { "epoch": 4.367375392122691, "grad_norm": 0.0448245145380497, "learning_rate": 2.5304984315092367e-06, "loss": 0.1235, "step": 12530 }, { "epoch": 4.370860927152318, "grad_norm": 0.015661604702472687, "learning_rate": 2.516556291390729e-06, "loss": 0.2039, "step": 12540 }, { "epoch": 4.374346462181945, "grad_norm": 0.021742140874266624, "learning_rate": 2.502614151272221e-06, "loss": 0.0054, "step": 12550 }, { "epoch": 4.377831997211572, "grad_norm": 68.89356994628906, "learning_rate": 2.4886720111537126e-06, "loss": 0.1393, "step": 12560 }, { "epoch": 4.381317532241199, "grad_norm": 0.11168365180492401, "learning_rate": 2.4747298710352042e-06, "loss": 0.0047, "step": 12570 }, { "epoch": 4.384803067270826, "grad_norm": 0.05092855915427208, "learning_rate": 2.460787730916696e-06, "loss": 0.1151, "step": 12580 }, { "epoch": 4.388288602300453, "grad_norm": 0.025330761447548866, "learning_rate": 2.446845590798188e-06, "loss": 0.0385, "step": 12590 }, { "epoch": 4.3917741373300805, "grad_norm": 0.034395404160022736, "learning_rate": 2.4329034506796796e-06, "loss": 0.1206, "step": 12600 }, { "epoch": 4.395259672359707, "grad_norm": 10.358739852905273, "learning_rate": 2.4189613105611713e-06, "loss": 0.1857, "step": 12610 }, { "epoch": 4.398745207389334, "grad_norm": 1.8648558855056763, "learning_rate": 2.405019170442663e-06, "loss": 0.0807, "step": 12620 }, { "epoch": 4.402230742418961, "grad_norm": 3.7046706676483154, "learning_rate": 2.391077030324155e-06, "loss": 0.0273, "step": 12630 }, { "epoch": 4.4057162774485885, "grad_norm": 0.2638864517211914, "learning_rate": 2.3771348902056467e-06, "loss": 0.052, "step": 12640 }, { "epoch": 4.409201812478216, "grad_norm": 10.242549896240234, "learning_rate": 2.3631927500871384e-06, "loss": 0.25, "step": 12650 }, { "epoch": 4.412687347507842, "grad_norm": 0.021201113238930702, "learning_rate": 2.3492506099686304e-06, "loss": 0.1335, "step": 12660 }, { "epoch": 4.416172882537469, "grad_norm": 0.015749704092741013, "learning_rate": 2.335308469850122e-06, "loss": 0.2059, "step": 12670 }, { "epoch": 4.4196584175670965, "grad_norm": 0.01748356781899929, "learning_rate": 2.3213663297316138e-06, "loss": 0.1123, "step": 12680 }, { "epoch": 4.423143952596724, "grad_norm": 16.068105697631836, "learning_rate": 2.307424189613106e-06, "loss": 0.0576, "step": 12690 }, { "epoch": 4.426629487626351, "grad_norm": 0.042768694460392, "learning_rate": 2.2934820494945975e-06, "loss": 0.0666, "step": 12700 }, { "epoch": 4.430115022655977, "grad_norm": 6.316594123840332, "learning_rate": 2.2795399093760896e-06, "loss": 0.1412, "step": 12710 }, { "epoch": 4.433600557685605, "grad_norm": 0.023556068539619446, "learning_rate": 2.2655977692575813e-06, "loss": 0.1011, "step": 12720 }, { "epoch": 4.437086092715232, "grad_norm": 0.03524250537157059, "learning_rate": 2.2516556291390733e-06, "loss": 0.1937, "step": 12730 }, { "epoch": 4.440571627744859, "grad_norm": 4.944858551025391, "learning_rate": 2.237713489020565e-06, "loss": 0.1197, "step": 12740 }, { "epoch": 4.444057162774486, "grad_norm": 18.027040481567383, "learning_rate": 2.2237713489020567e-06, "loss": 0.1547, "step": 12750 }, { "epoch": 4.447542697804113, "grad_norm": 0.22646109759807587, "learning_rate": 2.2098292087835483e-06, "loss": 0.1819, "step": 12760 }, { "epoch": 4.45102823283374, "grad_norm": 1.5214154720306396, "learning_rate": 2.1958870686650404e-06, "loss": 0.0784, "step": 12770 }, { "epoch": 4.454513767863367, "grad_norm": 9.586079597473145, "learning_rate": 2.181944928546532e-06, "loss": 0.1344, "step": 12780 }, { "epoch": 4.457999302892994, "grad_norm": 0.03214803338050842, "learning_rate": 2.1680027884280237e-06, "loss": 0.0525, "step": 12790 }, { "epoch": 4.4614848379226215, "grad_norm": 0.06377983093261719, "learning_rate": 2.1540606483095154e-06, "loss": 0.1088, "step": 12800 }, { "epoch": 4.464970372952248, "grad_norm": 2.2348597049713135, "learning_rate": 2.1401185081910075e-06, "loss": 0.0789, "step": 12810 }, { "epoch": 4.468455907981875, "grad_norm": 0.040668126195669174, "learning_rate": 2.126176368072499e-06, "loss": 0.033, "step": 12820 }, { "epoch": 4.471941443011502, "grad_norm": 0.012590868398547173, "learning_rate": 2.1122342279539912e-06, "loss": 0.027, "step": 12830 }, { "epoch": 4.4754269780411295, "grad_norm": 0.01730727031826973, "learning_rate": 2.098292087835483e-06, "loss": 0.0822, "step": 12840 }, { "epoch": 4.478912513070757, "grad_norm": 0.05510111153125763, "learning_rate": 2.084349947716975e-06, "loss": 0.0093, "step": 12850 }, { "epoch": 4.482398048100383, "grad_norm": 0.07657311856746674, "learning_rate": 2.0704078075984666e-06, "loss": 0.0442, "step": 12860 }, { "epoch": 4.48588358313001, "grad_norm": 9.510165214538574, "learning_rate": 2.0564656674799583e-06, "loss": 0.1325, "step": 12870 }, { "epoch": 4.489369118159638, "grad_norm": 0.020914927124977112, "learning_rate": 2.0425235273614504e-06, "loss": 0.0475, "step": 12880 }, { "epoch": 4.492854653189265, "grad_norm": 41.944488525390625, "learning_rate": 2.028581387242942e-06, "loss": 0.1917, "step": 12890 }, { "epoch": 4.496340188218891, "grad_norm": 25.84819984436035, "learning_rate": 2.0146392471244337e-06, "loss": 0.2525, "step": 12900 }, { "epoch": 4.499825723248518, "grad_norm": 0.020068377256393433, "learning_rate": 2.0006971070059254e-06, "loss": 0.1163, "step": 12910 }, { "epoch": 4.503311258278146, "grad_norm": 0.1646548956632614, "learning_rate": 1.9867549668874175e-06, "loss": 0.124, "step": 12920 }, { "epoch": 4.506796793307773, "grad_norm": 0.013263706117868423, "learning_rate": 1.972812826768909e-06, "loss": 0.1735, "step": 12930 }, { "epoch": 4.5102823283374, "grad_norm": 0.03346388414502144, "learning_rate": 1.9588706866504008e-06, "loss": 0.006, "step": 12940 }, { "epoch": 4.513767863367027, "grad_norm": 0.04874487593770027, "learning_rate": 1.944928546531893e-06, "loss": 0.1541, "step": 12950 }, { "epoch": 4.517253398396654, "grad_norm": 0.03367030993103981, "learning_rate": 1.9309864064133845e-06, "loss": 0.095, "step": 12960 }, { "epoch": 4.520738933426281, "grad_norm": 0.015354972332715988, "learning_rate": 1.9170442662948766e-06, "loss": 0.2585, "step": 12970 }, { "epoch": 4.524224468455908, "grad_norm": 17.290773391723633, "learning_rate": 1.9031021261763683e-06, "loss": 0.1559, "step": 12980 }, { "epoch": 4.527710003485535, "grad_norm": 0.03650696575641632, "learning_rate": 1.88915998605786e-06, "loss": 0.0843, "step": 12990 }, { "epoch": 4.531195538515162, "grad_norm": 0.8950408697128296, "learning_rate": 1.8752178459393518e-06, "loss": 0.0086, "step": 13000 }, { "epoch": 4.534681073544789, "grad_norm": 0.047185130417346954, "learning_rate": 1.8612757058208437e-06, "loss": 0.1137, "step": 13010 }, { "epoch": 4.538166608574416, "grad_norm": 19.941810607910156, "learning_rate": 1.8473335657023356e-06, "loss": 0.0534, "step": 13020 }, { "epoch": 4.541652143604043, "grad_norm": 0.03801906108856201, "learning_rate": 1.8333914255838272e-06, "loss": 0.1251, "step": 13030 }, { "epoch": 4.545137678633671, "grad_norm": 0.018815016373991966, "learning_rate": 1.8194492854653189e-06, "loss": 0.1215, "step": 13040 }, { "epoch": 4.548623213663298, "grad_norm": 0.015067674219608307, "learning_rate": 1.805507145346811e-06, "loss": 0.1624, "step": 13050 }, { "epoch": 4.552108748692924, "grad_norm": 0.05160703510046005, "learning_rate": 1.7915650052283026e-06, "loss": 0.0379, "step": 13060 }, { "epoch": 4.555594283722551, "grad_norm": 0.9364856481552124, "learning_rate": 1.7776228651097945e-06, "loss": 0.0417, "step": 13070 }, { "epoch": 4.559079818752179, "grad_norm": 0.06938211619853973, "learning_rate": 1.7636807249912864e-06, "loss": 0.0135, "step": 13080 }, { "epoch": 4.562565353781806, "grad_norm": 0.019827021285891533, "learning_rate": 1.749738584872778e-06, "loss": 0.0508, "step": 13090 }, { "epoch": 4.566050888811432, "grad_norm": 0.11981372535228729, "learning_rate": 1.73579644475427e-06, "loss": 0.057, "step": 13100 }, { "epoch": 4.569536423841059, "grad_norm": 0.3380275368690491, "learning_rate": 1.7218543046357616e-06, "loss": 0.0149, "step": 13110 }, { "epoch": 4.573021958870687, "grad_norm": 0.017787497490644455, "learning_rate": 1.7079121645172537e-06, "loss": 0.0651, "step": 13120 }, { "epoch": 4.576507493900314, "grad_norm": 0.033968303352594376, "learning_rate": 1.6939700243987453e-06, "loss": 0.0363, "step": 13130 }, { "epoch": 4.579993028929941, "grad_norm": 0.08648999035358429, "learning_rate": 1.6800278842802372e-06, "loss": 0.1379, "step": 13140 }, { "epoch": 4.583478563959567, "grad_norm": 11.68699836730957, "learning_rate": 1.666085744161729e-06, "loss": 0.0815, "step": 13150 }, { "epoch": 4.586964098989195, "grad_norm": 0.021847940981388092, "learning_rate": 1.6521436040432207e-06, "loss": 0.116, "step": 13160 }, { "epoch": 4.590449634018822, "grad_norm": 0.018791014328598976, "learning_rate": 1.6382014639247126e-06, "loss": 0.0267, "step": 13170 }, { "epoch": 4.593935169048449, "grad_norm": 18.658403396606445, "learning_rate": 1.6242593238062043e-06, "loss": 0.1086, "step": 13180 }, { "epoch": 4.597420704078076, "grad_norm": 0.09876930713653564, "learning_rate": 1.6103171836876963e-06, "loss": 0.0523, "step": 13190 }, { "epoch": 4.600906239107703, "grad_norm": 1.7560397386550903, "learning_rate": 1.596375043569188e-06, "loss": 0.0086, "step": 13200 }, { "epoch": 4.60439177413733, "grad_norm": 0.05320524796843529, "learning_rate": 1.5824329034506797e-06, "loss": 0.0986, "step": 13210 }, { "epoch": 4.607877309166957, "grad_norm": 0.028736671432852745, "learning_rate": 1.5684907633321718e-06, "loss": 0.1403, "step": 13220 }, { "epoch": 4.611362844196584, "grad_norm": 0.021549373865127563, "learning_rate": 1.5545486232136634e-06, "loss": 0.0132, "step": 13230 }, { "epoch": 4.614848379226212, "grad_norm": 0.7282363176345825, "learning_rate": 1.5406064830951553e-06, "loss": 0.0712, "step": 13240 }, { "epoch": 4.618333914255838, "grad_norm": 0.08495603501796722, "learning_rate": 1.526664342976647e-06, "loss": 0.0062, "step": 13250 }, { "epoch": 4.621819449285465, "grad_norm": 0.015024428255856037, "learning_rate": 1.512722202858139e-06, "loss": 0.0218, "step": 13260 }, { "epoch": 4.625304984315092, "grad_norm": 0.04030692204833031, "learning_rate": 1.4987800627396307e-06, "loss": 0.0916, "step": 13270 }, { "epoch": 4.62879051934472, "grad_norm": 0.018205171450972557, "learning_rate": 1.4848379226211224e-06, "loss": 0.0038, "step": 13280 }, { "epoch": 4.632276054374346, "grad_norm": 1.867049217224121, "learning_rate": 1.4708957825026142e-06, "loss": 0.1129, "step": 13290 }, { "epoch": 4.635761589403973, "grad_norm": 9.02762508392334, "learning_rate": 1.456953642384106e-06, "loss": 0.0186, "step": 13300 }, { "epoch": 4.6392471244336, "grad_norm": 9.14907169342041, "learning_rate": 1.443011502265598e-06, "loss": 0.1519, "step": 13310 }, { "epoch": 4.642732659463228, "grad_norm": 15.300692558288574, "learning_rate": 1.4290693621470896e-06, "loss": 0.0941, "step": 13320 }, { "epoch": 4.646218194492855, "grad_norm": 0.02297472208738327, "learning_rate": 1.4151272220285813e-06, "loss": 0.0892, "step": 13330 }, { "epoch": 4.649703729522482, "grad_norm": 0.013637103140354156, "learning_rate": 1.4011850819100734e-06, "loss": 0.1728, "step": 13340 }, { "epoch": 4.6531892645521085, "grad_norm": 3.2457101345062256, "learning_rate": 1.387242941791565e-06, "loss": 0.2769, "step": 13350 }, { "epoch": 4.656674799581736, "grad_norm": 0.019199127331376076, "learning_rate": 1.373300801673057e-06, "loss": 0.0128, "step": 13360 }, { "epoch": 4.660160334611363, "grad_norm": 4.897704601287842, "learning_rate": 1.3593586615545488e-06, "loss": 0.0735, "step": 13370 }, { "epoch": 4.66364586964099, "grad_norm": 3.3588168621063232, "learning_rate": 1.3454165214360407e-06, "loss": 0.1323, "step": 13380 }, { "epoch": 4.6671314046706165, "grad_norm": 4.936280727386475, "learning_rate": 1.3314743813175323e-06, "loss": 0.0841, "step": 13390 }, { "epoch": 4.670616939700244, "grad_norm": 3.541562080383301, "learning_rate": 1.317532241199024e-06, "loss": 0.0276, "step": 13400 }, { "epoch": 4.674102474729871, "grad_norm": 0.05973378196358681, "learning_rate": 1.303590101080516e-06, "loss": 0.0098, "step": 13410 }, { "epoch": 4.677588009759498, "grad_norm": 0.05543622002005577, "learning_rate": 1.2896479609620077e-06, "loss": 0.0983, "step": 13420 }, { "epoch": 4.681073544789125, "grad_norm": 0.028797583654522896, "learning_rate": 1.2757058208434996e-06, "loss": 0.0048, "step": 13430 }, { "epoch": 4.684559079818753, "grad_norm": 0.03675423562526703, "learning_rate": 1.2617636807249915e-06, "loss": 0.041, "step": 13440 }, { "epoch": 4.688044614848379, "grad_norm": 0.04537774622440338, "learning_rate": 1.2478215406064831e-06, "loss": 0.0517, "step": 13450 }, { "epoch": 4.691530149878006, "grad_norm": 0.11532427370548248, "learning_rate": 1.233879400487975e-06, "loss": 0.056, "step": 13460 }, { "epoch": 4.695015684907633, "grad_norm": 0.06099528446793556, "learning_rate": 1.2199372603694667e-06, "loss": 0.0924, "step": 13470 }, { "epoch": 4.698501219937261, "grad_norm": 0.050252217799425125, "learning_rate": 1.2059951202509586e-06, "loss": 0.1061, "step": 13480 }, { "epoch": 4.701986754966887, "grad_norm": 0.04012449085712433, "learning_rate": 1.1920529801324504e-06, "loss": 0.1954, "step": 13490 }, { "epoch": 4.705472289996514, "grad_norm": 0.016682496294379234, "learning_rate": 1.1781108400139423e-06, "loss": 0.1782, "step": 13500 }, { "epoch": 4.7089578250261415, "grad_norm": 0.021084588021039963, "learning_rate": 1.1641686998954342e-06, "loss": 0.2397, "step": 13510 }, { "epoch": 4.712443360055769, "grad_norm": 0.016580209136009216, "learning_rate": 1.1502265597769258e-06, "loss": 0.0376, "step": 13520 }, { "epoch": 4.715928895085396, "grad_norm": 11.953326225280762, "learning_rate": 1.1362844196584175e-06, "loss": 0.0902, "step": 13530 }, { "epoch": 4.719414430115023, "grad_norm": 17.264039993286133, "learning_rate": 1.1223422795399094e-06, "loss": 0.189, "step": 13540 }, { "epoch": 4.7228999651446495, "grad_norm": 0.02113143354654312, "learning_rate": 1.1084001394214012e-06, "loss": 0.1533, "step": 13550 }, { "epoch": 4.726385500174277, "grad_norm": 0.029918327927589417, "learning_rate": 1.0944579993028931e-06, "loss": 0.1908, "step": 13560 }, { "epoch": 4.729871035203904, "grad_norm": 0.1918916404247284, "learning_rate": 1.080515859184385e-06, "loss": 0.1448, "step": 13570 }, { "epoch": 4.733356570233531, "grad_norm": 3.7918004989624023, "learning_rate": 1.0665737190658767e-06, "loss": 0.0895, "step": 13580 }, { "epoch": 4.7368421052631575, "grad_norm": 15.452225685119629, "learning_rate": 1.0526315789473685e-06, "loss": 0.154, "step": 13590 }, { "epoch": 4.740327640292785, "grad_norm": 0.013763573952019215, "learning_rate": 1.0386894388288602e-06, "loss": 0.0615, "step": 13600 }, { "epoch": 4.743813175322412, "grad_norm": 2.3458456993103027, "learning_rate": 1.024747298710352e-06, "loss": 0.1512, "step": 13610 }, { "epoch": 4.747298710352039, "grad_norm": 25.5869197845459, "learning_rate": 1.010805158591844e-06, "loss": 0.1152, "step": 13620 }, { "epoch": 4.750784245381666, "grad_norm": 0.01689908280968666, "learning_rate": 9.968630184733358e-07, "loss": 0.0267, "step": 13630 }, { "epoch": 4.754269780411293, "grad_norm": 0.054154131561517715, "learning_rate": 9.829208783548277e-07, "loss": 0.0818, "step": 13640 }, { "epoch": 4.75775531544092, "grad_norm": 0.030660415068268776, "learning_rate": 9.689787382363193e-07, "loss": 0.0049, "step": 13650 }, { "epoch": 4.761240850470547, "grad_norm": 0.10165391117334366, "learning_rate": 9.550365981178112e-07, "loss": 0.179, "step": 13660 }, { "epoch": 4.7647263855001745, "grad_norm": 0.038559578359127045, "learning_rate": 9.41094457999303e-07, "loss": 0.1079, "step": 13670 }, { "epoch": 4.768211920529802, "grad_norm": 0.322663813829422, "learning_rate": 9.271523178807948e-07, "loss": 0.073, "step": 13680 }, { "epoch": 4.771697455559428, "grad_norm": 0.013543305918574333, "learning_rate": 9.132101777622866e-07, "loss": 0.0798, "step": 13690 }, { "epoch": 4.775182990589055, "grad_norm": 0.022806629538536072, "learning_rate": 8.992680376437785e-07, "loss": 0.056, "step": 13700 }, { "epoch": 4.7786685256186825, "grad_norm": 0.024512002244591713, "learning_rate": 8.853258975252702e-07, "loss": 0.1357, "step": 13710 }, { "epoch": 4.78215406064831, "grad_norm": 0.03469526022672653, "learning_rate": 8.713837574067619e-07, "loss": 0.0626, "step": 13720 }, { "epoch": 4.785639595677937, "grad_norm": 19.904218673706055, "learning_rate": 8.574416172882538e-07, "loss": 0.0779, "step": 13730 }, { "epoch": 4.789125130707563, "grad_norm": 0.01593809947371483, "learning_rate": 8.434994771697457e-07, "loss": 0.0538, "step": 13740 }, { "epoch": 4.7926106657371905, "grad_norm": 0.012609903700649738, "learning_rate": 8.295573370512374e-07, "loss": 0.162, "step": 13750 }, { "epoch": 4.796096200766818, "grad_norm": 0.14370125532150269, "learning_rate": 8.156151969327293e-07, "loss": 0.0212, "step": 13760 }, { "epoch": 4.799581735796445, "grad_norm": 0.16198384761810303, "learning_rate": 8.01673056814221e-07, "loss": 0.0167, "step": 13770 }, { "epoch": 4.803067270826071, "grad_norm": 37.06806945800781, "learning_rate": 7.877309166957129e-07, "loss": 0.0545, "step": 13780 }, { "epoch": 4.8065528058556986, "grad_norm": 0.036267928779125214, "learning_rate": 7.737887765772046e-07, "loss": 0.1962, "step": 13790 }, { "epoch": 4.810038340885326, "grad_norm": 0.038520101457834244, "learning_rate": 7.598466364586965e-07, "loss": 0.0049, "step": 13800 }, { "epoch": 4.813523875914953, "grad_norm": 0.34361743927001953, "learning_rate": 7.459044963401884e-07, "loss": 0.063, "step": 13810 }, { "epoch": 4.81700941094458, "grad_norm": 1.973721981048584, "learning_rate": 7.3196235622168e-07, "loss": 0.1321, "step": 13820 }, { "epoch": 4.8204949459742075, "grad_norm": 0.058219779282808304, "learning_rate": 7.180202161031719e-07, "loss": 0.0716, "step": 13830 }, { "epoch": 4.823980481003834, "grad_norm": 0.029252415522933006, "learning_rate": 7.040780759846637e-07, "loss": 0.11, "step": 13840 }, { "epoch": 4.827466016033461, "grad_norm": 8.357165336608887, "learning_rate": 6.901359358661555e-07, "loss": 0.1614, "step": 13850 }, { "epoch": 4.830951551063088, "grad_norm": 0.27816373109817505, "learning_rate": 6.761937957476473e-07, "loss": 0.1628, "step": 13860 }, { "epoch": 4.8344370860927155, "grad_norm": 0.04683591425418854, "learning_rate": 6.622516556291392e-07, "loss": 0.1641, "step": 13870 }, { "epoch": 4.837922621122342, "grad_norm": 13.681280136108398, "learning_rate": 6.483095155106308e-07, "loss": 0.0121, "step": 13880 }, { "epoch": 4.841408156151969, "grad_norm": 0.020022893324494362, "learning_rate": 6.343673753921227e-07, "loss": 0.0361, "step": 13890 }, { "epoch": 4.844893691181596, "grad_norm": 0.016766728833317757, "learning_rate": 6.204252352736145e-07, "loss": 0.0044, "step": 13900 }, { "epoch": 4.8483792262112235, "grad_norm": 0.034691717475652695, "learning_rate": 6.064830951551064e-07, "loss": 0.1031, "step": 13910 }, { "epoch": 4.851864761240851, "grad_norm": 0.28107964992523193, "learning_rate": 5.925409550365982e-07, "loss": 0.1587, "step": 13920 }, { "epoch": 4.855350296270478, "grad_norm": 0.032920971512794495, "learning_rate": 5.7859881491809e-07, "loss": 0.1591, "step": 13930 }, { "epoch": 4.858835831300104, "grad_norm": 0.01294040773063898, "learning_rate": 5.646566747995818e-07, "loss": 0.0468, "step": 13940 }, { "epoch": 4.8623213663297316, "grad_norm": 0.06196725741028786, "learning_rate": 5.507145346810736e-07, "loss": 0.1613, "step": 13950 }, { "epoch": 4.865806901359359, "grad_norm": 2.9842488765716553, "learning_rate": 5.367723945625654e-07, "loss": 0.0059, "step": 13960 }, { "epoch": 4.869292436388986, "grad_norm": 0.224669948220253, "learning_rate": 5.228302544440572e-07, "loss": 0.0523, "step": 13970 }, { "epoch": 4.872777971418612, "grad_norm": 2.8570218086242676, "learning_rate": 5.08888114325549e-07, "loss": 0.088, "step": 13980 }, { "epoch": 4.87626350644824, "grad_norm": 0.2920582592487335, "learning_rate": 4.949459742070408e-07, "loss": 0.0094, "step": 13990 }, { "epoch": 4.879749041477867, "grad_norm": 0.02076049894094467, "learning_rate": 4.810038340885327e-07, "loss": 0.1568, "step": 14000 }, { "epoch": 4.883234576507494, "grad_norm": 0.015274460427463055, "learning_rate": 4.670616939700244e-07, "loss": 0.1205, "step": 14010 }, { "epoch": 4.886720111537121, "grad_norm": 2.485515594482422, "learning_rate": 4.5311955385151623e-07, "loss": 0.1998, "step": 14020 }, { "epoch": 4.890205646566748, "grad_norm": 0.03131948783993721, "learning_rate": 4.3917741373300805e-07, "loss": 0.0188, "step": 14030 }, { "epoch": 4.893691181596375, "grad_norm": 0.014717698097229004, "learning_rate": 4.252352736144998e-07, "loss": 0.0876, "step": 14040 }, { "epoch": 4.897176716626002, "grad_norm": 0.023664269596338272, "learning_rate": 4.1129313349599164e-07, "loss": 0.1329, "step": 14050 }, { "epoch": 4.900662251655629, "grad_norm": 0.017263269051909447, "learning_rate": 3.973509933774835e-07, "loss": 0.2117, "step": 14060 }, { "epoch": 4.9041477866852565, "grad_norm": 0.2318657636642456, "learning_rate": 3.834088532589753e-07, "loss": 0.0779, "step": 14070 }, { "epoch": 4.907633321714883, "grad_norm": 0.06534525007009506, "learning_rate": 3.694667131404671e-07, "loss": 0.0394, "step": 14080 }, { "epoch": 4.91111885674451, "grad_norm": 14.957361221313477, "learning_rate": 3.555245730219589e-07, "loss": 0.0659, "step": 14090 }, { "epoch": 4.914604391774137, "grad_norm": 11.34598445892334, "learning_rate": 3.415824329034507e-07, "loss": 0.0993, "step": 14100 }, { "epoch": 4.9180899268037646, "grad_norm": 3.656956672668457, "learning_rate": 3.276402927849425e-07, "loss": 0.0936, "step": 14110 }, { "epoch": 4.921575461833392, "grad_norm": 0.048377808183431625, "learning_rate": 3.1369815266643433e-07, "loss": 0.1247, "step": 14120 }, { "epoch": 4.925060996863018, "grad_norm": 21.954980850219727, "learning_rate": 2.9975601254792615e-07, "loss": 0.1416, "step": 14130 }, { "epoch": 4.928546531892645, "grad_norm": 10.352675437927246, "learning_rate": 2.858138724294179e-07, "loss": 0.0832, "step": 14140 }, { "epoch": 4.932032066922273, "grad_norm": 3.8607187271118164, "learning_rate": 2.7187173231090974e-07, "loss": 0.1472, "step": 14150 }, { "epoch": 4.9355176019519, "grad_norm": 0.041085511445999146, "learning_rate": 2.5792959219240156e-07, "loss": 0.1278, "step": 14160 }, { "epoch": 4.939003136981527, "grad_norm": 0.017858577892184258, "learning_rate": 2.439874520738934e-07, "loss": 0.0691, "step": 14170 }, { "epoch": 4.942488672011153, "grad_norm": 10.322635650634766, "learning_rate": 2.3004531195538517e-07, "loss": 0.0753, "step": 14180 }, { "epoch": 4.945974207040781, "grad_norm": 0.03775576502084732, "learning_rate": 2.16103171836877e-07, "loss": 0.022, "step": 14190 }, { "epoch": 4.949459742070408, "grad_norm": 23.317575454711914, "learning_rate": 2.021610317183688e-07, "loss": 0.1228, "step": 14200 }, { "epoch": 4.952945277100035, "grad_norm": 0.027847325429320335, "learning_rate": 1.8821889159986058e-07, "loss": 0.0041, "step": 14210 }, { "epoch": 4.956430812129662, "grad_norm": 7.33128023147583, "learning_rate": 1.7427675148135243e-07, "loss": 0.0204, "step": 14220 }, { "epoch": 4.959916347159289, "grad_norm": 0.49565252661705017, "learning_rate": 1.6033461136284422e-07, "loss": 0.0091, "step": 14230 }, { "epoch": 4.963401882188916, "grad_norm": 0.3179240822792053, "learning_rate": 1.4639247124433602e-07, "loss": 0.1085, "step": 14240 }, { "epoch": 4.966887417218543, "grad_norm": 0.015233837999403477, "learning_rate": 1.3245033112582784e-07, "loss": 0.0079, "step": 14250 }, { "epoch": 4.97037295224817, "grad_norm": 4.127240180969238, "learning_rate": 1.1850819100731964e-07, "loss": 0.0403, "step": 14260 }, { "epoch": 4.973858487277797, "grad_norm": 8.784873962402344, "learning_rate": 1.0456605088881144e-07, "loss": 0.0699, "step": 14270 }, { "epoch": 4.977344022307424, "grad_norm": 9.374968528747559, "learning_rate": 9.062391077030325e-08, "loss": 0.039, "step": 14280 }, { "epoch": 4.980829557337051, "grad_norm": 0.012251177802681923, "learning_rate": 7.668177065179505e-08, "loss": 0.1414, "step": 14290 }, { "epoch": 4.984315092366678, "grad_norm": 2.2470829486846924, "learning_rate": 6.273963053328686e-08, "loss": 0.1129, "step": 14300 }, { "epoch": 4.987800627396306, "grad_norm": 1.4505870342254639, "learning_rate": 4.8797490414778674e-08, "loss": 0.0438, "step": 14310 }, { "epoch": 4.991286162425933, "grad_norm": 0.05611838400363922, "learning_rate": 3.485535029627048e-08, "loss": 0.0759, "step": 14320 }, { "epoch": 4.994771697455559, "grad_norm": 0.01623629219830036, "learning_rate": 2.0913210177762286e-08, "loss": 0.0602, "step": 14330 }, { "epoch": 4.998257232485186, "grad_norm": 0.022672150284051895, "learning_rate": 6.971070059254096e-09, "loss": 0.1617, "step": 14340 }, { "epoch": 5.0, "eval_accuracy": 0.9916049382716049, "eval_loss": 0.039273809641599655, "eval_runtime": 19.3797, "eval_samples_per_second": 208.981, "eval_steps_per_second": 26.161, "step": 14345 }, { "epoch": 5.0, "step": 14345, "total_flos": 8.892843392498688e+18, "train_loss": 0.1924674165219266, "train_runtime": 1341.6959, "train_samples_per_second": 85.526, "train_steps_per_second": 10.692 } ], "logging_steps": 10, "max_steps": 14345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.892843392498688e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }