diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10125 @@ +{ + "best_metric": 0.039273809641599655, + "best_model_checkpoint": "./data/EuroSAT_output/checkpoint-14345", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 14345, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003485535029627048, + "grad_norm": 2.583953380584717, + "learning_rate": 1.9986057859881492e-05, + "loss": 2.3035, + "step": 10 + }, + { + "epoch": 0.006971070059254096, + "grad_norm": 3.04152774810791, + "learning_rate": 1.9972115719762987e-05, + "loss": 2.2087, + "step": 20 + }, + { + "epoch": 0.010456605088881143, + "grad_norm": 2.454313039779663, + "learning_rate": 1.9958173579644477e-05, + "loss": 2.179, + "step": 30 + }, + { + "epoch": 0.013942140118508192, + "grad_norm": 2.8994503021240234, + "learning_rate": 1.9944231439525968e-05, + "loss": 2.1153, + "step": 40 + }, + { + "epoch": 0.01742767514813524, + "grad_norm": 2.569394111633301, + "learning_rate": 1.9930289299407462e-05, + "loss": 2.0797, + "step": 50 + }, + { + "epoch": 0.020913210177762286, + "grad_norm": 2.860028028488159, + "learning_rate": 1.9916347159288953e-05, + "loss": 1.9448, + "step": 60 + }, + { + "epoch": 0.024398745207389334, + "grad_norm": 3.2729475498199463, + "learning_rate": 1.9902405019170444e-05, + "loss": 1.8354, + "step": 70 + }, + { + "epoch": 0.027884280237016383, + "grad_norm": 3.2359659671783447, + "learning_rate": 1.9888462879051935e-05, + "loss": 1.8398, + "step": 80 + }, + { + "epoch": 0.03136981526664343, + "grad_norm": 3.4146273136138916, + "learning_rate": 1.9874520738933426e-05, + "loss": 1.7559, + "step": 90 + }, + { + "epoch": 0.03485535029627048, + "grad_norm": 3.6460695266723633, + "learning_rate": 1.986057859881492e-05, + "loss": 1.7249, + "step": 100 + }, + { + "epoch": 0.03834088532589752, + "grad_norm": 3.8710196018218994, + "learning_rate": 1.984663645869641e-05, + "loss": 1.6526, + "step": 110 + }, + { + "epoch": 0.04182642035552457, + "grad_norm": 3.0699777603149414, + "learning_rate": 1.98326943185779e-05, + "loss": 1.4757, + "step": 120 + }, + { + "epoch": 0.04531195538515162, + "grad_norm": 3.8974356651306152, + "learning_rate": 1.9818752178459396e-05, + "loss": 1.5169, + "step": 130 + }, + { + "epoch": 0.04879749041477867, + "grad_norm": 4.513424873352051, + "learning_rate": 1.9804810038340887e-05, + "loss": 1.4487, + "step": 140 + }, + { + "epoch": 0.05228302544440572, + "grad_norm": 3.8483262062072754, + "learning_rate": 1.9790867898222377e-05, + "loss": 1.4174, + "step": 150 + }, + { + "epoch": 0.055768560474032766, + "grad_norm": 3.938114643096924, + "learning_rate": 1.977692575810387e-05, + "loss": 1.2989, + "step": 160 + }, + { + "epoch": 0.059254095503659815, + "grad_norm": 4.86665678024292, + "learning_rate": 1.9762983617985362e-05, + "loss": 1.3303, + "step": 170 + }, + { + "epoch": 0.06273963053328686, + "grad_norm": 2.8868749141693115, + "learning_rate": 1.9749041477866853e-05, + "loss": 1.2444, + "step": 180 + }, + { + "epoch": 0.06622516556291391, + "grad_norm": 2.838812828063965, + "learning_rate": 1.9735099337748347e-05, + "loss": 1.0791, + "step": 190 + }, + { + "epoch": 0.06971070059254096, + "grad_norm": 3.111001968383789, + "learning_rate": 1.9721157197629838e-05, + "loss": 1.1166, + "step": 200 + }, + { + "epoch": 0.073196235622168, + "grad_norm": 4.862301826477051, + "learning_rate": 1.970721505751133e-05, + "loss": 1.053, + "step": 210 + }, + { + "epoch": 0.07668177065179504, + "grad_norm": 3.7676494121551514, + "learning_rate": 1.969327291739282e-05, + "loss": 1.0223, + "step": 220 + }, + { + "epoch": 0.0801673056814221, + "grad_norm": 4.827447414398193, + "learning_rate": 1.9679330777274314e-05, + "loss": 1.0277, + "step": 230 + }, + { + "epoch": 0.08365284071104914, + "grad_norm": 2.027529716491699, + "learning_rate": 1.9665388637155805e-05, + "loss": 0.8187, + "step": 240 + }, + { + "epoch": 0.08713837574067619, + "grad_norm": 5.508512020111084, + "learning_rate": 1.9651446497037296e-05, + "loss": 0.9514, + "step": 250 + }, + { + "epoch": 0.09062391077030324, + "grad_norm": 4.481660842895508, + "learning_rate": 1.963750435691879e-05, + "loss": 0.9503, + "step": 260 + }, + { + "epoch": 0.09410944579993029, + "grad_norm": 2.511476993560791, + "learning_rate": 1.962356221680028e-05, + "loss": 0.9108, + "step": 270 + }, + { + "epoch": 0.09759498082955734, + "grad_norm": 5.287069320678711, + "learning_rate": 1.960962007668177e-05, + "loss": 0.8772, + "step": 280 + }, + { + "epoch": 0.10108051585918439, + "grad_norm": 3.6010959148406982, + "learning_rate": 1.9595677936563266e-05, + "loss": 0.793, + "step": 290 + }, + { + "epoch": 0.10456605088881143, + "grad_norm": 4.5751776695251465, + "learning_rate": 1.9581735796444757e-05, + "loss": 0.7934, + "step": 300 + }, + { + "epoch": 0.10805158591843848, + "grad_norm": 2.073023796081543, + "learning_rate": 1.9567793656326247e-05, + "loss": 0.7081, + "step": 310 + }, + { + "epoch": 0.11153712094806553, + "grad_norm": 5.257015228271484, + "learning_rate": 1.955385151620774e-05, + "loss": 0.7023, + "step": 320 + }, + { + "epoch": 0.11502265597769258, + "grad_norm": 5.037456035614014, + "learning_rate": 1.9539909376089232e-05, + "loss": 0.705, + "step": 330 + }, + { + "epoch": 0.11850819100731963, + "grad_norm": 2.995671510696411, + "learning_rate": 1.9525967235970723e-05, + "loss": 0.654, + "step": 340 + }, + { + "epoch": 0.12199372603694666, + "grad_norm": 4.856020450592041, + "learning_rate": 1.9512025095852214e-05, + "loss": 0.6384, + "step": 350 + }, + { + "epoch": 0.12547926106657373, + "grad_norm": 4.719180107116699, + "learning_rate": 1.9498082955733705e-05, + "loss": 0.6655, + "step": 360 + }, + { + "epoch": 0.12896479609620076, + "grad_norm": 7.83165979385376, + "learning_rate": 1.94841408156152e-05, + "loss": 0.5098, + "step": 370 + }, + { + "epoch": 0.13245033112582782, + "grad_norm": 3.3765158653259277, + "learning_rate": 1.947019867549669e-05, + "loss": 0.6758, + "step": 380 + }, + { + "epoch": 0.13593586615545486, + "grad_norm": 4.360907554626465, + "learning_rate": 1.945625653537818e-05, + "loss": 0.6017, + "step": 390 + }, + { + "epoch": 0.13942140118508192, + "grad_norm": 3.162569522857666, + "learning_rate": 1.9442314395259675e-05, + "loss": 0.6965, + "step": 400 + }, + { + "epoch": 0.14290693621470896, + "grad_norm": 5.929446697235107, + "learning_rate": 1.9428372255141166e-05, + "loss": 0.6226, + "step": 410 + }, + { + "epoch": 0.146392471244336, + "grad_norm": 2.6196141242980957, + "learning_rate": 1.9414430115022657e-05, + "loss": 0.5276, + "step": 420 + }, + { + "epoch": 0.14987800627396305, + "grad_norm": 4.234184265136719, + "learning_rate": 1.940048797490415e-05, + "loss": 0.7085, + "step": 430 + }, + { + "epoch": 0.1533635413035901, + "grad_norm": 3.8775856494903564, + "learning_rate": 1.938654583478564e-05, + "loss": 0.7142, + "step": 440 + }, + { + "epoch": 0.15684907633321715, + "grad_norm": 2.744117498397827, + "learning_rate": 1.9372603694667132e-05, + "loss": 0.606, + "step": 450 + }, + { + "epoch": 0.1603346113628442, + "grad_norm": 4.7066969871521, + "learning_rate": 1.9358661554548627e-05, + "loss": 0.5027, + "step": 460 + }, + { + "epoch": 0.16382014639247125, + "grad_norm": 6.463034152984619, + "learning_rate": 1.9344719414430117e-05, + "loss": 0.5223, + "step": 470 + }, + { + "epoch": 0.16730568142209828, + "grad_norm": 9.271512985229492, + "learning_rate": 1.9330777274311608e-05, + "loss": 0.6037, + "step": 480 + }, + { + "epoch": 0.17079121645172535, + "grad_norm": 14.535713195800781, + "learning_rate": 1.93168351341931e-05, + "loss": 0.5479, + "step": 490 + }, + { + "epoch": 0.17427675148135238, + "grad_norm": 8.693368911743164, + "learning_rate": 1.930289299407459e-05, + "loss": 0.6084, + "step": 500 + }, + { + "epoch": 0.17776228651097944, + "grad_norm": 5.175466537475586, + "learning_rate": 1.9288950853956084e-05, + "loss": 0.6587, + "step": 510 + }, + { + "epoch": 0.18124782154060648, + "grad_norm": 2.11287522315979, + "learning_rate": 1.9275008713837575e-05, + "loss": 0.49, + "step": 520 + }, + { + "epoch": 0.18473335657023354, + "grad_norm": 2.2307288646698, + "learning_rate": 1.9261066573719066e-05, + "loss": 0.6132, + "step": 530 + }, + { + "epoch": 0.18821889159986058, + "grad_norm": 11.079753875732422, + "learning_rate": 1.924712443360056e-05, + "loss": 0.572, + "step": 540 + }, + { + "epoch": 0.19170442662948764, + "grad_norm": 2.487985610961914, + "learning_rate": 1.923318229348205e-05, + "loss": 0.3897, + "step": 550 + }, + { + "epoch": 0.19518996165911467, + "grad_norm": 3.330732822418213, + "learning_rate": 1.921924015336354e-05, + "loss": 0.3884, + "step": 560 + }, + { + "epoch": 0.1986754966887417, + "grad_norm": 9.687525749206543, + "learning_rate": 1.9205298013245036e-05, + "loss": 0.4155, + "step": 570 + }, + { + "epoch": 0.20216103171836877, + "grad_norm": 5.554359436035156, + "learning_rate": 1.9191355873126526e-05, + "loss": 0.5435, + "step": 580 + }, + { + "epoch": 0.2056465667479958, + "grad_norm": 2.3810067176818848, + "learning_rate": 1.917741373300802e-05, + "loss": 0.4487, + "step": 590 + }, + { + "epoch": 0.20913210177762287, + "grad_norm": 2.6795718669891357, + "learning_rate": 1.916347159288951e-05, + "loss": 0.5523, + "step": 600 + }, + { + "epoch": 0.2126176368072499, + "grad_norm": 11.402462005615234, + "learning_rate": 1.9149529452771002e-05, + "loss": 0.6798, + "step": 610 + }, + { + "epoch": 0.21610317183687697, + "grad_norm": 4.038872241973877, + "learning_rate": 1.9135587312652493e-05, + "loss": 0.3321, + "step": 620 + }, + { + "epoch": 0.219588706866504, + "grad_norm": 0.8252215385437012, + "learning_rate": 1.9121645172533984e-05, + "loss": 0.4267, + "step": 630 + }, + { + "epoch": 0.22307424189613106, + "grad_norm": 3.365762233734131, + "learning_rate": 1.9107703032415478e-05, + "loss": 0.5648, + "step": 640 + }, + { + "epoch": 0.2265597769257581, + "grad_norm": 3.1780312061309814, + "learning_rate": 1.909376089229697e-05, + "loss": 0.4703, + "step": 650 + }, + { + "epoch": 0.23004531195538516, + "grad_norm": 1.3519302606582642, + "learning_rate": 1.907981875217846e-05, + "loss": 0.4887, + "step": 660 + }, + { + "epoch": 0.2335308469850122, + "grad_norm": 2.861978054046631, + "learning_rate": 1.9065876612059954e-05, + "loss": 0.5227, + "step": 670 + }, + { + "epoch": 0.23701638201463926, + "grad_norm": 9.564464569091797, + "learning_rate": 1.9051934471941445e-05, + "loss": 0.3791, + "step": 680 + }, + { + "epoch": 0.2405019170442663, + "grad_norm": 6.82305383682251, + "learning_rate": 1.9037992331822936e-05, + "loss": 0.5024, + "step": 690 + }, + { + "epoch": 0.24398745207389333, + "grad_norm": 0.8181214332580566, + "learning_rate": 1.902405019170443e-05, + "loss": 0.4091, + "step": 700 + }, + { + "epoch": 0.2474729871035204, + "grad_norm": 11.525835990905762, + "learning_rate": 1.901010805158592e-05, + "loss": 0.3477, + "step": 710 + }, + { + "epoch": 0.25095852213314745, + "grad_norm": 2.1437933444976807, + "learning_rate": 1.899616591146741e-05, + "loss": 0.3997, + "step": 720 + }, + { + "epoch": 0.25444405716277446, + "grad_norm": 14.871402740478516, + "learning_rate": 1.8982223771348906e-05, + "loss": 0.4577, + "step": 730 + }, + { + "epoch": 0.2579295921924015, + "grad_norm": 1.370771050453186, + "learning_rate": 1.8968281631230396e-05, + "loss": 0.2808, + "step": 740 + }, + { + "epoch": 0.2614151272220286, + "grad_norm": 14.198845863342285, + "learning_rate": 1.8954339491111887e-05, + "loss": 0.4222, + "step": 750 + }, + { + "epoch": 0.26490066225165565, + "grad_norm": 0.7197427749633789, + "learning_rate": 1.8940397350993378e-05, + "loss": 0.3704, + "step": 760 + }, + { + "epoch": 0.26838619728128266, + "grad_norm": 8.696059226989746, + "learning_rate": 1.892645521087487e-05, + "loss": 0.3291, + "step": 770 + }, + { + "epoch": 0.2718717323109097, + "grad_norm": 0.5741353034973145, + "learning_rate": 1.8912513070756363e-05, + "loss": 0.2361, + "step": 780 + }, + { + "epoch": 0.2753572673405368, + "grad_norm": 3.235818386077881, + "learning_rate": 1.8898570930637854e-05, + "loss": 0.4824, + "step": 790 + }, + { + "epoch": 0.27884280237016384, + "grad_norm": 0.8503827452659607, + "learning_rate": 1.8884628790519345e-05, + "loss": 0.3376, + "step": 800 + }, + { + "epoch": 0.28232833739979085, + "grad_norm": 1.6630173921585083, + "learning_rate": 1.887068665040084e-05, + "loss": 0.3883, + "step": 810 + }, + { + "epoch": 0.2858138724294179, + "grad_norm": 10.400406837463379, + "learning_rate": 1.885674451028233e-05, + "loss": 0.4094, + "step": 820 + }, + { + "epoch": 0.289299407459045, + "grad_norm": 1.3901340961456299, + "learning_rate": 1.884280237016382e-05, + "loss": 0.5164, + "step": 830 + }, + { + "epoch": 0.292784942488672, + "grad_norm": 5.530642509460449, + "learning_rate": 1.8828860230045315e-05, + "loss": 0.3125, + "step": 840 + }, + { + "epoch": 0.29627047751829905, + "grad_norm": 6.685891151428223, + "learning_rate": 1.8814918089926806e-05, + "loss": 0.3943, + "step": 850 + }, + { + "epoch": 0.2997560125479261, + "grad_norm": 15.618851661682129, + "learning_rate": 1.8800975949808296e-05, + "loss": 0.3619, + "step": 860 + }, + { + "epoch": 0.30324154757755317, + "grad_norm": 9.351723670959473, + "learning_rate": 1.878703380968979e-05, + "loss": 0.4237, + "step": 870 + }, + { + "epoch": 0.3067270826071802, + "grad_norm": 3.25188946723938, + "learning_rate": 1.877309166957128e-05, + "loss": 0.3106, + "step": 880 + }, + { + "epoch": 0.31021261763680724, + "grad_norm": 3.0205211639404297, + "learning_rate": 1.8759149529452772e-05, + "loss": 0.3932, + "step": 890 + }, + { + "epoch": 0.3136981526664343, + "grad_norm": 0.8111696243286133, + "learning_rate": 1.8745207389334263e-05, + "loss": 0.3497, + "step": 900 + }, + { + "epoch": 0.31718368769606137, + "grad_norm": 1.2595362663269043, + "learning_rate": 1.8731265249215754e-05, + "loss": 0.4002, + "step": 910 + }, + { + "epoch": 0.3206692227256884, + "grad_norm": 0.6795431971549988, + "learning_rate": 1.8717323109097248e-05, + "loss": 0.3498, + "step": 920 + }, + { + "epoch": 0.32415475775531544, + "grad_norm": 1.0169798135757446, + "learning_rate": 1.870338096897874e-05, + "loss": 0.2839, + "step": 930 + }, + { + "epoch": 0.3276402927849425, + "grad_norm": 9.539275169372559, + "learning_rate": 1.868943882886023e-05, + "loss": 0.3325, + "step": 940 + }, + { + "epoch": 0.33112582781456956, + "grad_norm": 8.384815216064453, + "learning_rate": 1.8675496688741724e-05, + "loss": 0.3253, + "step": 950 + }, + { + "epoch": 0.33461136284419657, + "grad_norm": 4.2581281661987305, + "learning_rate": 1.8661554548623215e-05, + "loss": 0.3612, + "step": 960 + }, + { + "epoch": 0.33809689787382363, + "grad_norm": 1.6423273086547852, + "learning_rate": 1.8647612408504706e-05, + "loss": 0.3876, + "step": 970 + }, + { + "epoch": 0.3415824329034507, + "grad_norm": 3.153898000717163, + "learning_rate": 1.86336702683862e-05, + "loss": 0.4319, + "step": 980 + }, + { + "epoch": 0.3450679679330777, + "grad_norm": 0.48236197233200073, + "learning_rate": 1.861972812826769e-05, + "loss": 0.1993, + "step": 990 + }, + { + "epoch": 0.34855350296270476, + "grad_norm": 0.4741288423538208, + "learning_rate": 1.8605785988149185e-05, + "loss": 0.3256, + "step": 1000 + }, + { + "epoch": 0.3520390379923318, + "grad_norm": 10.781346321105957, + "learning_rate": 1.8591843848030676e-05, + "loss": 0.4149, + "step": 1010 + }, + { + "epoch": 0.3555245730219589, + "grad_norm": 0.4621415436267853, + "learning_rate": 1.8577901707912166e-05, + "loss": 0.3206, + "step": 1020 + }, + { + "epoch": 0.3590101080515859, + "grad_norm": 6.963232040405273, + "learning_rate": 1.8563959567793657e-05, + "loss": 0.3789, + "step": 1030 + }, + { + "epoch": 0.36249564308121296, + "grad_norm": 6.472696781158447, + "learning_rate": 1.8550017427675148e-05, + "loss": 0.2999, + "step": 1040 + }, + { + "epoch": 0.36598117811084, + "grad_norm": 8.689462661743164, + "learning_rate": 1.8536075287556642e-05, + "loss": 0.2934, + "step": 1050 + }, + { + "epoch": 0.3694667131404671, + "grad_norm": 0.33935561776161194, + "learning_rate": 1.8522133147438133e-05, + "loss": 0.2774, + "step": 1060 + }, + { + "epoch": 0.3729522481700941, + "grad_norm": 10.953472137451172, + "learning_rate": 1.8508191007319624e-05, + "loss": 0.3193, + "step": 1070 + }, + { + "epoch": 0.37643778319972115, + "grad_norm": 1.503441333770752, + "learning_rate": 1.8494248867201118e-05, + "loss": 0.3257, + "step": 1080 + }, + { + "epoch": 0.3799233182293482, + "grad_norm": 0.4625272750854492, + "learning_rate": 1.848030672708261e-05, + "loss": 0.2634, + "step": 1090 + }, + { + "epoch": 0.3834088532589753, + "grad_norm": 11.566400527954102, + "learning_rate": 1.84663645869641e-05, + "loss": 0.2718, + "step": 1100 + }, + { + "epoch": 0.3868943882886023, + "grad_norm": 12.47866439819336, + "learning_rate": 1.8452422446845594e-05, + "loss": 0.3186, + "step": 1110 + }, + { + "epoch": 0.39037992331822935, + "grad_norm": 8.613419532775879, + "learning_rate": 1.8438480306727085e-05, + "loss": 0.3122, + "step": 1120 + }, + { + "epoch": 0.3938654583478564, + "grad_norm": 0.35111886262893677, + "learning_rate": 1.8424538166608576e-05, + "loss": 0.2495, + "step": 1130 + }, + { + "epoch": 0.3973509933774834, + "grad_norm": 0.34094560146331787, + "learning_rate": 1.841059602649007e-05, + "loss": 0.2337, + "step": 1140 + }, + { + "epoch": 0.4008365284071105, + "grad_norm": 4.428099155426025, + "learning_rate": 1.839665388637156e-05, + "loss": 0.1903, + "step": 1150 + }, + { + "epoch": 0.40432206343673754, + "grad_norm": 0.3854966461658478, + "learning_rate": 1.838271174625305e-05, + "loss": 0.3, + "step": 1160 + }, + { + "epoch": 0.4078075984663646, + "grad_norm": 38.84294509887695, + "learning_rate": 1.8368769606134542e-05, + "loss": 0.2134, + "step": 1170 + }, + { + "epoch": 0.4112931334959916, + "grad_norm": 1.8577314615249634, + "learning_rate": 1.8354827466016033e-05, + "loss": 0.2276, + "step": 1180 + }, + { + "epoch": 0.4147786685256187, + "grad_norm": 0.9145134091377258, + "learning_rate": 1.8340885325897527e-05, + "loss": 0.2989, + "step": 1190 + }, + { + "epoch": 0.41826420355524574, + "grad_norm": 13.17152214050293, + "learning_rate": 1.8326943185779018e-05, + "loss": 0.2938, + "step": 1200 + }, + { + "epoch": 0.4217497385848728, + "grad_norm": 0.34265998005867004, + "learning_rate": 1.831300104566051e-05, + "loss": 0.2257, + "step": 1210 + }, + { + "epoch": 0.4252352736144998, + "grad_norm": 10.514459609985352, + "learning_rate": 1.8299058905542003e-05, + "loss": 0.382, + "step": 1220 + }, + { + "epoch": 0.42872080864412687, + "grad_norm": 16.24938201904297, + "learning_rate": 1.8285116765423494e-05, + "loss": 0.3638, + "step": 1230 + }, + { + "epoch": 0.43220634367375393, + "grad_norm": 2.323594331741333, + "learning_rate": 1.8271174625304985e-05, + "loss": 0.3058, + "step": 1240 + }, + { + "epoch": 0.43569187870338094, + "grad_norm": 0.8403987288475037, + "learning_rate": 1.825723248518648e-05, + "loss": 0.1748, + "step": 1250 + }, + { + "epoch": 0.439177413733008, + "grad_norm": 0.3924444913864136, + "learning_rate": 1.824329034506797e-05, + "loss": 0.2051, + "step": 1260 + }, + { + "epoch": 0.44266294876263507, + "grad_norm": 1.6590919494628906, + "learning_rate": 1.822934820494946e-05, + "loss": 0.1713, + "step": 1270 + }, + { + "epoch": 0.44614848379226213, + "grad_norm": 10.351805686950684, + "learning_rate": 1.8215406064830955e-05, + "loss": 0.2202, + "step": 1280 + }, + { + "epoch": 0.44963401882188914, + "grad_norm": 13.177727699279785, + "learning_rate": 1.8201463924712445e-05, + "loss": 0.2005, + "step": 1290 + }, + { + "epoch": 0.4531195538515162, + "grad_norm": 3.9433481693267822, + "learning_rate": 1.8187521784593936e-05, + "loss": 0.2948, + "step": 1300 + }, + { + "epoch": 0.45660508888114326, + "grad_norm": 0.3618289530277252, + "learning_rate": 1.8173579644475427e-05, + "loss": 0.148, + "step": 1310 + }, + { + "epoch": 0.4600906239107703, + "grad_norm": 10.462213516235352, + "learning_rate": 1.8159637504356918e-05, + "loss": 0.2468, + "step": 1320 + }, + { + "epoch": 0.46357615894039733, + "grad_norm": 0.270712286233902, + "learning_rate": 1.8145695364238412e-05, + "loss": 0.2283, + "step": 1330 + }, + { + "epoch": 0.4670616939700244, + "grad_norm": 0.30840155482292175, + "learning_rate": 1.8131753224119903e-05, + "loss": 0.1058, + "step": 1340 + }, + { + "epoch": 0.47054722899965146, + "grad_norm": 7.213718414306641, + "learning_rate": 1.8117811084001394e-05, + "loss": 0.2548, + "step": 1350 + }, + { + "epoch": 0.4740327640292785, + "grad_norm": 4.371262550354004, + "learning_rate": 1.8103868943882888e-05, + "loss": 0.2346, + "step": 1360 + }, + { + "epoch": 0.4775182990589055, + "grad_norm": 5.258789539337158, + "learning_rate": 1.808992680376438e-05, + "loss": 0.3612, + "step": 1370 + }, + { + "epoch": 0.4810038340885326, + "grad_norm": 0.29574891924858093, + "learning_rate": 1.8075984663645873e-05, + "loss": 0.2976, + "step": 1380 + }, + { + "epoch": 0.48448936911815965, + "grad_norm": 0.7350137233734131, + "learning_rate": 1.8062042523527364e-05, + "loss": 0.1397, + "step": 1390 + }, + { + "epoch": 0.48797490414778666, + "grad_norm": 0.22566929459571838, + "learning_rate": 1.8048100383408855e-05, + "loss": 0.2482, + "step": 1400 + }, + { + "epoch": 0.4914604391774137, + "grad_norm": 4.66607141494751, + "learning_rate": 1.803415824329035e-05, + "loss": 0.2287, + "step": 1410 + }, + { + "epoch": 0.4949459742070408, + "grad_norm": 1.6220591068267822, + "learning_rate": 1.802021610317184e-05, + "loss": 0.2499, + "step": 1420 + }, + { + "epoch": 0.49843150923666785, + "grad_norm": 9.882462501525879, + "learning_rate": 1.800627396305333e-05, + "loss": 0.1941, + "step": 1430 + }, + { + "epoch": 0.5019170442662949, + "grad_norm": 9.781709671020508, + "learning_rate": 1.799233182293482e-05, + "loss": 0.2008, + "step": 1440 + }, + { + "epoch": 0.5054025792959219, + "grad_norm": 20.839242935180664, + "learning_rate": 1.7978389682816312e-05, + "loss": 0.4964, + "step": 1450 + }, + { + "epoch": 0.5088881143255489, + "grad_norm": 14.371743202209473, + "learning_rate": 1.7964447542697806e-05, + "loss": 0.3222, + "step": 1460 + }, + { + "epoch": 0.512373649355176, + "grad_norm": 0.32212069630622864, + "learning_rate": 1.7950505402579297e-05, + "loss": 0.2794, + "step": 1470 + }, + { + "epoch": 0.515859184384803, + "grad_norm": 0.4930780827999115, + "learning_rate": 1.7936563262460788e-05, + "loss": 0.0921, + "step": 1480 + }, + { + "epoch": 0.5193447194144302, + "grad_norm": 2.0369691848754883, + "learning_rate": 1.7922621122342282e-05, + "loss": 0.2109, + "step": 1490 + }, + { + "epoch": 0.5228302544440572, + "grad_norm": 1.3328112363815308, + "learning_rate": 1.7908678982223773e-05, + "loss": 0.2685, + "step": 1500 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.5157321691513062, + "learning_rate": 1.7894736842105264e-05, + "loss": 0.343, + "step": 1510 + }, + { + "epoch": 0.5298013245033113, + "grad_norm": 24.59377670288086, + "learning_rate": 1.7880794701986758e-05, + "loss": 0.257, + "step": 1520 + }, + { + "epoch": 0.5332868595329383, + "grad_norm": 0.25167328119277954, + "learning_rate": 1.786685256186825e-05, + "loss": 0.2256, + "step": 1530 + }, + { + "epoch": 0.5367723945625653, + "grad_norm": 0.31769871711730957, + "learning_rate": 1.785291042174974e-05, + "loss": 0.2461, + "step": 1540 + }, + { + "epoch": 0.5402579295921924, + "grad_norm": 4.568899631500244, + "learning_rate": 1.7838968281631234e-05, + "loss": 0.1961, + "step": 1550 + }, + { + "epoch": 0.5437434646218194, + "grad_norm": 3.1680665016174316, + "learning_rate": 1.7825026141512725e-05, + "loss": 0.2128, + "step": 1560 + }, + { + "epoch": 0.5472289996514464, + "grad_norm": 4.516887187957764, + "learning_rate": 1.7811084001394215e-05, + "loss": 0.2493, + "step": 1570 + }, + { + "epoch": 0.5507145346810736, + "grad_norm": 9.322068214416504, + "learning_rate": 1.7797141861275706e-05, + "loss": 0.1234, + "step": 1580 + }, + { + "epoch": 0.5542000697107006, + "grad_norm": 0.5057626366615295, + "learning_rate": 1.7783199721157197e-05, + "loss": 0.1955, + "step": 1590 + }, + { + "epoch": 0.5576856047403277, + "grad_norm": 3.936608076095581, + "learning_rate": 1.776925758103869e-05, + "loss": 0.4223, + "step": 1600 + }, + { + "epoch": 0.5611711397699547, + "grad_norm": 1.4608641862869263, + "learning_rate": 1.7755315440920182e-05, + "loss": 0.2389, + "step": 1610 + }, + { + "epoch": 0.5646566747995817, + "grad_norm": 0.2644886076450348, + "learning_rate": 1.7741373300801673e-05, + "loss": 0.2196, + "step": 1620 + }, + { + "epoch": 0.5681422098292088, + "grad_norm": 8.523164749145508, + "learning_rate": 1.7727431160683167e-05, + "loss": 0.4851, + "step": 1630 + }, + { + "epoch": 0.5716277448588358, + "grad_norm": 0.24561679363250732, + "learning_rate": 1.7713489020564658e-05, + "loss": 0.1291, + "step": 1640 + }, + { + "epoch": 0.5751132798884628, + "grad_norm": 3.906648874282837, + "learning_rate": 1.769954688044615e-05, + "loss": 0.1317, + "step": 1650 + }, + { + "epoch": 0.57859881491809, + "grad_norm": 12.78685474395752, + "learning_rate": 1.7685604740327643e-05, + "loss": 0.2986, + "step": 1660 + }, + { + "epoch": 0.582084349947717, + "grad_norm": 0.1987033486366272, + "learning_rate": 1.7671662600209134e-05, + "loss": 0.165, + "step": 1670 + }, + { + "epoch": 0.585569884977344, + "grad_norm": 29.639982223510742, + "learning_rate": 1.7657720460090625e-05, + "loss": 0.1937, + "step": 1680 + }, + { + "epoch": 0.5890554200069711, + "grad_norm": 3.289419174194336, + "learning_rate": 1.764377831997212e-05, + "loss": 0.0802, + "step": 1690 + }, + { + "epoch": 0.5925409550365981, + "grad_norm": 0.2210395336151123, + "learning_rate": 1.762983617985361e-05, + "loss": 0.2471, + "step": 1700 + }, + { + "epoch": 0.5960264900662252, + "grad_norm": 23.282852172851562, + "learning_rate": 1.76158940397351e-05, + "loss": 0.2112, + "step": 1710 + }, + { + "epoch": 0.5995120250958522, + "grad_norm": 0.22058001160621643, + "learning_rate": 1.760195189961659e-05, + "loss": 0.0758, + "step": 1720 + }, + { + "epoch": 0.6029975601254792, + "grad_norm": 7.556725025177002, + "learning_rate": 1.7588009759498082e-05, + "loss": 0.1733, + "step": 1730 + }, + { + "epoch": 0.6064830951551063, + "grad_norm": 13.092087745666504, + "learning_rate": 1.7574067619379576e-05, + "loss": 0.118, + "step": 1740 + }, + { + "epoch": 0.6099686301847334, + "grad_norm": 2.1389198303222656, + "learning_rate": 1.7560125479261067e-05, + "loss": 0.1971, + "step": 1750 + }, + { + "epoch": 0.6134541652143604, + "grad_norm": 2.126339912414551, + "learning_rate": 1.754618333914256e-05, + "loss": 0.3391, + "step": 1760 + }, + { + "epoch": 0.6169397002439875, + "grad_norm": 0.7699007987976074, + "learning_rate": 1.7532241199024052e-05, + "loss": 0.299, + "step": 1770 + }, + { + "epoch": 0.6204252352736145, + "grad_norm": 0.1900128424167633, + "learning_rate": 1.7518299058905543e-05, + "loss": 0.1021, + "step": 1780 + }, + { + "epoch": 0.6239107703032416, + "grad_norm": 20.255779266357422, + "learning_rate": 1.7504356918787037e-05, + "loss": 0.158, + "step": 1790 + }, + { + "epoch": 0.6273963053328686, + "grad_norm": 0.18125827610492706, + "learning_rate": 1.7490414778668528e-05, + "loss": 0.2431, + "step": 1800 + }, + { + "epoch": 0.6308818403624956, + "grad_norm": 0.5560604929924011, + "learning_rate": 1.747647263855002e-05, + "loss": 0.281, + "step": 1810 + }, + { + "epoch": 0.6343673753921227, + "grad_norm": 0.6502192616462708, + "learning_rate": 1.7462530498431513e-05, + "loss": 0.1709, + "step": 1820 + }, + { + "epoch": 0.6378529104217497, + "grad_norm": 3.676496982574463, + "learning_rate": 1.7448588358313004e-05, + "loss": 0.3918, + "step": 1830 + }, + { + "epoch": 0.6413384454513767, + "grad_norm": 8.176740646362305, + "learning_rate": 1.7434646218194494e-05, + "loss": 0.1222, + "step": 1840 + }, + { + "epoch": 0.6448239804810039, + "grad_norm": 1.6892282962799072, + "learning_rate": 1.7420704078075985e-05, + "loss": 0.3701, + "step": 1850 + }, + { + "epoch": 0.6483095155106309, + "grad_norm": 3.25469708442688, + "learning_rate": 1.7406761937957476e-05, + "loss": 0.2383, + "step": 1860 + }, + { + "epoch": 0.6517950505402579, + "grad_norm": 1.433481216430664, + "learning_rate": 1.739281979783897e-05, + "loss": 0.1936, + "step": 1870 + }, + { + "epoch": 0.655280585569885, + "grad_norm": 0.20236587524414062, + "learning_rate": 1.737887765772046e-05, + "loss": 0.4097, + "step": 1880 + }, + { + "epoch": 0.658766120599512, + "grad_norm": 3.3700122833251953, + "learning_rate": 1.7364935517601952e-05, + "loss": 0.2351, + "step": 1890 + }, + { + "epoch": 0.6622516556291391, + "grad_norm": 10.565374374389648, + "learning_rate": 1.7350993377483446e-05, + "loss": 0.3335, + "step": 1900 + }, + { + "epoch": 0.6657371906587661, + "grad_norm": 0.19991923868656158, + "learning_rate": 1.7337051237364937e-05, + "loss": 0.4573, + "step": 1910 + }, + { + "epoch": 0.6692227256883931, + "grad_norm": 2.1597394943237305, + "learning_rate": 1.7323109097246428e-05, + "loss": 0.1396, + "step": 1920 + }, + { + "epoch": 0.6727082607180203, + "grad_norm": 23.80716896057129, + "learning_rate": 1.7309166957127922e-05, + "loss": 0.2434, + "step": 1930 + }, + { + "epoch": 0.6761937957476473, + "grad_norm": 0.18005388975143433, + "learning_rate": 1.7295224817009413e-05, + "loss": 0.2398, + "step": 1940 + }, + { + "epoch": 0.6796793307772743, + "grad_norm": 0.24591070413589478, + "learning_rate": 1.7281282676890904e-05, + "loss": 0.1673, + "step": 1950 + }, + { + "epoch": 0.6831648658069014, + "grad_norm": 0.35577160120010376, + "learning_rate": 1.7267340536772398e-05, + "loss": 0.1134, + "step": 1960 + }, + { + "epoch": 0.6866504008365284, + "grad_norm": 0.14578354358673096, + "learning_rate": 1.725339839665389e-05, + "loss": 0.1394, + "step": 1970 + }, + { + "epoch": 0.6901359358661554, + "grad_norm": 0.15032856166362762, + "learning_rate": 1.723945625653538e-05, + "loss": 0.2795, + "step": 1980 + }, + { + "epoch": 0.6936214708957825, + "grad_norm": 0.15948134660720825, + "learning_rate": 1.722551411641687e-05, + "loss": 0.1617, + "step": 1990 + }, + { + "epoch": 0.6971070059254095, + "grad_norm": 1.1098570823669434, + "learning_rate": 1.721157197629836e-05, + "loss": 0.1358, + "step": 2000 + }, + { + "epoch": 0.7005925409550366, + "grad_norm": 14.691607475280762, + "learning_rate": 1.7197629836179855e-05, + "loss": 0.2749, + "step": 2010 + }, + { + "epoch": 0.7040780759846637, + "grad_norm": 9.642597198486328, + "learning_rate": 1.7183687696061346e-05, + "loss": 0.2413, + "step": 2020 + }, + { + "epoch": 0.7075636110142907, + "grad_norm": 0.25982531905174255, + "learning_rate": 1.7169745555942837e-05, + "loss": 0.2381, + "step": 2030 + }, + { + "epoch": 0.7110491460439178, + "grad_norm": 11.684551239013672, + "learning_rate": 1.715580341582433e-05, + "loss": 0.2735, + "step": 2040 + }, + { + "epoch": 0.7145346810735448, + "grad_norm": 13.072251319885254, + "learning_rate": 1.7141861275705822e-05, + "loss": 0.1853, + "step": 2050 + }, + { + "epoch": 0.7180202161031718, + "grad_norm": 4.4599504470825195, + "learning_rate": 1.7127919135587313e-05, + "loss": 0.0791, + "step": 2060 + }, + { + "epoch": 0.7215057511327989, + "grad_norm": 11.390928268432617, + "learning_rate": 1.7113976995468807e-05, + "loss": 0.363, + "step": 2070 + }, + { + "epoch": 0.7249912861624259, + "grad_norm": 24.80426788330078, + "learning_rate": 1.7100034855350298e-05, + "loss": 0.1271, + "step": 2080 + }, + { + "epoch": 0.7284768211920529, + "grad_norm": 4.6373724937438965, + "learning_rate": 1.708609271523179e-05, + "loss": 0.2046, + "step": 2090 + }, + { + "epoch": 0.73196235622168, + "grad_norm": 18.688011169433594, + "learning_rate": 1.7072150575113283e-05, + "loss": 0.3764, + "step": 2100 + }, + { + "epoch": 0.735447891251307, + "grad_norm": 0.1868332177400589, + "learning_rate": 1.7058208434994774e-05, + "loss": 0.2338, + "step": 2110 + }, + { + "epoch": 0.7389334262809342, + "grad_norm": 0.9303812980651855, + "learning_rate": 1.7044266294876264e-05, + "loss": 0.1955, + "step": 2120 + }, + { + "epoch": 0.7424189613105612, + "grad_norm": 20.024219512939453, + "learning_rate": 1.7030324154757755e-05, + "loss": 0.2066, + "step": 2130 + }, + { + "epoch": 0.7459044963401882, + "grad_norm": 0.41837865114212036, + "learning_rate": 1.701638201463925e-05, + "loss": 0.43, + "step": 2140 + }, + { + "epoch": 0.7493900313698153, + "grad_norm": 0.2002245932817459, + "learning_rate": 1.700243987452074e-05, + "loss": 0.2395, + "step": 2150 + }, + { + "epoch": 0.7528755663994423, + "grad_norm": 16.88410186767578, + "learning_rate": 1.698849773440223e-05, + "loss": 0.3061, + "step": 2160 + }, + { + "epoch": 0.7563611014290693, + "grad_norm": 13.094812393188477, + "learning_rate": 1.6974555594283725e-05, + "loss": 0.2037, + "step": 2170 + }, + { + "epoch": 0.7598466364586964, + "grad_norm": 0.489964097738266, + "learning_rate": 1.6960613454165216e-05, + "loss": 0.1176, + "step": 2180 + }, + { + "epoch": 0.7633321714883234, + "grad_norm": 0.36262962222099304, + "learning_rate": 1.6946671314046707e-05, + "loss": 0.1996, + "step": 2190 + }, + { + "epoch": 0.7668177065179506, + "grad_norm": 13.340808868408203, + "learning_rate": 1.69327291739282e-05, + "loss": 0.3046, + "step": 2200 + }, + { + "epoch": 0.7703032415475776, + "grad_norm": 4.607888221740723, + "learning_rate": 1.6918787033809692e-05, + "loss": 0.2871, + "step": 2210 + }, + { + "epoch": 0.7737887765772046, + "grad_norm": 0.15974846482276917, + "learning_rate": 1.6904844893691183e-05, + "loss": 0.2577, + "step": 2220 + }, + { + "epoch": 0.7772743116068317, + "grad_norm": 0.5607108473777771, + "learning_rate": 1.6890902753572677e-05, + "loss": 0.1506, + "step": 2230 + }, + { + "epoch": 0.7807598466364587, + "grad_norm": 0.22983232140541077, + "learning_rate": 1.6876960613454168e-05, + "loss": 0.0976, + "step": 2240 + }, + { + "epoch": 0.7842453816660857, + "grad_norm": 0.15347087383270264, + "learning_rate": 1.686301847333566e-05, + "loss": 0.1101, + "step": 2250 + }, + { + "epoch": 0.7877309166957128, + "grad_norm": 0.15394413471221924, + "learning_rate": 1.684907633321715e-05, + "loss": 0.2706, + "step": 2260 + }, + { + "epoch": 0.7912164517253398, + "grad_norm": 16.60938262939453, + "learning_rate": 1.683513419309864e-05, + "loss": 0.3779, + "step": 2270 + }, + { + "epoch": 0.7947019867549668, + "grad_norm": 19.347156524658203, + "learning_rate": 1.6821192052980134e-05, + "loss": 0.2697, + "step": 2280 + }, + { + "epoch": 0.798187521784594, + "grad_norm": 18.826566696166992, + "learning_rate": 1.6807249912861625e-05, + "loss": 0.1772, + "step": 2290 + }, + { + "epoch": 0.801673056814221, + "grad_norm": 6.404747009277344, + "learning_rate": 1.6793307772743116e-05, + "loss": 0.1967, + "step": 2300 + }, + { + "epoch": 0.8051585918438481, + "grad_norm": 2.8832345008850098, + "learning_rate": 1.677936563262461e-05, + "loss": 0.2426, + "step": 2310 + }, + { + "epoch": 0.8086441268734751, + "grad_norm": 8.400115966796875, + "learning_rate": 1.67654234925061e-05, + "loss": 0.1401, + "step": 2320 + }, + { + "epoch": 0.8121296619031021, + "grad_norm": 3.5098154544830322, + "learning_rate": 1.6751481352387592e-05, + "loss": 0.1521, + "step": 2330 + }, + { + "epoch": 0.8156151969327292, + "grad_norm": 7.560535907745361, + "learning_rate": 1.6737539212269086e-05, + "loss": 0.2052, + "step": 2340 + }, + { + "epoch": 0.8191007319623562, + "grad_norm": 20.31093978881836, + "learning_rate": 1.6723597072150577e-05, + "loss": 0.2422, + "step": 2350 + }, + { + "epoch": 0.8225862669919832, + "grad_norm": 0.14886406064033508, + "learning_rate": 1.6709654932032068e-05, + "loss": 0.3423, + "step": 2360 + }, + { + "epoch": 0.8260718020216103, + "grad_norm": 0.16942627727985382, + "learning_rate": 1.6695712791913562e-05, + "loss": 0.1422, + "step": 2370 + }, + { + "epoch": 0.8295573370512374, + "grad_norm": 2.075434446334839, + "learning_rate": 1.6681770651795053e-05, + "loss": 0.2055, + "step": 2380 + }, + { + "epoch": 0.8330428720808644, + "grad_norm": 12.085251808166504, + "learning_rate": 1.6667828511676544e-05, + "loss": 0.1778, + "step": 2390 + }, + { + "epoch": 0.8365284071104915, + "grad_norm": 9.108433723449707, + "learning_rate": 1.6653886371558034e-05, + "loss": 0.2636, + "step": 2400 + }, + { + "epoch": 0.8400139421401185, + "grad_norm": 0.22255125641822815, + "learning_rate": 1.6639944231439525e-05, + "loss": 0.1163, + "step": 2410 + }, + { + "epoch": 0.8434994771697456, + "grad_norm": 0.34382200241088867, + "learning_rate": 1.662600209132102e-05, + "loss": 0.3501, + "step": 2420 + }, + { + "epoch": 0.8469850121993726, + "grad_norm": 3.8618369102478027, + "learning_rate": 1.661205995120251e-05, + "loss": 0.3013, + "step": 2430 + }, + { + "epoch": 0.8504705472289996, + "grad_norm": 3.9153378009796143, + "learning_rate": 1.6598117811084e-05, + "loss": 0.2558, + "step": 2440 + }, + { + "epoch": 0.8539560822586267, + "grad_norm": 6.622893810272217, + "learning_rate": 1.6584175670965495e-05, + "loss": 0.1465, + "step": 2450 + }, + { + "epoch": 0.8574416172882537, + "grad_norm": 0.25614631175994873, + "learning_rate": 1.6570233530846986e-05, + "loss": 0.3136, + "step": 2460 + }, + { + "epoch": 0.8609271523178808, + "grad_norm": 1.1627339124679565, + "learning_rate": 1.6556291390728477e-05, + "loss": 0.1084, + "step": 2470 + }, + { + "epoch": 0.8644126873475079, + "grad_norm": 3.3835268020629883, + "learning_rate": 1.654234925060997e-05, + "loss": 0.4099, + "step": 2480 + }, + { + "epoch": 0.8678982223771349, + "grad_norm": 15.521218299865723, + "learning_rate": 1.6528407110491462e-05, + "loss": 0.1028, + "step": 2490 + }, + { + "epoch": 0.8713837574067619, + "grad_norm": 0.2716628313064575, + "learning_rate": 1.6514464970372953e-05, + "loss": 0.1371, + "step": 2500 + }, + { + "epoch": 0.874869292436389, + "grad_norm": 0.11692991107702255, + "learning_rate": 1.6500522830254447e-05, + "loss": 0.2051, + "step": 2510 + }, + { + "epoch": 0.878354827466016, + "grad_norm": 5.96846866607666, + "learning_rate": 1.6486580690135938e-05, + "loss": 0.1758, + "step": 2520 + }, + { + "epoch": 0.8818403624956431, + "grad_norm": 3.1128947734832764, + "learning_rate": 1.647263855001743e-05, + "loss": 0.3391, + "step": 2530 + }, + { + "epoch": 0.8853258975252701, + "grad_norm": 0.7964233756065369, + "learning_rate": 1.645869640989892e-05, + "loss": 0.3088, + "step": 2540 + }, + { + "epoch": 0.8888114325548971, + "grad_norm": 0.2802188992500305, + "learning_rate": 1.6444754269780413e-05, + "loss": 0.2082, + "step": 2550 + }, + { + "epoch": 0.8922969675845243, + "grad_norm": 0.3661327362060547, + "learning_rate": 1.6430812129661904e-05, + "loss": 0.231, + "step": 2560 + }, + { + "epoch": 0.8957825026141513, + "grad_norm": 6.788111686706543, + "learning_rate": 1.6416869989543395e-05, + "loss": 0.183, + "step": 2570 + }, + { + "epoch": 0.8992680376437783, + "grad_norm": 17.159854888916016, + "learning_rate": 1.640292784942489e-05, + "loss": 0.2526, + "step": 2580 + }, + { + "epoch": 0.9027535726734054, + "grad_norm": 23.243045806884766, + "learning_rate": 1.638898570930638e-05, + "loss": 0.2203, + "step": 2590 + }, + { + "epoch": 0.9062391077030324, + "grad_norm": 0.10591837018728256, + "learning_rate": 1.637504356918787e-05, + "loss": 0.2705, + "step": 2600 + }, + { + "epoch": 0.9097246427326595, + "grad_norm": 8.592724800109863, + "learning_rate": 1.6361101429069365e-05, + "loss": 0.1686, + "step": 2610 + }, + { + "epoch": 0.9132101777622865, + "grad_norm": 0.5851829051971436, + "learning_rate": 1.6347159288950856e-05, + "loss": 0.1452, + "step": 2620 + }, + { + "epoch": 0.9166957127919135, + "grad_norm": 0.5782294869422913, + "learning_rate": 1.6333217148832347e-05, + "loss": 0.1843, + "step": 2630 + }, + { + "epoch": 0.9201812478215406, + "grad_norm": 27.61756134033203, + "learning_rate": 1.631927500871384e-05, + "loss": 0.1058, + "step": 2640 + }, + { + "epoch": 0.9236667828511677, + "grad_norm": 0.1436556726694107, + "learning_rate": 1.6305332868595332e-05, + "loss": 0.1947, + "step": 2650 + }, + { + "epoch": 0.9271523178807947, + "grad_norm": 0.11741270869970322, + "learning_rate": 1.6291390728476823e-05, + "loss": 0.2313, + "step": 2660 + }, + { + "epoch": 0.9306378529104218, + "grad_norm": 6.36224889755249, + "learning_rate": 1.6277448588358313e-05, + "loss": 0.291, + "step": 2670 + }, + { + "epoch": 0.9341233879400488, + "grad_norm": 0.2954885959625244, + "learning_rate": 1.6263506448239804e-05, + "loss": 0.3318, + "step": 2680 + }, + { + "epoch": 0.9376089229696758, + "grad_norm": 13.240195274353027, + "learning_rate": 1.62495643081213e-05, + "loss": 0.3679, + "step": 2690 + }, + { + "epoch": 0.9410944579993029, + "grad_norm": 24.815946578979492, + "learning_rate": 1.623562216800279e-05, + "loss": 0.3279, + "step": 2700 + }, + { + "epoch": 0.9445799930289299, + "grad_norm": 0.11210108548402786, + "learning_rate": 1.622168002788428e-05, + "loss": 0.2753, + "step": 2710 + }, + { + "epoch": 0.948065528058557, + "grad_norm": 3.065356969833374, + "learning_rate": 1.6207737887765774e-05, + "loss": 0.1844, + "step": 2720 + }, + { + "epoch": 0.951551063088184, + "grad_norm": 1.7035739421844482, + "learning_rate": 1.6193795747647265e-05, + "loss": 0.2426, + "step": 2730 + }, + { + "epoch": 0.955036598117811, + "grad_norm": 23.689104080200195, + "learning_rate": 1.6179853607528756e-05, + "loss": 0.3324, + "step": 2740 + }, + { + "epoch": 0.9585221331474382, + "grad_norm": 8.64907169342041, + "learning_rate": 1.616591146741025e-05, + "loss": 0.1349, + "step": 2750 + }, + { + "epoch": 0.9620076681770652, + "grad_norm": 0.21376356482505798, + "learning_rate": 1.615196932729174e-05, + "loss": 0.1536, + "step": 2760 + }, + { + "epoch": 0.9654932032066922, + "grad_norm": 21.229475021362305, + "learning_rate": 1.6138027187173232e-05, + "loss": 0.3432, + "step": 2770 + }, + { + "epoch": 0.9689787382363193, + "grad_norm": 0.1763678640127182, + "learning_rate": 1.6124085047054726e-05, + "loss": 0.121, + "step": 2780 + }, + { + "epoch": 0.9724642732659463, + "grad_norm": 0.35627982020378113, + "learning_rate": 1.6110142906936217e-05, + "loss": 0.2114, + "step": 2790 + }, + { + "epoch": 0.9759498082955733, + "grad_norm": 0.675436794757843, + "learning_rate": 1.6096200766817708e-05, + "loss": 0.2136, + "step": 2800 + }, + { + "epoch": 0.9794353433252004, + "grad_norm": 0.7453556656837463, + "learning_rate": 1.60822586266992e-05, + "loss": 0.1687, + "step": 2810 + }, + { + "epoch": 0.9829208783548274, + "grad_norm": 1.4391916990280151, + "learning_rate": 1.606831648658069e-05, + "loss": 0.1633, + "step": 2820 + }, + { + "epoch": 0.9864064133844546, + "grad_norm": 0.10471702367067337, + "learning_rate": 1.6054374346462183e-05, + "loss": 0.1795, + "step": 2830 + }, + { + "epoch": 0.9898919484140816, + "grad_norm": 2.418611764907837, + "learning_rate": 1.6040432206343674e-05, + "loss": 0.1899, + "step": 2840 + }, + { + "epoch": 0.9933774834437086, + "grad_norm": 6.218674659729004, + "learning_rate": 1.6026490066225165e-05, + "loss": 0.1613, + "step": 2850 + }, + { + "epoch": 0.9968630184733357, + "grad_norm": 23.257627487182617, + "learning_rate": 1.601254792610666e-05, + "loss": 0.1829, + "step": 2860 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9686419753086419, + "eval_loss": 0.131916344165802, + "eval_runtime": 20.8607, + "eval_samples_per_second": 194.145, + "eval_steps_per_second": 24.304, + "step": 2869 + }, + { + "epoch": 1.0003485535029628, + "grad_norm": 13.868711471557617, + "learning_rate": 1.599860578598815e-05, + "loss": 0.3279, + "step": 2870 + }, + { + "epoch": 1.0038340885325898, + "grad_norm": 0.0889001339673996, + "learning_rate": 1.598466364586964e-05, + "loss": 0.1179, + "step": 2880 + }, + { + "epoch": 1.0073196235622168, + "grad_norm": 7.238311767578125, + "learning_rate": 1.5970721505751135e-05, + "loss": 0.1339, + "step": 2890 + }, + { + "epoch": 1.0108051585918438, + "grad_norm": 9.563148498535156, + "learning_rate": 1.5956779365632626e-05, + "loss": 0.2212, + "step": 2900 + }, + { + "epoch": 1.0142906936214708, + "grad_norm": 10.648134231567383, + "learning_rate": 1.5942837225514117e-05, + "loss": 0.232, + "step": 2910 + }, + { + "epoch": 1.0177762286510978, + "grad_norm": 0.2760317325592041, + "learning_rate": 1.592889508539561e-05, + "loss": 0.1669, + "step": 2920 + }, + { + "epoch": 1.021261763680725, + "grad_norm": 0.15449553728103638, + "learning_rate": 1.5914952945277102e-05, + "loss": 0.2226, + "step": 2930 + }, + { + "epoch": 1.024747298710352, + "grad_norm": 0.09482862800359726, + "learning_rate": 1.5901010805158593e-05, + "loss": 0.1987, + "step": 2940 + }, + { + "epoch": 1.028232833739979, + "grad_norm": 1.185239553451538, + "learning_rate": 1.5887068665040083e-05, + "loss": 0.2861, + "step": 2950 + }, + { + "epoch": 1.031718368769606, + "grad_norm": 0.9025907516479492, + "learning_rate": 1.5873126524921578e-05, + "loss": 0.1047, + "step": 2960 + }, + { + "epoch": 1.035203903799233, + "grad_norm": 21.41613006591797, + "learning_rate": 1.585918438480307e-05, + "loss": 0.2394, + "step": 2970 + }, + { + "epoch": 1.0386894388288603, + "grad_norm": 25.966737747192383, + "learning_rate": 1.584524224468456e-05, + "loss": 0.1353, + "step": 2980 + }, + { + "epoch": 1.0421749738584873, + "grad_norm": 0.21152754127979279, + "learning_rate": 1.5831300104566053e-05, + "loss": 0.3361, + "step": 2990 + }, + { + "epoch": 1.0456605088881143, + "grad_norm": 0.37001797556877136, + "learning_rate": 1.5817357964447544e-05, + "loss": 0.2875, + "step": 3000 + }, + { + "epoch": 1.0491460439177414, + "grad_norm": 0.16642731428146362, + "learning_rate": 1.5803415824329035e-05, + "loss": 0.2697, + "step": 3010 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.22467225790023804, + "learning_rate": 1.578947368421053e-05, + "loss": 0.1819, + "step": 3020 + }, + { + "epoch": 1.0561171139769954, + "grad_norm": 0.6170486807823181, + "learning_rate": 1.577553154409202e-05, + "loss": 0.0742, + "step": 3030 + }, + { + "epoch": 1.0596026490066226, + "grad_norm": 7.1253862380981445, + "learning_rate": 1.576158940397351e-05, + "loss": 0.2141, + "step": 3040 + }, + { + "epoch": 1.0630881840362496, + "grad_norm": 0.6570123434066772, + "learning_rate": 1.5747647263855005e-05, + "loss": 0.1353, + "step": 3050 + }, + { + "epoch": 1.0665737190658766, + "grad_norm": 1.323370337486267, + "learning_rate": 1.5733705123736496e-05, + "loss": 0.2516, + "step": 3060 + }, + { + "epoch": 1.0700592540955036, + "grad_norm": 0.3584739863872528, + "learning_rate": 1.5719762983617987e-05, + "loss": 0.1877, + "step": 3070 + }, + { + "epoch": 1.0735447891251306, + "grad_norm": 26.268049240112305, + "learning_rate": 1.5705820843499477e-05, + "loss": 0.1618, + "step": 3080 + }, + { + "epoch": 1.0770303241547579, + "grad_norm": 1.0968873500823975, + "learning_rate": 1.5691878703380968e-05, + "loss": 0.2546, + "step": 3090 + }, + { + "epoch": 1.0805158591843849, + "grad_norm": 0.08166905492544174, + "learning_rate": 1.5677936563262463e-05, + "loss": 0.1949, + "step": 3100 + }, + { + "epoch": 1.0840013942140119, + "grad_norm": 0.12049412727355957, + "learning_rate": 1.5663994423143953e-05, + "loss": 0.2507, + "step": 3110 + }, + { + "epoch": 1.0874869292436389, + "grad_norm": 14.46548080444336, + "learning_rate": 1.5650052283025444e-05, + "loss": 0.1774, + "step": 3120 + }, + { + "epoch": 1.0909724642732659, + "grad_norm": 3.6209402084350586, + "learning_rate": 1.563611014290694e-05, + "loss": 0.162, + "step": 3130 + }, + { + "epoch": 1.094457999302893, + "grad_norm": 0.8061731457710266, + "learning_rate": 1.562216800278843e-05, + "loss": 0.117, + "step": 3140 + }, + { + "epoch": 1.0979435343325201, + "grad_norm": 0.17544050514698029, + "learning_rate": 1.560822586266992e-05, + "loss": 0.1057, + "step": 3150 + }, + { + "epoch": 1.1014290693621471, + "grad_norm": 0.06735656410455704, + "learning_rate": 1.5594283722551414e-05, + "loss": 0.2237, + "step": 3160 + }, + { + "epoch": 1.1049146043917741, + "grad_norm": 12.56591510772705, + "learning_rate": 1.5580341582432905e-05, + "loss": 0.1499, + "step": 3170 + }, + { + "epoch": 1.1084001394214011, + "grad_norm": 1.4435107707977295, + "learning_rate": 1.5566399442314396e-05, + "loss": 0.3691, + "step": 3180 + }, + { + "epoch": 1.1118856744510281, + "grad_norm": 0.12776677310466766, + "learning_rate": 1.555245730219589e-05, + "loss": 0.1918, + "step": 3190 + }, + { + "epoch": 1.1153712094806554, + "grad_norm": 0.09562991559505463, + "learning_rate": 1.553851516207738e-05, + "loss": 0.4167, + "step": 3200 + }, + { + "epoch": 1.1188567445102824, + "grad_norm": 1.6694416999816895, + "learning_rate": 1.552457302195887e-05, + "loss": 0.1662, + "step": 3210 + }, + { + "epoch": 1.1223422795399094, + "grad_norm": 16.271982192993164, + "learning_rate": 1.5510630881840362e-05, + "loss": 0.238, + "step": 3220 + }, + { + "epoch": 1.1258278145695364, + "grad_norm": 10.23219108581543, + "learning_rate": 1.5496688741721853e-05, + "loss": 0.2752, + "step": 3230 + }, + { + "epoch": 1.1293133495991634, + "grad_norm": 2.7908616065979004, + "learning_rate": 1.5482746601603347e-05, + "loss": 0.058, + "step": 3240 + }, + { + "epoch": 1.1327988846287904, + "grad_norm": 0.17575271427631378, + "learning_rate": 1.5468804461484838e-05, + "loss": 0.2244, + "step": 3250 + }, + { + "epoch": 1.1362844196584176, + "grad_norm": 1.6930599212646484, + "learning_rate": 1.545486232136633e-05, + "loss": 0.2232, + "step": 3260 + }, + { + "epoch": 1.1397699546880447, + "grad_norm": 1.9705103635787964, + "learning_rate": 1.5440920181247823e-05, + "loss": 0.1543, + "step": 3270 + }, + { + "epoch": 1.1432554897176717, + "grad_norm": 11.21917724609375, + "learning_rate": 1.5426978041129314e-05, + "loss": 0.3028, + "step": 3280 + }, + { + "epoch": 1.1467410247472987, + "grad_norm": 4.643821716308594, + "learning_rate": 1.5413035901010805e-05, + "loss": 0.303, + "step": 3290 + }, + { + "epoch": 1.1502265597769257, + "grad_norm": 19.168472290039062, + "learning_rate": 1.53990937608923e-05, + "loss": 0.2017, + "step": 3300 + }, + { + "epoch": 1.153712094806553, + "grad_norm": 0.08442903310060501, + "learning_rate": 1.538515162077379e-05, + "loss": 0.0612, + "step": 3310 + }, + { + "epoch": 1.15719762983618, + "grad_norm": 0.07782625406980515, + "learning_rate": 1.5371209480655284e-05, + "loss": 0.1009, + "step": 3320 + }, + { + "epoch": 1.160683164865807, + "grad_norm": 17.379108428955078, + "learning_rate": 1.5357267340536775e-05, + "loss": 0.205, + "step": 3330 + }, + { + "epoch": 1.164168699895434, + "grad_norm": 0.07793194055557251, + "learning_rate": 1.5343325200418266e-05, + "loss": 0.261, + "step": 3340 + }, + { + "epoch": 1.167654234925061, + "grad_norm": 0.10120818763971329, + "learning_rate": 1.5329383060299757e-05, + "loss": 0.1319, + "step": 3350 + }, + { + "epoch": 1.1711397699546882, + "grad_norm": 13.039447784423828, + "learning_rate": 1.5315440920181247e-05, + "loss": 0.2302, + "step": 3360 + }, + { + "epoch": 1.1746253049843152, + "grad_norm": 0.0781761035323143, + "learning_rate": 1.530149878006274e-05, + "loss": 0.1687, + "step": 3370 + }, + { + "epoch": 1.1781108400139422, + "grad_norm": 5.389220237731934, + "learning_rate": 1.5287556639944232e-05, + "loss": 0.1653, + "step": 3380 + }, + { + "epoch": 1.1815963750435692, + "grad_norm": 16.386268615722656, + "learning_rate": 1.5273614499825723e-05, + "loss": 0.2369, + "step": 3390 + }, + { + "epoch": 1.1850819100731962, + "grad_norm": 0.0885128378868103, + "learning_rate": 1.5259672359707217e-05, + "loss": 0.1952, + "step": 3400 + }, + { + "epoch": 1.1885674451028232, + "grad_norm": 0.17366395890712738, + "learning_rate": 1.5245730219588708e-05, + "loss": 0.2546, + "step": 3410 + }, + { + "epoch": 1.1920529801324504, + "grad_norm": 0.10797934234142303, + "learning_rate": 1.52317880794702e-05, + "loss": 0.1879, + "step": 3420 + }, + { + "epoch": 1.1955385151620774, + "grad_norm": 5.789648532867432, + "learning_rate": 1.5217845939351692e-05, + "loss": 0.2316, + "step": 3430 + }, + { + "epoch": 1.1990240501917044, + "grad_norm": 0.19041916728019714, + "learning_rate": 1.5203903799233184e-05, + "loss": 0.3066, + "step": 3440 + }, + { + "epoch": 1.2025095852213314, + "grad_norm": 0.12138816714286804, + "learning_rate": 1.5189961659114677e-05, + "loss": 0.0508, + "step": 3450 + }, + { + "epoch": 1.2059951202509585, + "grad_norm": 0.9047268629074097, + "learning_rate": 1.5176019518996167e-05, + "loss": 0.2256, + "step": 3460 + }, + { + "epoch": 1.2094806552805855, + "grad_norm": 15.816200256347656, + "learning_rate": 1.516207737887766e-05, + "loss": 0.2817, + "step": 3470 + }, + { + "epoch": 1.2129661903102127, + "grad_norm": 0.109500952064991, + "learning_rate": 1.5148135238759152e-05, + "loss": 0.0286, + "step": 3480 + }, + { + "epoch": 1.2164517253398397, + "grad_norm": 1.6199902296066284, + "learning_rate": 1.5134193098640642e-05, + "loss": 0.124, + "step": 3490 + }, + { + "epoch": 1.2199372603694667, + "grad_norm": 0.5839070677757263, + "learning_rate": 1.5120250958522134e-05, + "loss": 0.1435, + "step": 3500 + }, + { + "epoch": 1.2234227953990937, + "grad_norm": 0.088262178003788, + "learning_rate": 1.5106308818403625e-05, + "loss": 0.1329, + "step": 3510 + }, + { + "epoch": 1.2269083304287207, + "grad_norm": 19.781274795532227, + "learning_rate": 1.5092366678285117e-05, + "loss": 0.105, + "step": 3520 + }, + { + "epoch": 1.230393865458348, + "grad_norm": 0.21191132068634033, + "learning_rate": 1.507842453816661e-05, + "loss": 0.2493, + "step": 3530 + }, + { + "epoch": 1.233879400487975, + "grad_norm": 12.934003829956055, + "learning_rate": 1.50644823980481e-05, + "loss": 0.185, + "step": 3540 + }, + { + "epoch": 1.237364935517602, + "grad_norm": 8.322224617004395, + "learning_rate": 1.5050540257929593e-05, + "loss": 0.2953, + "step": 3550 + }, + { + "epoch": 1.240850470547229, + "grad_norm": 0.5193243622779846, + "learning_rate": 1.5036598117811086e-05, + "loss": 0.2215, + "step": 3560 + }, + { + "epoch": 1.244336005576856, + "grad_norm": 12.667407989501953, + "learning_rate": 1.5022655977692577e-05, + "loss": 0.1893, + "step": 3570 + }, + { + "epoch": 1.2478215406064832, + "grad_norm": 0.07419061660766602, + "learning_rate": 1.5008713837574069e-05, + "loss": 0.1897, + "step": 3580 + }, + { + "epoch": 1.2513070756361102, + "grad_norm": 14.40754222869873, + "learning_rate": 1.4994771697455562e-05, + "loss": 0.4052, + "step": 3590 + }, + { + "epoch": 1.2547926106657372, + "grad_norm": 3.1872828006744385, + "learning_rate": 1.4980829557337054e-05, + "loss": 0.1727, + "step": 3600 + }, + { + "epoch": 1.2582781456953642, + "grad_norm": 28.886524200439453, + "learning_rate": 1.4966887417218545e-05, + "loss": 0.5003, + "step": 3610 + }, + { + "epoch": 1.2617636807249912, + "grad_norm": 0.11840394884347916, + "learning_rate": 1.4952945277100037e-05, + "loss": 0.1676, + "step": 3620 + }, + { + "epoch": 1.2652492157546185, + "grad_norm": 13.540458679199219, + "learning_rate": 1.4939003136981527e-05, + "loss": 0.2395, + "step": 3630 + }, + { + "epoch": 1.2687347507842452, + "grad_norm": 0.23518075048923492, + "learning_rate": 1.4925060996863019e-05, + "loss": 0.3769, + "step": 3640 + }, + { + "epoch": 1.2722202858138725, + "grad_norm": 0.5583080053329468, + "learning_rate": 1.4911118856744512e-05, + "loss": 0.2068, + "step": 3650 + }, + { + "epoch": 1.2757058208434995, + "grad_norm": 3.620421886444092, + "learning_rate": 1.4897176716626002e-05, + "loss": 0.2443, + "step": 3660 + }, + { + "epoch": 1.2791913558731265, + "grad_norm": 14.638289451599121, + "learning_rate": 1.4883234576507495e-05, + "loss": 0.0804, + "step": 3670 + }, + { + "epoch": 1.2826768909027535, + "grad_norm": 0.17217248678207397, + "learning_rate": 1.4869292436388987e-05, + "loss": 0.1282, + "step": 3680 + }, + { + "epoch": 1.2861624259323805, + "grad_norm": 0.0552227608859539, + "learning_rate": 1.4855350296270478e-05, + "loss": 0.0728, + "step": 3690 + }, + { + "epoch": 1.2896479609620077, + "grad_norm": 0.18412978947162628, + "learning_rate": 1.484140815615197e-05, + "loss": 0.3107, + "step": 3700 + }, + { + "epoch": 1.2931334959916347, + "grad_norm": 1.4496484994888306, + "learning_rate": 1.4827466016033463e-05, + "loss": 0.2406, + "step": 3710 + }, + { + "epoch": 1.2966190310212617, + "grad_norm": 16.918373107910156, + "learning_rate": 1.4813523875914954e-05, + "loss": 0.3134, + "step": 3720 + }, + { + "epoch": 1.3001045660508888, + "grad_norm": 5.249573230743408, + "learning_rate": 1.4799581735796447e-05, + "loss": 0.1284, + "step": 3730 + }, + { + "epoch": 1.3035901010805158, + "grad_norm": 19.409761428833008, + "learning_rate": 1.4785639595677939e-05, + "loss": 0.0873, + "step": 3740 + }, + { + "epoch": 1.307075636110143, + "grad_norm": 0.08998361974954605, + "learning_rate": 1.477169745555943e-05, + "loss": 0.2249, + "step": 3750 + }, + { + "epoch": 1.31056117113977, + "grad_norm": 4.676770210266113, + "learning_rate": 1.475775531544092e-05, + "loss": 0.2761, + "step": 3760 + }, + { + "epoch": 1.314046706169397, + "grad_norm": 4.773134708404541, + "learning_rate": 1.4743813175322411e-05, + "loss": 0.1475, + "step": 3770 + }, + { + "epoch": 1.317532241199024, + "grad_norm": 23.77858543395996, + "learning_rate": 1.4729871035203904e-05, + "loss": 0.1701, + "step": 3780 + }, + { + "epoch": 1.321017776228651, + "grad_norm": 6.024458408355713, + "learning_rate": 1.4715928895085396e-05, + "loss": 0.245, + "step": 3790 + }, + { + "epoch": 1.3245033112582782, + "grad_norm": 26.570066452026367, + "learning_rate": 1.4701986754966889e-05, + "loss": 0.2245, + "step": 3800 + }, + { + "epoch": 1.3279888462879053, + "grad_norm": 0.06447375565767288, + "learning_rate": 1.468804461484838e-05, + "loss": 0.101, + "step": 3810 + }, + { + "epoch": 1.3314743813175323, + "grad_norm": 11.744895935058594, + "learning_rate": 1.4674102474729872e-05, + "loss": 0.2009, + "step": 3820 + }, + { + "epoch": 1.3349599163471593, + "grad_norm": 0.07097235321998596, + "learning_rate": 1.4660160334611365e-05, + "loss": 0.0224, + "step": 3830 + }, + { + "epoch": 1.3384454513767863, + "grad_norm": 0.11342521011829376, + "learning_rate": 1.4646218194492856e-05, + "loss": 0.0885, + "step": 3840 + }, + { + "epoch": 1.3419309864064135, + "grad_norm": 0.06430939584970474, + "learning_rate": 1.4632276054374348e-05, + "loss": 0.2388, + "step": 3850 + }, + { + "epoch": 1.3454165214360405, + "grad_norm": 3.096365451812744, + "learning_rate": 1.461833391425584e-05, + "loss": 0.1814, + "step": 3860 + }, + { + "epoch": 1.3489020564656675, + "grad_norm": 0.2073626071214676, + "learning_rate": 1.4604391774137331e-05, + "loss": 0.1304, + "step": 3870 + }, + { + "epoch": 1.3523875914952945, + "grad_norm": 0.8284794688224792, + "learning_rate": 1.4590449634018824e-05, + "loss": 0.2057, + "step": 3880 + }, + { + "epoch": 1.3558731265249215, + "grad_norm": 23.129026412963867, + "learning_rate": 1.4576507493900316e-05, + "loss": 0.175, + "step": 3890 + }, + { + "epoch": 1.3593586615545485, + "grad_norm": 0.11513429880142212, + "learning_rate": 1.4562565353781806e-05, + "loss": 0.2399, + "step": 3900 + }, + { + "epoch": 1.3628441965841755, + "grad_norm": 5.663066864013672, + "learning_rate": 1.4548623213663298e-05, + "loss": 0.1074, + "step": 3910 + }, + { + "epoch": 1.3663297316138028, + "grad_norm": 21.3768310546875, + "learning_rate": 1.4534681073544789e-05, + "loss": 0.1827, + "step": 3920 + }, + { + "epoch": 1.3698152666434298, + "grad_norm": 9.777505874633789, + "learning_rate": 1.4520738933426281e-05, + "loss": 0.2197, + "step": 3930 + }, + { + "epoch": 1.3733008016730568, + "grad_norm": 0.8618378043174744, + "learning_rate": 1.4506796793307774e-05, + "loss": 0.2736, + "step": 3940 + }, + { + "epoch": 1.3767863367026838, + "grad_norm": 23.52062225341797, + "learning_rate": 1.4492854653189265e-05, + "loss": 0.2393, + "step": 3950 + }, + { + "epoch": 1.3802718717323108, + "grad_norm": 7.636297225952148, + "learning_rate": 1.4478912513070757e-05, + "loss": 0.16, + "step": 3960 + }, + { + "epoch": 1.383757406761938, + "grad_norm": 3.086662530899048, + "learning_rate": 1.446497037295225e-05, + "loss": 0.2637, + "step": 3970 + }, + { + "epoch": 1.387242941791565, + "grad_norm": 6.053905487060547, + "learning_rate": 1.4451028232833742e-05, + "loss": 0.1277, + "step": 3980 + }, + { + "epoch": 1.390728476821192, + "grad_norm": 0.09486166387796402, + "learning_rate": 1.4437086092715233e-05, + "loss": 0.0652, + "step": 3990 + }, + { + "epoch": 1.394214011850819, + "grad_norm": 9.314697265625, + "learning_rate": 1.4423143952596726e-05, + "loss": 0.194, + "step": 4000 + }, + { + "epoch": 1.397699546880446, + "grad_norm": 5.727607727050781, + "learning_rate": 1.4409201812478218e-05, + "loss": 0.1755, + "step": 4010 + }, + { + "epoch": 1.4011850819100733, + "grad_norm": 10.165273666381836, + "learning_rate": 1.4395259672359709e-05, + "loss": 0.3493, + "step": 4020 + }, + { + "epoch": 1.4046706169397003, + "grad_norm": 0.11753875017166138, + "learning_rate": 1.4381317532241201e-05, + "loss": 0.1553, + "step": 4030 + }, + { + "epoch": 1.4081561519693273, + "grad_norm": 0.0666135773062706, + "learning_rate": 1.436737539212269e-05, + "loss": 0.3826, + "step": 4040 + }, + { + "epoch": 1.4116416869989543, + "grad_norm": 0.06710106134414673, + "learning_rate": 1.4353433252004183e-05, + "loss": 0.1867, + "step": 4050 + }, + { + "epoch": 1.4151272220285813, + "grad_norm": 18.661643981933594, + "learning_rate": 1.4339491111885676e-05, + "loss": 0.2184, + "step": 4060 + }, + { + "epoch": 1.4186127570582086, + "grad_norm": 0.0907972976565361, + "learning_rate": 1.4325548971767166e-05, + "loss": 0.2187, + "step": 4070 + }, + { + "epoch": 1.4220982920878356, + "grad_norm": 1.6616415977478027, + "learning_rate": 1.4311606831648659e-05, + "loss": 0.1063, + "step": 4080 + }, + { + "epoch": 1.4255838271174626, + "grad_norm": 0.11328552663326263, + "learning_rate": 1.4297664691530151e-05, + "loss": 0.1416, + "step": 4090 + }, + { + "epoch": 1.4290693621470896, + "grad_norm": 0.36197254061698914, + "learning_rate": 1.4283722551411642e-05, + "loss": 0.3247, + "step": 4100 + }, + { + "epoch": 1.4325548971767166, + "grad_norm": 2.550544500350952, + "learning_rate": 1.4269780411293135e-05, + "loss": 0.2398, + "step": 4110 + }, + { + "epoch": 1.4360404322063438, + "grad_norm": 0.25212812423706055, + "learning_rate": 1.4255838271174627e-05, + "loss": 0.0808, + "step": 4120 + }, + { + "epoch": 1.4395259672359706, + "grad_norm": 0.5252062678337097, + "learning_rate": 1.4241896131056118e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 1.4430115022655978, + "grad_norm": 1.4656167030334473, + "learning_rate": 1.422795399093761e-05, + "loss": 0.2964, + "step": 4140 + }, + { + "epoch": 1.4464970372952248, + "grad_norm": 0.06279879063367844, + "learning_rate": 1.4214011850819103e-05, + "loss": 0.2259, + "step": 4150 + }, + { + "epoch": 1.4499825723248518, + "grad_norm": 0.2818046510219574, + "learning_rate": 1.4200069710700594e-05, + "loss": 0.2609, + "step": 4160 + }, + { + "epoch": 1.4534681073544788, + "grad_norm": 3.4003350734710693, + "learning_rate": 1.4186127570582085e-05, + "loss": 0.1227, + "step": 4170 + }, + { + "epoch": 1.4569536423841059, + "grad_norm": 0.0695640966296196, + "learning_rate": 1.4172185430463577e-05, + "loss": 0.0424, + "step": 4180 + }, + { + "epoch": 1.460439177413733, + "grad_norm": 0.06560744345188141, + "learning_rate": 1.4158243290345068e-05, + "loss": 0.2265, + "step": 4190 + }, + { + "epoch": 1.46392471244336, + "grad_norm": 0.46215468645095825, + "learning_rate": 1.414430115022656e-05, + "loss": 0.1508, + "step": 4200 + }, + { + "epoch": 1.467410247472987, + "grad_norm": 0.21034209430217743, + "learning_rate": 1.4130359010108053e-05, + "loss": 0.1693, + "step": 4210 + }, + { + "epoch": 1.470895782502614, + "grad_norm": 24.144786834716797, + "learning_rate": 1.4116416869989544e-05, + "loss": 0.1907, + "step": 4220 + }, + { + "epoch": 1.474381317532241, + "grad_norm": 0.0716620460152626, + "learning_rate": 1.4102474729871036e-05, + "loss": 0.1234, + "step": 4230 + }, + { + "epoch": 1.4778668525618683, + "grad_norm": 5.373108863830566, + "learning_rate": 1.4088532589752529e-05, + "loss": 0.0568, + "step": 4240 + }, + { + "epoch": 1.4813523875914953, + "grad_norm": 0.06202000752091408, + "learning_rate": 1.407459044963402e-05, + "loss": 0.2224, + "step": 4250 + }, + { + "epoch": 1.4848379226211224, + "grad_norm": 0.06813399493694305, + "learning_rate": 1.4060648309515512e-05, + "loss": 0.2171, + "step": 4260 + }, + { + "epoch": 1.4883234576507494, + "grad_norm": 0.05181474983692169, + "learning_rate": 1.4046706169397005e-05, + "loss": 0.1021, + "step": 4270 + }, + { + "epoch": 1.4918089926803764, + "grad_norm": 1.736260175704956, + "learning_rate": 1.4032764029278496e-05, + "loss": 0.0671, + "step": 4280 + }, + { + "epoch": 1.4952945277100036, + "grad_norm": 0.14703615009784698, + "learning_rate": 1.4018821889159988e-05, + "loss": 0.073, + "step": 4290 + }, + { + "epoch": 1.4987800627396306, + "grad_norm": 0.08682423084974289, + "learning_rate": 1.400487974904148e-05, + "loss": 0.1046, + "step": 4300 + }, + { + "epoch": 1.5022655977692576, + "grad_norm": 0.08216214179992676, + "learning_rate": 1.399093760892297e-05, + "loss": 0.1728, + "step": 4310 + }, + { + "epoch": 1.5057511327988846, + "grad_norm": 4.310842037200928, + "learning_rate": 1.3976995468804462e-05, + "loss": 0.0201, + "step": 4320 + }, + { + "epoch": 1.5092366678285116, + "grad_norm": 0.6964895129203796, + "learning_rate": 1.3963053328685953e-05, + "loss": 0.2253, + "step": 4330 + }, + { + "epoch": 1.5127222028581389, + "grad_norm": 0.06760009378194809, + "learning_rate": 1.3949111188567445e-05, + "loss": 0.0558, + "step": 4340 + }, + { + "epoch": 1.5162077378877656, + "grad_norm": 1.6589277982711792, + "learning_rate": 1.3935169048448938e-05, + "loss": 0.286, + "step": 4350 + }, + { + "epoch": 1.5196932729173929, + "grad_norm": 2.4069876670837402, + "learning_rate": 1.3921226908330429e-05, + "loss": 0.2241, + "step": 4360 + }, + { + "epoch": 1.5231788079470199, + "grad_norm": 0.12570062279701233, + "learning_rate": 1.3907284768211921e-05, + "loss": 0.1356, + "step": 4370 + }, + { + "epoch": 1.5266643429766469, + "grad_norm": 0.06179194152355194, + "learning_rate": 1.3893342628093414e-05, + "loss": 0.1527, + "step": 4380 + }, + { + "epoch": 1.5301498780062741, + "grad_norm": 0.05693870410323143, + "learning_rate": 1.3879400487974906e-05, + "loss": 0.1209, + "step": 4390 + }, + { + "epoch": 1.533635413035901, + "grad_norm": 9.671772003173828, + "learning_rate": 1.3865458347856397e-05, + "loss": 0.1119, + "step": 4400 + }, + { + "epoch": 1.5371209480655281, + "grad_norm": 0.3705396354198456, + "learning_rate": 1.385151620773789e-05, + "loss": 0.1369, + "step": 4410 + }, + { + "epoch": 1.5406064830951551, + "grad_norm": 0.1401708871126175, + "learning_rate": 1.3837574067619382e-05, + "loss": 0.1132, + "step": 4420 + }, + { + "epoch": 1.5440920181247821, + "grad_norm": 2.265974521636963, + "learning_rate": 1.3823631927500873e-05, + "loss": 0.3808, + "step": 4430 + }, + { + "epoch": 1.5475775531544091, + "grad_norm": 3.8402411937713623, + "learning_rate": 1.3809689787382366e-05, + "loss": 0.2986, + "step": 4440 + }, + { + "epoch": 1.5510630881840362, + "grad_norm": 1.6255053281784058, + "learning_rate": 1.3795747647263855e-05, + "loss": 0.042, + "step": 4450 + }, + { + "epoch": 1.5545486232136634, + "grad_norm": 0.05299604684114456, + "learning_rate": 1.3781805507145347e-05, + "loss": 0.0497, + "step": 4460 + }, + { + "epoch": 1.5580341582432904, + "grad_norm": 0.05339618772268295, + "learning_rate": 1.376786336702684e-05, + "loss": 0.1915, + "step": 4470 + }, + { + "epoch": 1.5615196932729174, + "grad_norm": 30.932308197021484, + "learning_rate": 1.375392122690833e-05, + "loss": 0.1334, + "step": 4480 + }, + { + "epoch": 1.5650052283025444, + "grad_norm": 18.293027877807617, + "learning_rate": 1.3739979086789823e-05, + "loss": 0.2085, + "step": 4490 + }, + { + "epoch": 1.5684907633321714, + "grad_norm": 5.334358215332031, + "learning_rate": 1.3726036946671315e-05, + "loss": 0.1781, + "step": 4500 + }, + { + "epoch": 1.5719762983617986, + "grad_norm": 0.09247801452875137, + "learning_rate": 1.3712094806552806e-05, + "loss": 0.1584, + "step": 4510 + }, + { + "epoch": 1.5754618333914254, + "grad_norm": 0.10990609228610992, + "learning_rate": 1.3698152666434299e-05, + "loss": 0.0539, + "step": 4520 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.12785302102565765, + "learning_rate": 1.3684210526315791e-05, + "loss": 0.2398, + "step": 4530 + }, + { + "epoch": 1.5824329034506797, + "grad_norm": 0.24163594841957092, + "learning_rate": 1.3670268386197282e-05, + "loss": 0.227, + "step": 4540 + }, + { + "epoch": 1.5859184384803067, + "grad_norm": 0.06425183266401291, + "learning_rate": 1.3656326246078775e-05, + "loss": 0.0495, + "step": 4550 + }, + { + "epoch": 1.589403973509934, + "grad_norm": 15.79089641571045, + "learning_rate": 1.3642384105960267e-05, + "loss": 0.1335, + "step": 4560 + }, + { + "epoch": 1.5928895085395607, + "grad_norm": 8.039728164672852, + "learning_rate": 1.362844196584176e-05, + "loss": 0.1999, + "step": 4570 + }, + { + "epoch": 1.596375043569188, + "grad_norm": 0.10453764349222183, + "learning_rate": 1.3614499825723249e-05, + "loss": 0.0428, + "step": 4580 + }, + { + "epoch": 1.599860578598815, + "grad_norm": 0.09684242308139801, + "learning_rate": 1.3600557685604741e-05, + "loss": 0.0348, + "step": 4590 + }, + { + "epoch": 1.603346113628442, + "grad_norm": 2.0049784183502197, + "learning_rate": 1.3586615545486232e-05, + "loss": 0.0227, + "step": 4600 + }, + { + "epoch": 1.6068316486580692, + "grad_norm": 0.05621238425374031, + "learning_rate": 1.3572673405367725e-05, + "loss": 0.2134, + "step": 4610 + }, + { + "epoch": 1.610317183687696, + "grad_norm": 24.956762313842773, + "learning_rate": 1.3558731265249217e-05, + "loss": 0.0569, + "step": 4620 + }, + { + "epoch": 1.6138027187173232, + "grad_norm": 0.04639355093240738, + "learning_rate": 1.3544789125130708e-05, + "loss": 0.1955, + "step": 4630 + }, + { + "epoch": 1.6172882537469502, + "grad_norm": 21.884952545166016, + "learning_rate": 1.35308469850122e-05, + "loss": 0.2995, + "step": 4640 + }, + { + "epoch": 1.6207737887765772, + "grad_norm": 0.07225365191698074, + "learning_rate": 1.3516904844893693e-05, + "loss": 0.1419, + "step": 4650 + }, + { + "epoch": 1.6242593238062044, + "grad_norm": 0.06399868428707123, + "learning_rate": 1.3502962704775184e-05, + "loss": 0.0805, + "step": 4660 + }, + { + "epoch": 1.6277448588358312, + "grad_norm": 0.06368258595466614, + "learning_rate": 1.3489020564656676e-05, + "loss": 0.3106, + "step": 4670 + }, + { + "epoch": 1.6312303938654584, + "grad_norm": 0.08223626762628555, + "learning_rate": 1.3475078424538169e-05, + "loss": 0.1631, + "step": 4680 + }, + { + "epoch": 1.6347159288950854, + "grad_norm": 6.698259353637695, + "learning_rate": 1.346113628441966e-05, + "loss": 0.2371, + "step": 4690 + }, + { + "epoch": 1.6382014639247124, + "grad_norm": 0.10331364721059799, + "learning_rate": 1.3447194144301152e-05, + "loss": 0.1428, + "step": 4700 + }, + { + "epoch": 1.6416869989543394, + "grad_norm": 0.18476131558418274, + "learning_rate": 1.3433252004182645e-05, + "loss": 0.1106, + "step": 4710 + }, + { + "epoch": 1.6451725339839665, + "grad_norm": 5.083899021148682, + "learning_rate": 1.3419309864064134e-05, + "loss": 0.1615, + "step": 4720 + }, + { + "epoch": 1.6486580690135937, + "grad_norm": 0.05285824462771416, + "learning_rate": 1.3405367723945626e-05, + "loss": 0.368, + "step": 4730 + }, + { + "epoch": 1.6521436040432205, + "grad_norm": 4.834890365600586, + "learning_rate": 1.3391425583827117e-05, + "loss": 0.2775, + "step": 4740 + }, + { + "epoch": 1.6556291390728477, + "grad_norm": 0.3817296624183655, + "learning_rate": 1.337748344370861e-05, + "loss": 0.0175, + "step": 4750 + }, + { + "epoch": 1.6591146741024747, + "grad_norm": 21.72384262084961, + "learning_rate": 1.3363541303590102e-05, + "loss": 0.181, + "step": 4760 + }, + { + "epoch": 1.6626002091321017, + "grad_norm": 6.993847846984863, + "learning_rate": 1.3349599163471595e-05, + "loss": 0.1962, + "step": 4770 + }, + { + "epoch": 1.666085744161729, + "grad_norm": 0.06561506539583206, + "learning_rate": 1.3335657023353085e-05, + "loss": 0.1334, + "step": 4780 + }, + { + "epoch": 1.6695712791913557, + "grad_norm": 10.643206596374512, + "learning_rate": 1.3321714883234578e-05, + "loss": 0.183, + "step": 4790 + }, + { + "epoch": 1.673056814220983, + "grad_norm": 10.353646278381348, + "learning_rate": 1.330777274311607e-05, + "loss": 0.2462, + "step": 4800 + }, + { + "epoch": 1.67654234925061, + "grad_norm": 14.645241737365723, + "learning_rate": 1.3293830602997561e-05, + "loss": 0.1927, + "step": 4810 + }, + { + "epoch": 1.680027884280237, + "grad_norm": 0.07404288649559021, + "learning_rate": 1.3279888462879054e-05, + "loss": 0.1398, + "step": 4820 + }, + { + "epoch": 1.6835134193098642, + "grad_norm": 11.214003562927246, + "learning_rate": 1.3265946322760546e-05, + "loss": 0.1837, + "step": 4830 + }, + { + "epoch": 1.686998954339491, + "grad_norm": 0.11196375638246536, + "learning_rate": 1.3252004182642037e-05, + "loss": 0.1121, + "step": 4840 + }, + { + "epoch": 1.6904844893691182, + "grad_norm": 9.329215049743652, + "learning_rate": 1.3238062042523528e-05, + "loss": 0.1331, + "step": 4850 + }, + { + "epoch": 1.6939700243987452, + "grad_norm": 0.13003626465797424, + "learning_rate": 1.3224119902405019e-05, + "loss": 0.1499, + "step": 4860 + }, + { + "epoch": 1.6974555594283722, + "grad_norm": 0.055021703243255615, + "learning_rate": 1.3210177762286511e-05, + "loss": 0.1088, + "step": 4870 + }, + { + "epoch": 1.7009410944579995, + "grad_norm": 0.05173968896269798, + "learning_rate": 1.3196235622168004e-05, + "loss": 0.0519, + "step": 4880 + }, + { + "epoch": 1.7044266294876262, + "grad_norm": 1.1130493879318237, + "learning_rate": 1.3182293482049495e-05, + "loss": 0.215, + "step": 4890 + }, + { + "epoch": 1.7079121645172535, + "grad_norm": 36.07246017456055, + "learning_rate": 1.3168351341930987e-05, + "loss": 0.26, + "step": 4900 + }, + { + "epoch": 1.7113976995468805, + "grad_norm": 0.09109174460172653, + "learning_rate": 1.315440920181248e-05, + "loss": 0.018, + "step": 4910 + }, + { + "epoch": 1.7148832345765075, + "grad_norm": 2.289961814880371, + "learning_rate": 1.314046706169397e-05, + "loss": 0.1378, + "step": 4920 + }, + { + "epoch": 1.7183687696061345, + "grad_norm": 0.04753530025482178, + "learning_rate": 1.3126524921575463e-05, + "loss": 0.1949, + "step": 4930 + }, + { + "epoch": 1.7218543046357615, + "grad_norm": 6.459181785583496, + "learning_rate": 1.3112582781456955e-05, + "loss": 0.288, + "step": 4940 + }, + { + "epoch": 1.7253398396653887, + "grad_norm": 11.19622802734375, + "learning_rate": 1.3098640641338448e-05, + "loss": 0.1554, + "step": 4950 + }, + { + "epoch": 1.7288253746950157, + "grad_norm": 15.069147109985352, + "learning_rate": 1.3084698501219939e-05, + "loss": 0.2112, + "step": 4960 + }, + { + "epoch": 1.7323109097246427, + "grad_norm": 17.245620727539062, + "learning_rate": 1.3070756361101431e-05, + "loss": 0.2637, + "step": 4970 + }, + { + "epoch": 1.7357964447542698, + "grad_norm": 25.261966705322266, + "learning_rate": 1.3056814220982924e-05, + "loss": 0.1142, + "step": 4980 + }, + { + "epoch": 1.7392819797838968, + "grad_norm": 0.08086353540420532, + "learning_rate": 1.3042872080864413e-05, + "loss": 0.2082, + "step": 4990 + }, + { + "epoch": 1.742767514813524, + "grad_norm": 33.71073532104492, + "learning_rate": 1.3028929940745905e-05, + "loss": 0.0617, + "step": 5000 + }, + { + "epoch": 1.7462530498431508, + "grad_norm": 8.686712265014648, + "learning_rate": 1.3014987800627396e-05, + "loss": 0.1367, + "step": 5010 + }, + { + "epoch": 1.749738584872778, + "grad_norm": 2.0827035903930664, + "learning_rate": 1.3001045660508889e-05, + "loss": 0.1099, + "step": 5020 + }, + { + "epoch": 1.753224119902405, + "grad_norm": 7.724724769592285, + "learning_rate": 1.2987103520390381e-05, + "loss": 0.132, + "step": 5030 + }, + { + "epoch": 1.756709654932032, + "grad_norm": 7.940341949462891, + "learning_rate": 1.2973161380271872e-05, + "loss": 0.3053, + "step": 5040 + }, + { + "epoch": 1.7601951899616592, + "grad_norm": 0.10044675320386887, + "learning_rate": 1.2959219240153364e-05, + "loss": 0.1138, + "step": 5050 + }, + { + "epoch": 1.763680724991286, + "grad_norm": 0.045176051557064056, + "learning_rate": 1.2945277100034857e-05, + "loss": 0.212, + "step": 5060 + }, + { + "epoch": 1.7671662600209133, + "grad_norm": 3.353107452392578, + "learning_rate": 1.2931334959916348e-05, + "loss": 0.2066, + "step": 5070 + }, + { + "epoch": 1.7706517950505403, + "grad_norm": 0.062912218272686, + "learning_rate": 1.291739281979784e-05, + "loss": 0.2011, + "step": 5080 + }, + { + "epoch": 1.7741373300801673, + "grad_norm": 7.203383922576904, + "learning_rate": 1.2903450679679333e-05, + "loss": 0.4203, + "step": 5090 + }, + { + "epoch": 1.7776228651097945, + "grad_norm": 0.4804973602294922, + "learning_rate": 1.2889508539560824e-05, + "loss": 0.2008, + "step": 5100 + }, + { + "epoch": 1.7811084001394213, + "grad_norm": 0.10035104304552078, + "learning_rate": 1.2875566399442316e-05, + "loss": 0.1394, + "step": 5110 + }, + { + "epoch": 1.7845939351690485, + "grad_norm": 18.202369689941406, + "learning_rate": 1.2861624259323809e-05, + "loss": 0.0973, + "step": 5120 + }, + { + "epoch": 1.7880794701986755, + "grad_norm": 27.99166488647461, + "learning_rate": 1.2847682119205298e-05, + "loss": 0.3659, + "step": 5130 + }, + { + "epoch": 1.7915650052283025, + "grad_norm": 5.776055812835693, + "learning_rate": 1.283373997908679e-05, + "loss": 0.0682, + "step": 5140 + }, + { + "epoch": 1.7950505402579295, + "grad_norm": 6.58804178237915, + "learning_rate": 1.2819797838968283e-05, + "loss": 0.0869, + "step": 5150 + }, + { + "epoch": 1.7985360752875565, + "grad_norm": 2.2123751640319824, + "learning_rate": 1.2805855698849774e-05, + "loss": 0.1033, + "step": 5160 + }, + { + "epoch": 1.8020216103171838, + "grad_norm": 5.483887195587158, + "learning_rate": 1.2791913558731266e-05, + "loss": 0.21, + "step": 5170 + }, + { + "epoch": 1.8055071453468108, + "grad_norm": 35.22694778442383, + "learning_rate": 1.2777971418612759e-05, + "loss": 0.0469, + "step": 5180 + }, + { + "epoch": 1.8089926803764378, + "grad_norm": 16.363428115844727, + "learning_rate": 1.276402927849425e-05, + "loss": 0.2203, + "step": 5190 + }, + { + "epoch": 1.8124782154060648, + "grad_norm": 3.4047915935516357, + "learning_rate": 1.2750087138375742e-05, + "loss": 0.0254, + "step": 5200 + }, + { + "epoch": 1.8159637504356918, + "grad_norm": 0.04784770309925079, + "learning_rate": 1.2736144998257234e-05, + "loss": 0.1671, + "step": 5210 + }, + { + "epoch": 1.819449285465319, + "grad_norm": 0.18466421961784363, + "learning_rate": 1.2722202858138725e-05, + "loss": 0.2333, + "step": 5220 + }, + { + "epoch": 1.8229348204949458, + "grad_norm": 0.04754915460944176, + "learning_rate": 1.2708260718020218e-05, + "loss": 0.109, + "step": 5230 + }, + { + "epoch": 1.826420355524573, + "grad_norm": 12.06218147277832, + "learning_rate": 1.269431857790171e-05, + "loss": 0.1115, + "step": 5240 + }, + { + "epoch": 1.8299058905542, + "grad_norm": 12.35152816772461, + "learning_rate": 1.2680376437783201e-05, + "loss": 0.1294, + "step": 5250 + }, + { + "epoch": 1.833391425583827, + "grad_norm": 0.5232785940170288, + "learning_rate": 1.2666434297664692e-05, + "loss": 0.1716, + "step": 5260 + }, + { + "epoch": 1.8368769606134543, + "grad_norm": 0.065973199903965, + "learning_rate": 1.2652492157546183e-05, + "loss": 0.2806, + "step": 5270 + }, + { + "epoch": 1.840362495643081, + "grad_norm": 0.05121095851063728, + "learning_rate": 1.2638550017427675e-05, + "loss": 0.3057, + "step": 5280 + }, + { + "epoch": 1.8438480306727083, + "grad_norm": 0.04605868458747864, + "learning_rate": 1.2624607877309168e-05, + "loss": 0.182, + "step": 5290 + }, + { + "epoch": 1.8473335657023353, + "grad_norm": 3.7501471042633057, + "learning_rate": 1.2610665737190659e-05, + "loss": 0.0176, + "step": 5300 + }, + { + "epoch": 1.8508191007319623, + "grad_norm": 21.904052734375, + "learning_rate": 1.2596723597072151e-05, + "loss": 0.2681, + "step": 5310 + }, + { + "epoch": 1.8543046357615895, + "grad_norm": 4.048603534698486, + "learning_rate": 1.2582781456953644e-05, + "loss": 0.1016, + "step": 5320 + }, + { + "epoch": 1.8577901707912163, + "grad_norm": 17.176145553588867, + "learning_rate": 1.2568839316835134e-05, + "loss": 0.344, + "step": 5330 + }, + { + "epoch": 1.8612757058208436, + "grad_norm": 0.05373512953519821, + "learning_rate": 1.2554897176716627e-05, + "loss": 0.0364, + "step": 5340 + }, + { + "epoch": 1.8647612408504706, + "grad_norm": 0.4178902506828308, + "learning_rate": 1.254095503659812e-05, + "loss": 0.2752, + "step": 5350 + }, + { + "epoch": 1.8682467758800976, + "grad_norm": 0.11804527789354324, + "learning_rate": 1.2527012896479612e-05, + "loss": 0.2173, + "step": 5360 + }, + { + "epoch": 1.8717323109097248, + "grad_norm": 17.27764892578125, + "learning_rate": 1.2513070756361103e-05, + "loss": 0.1362, + "step": 5370 + }, + { + "epoch": 1.8752178459393516, + "grad_norm": 20.017900466918945, + "learning_rate": 1.2499128616242595e-05, + "loss": 0.1478, + "step": 5380 + }, + { + "epoch": 1.8787033809689788, + "grad_norm": 34.95002746582031, + "learning_rate": 1.2485186476124088e-05, + "loss": 0.1725, + "step": 5390 + }, + { + "epoch": 1.8821889159986058, + "grad_norm": 0.23638759553432465, + "learning_rate": 1.2471244336005577e-05, + "loss": 0.0903, + "step": 5400 + }, + { + "epoch": 1.8856744510282328, + "grad_norm": 8.639537811279297, + "learning_rate": 1.245730219588707e-05, + "loss": 0.1225, + "step": 5410 + }, + { + "epoch": 1.8891599860578598, + "grad_norm": 0.10233496874570847, + "learning_rate": 1.244336005576856e-05, + "loss": 0.2227, + "step": 5420 + }, + { + "epoch": 1.8926455210874868, + "grad_norm": 4.306946754455566, + "learning_rate": 1.2429417915650053e-05, + "loss": 0.1647, + "step": 5430 + }, + { + "epoch": 1.896131056117114, + "grad_norm": 18.103878021240234, + "learning_rate": 1.2415475775531545e-05, + "loss": 0.231, + "step": 5440 + }, + { + "epoch": 1.8996165911467409, + "grad_norm": 20.41852569580078, + "learning_rate": 1.2401533635413036e-05, + "loss": 0.2714, + "step": 5450 + }, + { + "epoch": 1.903102126176368, + "grad_norm": 2.2409658432006836, + "learning_rate": 1.2387591495294529e-05, + "loss": 0.1807, + "step": 5460 + }, + { + "epoch": 1.906587661205995, + "grad_norm": 0.10098310559988022, + "learning_rate": 1.2373649355176021e-05, + "loss": 0.2041, + "step": 5470 + }, + { + "epoch": 1.910073196235622, + "grad_norm": 2.7435004711151123, + "learning_rate": 1.2359707215057512e-05, + "loss": 0.1135, + "step": 5480 + }, + { + "epoch": 1.9135587312652493, + "grad_norm": 0.11639636754989624, + "learning_rate": 1.2345765074939004e-05, + "loss": 0.1849, + "step": 5490 + }, + { + "epoch": 1.9170442662948761, + "grad_norm": 13.145666122436523, + "learning_rate": 1.2331822934820497e-05, + "loss": 0.2934, + "step": 5500 + }, + { + "epoch": 1.9205298013245033, + "grad_norm": 8.368687629699707, + "learning_rate": 1.2317880794701988e-05, + "loss": 0.1166, + "step": 5510 + }, + { + "epoch": 1.9240153363541304, + "grad_norm": 0.16210561990737915, + "learning_rate": 1.230393865458348e-05, + "loss": 0.133, + "step": 5520 + }, + { + "epoch": 1.9275008713837574, + "grad_norm": 0.6329889893531799, + "learning_rate": 1.2289996514464973e-05, + "loss": 0.1613, + "step": 5530 + }, + { + "epoch": 1.9309864064133846, + "grad_norm": 4.6153693199157715, + "learning_rate": 1.2276054374346462e-05, + "loss": 0.09, + "step": 5540 + }, + { + "epoch": 1.9344719414430114, + "grad_norm": 0.1312878131866455, + "learning_rate": 1.2262112234227954e-05, + "loss": 0.1287, + "step": 5550 + }, + { + "epoch": 1.9379574764726386, + "grad_norm": 0.05980315059423447, + "learning_rate": 1.2248170094109447e-05, + "loss": 0.0959, + "step": 5560 + }, + { + "epoch": 1.9414430115022656, + "grad_norm": 19.431610107421875, + "learning_rate": 1.2234227953990938e-05, + "loss": 0.1181, + "step": 5570 + }, + { + "epoch": 1.9449285465318926, + "grad_norm": 0.44517797231674194, + "learning_rate": 1.222028581387243e-05, + "loss": 0.0802, + "step": 5580 + }, + { + "epoch": 1.9484140815615199, + "grad_norm": 0.04525260254740715, + "learning_rate": 1.2206343673753923e-05, + "loss": 0.2204, + "step": 5590 + }, + { + "epoch": 1.9518996165911466, + "grad_norm": 0.1254422813653946, + "learning_rate": 1.2192401533635414e-05, + "loss": 0.0652, + "step": 5600 + }, + { + "epoch": 1.9553851516207739, + "grad_norm": 1.6299126148223877, + "learning_rate": 1.2178459393516906e-05, + "loss": 0.2201, + "step": 5610 + }, + { + "epoch": 1.9588706866504009, + "grad_norm": 27.747234344482422, + "learning_rate": 1.2164517253398399e-05, + "loss": 0.0627, + "step": 5620 + }, + { + "epoch": 1.9623562216800279, + "grad_norm": 2.0174477100372314, + "learning_rate": 1.215057511327989e-05, + "loss": 0.2007, + "step": 5630 + }, + { + "epoch": 1.9658417567096549, + "grad_norm": 7.153161525726318, + "learning_rate": 1.2136632973161382e-05, + "loss": 0.2419, + "step": 5640 + }, + { + "epoch": 1.969327291739282, + "grad_norm": 0.46940529346466064, + "learning_rate": 1.2122690833042874e-05, + "loss": 0.0573, + "step": 5650 + }, + { + "epoch": 1.9728128267689091, + "grad_norm": 4.334830284118652, + "learning_rate": 1.2108748692924365e-05, + "loss": 0.2491, + "step": 5660 + }, + { + "epoch": 1.9762983617985361, + "grad_norm": 0.04316120222210884, + "learning_rate": 1.2094806552805856e-05, + "loss": 0.1859, + "step": 5670 + }, + { + "epoch": 1.9797838968281631, + "grad_norm": 0.04128291830420494, + "learning_rate": 1.2080864412687347e-05, + "loss": 0.1061, + "step": 5680 + }, + { + "epoch": 1.9832694318577901, + "grad_norm": 0.06560923904180527, + "learning_rate": 1.206692227256884e-05, + "loss": 0.054, + "step": 5690 + }, + { + "epoch": 1.9867549668874172, + "grad_norm": 0.05319322645664215, + "learning_rate": 1.2052980132450332e-05, + "loss": 0.1104, + "step": 5700 + }, + { + "epoch": 1.9902405019170444, + "grad_norm": 0.04641329124569893, + "learning_rate": 1.2039037992331823e-05, + "loss": 0.184, + "step": 5710 + }, + { + "epoch": 1.9937260369466712, + "grad_norm": 6.287529468536377, + "learning_rate": 1.2025095852213315e-05, + "loss": 0.0912, + "step": 5720 + }, + { + "epoch": 1.9972115719762984, + "grad_norm": 0.6517956852912903, + "learning_rate": 1.2011153712094808e-05, + "loss": 0.1706, + "step": 5730 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9795061728395061, + "eval_loss": 0.08464130759239197, + "eval_runtime": 18.9519, + "eval_samples_per_second": 213.699, + "eval_steps_per_second": 26.752, + "step": 5738 + }, + { + "epoch": 2.0006971070059256, + "grad_norm": 9.767017364501953, + "learning_rate": 1.19972115719763e-05, + "loss": 0.0665, + "step": 5740 + }, + { + "epoch": 2.0041826420355524, + "grad_norm": 9.686302185058594, + "learning_rate": 1.1983269431857791e-05, + "loss": 0.3084, + "step": 5750 + }, + { + "epoch": 2.0076681770651796, + "grad_norm": 34.309242248535156, + "learning_rate": 1.1969327291739283e-05, + "loss": 0.1616, + "step": 5760 + }, + { + "epoch": 2.0111537120948064, + "grad_norm": 0.03740281984210014, + "learning_rate": 1.1955385151620776e-05, + "loss": 0.0595, + "step": 5770 + }, + { + "epoch": 2.0146392471244337, + "grad_norm": 10.491089820861816, + "learning_rate": 1.1941443011502267e-05, + "loss": 0.2387, + "step": 5780 + }, + { + "epoch": 2.0181247821540604, + "grad_norm": 3.4705872535705566, + "learning_rate": 1.192750087138376e-05, + "loss": 0.0817, + "step": 5790 + }, + { + "epoch": 2.0216103171836877, + "grad_norm": 0.04854891821742058, + "learning_rate": 1.1913558731265252e-05, + "loss": 0.1739, + "step": 5800 + }, + { + "epoch": 2.025095852213315, + "grad_norm": 3.5387609004974365, + "learning_rate": 1.1899616591146741e-05, + "loss": 0.3629, + "step": 5810 + }, + { + "epoch": 2.0285813872429417, + "grad_norm": 0.45299866795539856, + "learning_rate": 1.1885674451028233e-05, + "loss": 0.2141, + "step": 5820 + }, + { + "epoch": 2.032066922272569, + "grad_norm": 0.03833978250622749, + "learning_rate": 1.1871732310909724e-05, + "loss": 0.0408, + "step": 5830 + }, + { + "epoch": 2.0355524573021957, + "grad_norm": 9.347251892089844, + "learning_rate": 1.1857790170791217e-05, + "loss": 0.2311, + "step": 5840 + }, + { + "epoch": 2.039037992331823, + "grad_norm": 0.11472854763269424, + "learning_rate": 1.184384803067271e-05, + "loss": 0.0753, + "step": 5850 + }, + { + "epoch": 2.04252352736145, + "grad_norm": 0.03732588514685631, + "learning_rate": 1.18299058905542e-05, + "loss": 0.0874, + "step": 5860 + }, + { + "epoch": 2.046009062391077, + "grad_norm": 9.975323677062988, + "learning_rate": 1.1815963750435693e-05, + "loss": 0.1819, + "step": 5870 + }, + { + "epoch": 2.049494597420704, + "grad_norm": 0.06995538622140884, + "learning_rate": 1.1802021610317185e-05, + "loss": 0.1342, + "step": 5880 + }, + { + "epoch": 2.052980132450331, + "grad_norm": 12.45875072479248, + "learning_rate": 1.1788079470198676e-05, + "loss": 0.0675, + "step": 5890 + }, + { + "epoch": 2.056465667479958, + "grad_norm": 15.665087699890137, + "learning_rate": 1.1774137330080168e-05, + "loss": 0.3064, + "step": 5900 + }, + { + "epoch": 2.0599512025095854, + "grad_norm": 6.563640594482422, + "learning_rate": 1.1760195189961661e-05, + "loss": 0.115, + "step": 5910 + }, + { + "epoch": 2.063436737539212, + "grad_norm": 0.057756856083869934, + "learning_rate": 1.1746253049843153e-05, + "loss": 0.1534, + "step": 5920 + }, + { + "epoch": 2.0669222725688394, + "grad_norm": 1.6861991882324219, + "learning_rate": 1.1732310909724644e-05, + "loss": 0.0366, + "step": 5930 + }, + { + "epoch": 2.070407807598466, + "grad_norm": 7.513923645019531, + "learning_rate": 1.1718368769606137e-05, + "loss": 0.3493, + "step": 5940 + }, + { + "epoch": 2.0738933426280934, + "grad_norm": 0.05242444574832916, + "learning_rate": 1.1704426629487626e-05, + "loss": 0.117, + "step": 5950 + }, + { + "epoch": 2.0773788776577207, + "grad_norm": 8.048820495605469, + "learning_rate": 1.1690484489369118e-05, + "loss": 0.1696, + "step": 5960 + }, + { + "epoch": 2.0808644126873475, + "grad_norm": 6.724079608917236, + "learning_rate": 1.1676542349250611e-05, + "loss": 0.3178, + "step": 5970 + }, + { + "epoch": 2.0843499477169747, + "grad_norm": 20.328981399536133, + "learning_rate": 1.1662600209132102e-05, + "loss": 0.0367, + "step": 5980 + }, + { + "epoch": 2.0878354827466015, + "grad_norm": 0.051835183054208755, + "learning_rate": 1.1648658069013594e-05, + "loss": 0.1535, + "step": 5990 + }, + { + "epoch": 2.0913210177762287, + "grad_norm": 1.784183382987976, + "learning_rate": 1.1634715928895087e-05, + "loss": 0.0967, + "step": 6000 + }, + { + "epoch": 2.0948065528058555, + "grad_norm": 0.08854708820581436, + "learning_rate": 1.1620773788776578e-05, + "loss": 0.2557, + "step": 6010 + }, + { + "epoch": 2.0982920878354827, + "grad_norm": 0.035098157823085785, + "learning_rate": 1.160683164865807e-05, + "loss": 0.0805, + "step": 6020 + }, + { + "epoch": 2.10177762286511, + "grad_norm": 2.4362149238586426, + "learning_rate": 1.1592889508539563e-05, + "loss": 0.1533, + "step": 6030 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.06450653076171875, + "learning_rate": 1.1578947368421053e-05, + "loss": 0.0535, + "step": 6040 + }, + { + "epoch": 2.108748692924364, + "grad_norm": 0.1979568749666214, + "learning_rate": 1.1565005228302546e-05, + "loss": 0.1879, + "step": 6050 + }, + { + "epoch": 2.1122342279539907, + "grad_norm": 0.039692219346761703, + "learning_rate": 1.1551063088184038e-05, + "loss": 0.2937, + "step": 6060 + }, + { + "epoch": 2.115719762983618, + "grad_norm": 0.07281683385372162, + "learning_rate": 1.153712094806553e-05, + "loss": 0.2128, + "step": 6070 + }, + { + "epoch": 2.119205298013245, + "grad_norm": 0.1039639413356781, + "learning_rate": 1.152317880794702e-05, + "loss": 0.0335, + "step": 6080 + }, + { + "epoch": 2.122690833042872, + "grad_norm": 6.56930685043335, + "learning_rate": 1.1509236667828511e-05, + "loss": 0.1515, + "step": 6090 + }, + { + "epoch": 2.126176368072499, + "grad_norm": 0.03588613122701645, + "learning_rate": 1.1495294527710003e-05, + "loss": 0.2306, + "step": 6100 + }, + { + "epoch": 2.129661903102126, + "grad_norm": 11.106224060058594, + "learning_rate": 1.1481352387591496e-05, + "loss": 0.1734, + "step": 6110 + }, + { + "epoch": 2.1331474381317532, + "grad_norm": 26.07666778564453, + "learning_rate": 1.1467410247472988e-05, + "loss": 0.2769, + "step": 6120 + }, + { + "epoch": 2.1366329731613805, + "grad_norm": 10.249650955200195, + "learning_rate": 1.145346810735448e-05, + "loss": 0.1453, + "step": 6130 + }, + { + "epoch": 2.1401185081910072, + "grad_norm": 2.9616682529449463, + "learning_rate": 1.1439525967235972e-05, + "loss": 0.1438, + "step": 6140 + }, + { + "epoch": 2.1436040432206345, + "grad_norm": 0.1858883649110794, + "learning_rate": 1.1425583827117464e-05, + "loss": 0.0802, + "step": 6150 + }, + { + "epoch": 2.1470895782502613, + "grad_norm": 0.5032067894935608, + "learning_rate": 1.1411641686998955e-05, + "loss": 0.1924, + "step": 6160 + }, + { + "epoch": 2.1505751132798885, + "grad_norm": 0.04852207750082016, + "learning_rate": 1.1397699546880448e-05, + "loss": 0.0768, + "step": 6170 + }, + { + "epoch": 2.1540606483095157, + "grad_norm": 0.9909989833831787, + "learning_rate": 1.138375740676194e-05, + "loss": 0.0138, + "step": 6180 + }, + { + "epoch": 2.1575461833391425, + "grad_norm": 0.036640316247940063, + "learning_rate": 1.1369815266643431e-05, + "loss": 0.0621, + "step": 6190 + }, + { + "epoch": 2.1610317183687697, + "grad_norm": 11.941831588745117, + "learning_rate": 1.1355873126524923e-05, + "loss": 0.1628, + "step": 6200 + }, + { + "epoch": 2.1645172533983965, + "grad_norm": 0.07637592405080795, + "learning_rate": 1.1341930986406416e-05, + "loss": 0.1821, + "step": 6210 + }, + { + "epoch": 2.1680027884280237, + "grad_norm": 0.07637934386730194, + "learning_rate": 1.1327988846287905e-05, + "loss": 0.1699, + "step": 6220 + }, + { + "epoch": 2.171488323457651, + "grad_norm": 0.0320703499019146, + "learning_rate": 1.1314046706169398e-05, + "loss": 0.136, + "step": 6230 + }, + { + "epoch": 2.1749738584872778, + "grad_norm": 0.06301452219486237, + "learning_rate": 1.1300104566050888e-05, + "loss": 0.1863, + "step": 6240 + }, + { + "epoch": 2.178459393516905, + "grad_norm": 0.03842555731534958, + "learning_rate": 1.128616242593238e-05, + "loss": 0.0453, + "step": 6250 + }, + { + "epoch": 2.1819449285465318, + "grad_norm": 0.03716614842414856, + "learning_rate": 1.1272220285813873e-05, + "loss": 0.1863, + "step": 6260 + }, + { + "epoch": 2.185430463576159, + "grad_norm": 0.043157655745744705, + "learning_rate": 1.1258278145695364e-05, + "loss": 0.2721, + "step": 6270 + }, + { + "epoch": 2.188915998605786, + "grad_norm": 0.07806668430566788, + "learning_rate": 1.1244336005576857e-05, + "loss": 0.1212, + "step": 6280 + }, + { + "epoch": 2.192401533635413, + "grad_norm": 0.025718241930007935, + "learning_rate": 1.123039386545835e-05, + "loss": 0.0891, + "step": 6290 + }, + { + "epoch": 2.1958870686650402, + "grad_norm": 1.4488409757614136, + "learning_rate": 1.121645172533984e-05, + "loss": 0.1036, + "step": 6300 + }, + { + "epoch": 2.199372603694667, + "grad_norm": 0.10433078557252884, + "learning_rate": 1.1202509585221333e-05, + "loss": 0.2275, + "step": 6310 + }, + { + "epoch": 2.2028581387242943, + "grad_norm": 0.049195412546396255, + "learning_rate": 1.1188567445102825e-05, + "loss": 0.01, + "step": 6320 + }, + { + "epoch": 2.206343673753921, + "grad_norm": 1.253458857536316, + "learning_rate": 1.1174625304984318e-05, + "loss": 0.0472, + "step": 6330 + }, + { + "epoch": 2.2098292087835483, + "grad_norm": 0.04194582253694534, + "learning_rate": 1.1160683164865808e-05, + "loss": 0.2199, + "step": 6340 + }, + { + "epoch": 2.2133147438131755, + "grad_norm": 0.22297650575637817, + "learning_rate": 1.11467410247473e-05, + "loss": 0.0768, + "step": 6350 + }, + { + "epoch": 2.2168002788428023, + "grad_norm": 0.03070775978267193, + "learning_rate": 1.113279888462879e-05, + "loss": 0.2436, + "step": 6360 + }, + { + "epoch": 2.2202858138724295, + "grad_norm": 0.04577281326055527, + "learning_rate": 1.1118856744510282e-05, + "loss": 0.0474, + "step": 6370 + }, + { + "epoch": 2.2237713489020563, + "grad_norm": 0.07843345403671265, + "learning_rate": 1.1104914604391775e-05, + "loss": 0.0127, + "step": 6380 + }, + { + "epoch": 2.2272568839316835, + "grad_norm": 0.08817867934703827, + "learning_rate": 1.1090972464273266e-05, + "loss": 0.2568, + "step": 6390 + }, + { + "epoch": 2.2307424189613108, + "grad_norm": 0.16615049540996552, + "learning_rate": 1.1077030324154758e-05, + "loss": 0.0813, + "step": 6400 + }, + { + "epoch": 2.2342279539909375, + "grad_norm": 0.0826653391122818, + "learning_rate": 1.106308818403625e-05, + "loss": 0.0563, + "step": 6410 + }, + { + "epoch": 2.2377134890205648, + "grad_norm": 0.05890136957168579, + "learning_rate": 1.1049146043917742e-05, + "loss": 0.1036, + "step": 6420 + }, + { + "epoch": 2.2411990240501916, + "grad_norm": 0.02518044412136078, + "learning_rate": 1.1035203903799234e-05, + "loss": 0.3812, + "step": 6430 + }, + { + "epoch": 2.244684559079819, + "grad_norm": 10.567590713500977, + "learning_rate": 1.1021261763680727e-05, + "loss": 0.1296, + "step": 6440 + }, + { + "epoch": 2.2481700941094456, + "grad_norm": 34.6755485534668, + "learning_rate": 1.1007319623562217e-05, + "loss": 0.1012, + "step": 6450 + }, + { + "epoch": 2.251655629139073, + "grad_norm": 15.21702766418457, + "learning_rate": 1.099337748344371e-05, + "loss": 0.104, + "step": 6460 + }, + { + "epoch": 2.2551411641687, + "grad_norm": 0.03763876110315323, + "learning_rate": 1.0979435343325202e-05, + "loss": 0.1439, + "step": 6470 + }, + { + "epoch": 2.258626699198327, + "grad_norm": 1.3450989723205566, + "learning_rate": 1.0965493203206693e-05, + "loss": 0.1072, + "step": 6480 + }, + { + "epoch": 2.262112234227954, + "grad_norm": 0.040909543633461, + "learning_rate": 1.0951551063088184e-05, + "loss": 0.1536, + "step": 6490 + }, + { + "epoch": 2.265597769257581, + "grad_norm": 11.683106422424316, + "learning_rate": 1.0937608922969677e-05, + "loss": 0.1513, + "step": 6500 + }, + { + "epoch": 2.269083304287208, + "grad_norm": 17.460403442382812, + "learning_rate": 1.0923666782851167e-05, + "loss": 0.2438, + "step": 6510 + }, + { + "epoch": 2.2725688393168353, + "grad_norm": 0.03460359200835228, + "learning_rate": 1.090972464273266e-05, + "loss": 0.0227, + "step": 6520 + }, + { + "epoch": 2.276054374346462, + "grad_norm": 0.37530404329299927, + "learning_rate": 1.0895782502614152e-05, + "loss": 0.3024, + "step": 6530 + }, + { + "epoch": 2.2795399093760893, + "grad_norm": 2.3782732486724854, + "learning_rate": 1.0881840362495643e-05, + "loss": 0.0873, + "step": 6540 + }, + { + "epoch": 2.283025444405716, + "grad_norm": 0.10322298854589462, + "learning_rate": 1.0867898222377136e-05, + "loss": 0.0503, + "step": 6550 + }, + { + "epoch": 2.2865109794353433, + "grad_norm": 0.08421849459409714, + "learning_rate": 1.0853956082258628e-05, + "loss": 0.2299, + "step": 6560 + }, + { + "epoch": 2.2899965144649705, + "grad_norm": 0.02602728269994259, + "learning_rate": 1.0840013942140119e-05, + "loss": 0.0616, + "step": 6570 + }, + { + "epoch": 2.2934820494945973, + "grad_norm": 0.14224806427955627, + "learning_rate": 1.0826071802021612e-05, + "loss": 0.1129, + "step": 6580 + }, + { + "epoch": 2.2969675845242246, + "grad_norm": 0.02626832202076912, + "learning_rate": 1.0812129661903104e-05, + "loss": 0.1778, + "step": 6590 + }, + { + "epoch": 2.3004531195538513, + "grad_norm": 0.044093526899814606, + "learning_rate": 1.0798187521784595e-05, + "loss": 0.3261, + "step": 6600 + }, + { + "epoch": 2.3039386545834786, + "grad_norm": 10.756707191467285, + "learning_rate": 1.0784245381666087e-05, + "loss": 0.0347, + "step": 6610 + }, + { + "epoch": 2.307424189613106, + "grad_norm": 21.886564254760742, + "learning_rate": 1.077030324154758e-05, + "loss": 0.0578, + "step": 6620 + }, + { + "epoch": 2.3109097246427326, + "grad_norm": 0.18429026007652283, + "learning_rate": 1.0756361101429069e-05, + "loss": 0.2541, + "step": 6630 + }, + { + "epoch": 2.31439525967236, + "grad_norm": 0.07464715838432312, + "learning_rate": 1.0742418961310562e-05, + "loss": 0.1547, + "step": 6640 + }, + { + "epoch": 2.3178807947019866, + "grad_norm": 3.714336395263672, + "learning_rate": 1.0728476821192052e-05, + "loss": 0.2902, + "step": 6650 + }, + { + "epoch": 2.321366329731614, + "grad_norm": 0.04718979448080063, + "learning_rate": 1.0714534681073545e-05, + "loss": 0.1189, + "step": 6660 + }, + { + "epoch": 2.324851864761241, + "grad_norm": 0.9812682867050171, + "learning_rate": 1.0700592540955037e-05, + "loss": 0.2747, + "step": 6670 + }, + { + "epoch": 2.328337399790868, + "grad_norm": 5.4444684982299805, + "learning_rate": 1.0686650400836528e-05, + "loss": 0.1446, + "step": 6680 + }, + { + "epoch": 2.331822934820495, + "grad_norm": 59.1495475769043, + "learning_rate": 1.067270826071802e-05, + "loss": 0.0812, + "step": 6690 + }, + { + "epoch": 2.335308469850122, + "grad_norm": 0.24086332321166992, + "learning_rate": 1.0658766120599513e-05, + "loss": 0.1042, + "step": 6700 + }, + { + "epoch": 2.338794004879749, + "grad_norm": 0.045006413012742996, + "learning_rate": 1.0644823980481006e-05, + "loss": 0.0306, + "step": 6710 + }, + { + "epoch": 2.3422795399093763, + "grad_norm": 0.03548438847064972, + "learning_rate": 1.0630881840362497e-05, + "loss": 0.0339, + "step": 6720 + }, + { + "epoch": 2.345765074939003, + "grad_norm": 0.043661557137966156, + "learning_rate": 1.0616939700243989e-05, + "loss": 0.1046, + "step": 6730 + }, + { + "epoch": 2.3492506099686303, + "grad_norm": 0.03901302441954613, + "learning_rate": 1.0602997560125482e-05, + "loss": 0.1696, + "step": 6740 + }, + { + "epoch": 2.352736144998257, + "grad_norm": 31.03610610961914, + "learning_rate": 1.0589055420006972e-05, + "loss": 0.464, + "step": 6750 + }, + { + "epoch": 2.3562216800278843, + "grad_norm": 1.4271265268325806, + "learning_rate": 1.0575113279888465e-05, + "loss": 0.0679, + "step": 6760 + }, + { + "epoch": 2.3597072150575116, + "grad_norm": 15.831421852111816, + "learning_rate": 1.0561171139769954e-05, + "loss": 0.1207, + "step": 6770 + }, + { + "epoch": 2.3631927500871384, + "grad_norm": 0.0829152911901474, + "learning_rate": 1.0547228999651447e-05, + "loss": 0.1525, + "step": 6780 + }, + { + "epoch": 2.3666782851167656, + "grad_norm": 0.04117709770798683, + "learning_rate": 1.0533286859532939e-05, + "loss": 0.1828, + "step": 6790 + }, + { + "epoch": 2.3701638201463924, + "grad_norm": 0.49705639481544495, + "learning_rate": 1.051934471941443e-05, + "loss": 0.2104, + "step": 6800 + }, + { + "epoch": 2.3736493551760196, + "grad_norm": 24.12613296508789, + "learning_rate": 1.0505402579295922e-05, + "loss": 0.2642, + "step": 6810 + }, + { + "epoch": 2.3771348902056464, + "grad_norm": 9.280982971191406, + "learning_rate": 1.0491460439177415e-05, + "loss": 0.1206, + "step": 6820 + }, + { + "epoch": 2.3806204252352736, + "grad_norm": 0.5508664846420288, + "learning_rate": 1.0477518299058906e-05, + "loss": 0.1583, + "step": 6830 + }, + { + "epoch": 2.384105960264901, + "grad_norm": 9.97512149810791, + "learning_rate": 1.0463576158940398e-05, + "loss": 0.112, + "step": 6840 + }, + { + "epoch": 2.3875914952945276, + "grad_norm": 0.03401152417063713, + "learning_rate": 1.044963401882189e-05, + "loss": 0.0891, + "step": 6850 + }, + { + "epoch": 2.391077030324155, + "grad_norm": 9.554537773132324, + "learning_rate": 1.0435691878703382e-05, + "loss": 0.1256, + "step": 6860 + }, + { + "epoch": 2.3945625653537816, + "grad_norm": 10.09185791015625, + "learning_rate": 1.0421749738584874e-05, + "loss": 0.1756, + "step": 6870 + }, + { + "epoch": 2.398048100383409, + "grad_norm": 0.05580438673496246, + "learning_rate": 1.0407807598466367e-05, + "loss": 0.3326, + "step": 6880 + }, + { + "epoch": 2.4015336354130357, + "grad_norm": 0.7927899360656738, + "learning_rate": 1.0393865458347859e-05, + "loss": 0.2091, + "step": 6890 + }, + { + "epoch": 2.405019170442663, + "grad_norm": 4.687802791595459, + "learning_rate": 1.0379923318229348e-05, + "loss": 0.1468, + "step": 6900 + }, + { + "epoch": 2.40850470547229, + "grad_norm": 0.03195074200630188, + "learning_rate": 1.036598117811084e-05, + "loss": 0.0722, + "step": 6910 + }, + { + "epoch": 2.411990240501917, + "grad_norm": 7.4297590255737305, + "learning_rate": 1.0352039037992331e-05, + "loss": 0.1048, + "step": 6920 + }, + { + "epoch": 2.415475775531544, + "grad_norm": 0.04244573786854744, + "learning_rate": 1.0338096897873824e-05, + "loss": 0.1695, + "step": 6930 + }, + { + "epoch": 2.418961310561171, + "grad_norm": 0.28662291169166565, + "learning_rate": 1.0324154757755317e-05, + "loss": 0.0572, + "step": 6940 + }, + { + "epoch": 2.422446845590798, + "grad_norm": 8.817971229553223, + "learning_rate": 1.0310212617636807e-05, + "loss": 0.169, + "step": 6950 + }, + { + "epoch": 2.4259323806204254, + "grad_norm": 0.06383698433637619, + "learning_rate": 1.02962704775183e-05, + "loss": 0.0905, + "step": 6960 + }, + { + "epoch": 2.429417915650052, + "grad_norm": 0.18662381172180176, + "learning_rate": 1.0282328337399792e-05, + "loss": 0.0093, + "step": 6970 + }, + { + "epoch": 2.4329034506796794, + "grad_norm": 23.655628204345703, + "learning_rate": 1.0268386197281283e-05, + "loss": 0.1657, + "step": 6980 + }, + { + "epoch": 2.436388985709306, + "grad_norm": 19.459394454956055, + "learning_rate": 1.0254444057162776e-05, + "loss": 0.1519, + "step": 6990 + }, + { + "epoch": 2.4398745207389334, + "grad_norm": 9.555098533630371, + "learning_rate": 1.0240501917044268e-05, + "loss": 0.1095, + "step": 7000 + }, + { + "epoch": 2.4433600557685606, + "grad_norm": 0.10387060046195984, + "learning_rate": 1.0226559776925759e-05, + "loss": 0.1536, + "step": 7010 + }, + { + "epoch": 2.4468455907981874, + "grad_norm": 0.0778062492609024, + "learning_rate": 1.0212617636807251e-05, + "loss": 0.0229, + "step": 7020 + }, + { + "epoch": 2.4503311258278146, + "grad_norm": 15.077468872070312, + "learning_rate": 1.0198675496688744e-05, + "loss": 0.2126, + "step": 7030 + }, + { + "epoch": 2.4538166608574414, + "grad_norm": 0.02709888108074665, + "learning_rate": 1.0184733356570233e-05, + "loss": 0.0632, + "step": 7040 + }, + { + "epoch": 2.4573021958870687, + "grad_norm": 0.04719749838113785, + "learning_rate": 1.0170791216451726e-05, + "loss": 0.2281, + "step": 7050 + }, + { + "epoch": 2.460787730916696, + "grad_norm": 12.81986141204834, + "learning_rate": 1.0156849076333216e-05, + "loss": 0.2326, + "step": 7060 + }, + { + "epoch": 2.4642732659463227, + "grad_norm": 0.12026797980070114, + "learning_rate": 1.0142906936214709e-05, + "loss": 0.1655, + "step": 7070 + }, + { + "epoch": 2.46775880097595, + "grad_norm": 0.03786276653409004, + "learning_rate": 1.0128964796096201e-05, + "loss": 0.1009, + "step": 7080 + }, + { + "epoch": 2.4712443360055767, + "grad_norm": 0.12191443145275116, + "learning_rate": 1.0115022655977694e-05, + "loss": 0.1566, + "step": 7090 + }, + { + "epoch": 2.474729871035204, + "grad_norm": 3.893367290496826, + "learning_rate": 1.0101080515859185e-05, + "loss": 0.152, + "step": 7100 + }, + { + "epoch": 2.478215406064831, + "grad_norm": 0.02769341878592968, + "learning_rate": 1.0087138375740677e-05, + "loss": 0.1226, + "step": 7110 + }, + { + "epoch": 2.481700941094458, + "grad_norm": 14.743918418884277, + "learning_rate": 1.007319623562217e-05, + "loss": 0.0629, + "step": 7120 + }, + { + "epoch": 2.485186476124085, + "grad_norm": 0.024167189374566078, + "learning_rate": 1.005925409550366e-05, + "loss": 0.1118, + "step": 7130 + }, + { + "epoch": 2.488672011153712, + "grad_norm": 0.027763212099671364, + "learning_rate": 1.0045311955385153e-05, + "loss": 0.1814, + "step": 7140 + }, + { + "epoch": 2.492157546183339, + "grad_norm": 0.03643191605806351, + "learning_rate": 1.0031369815266646e-05, + "loss": 0.1689, + "step": 7150 + }, + { + "epoch": 2.4956430812129664, + "grad_norm": 9.347540855407715, + "learning_rate": 1.0017427675148136e-05, + "loss": 0.1229, + "step": 7160 + }, + { + "epoch": 2.499128616242593, + "grad_norm": 0.03700919449329376, + "learning_rate": 1.0003485535029629e-05, + "loss": 0.1254, + "step": 7170 + }, + { + "epoch": 2.5026141512722204, + "grad_norm": 0.03421923145651817, + "learning_rate": 9.98954339491112e-06, + "loss": 0.086, + "step": 7180 + }, + { + "epoch": 2.506099686301847, + "grad_norm": 9.573734283447266, + "learning_rate": 9.975601254792612e-06, + "loss": 0.2747, + "step": 7190 + }, + { + "epoch": 2.5095852213314744, + "grad_norm": 0.4348721504211426, + "learning_rate": 9.961659114674103e-06, + "loss": 0.1462, + "step": 7200 + }, + { + "epoch": 2.5130707563611017, + "grad_norm": 0.0570727176964283, + "learning_rate": 9.947716974555594e-06, + "loss": 0.3057, + "step": 7210 + }, + { + "epoch": 2.5165562913907285, + "grad_norm": 8.233874320983887, + "learning_rate": 9.933774834437086e-06, + "loss": 0.1181, + "step": 7220 + }, + { + "epoch": 2.5200418264203557, + "grad_norm": 0.04349507763981819, + "learning_rate": 9.919832694318579e-06, + "loss": 0.0152, + "step": 7230 + }, + { + "epoch": 2.5235273614499825, + "grad_norm": 0.12035762518644333, + "learning_rate": 9.90589055420007e-06, + "loss": 0.2576, + "step": 7240 + }, + { + "epoch": 2.5270128964796097, + "grad_norm": 1.1068867444992065, + "learning_rate": 9.891948414081562e-06, + "loss": 0.3256, + "step": 7250 + }, + { + "epoch": 2.530498431509237, + "grad_norm": 0.03243900090456009, + "learning_rate": 9.878006273963055e-06, + "loss": 0.1656, + "step": 7260 + }, + { + "epoch": 2.5339839665388637, + "grad_norm": 0.02715582214295864, + "learning_rate": 9.864064133844546e-06, + "loss": 0.053, + "step": 7270 + }, + { + "epoch": 2.5374695015684905, + "grad_norm": 1.750795602798462, + "learning_rate": 9.850121993726038e-06, + "loss": 0.1305, + "step": 7280 + }, + { + "epoch": 2.5409550365981177, + "grad_norm": 2.8734793663024902, + "learning_rate": 9.836179853607529e-06, + "loss": 0.019, + "step": 7290 + }, + { + "epoch": 2.544440571627745, + "grad_norm": 0.04098232463002205, + "learning_rate": 9.822237713489021e-06, + "loss": 0.0792, + "step": 7300 + }, + { + "epoch": 2.547926106657372, + "grad_norm": 0.94338059425354, + "learning_rate": 9.808295573370514e-06, + "loss": 0.0443, + "step": 7310 + }, + { + "epoch": 2.551411641686999, + "grad_norm": 0.20982791483402252, + "learning_rate": 9.794353433252005e-06, + "loss": 0.0906, + "step": 7320 + }, + { + "epoch": 2.5548971767166258, + "grad_norm": 15.172703742980957, + "learning_rate": 9.780411293133497e-06, + "loss": 0.3467, + "step": 7330 + }, + { + "epoch": 2.558382711746253, + "grad_norm": 0.26785117387771606, + "learning_rate": 9.76646915301499e-06, + "loss": 0.0437, + "step": 7340 + }, + { + "epoch": 2.56186824677588, + "grad_norm": 12.561800003051758, + "learning_rate": 9.75252701289648e-06, + "loss": 0.1028, + "step": 7350 + }, + { + "epoch": 2.565353781805507, + "grad_norm": 20.95243263244629, + "learning_rate": 9.738584872777971e-06, + "loss": 0.1875, + "step": 7360 + }, + { + "epoch": 2.5688393168351342, + "grad_norm": 0.034853193908929825, + "learning_rate": 9.724642732659464e-06, + "loss": 0.0107, + "step": 7370 + }, + { + "epoch": 2.572324851864761, + "grad_norm": 3.5008201599121094, + "learning_rate": 9.710700592540956e-06, + "loss": 0.1182, + "step": 7380 + }, + { + "epoch": 2.5758103868943882, + "grad_norm": 0.5372085571289062, + "learning_rate": 9.696758452422447e-06, + "loss": 0.0679, + "step": 7390 + }, + { + "epoch": 2.5792959219240155, + "grad_norm": 0.05770620331168175, + "learning_rate": 9.68281631230394e-06, + "loss": 0.0422, + "step": 7400 + }, + { + "epoch": 2.5827814569536423, + "grad_norm": 0.0414259098470211, + "learning_rate": 9.668874172185432e-06, + "loss": 0.0136, + "step": 7410 + }, + { + "epoch": 2.5862669919832695, + "grad_norm": 0.03852393478155136, + "learning_rate": 9.654932032066923e-06, + "loss": 0.2012, + "step": 7420 + }, + { + "epoch": 2.5897525270128963, + "grad_norm": 13.666277885437012, + "learning_rate": 9.640989891948414e-06, + "loss": 0.0595, + "step": 7430 + }, + { + "epoch": 2.5932380620425235, + "grad_norm": 0.4676874279975891, + "learning_rate": 9.627047751829906e-06, + "loss": 0.2056, + "step": 7440 + }, + { + "epoch": 2.5967235970721507, + "grad_norm": 2.7470552921295166, + "learning_rate": 9.613105611711399e-06, + "loss": 0.0593, + "step": 7450 + }, + { + "epoch": 2.6002091321017775, + "grad_norm": 0.027122966945171356, + "learning_rate": 9.59916347159289e-06, + "loss": 0.1028, + "step": 7460 + }, + { + "epoch": 2.6036946671314047, + "grad_norm": 0.020191051065921783, + "learning_rate": 9.585221331474382e-06, + "loss": 0.0137, + "step": 7470 + }, + { + "epoch": 2.6071802021610315, + "grad_norm": 5.530102729797363, + "learning_rate": 9.571279191355873e-06, + "loss": 0.0952, + "step": 7480 + }, + { + "epoch": 2.6106657371906588, + "grad_norm": 30.30436897277832, + "learning_rate": 9.557337051237366e-06, + "loss": 0.1527, + "step": 7490 + }, + { + "epoch": 2.614151272220286, + "grad_norm": 0.10788305848836899, + "learning_rate": 9.543394911118858e-06, + "loss": 0.1038, + "step": 7500 + }, + { + "epoch": 2.6176368072499128, + "grad_norm": 4.27375602722168, + "learning_rate": 9.529452771000349e-06, + "loss": 0.1816, + "step": 7510 + }, + { + "epoch": 2.62112234227954, + "grad_norm": 0.06542309373617172, + "learning_rate": 9.515510630881841e-06, + "loss": 0.1523, + "step": 7520 + }, + { + "epoch": 2.624607877309167, + "grad_norm": 13.549959182739258, + "learning_rate": 9.501568490763334e-06, + "loss": 0.208, + "step": 7530 + }, + { + "epoch": 2.628093412338794, + "grad_norm": 15.343768119812012, + "learning_rate": 9.487626350644825e-06, + "loss": 0.2428, + "step": 7540 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.05019600689411163, + "learning_rate": 9.473684210526315e-06, + "loss": 0.249, + "step": 7550 + }, + { + "epoch": 2.635064482398048, + "grad_norm": 13.942766189575195, + "learning_rate": 9.459742070407808e-06, + "loss": 0.215, + "step": 7560 + }, + { + "epoch": 2.6385500174276753, + "grad_norm": 0.032675858587026596, + "learning_rate": 9.4457999302893e-06, + "loss": 0.133, + "step": 7570 + }, + { + "epoch": 2.642035552457302, + "grad_norm": 0.05308655649423599, + "learning_rate": 9.431857790170791e-06, + "loss": 0.218, + "step": 7580 + }, + { + "epoch": 2.6455210874869293, + "grad_norm": 0.03882508724927902, + "learning_rate": 9.417915650052284e-06, + "loss": 0.1013, + "step": 7590 + }, + { + "epoch": 2.6490066225165565, + "grad_norm": 0.08975423872470856, + "learning_rate": 9.403973509933776e-06, + "loss": 0.3583, + "step": 7600 + }, + { + "epoch": 2.6524921575461833, + "grad_norm": 5.848064422607422, + "learning_rate": 9.390031369815267e-06, + "loss": 0.117, + "step": 7610 + }, + { + "epoch": 2.6559776925758105, + "grad_norm": 9.604795455932617, + "learning_rate": 9.376089229696758e-06, + "loss": 0.2501, + "step": 7620 + }, + { + "epoch": 2.6594632276054373, + "grad_norm": 0.13165131211280823, + "learning_rate": 9.36214708957825e-06, + "loss": 0.1344, + "step": 7630 + }, + { + "epoch": 2.6629487626350645, + "grad_norm": 0.2645893692970276, + "learning_rate": 9.348204949459743e-06, + "loss": 0.0126, + "step": 7640 + }, + { + "epoch": 2.6664342976646918, + "grad_norm": 0.19416283071041107, + "learning_rate": 9.334262809341234e-06, + "loss": 0.0361, + "step": 7650 + }, + { + "epoch": 2.6699198326943185, + "grad_norm": 0.15585561096668243, + "learning_rate": 9.320320669222726e-06, + "loss": 0.0768, + "step": 7660 + }, + { + "epoch": 2.6734053677239458, + "grad_norm": 0.03315681591629982, + "learning_rate": 9.306378529104219e-06, + "loss": 0.1021, + "step": 7670 + }, + { + "epoch": 2.6768909027535726, + "grad_norm": 0.26877591013908386, + "learning_rate": 9.292436388985711e-06, + "loss": 0.1073, + "step": 7680 + }, + { + "epoch": 2.6803764377832, + "grad_norm": 0.08659540116786957, + "learning_rate": 9.278494248867202e-06, + "loss": 0.1891, + "step": 7690 + }, + { + "epoch": 2.683861972812827, + "grad_norm": 21.811100006103516, + "learning_rate": 9.264552108748693e-06, + "loss": 0.2154, + "step": 7700 + }, + { + "epoch": 2.687347507842454, + "grad_norm": 0.033906009048223495, + "learning_rate": 9.250609968630185e-06, + "loss": 0.2231, + "step": 7710 + }, + { + "epoch": 2.690833042872081, + "grad_norm": 0.03380822762846947, + "learning_rate": 9.236667828511678e-06, + "loss": 0.2515, + "step": 7720 + }, + { + "epoch": 2.694318577901708, + "grad_norm": 0.03534507378935814, + "learning_rate": 9.222725688393169e-06, + "loss": 0.1303, + "step": 7730 + }, + { + "epoch": 2.697804112931335, + "grad_norm": 0.03269047290086746, + "learning_rate": 9.208783548274661e-06, + "loss": 0.0908, + "step": 7740 + }, + { + "epoch": 2.7012896479609623, + "grad_norm": 5.287631988525391, + "learning_rate": 9.194841408156154e-06, + "loss": 0.1089, + "step": 7750 + }, + { + "epoch": 2.704775182990589, + "grad_norm": 0.03315020725131035, + "learning_rate": 9.180899268037645e-06, + "loss": 0.0785, + "step": 7760 + }, + { + "epoch": 2.708260718020216, + "grad_norm": 0.04839996621012688, + "learning_rate": 9.166957127919135e-06, + "loss": 0.1913, + "step": 7770 + }, + { + "epoch": 2.711746253049843, + "grad_norm": 17.74064826965332, + "learning_rate": 9.153014987800628e-06, + "loss": 0.3599, + "step": 7780 + }, + { + "epoch": 2.7152317880794703, + "grad_norm": 0.03409993648529053, + "learning_rate": 9.13907284768212e-06, + "loss": 0.0807, + "step": 7790 + }, + { + "epoch": 2.718717323109097, + "grad_norm": 15.558609008789062, + "learning_rate": 9.125130707563611e-06, + "loss": 0.2329, + "step": 7800 + }, + { + "epoch": 2.7222028581387243, + "grad_norm": 0.7072465419769287, + "learning_rate": 9.111188567445104e-06, + "loss": 0.2571, + "step": 7810 + }, + { + "epoch": 2.725688393168351, + "grad_norm": 0.0745309516787529, + "learning_rate": 9.097246427326596e-06, + "loss": 0.072, + "step": 7820 + }, + { + "epoch": 2.7291739281979783, + "grad_norm": 0.05214143171906471, + "learning_rate": 9.083304287208087e-06, + "loss": 0.3442, + "step": 7830 + }, + { + "epoch": 2.7326594632276056, + "grad_norm": 0.5700211524963379, + "learning_rate": 9.069362147089578e-06, + "loss": 0.1222, + "step": 7840 + }, + { + "epoch": 2.7361449982572323, + "grad_norm": 0.032385822385549545, + "learning_rate": 9.05542000697107e-06, + "loss": 0.0603, + "step": 7850 + }, + { + "epoch": 2.7396305332868596, + "grad_norm": 1.3482292890548706, + "learning_rate": 9.041477866852563e-06, + "loss": 0.018, + "step": 7860 + }, + { + "epoch": 2.7431160683164864, + "grad_norm": 0.031450305134058, + "learning_rate": 9.027535726734055e-06, + "loss": 0.2208, + "step": 7870 + }, + { + "epoch": 2.7466016033461136, + "grad_norm": 17.35287857055664, + "learning_rate": 9.013593586615546e-06, + "loss": 0.1161, + "step": 7880 + }, + { + "epoch": 2.750087138375741, + "grad_norm": 0.044247403740882874, + "learning_rate": 8.999651446497037e-06, + "loss": 0.1503, + "step": 7890 + }, + { + "epoch": 2.7535726734053676, + "grad_norm": 0.03640054911375046, + "learning_rate": 8.98570930637853e-06, + "loss": 0.1018, + "step": 7900 + }, + { + "epoch": 2.757058208434995, + "grad_norm": 2.4188730716705322, + "learning_rate": 8.971767166260022e-06, + "loss": 0.2335, + "step": 7910 + }, + { + "epoch": 2.7605437434646216, + "grad_norm": 1.5793354511260986, + "learning_rate": 8.957825026141513e-06, + "loss": 0.0794, + "step": 7920 + }, + { + "epoch": 2.764029278494249, + "grad_norm": 7.847117900848389, + "learning_rate": 8.943882886023005e-06, + "loss": 0.1338, + "step": 7930 + }, + { + "epoch": 2.767514813523876, + "grad_norm": 0.0489552803337574, + "learning_rate": 8.929940745904498e-06, + "loss": 0.2494, + "step": 7940 + }, + { + "epoch": 2.771000348553503, + "grad_norm": 0.023390300571918488, + "learning_rate": 8.915998605785989e-06, + "loss": 0.2496, + "step": 7950 + }, + { + "epoch": 2.77448588358313, + "grad_norm": 0.07795246690511703, + "learning_rate": 8.90205646566748e-06, + "loss": 0.1616, + "step": 7960 + }, + { + "epoch": 2.777971418612757, + "grad_norm": 6.65225076675415, + "learning_rate": 8.888114325548972e-06, + "loss": 0.2364, + "step": 7970 + }, + { + "epoch": 2.781456953642384, + "grad_norm": 1.1383771896362305, + "learning_rate": 8.874172185430465e-06, + "loss": 0.2328, + "step": 7980 + }, + { + "epoch": 2.7849424886720113, + "grad_norm": 0.02891557849943638, + "learning_rate": 8.860230045311955e-06, + "loss": 0.0924, + "step": 7990 + }, + { + "epoch": 2.788428023701638, + "grad_norm": 0.2302275449037552, + "learning_rate": 8.846287905193448e-06, + "loss": 0.1521, + "step": 8000 + }, + { + "epoch": 2.7919135587312653, + "grad_norm": 0.022319387644529343, + "learning_rate": 8.83234576507494e-06, + "loss": 0.2744, + "step": 8010 + }, + { + "epoch": 2.795399093760892, + "grad_norm": 10.700783729553223, + "learning_rate": 8.818403624956431e-06, + "loss": 0.113, + "step": 8020 + }, + { + "epoch": 2.7988846287905194, + "grad_norm": 8.70779800415039, + "learning_rate": 8.804461484837922e-06, + "loss": 0.0871, + "step": 8030 + }, + { + "epoch": 2.8023701638201466, + "grad_norm": 21.17486000061035, + "learning_rate": 8.790519344719415e-06, + "loss": 0.1093, + "step": 8040 + }, + { + "epoch": 2.8058556988497734, + "grad_norm": 0.03342998027801514, + "learning_rate": 8.776577204600907e-06, + "loss": 0.0455, + "step": 8050 + }, + { + "epoch": 2.8093412338794006, + "grad_norm": 0.0786479264497757, + "learning_rate": 8.7626350644824e-06, + "loss": 0.1769, + "step": 8060 + }, + { + "epoch": 2.8128267689090274, + "grad_norm": 0.03312867507338524, + "learning_rate": 8.74869292436389e-06, + "loss": 0.0294, + "step": 8070 + }, + { + "epoch": 2.8163123039386546, + "grad_norm": 0.3976738154888153, + "learning_rate": 8.734750784245383e-06, + "loss": 0.0484, + "step": 8080 + }, + { + "epoch": 2.819797838968282, + "grad_norm": 0.04097994789481163, + "learning_rate": 8.720808644126875e-06, + "loss": 0.2901, + "step": 8090 + }, + { + "epoch": 2.8232833739979086, + "grad_norm": 0.03706745803356171, + "learning_rate": 8.706866504008366e-06, + "loss": 0.1388, + "step": 8100 + }, + { + "epoch": 2.826768909027536, + "grad_norm": 2.765982151031494, + "learning_rate": 8.692924363889857e-06, + "loss": 0.2213, + "step": 8110 + }, + { + "epoch": 2.8302544440571626, + "grad_norm": 0.11387185752391815, + "learning_rate": 8.67898222377135e-06, + "loss": 0.3626, + "step": 8120 + }, + { + "epoch": 2.83373997908679, + "grad_norm": 10.69524097442627, + "learning_rate": 8.665040083652842e-06, + "loss": 0.1166, + "step": 8130 + }, + { + "epoch": 2.837225514116417, + "grad_norm": 0.020516468212008476, + "learning_rate": 8.651097943534333e-06, + "loss": 0.1093, + "step": 8140 + }, + { + "epoch": 2.840711049146044, + "grad_norm": 0.24759964644908905, + "learning_rate": 8.637155803415825e-06, + "loss": 0.2226, + "step": 8150 + }, + { + "epoch": 2.844196584175671, + "grad_norm": 0.030304880812764168, + "learning_rate": 8.623213663297318e-06, + "loss": 0.1346, + "step": 8160 + }, + { + "epoch": 2.847682119205298, + "grad_norm": 2.6233885288238525, + "learning_rate": 8.609271523178809e-06, + "loss": 0.175, + "step": 8170 + }, + { + "epoch": 2.851167654234925, + "grad_norm": 2.439249038696289, + "learning_rate": 8.5953293830603e-06, + "loss": 0.0595, + "step": 8180 + }, + { + "epoch": 2.8546531892645524, + "grad_norm": 0.10265187174081802, + "learning_rate": 8.581387242941792e-06, + "loss": 0.0761, + "step": 8190 + }, + { + "epoch": 2.858138724294179, + "grad_norm": 17.636390686035156, + "learning_rate": 8.567445102823285e-06, + "loss": 0.2557, + "step": 8200 + }, + { + "epoch": 2.861624259323806, + "grad_norm": 0.16799131035804749, + "learning_rate": 8.553502962704775e-06, + "loss": 0.0318, + "step": 8210 + }, + { + "epoch": 2.865109794353433, + "grad_norm": 0.3623441755771637, + "learning_rate": 8.539560822586268e-06, + "loss": 0.0818, + "step": 8220 + }, + { + "epoch": 2.8685953293830604, + "grad_norm": 6.526904106140137, + "learning_rate": 8.52561868246776e-06, + "loss": 0.0823, + "step": 8230 + }, + { + "epoch": 2.8720808644126876, + "grad_norm": 3.7460978031158447, + "learning_rate": 8.511676542349251e-06, + "loss": 0.2567, + "step": 8240 + }, + { + "epoch": 2.8755663994423144, + "grad_norm": 0.03773918002843857, + "learning_rate": 8.497734402230744e-06, + "loss": 0.0277, + "step": 8250 + }, + { + "epoch": 2.879051934471941, + "grad_norm": 0.03944707289338112, + "learning_rate": 8.483792262112234e-06, + "loss": 0.0547, + "step": 8260 + }, + { + "epoch": 2.8825374695015684, + "grad_norm": 0.16772694885730743, + "learning_rate": 8.469850121993727e-06, + "loss": 0.0881, + "step": 8270 + }, + { + "epoch": 2.8860230045311956, + "grad_norm": 7.099332809448242, + "learning_rate": 8.45590798187522e-06, + "loss": 0.1931, + "step": 8280 + }, + { + "epoch": 2.8895085395608224, + "grad_norm": 1.7227249145507812, + "learning_rate": 8.44196584175671e-06, + "loss": 0.1587, + "step": 8290 + }, + { + "epoch": 2.8929940745904497, + "grad_norm": 0.22683005034923553, + "learning_rate": 8.428023701638201e-06, + "loss": 0.0558, + "step": 8300 + }, + { + "epoch": 2.8964796096200764, + "grad_norm": 0.030454624444246292, + "learning_rate": 8.414081561519694e-06, + "loss": 0.1197, + "step": 8310 + }, + { + "epoch": 2.8999651446497037, + "grad_norm": 0.07277271151542664, + "learning_rate": 8.400139421401186e-06, + "loss": 0.0851, + "step": 8320 + }, + { + "epoch": 2.903450679679331, + "grad_norm": 6.2670063972473145, + "learning_rate": 8.386197281282677e-06, + "loss": 0.0101, + "step": 8330 + }, + { + "epoch": 2.9069362147089577, + "grad_norm": 25.918701171875, + "learning_rate": 8.37225514116417e-06, + "loss": 0.2227, + "step": 8340 + }, + { + "epoch": 2.910421749738585, + "grad_norm": 0.02563529834151268, + "learning_rate": 8.358313001045662e-06, + "loss": 0.2458, + "step": 8350 + }, + { + "epoch": 2.9139072847682117, + "grad_norm": 0.12701044976711273, + "learning_rate": 8.344370860927153e-06, + "loss": 0.0929, + "step": 8360 + }, + { + "epoch": 2.917392819797839, + "grad_norm": 0.0416131317615509, + "learning_rate": 8.330428720808644e-06, + "loss": 0.1182, + "step": 8370 + }, + { + "epoch": 2.920878354827466, + "grad_norm": 20.982830047607422, + "learning_rate": 8.316486580690136e-06, + "loss": 0.2912, + "step": 8380 + }, + { + "epoch": 2.924363889857093, + "grad_norm": 0.04384233430027962, + "learning_rate": 8.302544440571629e-06, + "loss": 0.0101, + "step": 8390 + }, + { + "epoch": 2.92784942488672, + "grad_norm": 0.35305264592170715, + "learning_rate": 8.28860230045312e-06, + "loss": 0.1257, + "step": 8400 + }, + { + "epoch": 2.931334959916347, + "grad_norm": 0.6124236583709717, + "learning_rate": 8.274660160334612e-06, + "loss": 0.0244, + "step": 8410 + }, + { + "epoch": 2.934820494945974, + "grad_norm": 21.486831665039062, + "learning_rate": 8.260718020216104e-06, + "loss": 0.1246, + "step": 8420 + }, + { + "epoch": 2.9383060299756014, + "grad_norm": 0.04249065741896629, + "learning_rate": 8.246775880097595e-06, + "loss": 0.1105, + "step": 8430 + }, + { + "epoch": 2.941791565005228, + "grad_norm": 22.740060806274414, + "learning_rate": 8.232833739979088e-06, + "loss": 0.3444, + "step": 8440 + }, + { + "epoch": 2.9452771000348554, + "grad_norm": 0.03325342759490013, + "learning_rate": 8.218891599860579e-06, + "loss": 0.0593, + "step": 8450 + }, + { + "epoch": 2.948762635064482, + "grad_norm": 0.06233534216880798, + "learning_rate": 8.204949459742071e-06, + "loss": 0.0664, + "step": 8460 + }, + { + "epoch": 2.9522481700941094, + "grad_norm": 0.13591769337654114, + "learning_rate": 8.191007319623564e-06, + "loss": 0.2097, + "step": 8470 + }, + { + "epoch": 2.9557337051237367, + "grad_norm": 0.034143611788749695, + "learning_rate": 8.177065179505054e-06, + "loss": 0.0226, + "step": 8480 + }, + { + "epoch": 2.9592192401533635, + "grad_norm": 0.04051746428012848, + "learning_rate": 8.163123039386547e-06, + "loss": 0.1239, + "step": 8490 + }, + { + "epoch": 2.9627047751829907, + "grad_norm": 0.17615370452404022, + "learning_rate": 8.14918089926804e-06, + "loss": 0.0881, + "step": 8500 + }, + { + "epoch": 2.9661903102126175, + "grad_norm": 10.80210018157959, + "learning_rate": 8.13523875914953e-06, + "loss": 0.1604, + "step": 8510 + }, + { + "epoch": 2.9696758452422447, + "grad_norm": 0.04547272250056267, + "learning_rate": 8.121296619031021e-06, + "loss": 0.1647, + "step": 8520 + }, + { + "epoch": 2.973161380271872, + "grad_norm": 0.026660796254873276, + "learning_rate": 8.107354478912514e-06, + "loss": 0.0634, + "step": 8530 + }, + { + "epoch": 2.9766469153014987, + "grad_norm": 0.02724742889404297, + "learning_rate": 8.093412338794006e-06, + "loss": 0.0997, + "step": 8540 + }, + { + "epoch": 2.980132450331126, + "grad_norm": 0.02272176183760166, + "learning_rate": 8.079470198675497e-06, + "loss": 0.1778, + "step": 8550 + }, + { + "epoch": 2.9836179853607527, + "grad_norm": 0.02710247039794922, + "learning_rate": 8.06552805855699e-06, + "loss": 0.1714, + "step": 8560 + }, + { + "epoch": 2.98710352039038, + "grad_norm": 3.23300838470459, + "learning_rate": 8.051585918438482e-06, + "loss": 0.1514, + "step": 8570 + }, + { + "epoch": 2.990589055420007, + "grad_norm": 19.48651123046875, + "learning_rate": 8.037643778319973e-06, + "loss": 0.101, + "step": 8580 + }, + { + "epoch": 2.994074590449634, + "grad_norm": 15.483784675598145, + "learning_rate": 8.023701638201464e-06, + "loss": 0.072, + "step": 8590 + }, + { + "epoch": 2.997560125479261, + "grad_norm": 13.562616348266602, + "learning_rate": 8.009759498082956e-06, + "loss": 0.0941, + "step": 8600 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9861728395061728, + "eval_loss": 0.05898062884807587, + "eval_runtime": 18.9157, + "eval_samples_per_second": 214.108, + "eval_steps_per_second": 26.803, + "step": 8607 + }, + { + "epoch": 3.001045660508888, + "grad_norm": 0.05343186855316162, + "learning_rate": 7.995817357964449e-06, + "loss": 0.1422, + "step": 8610 + }, + { + "epoch": 3.004531195538515, + "grad_norm": 24.563711166381836, + "learning_rate": 7.98187521784594e-06, + "loss": 0.0213, + "step": 8620 + }, + { + "epoch": 3.008016730568142, + "grad_norm": 0.026707723736763, + "learning_rate": 7.967933077727432e-06, + "loss": 0.031, + "step": 8630 + }, + { + "epoch": 3.0115022655977692, + "grad_norm": 0.02573820762336254, + "learning_rate": 7.953990937608924e-06, + "loss": 0.0848, + "step": 8640 + }, + { + "epoch": 3.0149878006273965, + "grad_norm": 0.020314253866672516, + "learning_rate": 7.940048797490415e-06, + "loss": 0.1811, + "step": 8650 + }, + { + "epoch": 3.0184733356570232, + "grad_norm": 0.0360274612903595, + "learning_rate": 7.926106657371908e-06, + "loss": 0.0079, + "step": 8660 + }, + { + "epoch": 3.0219588706866505, + "grad_norm": 0.34295937418937683, + "learning_rate": 7.912164517253399e-06, + "loss": 0.141, + "step": 8670 + }, + { + "epoch": 3.0254444057162773, + "grad_norm": 0.383859783411026, + "learning_rate": 7.898222377134891e-06, + "loss": 0.1501, + "step": 8680 + }, + { + "epoch": 3.0289299407459045, + "grad_norm": 0.06587964296340942, + "learning_rate": 7.884280237016384e-06, + "loss": 0.095, + "step": 8690 + }, + { + "epoch": 3.0324154757755317, + "grad_norm": 0.041697971522808075, + "learning_rate": 7.870338096897874e-06, + "loss": 0.1642, + "step": 8700 + }, + { + "epoch": 3.0359010108051585, + "grad_norm": 28.092723846435547, + "learning_rate": 7.856395956779365e-06, + "loss": 0.1282, + "step": 8710 + }, + { + "epoch": 3.0393865458347857, + "grad_norm": 1.7253819704055786, + "learning_rate": 7.842453816660858e-06, + "loss": 0.0581, + "step": 8720 + }, + { + "epoch": 3.0428720808644125, + "grad_norm": 1.214120626449585, + "learning_rate": 7.82851167654235e-06, + "loss": 0.0404, + "step": 8730 + }, + { + "epoch": 3.0463576158940397, + "grad_norm": 18.530441284179688, + "learning_rate": 7.814569536423841e-06, + "loss": 0.1388, + "step": 8740 + }, + { + "epoch": 3.049843150923667, + "grad_norm": 0.030280515551567078, + "learning_rate": 7.800627396305334e-06, + "loss": 0.0956, + "step": 8750 + }, + { + "epoch": 3.0533286859532938, + "grad_norm": 0.02037186734378338, + "learning_rate": 7.786685256186826e-06, + "loss": 0.126, + "step": 8760 + }, + { + "epoch": 3.056814220982921, + "grad_norm": 33.261940002441406, + "learning_rate": 7.772743116068317e-06, + "loss": 0.1255, + "step": 8770 + }, + { + "epoch": 3.060299756012548, + "grad_norm": 22.391355514526367, + "learning_rate": 7.758800975949808e-06, + "loss": 0.1261, + "step": 8780 + }, + { + "epoch": 3.063785291042175, + "grad_norm": 0.017767876386642456, + "learning_rate": 7.7448588358313e-06, + "loss": 0.1211, + "step": 8790 + }, + { + "epoch": 3.0672708260718022, + "grad_norm": 5.44157600402832, + "learning_rate": 7.730916695712793e-06, + "loss": 0.1787, + "step": 8800 + }, + { + "epoch": 3.070756361101429, + "grad_norm": 0.02152939699590206, + "learning_rate": 7.716974555594284e-06, + "loss": 0.0867, + "step": 8810 + }, + { + "epoch": 3.0742418961310563, + "grad_norm": 29.05510711669922, + "learning_rate": 7.703032415475776e-06, + "loss": 0.2069, + "step": 8820 + }, + { + "epoch": 3.077727431160683, + "grad_norm": 0.029673922806978226, + "learning_rate": 7.689090275357269e-06, + "loss": 0.1669, + "step": 8830 + }, + { + "epoch": 3.0812129661903103, + "grad_norm": 0.02840520814061165, + "learning_rate": 7.675148135238761e-06, + "loss": 0.2332, + "step": 8840 + }, + { + "epoch": 3.084698501219937, + "grad_norm": 0.2850615978240967, + "learning_rate": 7.661205995120252e-06, + "loss": 0.146, + "step": 8850 + }, + { + "epoch": 3.0881840362495643, + "grad_norm": 3.1782281398773193, + "learning_rate": 7.647263855001743e-06, + "loss": 0.0449, + "step": 8860 + }, + { + "epoch": 3.0916695712791915, + "grad_norm": 2.359084367752075, + "learning_rate": 7.633321714883235e-06, + "loss": 0.0291, + "step": 8870 + }, + { + "epoch": 3.0951551063088183, + "grad_norm": 0.0349331870675087, + "learning_rate": 7.619379574764727e-06, + "loss": 0.1953, + "step": 8880 + }, + { + "epoch": 3.0986406413384455, + "grad_norm": 0.03900326043367386, + "learning_rate": 7.605437434646219e-06, + "loss": 0.0101, + "step": 8890 + }, + { + "epoch": 3.1021261763680723, + "grad_norm": 0.05492232367396355, + "learning_rate": 7.591495294527711e-06, + "loss": 0.024, + "step": 8900 + }, + { + "epoch": 3.1056117113976995, + "grad_norm": 0.024204595014452934, + "learning_rate": 7.577553154409203e-06, + "loss": 0.2914, + "step": 8910 + }, + { + "epoch": 3.1090972464273268, + "grad_norm": 0.21245832741260529, + "learning_rate": 7.5636110142906935e-06, + "loss": 0.1574, + "step": 8920 + }, + { + "epoch": 3.1125827814569536, + "grad_norm": 4.7999587059021, + "learning_rate": 7.549668874172186e-06, + "loss": 0.1669, + "step": 8930 + }, + { + "epoch": 3.116068316486581, + "grad_norm": 0.0458659753203392, + "learning_rate": 7.535726734053678e-06, + "loss": 0.2493, + "step": 8940 + }, + { + "epoch": 3.1195538515162076, + "grad_norm": 0.025298912078142166, + "learning_rate": 7.521784593935169e-06, + "loss": 0.0541, + "step": 8950 + }, + { + "epoch": 3.123039386545835, + "grad_norm": 1.2849235534667969, + "learning_rate": 7.507842453816662e-06, + "loss": 0.0486, + "step": 8960 + }, + { + "epoch": 3.126524921575462, + "grad_norm": 0.03680930659174919, + "learning_rate": 7.4939003136981535e-06, + "loss": 0.1282, + "step": 8970 + }, + { + "epoch": 3.130010456605089, + "grad_norm": 13.066097259521484, + "learning_rate": 7.479958173579646e-06, + "loss": 0.2534, + "step": 8980 + }, + { + "epoch": 3.133495991634716, + "grad_norm": 0.02675449103116989, + "learning_rate": 7.466016033461137e-06, + "loss": 0.0103, + "step": 8990 + }, + { + "epoch": 3.136981526664343, + "grad_norm": 0.038677338510751724, + "learning_rate": 7.4520738933426285e-06, + "loss": 0.0742, + "step": 9000 + }, + { + "epoch": 3.14046706169397, + "grad_norm": 0.026775427162647247, + "learning_rate": 7.43813175322412e-06, + "loss": 0.0929, + "step": 9010 + }, + { + "epoch": 3.1439525967235973, + "grad_norm": 0.04959922656416893, + "learning_rate": 7.424189613105613e-06, + "loss": 0.171, + "step": 9020 + }, + { + "epoch": 3.147438131753224, + "grad_norm": 0.018528427928686142, + "learning_rate": 7.410247472987104e-06, + "loss": 0.1509, + "step": 9030 + }, + { + "epoch": 3.1509236667828513, + "grad_norm": 0.02553379535675049, + "learning_rate": 7.396305332868596e-06, + "loss": 0.0405, + "step": 9040 + }, + { + "epoch": 3.154409201812478, + "grad_norm": 0.045496754348278046, + "learning_rate": 7.3823631927500885e-06, + "loss": 0.0465, + "step": 9050 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 18.267221450805664, + "learning_rate": 7.368421052631579e-06, + "loss": 0.1788, + "step": 9060 + }, + { + "epoch": 3.1613802718717325, + "grad_norm": 0.037292297929525375, + "learning_rate": 7.354478912513071e-06, + "loss": 0.1418, + "step": 9070 + }, + { + "epoch": 3.1648658069013593, + "grad_norm": 1.9102615118026733, + "learning_rate": 7.3405367723945635e-06, + "loss": 0.0699, + "step": 9080 + }, + { + "epoch": 3.1683513419309866, + "grad_norm": 47.18280029296875, + "learning_rate": 7.326594632276055e-06, + "loss": 0.1691, + "step": 9090 + }, + { + "epoch": 3.1718368769606133, + "grad_norm": 0.13229234516620636, + "learning_rate": 7.312652492157547e-06, + "loss": 0.1772, + "step": 9100 + }, + { + "epoch": 3.1753224119902406, + "grad_norm": 2.6047537326812744, + "learning_rate": 7.298710352039039e-06, + "loss": 0.07, + "step": 9110 + }, + { + "epoch": 3.1788079470198674, + "grad_norm": 0.02106287144124508, + "learning_rate": 7.28476821192053e-06, + "loss": 0.1055, + "step": 9120 + }, + { + "epoch": 3.1822934820494946, + "grad_norm": 10.96900749206543, + "learning_rate": 7.270826071802022e-06, + "loss": 0.1797, + "step": 9130 + }, + { + "epoch": 3.185779017079122, + "grad_norm": 0.1008637398481369, + "learning_rate": 7.256883931683513e-06, + "loss": 0.0178, + "step": 9140 + }, + { + "epoch": 3.1892645521087486, + "grad_norm": 0.0249412152916193, + "learning_rate": 7.242941791565006e-06, + "loss": 0.3967, + "step": 9150 + }, + { + "epoch": 3.192750087138376, + "grad_norm": 0.025104615837335587, + "learning_rate": 7.228999651446498e-06, + "loss": 0.0881, + "step": 9160 + }, + { + "epoch": 3.1962356221680026, + "grad_norm": 0.02654813602566719, + "learning_rate": 7.21505751132799e-06, + "loss": 0.1262, + "step": 9170 + }, + { + "epoch": 3.19972115719763, + "grad_norm": 0.03677063435316086, + "learning_rate": 7.201115371209482e-06, + "loss": 0.0244, + "step": 9180 + }, + { + "epoch": 3.203206692227257, + "grad_norm": 4.220332145690918, + "learning_rate": 7.187173231090973e-06, + "loss": 0.1071, + "step": 9190 + }, + { + "epoch": 3.206692227256884, + "grad_norm": 0.019408810883760452, + "learning_rate": 7.173231090972464e-06, + "loss": 0.1627, + "step": 9200 + }, + { + "epoch": 3.210177762286511, + "grad_norm": 1.1298311948776245, + "learning_rate": 7.159288950853957e-06, + "loss": 0.1347, + "step": 9210 + }, + { + "epoch": 3.213663297316138, + "grad_norm": 0.04483231157064438, + "learning_rate": 7.145346810735448e-06, + "loss": 0.2057, + "step": 9220 + }, + { + "epoch": 3.217148832345765, + "grad_norm": 0.020672811195254326, + "learning_rate": 7.13140467061694e-06, + "loss": 0.0078, + "step": 9230 + }, + { + "epoch": 3.2206343673753923, + "grad_norm": 0.022838251665234566, + "learning_rate": 7.117462530498433e-06, + "loss": 0.1516, + "step": 9240 + }, + { + "epoch": 3.224119902405019, + "grad_norm": 0.037012387067079544, + "learning_rate": 7.103520390379924e-06, + "loss": 0.0569, + "step": 9250 + }, + { + "epoch": 3.2276054374346463, + "grad_norm": 0.2272426038980484, + "learning_rate": 7.089578250261415e-06, + "loss": 0.0739, + "step": 9260 + }, + { + "epoch": 3.231090972464273, + "grad_norm": 26.441532135009766, + "learning_rate": 7.0756361101429076e-06, + "loss": 0.0436, + "step": 9270 + }, + { + "epoch": 3.2345765074939004, + "grad_norm": 19.132356643676758, + "learning_rate": 7.061693970024399e-06, + "loss": 0.181, + "step": 9280 + }, + { + "epoch": 3.238062042523527, + "grad_norm": 0.4390275776386261, + "learning_rate": 7.047751829905891e-06, + "loss": 0.18, + "step": 9290 + }, + { + "epoch": 3.2415475775531544, + "grad_norm": 0.013922056183218956, + "learning_rate": 7.033809689787383e-06, + "loss": 0.148, + "step": 9300 + }, + { + "epoch": 3.2450331125827816, + "grad_norm": 0.01766609586775303, + "learning_rate": 7.019867549668875e-06, + "loss": 0.1277, + "step": 9310 + }, + { + "epoch": 3.2485186476124084, + "grad_norm": 0.03631417080760002, + "learning_rate": 7.005925409550367e-06, + "loss": 0.0666, + "step": 9320 + }, + { + "epoch": 3.2520041826420356, + "grad_norm": 0.022327765822410583, + "learning_rate": 6.9919832694318575e-06, + "loss": 0.1104, + "step": 9330 + }, + { + "epoch": 3.2554897176716624, + "grad_norm": 28.965906143188477, + "learning_rate": 6.97804112931335e-06, + "loss": 0.1194, + "step": 9340 + }, + { + "epoch": 3.2589752527012896, + "grad_norm": 0.016817284747958183, + "learning_rate": 6.964098989194842e-06, + "loss": 0.105, + "step": 9350 + }, + { + "epoch": 3.262460787730917, + "grad_norm": 19.65138816833496, + "learning_rate": 6.950156849076334e-06, + "loss": 0.1736, + "step": 9360 + }, + { + "epoch": 3.2659463227605436, + "grad_norm": 0.1805201917886734, + "learning_rate": 6.936214708957826e-06, + "loss": 0.2812, + "step": 9370 + }, + { + "epoch": 3.269431857790171, + "grad_norm": 3.660529613494873, + "learning_rate": 6.9222725688393175e-06, + "loss": 0.1089, + "step": 9380 + }, + { + "epoch": 3.2729173928197977, + "grad_norm": 18.988616943359375, + "learning_rate": 6.90833042872081e-06, + "loss": 0.2735, + "step": 9390 + }, + { + "epoch": 3.276402927849425, + "grad_norm": 0.05193915590643883, + "learning_rate": 6.894388288602301e-06, + "loss": 0.3342, + "step": 9400 + }, + { + "epoch": 3.279888462879052, + "grad_norm": 0.2501579225063324, + "learning_rate": 6.8804461484837925e-06, + "loss": 0.0079, + "step": 9410 + }, + { + "epoch": 3.283373997908679, + "grad_norm": 0.035554543137550354, + "learning_rate": 6.866504008365284e-06, + "loss": 0.0192, + "step": 9420 + }, + { + "epoch": 3.286859532938306, + "grad_norm": 0.020876824855804443, + "learning_rate": 6.852561868246777e-06, + "loss": 0.1817, + "step": 9430 + }, + { + "epoch": 3.290345067967933, + "grad_norm": 3.084765672683716, + "learning_rate": 6.838619728128268e-06, + "loss": 0.044, + "step": 9440 + }, + { + "epoch": 3.29383060299756, + "grad_norm": 18.532669067382812, + "learning_rate": 6.82467758800976e-06, + "loss": 0.272, + "step": 9450 + }, + { + "epoch": 3.2973161380271874, + "grad_norm": 5.328271865844727, + "learning_rate": 6.8107354478912525e-06, + "loss": 0.1708, + "step": 9460 + }, + { + "epoch": 3.300801673056814, + "grad_norm": 16.922470092773438, + "learning_rate": 6.796793307772743e-06, + "loss": 0.1577, + "step": 9470 + }, + { + "epoch": 3.3042872080864414, + "grad_norm": 20.75090980529785, + "learning_rate": 6.782851167654235e-06, + "loss": 0.2414, + "step": 9480 + }, + { + "epoch": 3.307772743116068, + "grad_norm": 0.03418176993727684, + "learning_rate": 6.7689090275357275e-06, + "loss": 0.1943, + "step": 9490 + }, + { + "epoch": 3.3112582781456954, + "grad_norm": 0.4410526156425476, + "learning_rate": 6.754966887417219e-06, + "loss": 0.0739, + "step": 9500 + }, + { + "epoch": 3.3147438131753226, + "grad_norm": 0.02994650788605213, + "learning_rate": 6.741024747298711e-06, + "loss": 0.187, + "step": 9510 + }, + { + "epoch": 3.3182293482049494, + "grad_norm": 6.901801109313965, + "learning_rate": 6.727082607180203e-06, + "loss": 0.1074, + "step": 9520 + }, + { + "epoch": 3.3217148832345766, + "grad_norm": 24.88802719116211, + "learning_rate": 6.713140467061694e-06, + "loss": 0.1535, + "step": 9530 + }, + { + "epoch": 3.3252004182642034, + "grad_norm": 0.2463180273771286, + "learning_rate": 6.699198326943186e-06, + "loss": 0.1021, + "step": 9540 + }, + { + "epoch": 3.3286859532938307, + "grad_norm": 0.09169216454029083, + "learning_rate": 6.685256186824678e-06, + "loss": 0.0811, + "step": 9550 + }, + { + "epoch": 3.332171488323458, + "grad_norm": 2.2960045337677, + "learning_rate": 6.67131404670617e-06, + "loss": 0.1068, + "step": 9560 + }, + { + "epoch": 3.3356570233530847, + "grad_norm": 0.021253207698464394, + "learning_rate": 6.657371906587662e-06, + "loss": 0.1687, + "step": 9570 + }, + { + "epoch": 3.339142558382712, + "grad_norm": 28.67542266845703, + "learning_rate": 6.643429766469154e-06, + "loss": 0.1184, + "step": 9580 + }, + { + "epoch": 3.3426280934123387, + "grad_norm": 0.02517896145582199, + "learning_rate": 6.629487626350646e-06, + "loss": 0.1933, + "step": 9590 + }, + { + "epoch": 3.346113628441966, + "grad_norm": 0.04085422307252884, + "learning_rate": 6.615545486232137e-06, + "loss": 0.0423, + "step": 9600 + }, + { + "epoch": 3.3495991634715927, + "grad_norm": 1.9252256155014038, + "learning_rate": 6.601603346113628e-06, + "loss": 0.1851, + "step": 9610 + }, + { + "epoch": 3.35308469850122, + "grad_norm": 15.485836029052734, + "learning_rate": 6.587661205995121e-06, + "loss": 0.2882, + "step": 9620 + }, + { + "epoch": 3.356570233530847, + "grad_norm": 7.36222505569458, + "learning_rate": 6.5737190658766125e-06, + "loss": 0.256, + "step": 9630 + }, + { + "epoch": 3.360055768560474, + "grad_norm": 0.07581675052642822, + "learning_rate": 6.559776925758104e-06, + "loss": 0.1117, + "step": 9640 + }, + { + "epoch": 3.363541303590101, + "grad_norm": 1.1792480945587158, + "learning_rate": 6.545834785639597e-06, + "loss": 0.2192, + "step": 9650 + }, + { + "epoch": 3.367026838619728, + "grad_norm": 0.04260709509253502, + "learning_rate": 6.531892645521088e-06, + "loss": 0.0455, + "step": 9660 + }, + { + "epoch": 3.370512373649355, + "grad_norm": 0.02434307336807251, + "learning_rate": 6.517950505402579e-06, + "loss": 0.0985, + "step": 9670 + }, + { + "epoch": 3.3739979086789824, + "grad_norm": 80.342529296875, + "learning_rate": 6.504008365284072e-06, + "loss": 0.0361, + "step": 9680 + }, + { + "epoch": 3.377483443708609, + "grad_norm": 0.01803007163107395, + "learning_rate": 6.490066225165563e-06, + "loss": 0.07, + "step": 9690 + }, + { + "epoch": 3.3809689787382364, + "grad_norm": 0.06161894649267197, + "learning_rate": 6.476124085047055e-06, + "loss": 0.1747, + "step": 9700 + }, + { + "epoch": 3.384454513767863, + "grad_norm": 0.02253262884914875, + "learning_rate": 6.4621819449285475e-06, + "loss": 0.0444, + "step": 9710 + }, + { + "epoch": 3.3879400487974904, + "grad_norm": 0.027096986770629883, + "learning_rate": 6.448239804810039e-06, + "loss": 0.1927, + "step": 9720 + }, + { + "epoch": 3.3914255838271172, + "grad_norm": 0.42232781648635864, + "learning_rate": 6.434297664691531e-06, + "loss": 0.1841, + "step": 9730 + }, + { + "epoch": 3.3949111188567445, + "grad_norm": 0.01901787333190441, + "learning_rate": 6.4203555245730224e-06, + "loss": 0.135, + "step": 9740 + }, + { + "epoch": 3.3983966538863717, + "grad_norm": 4.167501926422119, + "learning_rate": 6.406413384454514e-06, + "loss": 0.0409, + "step": 9750 + }, + { + "epoch": 3.4018821889159985, + "grad_norm": 17.263132095336914, + "learning_rate": 6.392471244336006e-06, + "loss": 0.1366, + "step": 9760 + }, + { + "epoch": 3.4053677239456257, + "grad_norm": 4.265865325927734, + "learning_rate": 6.378529104217498e-06, + "loss": 0.1339, + "step": 9770 + }, + { + "epoch": 3.4088532589752525, + "grad_norm": 0.022587908431887627, + "learning_rate": 6.36458696409899e-06, + "loss": 0.1715, + "step": 9780 + }, + { + "epoch": 3.4123387940048797, + "grad_norm": 15.038971900939941, + "learning_rate": 6.350644823980482e-06, + "loss": 0.1432, + "step": 9790 + }, + { + "epoch": 3.415824329034507, + "grad_norm": 0.056241635233163834, + "learning_rate": 6.336702683861974e-06, + "loss": 0.093, + "step": 9800 + }, + { + "epoch": 3.4193098640641337, + "grad_norm": 0.08624168485403061, + "learning_rate": 6.322760543743465e-06, + "loss": 0.139, + "step": 9810 + }, + { + "epoch": 3.422795399093761, + "grad_norm": 0.025193965062499046, + "learning_rate": 6.308818403624957e-06, + "loss": 0.0069, + "step": 9820 + }, + { + "epoch": 3.4262809341233877, + "grad_norm": 12.65317440032959, + "learning_rate": 6.294876263506448e-06, + "loss": 0.1008, + "step": 9830 + }, + { + "epoch": 3.429766469153015, + "grad_norm": 3.4288766384124756, + "learning_rate": 6.280934123387941e-06, + "loss": 0.2026, + "step": 9840 + }, + { + "epoch": 3.433252004182642, + "grad_norm": 0.01608997769653797, + "learning_rate": 6.266991983269432e-06, + "loss": 0.0358, + "step": 9850 + }, + { + "epoch": 3.436737539212269, + "grad_norm": 9.135321617126465, + "learning_rate": 6.253049843150925e-06, + "loss": 0.1489, + "step": 9860 + }, + { + "epoch": 3.440223074241896, + "grad_norm": 0.02817300148308277, + "learning_rate": 6.239107703032417e-06, + "loss": 0.0225, + "step": 9870 + }, + { + "epoch": 3.443708609271523, + "grad_norm": 0.03426109254360199, + "learning_rate": 6.225165562913907e-06, + "loss": 0.0896, + "step": 9880 + }, + { + "epoch": 3.4471941443011502, + "grad_norm": 0.029791146516799927, + "learning_rate": 6.211223422795399e-06, + "loss": 0.0733, + "step": 9890 + }, + { + "epoch": 3.4506796793307775, + "grad_norm": 5.195836067199707, + "learning_rate": 6.1972812826768916e-06, + "loss": 0.0285, + "step": 9900 + }, + { + "epoch": 3.4541652143604042, + "grad_norm": 0.046240124851465225, + "learning_rate": 6.183339142558383e-06, + "loss": 0.2035, + "step": 9910 + }, + { + "epoch": 3.4576507493900315, + "grad_norm": 14.654541969299316, + "learning_rate": 6.169397002439875e-06, + "loss": 0.115, + "step": 9920 + }, + { + "epoch": 3.4611362844196583, + "grad_norm": 0.05055411905050278, + "learning_rate": 6.155454862321367e-06, + "loss": 0.0856, + "step": 9930 + }, + { + "epoch": 3.4646218194492855, + "grad_norm": 0.04881644248962402, + "learning_rate": 6.141512722202858e-06, + "loss": 0.1484, + "step": 9940 + }, + { + "epoch": 3.4681073544789127, + "grad_norm": 0.08495481312274933, + "learning_rate": 6.12757058208435e-06, + "loss": 0.0758, + "step": 9950 + }, + { + "epoch": 3.4715928895085395, + "grad_norm": 0.028945187106728554, + "learning_rate": 6.113628441965842e-06, + "loss": 0.0965, + "step": 9960 + }, + { + "epoch": 3.4750784245381667, + "grad_norm": 8.535677909851074, + "learning_rate": 6.099686301847334e-06, + "loss": 0.2292, + "step": 9970 + }, + { + "epoch": 3.4785639595677935, + "grad_norm": 7.189770221710205, + "learning_rate": 6.085744161728826e-06, + "loss": 0.155, + "step": 9980 + }, + { + "epoch": 3.4820494945974207, + "grad_norm": 0.022676438093185425, + "learning_rate": 6.071802021610318e-06, + "loss": 0.1451, + "step": 9990 + }, + { + "epoch": 3.485535029627048, + "grad_norm": 37.57761764526367, + "learning_rate": 6.05785988149181e-06, + "loss": 0.1904, + "step": 10000 + }, + { + "epoch": 3.4890205646566748, + "grad_norm": 0.05265439301729202, + "learning_rate": 6.043917741373301e-06, + "loss": 0.0894, + "step": 10010 + }, + { + "epoch": 3.492506099686302, + "grad_norm": 0.08444428443908691, + "learning_rate": 6.029975601254792e-06, + "loss": 0.1031, + "step": 10020 + }, + { + "epoch": 3.4959916347159288, + "grad_norm": 0.3577669858932495, + "learning_rate": 6.016033461136285e-06, + "loss": 0.0539, + "step": 10030 + }, + { + "epoch": 3.499477169745556, + "grad_norm": 0.026337046176195145, + "learning_rate": 6.0020913210177765e-06, + "loss": 0.1267, + "step": 10040 + }, + { + "epoch": 3.5029627047751832, + "grad_norm": 4.85567569732666, + "learning_rate": 5.988149180899269e-06, + "loss": 0.0094, + "step": 10050 + }, + { + "epoch": 3.50644823980481, + "grad_norm": 1.0068359375, + "learning_rate": 5.974207040780761e-06, + "loss": 0.1011, + "step": 10060 + }, + { + "epoch": 3.5099337748344372, + "grad_norm": 0.03198888525366783, + "learning_rate": 5.960264900662252e-06, + "loss": 0.1203, + "step": 10070 + }, + { + "epoch": 3.513419309864064, + "grad_norm": 0.08028540015220642, + "learning_rate": 5.946322760543743e-06, + "loss": 0.073, + "step": 10080 + }, + { + "epoch": 3.5169048448936913, + "grad_norm": 6.395030975341797, + "learning_rate": 5.932380620425236e-06, + "loss": 0.1022, + "step": 10090 + }, + { + "epoch": 3.5203903799233185, + "grad_norm": 0.9497456550598145, + "learning_rate": 5.918438480306727e-06, + "loss": 0.133, + "step": 10100 + }, + { + "epoch": 3.5238759149529453, + "grad_norm": 0.02326410636305809, + "learning_rate": 5.904496340188219e-06, + "loss": 0.218, + "step": 10110 + }, + { + "epoch": 3.527361449982572, + "grad_norm": 0.12932069599628448, + "learning_rate": 5.8905542000697115e-06, + "loss": 0.0913, + "step": 10120 + }, + { + "epoch": 3.5308469850121993, + "grad_norm": 0.023471644148230553, + "learning_rate": 5.876612059951203e-06, + "loss": 0.1593, + "step": 10130 + }, + { + "epoch": 3.5343325200418265, + "grad_norm": 11.825116157531738, + "learning_rate": 5.862669919832696e-06, + "loss": 0.0478, + "step": 10140 + }, + { + "epoch": 3.5378180550714533, + "grad_norm": 4.660243034362793, + "learning_rate": 5.8487277797141865e-06, + "loss": 0.0862, + "step": 10150 + }, + { + "epoch": 3.5413035901010805, + "grad_norm": 0.017631346359848976, + "learning_rate": 5.834785639595678e-06, + "loss": 0.072, + "step": 10160 + }, + { + "epoch": 3.5447891251307073, + "grad_norm": 0.02127777598798275, + "learning_rate": 5.82084349947717e-06, + "loss": 0.0143, + "step": 10170 + }, + { + "epoch": 3.5482746601603345, + "grad_norm": 12.403838157653809, + "learning_rate": 5.806901359358662e-06, + "loss": 0.194, + "step": 10180 + }, + { + "epoch": 3.5517601951899618, + "grad_norm": 0.01695849932730198, + "learning_rate": 5.792959219240154e-06, + "loss": 0.0156, + "step": 10190 + }, + { + "epoch": 3.5552457302195886, + "grad_norm": 0.06215062364935875, + "learning_rate": 5.779017079121646e-06, + "loss": 0.3112, + "step": 10200 + }, + { + "epoch": 3.558731265249216, + "grad_norm": 0.015378969721496105, + "learning_rate": 5.765074939003138e-06, + "loss": 0.0328, + "step": 10210 + }, + { + "epoch": 3.5622168002788426, + "grad_norm": 0.5997032523155212, + "learning_rate": 5.751132798884629e-06, + "loss": 0.0973, + "step": 10220 + }, + { + "epoch": 3.56570233530847, + "grad_norm": 0.17242105305194855, + "learning_rate": 5.737190658766121e-06, + "loss": 0.0697, + "step": 10230 + }, + { + "epoch": 3.569187870338097, + "grad_norm": 0.028457053005695343, + "learning_rate": 5.723248518647613e-06, + "loss": 0.2034, + "step": 10240 + }, + { + "epoch": 3.572673405367724, + "grad_norm": 51.637733459472656, + "learning_rate": 5.709306378529105e-06, + "loss": 0.3527, + "step": 10250 + }, + { + "epoch": 3.576158940397351, + "grad_norm": 0.02154484950006008, + "learning_rate": 5.6953642384105965e-06, + "loss": 0.0558, + "step": 10260 + }, + { + "epoch": 3.579644475426978, + "grad_norm": 0.02288723737001419, + "learning_rate": 5.681422098292089e-06, + "loss": 0.1087, + "step": 10270 + }, + { + "epoch": 3.583130010456605, + "grad_norm": 0.02297695353627205, + "learning_rate": 5.667479958173581e-06, + "loss": 0.0692, + "step": 10280 + }, + { + "epoch": 3.5866155454862323, + "grad_norm": 0.0711924359202385, + "learning_rate": 5.6535378180550715e-06, + "loss": 0.065, + "step": 10290 + }, + { + "epoch": 3.590101080515859, + "grad_norm": 17.65730857849121, + "learning_rate": 5.639595677936563e-06, + "loss": 0.2072, + "step": 10300 + }, + { + "epoch": 3.5935866155454863, + "grad_norm": 6.287166118621826, + "learning_rate": 5.625653537818056e-06, + "loss": 0.3501, + "step": 10310 + }, + { + "epoch": 3.597072150575113, + "grad_norm": 0.05497564375400543, + "learning_rate": 5.611711397699547e-06, + "loss": 0.1217, + "step": 10320 + }, + { + "epoch": 3.6005576856047403, + "grad_norm": 0.2877632677555084, + "learning_rate": 5.59776925758104e-06, + "loss": 0.1098, + "step": 10330 + }, + { + "epoch": 3.6040432206343676, + "grad_norm": 0.033131808042526245, + "learning_rate": 5.5838271174625315e-06, + "loss": 0.0095, + "step": 10340 + }, + { + "epoch": 3.6075287556639943, + "grad_norm": 0.057142358273267746, + "learning_rate": 5.569884977344022e-06, + "loss": 0.1609, + "step": 10350 + }, + { + "epoch": 3.6110142906936216, + "grad_norm": 3.4275271892547607, + "learning_rate": 5.555942837225514e-06, + "loss": 0.1044, + "step": 10360 + }, + { + "epoch": 3.6144998257232483, + "grad_norm": 0.03326639533042908, + "learning_rate": 5.5420006971070064e-06, + "loss": 0.0441, + "step": 10370 + }, + { + "epoch": 3.6179853607528756, + "grad_norm": 0.018506675958633423, + "learning_rate": 5.528058556988498e-06, + "loss": 0.1586, + "step": 10380 + }, + { + "epoch": 3.621470895782503, + "grad_norm": 0.05169185996055603, + "learning_rate": 5.51411641686999e-06, + "loss": 0.0747, + "step": 10390 + }, + { + "epoch": 3.6249564308121296, + "grad_norm": 0.017710169777274132, + "learning_rate": 5.500174276751482e-06, + "loss": 0.0171, + "step": 10400 + }, + { + "epoch": 3.628441965841757, + "grad_norm": 20.366230010986328, + "learning_rate": 5.486232136632974e-06, + "loss": 0.4428, + "step": 10410 + }, + { + "epoch": 3.6319275008713836, + "grad_norm": 0.12488202750682831, + "learning_rate": 5.472289996514465e-06, + "loss": 0.2049, + "step": 10420 + }, + { + "epoch": 3.635413035901011, + "grad_norm": 0.027057647705078125, + "learning_rate": 5.458347856395957e-06, + "loss": 0.0281, + "step": 10430 + }, + { + "epoch": 3.638898570930638, + "grad_norm": 0.2535144090652466, + "learning_rate": 5.444405716277449e-06, + "loss": 0.009, + "step": 10440 + }, + { + "epoch": 3.642384105960265, + "grad_norm": 5.463354110717773, + "learning_rate": 5.430463576158941e-06, + "loss": 0.131, + "step": 10450 + }, + { + "epoch": 3.645869640989892, + "grad_norm": 0.021357407793402672, + "learning_rate": 5.416521436040433e-06, + "loss": 0.407, + "step": 10460 + }, + { + "epoch": 3.649355176019519, + "grad_norm": 30.37545394897461, + "learning_rate": 5.402579295921925e-06, + "loss": 0.231, + "step": 10470 + }, + { + "epoch": 3.652840711049146, + "grad_norm": 0.027980081737041473, + "learning_rate": 5.388637155803416e-06, + "loss": 0.2154, + "step": 10480 + }, + { + "epoch": 3.6563262460787733, + "grad_norm": 0.03117654100060463, + "learning_rate": 5.374695015684907e-06, + "loss": 0.1123, + "step": 10490 + }, + { + "epoch": 3.6598117811084, + "grad_norm": 0.9009264707565308, + "learning_rate": 5.3607528755664e-06, + "loss": 0.1333, + "step": 10500 + }, + { + "epoch": 3.6632973161380273, + "grad_norm": 5.609897613525391, + "learning_rate": 5.346810735447891e-06, + "loss": 0.1772, + "step": 10510 + }, + { + "epoch": 3.666782851167654, + "grad_norm": 0.05884939059615135, + "learning_rate": 5.332868595329384e-06, + "loss": 0.0984, + "step": 10520 + }, + { + "epoch": 3.6702683861972814, + "grad_norm": 1.9585541486740112, + "learning_rate": 5.318926455210876e-06, + "loss": 0.0611, + "step": 10530 + }, + { + "epoch": 3.6737539212269086, + "grad_norm": 1.0173242092132568, + "learning_rate": 5.304984315092367e-06, + "loss": 0.2067, + "step": 10540 + }, + { + "epoch": 3.6772394562565354, + "grad_norm": 0.15780754387378693, + "learning_rate": 5.29104217497386e-06, + "loss": 0.1282, + "step": 10550 + }, + { + "epoch": 3.680724991286162, + "grad_norm": 27.38100814819336, + "learning_rate": 5.2771000348553506e-06, + "loss": 0.2614, + "step": 10560 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.04858740046620369, + "learning_rate": 5.263157894736842e-06, + "loss": 0.197, + "step": 10570 + }, + { + "epoch": 3.6876960613454166, + "grad_norm": 0.01936270296573639, + "learning_rate": 5.249215754618334e-06, + "loss": 0.1291, + "step": 10580 + }, + { + "epoch": 3.691181596375044, + "grad_norm": 0.02714901603758335, + "learning_rate": 5.235273614499826e-06, + "loss": 0.1723, + "step": 10590 + }, + { + "epoch": 3.6946671314046706, + "grad_norm": 0.02381339855492115, + "learning_rate": 5.221331474381318e-06, + "loss": 0.0174, + "step": 10600 + }, + { + "epoch": 3.6981526664342974, + "grad_norm": 0.06992737948894501, + "learning_rate": 5.20738933426281e-06, + "loss": 0.0499, + "step": 10610 + }, + { + "epoch": 3.7016382014639246, + "grad_norm": 0.4815942943096161, + "learning_rate": 5.193447194144302e-06, + "loss": 0.0657, + "step": 10620 + }, + { + "epoch": 3.705123736493552, + "grad_norm": 4.941831588745117, + "learning_rate": 5.179505054025793e-06, + "loss": 0.1945, + "step": 10630 + }, + { + "epoch": 3.7086092715231787, + "grad_norm": 4.8328166007995605, + "learning_rate": 5.165562913907285e-06, + "loss": 0.1758, + "step": 10640 + }, + { + "epoch": 3.712094806552806, + "grad_norm": 0.24197635054588318, + "learning_rate": 5.151620773788777e-06, + "loss": 0.1795, + "step": 10650 + }, + { + "epoch": 3.7155803415824327, + "grad_norm": 22.532445907592773, + "learning_rate": 5.137678633670269e-06, + "loss": 0.1157, + "step": 10660 + }, + { + "epoch": 3.71906587661206, + "grad_norm": 0.19902735948562622, + "learning_rate": 5.1237364935517605e-06, + "loss": 0.0064, + "step": 10670 + }, + { + "epoch": 3.722551411641687, + "grad_norm": 12.738110542297363, + "learning_rate": 5.109794353433253e-06, + "loss": 0.0219, + "step": 10680 + }, + { + "epoch": 3.726036946671314, + "grad_norm": 0.07453146576881409, + "learning_rate": 5.095852213314745e-06, + "loss": 0.1586, + "step": 10690 + }, + { + "epoch": 3.729522481700941, + "grad_norm": 0.022043762728571892, + "learning_rate": 5.0819100731962355e-06, + "loss": 0.0053, + "step": 10700 + }, + { + "epoch": 3.733008016730568, + "grad_norm": 15.734837532043457, + "learning_rate": 5.067967933077728e-06, + "loss": 0.2529, + "step": 10710 + }, + { + "epoch": 3.736493551760195, + "grad_norm": 0.5030373930931091, + "learning_rate": 5.05402579295922e-06, + "loss": 0.0513, + "step": 10720 + }, + { + "epoch": 3.7399790867898224, + "grad_norm": 0.10042136162519455, + "learning_rate": 5.040083652840711e-06, + "loss": 0.2097, + "step": 10730 + }, + { + "epoch": 3.743464621819449, + "grad_norm": 0.4963725507259369, + "learning_rate": 5.026141512722204e-06, + "loss": 0.0957, + "step": 10740 + }, + { + "epoch": 3.7469501568490764, + "grad_norm": 8.767095565795898, + "learning_rate": 5.0121993726036955e-06, + "loss": 0.2483, + "step": 10750 + }, + { + "epoch": 3.750435691878703, + "grad_norm": 12.400495529174805, + "learning_rate": 4.998257232485187e-06, + "loss": 0.1802, + "step": 10760 + }, + { + "epoch": 3.7539212269083304, + "grad_norm": 5.502656936645508, + "learning_rate": 4.984315092366679e-06, + "loss": 0.1525, + "step": 10770 + }, + { + "epoch": 3.7574067619379576, + "grad_norm": 0.01613186113536358, + "learning_rate": 4.9703729522481705e-06, + "loss": 0.1123, + "step": 10780 + }, + { + "epoch": 3.7608922969675844, + "grad_norm": 0.01936766505241394, + "learning_rate": 4.956430812129662e-06, + "loss": 0.0666, + "step": 10790 + }, + { + "epoch": 3.7643778319972117, + "grad_norm": 16.322816848754883, + "learning_rate": 4.942488672011154e-06, + "loss": 0.0653, + "step": 10800 + }, + { + "epoch": 3.7678633670268384, + "grad_norm": 17.28783416748047, + "learning_rate": 4.9285465318926455e-06, + "loss": 0.0617, + "step": 10810 + }, + { + "epoch": 3.7713489020564657, + "grad_norm": 0.042436350136995316, + "learning_rate": 4.914604391774138e-06, + "loss": 0.152, + "step": 10820 + }, + { + "epoch": 3.774834437086093, + "grad_norm": 0.20540641248226166, + "learning_rate": 4.90066225165563e-06, + "loss": 0.0412, + "step": 10830 + }, + { + "epoch": 3.7783199721157197, + "grad_norm": 0.211298406124115, + "learning_rate": 4.886720111537121e-06, + "loss": 0.0181, + "step": 10840 + }, + { + "epoch": 3.781805507145347, + "grad_norm": 0.09502696245908737, + "learning_rate": 4.872777971418613e-06, + "loss": 0.0132, + "step": 10850 + }, + { + "epoch": 3.7852910421749737, + "grad_norm": 0.03848971426486969, + "learning_rate": 4.858835831300105e-06, + "loss": 0.0608, + "step": 10860 + }, + { + "epoch": 3.788776577204601, + "grad_norm": 21.687044143676758, + "learning_rate": 4.844893691181597e-06, + "loss": 0.1938, + "step": 10870 + }, + { + "epoch": 3.792262112234228, + "grad_norm": 0.02101794257760048, + "learning_rate": 4.830951551063088e-06, + "loss": 0.0113, + "step": 10880 + }, + { + "epoch": 3.795747647263855, + "grad_norm": 0.036022286862134933, + "learning_rate": 4.8170094109445805e-06, + "loss": 0.164, + "step": 10890 + }, + { + "epoch": 3.799233182293482, + "grad_norm": 0.035032596439123154, + "learning_rate": 4.803067270826072e-06, + "loss": 0.0974, + "step": 10900 + }, + { + "epoch": 3.802718717323109, + "grad_norm": 8.165773391723633, + "learning_rate": 4.789125130707565e-06, + "loss": 0.0294, + "step": 10910 + }, + { + "epoch": 3.806204252352736, + "grad_norm": 0.01464312057942152, + "learning_rate": 4.7751829905890555e-06, + "loss": 0.1462, + "step": 10920 + }, + { + "epoch": 3.8096897873823634, + "grad_norm": 0.02338375523686409, + "learning_rate": 4.761240850470548e-06, + "loss": 0.1984, + "step": 10930 + }, + { + "epoch": 3.81317532241199, + "grad_norm": 0.021365277469158173, + "learning_rate": 4.74729871035204e-06, + "loss": 0.1061, + "step": 10940 + }, + { + "epoch": 3.8166608574416174, + "grad_norm": 0.01990444026887417, + "learning_rate": 4.733356570233531e-06, + "loss": 0.0306, + "step": 10950 + }, + { + "epoch": 3.820146392471244, + "grad_norm": 0.664445161819458, + "learning_rate": 4.719414430115023e-06, + "loss": 0.0078, + "step": 10960 + }, + { + "epoch": 3.8236319275008714, + "grad_norm": 2.076838254928589, + "learning_rate": 4.705472289996515e-06, + "loss": 0.0971, + "step": 10970 + }, + { + "epoch": 3.8271174625304987, + "grad_norm": 0.01907547004520893, + "learning_rate": 4.691530149878007e-06, + "loss": 0.0404, + "step": 10980 + }, + { + "epoch": 3.8306029975601255, + "grad_norm": 0.4929560720920563, + "learning_rate": 4.677588009759498e-06, + "loss": 0.0403, + "step": 10990 + }, + { + "epoch": 3.8340885325897527, + "grad_norm": 3.4360711574554443, + "learning_rate": 4.6636458696409905e-06, + "loss": 0.2111, + "step": 11000 + }, + { + "epoch": 3.8375740676193795, + "grad_norm": 3.6092143058776855, + "learning_rate": 4.649703729522482e-06, + "loss": 0.2806, + "step": 11010 + }, + { + "epoch": 3.8410596026490067, + "grad_norm": 0.01587533950805664, + "learning_rate": 4.635761589403974e-06, + "loss": 0.0803, + "step": 11020 + }, + { + "epoch": 3.844545137678634, + "grad_norm": 17.90401840209961, + "learning_rate": 4.6218194492854654e-06, + "loss": 0.1311, + "step": 11030 + }, + { + "epoch": 3.8480306727082607, + "grad_norm": 0.02171589806675911, + "learning_rate": 4.607877309166958e-06, + "loss": 0.0515, + "step": 11040 + }, + { + "epoch": 3.8515162077378875, + "grad_norm": 0.36789968609809875, + "learning_rate": 4.593935169048449e-06, + "loss": 0.1818, + "step": 11050 + }, + { + "epoch": 3.8550017427675147, + "grad_norm": 0.017076525837183, + "learning_rate": 4.579993028929941e-06, + "loss": 0.0441, + "step": 11060 + }, + { + "epoch": 3.858487277797142, + "grad_norm": 0.03205644711852074, + "learning_rate": 4.566050888811433e-06, + "loss": 0.0985, + "step": 11070 + }, + { + "epoch": 3.861972812826769, + "grad_norm": 0.031755849719047546, + "learning_rate": 4.552108748692925e-06, + "loss": 0.1107, + "step": 11080 + }, + { + "epoch": 3.865458347856396, + "grad_norm": 0.020482953637838364, + "learning_rate": 4.538166608574416e-06, + "loss": 0.1496, + "step": 11090 + }, + { + "epoch": 3.8689438828860228, + "grad_norm": 0.047831941395998, + "learning_rate": 4.524224468455909e-06, + "loss": 0.1769, + "step": 11100 + }, + { + "epoch": 3.87242941791565, + "grad_norm": 0.02326442301273346, + "learning_rate": 4.5102823283374004e-06, + "loss": 0.1952, + "step": 11110 + }, + { + "epoch": 3.875914952945277, + "grad_norm": 0.2660370469093323, + "learning_rate": 4.496340188218892e-06, + "loss": 0.0117, + "step": 11120 + }, + { + "epoch": 3.879400487974904, + "grad_norm": 0.023432690650224686, + "learning_rate": 4.482398048100384e-06, + "loss": 0.1016, + "step": 11130 + }, + { + "epoch": 3.8828860230045312, + "grad_norm": 11.599336624145508, + "learning_rate": 4.468455907981875e-06, + "loss": 0.0267, + "step": 11140 + }, + { + "epoch": 3.886371558034158, + "grad_norm": 0.030066780745983124, + "learning_rate": 4.454513767863368e-06, + "loss": 0.0146, + "step": 11150 + }, + { + "epoch": 3.8898570930637852, + "grad_norm": 0.30465713143348694, + "learning_rate": 4.440571627744859e-06, + "loss": 0.0186, + "step": 11160 + }, + { + "epoch": 3.8933426280934125, + "grad_norm": 0.04410243034362793, + "learning_rate": 4.426629487626351e-06, + "loss": 0.0072, + "step": 11170 + }, + { + "epoch": 3.8968281631230393, + "grad_norm": 39.044795989990234, + "learning_rate": 4.412687347507843e-06, + "loss": 0.1416, + "step": 11180 + }, + { + "epoch": 3.9003136981526665, + "grad_norm": 28.497705459594727, + "learning_rate": 4.3987452073893346e-06, + "loss": 0.0732, + "step": 11190 + }, + { + "epoch": 3.9037992331822933, + "grad_norm": 0.019765986129641533, + "learning_rate": 4.384803067270826e-06, + "loss": 0.0791, + "step": 11200 + }, + { + "epoch": 3.9072847682119205, + "grad_norm": 0.32468003034591675, + "learning_rate": 4.370860927152319e-06, + "loss": 0.1067, + "step": 11210 + }, + { + "epoch": 3.9107703032415477, + "grad_norm": 0.02749641239643097, + "learning_rate": 4.3569187870338096e-06, + "loss": 0.171, + "step": 11220 + }, + { + "epoch": 3.9142558382711745, + "grad_norm": 12.454182624816895, + "learning_rate": 4.342976646915302e-06, + "loss": 0.2891, + "step": 11230 + }, + { + "epoch": 3.9177413733008017, + "grad_norm": 0.03475377708673477, + "learning_rate": 4.329034506796794e-06, + "loss": 0.2393, + "step": 11240 + }, + { + "epoch": 3.9212269083304285, + "grad_norm": 0.01971781998872757, + "learning_rate": 4.315092366678285e-06, + "loss": 0.1189, + "step": 11250 + }, + { + "epoch": 3.9247124433600558, + "grad_norm": 0.7159892916679382, + "learning_rate": 4.301150226559777e-06, + "loss": 0.1387, + "step": 11260 + }, + { + "epoch": 3.928197978389683, + "grad_norm": 10.798375129699707, + "learning_rate": 4.287208086441269e-06, + "loss": 0.2423, + "step": 11270 + }, + { + "epoch": 3.9316835134193098, + "grad_norm": 1.2279161214828491, + "learning_rate": 4.273265946322761e-06, + "loss": 0.0822, + "step": 11280 + }, + { + "epoch": 3.935169048448937, + "grad_norm": 0.023021556437015533, + "learning_rate": 4.259323806204252e-06, + "loss": 0.1275, + "step": 11290 + }, + { + "epoch": 3.938654583478564, + "grad_norm": 0.041272278875112534, + "learning_rate": 4.2453816660857445e-06, + "loss": 0.0148, + "step": 11300 + }, + { + "epoch": 3.942140118508191, + "grad_norm": 3.4936373233795166, + "learning_rate": 4.231439525967236e-06, + "loss": 0.1178, + "step": 11310 + }, + { + "epoch": 3.9456256535378182, + "grad_norm": 0.033816587179899216, + "learning_rate": 4.217497385848729e-06, + "loss": 0.2568, + "step": 11320 + }, + { + "epoch": 3.949111188567445, + "grad_norm": 0.018212752416729927, + "learning_rate": 4.2035552457302195e-06, + "loss": 0.0131, + "step": 11330 + }, + { + "epoch": 3.9525967235970723, + "grad_norm": 10.551046371459961, + "learning_rate": 4.189613105611712e-06, + "loss": 0.1611, + "step": 11340 + }, + { + "epoch": 3.956082258626699, + "grad_norm": 0.018220404163002968, + "learning_rate": 4.175670965493204e-06, + "loss": 0.223, + "step": 11350 + }, + { + "epoch": 3.9595677936563263, + "grad_norm": 0.02707715705037117, + "learning_rate": 4.161728825374695e-06, + "loss": 0.0611, + "step": 11360 + }, + { + "epoch": 3.9630533286859535, + "grad_norm": 6.247076511383057, + "learning_rate": 4.147786685256187e-06, + "loss": 0.2263, + "step": 11370 + }, + { + "epoch": 3.9665388637155803, + "grad_norm": 0.013209417462348938, + "learning_rate": 4.133844545137679e-06, + "loss": 0.0905, + "step": 11380 + }, + { + "epoch": 3.9700243987452075, + "grad_norm": 0.19896091520786285, + "learning_rate": 4.119902405019171e-06, + "loss": 0.1132, + "step": 11390 + }, + { + "epoch": 3.9735099337748343, + "grad_norm": 0.01759384572505951, + "learning_rate": 4.105960264900663e-06, + "loss": 0.0424, + "step": 11400 + }, + { + "epoch": 3.9769954688044615, + "grad_norm": 0.018540671095252037, + "learning_rate": 4.0920181247821545e-06, + "loss": 0.095, + "step": 11410 + }, + { + "epoch": 3.9804810038340888, + "grad_norm": 0.018298756331205368, + "learning_rate": 4.078075984663646e-06, + "loss": 0.0239, + "step": 11420 + }, + { + "epoch": 3.9839665388637155, + "grad_norm": 0.10425955057144165, + "learning_rate": 4.064133844545138e-06, + "loss": 0.0831, + "step": 11430 + }, + { + "epoch": 3.9874520738933428, + "grad_norm": 0.29570281505584717, + "learning_rate": 4.0501917044266295e-06, + "loss": 0.0078, + "step": 11440 + }, + { + "epoch": 3.9909376089229696, + "grad_norm": 10.413164138793945, + "learning_rate": 4.036249564308122e-06, + "loss": 0.0978, + "step": 11450 + }, + { + "epoch": 3.994423143952597, + "grad_norm": 0.021612277254462242, + "learning_rate": 4.022307424189613e-06, + "loss": 0.0971, + "step": 11460 + }, + { + "epoch": 3.997908678982224, + "grad_norm": 2.315714120864868, + "learning_rate": 4.008365284071105e-06, + "loss": 0.0977, + "step": 11470 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.9906172839506173, + "eval_loss": 0.04471336677670479, + "eval_runtime": 18.8269, + "eval_samples_per_second": 215.118, + "eval_steps_per_second": 26.93, + "step": 11476 + }, + { + "epoch": 4.001394214011851, + "grad_norm": 0.014861468225717545, + "learning_rate": 3.994423143952597e-06, + "loss": 0.0664, + "step": 11480 + }, + { + "epoch": 4.004879749041478, + "grad_norm": 0.18193350732326508, + "learning_rate": 3.9804810038340895e-06, + "loss": 0.1891, + "step": 11490 + }, + { + "epoch": 4.008365284071105, + "grad_norm": 0.04888654500246048, + "learning_rate": 3.96653886371558e-06, + "loss": 0.054, + "step": 11500 + }, + { + "epoch": 4.011850819100732, + "grad_norm": 0.022148391231894493, + "learning_rate": 3.952596723597073e-06, + "loss": 0.0184, + "step": 11510 + }, + { + "epoch": 4.015336354130359, + "grad_norm": 3.5250911712646484, + "learning_rate": 3.9386545834785645e-06, + "loss": 0.0466, + "step": 11520 + }, + { + "epoch": 4.018821889159986, + "grad_norm": 8.17469596862793, + "learning_rate": 3.924712443360056e-06, + "loss": 0.1563, + "step": 11530 + }, + { + "epoch": 4.022307424189613, + "grad_norm": 0.12059949338436127, + "learning_rate": 3.910770303241548e-06, + "loss": 0.1135, + "step": 11540 + }, + { + "epoch": 4.02579295921924, + "grad_norm": 0.23522257804870605, + "learning_rate": 3.8968281631230395e-06, + "loss": 0.1346, + "step": 11550 + }, + { + "epoch": 4.029278494248867, + "grad_norm": 0.014988483861088753, + "learning_rate": 3.882886023004532e-06, + "loss": 0.0105, + "step": 11560 + }, + { + "epoch": 4.0327640292784945, + "grad_norm": 0.3500690460205078, + "learning_rate": 3.868943882886023e-06, + "loss": 0.0901, + "step": 11570 + }, + { + "epoch": 4.036249564308121, + "grad_norm": 0.01964680850505829, + "learning_rate": 3.855001742767515e-06, + "loss": 0.0047, + "step": 11580 + }, + { + "epoch": 4.039735099337748, + "grad_norm": 0.01860162802040577, + "learning_rate": 3.841059602649007e-06, + "loss": 0.0216, + "step": 11590 + }, + { + "epoch": 4.043220634367375, + "grad_norm": 31.001237869262695, + "learning_rate": 3.827117462530499e-06, + "loss": 0.1348, + "step": 11600 + }, + { + "epoch": 4.046706169397003, + "grad_norm": 1.8247324228286743, + "learning_rate": 3.8131753224119907e-06, + "loss": 0.1087, + "step": 11610 + }, + { + "epoch": 4.05019170442663, + "grad_norm": 0.9444401860237122, + "learning_rate": 3.7992331822934824e-06, + "loss": 0.1226, + "step": 11620 + }, + { + "epoch": 4.053677239456256, + "grad_norm": 0.05249320715665817, + "learning_rate": 3.785291042174974e-06, + "loss": 0.1094, + "step": 11630 + }, + { + "epoch": 4.057162774485883, + "grad_norm": 13.864377975463867, + "learning_rate": 3.7713489020564657e-06, + "loss": 0.1238, + "step": 11640 + }, + { + "epoch": 4.060648309515511, + "grad_norm": 0.03523989021778107, + "learning_rate": 3.7574067619379578e-06, + "loss": 0.0571, + "step": 11650 + }, + { + "epoch": 4.064133844545138, + "grad_norm": 22.960308074951172, + "learning_rate": 3.74346462181945e-06, + "loss": 0.2689, + "step": 11660 + }, + { + "epoch": 4.067619379574765, + "grad_norm": 0.023842979222536087, + "learning_rate": 3.729522481700941e-06, + "loss": 0.1566, + "step": 11670 + }, + { + "epoch": 4.071104914604391, + "grad_norm": 0.2230115532875061, + "learning_rate": 3.715580341582433e-06, + "loss": 0.0784, + "step": 11680 + }, + { + "epoch": 4.074590449634019, + "grad_norm": 1.818585753440857, + "learning_rate": 3.7016382014639253e-06, + "loss": 0.1562, + "step": 11690 + }, + { + "epoch": 4.078075984663646, + "grad_norm": 0.024207105860114098, + "learning_rate": 3.6876960613454165e-06, + "loss": 0.0896, + "step": 11700 + }, + { + "epoch": 4.081561519693273, + "grad_norm": 8.846477508544922, + "learning_rate": 3.6737539212269086e-06, + "loss": 0.1073, + "step": 11710 + }, + { + "epoch": 4.0850470547229, + "grad_norm": 4.648227214813232, + "learning_rate": 3.6598117811084007e-06, + "loss": 0.0199, + "step": 11720 + }, + { + "epoch": 4.088532589752527, + "grad_norm": 0.014286634512245655, + "learning_rate": 3.6458696409898923e-06, + "loss": 0.0208, + "step": 11730 + }, + { + "epoch": 4.092018124782154, + "grad_norm": 8.536304473876953, + "learning_rate": 3.631927500871384e-06, + "loss": 0.0569, + "step": 11740 + }, + { + "epoch": 4.095503659811781, + "grad_norm": 0.018616320565342903, + "learning_rate": 3.6179853607528757e-06, + "loss": 0.0343, + "step": 11750 + }, + { + "epoch": 4.098989194841408, + "grad_norm": 0.03940120339393616, + "learning_rate": 3.6040432206343678e-06, + "loss": 0.0799, + "step": 11760 + }, + { + "epoch": 4.102474729871036, + "grad_norm": 0.017780063673853874, + "learning_rate": 3.5901010805158594e-06, + "loss": 0.0922, + "step": 11770 + }, + { + "epoch": 4.105960264900662, + "grad_norm": 0.03300795704126358, + "learning_rate": 3.576158940397351e-06, + "loss": 0.0589, + "step": 11780 + }, + { + "epoch": 4.109445799930289, + "grad_norm": 37.71131896972656, + "learning_rate": 3.562216800278843e-06, + "loss": 0.144, + "step": 11790 + }, + { + "epoch": 4.112931334959916, + "grad_norm": 0.2066090852022171, + "learning_rate": 3.5482746601603352e-06, + "loss": 0.0068, + "step": 11800 + }, + { + "epoch": 4.116416869989544, + "grad_norm": 0.02809598483145237, + "learning_rate": 3.5343325200418265e-06, + "loss": 0.2141, + "step": 11810 + }, + { + "epoch": 4.119902405019171, + "grad_norm": 11.681623458862305, + "learning_rate": 3.5203903799233186e-06, + "loss": 0.1601, + "step": 11820 + }, + { + "epoch": 4.123387940048797, + "grad_norm": 0.021636666730046272, + "learning_rate": 3.5064482398048107e-06, + "loss": 0.1021, + "step": 11830 + }, + { + "epoch": 4.126873475078424, + "grad_norm": 0.01840069517493248, + "learning_rate": 3.492506099686302e-06, + "loss": 0.0054, + "step": 11840 + }, + { + "epoch": 4.130359010108052, + "grad_norm": 14.279787063598633, + "learning_rate": 3.478563959567794e-06, + "loss": 0.4595, + "step": 11850 + }, + { + "epoch": 4.133844545137679, + "grad_norm": 0.06327791512012482, + "learning_rate": 3.464621819449286e-06, + "loss": 0.0815, + "step": 11860 + }, + { + "epoch": 4.137330080167306, + "grad_norm": 7.440787315368652, + "learning_rate": 3.4506796793307773e-06, + "loss": 0.0566, + "step": 11870 + }, + { + "epoch": 4.140815615196932, + "grad_norm": 0.01611819863319397, + "learning_rate": 3.4367375392122694e-06, + "loss": 0.1947, + "step": 11880 + }, + { + "epoch": 4.14430115022656, + "grad_norm": 0.07511651515960693, + "learning_rate": 3.422795399093761e-06, + "loss": 0.2238, + "step": 11890 + }, + { + "epoch": 4.147786685256187, + "grad_norm": 0.05026530474424362, + "learning_rate": 3.408853258975253e-06, + "loss": 0.1576, + "step": 11900 + }, + { + "epoch": 4.151272220285814, + "grad_norm": 12.329663276672363, + "learning_rate": 3.394911118856745e-06, + "loss": 0.0859, + "step": 11910 + }, + { + "epoch": 4.154757755315441, + "grad_norm": 6.723945140838623, + "learning_rate": 3.3809689787382365e-06, + "loss": 0.1676, + "step": 11920 + }, + { + "epoch": 4.158243290345068, + "grad_norm": 0.024456940591335297, + "learning_rate": 3.3670268386197285e-06, + "loss": 0.1249, + "step": 11930 + }, + { + "epoch": 4.161728825374695, + "grad_norm": 0.017061809077858925, + "learning_rate": 3.3530846985012198e-06, + "loss": 0.1512, + "step": 11940 + }, + { + "epoch": 4.165214360404322, + "grad_norm": 0.03508226200938225, + "learning_rate": 3.339142558382712e-06, + "loss": 0.1291, + "step": 11950 + }, + { + "epoch": 4.168699895433949, + "grad_norm": 0.017415596172213554, + "learning_rate": 3.325200418264204e-06, + "loss": 0.1062, + "step": 11960 + }, + { + "epoch": 4.172185430463577, + "grad_norm": 4.462165355682373, + "learning_rate": 3.311258278145696e-06, + "loss": 0.0761, + "step": 11970 + }, + { + "epoch": 4.175670965493203, + "grad_norm": 0.017748460173606873, + "learning_rate": 3.2973161380271873e-06, + "loss": 0.1781, + "step": 11980 + }, + { + "epoch": 4.17915650052283, + "grad_norm": 0.04285305365920067, + "learning_rate": 3.2833739979086794e-06, + "loss": 0.0114, + "step": 11990 + }, + { + "epoch": 4.182642035552457, + "grad_norm": 13.827252388000488, + "learning_rate": 3.2694318577901714e-06, + "loss": 0.0203, + "step": 12000 + }, + { + "epoch": 4.186127570582085, + "grad_norm": 0.01608210802078247, + "learning_rate": 3.2554897176716627e-06, + "loss": 0.065, + "step": 12010 + }, + { + "epoch": 4.189613105611711, + "grad_norm": 22.72849464416504, + "learning_rate": 3.2415475775531548e-06, + "loss": 0.146, + "step": 12020 + }, + { + "epoch": 4.193098640641338, + "grad_norm": 0.013584673404693604, + "learning_rate": 3.2276054374346464e-06, + "loss": 0.0754, + "step": 12030 + }, + { + "epoch": 4.196584175670965, + "grad_norm": 0.02281971462070942, + "learning_rate": 3.213663297316138e-06, + "loss": 0.0626, + "step": 12040 + }, + { + "epoch": 4.200069710700593, + "grad_norm": 1.6404436826705933, + "learning_rate": 3.19972115719763e-06, + "loss": 0.2863, + "step": 12050 + }, + { + "epoch": 4.20355524573022, + "grad_norm": 6.147258281707764, + "learning_rate": 3.185779017079122e-06, + "loss": 0.1192, + "step": 12060 + }, + { + "epoch": 4.207040780759846, + "grad_norm": 19.095373153686523, + "learning_rate": 3.171836876960614e-06, + "loss": 0.2516, + "step": 12070 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.01998833194375038, + "learning_rate": 3.157894736842105e-06, + "loss": 0.0836, + "step": 12080 + }, + { + "epoch": 4.214011850819101, + "grad_norm": 0.02509804256260395, + "learning_rate": 3.1439525967235973e-06, + "loss": 0.1586, + "step": 12090 + }, + { + "epoch": 4.217497385848728, + "grad_norm": 0.02285916358232498, + "learning_rate": 3.1300104566050893e-06, + "loss": 0.1148, + "step": 12100 + }, + { + "epoch": 4.220982920878355, + "grad_norm": 0.017659736797213554, + "learning_rate": 3.1160683164865806e-06, + "loss": 0.0536, + "step": 12110 + }, + { + "epoch": 4.2244684559079815, + "grad_norm": 0.14031673967838287, + "learning_rate": 3.1021261763680727e-06, + "loss": 0.1591, + "step": 12120 + }, + { + "epoch": 4.227953990937609, + "grad_norm": 0.03257942944765091, + "learning_rate": 3.0881840362495647e-06, + "loss": 0.0295, + "step": 12130 + }, + { + "epoch": 4.231439525967236, + "grad_norm": 5.860164165496826, + "learning_rate": 3.0742418961310564e-06, + "loss": 0.0525, + "step": 12140 + }, + { + "epoch": 4.234925060996863, + "grad_norm": 0.023333774879574776, + "learning_rate": 3.060299756012548e-06, + "loss": 0.1362, + "step": 12150 + }, + { + "epoch": 4.23841059602649, + "grad_norm": 0.021630102768540382, + "learning_rate": 3.04635761589404e-06, + "loss": 0.306, + "step": 12160 + }, + { + "epoch": 4.241896131056117, + "grad_norm": 0.9809282422065735, + "learning_rate": 3.032415475775532e-06, + "loss": 0.0831, + "step": 12170 + }, + { + "epoch": 4.245381666085744, + "grad_norm": 0.17336943745613098, + "learning_rate": 3.0184733356570235e-06, + "loss": 0.0056, + "step": 12180 + }, + { + "epoch": 4.248867201115371, + "grad_norm": 0.020092690363526344, + "learning_rate": 3.0045311955385156e-06, + "loss": 0.023, + "step": 12190 + }, + { + "epoch": 4.252352736144998, + "grad_norm": 6.104828834533691, + "learning_rate": 2.9905890554200072e-06, + "loss": 0.1396, + "step": 12200 + }, + { + "epoch": 4.255838271174626, + "grad_norm": 0.06953276693820953, + "learning_rate": 2.9766469153014993e-06, + "loss": 0.0521, + "step": 12210 + }, + { + "epoch": 4.259323806204252, + "grad_norm": 0.06008859723806381, + "learning_rate": 2.9627047751829905e-06, + "loss": 0.0795, + "step": 12220 + }, + { + "epoch": 4.262809341233879, + "grad_norm": 0.06809389591217041, + "learning_rate": 2.9487626350644826e-06, + "loss": 0.0766, + "step": 12230 + }, + { + "epoch": 4.2662948762635065, + "grad_norm": 0.011282606981694698, + "learning_rate": 2.9348204949459747e-06, + "loss": 0.1127, + "step": 12240 + }, + { + "epoch": 4.269780411293134, + "grad_norm": 0.01990368217229843, + "learning_rate": 2.920878354827466e-06, + "loss": 0.0512, + "step": 12250 + }, + { + "epoch": 4.273265946322761, + "grad_norm": 2.730123996734619, + "learning_rate": 2.906936214708958e-06, + "loss": 0.133, + "step": 12260 + }, + { + "epoch": 4.276751481352387, + "grad_norm": 0.018478725105524063, + "learning_rate": 2.89299407459045e-06, + "loss": 0.3299, + "step": 12270 + }, + { + "epoch": 4.2802370163820145, + "grad_norm": 0.027384992688894272, + "learning_rate": 2.8790519344719414e-06, + "loss": 0.015, + "step": 12280 + }, + { + "epoch": 4.283722551411642, + "grad_norm": 0.38204970955848694, + "learning_rate": 2.8651097943534334e-06, + "loss": 0.0074, + "step": 12290 + }, + { + "epoch": 4.287208086441269, + "grad_norm": 6.249034881591797, + "learning_rate": 2.8511676542349255e-06, + "loss": 0.0899, + "step": 12300 + }, + { + "epoch": 4.290693621470896, + "grad_norm": 2.7420356273651123, + "learning_rate": 2.837225514116417e-06, + "loss": 0.0179, + "step": 12310 + }, + { + "epoch": 4.2941791565005225, + "grad_norm": 3.861992835998535, + "learning_rate": 2.823283373997909e-06, + "loss": 0.1254, + "step": 12320 + }, + { + "epoch": 4.29766469153015, + "grad_norm": 1.4951390027999878, + "learning_rate": 2.8093412338794005e-06, + "loss": 0.0504, + "step": 12330 + }, + { + "epoch": 4.301150226559777, + "grad_norm": 0.025097187608480453, + "learning_rate": 2.7953990937608926e-06, + "loss": 0.0815, + "step": 12340 + }, + { + "epoch": 4.304635761589404, + "grad_norm": 0.020737938582897186, + "learning_rate": 2.7814569536423843e-06, + "loss": 0.0526, + "step": 12350 + }, + { + "epoch": 4.308121296619031, + "grad_norm": 0.025950776413083076, + "learning_rate": 2.767514813523876e-06, + "loss": 0.0167, + "step": 12360 + }, + { + "epoch": 4.311606831648658, + "grad_norm": 0.4815838634967804, + "learning_rate": 2.753572673405368e-06, + "loss": 0.0061, + "step": 12370 + }, + { + "epoch": 4.315092366678285, + "grad_norm": 0.020343678072094917, + "learning_rate": 2.73963053328686e-06, + "loss": 0.1312, + "step": 12380 + }, + { + "epoch": 4.318577901707912, + "grad_norm": 6.27520227432251, + "learning_rate": 2.7256883931683513e-06, + "loss": 0.1138, + "step": 12390 + }, + { + "epoch": 4.3220634367375395, + "grad_norm": 19.571958541870117, + "learning_rate": 2.7117462530498434e-06, + "loss": 0.1826, + "step": 12400 + }, + { + "epoch": 4.325548971767166, + "grad_norm": 0.015069302171468735, + "learning_rate": 2.6978041129313355e-06, + "loss": 0.0538, + "step": 12410 + }, + { + "epoch": 4.329034506796793, + "grad_norm": 0.01998594030737877, + "learning_rate": 2.6838619728128267e-06, + "loss": 0.0339, + "step": 12420 + }, + { + "epoch": 4.33252004182642, + "grad_norm": 0.017047762870788574, + "learning_rate": 2.669919832694319e-06, + "loss": 0.0475, + "step": 12430 + }, + { + "epoch": 4.3360055768560475, + "grad_norm": 6.753593444824219, + "learning_rate": 2.655977692575811e-06, + "loss": 0.2493, + "step": 12440 + }, + { + "epoch": 4.339491111885675, + "grad_norm": 0.022272732108831406, + "learning_rate": 2.642035552457302e-06, + "loss": 0.1162, + "step": 12450 + }, + { + "epoch": 4.342976646915302, + "grad_norm": 0.01944439485669136, + "learning_rate": 2.6280934123387942e-06, + "loss": 0.2014, + "step": 12460 + }, + { + "epoch": 4.346462181944928, + "grad_norm": 0.020634399726986885, + "learning_rate": 2.614151272220286e-06, + "loss": 0.0043, + "step": 12470 + }, + { + "epoch": 4.3499477169745555, + "grad_norm": 0.12367178499698639, + "learning_rate": 2.600209132101778e-06, + "loss": 0.0248, + "step": 12480 + }, + { + "epoch": 4.353433252004183, + "grad_norm": 6.508563995361328, + "learning_rate": 2.5862669919832696e-06, + "loss": 0.1527, + "step": 12490 + }, + { + "epoch": 4.35691878703381, + "grad_norm": 0.014639653265476227, + "learning_rate": 2.5723248518647613e-06, + "loss": 0.0247, + "step": 12500 + }, + { + "epoch": 4.360404322063436, + "grad_norm": 0.128875732421875, + "learning_rate": 2.5583827117462534e-06, + "loss": 0.1266, + "step": 12510 + }, + { + "epoch": 4.3638898570930635, + "grad_norm": 0.028906451538205147, + "learning_rate": 2.5444405716277446e-06, + "loss": 0.1789, + "step": 12520 + }, + { + "epoch": 4.367375392122691, + "grad_norm": 0.0448245145380497, + "learning_rate": 2.5304984315092367e-06, + "loss": 0.1235, + "step": 12530 + }, + { + "epoch": 4.370860927152318, + "grad_norm": 0.015661604702472687, + "learning_rate": 2.516556291390729e-06, + "loss": 0.2039, + "step": 12540 + }, + { + "epoch": 4.374346462181945, + "grad_norm": 0.021742140874266624, + "learning_rate": 2.502614151272221e-06, + "loss": 0.0054, + "step": 12550 + }, + { + "epoch": 4.377831997211572, + "grad_norm": 68.89356994628906, + "learning_rate": 2.4886720111537126e-06, + "loss": 0.1393, + "step": 12560 + }, + { + "epoch": 4.381317532241199, + "grad_norm": 0.11168365180492401, + "learning_rate": 2.4747298710352042e-06, + "loss": 0.0047, + "step": 12570 + }, + { + "epoch": 4.384803067270826, + "grad_norm": 0.05092855915427208, + "learning_rate": 2.460787730916696e-06, + "loss": 0.1151, + "step": 12580 + }, + { + "epoch": 4.388288602300453, + "grad_norm": 0.025330761447548866, + "learning_rate": 2.446845590798188e-06, + "loss": 0.0385, + "step": 12590 + }, + { + "epoch": 4.3917741373300805, + "grad_norm": 0.034395404160022736, + "learning_rate": 2.4329034506796796e-06, + "loss": 0.1206, + "step": 12600 + }, + { + "epoch": 4.395259672359707, + "grad_norm": 10.358739852905273, + "learning_rate": 2.4189613105611713e-06, + "loss": 0.1857, + "step": 12610 + }, + { + "epoch": 4.398745207389334, + "grad_norm": 1.8648558855056763, + "learning_rate": 2.405019170442663e-06, + "loss": 0.0807, + "step": 12620 + }, + { + "epoch": 4.402230742418961, + "grad_norm": 3.7046706676483154, + "learning_rate": 2.391077030324155e-06, + "loss": 0.0273, + "step": 12630 + }, + { + "epoch": 4.4057162774485885, + "grad_norm": 0.2638864517211914, + "learning_rate": 2.3771348902056467e-06, + "loss": 0.052, + "step": 12640 + }, + { + "epoch": 4.409201812478216, + "grad_norm": 10.242549896240234, + "learning_rate": 2.3631927500871384e-06, + "loss": 0.25, + "step": 12650 + }, + { + "epoch": 4.412687347507842, + "grad_norm": 0.021201113238930702, + "learning_rate": 2.3492506099686304e-06, + "loss": 0.1335, + "step": 12660 + }, + { + "epoch": 4.416172882537469, + "grad_norm": 0.015749704092741013, + "learning_rate": 2.335308469850122e-06, + "loss": 0.2059, + "step": 12670 + }, + { + "epoch": 4.4196584175670965, + "grad_norm": 0.01748356781899929, + "learning_rate": 2.3213663297316138e-06, + "loss": 0.1123, + "step": 12680 + }, + { + "epoch": 4.423143952596724, + "grad_norm": 16.068105697631836, + "learning_rate": 2.307424189613106e-06, + "loss": 0.0576, + "step": 12690 + }, + { + "epoch": 4.426629487626351, + "grad_norm": 0.042768694460392, + "learning_rate": 2.2934820494945975e-06, + "loss": 0.0666, + "step": 12700 + }, + { + "epoch": 4.430115022655977, + "grad_norm": 6.316594123840332, + "learning_rate": 2.2795399093760896e-06, + "loss": 0.1412, + "step": 12710 + }, + { + "epoch": 4.433600557685605, + "grad_norm": 0.023556068539619446, + "learning_rate": 2.2655977692575813e-06, + "loss": 0.1011, + "step": 12720 + }, + { + "epoch": 4.437086092715232, + "grad_norm": 0.03524250537157059, + "learning_rate": 2.2516556291390733e-06, + "loss": 0.1937, + "step": 12730 + }, + { + "epoch": 4.440571627744859, + "grad_norm": 4.944858551025391, + "learning_rate": 2.237713489020565e-06, + "loss": 0.1197, + "step": 12740 + }, + { + "epoch": 4.444057162774486, + "grad_norm": 18.027040481567383, + "learning_rate": 2.2237713489020567e-06, + "loss": 0.1547, + "step": 12750 + }, + { + "epoch": 4.447542697804113, + "grad_norm": 0.22646109759807587, + "learning_rate": 2.2098292087835483e-06, + "loss": 0.1819, + "step": 12760 + }, + { + "epoch": 4.45102823283374, + "grad_norm": 1.5214154720306396, + "learning_rate": 2.1958870686650404e-06, + "loss": 0.0784, + "step": 12770 + }, + { + "epoch": 4.454513767863367, + "grad_norm": 9.586079597473145, + "learning_rate": 2.181944928546532e-06, + "loss": 0.1344, + "step": 12780 + }, + { + "epoch": 4.457999302892994, + "grad_norm": 0.03214803338050842, + "learning_rate": 2.1680027884280237e-06, + "loss": 0.0525, + "step": 12790 + }, + { + "epoch": 4.4614848379226215, + "grad_norm": 0.06377983093261719, + "learning_rate": 2.1540606483095154e-06, + "loss": 0.1088, + "step": 12800 + }, + { + "epoch": 4.464970372952248, + "grad_norm": 2.2348597049713135, + "learning_rate": 2.1401185081910075e-06, + "loss": 0.0789, + "step": 12810 + }, + { + "epoch": 4.468455907981875, + "grad_norm": 0.040668126195669174, + "learning_rate": 2.126176368072499e-06, + "loss": 0.033, + "step": 12820 + }, + { + "epoch": 4.471941443011502, + "grad_norm": 0.012590868398547173, + "learning_rate": 2.1122342279539912e-06, + "loss": 0.027, + "step": 12830 + }, + { + "epoch": 4.4754269780411295, + "grad_norm": 0.01730727031826973, + "learning_rate": 2.098292087835483e-06, + "loss": 0.0822, + "step": 12840 + }, + { + "epoch": 4.478912513070757, + "grad_norm": 0.05510111153125763, + "learning_rate": 2.084349947716975e-06, + "loss": 0.0093, + "step": 12850 + }, + { + "epoch": 4.482398048100383, + "grad_norm": 0.07657311856746674, + "learning_rate": 2.0704078075984666e-06, + "loss": 0.0442, + "step": 12860 + }, + { + "epoch": 4.48588358313001, + "grad_norm": 9.510165214538574, + "learning_rate": 2.0564656674799583e-06, + "loss": 0.1325, + "step": 12870 + }, + { + "epoch": 4.489369118159638, + "grad_norm": 0.020914927124977112, + "learning_rate": 2.0425235273614504e-06, + "loss": 0.0475, + "step": 12880 + }, + { + "epoch": 4.492854653189265, + "grad_norm": 41.944488525390625, + "learning_rate": 2.028581387242942e-06, + "loss": 0.1917, + "step": 12890 + }, + { + "epoch": 4.496340188218891, + "grad_norm": 25.84819984436035, + "learning_rate": 2.0146392471244337e-06, + "loss": 0.2525, + "step": 12900 + }, + { + "epoch": 4.499825723248518, + "grad_norm": 0.020068377256393433, + "learning_rate": 2.0006971070059254e-06, + "loss": 0.1163, + "step": 12910 + }, + { + "epoch": 4.503311258278146, + "grad_norm": 0.1646548956632614, + "learning_rate": 1.9867549668874175e-06, + "loss": 0.124, + "step": 12920 + }, + { + "epoch": 4.506796793307773, + "grad_norm": 0.013263706117868423, + "learning_rate": 1.972812826768909e-06, + "loss": 0.1735, + "step": 12930 + }, + { + "epoch": 4.5102823283374, + "grad_norm": 0.03346388414502144, + "learning_rate": 1.9588706866504008e-06, + "loss": 0.006, + "step": 12940 + }, + { + "epoch": 4.513767863367027, + "grad_norm": 0.04874487593770027, + "learning_rate": 1.944928546531893e-06, + "loss": 0.1541, + "step": 12950 + }, + { + "epoch": 4.517253398396654, + "grad_norm": 0.03367030993103981, + "learning_rate": 1.9309864064133845e-06, + "loss": 0.095, + "step": 12960 + }, + { + "epoch": 4.520738933426281, + "grad_norm": 0.015354972332715988, + "learning_rate": 1.9170442662948766e-06, + "loss": 0.2585, + "step": 12970 + }, + { + "epoch": 4.524224468455908, + "grad_norm": 17.290773391723633, + "learning_rate": 1.9031021261763683e-06, + "loss": 0.1559, + "step": 12980 + }, + { + "epoch": 4.527710003485535, + "grad_norm": 0.03650696575641632, + "learning_rate": 1.88915998605786e-06, + "loss": 0.0843, + "step": 12990 + }, + { + "epoch": 4.531195538515162, + "grad_norm": 0.8950408697128296, + "learning_rate": 1.8752178459393518e-06, + "loss": 0.0086, + "step": 13000 + }, + { + "epoch": 4.534681073544789, + "grad_norm": 0.047185130417346954, + "learning_rate": 1.8612757058208437e-06, + "loss": 0.1137, + "step": 13010 + }, + { + "epoch": 4.538166608574416, + "grad_norm": 19.941810607910156, + "learning_rate": 1.8473335657023356e-06, + "loss": 0.0534, + "step": 13020 + }, + { + "epoch": 4.541652143604043, + "grad_norm": 0.03801906108856201, + "learning_rate": 1.8333914255838272e-06, + "loss": 0.1251, + "step": 13030 + }, + { + "epoch": 4.545137678633671, + "grad_norm": 0.018815016373991966, + "learning_rate": 1.8194492854653189e-06, + "loss": 0.1215, + "step": 13040 + }, + { + "epoch": 4.548623213663298, + "grad_norm": 0.015067674219608307, + "learning_rate": 1.805507145346811e-06, + "loss": 0.1624, + "step": 13050 + }, + { + "epoch": 4.552108748692924, + "grad_norm": 0.05160703510046005, + "learning_rate": 1.7915650052283026e-06, + "loss": 0.0379, + "step": 13060 + }, + { + "epoch": 4.555594283722551, + "grad_norm": 0.9364856481552124, + "learning_rate": 1.7776228651097945e-06, + "loss": 0.0417, + "step": 13070 + }, + { + "epoch": 4.559079818752179, + "grad_norm": 0.06938211619853973, + "learning_rate": 1.7636807249912864e-06, + "loss": 0.0135, + "step": 13080 + }, + { + "epoch": 4.562565353781806, + "grad_norm": 0.019827021285891533, + "learning_rate": 1.749738584872778e-06, + "loss": 0.0508, + "step": 13090 + }, + { + "epoch": 4.566050888811432, + "grad_norm": 0.11981372535228729, + "learning_rate": 1.73579644475427e-06, + "loss": 0.057, + "step": 13100 + }, + { + "epoch": 4.569536423841059, + "grad_norm": 0.3380275368690491, + "learning_rate": 1.7218543046357616e-06, + "loss": 0.0149, + "step": 13110 + }, + { + "epoch": 4.573021958870687, + "grad_norm": 0.017787497490644455, + "learning_rate": 1.7079121645172537e-06, + "loss": 0.0651, + "step": 13120 + }, + { + "epoch": 4.576507493900314, + "grad_norm": 0.033968303352594376, + "learning_rate": 1.6939700243987453e-06, + "loss": 0.0363, + "step": 13130 + }, + { + "epoch": 4.579993028929941, + "grad_norm": 0.08648999035358429, + "learning_rate": 1.6800278842802372e-06, + "loss": 0.1379, + "step": 13140 + }, + { + "epoch": 4.583478563959567, + "grad_norm": 11.68699836730957, + "learning_rate": 1.666085744161729e-06, + "loss": 0.0815, + "step": 13150 + }, + { + "epoch": 4.586964098989195, + "grad_norm": 0.021847940981388092, + "learning_rate": 1.6521436040432207e-06, + "loss": 0.116, + "step": 13160 + }, + { + "epoch": 4.590449634018822, + "grad_norm": 0.018791014328598976, + "learning_rate": 1.6382014639247126e-06, + "loss": 0.0267, + "step": 13170 + }, + { + "epoch": 4.593935169048449, + "grad_norm": 18.658403396606445, + "learning_rate": 1.6242593238062043e-06, + "loss": 0.1086, + "step": 13180 + }, + { + "epoch": 4.597420704078076, + "grad_norm": 0.09876930713653564, + "learning_rate": 1.6103171836876963e-06, + "loss": 0.0523, + "step": 13190 + }, + { + "epoch": 4.600906239107703, + "grad_norm": 1.7560397386550903, + "learning_rate": 1.596375043569188e-06, + "loss": 0.0086, + "step": 13200 + }, + { + "epoch": 4.60439177413733, + "grad_norm": 0.05320524796843529, + "learning_rate": 1.5824329034506797e-06, + "loss": 0.0986, + "step": 13210 + }, + { + "epoch": 4.607877309166957, + "grad_norm": 0.028736671432852745, + "learning_rate": 1.5684907633321718e-06, + "loss": 0.1403, + "step": 13220 + }, + { + "epoch": 4.611362844196584, + "grad_norm": 0.021549373865127563, + "learning_rate": 1.5545486232136634e-06, + "loss": 0.0132, + "step": 13230 + }, + { + "epoch": 4.614848379226212, + "grad_norm": 0.7282363176345825, + "learning_rate": 1.5406064830951553e-06, + "loss": 0.0712, + "step": 13240 + }, + { + "epoch": 4.618333914255838, + "grad_norm": 0.08495603501796722, + "learning_rate": 1.526664342976647e-06, + "loss": 0.0062, + "step": 13250 + }, + { + "epoch": 4.621819449285465, + "grad_norm": 0.015024428255856037, + "learning_rate": 1.512722202858139e-06, + "loss": 0.0218, + "step": 13260 + }, + { + "epoch": 4.625304984315092, + "grad_norm": 0.04030692204833031, + "learning_rate": 1.4987800627396307e-06, + "loss": 0.0916, + "step": 13270 + }, + { + "epoch": 4.62879051934472, + "grad_norm": 0.018205171450972557, + "learning_rate": 1.4848379226211224e-06, + "loss": 0.0038, + "step": 13280 + }, + { + "epoch": 4.632276054374346, + "grad_norm": 1.867049217224121, + "learning_rate": 1.4708957825026142e-06, + "loss": 0.1129, + "step": 13290 + }, + { + "epoch": 4.635761589403973, + "grad_norm": 9.02762508392334, + "learning_rate": 1.456953642384106e-06, + "loss": 0.0186, + "step": 13300 + }, + { + "epoch": 4.6392471244336, + "grad_norm": 9.14907169342041, + "learning_rate": 1.443011502265598e-06, + "loss": 0.1519, + "step": 13310 + }, + { + "epoch": 4.642732659463228, + "grad_norm": 15.300692558288574, + "learning_rate": 1.4290693621470896e-06, + "loss": 0.0941, + "step": 13320 + }, + { + "epoch": 4.646218194492855, + "grad_norm": 0.02297472208738327, + "learning_rate": 1.4151272220285813e-06, + "loss": 0.0892, + "step": 13330 + }, + { + "epoch": 4.649703729522482, + "grad_norm": 0.013637103140354156, + "learning_rate": 1.4011850819100734e-06, + "loss": 0.1728, + "step": 13340 + }, + { + "epoch": 4.6531892645521085, + "grad_norm": 3.2457101345062256, + "learning_rate": 1.387242941791565e-06, + "loss": 0.2769, + "step": 13350 + }, + { + "epoch": 4.656674799581736, + "grad_norm": 0.019199127331376076, + "learning_rate": 1.373300801673057e-06, + "loss": 0.0128, + "step": 13360 + }, + { + "epoch": 4.660160334611363, + "grad_norm": 4.897704601287842, + "learning_rate": 1.3593586615545488e-06, + "loss": 0.0735, + "step": 13370 + }, + { + "epoch": 4.66364586964099, + "grad_norm": 3.3588168621063232, + "learning_rate": 1.3454165214360407e-06, + "loss": 0.1323, + "step": 13380 + }, + { + "epoch": 4.6671314046706165, + "grad_norm": 4.936280727386475, + "learning_rate": 1.3314743813175323e-06, + "loss": 0.0841, + "step": 13390 + }, + { + "epoch": 4.670616939700244, + "grad_norm": 3.541562080383301, + "learning_rate": 1.317532241199024e-06, + "loss": 0.0276, + "step": 13400 + }, + { + "epoch": 4.674102474729871, + "grad_norm": 0.05973378196358681, + "learning_rate": 1.303590101080516e-06, + "loss": 0.0098, + "step": 13410 + }, + { + "epoch": 4.677588009759498, + "grad_norm": 0.05543622002005577, + "learning_rate": 1.2896479609620077e-06, + "loss": 0.0983, + "step": 13420 + }, + { + "epoch": 4.681073544789125, + "grad_norm": 0.028797583654522896, + "learning_rate": 1.2757058208434996e-06, + "loss": 0.0048, + "step": 13430 + }, + { + "epoch": 4.684559079818753, + "grad_norm": 0.03675423562526703, + "learning_rate": 1.2617636807249915e-06, + "loss": 0.041, + "step": 13440 + }, + { + "epoch": 4.688044614848379, + "grad_norm": 0.04537774622440338, + "learning_rate": 1.2478215406064831e-06, + "loss": 0.0517, + "step": 13450 + }, + { + "epoch": 4.691530149878006, + "grad_norm": 0.11532427370548248, + "learning_rate": 1.233879400487975e-06, + "loss": 0.056, + "step": 13460 + }, + { + "epoch": 4.695015684907633, + "grad_norm": 0.06099528446793556, + "learning_rate": 1.2199372603694667e-06, + "loss": 0.0924, + "step": 13470 + }, + { + "epoch": 4.698501219937261, + "grad_norm": 0.050252217799425125, + "learning_rate": 1.2059951202509586e-06, + "loss": 0.1061, + "step": 13480 + }, + { + "epoch": 4.701986754966887, + "grad_norm": 0.04012449085712433, + "learning_rate": 1.1920529801324504e-06, + "loss": 0.1954, + "step": 13490 + }, + { + "epoch": 4.705472289996514, + "grad_norm": 0.016682496294379234, + "learning_rate": 1.1781108400139423e-06, + "loss": 0.1782, + "step": 13500 + }, + { + "epoch": 4.7089578250261415, + "grad_norm": 0.021084588021039963, + "learning_rate": 1.1641686998954342e-06, + "loss": 0.2397, + "step": 13510 + }, + { + "epoch": 4.712443360055769, + "grad_norm": 0.016580209136009216, + "learning_rate": 1.1502265597769258e-06, + "loss": 0.0376, + "step": 13520 + }, + { + "epoch": 4.715928895085396, + "grad_norm": 11.953326225280762, + "learning_rate": 1.1362844196584175e-06, + "loss": 0.0902, + "step": 13530 + }, + { + "epoch": 4.719414430115023, + "grad_norm": 17.264039993286133, + "learning_rate": 1.1223422795399094e-06, + "loss": 0.189, + "step": 13540 + }, + { + "epoch": 4.7228999651446495, + "grad_norm": 0.02113143354654312, + "learning_rate": 1.1084001394214012e-06, + "loss": 0.1533, + "step": 13550 + }, + { + "epoch": 4.726385500174277, + "grad_norm": 0.029918327927589417, + "learning_rate": 1.0944579993028931e-06, + "loss": 0.1908, + "step": 13560 + }, + { + "epoch": 4.729871035203904, + "grad_norm": 0.1918916404247284, + "learning_rate": 1.080515859184385e-06, + "loss": 0.1448, + "step": 13570 + }, + { + "epoch": 4.733356570233531, + "grad_norm": 3.7918004989624023, + "learning_rate": 1.0665737190658767e-06, + "loss": 0.0895, + "step": 13580 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 15.452225685119629, + "learning_rate": 1.0526315789473685e-06, + "loss": 0.154, + "step": 13590 + }, + { + "epoch": 4.740327640292785, + "grad_norm": 0.013763573952019215, + "learning_rate": 1.0386894388288602e-06, + "loss": 0.0615, + "step": 13600 + }, + { + "epoch": 4.743813175322412, + "grad_norm": 2.3458456993103027, + "learning_rate": 1.024747298710352e-06, + "loss": 0.1512, + "step": 13610 + }, + { + "epoch": 4.747298710352039, + "grad_norm": 25.5869197845459, + "learning_rate": 1.010805158591844e-06, + "loss": 0.1152, + "step": 13620 + }, + { + "epoch": 4.750784245381666, + "grad_norm": 0.01689908280968666, + "learning_rate": 9.968630184733358e-07, + "loss": 0.0267, + "step": 13630 + }, + { + "epoch": 4.754269780411293, + "grad_norm": 0.054154131561517715, + "learning_rate": 9.829208783548277e-07, + "loss": 0.0818, + "step": 13640 + }, + { + "epoch": 4.75775531544092, + "grad_norm": 0.030660415068268776, + "learning_rate": 9.689787382363193e-07, + "loss": 0.0049, + "step": 13650 + }, + { + "epoch": 4.761240850470547, + "grad_norm": 0.10165391117334366, + "learning_rate": 9.550365981178112e-07, + "loss": 0.179, + "step": 13660 + }, + { + "epoch": 4.7647263855001745, + "grad_norm": 0.038559578359127045, + "learning_rate": 9.41094457999303e-07, + "loss": 0.1079, + "step": 13670 + }, + { + "epoch": 4.768211920529802, + "grad_norm": 0.322663813829422, + "learning_rate": 9.271523178807948e-07, + "loss": 0.073, + "step": 13680 + }, + { + "epoch": 4.771697455559428, + "grad_norm": 0.013543305918574333, + "learning_rate": 9.132101777622866e-07, + "loss": 0.0798, + "step": 13690 + }, + { + "epoch": 4.775182990589055, + "grad_norm": 0.022806629538536072, + "learning_rate": 8.992680376437785e-07, + "loss": 0.056, + "step": 13700 + }, + { + "epoch": 4.7786685256186825, + "grad_norm": 0.024512002244591713, + "learning_rate": 8.853258975252702e-07, + "loss": 0.1357, + "step": 13710 + }, + { + "epoch": 4.78215406064831, + "grad_norm": 0.03469526022672653, + "learning_rate": 8.713837574067619e-07, + "loss": 0.0626, + "step": 13720 + }, + { + "epoch": 4.785639595677937, + "grad_norm": 19.904218673706055, + "learning_rate": 8.574416172882538e-07, + "loss": 0.0779, + "step": 13730 + }, + { + "epoch": 4.789125130707563, + "grad_norm": 0.01593809947371483, + "learning_rate": 8.434994771697457e-07, + "loss": 0.0538, + "step": 13740 + }, + { + "epoch": 4.7926106657371905, + "grad_norm": 0.012609903700649738, + "learning_rate": 8.295573370512374e-07, + "loss": 0.162, + "step": 13750 + }, + { + "epoch": 4.796096200766818, + "grad_norm": 0.14370125532150269, + "learning_rate": 8.156151969327293e-07, + "loss": 0.0212, + "step": 13760 + }, + { + "epoch": 4.799581735796445, + "grad_norm": 0.16198384761810303, + "learning_rate": 8.01673056814221e-07, + "loss": 0.0167, + "step": 13770 + }, + { + "epoch": 4.803067270826071, + "grad_norm": 37.06806945800781, + "learning_rate": 7.877309166957129e-07, + "loss": 0.0545, + "step": 13780 + }, + { + "epoch": 4.8065528058556986, + "grad_norm": 0.036267928779125214, + "learning_rate": 7.737887765772046e-07, + "loss": 0.1962, + "step": 13790 + }, + { + "epoch": 4.810038340885326, + "grad_norm": 0.038520101457834244, + "learning_rate": 7.598466364586965e-07, + "loss": 0.0049, + "step": 13800 + }, + { + "epoch": 4.813523875914953, + "grad_norm": 0.34361743927001953, + "learning_rate": 7.459044963401884e-07, + "loss": 0.063, + "step": 13810 + }, + { + "epoch": 4.81700941094458, + "grad_norm": 1.973721981048584, + "learning_rate": 7.3196235622168e-07, + "loss": 0.1321, + "step": 13820 + }, + { + "epoch": 4.8204949459742075, + "grad_norm": 0.058219779282808304, + "learning_rate": 7.180202161031719e-07, + "loss": 0.0716, + "step": 13830 + }, + { + "epoch": 4.823980481003834, + "grad_norm": 0.029252415522933006, + "learning_rate": 7.040780759846637e-07, + "loss": 0.11, + "step": 13840 + }, + { + "epoch": 4.827466016033461, + "grad_norm": 8.357165336608887, + "learning_rate": 6.901359358661555e-07, + "loss": 0.1614, + "step": 13850 + }, + { + "epoch": 4.830951551063088, + "grad_norm": 0.27816373109817505, + "learning_rate": 6.761937957476473e-07, + "loss": 0.1628, + "step": 13860 + }, + { + "epoch": 4.8344370860927155, + "grad_norm": 0.04683591425418854, + "learning_rate": 6.622516556291392e-07, + "loss": 0.1641, + "step": 13870 + }, + { + "epoch": 4.837922621122342, + "grad_norm": 13.681280136108398, + "learning_rate": 6.483095155106308e-07, + "loss": 0.0121, + "step": 13880 + }, + { + "epoch": 4.841408156151969, + "grad_norm": 0.020022893324494362, + "learning_rate": 6.343673753921227e-07, + "loss": 0.0361, + "step": 13890 + }, + { + "epoch": 4.844893691181596, + "grad_norm": 0.016766728833317757, + "learning_rate": 6.204252352736145e-07, + "loss": 0.0044, + "step": 13900 + }, + { + "epoch": 4.8483792262112235, + "grad_norm": 0.034691717475652695, + "learning_rate": 6.064830951551064e-07, + "loss": 0.1031, + "step": 13910 + }, + { + "epoch": 4.851864761240851, + "grad_norm": 0.28107964992523193, + "learning_rate": 5.925409550365982e-07, + "loss": 0.1587, + "step": 13920 + }, + { + "epoch": 4.855350296270478, + "grad_norm": 0.032920971512794495, + "learning_rate": 5.7859881491809e-07, + "loss": 0.1591, + "step": 13930 + }, + { + "epoch": 4.858835831300104, + "grad_norm": 0.01294040773063898, + "learning_rate": 5.646566747995818e-07, + "loss": 0.0468, + "step": 13940 + }, + { + "epoch": 4.8623213663297316, + "grad_norm": 0.06196725741028786, + "learning_rate": 5.507145346810736e-07, + "loss": 0.1613, + "step": 13950 + }, + { + "epoch": 4.865806901359359, + "grad_norm": 2.9842488765716553, + "learning_rate": 5.367723945625654e-07, + "loss": 0.0059, + "step": 13960 + }, + { + "epoch": 4.869292436388986, + "grad_norm": 0.224669948220253, + "learning_rate": 5.228302544440572e-07, + "loss": 0.0523, + "step": 13970 + }, + { + "epoch": 4.872777971418612, + "grad_norm": 2.8570218086242676, + "learning_rate": 5.08888114325549e-07, + "loss": 0.088, + "step": 13980 + }, + { + "epoch": 4.87626350644824, + "grad_norm": 0.2920582592487335, + "learning_rate": 4.949459742070408e-07, + "loss": 0.0094, + "step": 13990 + }, + { + "epoch": 4.879749041477867, + "grad_norm": 0.02076049894094467, + "learning_rate": 4.810038340885327e-07, + "loss": 0.1568, + "step": 14000 + }, + { + "epoch": 4.883234576507494, + "grad_norm": 0.015274460427463055, + "learning_rate": 4.670616939700244e-07, + "loss": 0.1205, + "step": 14010 + }, + { + "epoch": 4.886720111537121, + "grad_norm": 2.485515594482422, + "learning_rate": 4.5311955385151623e-07, + "loss": 0.1998, + "step": 14020 + }, + { + "epoch": 4.890205646566748, + "grad_norm": 0.03131948783993721, + "learning_rate": 4.3917741373300805e-07, + "loss": 0.0188, + "step": 14030 + }, + { + "epoch": 4.893691181596375, + "grad_norm": 0.014717698097229004, + "learning_rate": 4.252352736144998e-07, + "loss": 0.0876, + "step": 14040 + }, + { + "epoch": 4.897176716626002, + "grad_norm": 0.023664269596338272, + "learning_rate": 4.1129313349599164e-07, + "loss": 0.1329, + "step": 14050 + }, + { + "epoch": 4.900662251655629, + "grad_norm": 0.017263269051909447, + "learning_rate": 3.973509933774835e-07, + "loss": 0.2117, + "step": 14060 + }, + { + "epoch": 4.9041477866852565, + "grad_norm": 0.2318657636642456, + "learning_rate": 3.834088532589753e-07, + "loss": 0.0779, + "step": 14070 + }, + { + "epoch": 4.907633321714883, + "grad_norm": 0.06534525007009506, + "learning_rate": 3.694667131404671e-07, + "loss": 0.0394, + "step": 14080 + }, + { + "epoch": 4.91111885674451, + "grad_norm": 14.957361221313477, + "learning_rate": 3.555245730219589e-07, + "loss": 0.0659, + "step": 14090 + }, + { + "epoch": 4.914604391774137, + "grad_norm": 11.34598445892334, + "learning_rate": 3.415824329034507e-07, + "loss": 0.0993, + "step": 14100 + }, + { + "epoch": 4.9180899268037646, + "grad_norm": 3.656956672668457, + "learning_rate": 3.276402927849425e-07, + "loss": 0.0936, + "step": 14110 + }, + { + "epoch": 4.921575461833392, + "grad_norm": 0.048377808183431625, + "learning_rate": 3.1369815266643433e-07, + "loss": 0.1247, + "step": 14120 + }, + { + "epoch": 4.925060996863018, + "grad_norm": 21.954980850219727, + "learning_rate": 2.9975601254792615e-07, + "loss": 0.1416, + "step": 14130 + }, + { + "epoch": 4.928546531892645, + "grad_norm": 10.352675437927246, + "learning_rate": 2.858138724294179e-07, + "loss": 0.0832, + "step": 14140 + }, + { + "epoch": 4.932032066922273, + "grad_norm": 3.8607187271118164, + "learning_rate": 2.7187173231090974e-07, + "loss": 0.1472, + "step": 14150 + }, + { + "epoch": 4.9355176019519, + "grad_norm": 0.041085511445999146, + "learning_rate": 2.5792959219240156e-07, + "loss": 0.1278, + "step": 14160 + }, + { + "epoch": 4.939003136981527, + "grad_norm": 0.017858577892184258, + "learning_rate": 2.439874520738934e-07, + "loss": 0.0691, + "step": 14170 + }, + { + "epoch": 4.942488672011153, + "grad_norm": 10.322635650634766, + "learning_rate": 2.3004531195538517e-07, + "loss": 0.0753, + "step": 14180 + }, + { + "epoch": 4.945974207040781, + "grad_norm": 0.03775576502084732, + "learning_rate": 2.16103171836877e-07, + "loss": 0.022, + "step": 14190 + }, + { + "epoch": 4.949459742070408, + "grad_norm": 23.317575454711914, + "learning_rate": 2.021610317183688e-07, + "loss": 0.1228, + "step": 14200 + }, + { + "epoch": 4.952945277100035, + "grad_norm": 0.027847325429320335, + "learning_rate": 1.8821889159986058e-07, + "loss": 0.0041, + "step": 14210 + }, + { + "epoch": 4.956430812129662, + "grad_norm": 7.33128023147583, + "learning_rate": 1.7427675148135243e-07, + "loss": 0.0204, + "step": 14220 + }, + { + "epoch": 4.959916347159289, + "grad_norm": 0.49565252661705017, + "learning_rate": 1.6033461136284422e-07, + "loss": 0.0091, + "step": 14230 + }, + { + "epoch": 4.963401882188916, + "grad_norm": 0.3179240822792053, + "learning_rate": 1.4639247124433602e-07, + "loss": 0.1085, + "step": 14240 + }, + { + "epoch": 4.966887417218543, + "grad_norm": 0.015233837999403477, + "learning_rate": 1.3245033112582784e-07, + "loss": 0.0079, + "step": 14250 + }, + { + "epoch": 4.97037295224817, + "grad_norm": 4.127240180969238, + "learning_rate": 1.1850819100731964e-07, + "loss": 0.0403, + "step": 14260 + }, + { + "epoch": 4.973858487277797, + "grad_norm": 8.784873962402344, + "learning_rate": 1.0456605088881144e-07, + "loss": 0.0699, + "step": 14270 + }, + { + "epoch": 4.977344022307424, + "grad_norm": 9.374968528747559, + "learning_rate": 9.062391077030325e-08, + "loss": 0.039, + "step": 14280 + }, + { + "epoch": 4.980829557337051, + "grad_norm": 0.012251177802681923, + "learning_rate": 7.668177065179505e-08, + "loss": 0.1414, + "step": 14290 + }, + { + "epoch": 4.984315092366678, + "grad_norm": 2.2470829486846924, + "learning_rate": 6.273963053328686e-08, + "loss": 0.1129, + "step": 14300 + }, + { + "epoch": 4.987800627396306, + "grad_norm": 1.4505870342254639, + "learning_rate": 4.8797490414778674e-08, + "loss": 0.0438, + "step": 14310 + }, + { + "epoch": 4.991286162425933, + "grad_norm": 0.05611838400363922, + "learning_rate": 3.485535029627048e-08, + "loss": 0.0759, + "step": 14320 + }, + { + "epoch": 4.994771697455559, + "grad_norm": 0.01623629219830036, + "learning_rate": 2.0913210177762286e-08, + "loss": 0.0602, + "step": 14330 + }, + { + "epoch": 4.998257232485186, + "grad_norm": 0.022672150284051895, + "learning_rate": 6.971070059254096e-09, + "loss": 0.1617, + "step": 14340 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9916049382716049, + "eval_loss": 0.039273809641599655, + "eval_runtime": 19.3797, + "eval_samples_per_second": 208.981, + "eval_steps_per_second": 26.161, + "step": 14345 + }, + { + "epoch": 5.0, + "step": 14345, + "total_flos": 8.892843392498688e+18, + "train_loss": 0.1924674165219266, + "train_runtime": 1341.6959, + "train_samples_per_second": 85.526, + "train_steps_per_second": 10.692 + } + ], + "logging_steps": 10, + "max_steps": 14345, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.892843392498688e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}