{ "best_metric": 0.5319148936170213, "best_model_checkpoint": "MAE-CT-CPC-Dicotomized-v7-tricot/checkpoint-3840", "epoch": 98.00759493670886, "eval_steps": 500, "global_step": 7900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012658227848101266, "grad_norm": 3.648193836212158, "learning_rate": 1.2658227848101266e-07, "loss": 1.0945, "step": 10 }, { "epoch": 0.002531645569620253, "grad_norm": 3.2326066493988037, "learning_rate": 2.5316455696202533e-07, "loss": 1.0964, "step": 20 }, { "epoch": 0.0037974683544303796, "grad_norm": 4.472383975982666, "learning_rate": 3.79746835443038e-07, "loss": 1.1085, "step": 30 }, { "epoch": 0.005063291139240506, "grad_norm": 2.293823003768921, "learning_rate": 5.063291139240507e-07, "loss": 1.0936, "step": 40 }, { "epoch": 0.006329113924050633, "grad_norm": 5.018352508544922, "learning_rate": 6.329113924050634e-07, "loss": 1.0804, "step": 50 }, { "epoch": 0.007594936708860759, "grad_norm": 3.437401533126831, "learning_rate": 7.59493670886076e-07, "loss": 1.1052, "step": 60 }, { "epoch": 0.008860759493670886, "grad_norm": 2.633840322494507, "learning_rate": 8.860759493670887e-07, "loss": 1.1041, "step": 70 }, { "epoch": 0.010126582278481013, "grad_norm": 8.326930046081543, "learning_rate": 1.0126582278481013e-06, "loss": 1.0989, "step": 80 }, { "epoch": 0.010126582278481013, "eval_accuracy": 0.3191489361702128, "eval_loss": 1.1030582189559937, "eval_runtime": 10.2981, "eval_samples_per_second": 4.564, "eval_steps_per_second": 1.165, "step": 80 }, { "epoch": 1.0012658227848101, "grad_norm": 5.40225076675415, "learning_rate": 1.139240506329114e-06, "loss": 1.0679, "step": 90 }, { "epoch": 1.0025316455696203, "grad_norm": 2.765038013458252, "learning_rate": 1.2658227848101267e-06, "loss": 1.1039, "step": 100 }, { "epoch": 1.0037974683544304, "grad_norm": 4.477443695068359, "learning_rate": 1.3924050632911392e-06, "loss": 1.0935, "step": 110 }, { "epoch": 1.0050632911392405, "grad_norm": 4.2104573249816895, "learning_rate": 1.518987341772152e-06, "loss": 1.0694, "step": 120 }, { "epoch": 1.0063291139240507, "grad_norm": 3.7104029655456543, "learning_rate": 1.6455696202531647e-06, "loss": 1.1194, "step": 130 }, { "epoch": 1.0075949367088608, "grad_norm": 6.592172622680664, "learning_rate": 1.7721518987341774e-06, "loss": 1.1171, "step": 140 }, { "epoch": 1.008860759493671, "grad_norm": 5.555737018585205, "learning_rate": 1.8987341772151901e-06, "loss": 1.0965, "step": 150 }, { "epoch": 1.010126582278481, "grad_norm": 6.766844749450684, "learning_rate": 2.0253164556962026e-06, "loss": 1.0889, "step": 160 }, { "epoch": 1.010126582278481, "eval_accuracy": 0.3404255319148936, "eval_loss": 1.1057974100112915, "eval_runtime": 9.1496, "eval_samples_per_second": 5.137, "eval_steps_per_second": 1.312, "step": 160 }, { "epoch": 2.00126582278481, "grad_norm": 5.130347728729248, "learning_rate": 2.1518987341772153e-06, "loss": 1.0725, "step": 170 }, { "epoch": 2.0025316455696203, "grad_norm": 5.7678070068359375, "learning_rate": 2.278481012658228e-06, "loss": 1.0791, "step": 180 }, { "epoch": 2.0037974683544304, "grad_norm": 6.685475826263428, "learning_rate": 2.4050632911392408e-06, "loss": 1.0906, "step": 190 }, { "epoch": 2.0050632911392405, "grad_norm": 4.179187774658203, "learning_rate": 2.5316455696202535e-06, "loss": 1.0754, "step": 200 }, { "epoch": 2.0063291139240507, "grad_norm": 7.93744421005249, "learning_rate": 2.6582278481012658e-06, "loss": 1.0612, "step": 210 }, { "epoch": 2.007594936708861, "grad_norm": 5.547979354858398, "learning_rate": 2.7848101265822785e-06, "loss": 1.0703, "step": 220 }, { "epoch": 2.008860759493671, "grad_norm": 6.917874813079834, "learning_rate": 2.9113924050632912e-06, "loss": 1.0648, "step": 230 }, { "epoch": 2.010126582278481, "grad_norm": 14.24355697631836, "learning_rate": 3.037974683544304e-06, "loss": 1.0739, "step": 240 }, { "epoch": 2.010126582278481, "eval_accuracy": 0.40425531914893614, "eval_loss": 1.1232898235321045, "eval_runtime": 8.996, "eval_samples_per_second": 5.225, "eval_steps_per_second": 1.334, "step": 240 }, { "epoch": 3.00126582278481, "grad_norm": 11.271843910217285, "learning_rate": 3.164556962025317e-06, "loss": 1.0305, "step": 250 }, { "epoch": 3.0025316455696203, "grad_norm": 12.815051078796387, "learning_rate": 3.2911392405063294e-06, "loss": 1.0576, "step": 260 }, { "epoch": 3.0037974683544304, "grad_norm": 19.665115356445312, "learning_rate": 3.417721518987342e-06, "loss": 0.9806, "step": 270 }, { "epoch": 3.0050632911392405, "grad_norm": 11.805643081665039, "learning_rate": 3.544303797468355e-06, "loss": 1.0493, "step": 280 }, { "epoch": 3.0063291139240507, "grad_norm": 17.121118545532227, "learning_rate": 3.6708860759493675e-06, "loss": 1.0728, "step": 290 }, { "epoch": 3.007594936708861, "grad_norm": 18.008495330810547, "learning_rate": 3.7974683544303802e-06, "loss": 1.0749, "step": 300 }, { "epoch": 3.008860759493671, "grad_norm": 7.478245735168457, "learning_rate": 3.924050632911393e-06, "loss": 1.0579, "step": 310 }, { "epoch": 3.010126582278481, "grad_norm": 29.086742401123047, "learning_rate": 4.050632911392405e-06, "loss": 1.0036, "step": 320 }, { "epoch": 3.010126582278481, "eval_accuracy": 0.2765957446808511, "eval_loss": 1.1595509052276611, "eval_runtime": 9.1259, "eval_samples_per_second": 5.15, "eval_steps_per_second": 1.315, "step": 320 }, { "epoch": 4.00126582278481, "grad_norm": 14.256952285766602, "learning_rate": 4.177215189873418e-06, "loss": 1.0293, "step": 330 }, { "epoch": 4.00253164556962, "grad_norm": 14.238683700561523, "learning_rate": 4.303797468354431e-06, "loss": 0.9938, "step": 340 }, { "epoch": 4.00379746835443, "grad_norm": 15.750340461730957, "learning_rate": 4.430379746835443e-06, "loss": 1.0362, "step": 350 }, { "epoch": 4.0050632911392405, "grad_norm": 24.7191219329834, "learning_rate": 4.556962025316456e-06, "loss": 0.9665, "step": 360 }, { "epoch": 4.006329113924051, "grad_norm": 17.434118270874023, "learning_rate": 4.683544303797468e-06, "loss": 0.9927, "step": 370 }, { "epoch": 4.007594936708861, "grad_norm": 5.921336650848389, "learning_rate": 4.8101265822784815e-06, "loss": 0.9862, "step": 380 }, { "epoch": 4.008860759493671, "grad_norm": 6.861782550811768, "learning_rate": 4.936708860759495e-06, "loss": 1.1099, "step": 390 }, { "epoch": 4.010126582278481, "grad_norm": 26.024229049682617, "learning_rate": 5.063291139240507e-06, "loss": 1.0706, "step": 400 }, { "epoch": 4.010126582278481, "eval_accuracy": 0.2553191489361702, "eval_loss": 1.1730738878250122, "eval_runtime": 9.0971, "eval_samples_per_second": 5.166, "eval_steps_per_second": 1.319, "step": 400 }, { "epoch": 5.00126582278481, "grad_norm": 13.2531099319458, "learning_rate": 5.189873417721519e-06, "loss": 0.8603, "step": 410 }, { "epoch": 5.00253164556962, "grad_norm": 11.527708053588867, "learning_rate": 5.3164556962025316e-06, "loss": 0.9762, "step": 420 }, { "epoch": 5.00379746835443, "grad_norm": 25.327789306640625, "learning_rate": 5.443037974683545e-06, "loss": 0.9512, "step": 430 }, { "epoch": 5.0050632911392405, "grad_norm": 24.11504554748535, "learning_rate": 5.569620253164557e-06, "loss": 0.9437, "step": 440 }, { "epoch": 5.006329113924051, "grad_norm": 26.003135681152344, "learning_rate": 5.69620253164557e-06, "loss": 0.9311, "step": 450 }, { "epoch": 5.007594936708861, "grad_norm": 22.07634735107422, "learning_rate": 5.8227848101265824e-06, "loss": 0.9741, "step": 460 }, { "epoch": 5.008860759493671, "grad_norm": 19.476099014282227, "learning_rate": 5.949367088607595e-06, "loss": 0.9916, "step": 470 }, { "epoch": 5.010126582278481, "grad_norm": 24.048255920410156, "learning_rate": 6.075949367088608e-06, "loss": 0.9669, "step": 480 }, { "epoch": 5.010126582278481, "eval_accuracy": 0.3191489361702128, "eval_loss": 1.1227548122406006, "eval_runtime": 9.13, "eval_samples_per_second": 5.148, "eval_steps_per_second": 1.314, "step": 480 }, { "epoch": 6.00126582278481, "grad_norm": 21.775312423706055, "learning_rate": 6.20253164556962e-06, "loss": 0.8955, "step": 490 }, { "epoch": 6.00253164556962, "grad_norm": 10.735696792602539, "learning_rate": 6.329113924050634e-06, "loss": 0.9152, "step": 500 }, { "epoch": 6.00379746835443, "grad_norm": 29.428773880004883, "learning_rate": 6.4556962025316464e-06, "loss": 0.9614, "step": 510 }, { "epoch": 6.0050632911392405, "grad_norm": 21.473602294921875, "learning_rate": 6.582278481012659e-06, "loss": 0.9911, "step": 520 }, { "epoch": 6.006329113924051, "grad_norm": 22.8590087890625, "learning_rate": 6.708860759493672e-06, "loss": 0.9406, "step": 530 }, { "epoch": 6.007594936708861, "grad_norm": 25.129230499267578, "learning_rate": 6.835443037974684e-06, "loss": 0.9051, "step": 540 }, { "epoch": 6.008860759493671, "grad_norm": 34.37338638305664, "learning_rate": 6.962025316455697e-06, "loss": 0.8215, "step": 550 }, { "epoch": 6.010126582278481, "grad_norm": 33.80929946899414, "learning_rate": 7.08860759493671e-06, "loss": 1.0233, "step": 560 }, { "epoch": 6.010126582278481, "eval_accuracy": 0.40425531914893614, "eval_loss": 1.1490142345428467, "eval_runtime": 9.1478, "eval_samples_per_second": 5.138, "eval_steps_per_second": 1.312, "step": 560 }, { "epoch": 7.00126582278481, "grad_norm": 10.804941177368164, "learning_rate": 7.215189873417722e-06, "loss": 0.8237, "step": 570 }, { "epoch": 7.00253164556962, "grad_norm": 14.405462265014648, "learning_rate": 7.341772151898735e-06, "loss": 0.8341, "step": 580 }, { "epoch": 7.00379746835443, "grad_norm": 33.400726318359375, "learning_rate": 7.468354430379747e-06, "loss": 0.8029, "step": 590 }, { "epoch": 7.0050632911392405, "grad_norm": 11.047707557678223, "learning_rate": 7.5949367088607605e-06, "loss": 0.935, "step": 600 }, { "epoch": 7.006329113924051, "grad_norm": 30.89590072631836, "learning_rate": 7.721518987341773e-06, "loss": 0.901, "step": 610 }, { "epoch": 7.007594936708861, "grad_norm": 14.323598861694336, "learning_rate": 7.848101265822786e-06, "loss": 0.8399, "step": 620 }, { "epoch": 7.008860759493671, "grad_norm": 25.75128173828125, "learning_rate": 7.974683544303799e-06, "loss": 0.8836, "step": 630 }, { "epoch": 7.010126582278481, "grad_norm": 28.034568786621094, "learning_rate": 8.10126582278481e-06, "loss": 0.8492, "step": 640 }, { "epoch": 7.010126582278481, "eval_accuracy": 0.3829787234042553, "eval_loss": 1.263619303703308, "eval_runtime": 8.888, "eval_samples_per_second": 5.288, "eval_steps_per_second": 1.35, "step": 640 }, { "epoch": 8.00126582278481, "grad_norm": 27.309749603271484, "learning_rate": 8.227848101265824e-06, "loss": 0.7993, "step": 650 }, { "epoch": 8.00253164556962, "grad_norm": 23.6923770904541, "learning_rate": 8.354430379746837e-06, "loss": 0.8733, "step": 660 }, { "epoch": 8.00379746835443, "grad_norm": 27.559850692749023, "learning_rate": 8.481012658227848e-06, "loss": 0.8727, "step": 670 }, { "epoch": 8.00506329113924, "grad_norm": 13.756896018981934, "learning_rate": 8.607594936708861e-06, "loss": 0.7896, "step": 680 }, { "epoch": 8.00632911392405, "grad_norm": 17.663959503173828, "learning_rate": 8.734177215189874e-06, "loss": 0.718, "step": 690 }, { "epoch": 8.00759493670886, "grad_norm": 11.68373966217041, "learning_rate": 8.860759493670886e-06, "loss": 0.6608, "step": 700 }, { "epoch": 8.00886075949367, "grad_norm": 15.120232582092285, "learning_rate": 8.987341772151899e-06, "loss": 0.7421, "step": 710 }, { "epoch": 8.010126582278481, "grad_norm": 12.948484420776367, "learning_rate": 9.113924050632912e-06, "loss": 0.8842, "step": 720 }, { "epoch": 8.010126582278481, "eval_accuracy": 0.3617021276595745, "eval_loss": 1.4060986042022705, "eval_runtime": 8.8573, "eval_samples_per_second": 5.306, "eval_steps_per_second": 1.355, "step": 720 }, { "epoch": 9.00126582278481, "grad_norm": 17.29895782470703, "learning_rate": 9.240506329113925e-06, "loss": 0.7192, "step": 730 }, { "epoch": 9.00253164556962, "grad_norm": 16.932331085205078, "learning_rate": 9.367088607594937e-06, "loss": 0.8571, "step": 740 }, { "epoch": 9.00379746835443, "grad_norm": 27.8249454498291, "learning_rate": 9.49367088607595e-06, "loss": 0.6975, "step": 750 }, { "epoch": 9.00506329113924, "grad_norm": 19.709556579589844, "learning_rate": 9.620253164556963e-06, "loss": 0.7901, "step": 760 }, { "epoch": 9.00632911392405, "grad_norm": 27.908536911010742, "learning_rate": 9.746835443037975e-06, "loss": 0.7778, "step": 770 }, { "epoch": 9.00759493670886, "grad_norm": 11.295394897460938, "learning_rate": 9.87341772151899e-06, "loss": 0.6872, "step": 780 }, { "epoch": 9.00886075949367, "grad_norm": 19.349098205566406, "learning_rate": 1e-05, "loss": 0.7879, "step": 790 }, { "epoch": 9.010126582278481, "grad_norm": 17.75351333618164, "learning_rate": 9.985935302391e-06, "loss": 0.6599, "step": 800 }, { "epoch": 9.010126582278481, "eval_accuracy": 0.2978723404255319, "eval_loss": 1.3445005416870117, "eval_runtime": 8.6196, "eval_samples_per_second": 5.453, "eval_steps_per_second": 1.392, "step": 800 }, { "epoch": 10.00126582278481, "grad_norm": 21.275543212890625, "learning_rate": 9.971870604781998e-06, "loss": 0.6019, "step": 810 }, { "epoch": 10.00253164556962, "grad_norm": 29.977495193481445, "learning_rate": 9.957805907172996e-06, "loss": 0.724, "step": 820 }, { "epoch": 10.00379746835443, "grad_norm": 33.56300354003906, "learning_rate": 9.943741209563994e-06, "loss": 0.6457, "step": 830 }, { "epoch": 10.00506329113924, "grad_norm": 38.13019943237305, "learning_rate": 9.929676511954994e-06, "loss": 0.6331, "step": 840 }, { "epoch": 10.00632911392405, "grad_norm": 42.173423767089844, "learning_rate": 9.915611814345992e-06, "loss": 0.5996, "step": 850 }, { "epoch": 10.00759493670886, "grad_norm": 11.129090309143066, "learning_rate": 9.901547116736992e-06, "loss": 0.615, "step": 860 }, { "epoch": 10.00886075949367, "grad_norm": 37.42063903808594, "learning_rate": 9.88748241912799e-06, "loss": 0.6022, "step": 870 }, { "epoch": 10.010126582278481, "grad_norm": 55.16875457763672, "learning_rate": 9.87341772151899e-06, "loss": 0.6723, "step": 880 }, { "epoch": 10.010126582278481, "eval_accuracy": 0.3617021276595745, "eval_loss": 1.4071933031082153, "eval_runtime": 8.6355, "eval_samples_per_second": 5.443, "eval_steps_per_second": 1.39, "step": 880 }, { "epoch": 11.00126582278481, "grad_norm": 14.047639846801758, "learning_rate": 9.859353023909987e-06, "loss": 0.5122, "step": 890 }, { "epoch": 11.00253164556962, "grad_norm": 14.567192077636719, "learning_rate": 9.845288326300985e-06, "loss": 0.5763, "step": 900 }, { "epoch": 11.00379746835443, "grad_norm": 31.18760871887207, "learning_rate": 9.831223628691983e-06, "loss": 0.6611, "step": 910 }, { "epoch": 11.00506329113924, "grad_norm": 49.245513916015625, "learning_rate": 9.817158931082983e-06, "loss": 0.7129, "step": 920 }, { "epoch": 11.00632911392405, "grad_norm": 25.506393432617188, "learning_rate": 9.803094233473981e-06, "loss": 0.4678, "step": 930 }, { "epoch": 11.00759493670886, "grad_norm": 16.567678451538086, "learning_rate": 9.78902953586498e-06, "loss": 0.6464, "step": 940 }, { "epoch": 11.00886075949367, "grad_norm": 45.41640090942383, "learning_rate": 9.774964838255979e-06, "loss": 0.6556, "step": 950 }, { "epoch": 11.010126582278481, "grad_norm": 53.20558166503906, "learning_rate": 9.760900140646977e-06, "loss": 0.604, "step": 960 }, { "epoch": 11.010126582278481, "eval_accuracy": 0.3617021276595745, "eval_loss": 1.4198564291000366, "eval_runtime": 8.4854, "eval_samples_per_second": 5.539, "eval_steps_per_second": 1.414, "step": 960 }, { "epoch": 12.00126582278481, "grad_norm": 13.426566123962402, "learning_rate": 9.746835443037975e-06, "loss": 0.3598, "step": 970 }, { "epoch": 12.00253164556962, "grad_norm": 48.93745422363281, "learning_rate": 9.732770745428974e-06, "loss": 0.5107, "step": 980 }, { "epoch": 12.00379746835443, "grad_norm": 33.23870849609375, "learning_rate": 9.718706047819972e-06, "loss": 0.629, "step": 990 }, { "epoch": 12.00506329113924, "grad_norm": 75.58332061767578, "learning_rate": 9.704641350210972e-06, "loss": 0.4616, "step": 1000 }, { "epoch": 12.00632911392405, "grad_norm": 29.726964950561523, "learning_rate": 9.69057665260197e-06, "loss": 0.6597, "step": 1010 }, { "epoch": 12.00759493670886, "grad_norm": 41.4447135925293, "learning_rate": 9.67651195499297e-06, "loss": 0.67, "step": 1020 }, { "epoch": 12.00886075949367, "grad_norm": 59.76002502441406, "learning_rate": 9.662447257383967e-06, "loss": 0.6902, "step": 1030 }, { "epoch": 12.010126582278481, "grad_norm": 25.5214786529541, "learning_rate": 9.648382559774965e-06, "loss": 0.4959, "step": 1040 }, { "epoch": 12.010126582278481, "eval_accuracy": 0.3617021276595745, "eval_loss": 1.5688742399215698, "eval_runtime": 8.4949, "eval_samples_per_second": 5.533, "eval_steps_per_second": 1.413, "step": 1040 }, { "epoch": 13.00126582278481, "grad_norm": 17.342782974243164, "learning_rate": 9.634317862165963e-06, "loss": 0.4443, "step": 1050 }, { "epoch": 13.00253164556962, "grad_norm": 6.651524066925049, "learning_rate": 9.620253164556963e-06, "loss": 0.4626, "step": 1060 }, { "epoch": 13.00379746835443, "grad_norm": 59.05470275878906, "learning_rate": 9.606188466947961e-06, "loss": 0.5051, "step": 1070 }, { "epoch": 13.00506329113924, "grad_norm": 12.133808135986328, "learning_rate": 9.59212376933896e-06, "loss": 0.4063, "step": 1080 }, { "epoch": 13.00632911392405, "grad_norm": 5.521517753601074, "learning_rate": 9.578059071729959e-06, "loss": 0.3626, "step": 1090 }, { "epoch": 13.00759493670886, "grad_norm": 39.51848220825195, "learning_rate": 9.563994374120957e-06, "loss": 0.4715, "step": 1100 }, { "epoch": 13.00886075949367, "grad_norm": 17.837867736816406, "learning_rate": 9.549929676511955e-06, "loss": 0.4161, "step": 1110 }, { "epoch": 13.010126582278481, "grad_norm": 10.324262619018555, "learning_rate": 9.535864978902954e-06, "loss": 0.3758, "step": 1120 }, { "epoch": 13.010126582278481, "eval_accuracy": 0.3617021276595745, "eval_loss": 1.7867138385772705, "eval_runtime": 8.632, "eval_samples_per_second": 5.445, "eval_steps_per_second": 1.39, "step": 1120 }, { "epoch": 14.00126582278481, "grad_norm": 3.8076212406158447, "learning_rate": 9.521800281293952e-06, "loss": 0.2181, "step": 1130 }, { "epoch": 14.00253164556962, "grad_norm": 7.90512752532959, "learning_rate": 9.507735583684952e-06, "loss": 0.4037, "step": 1140 }, { "epoch": 14.00379746835443, "grad_norm": 6.371408462524414, "learning_rate": 9.49367088607595e-06, "loss": 0.7414, "step": 1150 }, { "epoch": 14.00506329113924, "grad_norm": 21.530675888061523, "learning_rate": 9.47960618846695e-06, "loss": 0.2786, "step": 1160 }, { "epoch": 14.00632911392405, "grad_norm": 1.7298585176467896, "learning_rate": 9.465541490857948e-06, "loss": 0.2941, "step": 1170 }, { "epoch": 14.00759493670886, "grad_norm": 14.179819107055664, "learning_rate": 9.451476793248946e-06, "loss": 0.6105, "step": 1180 }, { "epoch": 14.00886075949367, "grad_norm": 35.71600341796875, "learning_rate": 9.437412095639944e-06, "loss": 0.4703, "step": 1190 }, { "epoch": 14.010126582278481, "grad_norm": 7.779309272766113, "learning_rate": 9.423347398030943e-06, "loss": 0.6257, "step": 1200 }, { "epoch": 14.010126582278481, "eval_accuracy": 0.3617021276595745, "eval_loss": 1.921836018562317, "eval_runtime": 8.7081, "eval_samples_per_second": 5.397, "eval_steps_per_second": 1.378, "step": 1200 }, { "epoch": 15.00126582278481, "grad_norm": 90.20023345947266, "learning_rate": 9.409282700421943e-06, "loss": 0.3217, "step": 1210 }, { "epoch": 15.00253164556962, "grad_norm": 6.699902534484863, "learning_rate": 9.395218002812941e-06, "loss": 0.2383, "step": 1220 }, { "epoch": 15.00379746835443, "grad_norm": 15.322399139404297, "learning_rate": 9.381153305203939e-06, "loss": 0.2347, "step": 1230 }, { "epoch": 15.00506329113924, "grad_norm": 4.224050998687744, "learning_rate": 9.367088607594937e-06, "loss": 0.1293, "step": 1240 }, { "epoch": 15.00632911392405, "grad_norm": 3.2699191570281982, "learning_rate": 9.353023909985936e-06, "loss": 0.203, "step": 1250 }, { "epoch": 15.00759493670886, "grad_norm": 69.02498626708984, "learning_rate": 9.338959212376934e-06, "loss": 0.3505, "step": 1260 }, { "epoch": 15.00886075949367, "grad_norm": 148.28306579589844, "learning_rate": 9.324894514767934e-06, "loss": 0.3983, "step": 1270 }, { "epoch": 15.010126582278481, "grad_norm": 26.6025447845459, "learning_rate": 9.310829817158932e-06, "loss": 0.3693, "step": 1280 }, { "epoch": 15.010126582278481, "eval_accuracy": 0.3191489361702128, "eval_loss": 2.09875750541687, "eval_runtime": 9.0724, "eval_samples_per_second": 5.181, "eval_steps_per_second": 1.323, "step": 1280 }, { "epoch": 16.00126582278481, "grad_norm": 34.19914627075195, "learning_rate": 9.29676511954993e-06, "loss": 0.3708, "step": 1290 }, { "epoch": 16.00253164556962, "grad_norm": 57.25946807861328, "learning_rate": 9.28270042194093e-06, "loss": 0.5632, "step": 1300 }, { "epoch": 16.00379746835443, "grad_norm": 1.7772458791732788, "learning_rate": 9.268635724331928e-06, "loss": 0.2617, "step": 1310 }, { "epoch": 16.00506329113924, "grad_norm": 56.837650299072266, "learning_rate": 9.254571026722926e-06, "loss": 0.3024, "step": 1320 }, { "epoch": 16.00632911392405, "grad_norm": 0.5459592342376709, "learning_rate": 9.240506329113925e-06, "loss": 0.2552, "step": 1330 }, { "epoch": 16.00759493670886, "grad_norm": 153.30613708496094, "learning_rate": 9.226441631504923e-06, "loss": 0.659, "step": 1340 }, { "epoch": 16.008860759493672, "grad_norm": 47.839324951171875, "learning_rate": 9.212376933895923e-06, "loss": 0.3776, "step": 1350 }, { "epoch": 16.01012658227848, "grad_norm": 57.103763580322266, "learning_rate": 9.198312236286921e-06, "loss": 0.5933, "step": 1360 }, { "epoch": 16.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 1.8412983417510986, "eval_runtime": 8.4821, "eval_samples_per_second": 5.541, "eval_steps_per_second": 1.415, "step": 1360 }, { "epoch": 17.00126582278481, "grad_norm": 0.2360084503889084, "learning_rate": 9.184247538677919e-06, "loss": 0.0993, "step": 1370 }, { "epoch": 17.00253164556962, "grad_norm": 1.5083540678024292, "learning_rate": 9.170182841068917e-06, "loss": 0.2528, "step": 1380 }, { "epoch": 17.00379746835443, "grad_norm": 7.469198226928711, "learning_rate": 9.156118143459917e-06, "loss": 0.3329, "step": 1390 }, { "epoch": 17.00506329113924, "grad_norm": 100.13819885253906, "learning_rate": 9.142053445850915e-06, "loss": 0.2834, "step": 1400 }, { "epoch": 17.00632911392405, "grad_norm": 110.03264617919922, "learning_rate": 9.127988748241914e-06, "loss": 0.6402, "step": 1410 }, { "epoch": 17.00759493670886, "grad_norm": 116.64907836914062, "learning_rate": 9.113924050632912e-06, "loss": 0.4343, "step": 1420 }, { "epoch": 17.008860759493672, "grad_norm": 13.220937728881836, "learning_rate": 9.09985935302391e-06, "loss": 0.3556, "step": 1430 }, { "epoch": 17.01012658227848, "grad_norm": 63.16554260253906, "learning_rate": 9.085794655414908e-06, "loss": 0.202, "step": 1440 }, { "epoch": 17.01012658227848, "eval_accuracy": 0.3191489361702128, "eval_loss": 2.753727436065674, "eval_runtime": 8.441, "eval_samples_per_second": 5.568, "eval_steps_per_second": 1.422, "step": 1440 }, { "epoch": 18.00126582278481, "grad_norm": 6.848087310791016, "learning_rate": 9.071729957805908e-06, "loss": 0.2198, "step": 1450 }, { "epoch": 18.00253164556962, "grad_norm": 24.780672073364258, "learning_rate": 9.057665260196906e-06, "loss": 0.2974, "step": 1460 }, { "epoch": 18.00379746835443, "grad_norm": 28.783912658691406, "learning_rate": 9.043600562587905e-06, "loss": 0.2387, "step": 1470 }, { "epoch": 18.00506329113924, "grad_norm": 0.49766799807548523, "learning_rate": 9.029535864978903e-06, "loss": 0.2592, "step": 1480 }, { "epoch": 18.00632911392405, "grad_norm": 107.1086196899414, "learning_rate": 9.015471167369903e-06, "loss": 0.2736, "step": 1490 }, { "epoch": 18.00759493670886, "grad_norm": 1.34207284450531, "learning_rate": 9.001406469760901e-06, "loss": 0.3996, "step": 1500 }, { "epoch": 18.008860759493672, "grad_norm": 0.8816600441932678, "learning_rate": 8.987341772151899e-06, "loss": 0.3255, "step": 1510 }, { "epoch": 18.01012658227848, "grad_norm": 0.2845398187637329, "learning_rate": 8.973277074542897e-06, "loss": 0.1454, "step": 1520 }, { "epoch": 18.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 2.461174964904785, "eval_runtime": 8.4461, "eval_samples_per_second": 5.565, "eval_steps_per_second": 1.421, "step": 1520 }, { "epoch": 19.00126582278481, "grad_norm": 7.336277961730957, "learning_rate": 8.959212376933897e-06, "loss": 0.1073, "step": 1530 }, { "epoch": 19.00253164556962, "grad_norm": 1.7120882272720337, "learning_rate": 8.945147679324895e-06, "loss": 0.286, "step": 1540 }, { "epoch": 19.00379746835443, "grad_norm": 5.534066677093506, "learning_rate": 8.931082981715894e-06, "loss": 0.0711, "step": 1550 }, { "epoch": 19.00506329113924, "grad_norm": 0.4742295742034912, "learning_rate": 8.917018284106892e-06, "loss": 0.0344, "step": 1560 }, { "epoch": 19.00632911392405, "grad_norm": 7.864910125732422, "learning_rate": 8.90295358649789e-06, "loss": 0.4488, "step": 1570 }, { "epoch": 19.00759493670886, "grad_norm": 29.55208396911621, "learning_rate": 8.888888888888888e-06, "loss": 0.0769, "step": 1580 }, { "epoch": 19.008860759493672, "grad_norm": 6.9868597984313965, "learning_rate": 8.874824191279888e-06, "loss": 0.2275, "step": 1590 }, { "epoch": 19.01012658227848, "grad_norm": 0.9346122741699219, "learning_rate": 8.860759493670886e-06, "loss": 0.1332, "step": 1600 }, { "epoch": 19.01012658227848, "eval_accuracy": 0.3404255319148936, "eval_loss": 3.094426155090332, "eval_runtime": 8.4844, "eval_samples_per_second": 5.54, "eval_steps_per_second": 1.414, "step": 1600 }, { "epoch": 20.00126582278481, "grad_norm": 0.2495788335800171, "learning_rate": 8.846694796061886e-06, "loss": 0.0054, "step": 1610 }, { "epoch": 20.00253164556962, "grad_norm": 0.9110737442970276, "learning_rate": 8.832630098452884e-06, "loss": 0.1922, "step": 1620 }, { "epoch": 20.00379746835443, "grad_norm": 18.964305877685547, "learning_rate": 8.818565400843883e-06, "loss": 0.0081, "step": 1630 }, { "epoch": 20.00506329113924, "grad_norm": 32.764984130859375, "learning_rate": 8.804500703234881e-06, "loss": 0.1649, "step": 1640 }, { "epoch": 20.00632911392405, "grad_norm": 0.6211456060409546, "learning_rate": 8.79043600562588e-06, "loss": 0.267, "step": 1650 }, { "epoch": 20.00759493670886, "grad_norm": 11.705927848815918, "learning_rate": 8.776371308016879e-06, "loss": 0.0885, "step": 1660 }, { "epoch": 20.008860759493672, "grad_norm": 4.711695194244385, "learning_rate": 8.762306610407877e-06, "loss": 0.1366, "step": 1670 }, { "epoch": 20.01012658227848, "grad_norm": 1.115964412689209, "learning_rate": 8.748241912798877e-06, "loss": 0.9193, "step": 1680 }, { "epoch": 20.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 2.869112491607666, "eval_runtime": 8.4747, "eval_samples_per_second": 5.546, "eval_steps_per_second": 1.416, "step": 1680 }, { "epoch": 21.00126582278481, "grad_norm": 0.7912726998329163, "learning_rate": 8.734177215189874e-06, "loss": 0.0675, "step": 1690 }, { "epoch": 21.00253164556962, "grad_norm": 1.868703007698059, "learning_rate": 8.720112517580872e-06, "loss": 0.0215, "step": 1700 }, { "epoch": 21.00379746835443, "grad_norm": 0.07253948599100113, "learning_rate": 8.70604781997187e-06, "loss": 0.1879, "step": 1710 }, { "epoch": 21.00506329113924, "grad_norm": 0.07606098800897598, "learning_rate": 8.69198312236287e-06, "loss": 0.2937, "step": 1720 }, { "epoch": 21.00632911392405, "grad_norm": 4.814393520355225, "learning_rate": 8.677918424753868e-06, "loss": 0.1223, "step": 1730 }, { "epoch": 21.00759493670886, "grad_norm": 0.052608225494623184, "learning_rate": 8.663853727144868e-06, "loss": 0.1895, "step": 1740 }, { "epoch": 21.008860759493672, "grad_norm": 6.358555316925049, "learning_rate": 8.649789029535866e-06, "loss": 0.2224, "step": 1750 }, { "epoch": 21.01012658227848, "grad_norm": 0.10539772361516953, "learning_rate": 8.635724331926865e-06, "loss": 0.1201, "step": 1760 }, { "epoch": 21.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 3.0563912391662598, "eval_runtime": 8.5749, "eval_samples_per_second": 5.481, "eval_steps_per_second": 1.399, "step": 1760 }, { "epoch": 22.00126582278481, "grad_norm": 3.320700168609619, "learning_rate": 8.621659634317863e-06, "loss": 0.0066, "step": 1770 }, { "epoch": 22.00253164556962, "grad_norm": 77.03856658935547, "learning_rate": 8.607594936708861e-06, "loss": 0.0985, "step": 1780 }, { "epoch": 22.00379746835443, "grad_norm": 0.06309456378221512, "learning_rate": 8.59353023909986e-06, "loss": 0.0229, "step": 1790 }, { "epoch": 22.00506329113924, "grad_norm": 81.78655242919922, "learning_rate": 8.579465541490859e-06, "loss": 0.1983, "step": 1800 }, { "epoch": 22.00632911392405, "grad_norm": 0.15561726689338684, "learning_rate": 8.565400843881857e-06, "loss": 0.1817, "step": 1810 }, { "epoch": 22.00759493670886, "grad_norm": 10.21172046661377, "learning_rate": 8.551336146272857e-06, "loss": 0.0148, "step": 1820 }, { "epoch": 22.008860759493672, "grad_norm": 0.5883349180221558, "learning_rate": 8.537271448663855e-06, "loss": 0.0018, "step": 1830 }, { "epoch": 22.01012658227848, "grad_norm": 0.019595500081777573, "learning_rate": 8.523206751054853e-06, "loss": 0.1716, "step": 1840 }, { "epoch": 22.01012658227848, "eval_accuracy": 0.3404255319148936, "eval_loss": 3.390719175338745, "eval_runtime": 8.6187, "eval_samples_per_second": 5.453, "eval_steps_per_second": 1.392, "step": 1840 }, { "epoch": 23.00126582278481, "grad_norm": 0.025295179337263107, "learning_rate": 8.50914205344585e-06, "loss": 0.0037, "step": 1850 }, { "epoch": 23.00253164556962, "grad_norm": 0.1332973688840866, "learning_rate": 8.49507735583685e-06, "loss": 0.0026, "step": 1860 }, { "epoch": 23.00379746835443, "grad_norm": 0.08286605030298233, "learning_rate": 8.481012658227848e-06, "loss": 0.1337, "step": 1870 }, { "epoch": 23.00506329113924, "grad_norm": 0.11277411133050919, "learning_rate": 8.466947960618848e-06, "loss": 0.1286, "step": 1880 }, { "epoch": 23.00632911392405, "grad_norm": 0.047154348343610764, "learning_rate": 8.452883263009846e-06, "loss": 0.0068, "step": 1890 }, { "epoch": 23.00759493670886, "grad_norm": 0.02648848481476307, "learning_rate": 8.438818565400846e-06, "loss": 0.0168, "step": 1900 }, { "epoch": 23.008860759493672, "grad_norm": 0.0498431995511055, "learning_rate": 8.424753867791844e-06, "loss": 0.0048, "step": 1910 }, { "epoch": 23.01012658227848, "grad_norm": 0.11999885737895966, "learning_rate": 8.410689170182841e-06, "loss": 0.0402, "step": 1920 }, { "epoch": 23.01012658227848, "eval_accuracy": 0.3191489361702128, "eval_loss": 3.7917425632476807, "eval_runtime": 8.7081, "eval_samples_per_second": 5.397, "eval_steps_per_second": 1.378, "step": 1920 }, { "epoch": 24.00126582278481, "grad_norm": 0.034499507397413254, "learning_rate": 8.39662447257384e-06, "loss": 0.012, "step": 1930 }, { "epoch": 24.00253164556962, "grad_norm": 42.179473876953125, "learning_rate": 8.382559774964839e-06, "loss": 0.2346, "step": 1940 }, { "epoch": 24.00379746835443, "grad_norm": 0.6478450298309326, "learning_rate": 8.368495077355837e-06, "loss": 0.008, "step": 1950 }, { "epoch": 24.00506329113924, "grad_norm": 0.04269712418317795, "learning_rate": 8.354430379746837e-06, "loss": 0.004, "step": 1960 }, { "epoch": 24.00632911392405, "grad_norm": 29.495561599731445, "learning_rate": 8.340365682137835e-06, "loss": 0.1656, "step": 1970 }, { "epoch": 24.00759493670886, "grad_norm": 0.09528925269842148, "learning_rate": 8.326300984528833e-06, "loss": 0.0268, "step": 1980 }, { "epoch": 24.008860759493672, "grad_norm": 0.023056741803884506, "learning_rate": 8.31223628691983e-06, "loss": 0.0095, "step": 1990 }, { "epoch": 24.01012658227848, "grad_norm": 2.2930028438568115, "learning_rate": 8.29817158931083e-06, "loss": 0.0709, "step": 2000 }, { "epoch": 24.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 3.5486884117126465, "eval_runtime": 8.9153, "eval_samples_per_second": 5.272, "eval_steps_per_second": 1.346, "step": 2000 }, { "epoch": 25.00126582278481, "grad_norm": 0.614183783531189, "learning_rate": 8.284106891701828e-06, "loss": 0.0297, "step": 2010 }, { "epoch": 25.00253164556962, "grad_norm": 243.75750732421875, "learning_rate": 8.270042194092828e-06, "loss": 0.1049, "step": 2020 }, { "epoch": 25.00379746835443, "grad_norm": 210.3068389892578, "learning_rate": 8.255977496483826e-06, "loss": 0.0886, "step": 2030 }, { "epoch": 25.00506329113924, "grad_norm": 2.261234760284424, "learning_rate": 8.241912798874826e-06, "loss": 0.0027, "step": 2040 }, { "epoch": 25.00632911392405, "grad_norm": 6.479150772094727, "learning_rate": 8.227848101265824e-06, "loss": 0.1408, "step": 2050 }, { "epoch": 25.00759493670886, "grad_norm": 0.04374171420931816, "learning_rate": 8.213783403656822e-06, "loss": 0.0659, "step": 2060 }, { "epoch": 25.008860759493672, "grad_norm": 0.19435258209705353, "learning_rate": 8.199718706047821e-06, "loss": 0.0016, "step": 2070 }, { "epoch": 25.01012658227848, "grad_norm": 0.020269129425287247, "learning_rate": 8.18565400843882e-06, "loss": 0.1021, "step": 2080 }, { "epoch": 25.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 3.9004390239715576, "eval_runtime": 8.7333, "eval_samples_per_second": 5.382, "eval_steps_per_second": 1.374, "step": 2080 }, { "epoch": 26.00126582278481, "grad_norm": 0.07372234761714935, "learning_rate": 8.171589310829819e-06, "loss": 0.0096, "step": 2090 }, { "epoch": 26.00253164556962, "grad_norm": 0.9319536089897156, "learning_rate": 8.157524613220817e-06, "loss": 0.0877, "step": 2100 }, { "epoch": 26.00379746835443, "grad_norm": 1.9737757444381714, "learning_rate": 8.143459915611815e-06, "loss": 0.0105, "step": 2110 }, { "epoch": 26.00506329113924, "grad_norm": 0.010359777137637138, "learning_rate": 8.129395218002813e-06, "loss": 0.0019, "step": 2120 }, { "epoch": 26.00632911392405, "grad_norm": 0.16365653276443481, "learning_rate": 8.115330520393813e-06, "loss": 0.0006, "step": 2130 }, { "epoch": 26.00759493670886, "grad_norm": 184.18040466308594, "learning_rate": 8.10126582278481e-06, "loss": 0.1279, "step": 2140 }, { "epoch": 26.008860759493672, "grad_norm": 0.01543757226318121, "learning_rate": 8.08720112517581e-06, "loss": 0.0289, "step": 2150 }, { "epoch": 26.01012658227848, "grad_norm": 0.02343440055847168, "learning_rate": 8.073136427566808e-06, "loss": 0.0029, "step": 2160 }, { "epoch": 26.01012658227848, "eval_accuracy": 0.3617021276595745, "eval_loss": 4.194858551025391, "eval_runtime": 9.0554, "eval_samples_per_second": 5.19, "eval_steps_per_second": 1.325, "step": 2160 }, { "epoch": 27.00126582278481, "grad_norm": 0.163554847240448, "learning_rate": 8.059071729957806e-06, "loss": 0.0027, "step": 2170 }, { "epoch": 27.00253164556962, "grad_norm": 64.04247283935547, "learning_rate": 8.045007032348806e-06, "loss": 0.0081, "step": 2180 }, { "epoch": 27.00379746835443, "grad_norm": 0.2571711242198944, "learning_rate": 8.030942334739804e-06, "loss": 0.0059, "step": 2190 }, { "epoch": 27.00506329113924, "grad_norm": 0.015557597391307354, "learning_rate": 8.016877637130802e-06, "loss": 0.0709, "step": 2200 }, { "epoch": 27.00632911392405, "grad_norm": 0.05058155208826065, "learning_rate": 8.002812939521801e-06, "loss": 0.0016, "step": 2210 }, { "epoch": 27.00759493670886, "grad_norm": 0.06934946775436401, "learning_rate": 7.9887482419128e-06, "loss": 0.0048, "step": 2220 }, { "epoch": 27.008860759493672, "grad_norm": 0.06157020479440689, "learning_rate": 7.974683544303799e-06, "loss": 0.0006, "step": 2230 }, { "epoch": 27.01012658227848, "grad_norm": 0.016570130363106728, "learning_rate": 7.960618846694797e-06, "loss": 0.1352, "step": 2240 }, { "epoch": 27.01012658227848, "eval_accuracy": 0.3617021276595745, "eval_loss": 4.503756999969482, "eval_runtime": 8.4591, "eval_samples_per_second": 5.556, "eval_steps_per_second": 1.419, "step": 2240 }, { "epoch": 28.00126582278481, "grad_norm": 0.05582532659173012, "learning_rate": 7.946554149085795e-06, "loss": 0.0875, "step": 2250 }, { "epoch": 28.00253164556962, "grad_norm": 0.04096909984946251, "learning_rate": 7.932489451476793e-06, "loss": 0.0003, "step": 2260 }, { "epoch": 28.00379746835443, "grad_norm": 0.9817273616790771, "learning_rate": 7.918424753867793e-06, "loss": 0.0012, "step": 2270 }, { "epoch": 28.00506329113924, "grad_norm": 0.07687732577323914, "learning_rate": 7.90436005625879e-06, "loss": 0.0023, "step": 2280 }, { "epoch": 28.00632911392405, "grad_norm": 18.15758514404297, "learning_rate": 7.89029535864979e-06, "loss": 0.1754, "step": 2290 }, { "epoch": 28.00759493670886, "grad_norm": 0.007940283045172691, "learning_rate": 7.876230661040788e-06, "loss": 0.3378, "step": 2300 }, { "epoch": 28.008860759493672, "grad_norm": 165.2981414794922, "learning_rate": 7.862165963431786e-06, "loss": 0.2031, "step": 2310 }, { "epoch": 28.01012658227848, "grad_norm": 0.009227721951901913, "learning_rate": 7.848101265822786e-06, "loss": 0.0173, "step": 2320 }, { "epoch": 28.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 3.935215473175049, "eval_runtime": 8.4766, "eval_samples_per_second": 5.545, "eval_steps_per_second": 1.416, "step": 2320 }, { "epoch": 29.00126582278481, "grad_norm": 0.01626473106443882, "learning_rate": 7.834036568213784e-06, "loss": 0.0544, "step": 2330 }, { "epoch": 29.00253164556962, "grad_norm": 0.018083002418279648, "learning_rate": 7.819971870604782e-06, "loss": 0.0064, "step": 2340 }, { "epoch": 29.00379746835443, "grad_norm": 0.2154766470193863, "learning_rate": 7.805907172995782e-06, "loss": 0.0006, "step": 2350 }, { "epoch": 29.00506329113924, "grad_norm": 220.96780395507812, "learning_rate": 7.79184247538678e-06, "loss": 0.1229, "step": 2360 }, { "epoch": 29.00632911392405, "grad_norm": 0.17289696633815765, "learning_rate": 7.77777777777778e-06, "loss": 0.0005, "step": 2370 }, { "epoch": 29.00759493670886, "grad_norm": 1.7889928817749023, "learning_rate": 7.763713080168777e-06, "loss": 0.1407, "step": 2380 }, { "epoch": 29.008860759493672, "grad_norm": 0.008173462934792042, "learning_rate": 7.749648382559775e-06, "loss": 0.1463, "step": 2390 }, { "epoch": 29.01012658227848, "grad_norm": 0.011393209919333458, "learning_rate": 7.735583684950773e-06, "loss": 0.0012, "step": 2400 }, { "epoch": 29.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.323361873626709, "eval_runtime": 8.4682, "eval_samples_per_second": 5.55, "eval_steps_per_second": 1.417, "step": 2400 }, { "epoch": 30.00126582278481, "grad_norm": 0.011178904213011265, "learning_rate": 7.721518987341773e-06, "loss": 0.0004, "step": 2410 }, { "epoch": 30.00253164556962, "grad_norm": 0.0153023237362504, "learning_rate": 7.70745428973277e-06, "loss": 0.0008, "step": 2420 }, { "epoch": 30.00379746835443, "grad_norm": 0.010914456099271774, "learning_rate": 7.69338959212377e-06, "loss": 0.0004, "step": 2430 }, { "epoch": 30.00506329113924, "grad_norm": 0.007891859859228134, "learning_rate": 7.679324894514768e-06, "loss": 0.0005, "step": 2440 }, { "epoch": 30.00632911392405, "grad_norm": 87.5243911743164, "learning_rate": 7.665260196905766e-06, "loss": 0.0081, "step": 2450 }, { "epoch": 30.00759493670886, "grad_norm": 1.0978916883468628, "learning_rate": 7.651195499296766e-06, "loss": 0.0004, "step": 2460 }, { "epoch": 30.008860759493672, "grad_norm": 12.773395538330078, "learning_rate": 7.637130801687764e-06, "loss": 0.0024, "step": 2470 }, { "epoch": 30.01012658227848, "grad_norm": 0.03179134428501129, "learning_rate": 7.623066104078764e-06, "loss": 0.0007, "step": 2480 }, { "epoch": 30.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 4.287741661071777, "eval_runtime": 8.4651, "eval_samples_per_second": 5.552, "eval_steps_per_second": 1.418, "step": 2480 }, { "epoch": 31.00126582278481, "grad_norm": 0.005546510685235262, "learning_rate": 7.609001406469762e-06, "loss": 0.0003, "step": 2490 }, { "epoch": 31.00253164556962, "grad_norm": 0.013632872141897678, "learning_rate": 7.5949367088607605e-06, "loss": 0.1395, "step": 2500 }, { "epoch": 31.00379746835443, "grad_norm": 0.006918332539498806, "learning_rate": 7.5808720112517584e-06, "loss": 0.0027, "step": 2510 }, { "epoch": 31.00506329113924, "grad_norm": 0.012666971422731876, "learning_rate": 7.566807313642758e-06, "loss": 0.0004, "step": 2520 }, { "epoch": 31.00632911392405, "grad_norm": 0.005221995059400797, "learning_rate": 7.552742616033756e-06, "loss": 0.0009, "step": 2530 }, { "epoch": 31.00759493670886, "grad_norm": 0.013362145982682705, "learning_rate": 7.538677918424755e-06, "loss": 0.0012, "step": 2540 }, { "epoch": 31.008860759493672, "grad_norm": 0.035756830126047134, "learning_rate": 7.524613220815753e-06, "loss": 0.0004, "step": 2550 }, { "epoch": 31.01012658227848, "grad_norm": 0.08822837471961975, "learning_rate": 7.510548523206752e-06, "loss": 0.2292, "step": 2560 }, { "epoch": 31.01012658227848, "eval_accuracy": 0.3191489361702128, "eval_loss": 4.729736804962158, "eval_runtime": 8.4818, "eval_samples_per_second": 5.541, "eval_steps_per_second": 1.415, "step": 2560 }, { "epoch": 32.00126582278481, "grad_norm": 6.84944486618042, "learning_rate": 7.4964838255977505e-06, "loss": 0.001, "step": 2570 }, { "epoch": 32.00253164556962, "grad_norm": 0.025634169578552246, "learning_rate": 7.482419127988749e-06, "loss": 0.0002, "step": 2580 }, { "epoch": 32.00379746835443, "grad_norm": 0.026997152715921402, "learning_rate": 7.468354430379747e-06, "loss": 0.0089, "step": 2590 }, { "epoch": 32.00506329113924, "grad_norm": 0.008302225731313229, "learning_rate": 7.454289732770746e-06, "loss": 0.0005, "step": 2600 }, { "epoch": 32.00632911392405, "grad_norm": 0.033620625734329224, "learning_rate": 7.440225035161744e-06, "loss": 0.0081, "step": 2610 }, { "epoch": 32.00759493670886, "grad_norm": 0.022618619725108147, "learning_rate": 7.426160337552744e-06, "loss": 0.0548, "step": 2620 }, { "epoch": 32.00886075949367, "grad_norm": 0.0314350426197052, "learning_rate": 7.412095639943742e-06, "loss": 0.0003, "step": 2630 }, { "epoch": 32.01012658227848, "grad_norm": 0.007120281923562288, "learning_rate": 7.398030942334741e-06, "loss": 0.0004, "step": 2640 }, { "epoch": 32.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 4.471046447753906, "eval_runtime": 8.9073, "eval_samples_per_second": 5.277, "eval_steps_per_second": 1.347, "step": 2640 }, { "epoch": 33.00126582278481, "grad_norm": 0.3721332550048828, "learning_rate": 7.3839662447257386e-06, "loss": 0.1564, "step": 2650 }, { "epoch": 33.00253164556962, "grad_norm": 0.052768442779779434, "learning_rate": 7.369901547116738e-06, "loss": 0.0022, "step": 2660 }, { "epoch": 33.00379746835443, "grad_norm": 93.05609130859375, "learning_rate": 7.355836849507736e-06, "loss": 0.2399, "step": 2670 }, { "epoch": 33.00506329113924, "grad_norm": 0.0038992296904325485, "learning_rate": 7.341772151898735e-06, "loss": 0.0088, "step": 2680 }, { "epoch": 33.00632911392405, "grad_norm": 0.020863041281700134, "learning_rate": 7.327707454289733e-06, "loss": 0.0002, "step": 2690 }, { "epoch": 33.00759493670886, "grad_norm": 0.006648873444646597, "learning_rate": 7.313642756680732e-06, "loss": 0.0388, "step": 2700 }, { "epoch": 33.00886075949367, "grad_norm": 156.20700073242188, "learning_rate": 7.29957805907173e-06, "loss": 0.2333, "step": 2710 }, { "epoch": 33.01012658227848, "grad_norm": 0.008939997293055058, "learning_rate": 7.2855133614627295e-06, "loss": 0.0361, "step": 2720 }, { "epoch": 33.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.239119529724121, "eval_runtime": 8.9548, "eval_samples_per_second": 5.249, "eval_steps_per_second": 1.34, "step": 2720 }, { "epoch": 34.00126582278481, "grad_norm": 15.270977020263672, "learning_rate": 7.2714486638537275e-06, "loss": 0.1088, "step": 2730 }, { "epoch": 34.00253164556962, "grad_norm": 14.447574615478516, "learning_rate": 7.257383966244726e-06, "loss": 0.002, "step": 2740 }, { "epoch": 34.00379746835443, "grad_norm": 0.009641851298511028, "learning_rate": 7.243319268635724e-06, "loss": 0.0003, "step": 2750 }, { "epoch": 34.00506329113924, "grad_norm": 3.5248186588287354, "learning_rate": 7.229254571026724e-06, "loss": 0.0007, "step": 2760 }, { "epoch": 34.00632911392405, "grad_norm": 0.06941874325275421, "learning_rate": 7.215189873417722e-06, "loss": 0.106, "step": 2770 }, { "epoch": 34.00759493670886, "grad_norm": 0.0060513801872730255, "learning_rate": 7.201125175808721e-06, "loss": 0.0003, "step": 2780 }, { "epoch": 34.00886075949367, "grad_norm": 0.03698160871863365, "learning_rate": 7.187060478199719e-06, "loss": 0.0003, "step": 2790 }, { "epoch": 34.01012658227848, "grad_norm": 0.021343868225812912, "learning_rate": 7.172995780590718e-06, "loss": 0.0002, "step": 2800 }, { "epoch": 34.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.2255730628967285, "eval_runtime": 9.1143, "eval_samples_per_second": 5.157, "eval_steps_per_second": 1.317, "step": 2800 }, { "epoch": 35.00126582278481, "grad_norm": 0.02194453403353691, "learning_rate": 7.158931082981716e-06, "loss": 0.0002, "step": 2810 }, { "epoch": 35.00253164556962, "grad_norm": 0.008681000210344791, "learning_rate": 7.144866385372715e-06, "loss": 0.0017, "step": 2820 }, { "epoch": 35.00379746835443, "grad_norm": 0.003180101979523897, "learning_rate": 7.130801687763713e-06, "loss": 0.0002, "step": 2830 }, { "epoch": 35.00506329113924, "grad_norm": 0.00399158988147974, "learning_rate": 7.116736990154712e-06, "loss": 0.0851, "step": 2840 }, { "epoch": 35.00632911392405, "grad_norm": 317.16937255859375, "learning_rate": 7.10267229254571e-06, "loss": 0.1581, "step": 2850 }, { "epoch": 35.00759493670886, "grad_norm": 0.006524229887872934, "learning_rate": 7.08860759493671e-06, "loss": 0.0002, "step": 2860 }, { "epoch": 35.00886075949367, "grad_norm": 230.1353759765625, "learning_rate": 7.074542897327708e-06, "loss": 0.1746, "step": 2870 }, { "epoch": 35.01012658227848, "grad_norm": 89.08749389648438, "learning_rate": 7.060478199718706e-06, "loss": 0.0082, "step": 2880 }, { "epoch": 35.01012658227848, "eval_accuracy": 0.3404255319148936, "eval_loss": 5.073359489440918, "eval_runtime": 8.501, "eval_samples_per_second": 5.529, "eval_steps_per_second": 1.412, "step": 2880 }, { "epoch": 36.00126582278481, "grad_norm": 0.014753330498933792, "learning_rate": 7.046413502109706e-06, "loss": 0.0002, "step": 2890 }, { "epoch": 36.00253164556962, "grad_norm": 3.0008251667022705, "learning_rate": 7.032348804500704e-06, "loss": 0.0373, "step": 2900 }, { "epoch": 36.00379746835443, "grad_norm": 0.010498768649995327, "learning_rate": 7.018284106891703e-06, "loss": 0.009, "step": 2910 }, { "epoch": 36.00506329113924, "grad_norm": 0.06089121848344803, "learning_rate": 7.004219409282701e-06, "loss": 0.0003, "step": 2920 }, { "epoch": 36.00632911392405, "grad_norm": 0.009548901580274105, "learning_rate": 6.9901547116737e-06, "loss": 0.1971, "step": 2930 }, { "epoch": 36.00759493670886, "grad_norm": 0.11378785222768784, "learning_rate": 6.9760900140646985e-06, "loss": 0.0021, "step": 2940 }, { "epoch": 36.00886075949367, "grad_norm": 0.004684086889028549, "learning_rate": 6.962025316455697e-06, "loss": 0.0879, "step": 2950 }, { "epoch": 36.01012658227848, "grad_norm": 0.005387377459555864, "learning_rate": 6.947960618846695e-06, "loss": 0.0318, "step": 2960 }, { "epoch": 36.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.073455333709717, "eval_runtime": 8.491, "eval_samples_per_second": 5.535, "eval_steps_per_second": 1.413, "step": 2960 }, { "epoch": 37.00126582278481, "grad_norm": 0.0073117660358548164, "learning_rate": 6.933895921237694e-06, "loss": 0.0002, "step": 2970 }, { "epoch": 37.00253164556962, "grad_norm": 0.01575954630970955, "learning_rate": 6.919831223628692e-06, "loss": 0.0051, "step": 2980 }, { "epoch": 37.00379746835443, "grad_norm": 0.005418274085968733, "learning_rate": 6.905766526019692e-06, "loss": 0.0003, "step": 2990 }, { "epoch": 37.00506329113924, "grad_norm": 0.004269735421985388, "learning_rate": 6.89170182841069e-06, "loss": 0.0658, "step": 3000 }, { "epoch": 37.00632911392405, "grad_norm": 0.004627808462828398, "learning_rate": 6.8776371308016885e-06, "loss": 0.0001, "step": 3010 }, { "epoch": 37.00759493670886, "grad_norm": 0.008293317630887032, "learning_rate": 6.8635724331926865e-06, "loss": 0.0904, "step": 3020 }, { "epoch": 37.00886075949367, "grad_norm": 0.013359429314732552, "learning_rate": 6.849507735583686e-06, "loss": 0.1007, "step": 3030 }, { "epoch": 37.01012658227848, "grad_norm": 0.006999185774475336, "learning_rate": 6.835443037974684e-06, "loss": 0.0002, "step": 3040 }, { "epoch": 37.01012658227848, "eval_accuracy": 0.2553191489361702, "eval_loss": 5.146430492401123, "eval_runtime": 8.4797, "eval_samples_per_second": 5.543, "eval_steps_per_second": 1.415, "step": 3040 }, { "epoch": 38.00126582278481, "grad_norm": 0.005403840448707342, "learning_rate": 6.821378340365683e-06, "loss": 0.0003, "step": 3050 }, { "epoch": 38.00253164556962, "grad_norm": 0.01304860319942236, "learning_rate": 6.807313642756681e-06, "loss": 0.0003, "step": 3060 }, { "epoch": 38.00379746835443, "grad_norm": 0.29351142048835754, "learning_rate": 6.79324894514768e-06, "loss": 0.0004, "step": 3070 }, { "epoch": 38.00506329113924, "grad_norm": 0.0071726636961102486, "learning_rate": 6.779184247538679e-06, "loss": 0.0002, "step": 3080 }, { "epoch": 38.00632911392405, "grad_norm": 0.8798180222511292, "learning_rate": 6.7651195499296774e-06, "loss": 0.0578, "step": 3090 }, { "epoch": 38.00759493670886, "grad_norm": 0.01378143671900034, "learning_rate": 6.751054852320675e-06, "loss": 0.0004, "step": 3100 }, { "epoch": 38.00886075949367, "grad_norm": 0.005133031401783228, "learning_rate": 6.736990154711674e-06, "loss": 0.0062, "step": 3110 }, { "epoch": 38.01012658227848, "grad_norm": 0.01705407164990902, "learning_rate": 6.722925457102672e-06, "loss": 0.0003, "step": 3120 }, { "epoch": 38.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.634023189544678, "eval_runtime": 8.7121, "eval_samples_per_second": 5.395, "eval_steps_per_second": 1.377, "step": 3120 }, { "epoch": 39.00126582278481, "grad_norm": 0.005898992531001568, "learning_rate": 6.708860759493672e-06, "loss": 0.0008, "step": 3130 }, { "epoch": 39.00253164556962, "grad_norm": 0.40792742371559143, "learning_rate": 6.69479606188467e-06, "loss": 0.0003, "step": 3140 }, { "epoch": 39.00379746835443, "grad_norm": 0.019352609291672707, "learning_rate": 6.680731364275669e-06, "loss": 0.0002, "step": 3150 }, { "epoch": 39.00506329113924, "grad_norm": 0.0045697493478655815, "learning_rate": 6.666666666666667e-06, "loss": 0.0001, "step": 3160 }, { "epoch": 39.00632911392405, "grad_norm": 0.005903047509491444, "learning_rate": 6.652601969057666e-06, "loss": 0.0001, "step": 3170 }, { "epoch": 39.00759493670886, "grad_norm": 0.01714833825826645, "learning_rate": 6.638537271448664e-06, "loss": 0.1579, "step": 3180 }, { "epoch": 39.00886075949367, "grad_norm": 0.07012953609228134, "learning_rate": 6.624472573839663e-06, "loss": 0.0002, "step": 3190 }, { "epoch": 39.01012658227848, "grad_norm": 0.022253967821598053, "learning_rate": 6.610407876230661e-06, "loss": 0.48, "step": 3200 }, { "epoch": 39.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.337032794952393, "eval_runtime": 8.4914, "eval_samples_per_second": 5.535, "eval_steps_per_second": 1.413, "step": 3200 }, { "epoch": 40.00126582278481, "grad_norm": 233.1455535888672, "learning_rate": 6.59634317862166e-06, "loss": 0.0365, "step": 3210 }, { "epoch": 40.00253164556962, "grad_norm": 0.008999134413897991, "learning_rate": 6.582278481012659e-06, "loss": 0.1475, "step": 3220 }, { "epoch": 40.00379746835443, "grad_norm": 0.0032340127509087324, "learning_rate": 6.5682137834036576e-06, "loss": 0.1164, "step": 3230 }, { "epoch": 40.00506329113924, "grad_norm": 0.014319919049739838, "learning_rate": 6.5541490857946555e-06, "loss": 0.0008, "step": 3240 }, { "epoch": 40.00632911392405, "grad_norm": 0.019842559471726418, "learning_rate": 6.540084388185654e-06, "loss": 0.0006, "step": 3250 }, { "epoch": 40.00759493670886, "grad_norm": 0.018094390630722046, "learning_rate": 6.526019690576652e-06, "loss": 0.0005, "step": 3260 }, { "epoch": 40.00886075949367, "grad_norm": 0.00912653561681509, "learning_rate": 6.511954992967652e-06, "loss": 0.049, "step": 3270 }, { "epoch": 40.01012658227848, "grad_norm": 0.03302593529224396, "learning_rate": 6.49789029535865e-06, "loss": 0.0002, "step": 3280 }, { "epoch": 40.01012658227848, "eval_accuracy": 0.3617021276595745, "eval_loss": 4.582009315490723, "eval_runtime": 8.4753, "eval_samples_per_second": 5.546, "eval_steps_per_second": 1.416, "step": 3280 }, { "epoch": 41.00126582278481, "grad_norm": 10.000889778137207, "learning_rate": 6.483825597749649e-06, "loss": 0.0011, "step": 3290 }, { "epoch": 41.00253164556962, "grad_norm": 0.009547159075737, "learning_rate": 6.4697609001406485e-06, "loss": 0.0002, "step": 3300 }, { "epoch": 41.00379746835443, "grad_norm": 0.005821730941534042, "learning_rate": 6.4556962025316464e-06, "loss": 0.0001, "step": 3310 }, { "epoch": 41.00506329113924, "grad_norm": 0.00588320242241025, "learning_rate": 6.441631504922645e-06, "loss": 0.0025, "step": 3320 }, { "epoch": 41.00632911392405, "grad_norm": 298.84820556640625, "learning_rate": 6.427566807313643e-06, "loss": 0.2948, "step": 3330 }, { "epoch": 41.00759493670886, "grad_norm": 0.00702635245397687, "learning_rate": 6.413502109704642e-06, "loss": 0.0002, "step": 3340 }, { "epoch": 41.00886075949367, "grad_norm": 0.003056429559364915, "learning_rate": 6.39943741209564e-06, "loss": 0.0259, "step": 3350 }, { "epoch": 41.01012658227848, "grad_norm": 0.011072452180087566, "learning_rate": 6.38537271448664e-06, "loss": 0.0002, "step": 3360 }, { "epoch": 41.01012658227848, "eval_accuracy": 0.3191489361702128, "eval_loss": 5.0156683921813965, "eval_runtime": 8.468, "eval_samples_per_second": 5.55, "eval_steps_per_second": 1.417, "step": 3360 }, { "epoch": 42.00126582278481, "grad_norm": 0.022217601537704468, "learning_rate": 6.371308016877638e-06, "loss": 0.0002, "step": 3370 }, { "epoch": 42.00253164556962, "grad_norm": 65.69084167480469, "learning_rate": 6.3572433192686365e-06, "loss": 0.0037, "step": 3380 }, { "epoch": 42.00379746835443, "grad_norm": 0.07589118182659149, "learning_rate": 6.3431786216596345e-06, "loss": 0.0002, "step": 3390 }, { "epoch": 42.00506329113924, "grad_norm": 0.003494243137538433, "learning_rate": 6.329113924050634e-06, "loss": 0.0013, "step": 3400 }, { "epoch": 42.00632911392405, "grad_norm": 0.0027374387718737125, "learning_rate": 6.315049226441632e-06, "loss": 0.2015, "step": 3410 }, { "epoch": 42.00759493670886, "grad_norm": 0.0055436789989471436, "learning_rate": 6.300984528832631e-06, "loss": 0.0001, "step": 3420 }, { "epoch": 42.00886075949367, "grad_norm": 0.006678999401628971, "learning_rate": 6.286919831223629e-06, "loss": 0.0016, "step": 3430 }, { "epoch": 42.01012658227848, "grad_norm": 0.006669959519058466, "learning_rate": 6.272855133614629e-06, "loss": 0.1209, "step": 3440 }, { "epoch": 42.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 4.310915946960449, "eval_runtime": 8.4948, "eval_samples_per_second": 5.533, "eval_steps_per_second": 1.413, "step": 3440 }, { "epoch": 43.00126582278481, "grad_norm": 36.991024017333984, "learning_rate": 6.2587904360056266e-06, "loss": 0.0031, "step": 3450 }, { "epoch": 43.00253164556962, "grad_norm": 0.03218389302492142, "learning_rate": 6.244725738396625e-06, "loss": 0.0003, "step": 3460 }, { "epoch": 43.00379746835443, "grad_norm": 0.0067522223107516766, "learning_rate": 6.230661040787623e-06, "loss": 0.0002, "step": 3470 }, { "epoch": 43.00506329113924, "grad_norm": 0.09478826075792313, "learning_rate": 6.216596343178622e-06, "loss": 0.0001, "step": 3480 }, { "epoch": 43.00632911392405, "grad_norm": 0.006108371540904045, "learning_rate": 6.20253164556962e-06, "loss": 0.0001, "step": 3490 }, { "epoch": 43.00759493670886, "grad_norm": 0.004173735156655312, "learning_rate": 6.18846694796062e-06, "loss": 0.0002, "step": 3500 }, { "epoch": 43.00886075949367, "grad_norm": 0.004864380694925785, "learning_rate": 6.174402250351618e-06, "loss": 0.0001, "step": 3510 }, { "epoch": 43.01012658227848, "grad_norm": 0.006738508120179176, "learning_rate": 6.160337552742617e-06, "loss": 0.0001, "step": 3520 }, { "epoch": 43.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.459574222564697, "eval_runtime": 8.4737, "eval_samples_per_second": 5.547, "eval_steps_per_second": 1.416, "step": 3520 }, { "epoch": 44.00126582278481, "grad_norm": 0.00519252335652709, "learning_rate": 6.146272855133615e-06, "loss": 0.0001, "step": 3530 }, { "epoch": 44.00253164556962, "grad_norm": 0.0036063846200704575, "learning_rate": 6.132208157524614e-06, "loss": 0.0001, "step": 3540 }, { "epoch": 44.00379746835443, "grad_norm": 0.004207131918519735, "learning_rate": 6.118143459915612e-06, "loss": 0.0001, "step": 3550 }, { "epoch": 44.00506329113924, "grad_norm": 0.0024055996909737587, "learning_rate": 6.104078762306611e-06, "loss": 0.0001, "step": 3560 }, { "epoch": 44.00632911392405, "grad_norm": 0.004374026786535978, "learning_rate": 6.090014064697609e-06, "loss": 0.0001, "step": 3570 }, { "epoch": 44.00759493670886, "grad_norm": 0.0027957686688750982, "learning_rate": 6.075949367088608e-06, "loss": 0.0002, "step": 3580 }, { "epoch": 44.00886075949367, "grad_norm": 0.008639072068035603, "learning_rate": 6.061884669479607e-06, "loss": 0.004, "step": 3590 }, { "epoch": 44.01012658227848, "grad_norm": 0.011701129376888275, "learning_rate": 6.0478199718706055e-06, "loss": 0.0109, "step": 3600 }, { "epoch": 44.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 4.425137996673584, "eval_runtime": 8.4694, "eval_samples_per_second": 5.549, "eval_steps_per_second": 1.417, "step": 3600 }, { "epoch": 45.00126582278481, "grad_norm": 0.017412984743714333, "learning_rate": 6.0337552742616035e-06, "loss": 0.0001, "step": 3610 }, { "epoch": 45.00253164556962, "grad_norm": 0.007230939343571663, "learning_rate": 6.019690576652602e-06, "loss": 0.0698, "step": 3620 }, { "epoch": 45.00379746835443, "grad_norm": 0.014825068414211273, "learning_rate": 6.0056258790436e-06, "loss": 0.0001, "step": 3630 }, { "epoch": 45.00506329113924, "grad_norm": 0.013121239840984344, "learning_rate": 5.9915611814346e-06, "loss": 0.0001, "step": 3640 }, { "epoch": 45.00632911392405, "grad_norm": 0.013468984514474869, "learning_rate": 5.977496483825598e-06, "loss": 0.0001, "step": 3650 }, { "epoch": 45.00759493670886, "grad_norm": 0.06317138671875, "learning_rate": 5.963431786216597e-06, "loss": 0.0001, "step": 3660 }, { "epoch": 45.00886075949367, "grad_norm": 0.003630951512604952, "learning_rate": 5.949367088607595e-06, "loss": 0.1698, "step": 3670 }, { "epoch": 45.01012658227848, "grad_norm": 0.005787010304629803, "learning_rate": 5.935302390998594e-06, "loss": 0.0001, "step": 3680 }, { "epoch": 45.01012658227848, "eval_accuracy": 0.2978723404255319, "eval_loss": 5.296198844909668, "eval_runtime": 8.4784, "eval_samples_per_second": 5.543, "eval_steps_per_second": 1.415, "step": 3680 }, { "epoch": 46.00126582278481, "grad_norm": 0.00311860884539783, "learning_rate": 5.921237693389592e-06, "loss": 0.0553, "step": 3690 }, { "epoch": 46.00253164556962, "grad_norm": 0.005304061342030764, "learning_rate": 5.907172995780591e-06, "loss": 0.0002, "step": 3700 }, { "epoch": 46.00379746835443, "grad_norm": 0.015418877825140953, "learning_rate": 5.893108298171589e-06, "loss": 0.0001, "step": 3710 }, { "epoch": 46.00506329113924, "grad_norm": 0.018117068335413933, "learning_rate": 5.879043600562588e-06, "loss": 0.0024, "step": 3720 }, { "epoch": 46.00632911392405, "grad_norm": 0.004327620379626751, "learning_rate": 5.864978902953588e-06, "loss": 0.0003, "step": 3730 }, { "epoch": 46.00759493670886, "grad_norm": 0.024266647174954414, "learning_rate": 5.850914205344586e-06, "loss": 0.0001, "step": 3740 }, { "epoch": 46.00886075949367, "grad_norm": 414.96563720703125, "learning_rate": 5.8368495077355845e-06, "loss": 0.0487, "step": 3750 }, { "epoch": 46.01012658227848, "grad_norm": 0.008569066412746906, "learning_rate": 5.8227848101265824e-06, "loss": 0.1516, "step": 3760 }, { "epoch": 46.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.23142147064209, "eval_runtime": 8.4548, "eval_samples_per_second": 5.559, "eval_steps_per_second": 1.419, "step": 3760 }, { "epoch": 47.00126582278481, "grad_norm": 5.414425849914551, "learning_rate": 5.808720112517582e-06, "loss": 0.0007, "step": 3770 }, { "epoch": 47.00253164556962, "grad_norm": 0.0027215760201215744, "learning_rate": 5.79465541490858e-06, "loss": 0.0002, "step": 3780 }, { "epoch": 47.00379746835443, "grad_norm": 0.15611502528190613, "learning_rate": 5.780590717299579e-06, "loss": 0.0073, "step": 3790 }, { "epoch": 47.00506329113924, "grad_norm": 0.0019717360846698284, "learning_rate": 5.766526019690577e-06, "loss": 0.0003, "step": 3800 }, { "epoch": 47.00632911392405, "grad_norm": 0.005944707430899143, "learning_rate": 5.7524613220815765e-06, "loss": 0.0035, "step": 3810 }, { "epoch": 47.00759493670886, "grad_norm": 0.00346226803958416, "learning_rate": 5.7383966244725745e-06, "loss": 0.0001, "step": 3820 }, { "epoch": 47.00886075949367, "grad_norm": 0.024175411090254784, "learning_rate": 5.724331926863573e-06, "loss": 0.0193, "step": 3830 }, { "epoch": 47.01012658227848, "grad_norm": 0.03984224796295166, "learning_rate": 5.710267229254571e-06, "loss": 0.0001, "step": 3840 }, { "epoch": 47.01012658227848, "eval_accuracy": 0.5319148936170213, "eval_loss": 4.070488929748535, "eval_runtime": 8.5074, "eval_samples_per_second": 5.525, "eval_steps_per_second": 1.411, "step": 3840 }, { "epoch": 48.00126582278481, "grad_norm": 0.012625842355191708, "learning_rate": 5.69620253164557e-06, "loss": 0.0218, "step": 3850 }, { "epoch": 48.00253164556962, "grad_norm": 0.008255310356616974, "learning_rate": 5.682137834036568e-06, "loss": 0.012, "step": 3860 }, { "epoch": 48.00379746835443, "grad_norm": 0.019036876037716866, "learning_rate": 5.668073136427568e-06, "loss": 0.001, "step": 3870 }, { "epoch": 48.00506329113924, "grad_norm": 0.013268685899674892, "learning_rate": 5.654008438818566e-06, "loss": 0.0002, "step": 3880 }, { "epoch": 48.00632911392405, "grad_norm": 0.01589319296181202, "learning_rate": 5.639943741209565e-06, "loss": 0.0756, "step": 3890 }, { "epoch": 48.00759493670886, "grad_norm": 0.0036217246670275927, "learning_rate": 5.6258790436005626e-06, "loss": 0.1435, "step": 3900 }, { "epoch": 48.00886075949367, "grad_norm": 0.002351459814235568, "learning_rate": 5.611814345991562e-06, "loss": 0.0001, "step": 3910 }, { "epoch": 48.01012658227848, "grad_norm": 0.0023628135677427053, "learning_rate": 5.59774964838256e-06, "loss": 0.0001, "step": 3920 }, { "epoch": 48.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.55861234664917, "eval_runtime": 8.4995, "eval_samples_per_second": 5.53, "eval_steps_per_second": 1.412, "step": 3920 }, { "epoch": 49.00126582278481, "grad_norm": 0.003683815710246563, "learning_rate": 5.583684950773559e-06, "loss": 0.0001, "step": 3930 }, { "epoch": 49.00253164556962, "grad_norm": 0.004656449891626835, "learning_rate": 5.569620253164557e-06, "loss": 0.0001, "step": 3940 }, { "epoch": 49.00379746835443, "grad_norm": 0.012214281596243382, "learning_rate": 5.555555555555557e-06, "loss": 0.0052, "step": 3950 }, { "epoch": 49.00506329113924, "grad_norm": 0.009479358792304993, "learning_rate": 5.541490857946555e-06, "loss": 0.012, "step": 3960 }, { "epoch": 49.00632911392405, "grad_norm": 0.008819793350994587, "learning_rate": 5.5274261603375535e-06, "loss": 0.0001, "step": 3970 }, { "epoch": 49.00759493670886, "grad_norm": 0.04174829646945, "learning_rate": 5.5133614627285514e-06, "loss": 0.0001, "step": 3980 }, { "epoch": 49.00886075949367, "grad_norm": 0.0032157686073333025, "learning_rate": 5.49929676511955e-06, "loss": 0.1845, "step": 3990 }, { "epoch": 49.01012658227848, "grad_norm": 0.010618672706186771, "learning_rate": 5.485232067510548e-06, "loss": 0.0266, "step": 4000 }, { "epoch": 49.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.947876453399658, "eval_runtime": 8.5288, "eval_samples_per_second": 5.511, "eval_steps_per_second": 1.407, "step": 4000 }, { "epoch": 50.00126582278481, "grad_norm": 0.056022197008132935, "learning_rate": 5.471167369901548e-06, "loss": 0.2505, "step": 4010 }, { "epoch": 50.00253164556962, "grad_norm": 0.004495659377425909, "learning_rate": 5.457102672292546e-06, "loss": 0.053, "step": 4020 }, { "epoch": 50.00379746835443, "grad_norm": 0.003035302273929119, "learning_rate": 5.443037974683545e-06, "loss": 0.0001, "step": 4030 }, { "epoch": 50.00506329113924, "grad_norm": 0.006570629775524139, "learning_rate": 5.428973277074543e-06, "loss": 0.0002, "step": 4040 }, { "epoch": 50.00632911392405, "grad_norm": 323.4715881347656, "learning_rate": 5.414908579465542e-06, "loss": 0.055, "step": 4050 }, { "epoch": 50.00759493670886, "grad_norm": 0.002824920229613781, "learning_rate": 5.40084388185654e-06, "loss": 0.0001, "step": 4060 }, { "epoch": 50.00886075949367, "grad_norm": 0.06357023864984512, "learning_rate": 5.386779184247539e-06, "loss": 0.0001, "step": 4070 }, { "epoch": 50.01012658227848, "grad_norm": 0.004729899112135172, "learning_rate": 5.372714486638537e-06, "loss": 0.0001, "step": 4080 }, { "epoch": 50.01012658227848, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.32703161239624, "eval_runtime": 8.4715, "eval_samples_per_second": 5.548, "eval_steps_per_second": 1.417, "step": 4080 }, { "epoch": 51.00126582278481, "grad_norm": 0.008333638310432434, "learning_rate": 5.358649789029536e-06, "loss": 0.0002, "step": 4090 }, { "epoch": 51.00253164556962, "grad_norm": 0.009458293206989765, "learning_rate": 5.344585091420535e-06, "loss": 0.0001, "step": 4100 }, { "epoch": 51.00379746835443, "grad_norm": 0.0024418376851826906, "learning_rate": 5.330520393811534e-06, "loss": 0.0002, "step": 4110 }, { "epoch": 51.00506329113924, "grad_norm": 0.004669019021093845, "learning_rate": 5.3164556962025316e-06, "loss": 0.0001, "step": 4120 }, { "epoch": 51.00632911392405, "grad_norm": 0.003113614860922098, "learning_rate": 5.30239099859353e-06, "loss": 0.0012, "step": 4130 }, { "epoch": 51.00759493670886, "grad_norm": 0.003157148603349924, "learning_rate": 5.28832630098453e-06, "loss": 0.0001, "step": 4140 }, { "epoch": 51.00886075949367, "grad_norm": 0.004666858818382025, "learning_rate": 5.274261603375528e-06, "loss": 0.1445, "step": 4150 }, { "epoch": 51.01012658227848, "grad_norm": 0.002661221195012331, "learning_rate": 5.260196905766527e-06, "loss": 0.1307, "step": 4160 }, { "epoch": 51.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 4.794792175292969, "eval_runtime": 8.4848, "eval_samples_per_second": 5.539, "eval_steps_per_second": 1.414, "step": 4160 }, { "epoch": 52.00126582278481, "grad_norm": 0.001836895477026701, "learning_rate": 5.246132208157525e-06, "loss": 0.0002, "step": 4170 }, { "epoch": 52.00253164556962, "grad_norm": 2.3909878730773926, "learning_rate": 5.2320675105485245e-06, "loss": 0.0084, "step": 4180 }, { "epoch": 52.00379746835443, "grad_norm": 0.0022460331674665213, "learning_rate": 5.2180028129395225e-06, "loss": 0.0005, "step": 4190 }, { "epoch": 52.00506329113924, "grad_norm": 0.7268118858337402, "learning_rate": 5.203938115330521e-06, "loss": 0.0008, "step": 4200 }, { "epoch": 52.00632911392405, "grad_norm": 0.0033825428690761328, "learning_rate": 5.189873417721519e-06, "loss": 0.0001, "step": 4210 }, { "epoch": 52.00759493670886, "grad_norm": 0.006189883220940828, "learning_rate": 5.175808720112518e-06, "loss": 0.0001, "step": 4220 }, { "epoch": 52.00886075949367, "grad_norm": 0.006958500016480684, "learning_rate": 5.161744022503516e-06, "loss": 0.0002, "step": 4230 }, { "epoch": 52.01012658227848, "grad_norm": 0.003031873842701316, "learning_rate": 5.147679324894516e-06, "loss": 0.0019, "step": 4240 }, { "epoch": 52.01012658227848, "eval_accuracy": 0.3617021276595745, "eval_loss": 4.363827705383301, "eval_runtime": 8.6439, "eval_samples_per_second": 5.437, "eval_steps_per_second": 1.388, "step": 4240 }, { "epoch": 53.00126582278481, "grad_norm": 0.09558191895484924, "learning_rate": 5.133614627285514e-06, "loss": 0.0001, "step": 4250 }, { "epoch": 53.00253164556962, "grad_norm": 0.002434414578601718, "learning_rate": 5.1195499296765125e-06, "loss": 0.0007, "step": 4260 }, { "epoch": 53.00379746835443, "grad_norm": 0.018281536176800728, "learning_rate": 5.1054852320675105e-06, "loss": 0.0004, "step": 4270 }, { "epoch": 53.00506329113924, "grad_norm": 0.003481107298284769, "learning_rate": 5.09142053445851e-06, "loss": 0.0001, "step": 4280 }, { "epoch": 53.00632911392405, "grad_norm": 0.0011617491254583001, "learning_rate": 5.077355836849508e-06, "loss": 0.0001, "step": 4290 }, { "epoch": 53.00759493670886, "grad_norm": 0.007551996968686581, "learning_rate": 5.063291139240507e-06, "loss": 0.0001, "step": 4300 }, { "epoch": 53.00886075949367, "grad_norm": 0.003541940590366721, "learning_rate": 5.049226441631505e-06, "loss": 0.0001, "step": 4310 }, { "epoch": 53.01012658227848, "grad_norm": 0.0031788817141205072, "learning_rate": 5.035161744022505e-06, "loss": 0.0001, "step": 4320 }, { "epoch": 53.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.586310386657715, "eval_runtime": 8.4934, "eval_samples_per_second": 5.534, "eval_steps_per_second": 1.413, "step": 4320 }, { "epoch": 54.00126582278481, "grad_norm": 0.0024340234231203794, "learning_rate": 5.021097046413503e-06, "loss": 0.0001, "step": 4330 }, { "epoch": 54.00253164556962, "grad_norm": 0.0034480541944503784, "learning_rate": 5.007032348804501e-06, "loss": 0.0001, "step": 4340 }, { "epoch": 54.00379746835443, "grad_norm": 0.0023180190473794937, "learning_rate": 4.9929676511955e-06, "loss": 0.0001, "step": 4350 }, { "epoch": 54.00506329113924, "grad_norm": 0.0015061123995110393, "learning_rate": 4.978902953586498e-06, "loss": 0.0001, "step": 4360 }, { "epoch": 54.00632911392405, "grad_norm": 0.013990904204547405, "learning_rate": 4.964838255977497e-06, "loss": 0.0001, "step": 4370 }, { "epoch": 54.00759493670886, "grad_norm": 0.0013285009190440178, "learning_rate": 4.950773558368496e-06, "loss": 0.0001, "step": 4380 }, { "epoch": 54.00886075949367, "grad_norm": 0.00343449623323977, "learning_rate": 4.936708860759495e-06, "loss": 0.0001, "step": 4390 }, { "epoch": 54.01012658227848, "grad_norm": 0.003100321162492037, "learning_rate": 4.922644163150493e-06, "loss": 0.0001, "step": 4400 }, { "epoch": 54.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.737309455871582, "eval_runtime": 8.4983, "eval_samples_per_second": 5.53, "eval_steps_per_second": 1.412, "step": 4400 }, { "epoch": 55.00126582278481, "grad_norm": 0.0022656081710010767, "learning_rate": 4.9085794655414915e-06, "loss": 0.0001, "step": 4410 }, { "epoch": 55.00253164556962, "grad_norm": 0.002674259478226304, "learning_rate": 4.89451476793249e-06, "loss": 0.0001, "step": 4420 }, { "epoch": 55.00379746835443, "grad_norm": 0.0027046040631830692, "learning_rate": 4.880450070323488e-06, "loss": 0.0024, "step": 4430 }, { "epoch": 55.00506329113924, "grad_norm": 0.003956619184464216, "learning_rate": 4.866385372714487e-06, "loss": 0.0001, "step": 4440 }, { "epoch": 55.00632911392405, "grad_norm": 0.03139903396368027, "learning_rate": 4.852320675105486e-06, "loss": 0.0742, "step": 4450 }, { "epoch": 55.00759493670886, "grad_norm": 0.00574122928082943, "learning_rate": 4.838255977496485e-06, "loss": 0.2666, "step": 4460 }, { "epoch": 55.00886075949367, "grad_norm": 0.012300165370106697, "learning_rate": 4.824191279887483e-06, "loss": 0.0001, "step": 4470 }, { "epoch": 55.01012658227848, "grad_norm": 5.114750385284424, "learning_rate": 4.8101265822784815e-06, "loss": 0.0006, "step": 4480 }, { "epoch": 55.01012658227848, "eval_accuracy": 0.44680851063829785, "eval_loss": 3.9066474437713623, "eval_runtime": 8.5347, "eval_samples_per_second": 5.507, "eval_steps_per_second": 1.406, "step": 4480 }, { "epoch": 56.00126582278481, "grad_norm": 11.49494457244873, "learning_rate": 4.79606188466948e-06, "loss": 0.0011, "step": 4490 }, { "epoch": 56.00253164556962, "grad_norm": 0.005774380173534155, "learning_rate": 4.781997187060478e-06, "loss": 0.0001, "step": 4500 }, { "epoch": 56.00379746835443, "grad_norm": 0.006357602309435606, "learning_rate": 4.767932489451477e-06, "loss": 0.0001, "step": 4510 }, { "epoch": 56.00506329113924, "grad_norm": 0.002659859601408243, "learning_rate": 4.753867791842476e-06, "loss": 0.0017, "step": 4520 }, { "epoch": 56.00632911392405, "grad_norm": 0.013889423571527004, "learning_rate": 4.739803094233475e-06, "loss": 0.0001, "step": 4530 }, { "epoch": 56.00759493670886, "grad_norm": 0.002410717075690627, "learning_rate": 4.725738396624473e-06, "loss": 0.0001, "step": 4540 }, { "epoch": 56.00886075949367, "grad_norm": 0.0023062098771333694, "learning_rate": 4.711673699015472e-06, "loss": 0.0001, "step": 4550 }, { "epoch": 56.01012658227848, "grad_norm": 0.0023318820167332888, "learning_rate": 4.6976090014064704e-06, "loss": 0.0001, "step": 4560 }, { "epoch": 56.01012658227848, "eval_accuracy": 0.46808510638297873, "eval_loss": 4.031365394592285, "eval_runtime": 8.457, "eval_samples_per_second": 5.558, "eval_steps_per_second": 1.419, "step": 4560 }, { "epoch": 57.00126582278481, "grad_norm": 0.004980940837413073, "learning_rate": 4.683544303797468e-06, "loss": 0.0001, "step": 4570 }, { "epoch": 57.00253164556962, "grad_norm": 0.0019251375924795866, "learning_rate": 4.669479606188467e-06, "loss": 0.0001, "step": 4580 }, { "epoch": 57.00379746835443, "grad_norm": 0.0028012413531541824, "learning_rate": 4.655414908579466e-06, "loss": 0.0174, "step": 4590 }, { "epoch": 57.00506329113924, "grad_norm": 143.03387451171875, "learning_rate": 4.641350210970465e-06, "loss": 0.0119, "step": 4600 }, { "epoch": 57.00632911392405, "grad_norm": 0.003186359303072095, "learning_rate": 4.627285513361463e-06, "loss": 0.0001, "step": 4610 }, { "epoch": 57.00759493670886, "grad_norm": 0.11152984201908112, "learning_rate": 4.613220815752462e-06, "loss": 0.0037, "step": 4620 }, { "epoch": 57.00886075949367, "grad_norm": 0.001349453697912395, "learning_rate": 4.5991561181434605e-06, "loss": 0.1545, "step": 4630 }, { "epoch": 57.01012658227848, "grad_norm": 0.05059582367539406, "learning_rate": 4.5850914205344585e-06, "loss": 0.0001, "step": 4640 }, { "epoch": 57.01012658227848, "eval_accuracy": 0.5106382978723404, "eval_loss": 4.058121204376221, "eval_runtime": 8.491, "eval_samples_per_second": 5.535, "eval_steps_per_second": 1.413, "step": 4640 }, { "epoch": 58.00126582278481, "grad_norm": 0.003746249247342348, "learning_rate": 4.571026722925457e-06, "loss": 0.0, "step": 4650 }, { "epoch": 58.00253164556962, "grad_norm": 0.0019692752975970507, "learning_rate": 4.556962025316456e-06, "loss": 0.0001, "step": 4660 }, { "epoch": 58.00379746835443, "grad_norm": 0.002934554358944297, "learning_rate": 4.542897327707454e-06, "loss": 0.0001, "step": 4670 }, { "epoch": 58.00506329113924, "grad_norm": 0.005108493380248547, "learning_rate": 4.528832630098453e-06, "loss": 0.0402, "step": 4680 }, { "epoch": 58.00632911392405, "grad_norm": 0.004260794725269079, "learning_rate": 4.514767932489452e-06, "loss": 0.0006, "step": 4690 }, { "epoch": 58.00759493670886, "grad_norm": 0.06016235053539276, "learning_rate": 4.5007032348804506e-06, "loss": 0.0004, "step": 4700 }, { "epoch": 58.00886075949367, "grad_norm": 0.0011827549897134304, "learning_rate": 4.4866385372714485e-06, "loss": 0.114, "step": 4710 }, { "epoch": 58.01012658227848, "grad_norm": 0.002215220592916012, "learning_rate": 4.472573839662447e-06, "loss": 0.0001, "step": 4720 }, { "epoch": 58.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 5.004458904266357, "eval_runtime": 8.4472, "eval_samples_per_second": 5.564, "eval_steps_per_second": 1.421, "step": 4720 }, { "epoch": 59.00126582278481, "grad_norm": 0.004406394902616739, "learning_rate": 4.458509142053446e-06, "loss": 0.0005, "step": 4730 }, { "epoch": 59.00253164556962, "grad_norm": 0.002640231978148222, "learning_rate": 4.444444444444444e-06, "loss": 0.0001, "step": 4740 }, { "epoch": 59.00379746835443, "grad_norm": 0.0029783693607896566, "learning_rate": 4.430379746835443e-06, "loss": 0.0001, "step": 4750 }, { "epoch": 59.00506329113924, "grad_norm": 0.013541797176003456, "learning_rate": 4.416315049226442e-06, "loss": 0.0004, "step": 4760 }, { "epoch": 59.00632911392405, "grad_norm": 0.0070534199476242065, "learning_rate": 4.402250351617441e-06, "loss": 0.0002, "step": 4770 }, { "epoch": 59.00759493670886, "grad_norm": 0.0034858768340200186, "learning_rate": 4.3881856540084394e-06, "loss": 0.0001, "step": 4780 }, { "epoch": 59.00886075949367, "grad_norm": 0.00406244769692421, "learning_rate": 4.374120956399438e-06, "loss": 0.0, "step": 4790 }, { "epoch": 59.01012658227848, "grad_norm": 0.0017109077889472246, "learning_rate": 4.360056258790436e-06, "loss": 0.0001, "step": 4800 }, { "epoch": 59.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.089483737945557, "eval_runtime": 8.467, "eval_samples_per_second": 5.551, "eval_steps_per_second": 1.417, "step": 4800 }, { "epoch": 60.00126582278481, "grad_norm": 0.0024315589107573032, "learning_rate": 4.345991561181435e-06, "loss": 0.0, "step": 4810 }, { "epoch": 60.00253164556962, "grad_norm": 0.0012313745683059096, "learning_rate": 4.331926863572434e-06, "loss": 0.0001, "step": 4820 }, { "epoch": 60.00379746835443, "grad_norm": 0.0019479021430015564, "learning_rate": 4.317862165963433e-06, "loss": 0.0004, "step": 4830 }, { "epoch": 60.00506329113924, "grad_norm": 0.00494040735065937, "learning_rate": 4.303797468354431e-06, "loss": 0.0872, "step": 4840 }, { "epoch": 60.00632911392405, "grad_norm": 0.012567605823278427, "learning_rate": 4.2897327707454295e-06, "loss": 0.0011, "step": 4850 }, { "epoch": 60.00759493670886, "grad_norm": 0.002357608638703823, "learning_rate": 4.275668073136428e-06, "loss": 0.0296, "step": 4860 }, { "epoch": 60.00886075949367, "grad_norm": 0.0030609623063355684, "learning_rate": 4.261603375527426e-06, "loss": 0.0243, "step": 4870 }, { "epoch": 60.01012658227848, "grad_norm": 0.0016012099804356694, "learning_rate": 4.247538677918425e-06, "loss": 0.0713, "step": 4880 }, { "epoch": 60.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 5.042915344238281, "eval_runtime": 8.491, "eval_samples_per_second": 5.535, "eval_steps_per_second": 1.413, "step": 4880 }, { "epoch": 61.00126582278481, "grad_norm": 0.004251533187925816, "learning_rate": 4.233473980309424e-06, "loss": 0.0766, "step": 4890 }, { "epoch": 61.00253164556962, "grad_norm": 0.0019293460063636303, "learning_rate": 4.219409282700423e-06, "loss": 0.0006, "step": 4900 }, { "epoch": 61.00379746835443, "grad_norm": 0.004420694895088673, "learning_rate": 4.205344585091421e-06, "loss": 0.0567, "step": 4910 }, { "epoch": 61.00506329113924, "grad_norm": 0.2990714907646179, "learning_rate": 4.1912798874824196e-06, "loss": 0.0274, "step": 4920 }, { "epoch": 61.00632911392405, "grad_norm": 0.004468689672648907, "learning_rate": 4.177215189873418e-06, "loss": 0.0001, "step": 4930 }, { "epoch": 61.00759493670886, "grad_norm": 0.004564017057418823, "learning_rate": 4.163150492264416e-06, "loss": 0.2423, "step": 4940 }, { "epoch": 61.00886075949367, "grad_norm": 93.79319763183594, "learning_rate": 4.149085794655415e-06, "loss": 0.0517, "step": 4950 }, { "epoch": 61.01012658227848, "grad_norm": 0.004615637473762035, "learning_rate": 4.135021097046414e-06, "loss": 0.0017, "step": 4960 }, { "epoch": 61.01012658227848, "eval_accuracy": 0.425531914893617, "eval_loss": 4.786965370178223, "eval_runtime": 8.4752, "eval_samples_per_second": 5.546, "eval_steps_per_second": 1.416, "step": 4960 }, { "epoch": 62.00126582278481, "grad_norm": 0.061868444085121155, "learning_rate": 4.120956399437413e-06, "loss": 0.0001, "step": 4970 }, { "epoch": 62.00253164556962, "grad_norm": 0.006057640537619591, "learning_rate": 4.106891701828411e-06, "loss": 0.0013, "step": 4980 }, { "epoch": 62.00379746835443, "grad_norm": 4.027284145355225, "learning_rate": 4.09282700421941e-06, "loss": 0.0004, "step": 4990 }, { "epoch": 62.00506329113924, "grad_norm": 0.00944253709167242, "learning_rate": 4.0787623066104084e-06, "loss": 0.0001, "step": 5000 }, { "epoch": 62.00632911392405, "grad_norm": 0.0035694832913577557, "learning_rate": 4.064697609001406e-06, "loss": 0.0001, "step": 5010 }, { "epoch": 62.00759493670886, "grad_norm": 0.0015398276736959815, "learning_rate": 4.050632911392405e-06, "loss": 0.0671, "step": 5020 }, { "epoch": 62.00886075949367, "grad_norm": 0.002066017361357808, "learning_rate": 4.036568213783404e-06, "loss": 0.0009, "step": 5030 }, { "epoch": 62.01012658227848, "grad_norm": 0.003685934003442526, "learning_rate": 4.022503516174403e-06, "loss": 0.0676, "step": 5040 }, { "epoch": 62.01012658227848, "eval_accuracy": 0.3829787234042553, "eval_loss": 5.095708847045898, "eval_runtime": 8.4925, "eval_samples_per_second": 5.534, "eval_steps_per_second": 1.413, "step": 5040 }, { "epoch": 63.00126582278481, "grad_norm": 0.0065714651718735695, "learning_rate": 4.008438818565401e-06, "loss": 0.0001, "step": 5050 }, { "epoch": 63.00253164556962, "grad_norm": 0.003956567496061325, "learning_rate": 3.9943741209564e-06, "loss": 0.0001, "step": 5060 }, { "epoch": 63.00379746835443, "grad_norm": 0.008157577365636826, "learning_rate": 3.9803094233473985e-06, "loss": 0.0, "step": 5070 }, { "epoch": 63.00506329113924, "grad_norm": 0.0031191923189908266, "learning_rate": 3.9662447257383965e-06, "loss": 0.0, "step": 5080 }, { "epoch": 63.00632911392405, "grad_norm": 0.0020041377283632755, "learning_rate": 3.952180028129395e-06, "loss": 0.0001, "step": 5090 }, { "epoch": 63.00759493670886, "grad_norm": 0.004067094065248966, "learning_rate": 3.938115330520394e-06, "loss": 0.0001, "step": 5100 }, { "epoch": 63.00886075949367, "grad_norm": 0.0008815817418508232, "learning_rate": 3.924050632911393e-06, "loss": 0.0, "step": 5110 }, { "epoch": 63.01012658227848, "grad_norm": 0.008889904245734215, "learning_rate": 3.909985935302391e-06, "loss": 0.0, "step": 5120 }, { "epoch": 63.01012658227848, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.606178283691406, "eval_runtime": 8.459, "eval_samples_per_second": 5.556, "eval_steps_per_second": 1.419, "step": 5120 }, { "epoch": 64.00126582278482, "grad_norm": 0.0015669207787141204, "learning_rate": 3.89592123769339e-06, "loss": 0.0, "step": 5130 }, { "epoch": 64.00253164556962, "grad_norm": 0.0011807240080088377, "learning_rate": 3.8818565400843886e-06, "loss": 0.0, "step": 5140 }, { "epoch": 64.00379746835443, "grad_norm": 0.0015742299146950245, "learning_rate": 3.8677918424753865e-06, "loss": 0.0, "step": 5150 }, { "epoch": 64.00506329113924, "grad_norm": 0.004820580128580332, "learning_rate": 3.853727144866385e-06, "loss": 0.0, "step": 5160 }, { "epoch": 64.00632911392405, "grad_norm": 0.0032741015311330557, "learning_rate": 3.839662447257384e-06, "loss": 0.0008, "step": 5170 }, { "epoch": 64.00759493670886, "grad_norm": 0.0036417359951883554, "learning_rate": 3.825597749648383e-06, "loss": 0.0, "step": 5180 }, { "epoch": 64.00886075949367, "grad_norm": 0.008372402749955654, "learning_rate": 3.811533052039382e-06, "loss": 0.0001, "step": 5190 }, { "epoch": 64.01012658227847, "grad_norm": 0.003397272201254964, "learning_rate": 3.7974683544303802e-06, "loss": 0.0045, "step": 5200 }, { "epoch": 64.01012658227847, "eval_accuracy": 0.3829787234042553, "eval_loss": 5.245887756347656, "eval_runtime": 8.4656, "eval_samples_per_second": 5.552, "eval_steps_per_second": 1.418, "step": 5200 }, { "epoch": 65.00126582278482, "grad_norm": 0.0019356166012585163, "learning_rate": 3.783403656821379e-06, "loss": 0.0014, "step": 5210 }, { "epoch": 65.00253164556962, "grad_norm": 0.0013496861793100834, "learning_rate": 3.7693389592123775e-06, "loss": 0.0267, "step": 5220 }, { "epoch": 65.00379746835443, "grad_norm": 0.04248461872339249, "learning_rate": 3.755274261603376e-06, "loss": 0.0002, "step": 5230 }, { "epoch": 65.00506329113924, "grad_norm": 0.004868203774094582, "learning_rate": 3.7412095639943747e-06, "loss": 0.002, "step": 5240 }, { "epoch": 65.00632911392405, "grad_norm": 0.0008925410802476108, "learning_rate": 3.727144866385373e-06, "loss": 0.0, "step": 5250 }, { "epoch": 65.00759493670886, "grad_norm": 0.0019144342513754964, "learning_rate": 3.713080168776372e-06, "loss": 0.0, "step": 5260 }, { "epoch": 65.00886075949367, "grad_norm": 0.001448463648557663, "learning_rate": 3.6990154711673703e-06, "loss": 0.0001, "step": 5270 }, { "epoch": 65.01012658227847, "grad_norm": 0.0024687196128070354, "learning_rate": 3.684950773558369e-06, "loss": 0.0943, "step": 5280 }, { "epoch": 65.01012658227847, "eval_accuracy": 0.3617021276595745, "eval_loss": 5.0856242179870605, "eval_runtime": 8.4623, "eval_samples_per_second": 5.554, "eval_steps_per_second": 1.418, "step": 5280 }, { "epoch": 66.00126582278482, "grad_norm": 0.0016155457124114037, "learning_rate": 3.6708860759493675e-06, "loss": 0.0001, "step": 5290 }, { "epoch": 66.00253164556962, "grad_norm": 0.17640484869480133, "learning_rate": 3.656821378340366e-06, "loss": 0.0001, "step": 5300 }, { "epoch": 66.00379746835443, "grad_norm": 0.0015737387584522367, "learning_rate": 3.6427566807313647e-06, "loss": 0.0, "step": 5310 }, { "epoch": 66.00506329113924, "grad_norm": 0.015487028285861015, "learning_rate": 3.628691983122363e-06, "loss": 0.0001, "step": 5320 }, { "epoch": 66.00632911392405, "grad_norm": 0.0009026491898111999, "learning_rate": 3.614627285513362e-06, "loss": 0.0143, "step": 5330 }, { "epoch": 66.00759493670886, "grad_norm": 0.0015520367305725813, "learning_rate": 3.6005625879043604e-06, "loss": 0.0048, "step": 5340 }, { "epoch": 66.00886075949367, "grad_norm": 0.007421619724482298, "learning_rate": 3.586497890295359e-06, "loss": 0.0, "step": 5350 }, { "epoch": 66.01012658227847, "grad_norm": 0.0009921834571287036, "learning_rate": 3.5724331926863576e-06, "loss": 0.0002, "step": 5360 }, { "epoch": 66.01012658227847, "eval_accuracy": 0.48936170212765956, "eval_loss": 4.449216365814209, "eval_runtime": 8.455, "eval_samples_per_second": 5.559, "eval_steps_per_second": 1.419, "step": 5360 }, { "epoch": 67.00126582278482, "grad_norm": 0.009655151516199112, "learning_rate": 3.558368495077356e-06, "loss": 0.0001, "step": 5370 }, { "epoch": 67.00253164556962, "grad_norm": 0.004027712158858776, "learning_rate": 3.544303797468355e-06, "loss": 0.0001, "step": 5380 }, { "epoch": 67.00379746835443, "grad_norm": 0.002234363229945302, "learning_rate": 3.530239099859353e-06, "loss": 0.0001, "step": 5390 }, { "epoch": 67.00506329113924, "grad_norm": 0.001890279003418982, "learning_rate": 3.516174402250352e-06, "loss": 0.0001, "step": 5400 }, { "epoch": 67.00632911392405, "grad_norm": 0.0018905351171270013, "learning_rate": 3.5021097046413504e-06, "loss": 0.0, "step": 5410 }, { "epoch": 67.00759493670886, "grad_norm": 0.001428403309546411, "learning_rate": 3.4880450070323492e-06, "loss": 0.0001, "step": 5420 }, { "epoch": 67.00886075949367, "grad_norm": 0.001419195905327797, "learning_rate": 3.4739803094233476e-06, "loss": 0.0924, "step": 5430 }, { "epoch": 67.01012658227847, "grad_norm": 0.0011965942103415728, "learning_rate": 3.459915611814346e-06, "loss": 0.0002, "step": 5440 }, { "epoch": 67.01012658227847, "eval_accuracy": 0.40425531914893614, "eval_loss": 5.17952299118042, "eval_runtime": 8.4394, "eval_samples_per_second": 5.569, "eval_steps_per_second": 1.422, "step": 5440 }, { "epoch": 68.00126582278482, "grad_norm": 0.004659404046833515, "learning_rate": 3.445850914205345e-06, "loss": 0.0001, "step": 5450 }, { "epoch": 68.00253164556962, "grad_norm": 282.0872497558594, "learning_rate": 3.4317862165963433e-06, "loss": 0.0246, "step": 5460 }, { "epoch": 68.00379746835443, "grad_norm": 0.19597108662128448, "learning_rate": 3.417721518987342e-06, "loss": 0.0001, "step": 5470 }, { "epoch": 68.00506329113924, "grad_norm": 0.0020114402286708355, "learning_rate": 3.4036568213783405e-06, "loss": 0.0449, "step": 5480 }, { "epoch": 68.00632911392405, "grad_norm": 0.0017703929916024208, "learning_rate": 3.3895921237693393e-06, "loss": 0.0001, "step": 5490 }, { "epoch": 68.00759493670886, "grad_norm": 0.005612295586615801, "learning_rate": 3.3755274261603377e-06, "loss": 0.0, "step": 5500 }, { "epoch": 68.00886075949367, "grad_norm": 0.002703710226342082, "learning_rate": 3.361462728551336e-06, "loss": 0.0001, "step": 5510 }, { "epoch": 68.01012658227847, "grad_norm": 0.0033236260060220957, "learning_rate": 3.347398030942335e-06, "loss": 0.0007, "step": 5520 }, { "epoch": 68.01012658227847, "eval_accuracy": 0.46808510638297873, "eval_loss": 4.3201751708984375, "eval_runtime": 8.4512, "eval_samples_per_second": 5.561, "eval_steps_per_second": 1.42, "step": 5520 }, { "epoch": 69.00126582278482, "grad_norm": 0.13362517952919006, "learning_rate": 3.3333333333333333e-06, "loss": 0.0001, "step": 5530 }, { "epoch": 69.00253164556962, "grad_norm": 0.0022546211257576942, "learning_rate": 3.319268635724332e-06, "loss": 0.1001, "step": 5540 }, { "epoch": 69.00379746835443, "grad_norm": 0.002193002263084054, "learning_rate": 3.3052039381153305e-06, "loss": 0.0, "step": 5550 }, { "epoch": 69.00506329113924, "grad_norm": 0.0027829715982079506, "learning_rate": 3.2911392405063294e-06, "loss": 0.0061, "step": 5560 }, { "epoch": 69.00632911392405, "grad_norm": 0.0031120802741497755, "learning_rate": 3.2770745428973278e-06, "loss": 0.0001, "step": 5570 }, { "epoch": 69.00759493670886, "grad_norm": 0.001309010898694396, "learning_rate": 3.263009845288326e-06, "loss": 0.0001, "step": 5580 }, { "epoch": 69.00886075949367, "grad_norm": 0.028802473098039627, "learning_rate": 3.248945147679325e-06, "loss": 0.004, "step": 5590 }, { "epoch": 69.01012658227847, "grad_norm": 0.0005848377477377653, "learning_rate": 3.2348804500703242e-06, "loss": 0.1678, "step": 5600 }, { "epoch": 69.01012658227847, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.868789196014404, "eval_runtime": 8.4887, "eval_samples_per_second": 5.537, "eval_steps_per_second": 1.414, "step": 5600 }, { "epoch": 70.00126582278482, "grad_norm": 0.0012933706166222692, "learning_rate": 3.2208157524613226e-06, "loss": 0.0002, "step": 5610 }, { "epoch": 70.00253164556962, "grad_norm": 0.02926361933350563, "learning_rate": 3.206751054852321e-06, "loss": 0.0041, "step": 5620 }, { "epoch": 70.00379746835443, "grad_norm": 0.001349663594737649, "learning_rate": 3.19268635724332e-06, "loss": 0.0003, "step": 5630 }, { "epoch": 70.00506329113924, "grad_norm": 0.002003163332119584, "learning_rate": 3.1786216596343183e-06, "loss": 0.0, "step": 5640 }, { "epoch": 70.00632911392405, "grad_norm": 0.011114409193396568, "learning_rate": 3.164556962025317e-06, "loss": 0.0001, "step": 5650 }, { "epoch": 70.00759493670886, "grad_norm": 0.004937044810503721, "learning_rate": 3.1504922644163155e-06, "loss": 0.0, "step": 5660 }, { "epoch": 70.00886075949367, "grad_norm": 0.01026509702205658, "learning_rate": 3.1364275668073143e-06, "loss": 0.0001, "step": 5670 }, { "epoch": 70.01012658227847, "grad_norm": 0.00430481368675828, "learning_rate": 3.1223628691983127e-06, "loss": 0.0001, "step": 5680 }, { "epoch": 70.01012658227847, "eval_accuracy": 0.40425531914893614, "eval_loss": 5.288034439086914, "eval_runtime": 8.477, "eval_samples_per_second": 5.544, "eval_steps_per_second": 1.416, "step": 5680 }, { "epoch": 71.00126582278482, "grad_norm": 0.0036116482224315405, "learning_rate": 3.108298171589311e-06, "loss": 0.0, "step": 5690 }, { "epoch": 71.00253164556962, "grad_norm": 0.015287871472537518, "learning_rate": 3.09423347398031e-06, "loss": 0.0001, "step": 5700 }, { "epoch": 71.00379746835443, "grad_norm": 0.008669455535709858, "learning_rate": 3.0801687763713083e-06, "loss": 0.0, "step": 5710 }, { "epoch": 71.00506329113924, "grad_norm": 0.019757656380534172, "learning_rate": 3.066104078762307e-06, "loss": 0.0001, "step": 5720 }, { "epoch": 71.00632911392405, "grad_norm": 0.012890863232314587, "learning_rate": 3.0520393811533055e-06, "loss": 0.0, "step": 5730 }, { "epoch": 71.00759493670886, "grad_norm": 0.0019587704446166754, "learning_rate": 3.037974683544304e-06, "loss": 0.0, "step": 5740 }, { "epoch": 71.00886075949367, "grad_norm": 0.0018708609277382493, "learning_rate": 3.0239099859353028e-06, "loss": 0.0, "step": 5750 }, { "epoch": 71.01012658227847, "grad_norm": 0.003253462491557002, "learning_rate": 3.009845288326301e-06, "loss": 0.0, "step": 5760 }, { "epoch": 71.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 5.115118503570557, "eval_runtime": 8.4566, "eval_samples_per_second": 5.558, "eval_steps_per_second": 1.419, "step": 5760 }, { "epoch": 72.00126582278482, "grad_norm": 0.001443624496459961, "learning_rate": 2.9957805907173e-06, "loss": 0.0, "step": 5770 }, { "epoch": 72.00253164556962, "grad_norm": 0.0025624725967645645, "learning_rate": 2.9817158931082984e-06, "loss": 0.0026, "step": 5780 }, { "epoch": 72.00379746835443, "grad_norm": 0.00680403271690011, "learning_rate": 2.967651195499297e-06, "loss": 0.0, "step": 5790 }, { "epoch": 72.00506329113924, "grad_norm": 0.0030975525733083487, "learning_rate": 2.9535864978902956e-06, "loss": 0.0, "step": 5800 }, { "epoch": 72.00632911392405, "grad_norm": 0.019320061430335045, "learning_rate": 2.939521800281294e-06, "loss": 0.2163, "step": 5810 }, { "epoch": 72.00759493670886, "grad_norm": 0.07163013517856598, "learning_rate": 2.925457102672293e-06, "loss": 0.0001, "step": 5820 }, { "epoch": 72.00886075949367, "grad_norm": 0.0038794104475528, "learning_rate": 2.9113924050632912e-06, "loss": 0.1065, "step": 5830 }, { "epoch": 72.01012658227847, "grad_norm": 0.0027189133688807487, "learning_rate": 2.89732770745429e-06, "loss": 0.0005, "step": 5840 }, { "epoch": 72.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.566655158996582, "eval_runtime": 8.4516, "eval_samples_per_second": 5.561, "eval_steps_per_second": 1.42, "step": 5840 }, { "epoch": 73.00126582278482, "grad_norm": 0.0017171819927170873, "learning_rate": 2.8832630098452884e-06, "loss": 0.0001, "step": 5850 }, { "epoch": 73.00253164556962, "grad_norm": 0.0033329855650663376, "learning_rate": 2.8691983122362873e-06, "loss": 0.0, "step": 5860 }, { "epoch": 73.00379746835443, "grad_norm": 0.008366705849766731, "learning_rate": 2.8551336146272857e-06, "loss": 0.0001, "step": 5870 }, { "epoch": 73.00506329113924, "grad_norm": 0.0013916816096752882, "learning_rate": 2.841068917018284e-06, "loss": 0.0001, "step": 5880 }, { "epoch": 73.00632911392405, "grad_norm": 0.001828556414693594, "learning_rate": 2.827004219409283e-06, "loss": 0.0, "step": 5890 }, { "epoch": 73.00759493670886, "grad_norm": 0.002120325807482004, "learning_rate": 2.8129395218002813e-06, "loss": 0.0, "step": 5900 }, { "epoch": 73.00886075949367, "grad_norm": 0.0018465804168954492, "learning_rate": 2.79887482419128e-06, "loss": 0.0, "step": 5910 }, { "epoch": 73.01012658227847, "grad_norm": 0.0017947384621948004, "learning_rate": 2.7848101265822785e-06, "loss": 0.0, "step": 5920 }, { "epoch": 73.01012658227847, "eval_accuracy": 0.46808510638297873, "eval_loss": 4.288333415985107, "eval_runtime": 8.4999, "eval_samples_per_second": 5.529, "eval_steps_per_second": 1.412, "step": 5920 }, { "epoch": 74.00126582278482, "grad_norm": 0.0011507548624649644, "learning_rate": 2.7707454289732773e-06, "loss": 0.0001, "step": 5930 }, { "epoch": 74.00253164556962, "grad_norm": 0.001057163462974131, "learning_rate": 2.7566807313642757e-06, "loss": 0.0002, "step": 5940 }, { "epoch": 74.00379746835443, "grad_norm": 0.001940654474310577, "learning_rate": 2.742616033755274e-06, "loss": 0.0001, "step": 5950 }, { "epoch": 74.00506329113924, "grad_norm": 0.013309543952345848, "learning_rate": 2.728551336146273e-06, "loss": 0.1043, "step": 5960 }, { "epoch": 74.00632911392405, "grad_norm": 0.003933705855160952, "learning_rate": 2.7144866385372713e-06, "loss": 0.0, "step": 5970 }, { "epoch": 74.00759493670886, "grad_norm": 0.0016960457433015108, "learning_rate": 2.70042194092827e-06, "loss": 0.0011, "step": 5980 }, { "epoch": 74.00886075949367, "grad_norm": 0.0025782466400414705, "learning_rate": 2.6863572433192686e-06, "loss": 0.0001, "step": 5990 }, { "epoch": 74.01012658227847, "grad_norm": 0.0025307261385023594, "learning_rate": 2.6722925457102674e-06, "loss": 0.0, "step": 6000 }, { "epoch": 74.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.684779167175293, "eval_runtime": 8.5153, "eval_samples_per_second": 5.519, "eval_steps_per_second": 1.409, "step": 6000 }, { "epoch": 75.00126582278482, "grad_norm": 0.0016025023069232702, "learning_rate": 2.6582278481012658e-06, "loss": 0.1562, "step": 6010 }, { "epoch": 75.00253164556962, "grad_norm": 0.0006585910450667143, "learning_rate": 2.644163150492265e-06, "loss": 0.0001, "step": 6020 }, { "epoch": 75.00379746835443, "grad_norm": 0.0031663491390645504, "learning_rate": 2.6300984528832634e-06, "loss": 0.0, "step": 6030 }, { "epoch": 75.00506329113924, "grad_norm": 0.015673287212848663, "learning_rate": 2.6160337552742622e-06, "loss": 0.0, "step": 6040 }, { "epoch": 75.00632911392405, "grad_norm": 0.003231970127671957, "learning_rate": 2.6019690576652606e-06, "loss": 0.0, "step": 6050 }, { "epoch": 75.00759493670886, "grad_norm": 0.0015489223878830671, "learning_rate": 2.587904360056259e-06, "loss": 0.0, "step": 6060 }, { "epoch": 75.00886075949367, "grad_norm": 18.648025512695312, "learning_rate": 2.573839662447258e-06, "loss": 0.0019, "step": 6070 }, { "epoch": 75.01012658227847, "grad_norm": 0.00114185712300241, "learning_rate": 2.5597749648382563e-06, "loss": 0.0, "step": 6080 }, { "epoch": 75.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.815650939941406, "eval_runtime": 8.5452, "eval_samples_per_second": 5.5, "eval_steps_per_second": 1.404, "step": 6080 }, { "epoch": 76.00126582278482, "grad_norm": 0.20001061260700226, "learning_rate": 2.545710267229255e-06, "loss": 0.0001, "step": 6090 }, { "epoch": 76.00253164556962, "grad_norm": 0.002338194055482745, "learning_rate": 2.5316455696202535e-06, "loss": 0.0, "step": 6100 }, { "epoch": 76.00379746835443, "grad_norm": 0.004149795509874821, "learning_rate": 2.5175808720112523e-06, "loss": 0.0, "step": 6110 }, { "epoch": 76.00506329113924, "grad_norm": 0.0017416217597201467, "learning_rate": 2.5035161744022507e-06, "loss": 0.0003, "step": 6120 }, { "epoch": 76.00632911392405, "grad_norm": 0.0011654688278213143, "learning_rate": 2.489451476793249e-06, "loss": 0.0, "step": 6130 }, { "epoch": 76.00759493670886, "grad_norm": 0.0034060347825288773, "learning_rate": 2.475386779184248e-06, "loss": 0.0, "step": 6140 }, { "epoch": 76.00886075949367, "grad_norm": 0.0008188936626538634, "learning_rate": 2.4613220815752463e-06, "loss": 0.0, "step": 6150 }, { "epoch": 76.01012658227847, "grad_norm": 0.0015720352530479431, "learning_rate": 2.447257383966245e-06, "loss": 0.0, "step": 6160 }, { "epoch": 76.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.8248186111450195, "eval_runtime": 8.5505, "eval_samples_per_second": 5.497, "eval_steps_per_second": 1.403, "step": 6160 }, { "epoch": 77.00126582278482, "grad_norm": 0.007548962719738483, "learning_rate": 2.4331926863572436e-06, "loss": 0.0, "step": 6170 }, { "epoch": 77.00253164556962, "grad_norm": 0.00578302051872015, "learning_rate": 2.4191279887482424e-06, "loss": 0.0, "step": 6180 }, { "epoch": 77.00379746835443, "grad_norm": 0.0033245827071368694, "learning_rate": 2.4050632911392408e-06, "loss": 0.0, "step": 6190 }, { "epoch": 77.00506329113924, "grad_norm": 0.0008535035303793848, "learning_rate": 2.390998593530239e-06, "loss": 0.0015, "step": 6200 }, { "epoch": 77.00632911392405, "grad_norm": 0.0016984603134915233, "learning_rate": 2.376933895921238e-06, "loss": 0.0001, "step": 6210 }, { "epoch": 77.00759493670886, "grad_norm": 0.0014544576406478882, "learning_rate": 2.3628691983122364e-06, "loss": 0.0045, "step": 6220 }, { "epoch": 77.00886075949367, "grad_norm": 0.0007344476762227714, "learning_rate": 2.3488045007032352e-06, "loss": 0.0, "step": 6230 }, { "epoch": 77.01012658227847, "grad_norm": 0.014698284678161144, "learning_rate": 2.3347398030942336e-06, "loss": 0.0, "step": 6240 }, { "epoch": 77.01012658227847, "eval_accuracy": 0.48936170212765956, "eval_loss": 4.5635786056518555, "eval_runtime": 8.4512, "eval_samples_per_second": 5.561, "eval_steps_per_second": 1.42, "step": 6240 }, { "epoch": 78.00126582278482, "grad_norm": 0.0009909283835440874, "learning_rate": 2.3206751054852324e-06, "loss": 0.0019, "step": 6250 }, { "epoch": 78.00253164556962, "grad_norm": 0.020323097705841064, "learning_rate": 2.306610407876231e-06, "loss": 0.0, "step": 6260 }, { "epoch": 78.00379746835443, "grad_norm": 0.0027961665764451027, "learning_rate": 2.2925457102672292e-06, "loss": 0.0001, "step": 6270 }, { "epoch": 78.00506329113924, "grad_norm": 0.0009632346336729825, "learning_rate": 2.278481012658228e-06, "loss": 0.0, "step": 6280 }, { "epoch": 78.00632911392405, "grad_norm": 0.005322067067027092, "learning_rate": 2.2644163150492265e-06, "loss": 0.1958, "step": 6290 }, { "epoch": 78.00759493670886, "grad_norm": 0.002847396768629551, "learning_rate": 2.2503516174402253e-06, "loss": 0.0, "step": 6300 }, { "epoch": 78.00886075949367, "grad_norm": 0.002737447852268815, "learning_rate": 2.2362869198312237e-06, "loss": 0.0, "step": 6310 }, { "epoch": 78.01012658227847, "grad_norm": 0.036222778260707855, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "step": 6320 }, { "epoch": 78.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.581666469573975, "eval_runtime": 8.4752, "eval_samples_per_second": 5.546, "eval_steps_per_second": 1.416, "step": 6320 }, { "epoch": 79.00126582278482, "grad_norm": 0.001555649214424193, "learning_rate": 2.208157524613221e-06, "loss": 0.0, "step": 6330 }, { "epoch": 79.00253164556962, "grad_norm": 0.0009841909632086754, "learning_rate": 2.1940928270042197e-06, "loss": 0.0, "step": 6340 }, { "epoch": 79.00379746835443, "grad_norm": 0.0069242678582668304, "learning_rate": 2.180028129395218e-06, "loss": 0.0, "step": 6350 }, { "epoch": 79.00506329113924, "grad_norm": 0.0031804998870939016, "learning_rate": 2.165963431786217e-06, "loss": 0.0, "step": 6360 }, { "epoch": 79.00632911392405, "grad_norm": 0.0010907890973612666, "learning_rate": 2.1518987341772153e-06, "loss": 0.0, "step": 6370 }, { "epoch": 79.00759493670886, "grad_norm": 0.0008229253580793738, "learning_rate": 2.137834036568214e-06, "loss": 0.0001, "step": 6380 }, { "epoch": 79.00886075949367, "grad_norm": 0.004569868091493845, "learning_rate": 2.1237693389592126e-06, "loss": 0.0, "step": 6390 }, { "epoch": 79.01012658227847, "grad_norm": 0.0017135925590991974, "learning_rate": 2.1097046413502114e-06, "loss": 0.0001, "step": 6400 }, { "epoch": 79.01012658227847, "eval_accuracy": 0.3829787234042553, "eval_loss": 4.774336338043213, "eval_runtime": 8.469, "eval_samples_per_second": 5.55, "eval_steps_per_second": 1.417, "step": 6400 }, { "epoch": 80.00126582278482, "grad_norm": 0.0009415835957042873, "learning_rate": 2.0956399437412098e-06, "loss": 0.0, "step": 6410 }, { "epoch": 80.00253164556962, "grad_norm": 0.0011497698724269867, "learning_rate": 2.081575246132208e-06, "loss": 0.0, "step": 6420 }, { "epoch": 80.00379746835443, "grad_norm": 0.0016221057157963514, "learning_rate": 2.067510548523207e-06, "loss": 0.0, "step": 6430 }, { "epoch": 80.00506329113924, "grad_norm": 0.002268084790557623, "learning_rate": 2.0534458509142054e-06, "loss": 0.0, "step": 6440 }, { "epoch": 80.00632911392405, "grad_norm": 0.0011354024754837155, "learning_rate": 2.0393811533052042e-06, "loss": 0.0, "step": 6450 }, { "epoch": 80.00759493670886, "grad_norm": 0.002358433324843645, "learning_rate": 2.0253164556962026e-06, "loss": 0.0004, "step": 6460 }, { "epoch": 80.00886075949367, "grad_norm": 0.0013280572602525353, "learning_rate": 2.0112517580872014e-06, "loss": 0.0, "step": 6470 }, { "epoch": 80.01012658227847, "grad_norm": 0.006725861690938473, "learning_rate": 1.9971870604782e-06, "loss": 0.0001, "step": 6480 }, { "epoch": 80.01012658227847, "eval_accuracy": 0.40425531914893614, "eval_loss": 4.900009632110596, "eval_runtime": 8.4183, "eval_samples_per_second": 5.583, "eval_steps_per_second": 1.425, "step": 6480 }, { "epoch": 81.00126582278482, "grad_norm": 0.0010699324775487185, "learning_rate": 1.9831223628691982e-06, "loss": 0.1826, "step": 6490 }, { "epoch": 81.00253164556962, "grad_norm": 0.006973781157284975, "learning_rate": 1.969057665260197e-06, "loss": 0.0, "step": 6500 }, { "epoch": 81.00379746835443, "grad_norm": 0.003398946486413479, "learning_rate": 1.9549929676511955e-06, "loss": 0.0, "step": 6510 }, { "epoch": 81.00506329113924, "grad_norm": 0.009173160418868065, "learning_rate": 1.9409282700421943e-06, "loss": 0.0005, "step": 6520 }, { "epoch": 81.00632911392405, "grad_norm": 0.07392571866512299, "learning_rate": 1.9268635724331927e-06, "loss": 0.0361, "step": 6530 }, { "epoch": 81.00759493670886, "grad_norm": 0.001574037130922079, "learning_rate": 1.9127988748241915e-06, "loss": 0.0, "step": 6540 }, { "epoch": 81.00886075949367, "grad_norm": 0.0032889668364077806, "learning_rate": 1.8987341772151901e-06, "loss": 0.0, "step": 6550 }, { "epoch": 81.01012658227847, "grad_norm": 0.002083021914586425, "learning_rate": 1.8846694796061887e-06, "loss": 0.0002, "step": 6560 }, { "epoch": 81.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.766859531402588, "eval_runtime": 8.4247, "eval_samples_per_second": 5.579, "eval_steps_per_second": 1.424, "step": 6560 }, { "epoch": 82.00126582278482, "grad_norm": 0.001750531722791493, "learning_rate": 1.8706047819971873e-06, "loss": 0.0, "step": 6570 }, { "epoch": 82.00253164556962, "grad_norm": 0.0014307881938293576, "learning_rate": 1.856540084388186e-06, "loss": 0.0, "step": 6580 }, { "epoch": 82.00379746835443, "grad_norm": 0.0012166056549176574, "learning_rate": 1.8424753867791846e-06, "loss": 0.0, "step": 6590 }, { "epoch": 82.00506329113924, "grad_norm": 0.0018584979698061943, "learning_rate": 1.828410689170183e-06, "loss": 0.0, "step": 6600 }, { "epoch": 82.00632911392405, "grad_norm": 0.0007850803667679429, "learning_rate": 1.8143459915611816e-06, "loss": 0.0, "step": 6610 }, { "epoch": 82.00759493670886, "grad_norm": 0.00409714225679636, "learning_rate": 1.8002812939521802e-06, "loss": 0.0, "step": 6620 }, { "epoch": 82.00886075949367, "grad_norm": 0.0014620538568124175, "learning_rate": 1.7862165963431788e-06, "loss": 0.0, "step": 6630 }, { "epoch": 82.01012658227847, "grad_norm": 0.0011849668808281422, "learning_rate": 1.7721518987341774e-06, "loss": 0.0, "step": 6640 }, { "epoch": 82.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.8224687576293945, "eval_runtime": 8.4754, "eval_samples_per_second": 5.545, "eval_steps_per_second": 1.416, "step": 6640 }, { "epoch": 83.00126582278482, "grad_norm": 0.0014004989061504602, "learning_rate": 1.758087201125176e-06, "loss": 0.0, "step": 6650 }, { "epoch": 83.00253164556962, "grad_norm": 0.0015136294532567263, "learning_rate": 1.7440225035161746e-06, "loss": 0.0, "step": 6660 }, { "epoch": 83.00379746835443, "grad_norm": 0.00371897267177701, "learning_rate": 1.729957805907173e-06, "loss": 0.0, "step": 6670 }, { "epoch": 83.00506329113924, "grad_norm": 0.0008714126888662577, "learning_rate": 1.7158931082981716e-06, "loss": 0.0, "step": 6680 }, { "epoch": 83.00632911392405, "grad_norm": 0.003846103325486183, "learning_rate": 1.7018284106891702e-06, "loss": 0.0, "step": 6690 }, { "epoch": 83.00759493670886, "grad_norm": 0.0013878681929782033, "learning_rate": 1.6877637130801689e-06, "loss": 0.0, "step": 6700 }, { "epoch": 83.00886075949367, "grad_norm": 0.0011597294360399246, "learning_rate": 1.6736990154711675e-06, "loss": 0.0, "step": 6710 }, { "epoch": 83.01012658227847, "grad_norm": 0.0006404958548955619, "learning_rate": 1.659634317862166e-06, "loss": 0.0, "step": 6720 }, { "epoch": 83.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.833099842071533, "eval_runtime": 8.4872, "eval_samples_per_second": 5.538, "eval_steps_per_second": 1.414, "step": 6720 }, { "epoch": 84.00126582278482, "grad_norm": 0.0019264252623543143, "learning_rate": 1.6455696202531647e-06, "loss": 0.0, "step": 6730 }, { "epoch": 84.00253164556962, "grad_norm": 0.001029732171446085, "learning_rate": 1.631504922644163e-06, "loss": 0.0005, "step": 6740 }, { "epoch": 84.00379746835443, "grad_norm": 0.0017922447295859456, "learning_rate": 1.6174402250351621e-06, "loss": 0.0, "step": 6750 }, { "epoch": 84.00506329113924, "grad_norm": 0.0012487670173868537, "learning_rate": 1.6033755274261605e-06, "loss": 0.0, "step": 6760 }, { "epoch": 84.00632911392405, "grad_norm": 0.0014119717525318265, "learning_rate": 1.5893108298171591e-06, "loss": 0.0001, "step": 6770 }, { "epoch": 84.00759493670886, "grad_norm": 0.0006965138600207865, "learning_rate": 1.5752461322081577e-06, "loss": 0.0, "step": 6780 }, { "epoch": 84.00886075949367, "grad_norm": 0.0011428669095039368, "learning_rate": 1.5611814345991563e-06, "loss": 0.0, "step": 6790 }, { "epoch": 84.01012658227847, "grad_norm": 0.002268004696816206, "learning_rate": 1.547116736990155e-06, "loss": 0.0, "step": 6800 }, { "epoch": 84.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.715381145477295, "eval_runtime": 8.3979, "eval_samples_per_second": 5.597, "eval_steps_per_second": 1.429, "step": 6800 }, { "epoch": 85.00126582278482, "grad_norm": 0.002592705423012376, "learning_rate": 1.5330520393811536e-06, "loss": 0.0, "step": 6810 }, { "epoch": 85.00253164556962, "grad_norm": 0.04101519286632538, "learning_rate": 1.518987341772152e-06, "loss": 0.0, "step": 6820 }, { "epoch": 85.00379746835443, "grad_norm": 0.0005958875990472734, "learning_rate": 1.5049226441631506e-06, "loss": 0.0, "step": 6830 }, { "epoch": 85.00506329113924, "grad_norm": 0.0025226089637726545, "learning_rate": 1.4908579465541492e-06, "loss": 0.0, "step": 6840 }, { "epoch": 85.00632911392405, "grad_norm": 0.0008681220351718366, "learning_rate": 1.4767932489451478e-06, "loss": 0.0, "step": 6850 }, { "epoch": 85.00759493670886, "grad_norm": 0.0013401606120169163, "learning_rate": 1.4627285513361464e-06, "loss": 0.0, "step": 6860 }, { "epoch": 85.00886075949367, "grad_norm": 0.0010737047996371984, "learning_rate": 1.448663853727145e-06, "loss": 0.0, "step": 6870 }, { "epoch": 85.01012658227847, "grad_norm": 0.001385514042340219, "learning_rate": 1.4345991561181436e-06, "loss": 0.0, "step": 6880 }, { "epoch": 85.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.716861248016357, "eval_runtime": 8.4473, "eval_samples_per_second": 5.564, "eval_steps_per_second": 1.421, "step": 6880 }, { "epoch": 86.00126582278482, "grad_norm": 0.0020737305749207735, "learning_rate": 1.420534458509142e-06, "loss": 0.0, "step": 6890 }, { "epoch": 86.00253164556962, "grad_norm": 0.0013663348508998752, "learning_rate": 1.4064697609001406e-06, "loss": 0.0001, "step": 6900 }, { "epoch": 86.00379746835443, "grad_norm": 0.01127583533525467, "learning_rate": 1.3924050632911392e-06, "loss": 0.0, "step": 6910 }, { "epoch": 86.00506329113924, "grad_norm": 0.002084192121401429, "learning_rate": 1.3783403656821379e-06, "loss": 0.0, "step": 6920 }, { "epoch": 86.00632911392405, "grad_norm": 0.0009935207199305296, "learning_rate": 1.3642756680731365e-06, "loss": 0.0, "step": 6930 }, { "epoch": 86.00759493670886, "grad_norm": 0.0008619350846856833, "learning_rate": 1.350210970464135e-06, "loss": 0.0, "step": 6940 }, { "epoch": 86.00886075949367, "grad_norm": 0.000807570235338062, "learning_rate": 1.3361462728551337e-06, "loss": 0.0, "step": 6950 }, { "epoch": 86.01012658227847, "grad_norm": 0.001797909731976688, "learning_rate": 1.3220815752461325e-06, "loss": 0.0, "step": 6960 }, { "epoch": 86.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.900410175323486, "eval_runtime": 8.4167, "eval_samples_per_second": 5.584, "eval_steps_per_second": 1.426, "step": 6960 }, { "epoch": 87.00126582278482, "grad_norm": 0.0008401426021009684, "learning_rate": 1.3080168776371311e-06, "loss": 0.0, "step": 6970 }, { "epoch": 87.00253164556962, "grad_norm": 0.0014646663330495358, "learning_rate": 1.2939521800281295e-06, "loss": 0.0, "step": 6980 }, { "epoch": 87.00379746835443, "grad_norm": 0.0010633817873895168, "learning_rate": 1.2798874824191281e-06, "loss": 0.0, "step": 6990 }, { "epoch": 87.00506329113924, "grad_norm": 0.0017103515565395355, "learning_rate": 1.2658227848101267e-06, "loss": 0.0, "step": 7000 }, { "epoch": 87.00632911392405, "grad_norm": 0.001976841827854514, "learning_rate": 1.2517580872011254e-06, "loss": 0.0, "step": 7010 }, { "epoch": 87.00759493670886, "grad_norm": 0.000657711352687329, "learning_rate": 1.237693389592124e-06, "loss": 0.0, "step": 7020 }, { "epoch": 87.00886075949367, "grad_norm": 0.0006206512916833162, "learning_rate": 1.2236286919831226e-06, "loss": 0.0, "step": 7030 }, { "epoch": 87.01012658227847, "grad_norm": 0.0030793757177889347, "learning_rate": 1.2095639943741212e-06, "loss": 0.0, "step": 7040 }, { "epoch": 87.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.909188270568848, "eval_runtime": 8.4164, "eval_samples_per_second": 5.584, "eval_steps_per_second": 1.426, "step": 7040 }, { "epoch": 88.00126582278482, "grad_norm": 0.0011876953067258, "learning_rate": 1.1954992967651196e-06, "loss": 0.0, "step": 7050 }, { "epoch": 88.00253164556962, "grad_norm": 0.0019371965900063515, "learning_rate": 1.1814345991561182e-06, "loss": 0.0, "step": 7060 }, { "epoch": 88.00379746835443, "grad_norm": 0.001290988875553012, "learning_rate": 1.1673699015471168e-06, "loss": 0.0, "step": 7070 }, { "epoch": 88.00506329113924, "grad_norm": 0.003862161422148347, "learning_rate": 1.1533052039381154e-06, "loss": 0.0, "step": 7080 }, { "epoch": 88.00632911392405, "grad_norm": 0.0007267307373695076, "learning_rate": 1.139240506329114e-06, "loss": 0.0, "step": 7090 }, { "epoch": 88.00759493670886, "grad_norm": 0.0007938898052088916, "learning_rate": 1.1251758087201126e-06, "loss": 0.0, "step": 7100 }, { "epoch": 88.00886075949367, "grad_norm": 0.0006018871208652854, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "step": 7110 }, { "epoch": 88.01012658227847, "grad_norm": 0.0017778057372197509, "learning_rate": 1.0970464135021099e-06, "loss": 0.0, "step": 7120 }, { "epoch": 88.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.89414119720459, "eval_runtime": 8.438, "eval_samples_per_second": 5.57, "eval_steps_per_second": 1.422, "step": 7120 }, { "epoch": 89.00126582278482, "grad_norm": 0.0007234832737594843, "learning_rate": 1.0829817158931085e-06, "loss": 0.0, "step": 7130 }, { "epoch": 89.00253164556962, "grad_norm": 0.0015409559709951282, "learning_rate": 1.068917018284107e-06, "loss": 0.0, "step": 7140 }, { "epoch": 89.00379746835443, "grad_norm": 0.0008910736651159823, "learning_rate": 1.0548523206751057e-06, "loss": 0.0, "step": 7150 }, { "epoch": 89.00506329113924, "grad_norm": 0.0020937921945005655, "learning_rate": 1.040787623066104e-06, "loss": 0.0, "step": 7160 }, { "epoch": 89.00632911392405, "grad_norm": 0.0014372824225574732, "learning_rate": 1.0267229254571027e-06, "loss": 0.0, "step": 7170 }, { "epoch": 89.00759493670886, "grad_norm": 0.003179526887834072, "learning_rate": 1.0126582278481013e-06, "loss": 0.0, "step": 7180 }, { "epoch": 89.00886075949367, "grad_norm": 0.0012057056883350015, "learning_rate": 9.985935302391e-07, "loss": 0.0001, "step": 7190 }, { "epoch": 89.01012658227847, "grad_norm": 0.005369286518543959, "learning_rate": 9.845288326300985e-07, "loss": 0.0, "step": 7200 }, { "epoch": 89.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.789796352386475, "eval_runtime": 8.4427, "eval_samples_per_second": 5.567, "eval_steps_per_second": 1.421, "step": 7200 }, { "epoch": 90.00126582278482, "grad_norm": 0.0017162526492029428, "learning_rate": 9.704641350210971e-07, "loss": 0.0, "step": 7210 }, { "epoch": 90.00253164556962, "grad_norm": 0.0009838847909122705, "learning_rate": 9.563994374120958e-07, "loss": 0.0, "step": 7220 }, { "epoch": 90.00379746835443, "grad_norm": 0.015449059195816517, "learning_rate": 9.423347398030944e-07, "loss": 0.0, "step": 7230 }, { "epoch": 90.00506329113924, "grad_norm": 0.0017991637578234076, "learning_rate": 9.28270042194093e-07, "loss": 0.0, "step": 7240 }, { "epoch": 90.00632911392405, "grad_norm": 0.0010769497603178024, "learning_rate": 9.142053445850915e-07, "loss": 0.0, "step": 7250 }, { "epoch": 90.00759493670886, "grad_norm": 0.0007212001946754754, "learning_rate": 9.001406469760901e-07, "loss": 0.0, "step": 7260 }, { "epoch": 90.00886075949367, "grad_norm": 0.000739375944249332, "learning_rate": 8.860759493670887e-07, "loss": 0.0, "step": 7270 }, { "epoch": 90.01012658227847, "grad_norm": 0.002124297898262739, "learning_rate": 8.720112517580873e-07, "loss": 0.0, "step": 7280 }, { "epoch": 90.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.827134132385254, "eval_runtime": 8.4657, "eval_samples_per_second": 5.552, "eval_steps_per_second": 1.417, "step": 7280 }, { "epoch": 91.00126582278482, "grad_norm": 0.00743636442348361, "learning_rate": 8.579465541490858e-07, "loss": 0.0, "step": 7290 }, { "epoch": 91.00253164556962, "grad_norm": 0.001242807717062533, "learning_rate": 8.438818565400844e-07, "loss": 0.0, "step": 7300 }, { "epoch": 91.00379746835443, "grad_norm": 0.017530538141727448, "learning_rate": 8.29817158931083e-07, "loss": 0.0, "step": 7310 }, { "epoch": 91.00506329113924, "grad_norm": 0.0027876682579517365, "learning_rate": 8.157524613220815e-07, "loss": 0.0, "step": 7320 }, { "epoch": 91.00632911392405, "grad_norm": 0.001038099406287074, "learning_rate": 8.016877637130803e-07, "loss": 0.0, "step": 7330 }, { "epoch": 91.00759493670886, "grad_norm": 0.0012997939484193921, "learning_rate": 7.876230661040789e-07, "loss": 0.0, "step": 7340 }, { "epoch": 91.00886075949367, "grad_norm": 0.00228080153465271, "learning_rate": 7.735583684950775e-07, "loss": 0.0, "step": 7350 }, { "epoch": 91.01012658227847, "grad_norm": 0.0014501850819215178, "learning_rate": 7.59493670886076e-07, "loss": 0.0, "step": 7360 }, { "epoch": 91.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.832017421722412, "eval_runtime": 8.6255, "eval_samples_per_second": 5.449, "eval_steps_per_second": 1.391, "step": 7360 }, { "epoch": 92.00126582278482, "grad_norm": 0.0007885429658927023, "learning_rate": 7.454289732770746e-07, "loss": 0.0, "step": 7370 }, { "epoch": 92.00253164556962, "grad_norm": 0.0009592593996785581, "learning_rate": 7.313642756680732e-07, "loss": 0.0, "step": 7380 }, { "epoch": 92.00379746835443, "grad_norm": 0.004812302067875862, "learning_rate": 7.172995780590718e-07, "loss": 0.0, "step": 7390 }, { "epoch": 92.00506329113924, "grad_norm": 0.0012065304908901453, "learning_rate": 7.032348804500703e-07, "loss": 0.0, "step": 7400 }, { "epoch": 92.00632911392405, "grad_norm": 0.0025038751773536205, "learning_rate": 6.891701828410689e-07, "loss": 0.0, "step": 7410 }, { "epoch": 92.00759493670886, "grad_norm": 0.0007439328473992646, "learning_rate": 6.751054852320675e-07, "loss": 0.0, "step": 7420 }, { "epoch": 92.00886075949367, "grad_norm": 0.040091563016176224, "learning_rate": 6.610407876230663e-07, "loss": 0.0, "step": 7430 }, { "epoch": 92.01012658227847, "grad_norm": 0.00362952146679163, "learning_rate": 6.469760900140648e-07, "loss": 0.0, "step": 7440 }, { "epoch": 92.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.827417850494385, "eval_runtime": 8.4772, "eval_samples_per_second": 5.544, "eval_steps_per_second": 1.416, "step": 7440 }, { "epoch": 93.00126582278482, "grad_norm": 0.0022241012193262577, "learning_rate": 6.329113924050634e-07, "loss": 0.0, "step": 7450 }, { "epoch": 93.00253164556962, "grad_norm": 0.025551510974764824, "learning_rate": 6.18846694796062e-07, "loss": 0.0, "step": 7460 }, { "epoch": 93.00379746835443, "grad_norm": 0.00078696379205212, "learning_rate": 6.047819971870606e-07, "loss": 0.0, "step": 7470 }, { "epoch": 93.00506329113924, "grad_norm": 0.0017261310713365674, "learning_rate": 5.907172995780591e-07, "loss": 0.0, "step": 7480 }, { "epoch": 93.00632911392405, "grad_norm": 0.003345273435115814, "learning_rate": 5.766526019690577e-07, "loss": 0.0, "step": 7490 }, { "epoch": 93.00759493670886, "grad_norm": 0.0011764048831537366, "learning_rate": 5.625879043600563e-07, "loss": 0.0, "step": 7500 }, { "epoch": 93.00886075949367, "grad_norm": 0.0013756465632468462, "learning_rate": 5.485232067510549e-07, "loss": 0.0, "step": 7510 }, { "epoch": 93.01012658227847, "grad_norm": 0.0011709831887856126, "learning_rate": 5.344585091420535e-07, "loss": 0.0, "step": 7520 }, { "epoch": 93.01012658227847, "eval_accuracy": 0.44680851063829785, "eval_loss": 4.826868057250977, "eval_runtime": 170.4429, "eval_samples_per_second": 0.276, "eval_steps_per_second": 0.07, "step": 7520 }, { "epoch": 94.00126582278482, "grad_norm": 0.002271972130984068, "learning_rate": 5.20393811533052e-07, "loss": 0.0001, "step": 7530 }, { "epoch": 94.00253164556962, "grad_norm": 12.584663391113281, "learning_rate": 5.063291139240507e-07, "loss": 0.0007, "step": 7540 }, { "epoch": 94.00379746835443, "grad_norm": 0.0010153332259505987, "learning_rate": 4.922644163150493e-07, "loss": 0.0, "step": 7550 }, { "epoch": 94.00506329113924, "grad_norm": 0.0019480427727103233, "learning_rate": 4.781997187060479e-07, "loss": 0.0, "step": 7560 }, { "epoch": 94.00632911392405, "grad_norm": 0.005996455904096365, "learning_rate": 4.641350210970465e-07, "loss": 0.0, "step": 7570 }, { "epoch": 94.00759493670886, "grad_norm": 0.0005868688458576798, "learning_rate": 4.5007032348804504e-07, "loss": 0.0, "step": 7580 }, { "epoch": 94.00886075949367, "grad_norm": 0.0008807959966361523, "learning_rate": 4.3600562587904366e-07, "loss": 0.0, "step": 7590 }, { "epoch": 94.01012658227847, "grad_norm": 0.0008403842803090811, "learning_rate": 4.219409282700422e-07, "loss": 0.0, "step": 7600 }, { "epoch": 94.01012658227847, "eval_accuracy": 0.3829787234042553, "eval_loss": 4.878473281860352, "eval_runtime": 8.411, "eval_samples_per_second": 5.588, "eval_steps_per_second": 1.427, "step": 7600 }, { "epoch": 95.00126582278482, "grad_norm": 0.005562290083616972, "learning_rate": 4.0787623066104077e-07, "loss": 0.0001, "step": 7610 }, { "epoch": 95.00253164556962, "grad_norm": 0.004410718102008104, "learning_rate": 3.9381153305203943e-07, "loss": 0.0, "step": 7620 }, { "epoch": 95.00379746835443, "grad_norm": 0.0042534684762358665, "learning_rate": 3.79746835443038e-07, "loss": 0.0, "step": 7630 }, { "epoch": 95.00506329113924, "grad_norm": 0.0012142916675657034, "learning_rate": 3.656821378340366e-07, "loss": 0.0, "step": 7640 }, { "epoch": 95.00632911392405, "grad_norm": 0.0007107236888259649, "learning_rate": 3.5161744022503516e-07, "loss": 0.0, "step": 7650 }, { "epoch": 95.00759493670886, "grad_norm": 0.0018182717030867934, "learning_rate": 3.3755274261603377e-07, "loss": 0.0, "step": 7660 }, { "epoch": 95.00886075949367, "grad_norm": 0.003002484329044819, "learning_rate": 3.234880450070324e-07, "loss": 0.0001, "step": 7670 }, { "epoch": 95.01012658227847, "grad_norm": 0.0012020288268104196, "learning_rate": 3.09423347398031e-07, "loss": 0.0, "step": 7680 }, { "epoch": 95.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.963972568511963, "eval_runtime": 8.4253, "eval_samples_per_second": 5.578, "eval_steps_per_second": 1.424, "step": 7680 }, { "epoch": 96.00126582278482, "grad_norm": 0.0024408556055277586, "learning_rate": 2.9535864978902955e-07, "loss": 0.0, "step": 7690 }, { "epoch": 96.00253164556962, "grad_norm": 329.71331787109375, "learning_rate": 2.8129395218002816e-07, "loss": 0.132, "step": 7700 }, { "epoch": 96.00379746835443, "grad_norm": 0.0019197100773453712, "learning_rate": 2.6722925457102677e-07, "loss": 0.0005, "step": 7710 }, { "epoch": 96.00506329113924, "grad_norm": 0.0013428219826892018, "learning_rate": 2.5316455696202533e-07, "loss": 0.0, "step": 7720 }, { "epoch": 96.00632911392405, "grad_norm": 0.0015475323889404535, "learning_rate": 2.3909985935302394e-07, "loss": 0.0, "step": 7730 }, { "epoch": 96.00759493670886, "grad_norm": 0.0013204860733821988, "learning_rate": 2.2503516174402252e-07, "loss": 0.0, "step": 7740 }, { "epoch": 96.00886075949367, "grad_norm": 0.0007807817310094833, "learning_rate": 2.109704641350211e-07, "loss": 0.0, "step": 7750 }, { "epoch": 96.01012658227847, "grad_norm": 0.0009276365744881332, "learning_rate": 1.9690576652601972e-07, "loss": 0.0, "step": 7760 }, { "epoch": 96.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.947977066040039, "eval_runtime": 8.4147, "eval_samples_per_second": 5.585, "eval_steps_per_second": 1.426, "step": 7760 }, { "epoch": 97.00126582278482, "grad_norm": 0.0013080050703138113, "learning_rate": 1.828410689170183e-07, "loss": 0.0, "step": 7770 }, { "epoch": 97.00253164556962, "grad_norm": 0.0018693221500143409, "learning_rate": 1.6877637130801689e-07, "loss": 0.0, "step": 7780 }, { "epoch": 97.00379746835443, "grad_norm": 0.0018208841793239117, "learning_rate": 1.547116736990155e-07, "loss": 0.0, "step": 7790 }, { "epoch": 97.00506329113924, "grad_norm": 0.0018955061677843332, "learning_rate": 1.4064697609001408e-07, "loss": 0.0, "step": 7800 }, { "epoch": 97.00632911392405, "grad_norm": 0.0007756951963528991, "learning_rate": 1.2658227848101266e-07, "loss": 0.0, "step": 7810 }, { "epoch": 97.00759493670886, "grad_norm": 0.0009716827771626413, "learning_rate": 1.1251758087201126e-07, "loss": 0.0, "step": 7820 }, { "epoch": 97.00886075949367, "grad_norm": 0.003705208422616124, "learning_rate": 9.845288326300986e-08, "loss": 0.0, "step": 7830 }, { "epoch": 97.01012658227847, "grad_norm": 0.013106240890920162, "learning_rate": 8.438818565400844e-08, "loss": 0.0, "step": 7840 }, { "epoch": 97.01012658227847, "eval_accuracy": 0.425531914893617, "eval_loss": 4.940354824066162, "eval_runtime": 8.4561, "eval_samples_per_second": 5.558, "eval_steps_per_second": 1.419, "step": 7840 }, { "epoch": 98.00126582278482, "grad_norm": 0.0024029065389186144, "learning_rate": 7.032348804500704e-08, "loss": 0.0, "step": 7850 }, { "epoch": 98.00253164556962, "grad_norm": 0.0011833859607577324, "learning_rate": 5.625879043600563e-08, "loss": 0.0, "step": 7860 }, { "epoch": 98.00379746835443, "grad_norm": 0.0013356610434129834, "learning_rate": 4.219409282700422e-08, "loss": 0.0, "step": 7870 }, { "epoch": 98.00506329113924, "grad_norm": 0.0007605087594129145, "learning_rate": 2.8129395218002815e-08, "loss": 0.0, "step": 7880 }, { "epoch": 98.00632911392405, "grad_norm": 0.0008561754948459566, "learning_rate": 1.4064697609001408e-08, "loss": 0.0011, "step": 7890 }, { "epoch": 98.00759493670886, "grad_norm": 0.0006674563628621399, "learning_rate": 0.0, "loss": 0.0, "step": 7900 }, { "epoch": 98.00759493670886, "eval_accuracy": 0.425531914893617, "eval_loss": 4.9351420402526855, "eval_runtime": 9.2926, "eval_samples_per_second": 5.058, "eval_steps_per_second": 1.291, "step": 7900 }, { "epoch": 98.00759493670886, "step": 7900, "total_flos": 1.378962555602208e+20, "train_loss": 0.1596812904629944, "train_runtime": 13780.8931, "train_samples_per_second": 2.293, "train_steps_per_second": 0.573 }, { "epoch": 98.00759493670886, "eval_accuracy": 0.3076923076923077, "eval_loss": 5.42232084274292, "eval_runtime": 8.1011, "eval_samples_per_second": 4.814, "eval_steps_per_second": 1.234, "step": 7900 }, { "epoch": 98.00759493670886, "eval_accuracy": 0.3076923076923077, "eval_loss": 5.422321319580078, "eval_runtime": 7.0644, "eval_samples_per_second": 5.521, "eval_steps_per_second": 1.416, "step": 7900 } ], "logging_steps": 10, "max_steps": 7900, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.378962555602208e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }