MAE-CT-CPC-Dicotomized-v7-tricot / trainer_state.json
beingbatman's picture
End of training
df6f230 verified
{
"best_metric": 0.5319148936170213,
"best_model_checkpoint": "MAE-CT-CPC-Dicotomized-v7-tricot/checkpoint-3840",
"epoch": 98.00759493670886,
"eval_steps": 500,
"global_step": 7900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012658227848101266,
"grad_norm": 3.648193836212158,
"learning_rate": 1.2658227848101266e-07,
"loss": 1.0945,
"step": 10
},
{
"epoch": 0.002531645569620253,
"grad_norm": 3.2326066493988037,
"learning_rate": 2.5316455696202533e-07,
"loss": 1.0964,
"step": 20
},
{
"epoch": 0.0037974683544303796,
"grad_norm": 4.472383975982666,
"learning_rate": 3.79746835443038e-07,
"loss": 1.1085,
"step": 30
},
{
"epoch": 0.005063291139240506,
"grad_norm": 2.293823003768921,
"learning_rate": 5.063291139240507e-07,
"loss": 1.0936,
"step": 40
},
{
"epoch": 0.006329113924050633,
"grad_norm": 5.018352508544922,
"learning_rate": 6.329113924050634e-07,
"loss": 1.0804,
"step": 50
},
{
"epoch": 0.007594936708860759,
"grad_norm": 3.437401533126831,
"learning_rate": 7.59493670886076e-07,
"loss": 1.1052,
"step": 60
},
{
"epoch": 0.008860759493670886,
"grad_norm": 2.633840322494507,
"learning_rate": 8.860759493670887e-07,
"loss": 1.1041,
"step": 70
},
{
"epoch": 0.010126582278481013,
"grad_norm": 8.326930046081543,
"learning_rate": 1.0126582278481013e-06,
"loss": 1.0989,
"step": 80
},
{
"epoch": 0.010126582278481013,
"eval_accuracy": 0.3191489361702128,
"eval_loss": 1.1030582189559937,
"eval_runtime": 10.2981,
"eval_samples_per_second": 4.564,
"eval_steps_per_second": 1.165,
"step": 80
},
{
"epoch": 1.0012658227848101,
"grad_norm": 5.40225076675415,
"learning_rate": 1.139240506329114e-06,
"loss": 1.0679,
"step": 90
},
{
"epoch": 1.0025316455696203,
"grad_norm": 2.765038013458252,
"learning_rate": 1.2658227848101267e-06,
"loss": 1.1039,
"step": 100
},
{
"epoch": 1.0037974683544304,
"grad_norm": 4.477443695068359,
"learning_rate": 1.3924050632911392e-06,
"loss": 1.0935,
"step": 110
},
{
"epoch": 1.0050632911392405,
"grad_norm": 4.2104573249816895,
"learning_rate": 1.518987341772152e-06,
"loss": 1.0694,
"step": 120
},
{
"epoch": 1.0063291139240507,
"grad_norm": 3.7104029655456543,
"learning_rate": 1.6455696202531647e-06,
"loss": 1.1194,
"step": 130
},
{
"epoch": 1.0075949367088608,
"grad_norm": 6.592172622680664,
"learning_rate": 1.7721518987341774e-06,
"loss": 1.1171,
"step": 140
},
{
"epoch": 1.008860759493671,
"grad_norm": 5.555737018585205,
"learning_rate": 1.8987341772151901e-06,
"loss": 1.0965,
"step": 150
},
{
"epoch": 1.010126582278481,
"grad_norm": 6.766844749450684,
"learning_rate": 2.0253164556962026e-06,
"loss": 1.0889,
"step": 160
},
{
"epoch": 1.010126582278481,
"eval_accuracy": 0.3404255319148936,
"eval_loss": 1.1057974100112915,
"eval_runtime": 9.1496,
"eval_samples_per_second": 5.137,
"eval_steps_per_second": 1.312,
"step": 160
},
{
"epoch": 2.00126582278481,
"grad_norm": 5.130347728729248,
"learning_rate": 2.1518987341772153e-06,
"loss": 1.0725,
"step": 170
},
{
"epoch": 2.0025316455696203,
"grad_norm": 5.7678070068359375,
"learning_rate": 2.278481012658228e-06,
"loss": 1.0791,
"step": 180
},
{
"epoch": 2.0037974683544304,
"grad_norm": 6.685475826263428,
"learning_rate": 2.4050632911392408e-06,
"loss": 1.0906,
"step": 190
},
{
"epoch": 2.0050632911392405,
"grad_norm": 4.179187774658203,
"learning_rate": 2.5316455696202535e-06,
"loss": 1.0754,
"step": 200
},
{
"epoch": 2.0063291139240507,
"grad_norm": 7.93744421005249,
"learning_rate": 2.6582278481012658e-06,
"loss": 1.0612,
"step": 210
},
{
"epoch": 2.007594936708861,
"grad_norm": 5.547979354858398,
"learning_rate": 2.7848101265822785e-06,
"loss": 1.0703,
"step": 220
},
{
"epoch": 2.008860759493671,
"grad_norm": 6.917874813079834,
"learning_rate": 2.9113924050632912e-06,
"loss": 1.0648,
"step": 230
},
{
"epoch": 2.010126582278481,
"grad_norm": 14.24355697631836,
"learning_rate": 3.037974683544304e-06,
"loss": 1.0739,
"step": 240
},
{
"epoch": 2.010126582278481,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 1.1232898235321045,
"eval_runtime": 8.996,
"eval_samples_per_second": 5.225,
"eval_steps_per_second": 1.334,
"step": 240
},
{
"epoch": 3.00126582278481,
"grad_norm": 11.271843910217285,
"learning_rate": 3.164556962025317e-06,
"loss": 1.0305,
"step": 250
},
{
"epoch": 3.0025316455696203,
"grad_norm": 12.815051078796387,
"learning_rate": 3.2911392405063294e-06,
"loss": 1.0576,
"step": 260
},
{
"epoch": 3.0037974683544304,
"grad_norm": 19.665115356445312,
"learning_rate": 3.417721518987342e-06,
"loss": 0.9806,
"step": 270
},
{
"epoch": 3.0050632911392405,
"grad_norm": 11.805643081665039,
"learning_rate": 3.544303797468355e-06,
"loss": 1.0493,
"step": 280
},
{
"epoch": 3.0063291139240507,
"grad_norm": 17.121118545532227,
"learning_rate": 3.6708860759493675e-06,
"loss": 1.0728,
"step": 290
},
{
"epoch": 3.007594936708861,
"grad_norm": 18.008495330810547,
"learning_rate": 3.7974683544303802e-06,
"loss": 1.0749,
"step": 300
},
{
"epoch": 3.008860759493671,
"grad_norm": 7.478245735168457,
"learning_rate": 3.924050632911393e-06,
"loss": 1.0579,
"step": 310
},
{
"epoch": 3.010126582278481,
"grad_norm": 29.086742401123047,
"learning_rate": 4.050632911392405e-06,
"loss": 1.0036,
"step": 320
},
{
"epoch": 3.010126582278481,
"eval_accuracy": 0.2765957446808511,
"eval_loss": 1.1595509052276611,
"eval_runtime": 9.1259,
"eval_samples_per_second": 5.15,
"eval_steps_per_second": 1.315,
"step": 320
},
{
"epoch": 4.00126582278481,
"grad_norm": 14.256952285766602,
"learning_rate": 4.177215189873418e-06,
"loss": 1.0293,
"step": 330
},
{
"epoch": 4.00253164556962,
"grad_norm": 14.238683700561523,
"learning_rate": 4.303797468354431e-06,
"loss": 0.9938,
"step": 340
},
{
"epoch": 4.00379746835443,
"grad_norm": 15.750340461730957,
"learning_rate": 4.430379746835443e-06,
"loss": 1.0362,
"step": 350
},
{
"epoch": 4.0050632911392405,
"grad_norm": 24.7191219329834,
"learning_rate": 4.556962025316456e-06,
"loss": 0.9665,
"step": 360
},
{
"epoch": 4.006329113924051,
"grad_norm": 17.434118270874023,
"learning_rate": 4.683544303797468e-06,
"loss": 0.9927,
"step": 370
},
{
"epoch": 4.007594936708861,
"grad_norm": 5.921336650848389,
"learning_rate": 4.8101265822784815e-06,
"loss": 0.9862,
"step": 380
},
{
"epoch": 4.008860759493671,
"grad_norm": 6.861782550811768,
"learning_rate": 4.936708860759495e-06,
"loss": 1.1099,
"step": 390
},
{
"epoch": 4.010126582278481,
"grad_norm": 26.024229049682617,
"learning_rate": 5.063291139240507e-06,
"loss": 1.0706,
"step": 400
},
{
"epoch": 4.010126582278481,
"eval_accuracy": 0.2553191489361702,
"eval_loss": 1.1730738878250122,
"eval_runtime": 9.0971,
"eval_samples_per_second": 5.166,
"eval_steps_per_second": 1.319,
"step": 400
},
{
"epoch": 5.00126582278481,
"grad_norm": 13.2531099319458,
"learning_rate": 5.189873417721519e-06,
"loss": 0.8603,
"step": 410
},
{
"epoch": 5.00253164556962,
"grad_norm": 11.527708053588867,
"learning_rate": 5.3164556962025316e-06,
"loss": 0.9762,
"step": 420
},
{
"epoch": 5.00379746835443,
"grad_norm": 25.327789306640625,
"learning_rate": 5.443037974683545e-06,
"loss": 0.9512,
"step": 430
},
{
"epoch": 5.0050632911392405,
"grad_norm": 24.11504554748535,
"learning_rate": 5.569620253164557e-06,
"loss": 0.9437,
"step": 440
},
{
"epoch": 5.006329113924051,
"grad_norm": 26.003135681152344,
"learning_rate": 5.69620253164557e-06,
"loss": 0.9311,
"step": 450
},
{
"epoch": 5.007594936708861,
"grad_norm": 22.07634735107422,
"learning_rate": 5.8227848101265824e-06,
"loss": 0.9741,
"step": 460
},
{
"epoch": 5.008860759493671,
"grad_norm": 19.476099014282227,
"learning_rate": 5.949367088607595e-06,
"loss": 0.9916,
"step": 470
},
{
"epoch": 5.010126582278481,
"grad_norm": 24.048255920410156,
"learning_rate": 6.075949367088608e-06,
"loss": 0.9669,
"step": 480
},
{
"epoch": 5.010126582278481,
"eval_accuracy": 0.3191489361702128,
"eval_loss": 1.1227548122406006,
"eval_runtime": 9.13,
"eval_samples_per_second": 5.148,
"eval_steps_per_second": 1.314,
"step": 480
},
{
"epoch": 6.00126582278481,
"grad_norm": 21.775312423706055,
"learning_rate": 6.20253164556962e-06,
"loss": 0.8955,
"step": 490
},
{
"epoch": 6.00253164556962,
"grad_norm": 10.735696792602539,
"learning_rate": 6.329113924050634e-06,
"loss": 0.9152,
"step": 500
},
{
"epoch": 6.00379746835443,
"grad_norm": 29.428773880004883,
"learning_rate": 6.4556962025316464e-06,
"loss": 0.9614,
"step": 510
},
{
"epoch": 6.0050632911392405,
"grad_norm": 21.473602294921875,
"learning_rate": 6.582278481012659e-06,
"loss": 0.9911,
"step": 520
},
{
"epoch": 6.006329113924051,
"grad_norm": 22.8590087890625,
"learning_rate": 6.708860759493672e-06,
"loss": 0.9406,
"step": 530
},
{
"epoch": 6.007594936708861,
"grad_norm": 25.129230499267578,
"learning_rate": 6.835443037974684e-06,
"loss": 0.9051,
"step": 540
},
{
"epoch": 6.008860759493671,
"grad_norm": 34.37338638305664,
"learning_rate": 6.962025316455697e-06,
"loss": 0.8215,
"step": 550
},
{
"epoch": 6.010126582278481,
"grad_norm": 33.80929946899414,
"learning_rate": 7.08860759493671e-06,
"loss": 1.0233,
"step": 560
},
{
"epoch": 6.010126582278481,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 1.1490142345428467,
"eval_runtime": 9.1478,
"eval_samples_per_second": 5.138,
"eval_steps_per_second": 1.312,
"step": 560
},
{
"epoch": 7.00126582278481,
"grad_norm": 10.804941177368164,
"learning_rate": 7.215189873417722e-06,
"loss": 0.8237,
"step": 570
},
{
"epoch": 7.00253164556962,
"grad_norm": 14.405462265014648,
"learning_rate": 7.341772151898735e-06,
"loss": 0.8341,
"step": 580
},
{
"epoch": 7.00379746835443,
"grad_norm": 33.400726318359375,
"learning_rate": 7.468354430379747e-06,
"loss": 0.8029,
"step": 590
},
{
"epoch": 7.0050632911392405,
"grad_norm": 11.047707557678223,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.935,
"step": 600
},
{
"epoch": 7.006329113924051,
"grad_norm": 30.89590072631836,
"learning_rate": 7.721518987341773e-06,
"loss": 0.901,
"step": 610
},
{
"epoch": 7.007594936708861,
"grad_norm": 14.323598861694336,
"learning_rate": 7.848101265822786e-06,
"loss": 0.8399,
"step": 620
},
{
"epoch": 7.008860759493671,
"grad_norm": 25.75128173828125,
"learning_rate": 7.974683544303799e-06,
"loss": 0.8836,
"step": 630
},
{
"epoch": 7.010126582278481,
"grad_norm": 28.034568786621094,
"learning_rate": 8.10126582278481e-06,
"loss": 0.8492,
"step": 640
},
{
"epoch": 7.010126582278481,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 1.263619303703308,
"eval_runtime": 8.888,
"eval_samples_per_second": 5.288,
"eval_steps_per_second": 1.35,
"step": 640
},
{
"epoch": 8.00126582278481,
"grad_norm": 27.309749603271484,
"learning_rate": 8.227848101265824e-06,
"loss": 0.7993,
"step": 650
},
{
"epoch": 8.00253164556962,
"grad_norm": 23.6923770904541,
"learning_rate": 8.354430379746837e-06,
"loss": 0.8733,
"step": 660
},
{
"epoch": 8.00379746835443,
"grad_norm": 27.559850692749023,
"learning_rate": 8.481012658227848e-06,
"loss": 0.8727,
"step": 670
},
{
"epoch": 8.00506329113924,
"grad_norm": 13.756896018981934,
"learning_rate": 8.607594936708861e-06,
"loss": 0.7896,
"step": 680
},
{
"epoch": 8.00632911392405,
"grad_norm": 17.663959503173828,
"learning_rate": 8.734177215189874e-06,
"loss": 0.718,
"step": 690
},
{
"epoch": 8.00759493670886,
"grad_norm": 11.68373966217041,
"learning_rate": 8.860759493670886e-06,
"loss": 0.6608,
"step": 700
},
{
"epoch": 8.00886075949367,
"grad_norm": 15.120232582092285,
"learning_rate": 8.987341772151899e-06,
"loss": 0.7421,
"step": 710
},
{
"epoch": 8.010126582278481,
"grad_norm": 12.948484420776367,
"learning_rate": 9.113924050632912e-06,
"loss": 0.8842,
"step": 720
},
{
"epoch": 8.010126582278481,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 1.4060986042022705,
"eval_runtime": 8.8573,
"eval_samples_per_second": 5.306,
"eval_steps_per_second": 1.355,
"step": 720
},
{
"epoch": 9.00126582278481,
"grad_norm": 17.29895782470703,
"learning_rate": 9.240506329113925e-06,
"loss": 0.7192,
"step": 730
},
{
"epoch": 9.00253164556962,
"grad_norm": 16.932331085205078,
"learning_rate": 9.367088607594937e-06,
"loss": 0.8571,
"step": 740
},
{
"epoch": 9.00379746835443,
"grad_norm": 27.8249454498291,
"learning_rate": 9.49367088607595e-06,
"loss": 0.6975,
"step": 750
},
{
"epoch": 9.00506329113924,
"grad_norm": 19.709556579589844,
"learning_rate": 9.620253164556963e-06,
"loss": 0.7901,
"step": 760
},
{
"epoch": 9.00632911392405,
"grad_norm": 27.908536911010742,
"learning_rate": 9.746835443037975e-06,
"loss": 0.7778,
"step": 770
},
{
"epoch": 9.00759493670886,
"grad_norm": 11.295394897460938,
"learning_rate": 9.87341772151899e-06,
"loss": 0.6872,
"step": 780
},
{
"epoch": 9.00886075949367,
"grad_norm": 19.349098205566406,
"learning_rate": 1e-05,
"loss": 0.7879,
"step": 790
},
{
"epoch": 9.010126582278481,
"grad_norm": 17.75351333618164,
"learning_rate": 9.985935302391e-06,
"loss": 0.6599,
"step": 800
},
{
"epoch": 9.010126582278481,
"eval_accuracy": 0.2978723404255319,
"eval_loss": 1.3445005416870117,
"eval_runtime": 8.6196,
"eval_samples_per_second": 5.453,
"eval_steps_per_second": 1.392,
"step": 800
},
{
"epoch": 10.00126582278481,
"grad_norm": 21.275543212890625,
"learning_rate": 9.971870604781998e-06,
"loss": 0.6019,
"step": 810
},
{
"epoch": 10.00253164556962,
"grad_norm": 29.977495193481445,
"learning_rate": 9.957805907172996e-06,
"loss": 0.724,
"step": 820
},
{
"epoch": 10.00379746835443,
"grad_norm": 33.56300354003906,
"learning_rate": 9.943741209563994e-06,
"loss": 0.6457,
"step": 830
},
{
"epoch": 10.00506329113924,
"grad_norm": 38.13019943237305,
"learning_rate": 9.929676511954994e-06,
"loss": 0.6331,
"step": 840
},
{
"epoch": 10.00632911392405,
"grad_norm": 42.173423767089844,
"learning_rate": 9.915611814345992e-06,
"loss": 0.5996,
"step": 850
},
{
"epoch": 10.00759493670886,
"grad_norm": 11.129090309143066,
"learning_rate": 9.901547116736992e-06,
"loss": 0.615,
"step": 860
},
{
"epoch": 10.00886075949367,
"grad_norm": 37.42063903808594,
"learning_rate": 9.88748241912799e-06,
"loss": 0.6022,
"step": 870
},
{
"epoch": 10.010126582278481,
"grad_norm": 55.16875457763672,
"learning_rate": 9.87341772151899e-06,
"loss": 0.6723,
"step": 880
},
{
"epoch": 10.010126582278481,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 1.4071933031082153,
"eval_runtime": 8.6355,
"eval_samples_per_second": 5.443,
"eval_steps_per_second": 1.39,
"step": 880
},
{
"epoch": 11.00126582278481,
"grad_norm": 14.047639846801758,
"learning_rate": 9.859353023909987e-06,
"loss": 0.5122,
"step": 890
},
{
"epoch": 11.00253164556962,
"grad_norm": 14.567192077636719,
"learning_rate": 9.845288326300985e-06,
"loss": 0.5763,
"step": 900
},
{
"epoch": 11.00379746835443,
"grad_norm": 31.18760871887207,
"learning_rate": 9.831223628691983e-06,
"loss": 0.6611,
"step": 910
},
{
"epoch": 11.00506329113924,
"grad_norm": 49.245513916015625,
"learning_rate": 9.817158931082983e-06,
"loss": 0.7129,
"step": 920
},
{
"epoch": 11.00632911392405,
"grad_norm": 25.506393432617188,
"learning_rate": 9.803094233473981e-06,
"loss": 0.4678,
"step": 930
},
{
"epoch": 11.00759493670886,
"grad_norm": 16.567678451538086,
"learning_rate": 9.78902953586498e-06,
"loss": 0.6464,
"step": 940
},
{
"epoch": 11.00886075949367,
"grad_norm": 45.41640090942383,
"learning_rate": 9.774964838255979e-06,
"loss": 0.6556,
"step": 950
},
{
"epoch": 11.010126582278481,
"grad_norm": 53.20558166503906,
"learning_rate": 9.760900140646977e-06,
"loss": 0.604,
"step": 960
},
{
"epoch": 11.010126582278481,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 1.4198564291000366,
"eval_runtime": 8.4854,
"eval_samples_per_second": 5.539,
"eval_steps_per_second": 1.414,
"step": 960
},
{
"epoch": 12.00126582278481,
"grad_norm": 13.426566123962402,
"learning_rate": 9.746835443037975e-06,
"loss": 0.3598,
"step": 970
},
{
"epoch": 12.00253164556962,
"grad_norm": 48.93745422363281,
"learning_rate": 9.732770745428974e-06,
"loss": 0.5107,
"step": 980
},
{
"epoch": 12.00379746835443,
"grad_norm": 33.23870849609375,
"learning_rate": 9.718706047819972e-06,
"loss": 0.629,
"step": 990
},
{
"epoch": 12.00506329113924,
"grad_norm": 75.58332061767578,
"learning_rate": 9.704641350210972e-06,
"loss": 0.4616,
"step": 1000
},
{
"epoch": 12.00632911392405,
"grad_norm": 29.726964950561523,
"learning_rate": 9.69057665260197e-06,
"loss": 0.6597,
"step": 1010
},
{
"epoch": 12.00759493670886,
"grad_norm": 41.4447135925293,
"learning_rate": 9.67651195499297e-06,
"loss": 0.67,
"step": 1020
},
{
"epoch": 12.00886075949367,
"grad_norm": 59.76002502441406,
"learning_rate": 9.662447257383967e-06,
"loss": 0.6902,
"step": 1030
},
{
"epoch": 12.010126582278481,
"grad_norm": 25.5214786529541,
"learning_rate": 9.648382559774965e-06,
"loss": 0.4959,
"step": 1040
},
{
"epoch": 12.010126582278481,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 1.5688742399215698,
"eval_runtime": 8.4949,
"eval_samples_per_second": 5.533,
"eval_steps_per_second": 1.413,
"step": 1040
},
{
"epoch": 13.00126582278481,
"grad_norm": 17.342782974243164,
"learning_rate": 9.634317862165963e-06,
"loss": 0.4443,
"step": 1050
},
{
"epoch": 13.00253164556962,
"grad_norm": 6.651524066925049,
"learning_rate": 9.620253164556963e-06,
"loss": 0.4626,
"step": 1060
},
{
"epoch": 13.00379746835443,
"grad_norm": 59.05470275878906,
"learning_rate": 9.606188466947961e-06,
"loss": 0.5051,
"step": 1070
},
{
"epoch": 13.00506329113924,
"grad_norm": 12.133808135986328,
"learning_rate": 9.59212376933896e-06,
"loss": 0.4063,
"step": 1080
},
{
"epoch": 13.00632911392405,
"grad_norm": 5.521517753601074,
"learning_rate": 9.578059071729959e-06,
"loss": 0.3626,
"step": 1090
},
{
"epoch": 13.00759493670886,
"grad_norm": 39.51848220825195,
"learning_rate": 9.563994374120957e-06,
"loss": 0.4715,
"step": 1100
},
{
"epoch": 13.00886075949367,
"grad_norm": 17.837867736816406,
"learning_rate": 9.549929676511955e-06,
"loss": 0.4161,
"step": 1110
},
{
"epoch": 13.010126582278481,
"grad_norm": 10.324262619018555,
"learning_rate": 9.535864978902954e-06,
"loss": 0.3758,
"step": 1120
},
{
"epoch": 13.010126582278481,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 1.7867138385772705,
"eval_runtime": 8.632,
"eval_samples_per_second": 5.445,
"eval_steps_per_second": 1.39,
"step": 1120
},
{
"epoch": 14.00126582278481,
"grad_norm": 3.8076212406158447,
"learning_rate": 9.521800281293952e-06,
"loss": 0.2181,
"step": 1130
},
{
"epoch": 14.00253164556962,
"grad_norm": 7.90512752532959,
"learning_rate": 9.507735583684952e-06,
"loss": 0.4037,
"step": 1140
},
{
"epoch": 14.00379746835443,
"grad_norm": 6.371408462524414,
"learning_rate": 9.49367088607595e-06,
"loss": 0.7414,
"step": 1150
},
{
"epoch": 14.00506329113924,
"grad_norm": 21.530675888061523,
"learning_rate": 9.47960618846695e-06,
"loss": 0.2786,
"step": 1160
},
{
"epoch": 14.00632911392405,
"grad_norm": 1.7298585176467896,
"learning_rate": 9.465541490857948e-06,
"loss": 0.2941,
"step": 1170
},
{
"epoch": 14.00759493670886,
"grad_norm": 14.179819107055664,
"learning_rate": 9.451476793248946e-06,
"loss": 0.6105,
"step": 1180
},
{
"epoch": 14.00886075949367,
"grad_norm": 35.71600341796875,
"learning_rate": 9.437412095639944e-06,
"loss": 0.4703,
"step": 1190
},
{
"epoch": 14.010126582278481,
"grad_norm": 7.779309272766113,
"learning_rate": 9.423347398030943e-06,
"loss": 0.6257,
"step": 1200
},
{
"epoch": 14.010126582278481,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 1.921836018562317,
"eval_runtime": 8.7081,
"eval_samples_per_second": 5.397,
"eval_steps_per_second": 1.378,
"step": 1200
},
{
"epoch": 15.00126582278481,
"grad_norm": 90.20023345947266,
"learning_rate": 9.409282700421943e-06,
"loss": 0.3217,
"step": 1210
},
{
"epoch": 15.00253164556962,
"grad_norm": 6.699902534484863,
"learning_rate": 9.395218002812941e-06,
"loss": 0.2383,
"step": 1220
},
{
"epoch": 15.00379746835443,
"grad_norm": 15.322399139404297,
"learning_rate": 9.381153305203939e-06,
"loss": 0.2347,
"step": 1230
},
{
"epoch": 15.00506329113924,
"grad_norm": 4.224050998687744,
"learning_rate": 9.367088607594937e-06,
"loss": 0.1293,
"step": 1240
},
{
"epoch": 15.00632911392405,
"grad_norm": 3.2699191570281982,
"learning_rate": 9.353023909985936e-06,
"loss": 0.203,
"step": 1250
},
{
"epoch": 15.00759493670886,
"grad_norm": 69.02498626708984,
"learning_rate": 9.338959212376934e-06,
"loss": 0.3505,
"step": 1260
},
{
"epoch": 15.00886075949367,
"grad_norm": 148.28306579589844,
"learning_rate": 9.324894514767934e-06,
"loss": 0.3983,
"step": 1270
},
{
"epoch": 15.010126582278481,
"grad_norm": 26.6025447845459,
"learning_rate": 9.310829817158932e-06,
"loss": 0.3693,
"step": 1280
},
{
"epoch": 15.010126582278481,
"eval_accuracy": 0.3191489361702128,
"eval_loss": 2.09875750541687,
"eval_runtime": 9.0724,
"eval_samples_per_second": 5.181,
"eval_steps_per_second": 1.323,
"step": 1280
},
{
"epoch": 16.00126582278481,
"grad_norm": 34.19914627075195,
"learning_rate": 9.29676511954993e-06,
"loss": 0.3708,
"step": 1290
},
{
"epoch": 16.00253164556962,
"grad_norm": 57.25946807861328,
"learning_rate": 9.28270042194093e-06,
"loss": 0.5632,
"step": 1300
},
{
"epoch": 16.00379746835443,
"grad_norm": 1.7772458791732788,
"learning_rate": 9.268635724331928e-06,
"loss": 0.2617,
"step": 1310
},
{
"epoch": 16.00506329113924,
"grad_norm": 56.837650299072266,
"learning_rate": 9.254571026722926e-06,
"loss": 0.3024,
"step": 1320
},
{
"epoch": 16.00632911392405,
"grad_norm": 0.5459592342376709,
"learning_rate": 9.240506329113925e-06,
"loss": 0.2552,
"step": 1330
},
{
"epoch": 16.00759493670886,
"grad_norm": 153.30613708496094,
"learning_rate": 9.226441631504923e-06,
"loss": 0.659,
"step": 1340
},
{
"epoch": 16.008860759493672,
"grad_norm": 47.839324951171875,
"learning_rate": 9.212376933895923e-06,
"loss": 0.3776,
"step": 1350
},
{
"epoch": 16.01012658227848,
"grad_norm": 57.103763580322266,
"learning_rate": 9.198312236286921e-06,
"loss": 0.5933,
"step": 1360
},
{
"epoch": 16.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 1.8412983417510986,
"eval_runtime": 8.4821,
"eval_samples_per_second": 5.541,
"eval_steps_per_second": 1.415,
"step": 1360
},
{
"epoch": 17.00126582278481,
"grad_norm": 0.2360084503889084,
"learning_rate": 9.184247538677919e-06,
"loss": 0.0993,
"step": 1370
},
{
"epoch": 17.00253164556962,
"grad_norm": 1.5083540678024292,
"learning_rate": 9.170182841068917e-06,
"loss": 0.2528,
"step": 1380
},
{
"epoch": 17.00379746835443,
"grad_norm": 7.469198226928711,
"learning_rate": 9.156118143459917e-06,
"loss": 0.3329,
"step": 1390
},
{
"epoch": 17.00506329113924,
"grad_norm": 100.13819885253906,
"learning_rate": 9.142053445850915e-06,
"loss": 0.2834,
"step": 1400
},
{
"epoch": 17.00632911392405,
"grad_norm": 110.03264617919922,
"learning_rate": 9.127988748241914e-06,
"loss": 0.6402,
"step": 1410
},
{
"epoch": 17.00759493670886,
"grad_norm": 116.64907836914062,
"learning_rate": 9.113924050632912e-06,
"loss": 0.4343,
"step": 1420
},
{
"epoch": 17.008860759493672,
"grad_norm": 13.220937728881836,
"learning_rate": 9.09985935302391e-06,
"loss": 0.3556,
"step": 1430
},
{
"epoch": 17.01012658227848,
"grad_norm": 63.16554260253906,
"learning_rate": 9.085794655414908e-06,
"loss": 0.202,
"step": 1440
},
{
"epoch": 17.01012658227848,
"eval_accuracy": 0.3191489361702128,
"eval_loss": 2.753727436065674,
"eval_runtime": 8.441,
"eval_samples_per_second": 5.568,
"eval_steps_per_second": 1.422,
"step": 1440
},
{
"epoch": 18.00126582278481,
"grad_norm": 6.848087310791016,
"learning_rate": 9.071729957805908e-06,
"loss": 0.2198,
"step": 1450
},
{
"epoch": 18.00253164556962,
"grad_norm": 24.780672073364258,
"learning_rate": 9.057665260196906e-06,
"loss": 0.2974,
"step": 1460
},
{
"epoch": 18.00379746835443,
"grad_norm": 28.783912658691406,
"learning_rate": 9.043600562587905e-06,
"loss": 0.2387,
"step": 1470
},
{
"epoch": 18.00506329113924,
"grad_norm": 0.49766799807548523,
"learning_rate": 9.029535864978903e-06,
"loss": 0.2592,
"step": 1480
},
{
"epoch": 18.00632911392405,
"grad_norm": 107.1086196899414,
"learning_rate": 9.015471167369903e-06,
"loss": 0.2736,
"step": 1490
},
{
"epoch": 18.00759493670886,
"grad_norm": 1.34207284450531,
"learning_rate": 9.001406469760901e-06,
"loss": 0.3996,
"step": 1500
},
{
"epoch": 18.008860759493672,
"grad_norm": 0.8816600441932678,
"learning_rate": 8.987341772151899e-06,
"loss": 0.3255,
"step": 1510
},
{
"epoch": 18.01012658227848,
"grad_norm": 0.2845398187637329,
"learning_rate": 8.973277074542897e-06,
"loss": 0.1454,
"step": 1520
},
{
"epoch": 18.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 2.461174964904785,
"eval_runtime": 8.4461,
"eval_samples_per_second": 5.565,
"eval_steps_per_second": 1.421,
"step": 1520
},
{
"epoch": 19.00126582278481,
"grad_norm": 7.336277961730957,
"learning_rate": 8.959212376933897e-06,
"loss": 0.1073,
"step": 1530
},
{
"epoch": 19.00253164556962,
"grad_norm": 1.7120882272720337,
"learning_rate": 8.945147679324895e-06,
"loss": 0.286,
"step": 1540
},
{
"epoch": 19.00379746835443,
"grad_norm": 5.534066677093506,
"learning_rate": 8.931082981715894e-06,
"loss": 0.0711,
"step": 1550
},
{
"epoch": 19.00506329113924,
"grad_norm": 0.4742295742034912,
"learning_rate": 8.917018284106892e-06,
"loss": 0.0344,
"step": 1560
},
{
"epoch": 19.00632911392405,
"grad_norm": 7.864910125732422,
"learning_rate": 8.90295358649789e-06,
"loss": 0.4488,
"step": 1570
},
{
"epoch": 19.00759493670886,
"grad_norm": 29.55208396911621,
"learning_rate": 8.888888888888888e-06,
"loss": 0.0769,
"step": 1580
},
{
"epoch": 19.008860759493672,
"grad_norm": 6.9868597984313965,
"learning_rate": 8.874824191279888e-06,
"loss": 0.2275,
"step": 1590
},
{
"epoch": 19.01012658227848,
"grad_norm": 0.9346122741699219,
"learning_rate": 8.860759493670886e-06,
"loss": 0.1332,
"step": 1600
},
{
"epoch": 19.01012658227848,
"eval_accuracy": 0.3404255319148936,
"eval_loss": 3.094426155090332,
"eval_runtime": 8.4844,
"eval_samples_per_second": 5.54,
"eval_steps_per_second": 1.414,
"step": 1600
},
{
"epoch": 20.00126582278481,
"grad_norm": 0.2495788335800171,
"learning_rate": 8.846694796061886e-06,
"loss": 0.0054,
"step": 1610
},
{
"epoch": 20.00253164556962,
"grad_norm": 0.9110737442970276,
"learning_rate": 8.832630098452884e-06,
"loss": 0.1922,
"step": 1620
},
{
"epoch": 20.00379746835443,
"grad_norm": 18.964305877685547,
"learning_rate": 8.818565400843883e-06,
"loss": 0.0081,
"step": 1630
},
{
"epoch": 20.00506329113924,
"grad_norm": 32.764984130859375,
"learning_rate": 8.804500703234881e-06,
"loss": 0.1649,
"step": 1640
},
{
"epoch": 20.00632911392405,
"grad_norm": 0.6211456060409546,
"learning_rate": 8.79043600562588e-06,
"loss": 0.267,
"step": 1650
},
{
"epoch": 20.00759493670886,
"grad_norm": 11.705927848815918,
"learning_rate": 8.776371308016879e-06,
"loss": 0.0885,
"step": 1660
},
{
"epoch": 20.008860759493672,
"grad_norm": 4.711695194244385,
"learning_rate": 8.762306610407877e-06,
"loss": 0.1366,
"step": 1670
},
{
"epoch": 20.01012658227848,
"grad_norm": 1.115964412689209,
"learning_rate": 8.748241912798877e-06,
"loss": 0.9193,
"step": 1680
},
{
"epoch": 20.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 2.869112491607666,
"eval_runtime": 8.4747,
"eval_samples_per_second": 5.546,
"eval_steps_per_second": 1.416,
"step": 1680
},
{
"epoch": 21.00126582278481,
"grad_norm": 0.7912726998329163,
"learning_rate": 8.734177215189874e-06,
"loss": 0.0675,
"step": 1690
},
{
"epoch": 21.00253164556962,
"grad_norm": 1.868703007698059,
"learning_rate": 8.720112517580872e-06,
"loss": 0.0215,
"step": 1700
},
{
"epoch": 21.00379746835443,
"grad_norm": 0.07253948599100113,
"learning_rate": 8.70604781997187e-06,
"loss": 0.1879,
"step": 1710
},
{
"epoch": 21.00506329113924,
"grad_norm": 0.07606098800897598,
"learning_rate": 8.69198312236287e-06,
"loss": 0.2937,
"step": 1720
},
{
"epoch": 21.00632911392405,
"grad_norm": 4.814393520355225,
"learning_rate": 8.677918424753868e-06,
"loss": 0.1223,
"step": 1730
},
{
"epoch": 21.00759493670886,
"grad_norm": 0.052608225494623184,
"learning_rate": 8.663853727144868e-06,
"loss": 0.1895,
"step": 1740
},
{
"epoch": 21.008860759493672,
"grad_norm": 6.358555316925049,
"learning_rate": 8.649789029535866e-06,
"loss": 0.2224,
"step": 1750
},
{
"epoch": 21.01012658227848,
"grad_norm": 0.10539772361516953,
"learning_rate": 8.635724331926865e-06,
"loss": 0.1201,
"step": 1760
},
{
"epoch": 21.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 3.0563912391662598,
"eval_runtime": 8.5749,
"eval_samples_per_second": 5.481,
"eval_steps_per_second": 1.399,
"step": 1760
},
{
"epoch": 22.00126582278481,
"grad_norm": 3.320700168609619,
"learning_rate": 8.621659634317863e-06,
"loss": 0.0066,
"step": 1770
},
{
"epoch": 22.00253164556962,
"grad_norm": 77.03856658935547,
"learning_rate": 8.607594936708861e-06,
"loss": 0.0985,
"step": 1780
},
{
"epoch": 22.00379746835443,
"grad_norm": 0.06309456378221512,
"learning_rate": 8.59353023909986e-06,
"loss": 0.0229,
"step": 1790
},
{
"epoch": 22.00506329113924,
"grad_norm": 81.78655242919922,
"learning_rate": 8.579465541490859e-06,
"loss": 0.1983,
"step": 1800
},
{
"epoch": 22.00632911392405,
"grad_norm": 0.15561726689338684,
"learning_rate": 8.565400843881857e-06,
"loss": 0.1817,
"step": 1810
},
{
"epoch": 22.00759493670886,
"grad_norm": 10.21172046661377,
"learning_rate": 8.551336146272857e-06,
"loss": 0.0148,
"step": 1820
},
{
"epoch": 22.008860759493672,
"grad_norm": 0.5883349180221558,
"learning_rate": 8.537271448663855e-06,
"loss": 0.0018,
"step": 1830
},
{
"epoch": 22.01012658227848,
"grad_norm": 0.019595500081777573,
"learning_rate": 8.523206751054853e-06,
"loss": 0.1716,
"step": 1840
},
{
"epoch": 22.01012658227848,
"eval_accuracy": 0.3404255319148936,
"eval_loss": 3.390719175338745,
"eval_runtime": 8.6187,
"eval_samples_per_second": 5.453,
"eval_steps_per_second": 1.392,
"step": 1840
},
{
"epoch": 23.00126582278481,
"grad_norm": 0.025295179337263107,
"learning_rate": 8.50914205344585e-06,
"loss": 0.0037,
"step": 1850
},
{
"epoch": 23.00253164556962,
"grad_norm": 0.1332973688840866,
"learning_rate": 8.49507735583685e-06,
"loss": 0.0026,
"step": 1860
},
{
"epoch": 23.00379746835443,
"grad_norm": 0.08286605030298233,
"learning_rate": 8.481012658227848e-06,
"loss": 0.1337,
"step": 1870
},
{
"epoch": 23.00506329113924,
"grad_norm": 0.11277411133050919,
"learning_rate": 8.466947960618848e-06,
"loss": 0.1286,
"step": 1880
},
{
"epoch": 23.00632911392405,
"grad_norm": 0.047154348343610764,
"learning_rate": 8.452883263009846e-06,
"loss": 0.0068,
"step": 1890
},
{
"epoch": 23.00759493670886,
"grad_norm": 0.02648848481476307,
"learning_rate": 8.438818565400846e-06,
"loss": 0.0168,
"step": 1900
},
{
"epoch": 23.008860759493672,
"grad_norm": 0.0498431995511055,
"learning_rate": 8.424753867791844e-06,
"loss": 0.0048,
"step": 1910
},
{
"epoch": 23.01012658227848,
"grad_norm": 0.11999885737895966,
"learning_rate": 8.410689170182841e-06,
"loss": 0.0402,
"step": 1920
},
{
"epoch": 23.01012658227848,
"eval_accuracy": 0.3191489361702128,
"eval_loss": 3.7917425632476807,
"eval_runtime": 8.7081,
"eval_samples_per_second": 5.397,
"eval_steps_per_second": 1.378,
"step": 1920
},
{
"epoch": 24.00126582278481,
"grad_norm": 0.034499507397413254,
"learning_rate": 8.39662447257384e-06,
"loss": 0.012,
"step": 1930
},
{
"epoch": 24.00253164556962,
"grad_norm": 42.179473876953125,
"learning_rate": 8.382559774964839e-06,
"loss": 0.2346,
"step": 1940
},
{
"epoch": 24.00379746835443,
"grad_norm": 0.6478450298309326,
"learning_rate": 8.368495077355837e-06,
"loss": 0.008,
"step": 1950
},
{
"epoch": 24.00506329113924,
"grad_norm": 0.04269712418317795,
"learning_rate": 8.354430379746837e-06,
"loss": 0.004,
"step": 1960
},
{
"epoch": 24.00632911392405,
"grad_norm": 29.495561599731445,
"learning_rate": 8.340365682137835e-06,
"loss": 0.1656,
"step": 1970
},
{
"epoch": 24.00759493670886,
"grad_norm": 0.09528925269842148,
"learning_rate": 8.326300984528833e-06,
"loss": 0.0268,
"step": 1980
},
{
"epoch": 24.008860759493672,
"grad_norm": 0.023056741803884506,
"learning_rate": 8.31223628691983e-06,
"loss": 0.0095,
"step": 1990
},
{
"epoch": 24.01012658227848,
"grad_norm": 2.2930028438568115,
"learning_rate": 8.29817158931083e-06,
"loss": 0.0709,
"step": 2000
},
{
"epoch": 24.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 3.5486884117126465,
"eval_runtime": 8.9153,
"eval_samples_per_second": 5.272,
"eval_steps_per_second": 1.346,
"step": 2000
},
{
"epoch": 25.00126582278481,
"grad_norm": 0.614183783531189,
"learning_rate": 8.284106891701828e-06,
"loss": 0.0297,
"step": 2010
},
{
"epoch": 25.00253164556962,
"grad_norm": 243.75750732421875,
"learning_rate": 8.270042194092828e-06,
"loss": 0.1049,
"step": 2020
},
{
"epoch": 25.00379746835443,
"grad_norm": 210.3068389892578,
"learning_rate": 8.255977496483826e-06,
"loss": 0.0886,
"step": 2030
},
{
"epoch": 25.00506329113924,
"grad_norm": 2.261234760284424,
"learning_rate": 8.241912798874826e-06,
"loss": 0.0027,
"step": 2040
},
{
"epoch": 25.00632911392405,
"grad_norm": 6.479150772094727,
"learning_rate": 8.227848101265824e-06,
"loss": 0.1408,
"step": 2050
},
{
"epoch": 25.00759493670886,
"grad_norm": 0.04374171420931816,
"learning_rate": 8.213783403656822e-06,
"loss": 0.0659,
"step": 2060
},
{
"epoch": 25.008860759493672,
"grad_norm": 0.19435258209705353,
"learning_rate": 8.199718706047821e-06,
"loss": 0.0016,
"step": 2070
},
{
"epoch": 25.01012658227848,
"grad_norm": 0.020269129425287247,
"learning_rate": 8.18565400843882e-06,
"loss": 0.1021,
"step": 2080
},
{
"epoch": 25.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 3.9004390239715576,
"eval_runtime": 8.7333,
"eval_samples_per_second": 5.382,
"eval_steps_per_second": 1.374,
"step": 2080
},
{
"epoch": 26.00126582278481,
"grad_norm": 0.07372234761714935,
"learning_rate": 8.171589310829819e-06,
"loss": 0.0096,
"step": 2090
},
{
"epoch": 26.00253164556962,
"grad_norm": 0.9319536089897156,
"learning_rate": 8.157524613220817e-06,
"loss": 0.0877,
"step": 2100
},
{
"epoch": 26.00379746835443,
"grad_norm": 1.9737757444381714,
"learning_rate": 8.143459915611815e-06,
"loss": 0.0105,
"step": 2110
},
{
"epoch": 26.00506329113924,
"grad_norm": 0.010359777137637138,
"learning_rate": 8.129395218002813e-06,
"loss": 0.0019,
"step": 2120
},
{
"epoch": 26.00632911392405,
"grad_norm": 0.16365653276443481,
"learning_rate": 8.115330520393813e-06,
"loss": 0.0006,
"step": 2130
},
{
"epoch": 26.00759493670886,
"grad_norm": 184.18040466308594,
"learning_rate": 8.10126582278481e-06,
"loss": 0.1279,
"step": 2140
},
{
"epoch": 26.008860759493672,
"grad_norm": 0.01543757226318121,
"learning_rate": 8.08720112517581e-06,
"loss": 0.0289,
"step": 2150
},
{
"epoch": 26.01012658227848,
"grad_norm": 0.02343440055847168,
"learning_rate": 8.073136427566808e-06,
"loss": 0.0029,
"step": 2160
},
{
"epoch": 26.01012658227848,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 4.194858551025391,
"eval_runtime": 9.0554,
"eval_samples_per_second": 5.19,
"eval_steps_per_second": 1.325,
"step": 2160
},
{
"epoch": 27.00126582278481,
"grad_norm": 0.163554847240448,
"learning_rate": 8.059071729957806e-06,
"loss": 0.0027,
"step": 2170
},
{
"epoch": 27.00253164556962,
"grad_norm": 64.04247283935547,
"learning_rate": 8.045007032348806e-06,
"loss": 0.0081,
"step": 2180
},
{
"epoch": 27.00379746835443,
"grad_norm": 0.2571711242198944,
"learning_rate": 8.030942334739804e-06,
"loss": 0.0059,
"step": 2190
},
{
"epoch": 27.00506329113924,
"grad_norm": 0.015557597391307354,
"learning_rate": 8.016877637130802e-06,
"loss": 0.0709,
"step": 2200
},
{
"epoch": 27.00632911392405,
"grad_norm": 0.05058155208826065,
"learning_rate": 8.002812939521801e-06,
"loss": 0.0016,
"step": 2210
},
{
"epoch": 27.00759493670886,
"grad_norm": 0.06934946775436401,
"learning_rate": 7.9887482419128e-06,
"loss": 0.0048,
"step": 2220
},
{
"epoch": 27.008860759493672,
"grad_norm": 0.06157020479440689,
"learning_rate": 7.974683544303799e-06,
"loss": 0.0006,
"step": 2230
},
{
"epoch": 27.01012658227848,
"grad_norm": 0.016570130363106728,
"learning_rate": 7.960618846694797e-06,
"loss": 0.1352,
"step": 2240
},
{
"epoch": 27.01012658227848,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 4.503756999969482,
"eval_runtime": 8.4591,
"eval_samples_per_second": 5.556,
"eval_steps_per_second": 1.419,
"step": 2240
},
{
"epoch": 28.00126582278481,
"grad_norm": 0.05582532659173012,
"learning_rate": 7.946554149085795e-06,
"loss": 0.0875,
"step": 2250
},
{
"epoch": 28.00253164556962,
"grad_norm": 0.04096909984946251,
"learning_rate": 7.932489451476793e-06,
"loss": 0.0003,
"step": 2260
},
{
"epoch": 28.00379746835443,
"grad_norm": 0.9817273616790771,
"learning_rate": 7.918424753867793e-06,
"loss": 0.0012,
"step": 2270
},
{
"epoch": 28.00506329113924,
"grad_norm": 0.07687732577323914,
"learning_rate": 7.90436005625879e-06,
"loss": 0.0023,
"step": 2280
},
{
"epoch": 28.00632911392405,
"grad_norm": 18.15758514404297,
"learning_rate": 7.89029535864979e-06,
"loss": 0.1754,
"step": 2290
},
{
"epoch": 28.00759493670886,
"grad_norm": 0.007940283045172691,
"learning_rate": 7.876230661040788e-06,
"loss": 0.3378,
"step": 2300
},
{
"epoch": 28.008860759493672,
"grad_norm": 165.2981414794922,
"learning_rate": 7.862165963431786e-06,
"loss": 0.2031,
"step": 2310
},
{
"epoch": 28.01012658227848,
"grad_norm": 0.009227721951901913,
"learning_rate": 7.848101265822786e-06,
"loss": 0.0173,
"step": 2320
},
{
"epoch": 28.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 3.935215473175049,
"eval_runtime": 8.4766,
"eval_samples_per_second": 5.545,
"eval_steps_per_second": 1.416,
"step": 2320
},
{
"epoch": 29.00126582278481,
"grad_norm": 0.01626473106443882,
"learning_rate": 7.834036568213784e-06,
"loss": 0.0544,
"step": 2330
},
{
"epoch": 29.00253164556962,
"grad_norm": 0.018083002418279648,
"learning_rate": 7.819971870604782e-06,
"loss": 0.0064,
"step": 2340
},
{
"epoch": 29.00379746835443,
"grad_norm": 0.2154766470193863,
"learning_rate": 7.805907172995782e-06,
"loss": 0.0006,
"step": 2350
},
{
"epoch": 29.00506329113924,
"grad_norm": 220.96780395507812,
"learning_rate": 7.79184247538678e-06,
"loss": 0.1229,
"step": 2360
},
{
"epoch": 29.00632911392405,
"grad_norm": 0.17289696633815765,
"learning_rate": 7.77777777777778e-06,
"loss": 0.0005,
"step": 2370
},
{
"epoch": 29.00759493670886,
"grad_norm": 1.7889928817749023,
"learning_rate": 7.763713080168777e-06,
"loss": 0.1407,
"step": 2380
},
{
"epoch": 29.008860759493672,
"grad_norm": 0.008173462934792042,
"learning_rate": 7.749648382559775e-06,
"loss": 0.1463,
"step": 2390
},
{
"epoch": 29.01012658227848,
"grad_norm": 0.011393209919333458,
"learning_rate": 7.735583684950773e-06,
"loss": 0.0012,
"step": 2400
},
{
"epoch": 29.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.323361873626709,
"eval_runtime": 8.4682,
"eval_samples_per_second": 5.55,
"eval_steps_per_second": 1.417,
"step": 2400
},
{
"epoch": 30.00126582278481,
"grad_norm": 0.011178904213011265,
"learning_rate": 7.721518987341773e-06,
"loss": 0.0004,
"step": 2410
},
{
"epoch": 30.00253164556962,
"grad_norm": 0.0153023237362504,
"learning_rate": 7.70745428973277e-06,
"loss": 0.0008,
"step": 2420
},
{
"epoch": 30.00379746835443,
"grad_norm": 0.010914456099271774,
"learning_rate": 7.69338959212377e-06,
"loss": 0.0004,
"step": 2430
},
{
"epoch": 30.00506329113924,
"grad_norm": 0.007891859859228134,
"learning_rate": 7.679324894514768e-06,
"loss": 0.0005,
"step": 2440
},
{
"epoch": 30.00632911392405,
"grad_norm": 87.5243911743164,
"learning_rate": 7.665260196905766e-06,
"loss": 0.0081,
"step": 2450
},
{
"epoch": 30.00759493670886,
"grad_norm": 1.0978916883468628,
"learning_rate": 7.651195499296766e-06,
"loss": 0.0004,
"step": 2460
},
{
"epoch": 30.008860759493672,
"grad_norm": 12.773395538330078,
"learning_rate": 7.637130801687764e-06,
"loss": 0.0024,
"step": 2470
},
{
"epoch": 30.01012658227848,
"grad_norm": 0.03179134428501129,
"learning_rate": 7.623066104078764e-06,
"loss": 0.0007,
"step": 2480
},
{
"epoch": 30.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 4.287741661071777,
"eval_runtime": 8.4651,
"eval_samples_per_second": 5.552,
"eval_steps_per_second": 1.418,
"step": 2480
},
{
"epoch": 31.00126582278481,
"grad_norm": 0.005546510685235262,
"learning_rate": 7.609001406469762e-06,
"loss": 0.0003,
"step": 2490
},
{
"epoch": 31.00253164556962,
"grad_norm": 0.013632872141897678,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.1395,
"step": 2500
},
{
"epoch": 31.00379746835443,
"grad_norm": 0.006918332539498806,
"learning_rate": 7.5808720112517584e-06,
"loss": 0.0027,
"step": 2510
},
{
"epoch": 31.00506329113924,
"grad_norm": 0.012666971422731876,
"learning_rate": 7.566807313642758e-06,
"loss": 0.0004,
"step": 2520
},
{
"epoch": 31.00632911392405,
"grad_norm": 0.005221995059400797,
"learning_rate": 7.552742616033756e-06,
"loss": 0.0009,
"step": 2530
},
{
"epoch": 31.00759493670886,
"grad_norm": 0.013362145982682705,
"learning_rate": 7.538677918424755e-06,
"loss": 0.0012,
"step": 2540
},
{
"epoch": 31.008860759493672,
"grad_norm": 0.035756830126047134,
"learning_rate": 7.524613220815753e-06,
"loss": 0.0004,
"step": 2550
},
{
"epoch": 31.01012658227848,
"grad_norm": 0.08822837471961975,
"learning_rate": 7.510548523206752e-06,
"loss": 0.2292,
"step": 2560
},
{
"epoch": 31.01012658227848,
"eval_accuracy": 0.3191489361702128,
"eval_loss": 4.729736804962158,
"eval_runtime": 8.4818,
"eval_samples_per_second": 5.541,
"eval_steps_per_second": 1.415,
"step": 2560
},
{
"epoch": 32.00126582278481,
"grad_norm": 6.84944486618042,
"learning_rate": 7.4964838255977505e-06,
"loss": 0.001,
"step": 2570
},
{
"epoch": 32.00253164556962,
"grad_norm": 0.025634169578552246,
"learning_rate": 7.482419127988749e-06,
"loss": 0.0002,
"step": 2580
},
{
"epoch": 32.00379746835443,
"grad_norm": 0.026997152715921402,
"learning_rate": 7.468354430379747e-06,
"loss": 0.0089,
"step": 2590
},
{
"epoch": 32.00506329113924,
"grad_norm": 0.008302225731313229,
"learning_rate": 7.454289732770746e-06,
"loss": 0.0005,
"step": 2600
},
{
"epoch": 32.00632911392405,
"grad_norm": 0.033620625734329224,
"learning_rate": 7.440225035161744e-06,
"loss": 0.0081,
"step": 2610
},
{
"epoch": 32.00759493670886,
"grad_norm": 0.022618619725108147,
"learning_rate": 7.426160337552744e-06,
"loss": 0.0548,
"step": 2620
},
{
"epoch": 32.00886075949367,
"grad_norm": 0.0314350426197052,
"learning_rate": 7.412095639943742e-06,
"loss": 0.0003,
"step": 2630
},
{
"epoch": 32.01012658227848,
"grad_norm": 0.007120281923562288,
"learning_rate": 7.398030942334741e-06,
"loss": 0.0004,
"step": 2640
},
{
"epoch": 32.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 4.471046447753906,
"eval_runtime": 8.9073,
"eval_samples_per_second": 5.277,
"eval_steps_per_second": 1.347,
"step": 2640
},
{
"epoch": 33.00126582278481,
"grad_norm": 0.3721332550048828,
"learning_rate": 7.3839662447257386e-06,
"loss": 0.1564,
"step": 2650
},
{
"epoch": 33.00253164556962,
"grad_norm": 0.052768442779779434,
"learning_rate": 7.369901547116738e-06,
"loss": 0.0022,
"step": 2660
},
{
"epoch": 33.00379746835443,
"grad_norm": 93.05609130859375,
"learning_rate": 7.355836849507736e-06,
"loss": 0.2399,
"step": 2670
},
{
"epoch": 33.00506329113924,
"grad_norm": 0.0038992296904325485,
"learning_rate": 7.341772151898735e-06,
"loss": 0.0088,
"step": 2680
},
{
"epoch": 33.00632911392405,
"grad_norm": 0.020863041281700134,
"learning_rate": 7.327707454289733e-06,
"loss": 0.0002,
"step": 2690
},
{
"epoch": 33.00759493670886,
"grad_norm": 0.006648873444646597,
"learning_rate": 7.313642756680732e-06,
"loss": 0.0388,
"step": 2700
},
{
"epoch": 33.00886075949367,
"grad_norm": 156.20700073242188,
"learning_rate": 7.29957805907173e-06,
"loss": 0.2333,
"step": 2710
},
{
"epoch": 33.01012658227848,
"grad_norm": 0.008939997293055058,
"learning_rate": 7.2855133614627295e-06,
"loss": 0.0361,
"step": 2720
},
{
"epoch": 33.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.239119529724121,
"eval_runtime": 8.9548,
"eval_samples_per_second": 5.249,
"eval_steps_per_second": 1.34,
"step": 2720
},
{
"epoch": 34.00126582278481,
"grad_norm": 15.270977020263672,
"learning_rate": 7.2714486638537275e-06,
"loss": 0.1088,
"step": 2730
},
{
"epoch": 34.00253164556962,
"grad_norm": 14.447574615478516,
"learning_rate": 7.257383966244726e-06,
"loss": 0.002,
"step": 2740
},
{
"epoch": 34.00379746835443,
"grad_norm": 0.009641851298511028,
"learning_rate": 7.243319268635724e-06,
"loss": 0.0003,
"step": 2750
},
{
"epoch": 34.00506329113924,
"grad_norm": 3.5248186588287354,
"learning_rate": 7.229254571026724e-06,
"loss": 0.0007,
"step": 2760
},
{
"epoch": 34.00632911392405,
"grad_norm": 0.06941874325275421,
"learning_rate": 7.215189873417722e-06,
"loss": 0.106,
"step": 2770
},
{
"epoch": 34.00759493670886,
"grad_norm": 0.0060513801872730255,
"learning_rate": 7.201125175808721e-06,
"loss": 0.0003,
"step": 2780
},
{
"epoch": 34.00886075949367,
"grad_norm": 0.03698160871863365,
"learning_rate": 7.187060478199719e-06,
"loss": 0.0003,
"step": 2790
},
{
"epoch": 34.01012658227848,
"grad_norm": 0.021343868225812912,
"learning_rate": 7.172995780590718e-06,
"loss": 0.0002,
"step": 2800
},
{
"epoch": 34.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.2255730628967285,
"eval_runtime": 9.1143,
"eval_samples_per_second": 5.157,
"eval_steps_per_second": 1.317,
"step": 2800
},
{
"epoch": 35.00126582278481,
"grad_norm": 0.02194453403353691,
"learning_rate": 7.158931082981716e-06,
"loss": 0.0002,
"step": 2810
},
{
"epoch": 35.00253164556962,
"grad_norm": 0.008681000210344791,
"learning_rate": 7.144866385372715e-06,
"loss": 0.0017,
"step": 2820
},
{
"epoch": 35.00379746835443,
"grad_norm": 0.003180101979523897,
"learning_rate": 7.130801687763713e-06,
"loss": 0.0002,
"step": 2830
},
{
"epoch": 35.00506329113924,
"grad_norm": 0.00399158988147974,
"learning_rate": 7.116736990154712e-06,
"loss": 0.0851,
"step": 2840
},
{
"epoch": 35.00632911392405,
"grad_norm": 317.16937255859375,
"learning_rate": 7.10267229254571e-06,
"loss": 0.1581,
"step": 2850
},
{
"epoch": 35.00759493670886,
"grad_norm": 0.006524229887872934,
"learning_rate": 7.08860759493671e-06,
"loss": 0.0002,
"step": 2860
},
{
"epoch": 35.00886075949367,
"grad_norm": 230.1353759765625,
"learning_rate": 7.074542897327708e-06,
"loss": 0.1746,
"step": 2870
},
{
"epoch": 35.01012658227848,
"grad_norm": 89.08749389648438,
"learning_rate": 7.060478199718706e-06,
"loss": 0.0082,
"step": 2880
},
{
"epoch": 35.01012658227848,
"eval_accuracy": 0.3404255319148936,
"eval_loss": 5.073359489440918,
"eval_runtime": 8.501,
"eval_samples_per_second": 5.529,
"eval_steps_per_second": 1.412,
"step": 2880
},
{
"epoch": 36.00126582278481,
"grad_norm": 0.014753330498933792,
"learning_rate": 7.046413502109706e-06,
"loss": 0.0002,
"step": 2890
},
{
"epoch": 36.00253164556962,
"grad_norm": 3.0008251667022705,
"learning_rate": 7.032348804500704e-06,
"loss": 0.0373,
"step": 2900
},
{
"epoch": 36.00379746835443,
"grad_norm": 0.010498768649995327,
"learning_rate": 7.018284106891703e-06,
"loss": 0.009,
"step": 2910
},
{
"epoch": 36.00506329113924,
"grad_norm": 0.06089121848344803,
"learning_rate": 7.004219409282701e-06,
"loss": 0.0003,
"step": 2920
},
{
"epoch": 36.00632911392405,
"grad_norm": 0.009548901580274105,
"learning_rate": 6.9901547116737e-06,
"loss": 0.1971,
"step": 2930
},
{
"epoch": 36.00759493670886,
"grad_norm": 0.11378785222768784,
"learning_rate": 6.9760900140646985e-06,
"loss": 0.0021,
"step": 2940
},
{
"epoch": 36.00886075949367,
"grad_norm": 0.004684086889028549,
"learning_rate": 6.962025316455697e-06,
"loss": 0.0879,
"step": 2950
},
{
"epoch": 36.01012658227848,
"grad_norm": 0.005387377459555864,
"learning_rate": 6.947960618846695e-06,
"loss": 0.0318,
"step": 2960
},
{
"epoch": 36.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.073455333709717,
"eval_runtime": 8.491,
"eval_samples_per_second": 5.535,
"eval_steps_per_second": 1.413,
"step": 2960
},
{
"epoch": 37.00126582278481,
"grad_norm": 0.0073117660358548164,
"learning_rate": 6.933895921237694e-06,
"loss": 0.0002,
"step": 2970
},
{
"epoch": 37.00253164556962,
"grad_norm": 0.01575954630970955,
"learning_rate": 6.919831223628692e-06,
"loss": 0.0051,
"step": 2980
},
{
"epoch": 37.00379746835443,
"grad_norm": 0.005418274085968733,
"learning_rate": 6.905766526019692e-06,
"loss": 0.0003,
"step": 2990
},
{
"epoch": 37.00506329113924,
"grad_norm": 0.004269735421985388,
"learning_rate": 6.89170182841069e-06,
"loss": 0.0658,
"step": 3000
},
{
"epoch": 37.00632911392405,
"grad_norm": 0.004627808462828398,
"learning_rate": 6.8776371308016885e-06,
"loss": 0.0001,
"step": 3010
},
{
"epoch": 37.00759493670886,
"grad_norm": 0.008293317630887032,
"learning_rate": 6.8635724331926865e-06,
"loss": 0.0904,
"step": 3020
},
{
"epoch": 37.00886075949367,
"grad_norm": 0.013359429314732552,
"learning_rate": 6.849507735583686e-06,
"loss": 0.1007,
"step": 3030
},
{
"epoch": 37.01012658227848,
"grad_norm": 0.006999185774475336,
"learning_rate": 6.835443037974684e-06,
"loss": 0.0002,
"step": 3040
},
{
"epoch": 37.01012658227848,
"eval_accuracy": 0.2553191489361702,
"eval_loss": 5.146430492401123,
"eval_runtime": 8.4797,
"eval_samples_per_second": 5.543,
"eval_steps_per_second": 1.415,
"step": 3040
},
{
"epoch": 38.00126582278481,
"grad_norm": 0.005403840448707342,
"learning_rate": 6.821378340365683e-06,
"loss": 0.0003,
"step": 3050
},
{
"epoch": 38.00253164556962,
"grad_norm": 0.01304860319942236,
"learning_rate": 6.807313642756681e-06,
"loss": 0.0003,
"step": 3060
},
{
"epoch": 38.00379746835443,
"grad_norm": 0.29351142048835754,
"learning_rate": 6.79324894514768e-06,
"loss": 0.0004,
"step": 3070
},
{
"epoch": 38.00506329113924,
"grad_norm": 0.0071726636961102486,
"learning_rate": 6.779184247538679e-06,
"loss": 0.0002,
"step": 3080
},
{
"epoch": 38.00632911392405,
"grad_norm": 0.8798180222511292,
"learning_rate": 6.7651195499296774e-06,
"loss": 0.0578,
"step": 3090
},
{
"epoch": 38.00759493670886,
"grad_norm": 0.01378143671900034,
"learning_rate": 6.751054852320675e-06,
"loss": 0.0004,
"step": 3100
},
{
"epoch": 38.00886075949367,
"grad_norm": 0.005133031401783228,
"learning_rate": 6.736990154711674e-06,
"loss": 0.0062,
"step": 3110
},
{
"epoch": 38.01012658227848,
"grad_norm": 0.01705407164990902,
"learning_rate": 6.722925457102672e-06,
"loss": 0.0003,
"step": 3120
},
{
"epoch": 38.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.634023189544678,
"eval_runtime": 8.7121,
"eval_samples_per_second": 5.395,
"eval_steps_per_second": 1.377,
"step": 3120
},
{
"epoch": 39.00126582278481,
"grad_norm": 0.005898992531001568,
"learning_rate": 6.708860759493672e-06,
"loss": 0.0008,
"step": 3130
},
{
"epoch": 39.00253164556962,
"grad_norm": 0.40792742371559143,
"learning_rate": 6.69479606188467e-06,
"loss": 0.0003,
"step": 3140
},
{
"epoch": 39.00379746835443,
"grad_norm": 0.019352609291672707,
"learning_rate": 6.680731364275669e-06,
"loss": 0.0002,
"step": 3150
},
{
"epoch": 39.00506329113924,
"grad_norm": 0.0045697493478655815,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0001,
"step": 3160
},
{
"epoch": 39.00632911392405,
"grad_norm": 0.005903047509491444,
"learning_rate": 6.652601969057666e-06,
"loss": 0.0001,
"step": 3170
},
{
"epoch": 39.00759493670886,
"grad_norm": 0.01714833825826645,
"learning_rate": 6.638537271448664e-06,
"loss": 0.1579,
"step": 3180
},
{
"epoch": 39.00886075949367,
"grad_norm": 0.07012953609228134,
"learning_rate": 6.624472573839663e-06,
"loss": 0.0002,
"step": 3190
},
{
"epoch": 39.01012658227848,
"grad_norm": 0.022253967821598053,
"learning_rate": 6.610407876230661e-06,
"loss": 0.48,
"step": 3200
},
{
"epoch": 39.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.337032794952393,
"eval_runtime": 8.4914,
"eval_samples_per_second": 5.535,
"eval_steps_per_second": 1.413,
"step": 3200
},
{
"epoch": 40.00126582278481,
"grad_norm": 233.1455535888672,
"learning_rate": 6.59634317862166e-06,
"loss": 0.0365,
"step": 3210
},
{
"epoch": 40.00253164556962,
"grad_norm": 0.008999134413897991,
"learning_rate": 6.582278481012659e-06,
"loss": 0.1475,
"step": 3220
},
{
"epoch": 40.00379746835443,
"grad_norm": 0.0032340127509087324,
"learning_rate": 6.5682137834036576e-06,
"loss": 0.1164,
"step": 3230
},
{
"epoch": 40.00506329113924,
"grad_norm": 0.014319919049739838,
"learning_rate": 6.5541490857946555e-06,
"loss": 0.0008,
"step": 3240
},
{
"epoch": 40.00632911392405,
"grad_norm": 0.019842559471726418,
"learning_rate": 6.540084388185654e-06,
"loss": 0.0006,
"step": 3250
},
{
"epoch": 40.00759493670886,
"grad_norm": 0.018094390630722046,
"learning_rate": 6.526019690576652e-06,
"loss": 0.0005,
"step": 3260
},
{
"epoch": 40.00886075949367,
"grad_norm": 0.00912653561681509,
"learning_rate": 6.511954992967652e-06,
"loss": 0.049,
"step": 3270
},
{
"epoch": 40.01012658227848,
"grad_norm": 0.03302593529224396,
"learning_rate": 6.49789029535865e-06,
"loss": 0.0002,
"step": 3280
},
{
"epoch": 40.01012658227848,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 4.582009315490723,
"eval_runtime": 8.4753,
"eval_samples_per_second": 5.546,
"eval_steps_per_second": 1.416,
"step": 3280
},
{
"epoch": 41.00126582278481,
"grad_norm": 10.000889778137207,
"learning_rate": 6.483825597749649e-06,
"loss": 0.0011,
"step": 3290
},
{
"epoch": 41.00253164556962,
"grad_norm": 0.009547159075737,
"learning_rate": 6.4697609001406485e-06,
"loss": 0.0002,
"step": 3300
},
{
"epoch": 41.00379746835443,
"grad_norm": 0.005821730941534042,
"learning_rate": 6.4556962025316464e-06,
"loss": 0.0001,
"step": 3310
},
{
"epoch": 41.00506329113924,
"grad_norm": 0.00588320242241025,
"learning_rate": 6.441631504922645e-06,
"loss": 0.0025,
"step": 3320
},
{
"epoch": 41.00632911392405,
"grad_norm": 298.84820556640625,
"learning_rate": 6.427566807313643e-06,
"loss": 0.2948,
"step": 3330
},
{
"epoch": 41.00759493670886,
"grad_norm": 0.00702635245397687,
"learning_rate": 6.413502109704642e-06,
"loss": 0.0002,
"step": 3340
},
{
"epoch": 41.00886075949367,
"grad_norm": 0.003056429559364915,
"learning_rate": 6.39943741209564e-06,
"loss": 0.0259,
"step": 3350
},
{
"epoch": 41.01012658227848,
"grad_norm": 0.011072452180087566,
"learning_rate": 6.38537271448664e-06,
"loss": 0.0002,
"step": 3360
},
{
"epoch": 41.01012658227848,
"eval_accuracy": 0.3191489361702128,
"eval_loss": 5.0156683921813965,
"eval_runtime": 8.468,
"eval_samples_per_second": 5.55,
"eval_steps_per_second": 1.417,
"step": 3360
},
{
"epoch": 42.00126582278481,
"grad_norm": 0.022217601537704468,
"learning_rate": 6.371308016877638e-06,
"loss": 0.0002,
"step": 3370
},
{
"epoch": 42.00253164556962,
"grad_norm": 65.69084167480469,
"learning_rate": 6.3572433192686365e-06,
"loss": 0.0037,
"step": 3380
},
{
"epoch": 42.00379746835443,
"grad_norm": 0.07589118182659149,
"learning_rate": 6.3431786216596345e-06,
"loss": 0.0002,
"step": 3390
},
{
"epoch": 42.00506329113924,
"grad_norm": 0.003494243137538433,
"learning_rate": 6.329113924050634e-06,
"loss": 0.0013,
"step": 3400
},
{
"epoch": 42.00632911392405,
"grad_norm": 0.0027374387718737125,
"learning_rate": 6.315049226441632e-06,
"loss": 0.2015,
"step": 3410
},
{
"epoch": 42.00759493670886,
"grad_norm": 0.0055436789989471436,
"learning_rate": 6.300984528832631e-06,
"loss": 0.0001,
"step": 3420
},
{
"epoch": 42.00886075949367,
"grad_norm": 0.006678999401628971,
"learning_rate": 6.286919831223629e-06,
"loss": 0.0016,
"step": 3430
},
{
"epoch": 42.01012658227848,
"grad_norm": 0.006669959519058466,
"learning_rate": 6.272855133614629e-06,
"loss": 0.1209,
"step": 3440
},
{
"epoch": 42.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 4.310915946960449,
"eval_runtime": 8.4948,
"eval_samples_per_second": 5.533,
"eval_steps_per_second": 1.413,
"step": 3440
},
{
"epoch": 43.00126582278481,
"grad_norm": 36.991024017333984,
"learning_rate": 6.2587904360056266e-06,
"loss": 0.0031,
"step": 3450
},
{
"epoch": 43.00253164556962,
"grad_norm": 0.03218389302492142,
"learning_rate": 6.244725738396625e-06,
"loss": 0.0003,
"step": 3460
},
{
"epoch": 43.00379746835443,
"grad_norm": 0.0067522223107516766,
"learning_rate": 6.230661040787623e-06,
"loss": 0.0002,
"step": 3470
},
{
"epoch": 43.00506329113924,
"grad_norm": 0.09478826075792313,
"learning_rate": 6.216596343178622e-06,
"loss": 0.0001,
"step": 3480
},
{
"epoch": 43.00632911392405,
"grad_norm": 0.006108371540904045,
"learning_rate": 6.20253164556962e-06,
"loss": 0.0001,
"step": 3490
},
{
"epoch": 43.00759493670886,
"grad_norm": 0.004173735156655312,
"learning_rate": 6.18846694796062e-06,
"loss": 0.0002,
"step": 3500
},
{
"epoch": 43.00886075949367,
"grad_norm": 0.004864380694925785,
"learning_rate": 6.174402250351618e-06,
"loss": 0.0001,
"step": 3510
},
{
"epoch": 43.01012658227848,
"grad_norm": 0.006738508120179176,
"learning_rate": 6.160337552742617e-06,
"loss": 0.0001,
"step": 3520
},
{
"epoch": 43.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.459574222564697,
"eval_runtime": 8.4737,
"eval_samples_per_second": 5.547,
"eval_steps_per_second": 1.416,
"step": 3520
},
{
"epoch": 44.00126582278481,
"grad_norm": 0.00519252335652709,
"learning_rate": 6.146272855133615e-06,
"loss": 0.0001,
"step": 3530
},
{
"epoch": 44.00253164556962,
"grad_norm": 0.0036063846200704575,
"learning_rate": 6.132208157524614e-06,
"loss": 0.0001,
"step": 3540
},
{
"epoch": 44.00379746835443,
"grad_norm": 0.004207131918519735,
"learning_rate": 6.118143459915612e-06,
"loss": 0.0001,
"step": 3550
},
{
"epoch": 44.00506329113924,
"grad_norm": 0.0024055996909737587,
"learning_rate": 6.104078762306611e-06,
"loss": 0.0001,
"step": 3560
},
{
"epoch": 44.00632911392405,
"grad_norm": 0.004374026786535978,
"learning_rate": 6.090014064697609e-06,
"loss": 0.0001,
"step": 3570
},
{
"epoch": 44.00759493670886,
"grad_norm": 0.0027957686688750982,
"learning_rate": 6.075949367088608e-06,
"loss": 0.0002,
"step": 3580
},
{
"epoch": 44.00886075949367,
"grad_norm": 0.008639072068035603,
"learning_rate": 6.061884669479607e-06,
"loss": 0.004,
"step": 3590
},
{
"epoch": 44.01012658227848,
"grad_norm": 0.011701129376888275,
"learning_rate": 6.0478199718706055e-06,
"loss": 0.0109,
"step": 3600
},
{
"epoch": 44.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 4.425137996673584,
"eval_runtime": 8.4694,
"eval_samples_per_second": 5.549,
"eval_steps_per_second": 1.417,
"step": 3600
},
{
"epoch": 45.00126582278481,
"grad_norm": 0.017412984743714333,
"learning_rate": 6.0337552742616035e-06,
"loss": 0.0001,
"step": 3610
},
{
"epoch": 45.00253164556962,
"grad_norm": 0.007230939343571663,
"learning_rate": 6.019690576652602e-06,
"loss": 0.0698,
"step": 3620
},
{
"epoch": 45.00379746835443,
"grad_norm": 0.014825068414211273,
"learning_rate": 6.0056258790436e-06,
"loss": 0.0001,
"step": 3630
},
{
"epoch": 45.00506329113924,
"grad_norm": 0.013121239840984344,
"learning_rate": 5.9915611814346e-06,
"loss": 0.0001,
"step": 3640
},
{
"epoch": 45.00632911392405,
"grad_norm": 0.013468984514474869,
"learning_rate": 5.977496483825598e-06,
"loss": 0.0001,
"step": 3650
},
{
"epoch": 45.00759493670886,
"grad_norm": 0.06317138671875,
"learning_rate": 5.963431786216597e-06,
"loss": 0.0001,
"step": 3660
},
{
"epoch": 45.00886075949367,
"grad_norm": 0.003630951512604952,
"learning_rate": 5.949367088607595e-06,
"loss": 0.1698,
"step": 3670
},
{
"epoch": 45.01012658227848,
"grad_norm": 0.005787010304629803,
"learning_rate": 5.935302390998594e-06,
"loss": 0.0001,
"step": 3680
},
{
"epoch": 45.01012658227848,
"eval_accuracy": 0.2978723404255319,
"eval_loss": 5.296198844909668,
"eval_runtime": 8.4784,
"eval_samples_per_second": 5.543,
"eval_steps_per_second": 1.415,
"step": 3680
},
{
"epoch": 46.00126582278481,
"grad_norm": 0.00311860884539783,
"learning_rate": 5.921237693389592e-06,
"loss": 0.0553,
"step": 3690
},
{
"epoch": 46.00253164556962,
"grad_norm": 0.005304061342030764,
"learning_rate": 5.907172995780591e-06,
"loss": 0.0002,
"step": 3700
},
{
"epoch": 46.00379746835443,
"grad_norm": 0.015418877825140953,
"learning_rate": 5.893108298171589e-06,
"loss": 0.0001,
"step": 3710
},
{
"epoch": 46.00506329113924,
"grad_norm": 0.018117068335413933,
"learning_rate": 5.879043600562588e-06,
"loss": 0.0024,
"step": 3720
},
{
"epoch": 46.00632911392405,
"grad_norm": 0.004327620379626751,
"learning_rate": 5.864978902953588e-06,
"loss": 0.0003,
"step": 3730
},
{
"epoch": 46.00759493670886,
"grad_norm": 0.024266647174954414,
"learning_rate": 5.850914205344586e-06,
"loss": 0.0001,
"step": 3740
},
{
"epoch": 46.00886075949367,
"grad_norm": 414.96563720703125,
"learning_rate": 5.8368495077355845e-06,
"loss": 0.0487,
"step": 3750
},
{
"epoch": 46.01012658227848,
"grad_norm": 0.008569066412746906,
"learning_rate": 5.8227848101265824e-06,
"loss": 0.1516,
"step": 3760
},
{
"epoch": 46.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.23142147064209,
"eval_runtime": 8.4548,
"eval_samples_per_second": 5.559,
"eval_steps_per_second": 1.419,
"step": 3760
},
{
"epoch": 47.00126582278481,
"grad_norm": 5.414425849914551,
"learning_rate": 5.808720112517582e-06,
"loss": 0.0007,
"step": 3770
},
{
"epoch": 47.00253164556962,
"grad_norm": 0.0027215760201215744,
"learning_rate": 5.79465541490858e-06,
"loss": 0.0002,
"step": 3780
},
{
"epoch": 47.00379746835443,
"grad_norm": 0.15611502528190613,
"learning_rate": 5.780590717299579e-06,
"loss": 0.0073,
"step": 3790
},
{
"epoch": 47.00506329113924,
"grad_norm": 0.0019717360846698284,
"learning_rate": 5.766526019690577e-06,
"loss": 0.0003,
"step": 3800
},
{
"epoch": 47.00632911392405,
"grad_norm": 0.005944707430899143,
"learning_rate": 5.7524613220815765e-06,
"loss": 0.0035,
"step": 3810
},
{
"epoch": 47.00759493670886,
"grad_norm": 0.00346226803958416,
"learning_rate": 5.7383966244725745e-06,
"loss": 0.0001,
"step": 3820
},
{
"epoch": 47.00886075949367,
"grad_norm": 0.024175411090254784,
"learning_rate": 5.724331926863573e-06,
"loss": 0.0193,
"step": 3830
},
{
"epoch": 47.01012658227848,
"grad_norm": 0.03984224796295166,
"learning_rate": 5.710267229254571e-06,
"loss": 0.0001,
"step": 3840
},
{
"epoch": 47.01012658227848,
"eval_accuracy": 0.5319148936170213,
"eval_loss": 4.070488929748535,
"eval_runtime": 8.5074,
"eval_samples_per_second": 5.525,
"eval_steps_per_second": 1.411,
"step": 3840
},
{
"epoch": 48.00126582278481,
"grad_norm": 0.012625842355191708,
"learning_rate": 5.69620253164557e-06,
"loss": 0.0218,
"step": 3850
},
{
"epoch": 48.00253164556962,
"grad_norm": 0.008255310356616974,
"learning_rate": 5.682137834036568e-06,
"loss": 0.012,
"step": 3860
},
{
"epoch": 48.00379746835443,
"grad_norm": 0.019036876037716866,
"learning_rate": 5.668073136427568e-06,
"loss": 0.001,
"step": 3870
},
{
"epoch": 48.00506329113924,
"grad_norm": 0.013268685899674892,
"learning_rate": 5.654008438818566e-06,
"loss": 0.0002,
"step": 3880
},
{
"epoch": 48.00632911392405,
"grad_norm": 0.01589319296181202,
"learning_rate": 5.639943741209565e-06,
"loss": 0.0756,
"step": 3890
},
{
"epoch": 48.00759493670886,
"grad_norm": 0.0036217246670275927,
"learning_rate": 5.6258790436005626e-06,
"loss": 0.1435,
"step": 3900
},
{
"epoch": 48.00886075949367,
"grad_norm": 0.002351459814235568,
"learning_rate": 5.611814345991562e-06,
"loss": 0.0001,
"step": 3910
},
{
"epoch": 48.01012658227848,
"grad_norm": 0.0023628135677427053,
"learning_rate": 5.59774964838256e-06,
"loss": 0.0001,
"step": 3920
},
{
"epoch": 48.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.55861234664917,
"eval_runtime": 8.4995,
"eval_samples_per_second": 5.53,
"eval_steps_per_second": 1.412,
"step": 3920
},
{
"epoch": 49.00126582278481,
"grad_norm": 0.003683815710246563,
"learning_rate": 5.583684950773559e-06,
"loss": 0.0001,
"step": 3930
},
{
"epoch": 49.00253164556962,
"grad_norm": 0.004656449891626835,
"learning_rate": 5.569620253164557e-06,
"loss": 0.0001,
"step": 3940
},
{
"epoch": 49.00379746835443,
"grad_norm": 0.012214281596243382,
"learning_rate": 5.555555555555557e-06,
"loss": 0.0052,
"step": 3950
},
{
"epoch": 49.00506329113924,
"grad_norm": 0.009479358792304993,
"learning_rate": 5.541490857946555e-06,
"loss": 0.012,
"step": 3960
},
{
"epoch": 49.00632911392405,
"grad_norm": 0.008819793350994587,
"learning_rate": 5.5274261603375535e-06,
"loss": 0.0001,
"step": 3970
},
{
"epoch": 49.00759493670886,
"grad_norm": 0.04174829646945,
"learning_rate": 5.5133614627285514e-06,
"loss": 0.0001,
"step": 3980
},
{
"epoch": 49.00886075949367,
"grad_norm": 0.0032157686073333025,
"learning_rate": 5.49929676511955e-06,
"loss": 0.1845,
"step": 3990
},
{
"epoch": 49.01012658227848,
"grad_norm": 0.010618672706186771,
"learning_rate": 5.485232067510548e-06,
"loss": 0.0266,
"step": 4000
},
{
"epoch": 49.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.947876453399658,
"eval_runtime": 8.5288,
"eval_samples_per_second": 5.511,
"eval_steps_per_second": 1.407,
"step": 4000
},
{
"epoch": 50.00126582278481,
"grad_norm": 0.056022197008132935,
"learning_rate": 5.471167369901548e-06,
"loss": 0.2505,
"step": 4010
},
{
"epoch": 50.00253164556962,
"grad_norm": 0.004495659377425909,
"learning_rate": 5.457102672292546e-06,
"loss": 0.053,
"step": 4020
},
{
"epoch": 50.00379746835443,
"grad_norm": 0.003035302273929119,
"learning_rate": 5.443037974683545e-06,
"loss": 0.0001,
"step": 4030
},
{
"epoch": 50.00506329113924,
"grad_norm": 0.006570629775524139,
"learning_rate": 5.428973277074543e-06,
"loss": 0.0002,
"step": 4040
},
{
"epoch": 50.00632911392405,
"grad_norm": 323.4715881347656,
"learning_rate": 5.414908579465542e-06,
"loss": 0.055,
"step": 4050
},
{
"epoch": 50.00759493670886,
"grad_norm": 0.002824920229613781,
"learning_rate": 5.40084388185654e-06,
"loss": 0.0001,
"step": 4060
},
{
"epoch": 50.00886075949367,
"grad_norm": 0.06357023864984512,
"learning_rate": 5.386779184247539e-06,
"loss": 0.0001,
"step": 4070
},
{
"epoch": 50.01012658227848,
"grad_norm": 0.004729899112135172,
"learning_rate": 5.372714486638537e-06,
"loss": 0.0001,
"step": 4080
},
{
"epoch": 50.01012658227848,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.32703161239624,
"eval_runtime": 8.4715,
"eval_samples_per_second": 5.548,
"eval_steps_per_second": 1.417,
"step": 4080
},
{
"epoch": 51.00126582278481,
"grad_norm": 0.008333638310432434,
"learning_rate": 5.358649789029536e-06,
"loss": 0.0002,
"step": 4090
},
{
"epoch": 51.00253164556962,
"grad_norm": 0.009458293206989765,
"learning_rate": 5.344585091420535e-06,
"loss": 0.0001,
"step": 4100
},
{
"epoch": 51.00379746835443,
"grad_norm": 0.0024418376851826906,
"learning_rate": 5.330520393811534e-06,
"loss": 0.0002,
"step": 4110
},
{
"epoch": 51.00506329113924,
"grad_norm": 0.004669019021093845,
"learning_rate": 5.3164556962025316e-06,
"loss": 0.0001,
"step": 4120
},
{
"epoch": 51.00632911392405,
"grad_norm": 0.003113614860922098,
"learning_rate": 5.30239099859353e-06,
"loss": 0.0012,
"step": 4130
},
{
"epoch": 51.00759493670886,
"grad_norm": 0.003157148603349924,
"learning_rate": 5.28832630098453e-06,
"loss": 0.0001,
"step": 4140
},
{
"epoch": 51.00886075949367,
"grad_norm": 0.004666858818382025,
"learning_rate": 5.274261603375528e-06,
"loss": 0.1445,
"step": 4150
},
{
"epoch": 51.01012658227848,
"grad_norm": 0.002661221195012331,
"learning_rate": 5.260196905766527e-06,
"loss": 0.1307,
"step": 4160
},
{
"epoch": 51.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 4.794792175292969,
"eval_runtime": 8.4848,
"eval_samples_per_second": 5.539,
"eval_steps_per_second": 1.414,
"step": 4160
},
{
"epoch": 52.00126582278481,
"grad_norm": 0.001836895477026701,
"learning_rate": 5.246132208157525e-06,
"loss": 0.0002,
"step": 4170
},
{
"epoch": 52.00253164556962,
"grad_norm": 2.3909878730773926,
"learning_rate": 5.2320675105485245e-06,
"loss": 0.0084,
"step": 4180
},
{
"epoch": 52.00379746835443,
"grad_norm": 0.0022460331674665213,
"learning_rate": 5.2180028129395225e-06,
"loss": 0.0005,
"step": 4190
},
{
"epoch": 52.00506329113924,
"grad_norm": 0.7268118858337402,
"learning_rate": 5.203938115330521e-06,
"loss": 0.0008,
"step": 4200
},
{
"epoch": 52.00632911392405,
"grad_norm": 0.0033825428690761328,
"learning_rate": 5.189873417721519e-06,
"loss": 0.0001,
"step": 4210
},
{
"epoch": 52.00759493670886,
"grad_norm": 0.006189883220940828,
"learning_rate": 5.175808720112518e-06,
"loss": 0.0001,
"step": 4220
},
{
"epoch": 52.00886075949367,
"grad_norm": 0.006958500016480684,
"learning_rate": 5.161744022503516e-06,
"loss": 0.0002,
"step": 4230
},
{
"epoch": 52.01012658227848,
"grad_norm": 0.003031873842701316,
"learning_rate": 5.147679324894516e-06,
"loss": 0.0019,
"step": 4240
},
{
"epoch": 52.01012658227848,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 4.363827705383301,
"eval_runtime": 8.6439,
"eval_samples_per_second": 5.437,
"eval_steps_per_second": 1.388,
"step": 4240
},
{
"epoch": 53.00126582278481,
"grad_norm": 0.09558191895484924,
"learning_rate": 5.133614627285514e-06,
"loss": 0.0001,
"step": 4250
},
{
"epoch": 53.00253164556962,
"grad_norm": 0.002434414578601718,
"learning_rate": 5.1195499296765125e-06,
"loss": 0.0007,
"step": 4260
},
{
"epoch": 53.00379746835443,
"grad_norm": 0.018281536176800728,
"learning_rate": 5.1054852320675105e-06,
"loss": 0.0004,
"step": 4270
},
{
"epoch": 53.00506329113924,
"grad_norm": 0.003481107298284769,
"learning_rate": 5.09142053445851e-06,
"loss": 0.0001,
"step": 4280
},
{
"epoch": 53.00632911392405,
"grad_norm": 0.0011617491254583001,
"learning_rate": 5.077355836849508e-06,
"loss": 0.0001,
"step": 4290
},
{
"epoch": 53.00759493670886,
"grad_norm": 0.007551996968686581,
"learning_rate": 5.063291139240507e-06,
"loss": 0.0001,
"step": 4300
},
{
"epoch": 53.00886075949367,
"grad_norm": 0.003541940590366721,
"learning_rate": 5.049226441631505e-06,
"loss": 0.0001,
"step": 4310
},
{
"epoch": 53.01012658227848,
"grad_norm": 0.0031788817141205072,
"learning_rate": 5.035161744022505e-06,
"loss": 0.0001,
"step": 4320
},
{
"epoch": 53.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.586310386657715,
"eval_runtime": 8.4934,
"eval_samples_per_second": 5.534,
"eval_steps_per_second": 1.413,
"step": 4320
},
{
"epoch": 54.00126582278481,
"grad_norm": 0.0024340234231203794,
"learning_rate": 5.021097046413503e-06,
"loss": 0.0001,
"step": 4330
},
{
"epoch": 54.00253164556962,
"grad_norm": 0.0034480541944503784,
"learning_rate": 5.007032348804501e-06,
"loss": 0.0001,
"step": 4340
},
{
"epoch": 54.00379746835443,
"grad_norm": 0.0023180190473794937,
"learning_rate": 4.9929676511955e-06,
"loss": 0.0001,
"step": 4350
},
{
"epoch": 54.00506329113924,
"grad_norm": 0.0015061123995110393,
"learning_rate": 4.978902953586498e-06,
"loss": 0.0001,
"step": 4360
},
{
"epoch": 54.00632911392405,
"grad_norm": 0.013990904204547405,
"learning_rate": 4.964838255977497e-06,
"loss": 0.0001,
"step": 4370
},
{
"epoch": 54.00759493670886,
"grad_norm": 0.0013285009190440178,
"learning_rate": 4.950773558368496e-06,
"loss": 0.0001,
"step": 4380
},
{
"epoch": 54.00886075949367,
"grad_norm": 0.00343449623323977,
"learning_rate": 4.936708860759495e-06,
"loss": 0.0001,
"step": 4390
},
{
"epoch": 54.01012658227848,
"grad_norm": 0.003100321162492037,
"learning_rate": 4.922644163150493e-06,
"loss": 0.0001,
"step": 4400
},
{
"epoch": 54.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.737309455871582,
"eval_runtime": 8.4983,
"eval_samples_per_second": 5.53,
"eval_steps_per_second": 1.412,
"step": 4400
},
{
"epoch": 55.00126582278481,
"grad_norm": 0.0022656081710010767,
"learning_rate": 4.9085794655414915e-06,
"loss": 0.0001,
"step": 4410
},
{
"epoch": 55.00253164556962,
"grad_norm": 0.002674259478226304,
"learning_rate": 4.89451476793249e-06,
"loss": 0.0001,
"step": 4420
},
{
"epoch": 55.00379746835443,
"grad_norm": 0.0027046040631830692,
"learning_rate": 4.880450070323488e-06,
"loss": 0.0024,
"step": 4430
},
{
"epoch": 55.00506329113924,
"grad_norm": 0.003956619184464216,
"learning_rate": 4.866385372714487e-06,
"loss": 0.0001,
"step": 4440
},
{
"epoch": 55.00632911392405,
"grad_norm": 0.03139903396368027,
"learning_rate": 4.852320675105486e-06,
"loss": 0.0742,
"step": 4450
},
{
"epoch": 55.00759493670886,
"grad_norm": 0.00574122928082943,
"learning_rate": 4.838255977496485e-06,
"loss": 0.2666,
"step": 4460
},
{
"epoch": 55.00886075949367,
"grad_norm": 0.012300165370106697,
"learning_rate": 4.824191279887483e-06,
"loss": 0.0001,
"step": 4470
},
{
"epoch": 55.01012658227848,
"grad_norm": 5.114750385284424,
"learning_rate": 4.8101265822784815e-06,
"loss": 0.0006,
"step": 4480
},
{
"epoch": 55.01012658227848,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 3.9066474437713623,
"eval_runtime": 8.5347,
"eval_samples_per_second": 5.507,
"eval_steps_per_second": 1.406,
"step": 4480
},
{
"epoch": 56.00126582278481,
"grad_norm": 11.49494457244873,
"learning_rate": 4.79606188466948e-06,
"loss": 0.0011,
"step": 4490
},
{
"epoch": 56.00253164556962,
"grad_norm": 0.005774380173534155,
"learning_rate": 4.781997187060478e-06,
"loss": 0.0001,
"step": 4500
},
{
"epoch": 56.00379746835443,
"grad_norm": 0.006357602309435606,
"learning_rate": 4.767932489451477e-06,
"loss": 0.0001,
"step": 4510
},
{
"epoch": 56.00506329113924,
"grad_norm": 0.002659859601408243,
"learning_rate": 4.753867791842476e-06,
"loss": 0.0017,
"step": 4520
},
{
"epoch": 56.00632911392405,
"grad_norm": 0.013889423571527004,
"learning_rate": 4.739803094233475e-06,
"loss": 0.0001,
"step": 4530
},
{
"epoch": 56.00759493670886,
"grad_norm": 0.002410717075690627,
"learning_rate": 4.725738396624473e-06,
"loss": 0.0001,
"step": 4540
},
{
"epoch": 56.00886075949367,
"grad_norm": 0.0023062098771333694,
"learning_rate": 4.711673699015472e-06,
"loss": 0.0001,
"step": 4550
},
{
"epoch": 56.01012658227848,
"grad_norm": 0.0023318820167332888,
"learning_rate": 4.6976090014064704e-06,
"loss": 0.0001,
"step": 4560
},
{
"epoch": 56.01012658227848,
"eval_accuracy": 0.46808510638297873,
"eval_loss": 4.031365394592285,
"eval_runtime": 8.457,
"eval_samples_per_second": 5.558,
"eval_steps_per_second": 1.419,
"step": 4560
},
{
"epoch": 57.00126582278481,
"grad_norm": 0.004980940837413073,
"learning_rate": 4.683544303797468e-06,
"loss": 0.0001,
"step": 4570
},
{
"epoch": 57.00253164556962,
"grad_norm": 0.0019251375924795866,
"learning_rate": 4.669479606188467e-06,
"loss": 0.0001,
"step": 4580
},
{
"epoch": 57.00379746835443,
"grad_norm": 0.0028012413531541824,
"learning_rate": 4.655414908579466e-06,
"loss": 0.0174,
"step": 4590
},
{
"epoch": 57.00506329113924,
"grad_norm": 143.03387451171875,
"learning_rate": 4.641350210970465e-06,
"loss": 0.0119,
"step": 4600
},
{
"epoch": 57.00632911392405,
"grad_norm": 0.003186359303072095,
"learning_rate": 4.627285513361463e-06,
"loss": 0.0001,
"step": 4610
},
{
"epoch": 57.00759493670886,
"grad_norm": 0.11152984201908112,
"learning_rate": 4.613220815752462e-06,
"loss": 0.0037,
"step": 4620
},
{
"epoch": 57.00886075949367,
"grad_norm": 0.001349453697912395,
"learning_rate": 4.5991561181434605e-06,
"loss": 0.1545,
"step": 4630
},
{
"epoch": 57.01012658227848,
"grad_norm": 0.05059582367539406,
"learning_rate": 4.5850914205344585e-06,
"loss": 0.0001,
"step": 4640
},
{
"epoch": 57.01012658227848,
"eval_accuracy": 0.5106382978723404,
"eval_loss": 4.058121204376221,
"eval_runtime": 8.491,
"eval_samples_per_second": 5.535,
"eval_steps_per_second": 1.413,
"step": 4640
},
{
"epoch": 58.00126582278481,
"grad_norm": 0.003746249247342348,
"learning_rate": 4.571026722925457e-06,
"loss": 0.0,
"step": 4650
},
{
"epoch": 58.00253164556962,
"grad_norm": 0.0019692752975970507,
"learning_rate": 4.556962025316456e-06,
"loss": 0.0001,
"step": 4660
},
{
"epoch": 58.00379746835443,
"grad_norm": 0.002934554358944297,
"learning_rate": 4.542897327707454e-06,
"loss": 0.0001,
"step": 4670
},
{
"epoch": 58.00506329113924,
"grad_norm": 0.005108493380248547,
"learning_rate": 4.528832630098453e-06,
"loss": 0.0402,
"step": 4680
},
{
"epoch": 58.00632911392405,
"grad_norm": 0.004260794725269079,
"learning_rate": 4.514767932489452e-06,
"loss": 0.0006,
"step": 4690
},
{
"epoch": 58.00759493670886,
"grad_norm": 0.06016235053539276,
"learning_rate": 4.5007032348804506e-06,
"loss": 0.0004,
"step": 4700
},
{
"epoch": 58.00886075949367,
"grad_norm": 0.0011827549897134304,
"learning_rate": 4.4866385372714485e-06,
"loss": 0.114,
"step": 4710
},
{
"epoch": 58.01012658227848,
"grad_norm": 0.002215220592916012,
"learning_rate": 4.472573839662447e-06,
"loss": 0.0001,
"step": 4720
},
{
"epoch": 58.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 5.004458904266357,
"eval_runtime": 8.4472,
"eval_samples_per_second": 5.564,
"eval_steps_per_second": 1.421,
"step": 4720
},
{
"epoch": 59.00126582278481,
"grad_norm": 0.004406394902616739,
"learning_rate": 4.458509142053446e-06,
"loss": 0.0005,
"step": 4730
},
{
"epoch": 59.00253164556962,
"grad_norm": 0.002640231978148222,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0001,
"step": 4740
},
{
"epoch": 59.00379746835443,
"grad_norm": 0.0029783693607896566,
"learning_rate": 4.430379746835443e-06,
"loss": 0.0001,
"step": 4750
},
{
"epoch": 59.00506329113924,
"grad_norm": 0.013541797176003456,
"learning_rate": 4.416315049226442e-06,
"loss": 0.0004,
"step": 4760
},
{
"epoch": 59.00632911392405,
"grad_norm": 0.0070534199476242065,
"learning_rate": 4.402250351617441e-06,
"loss": 0.0002,
"step": 4770
},
{
"epoch": 59.00759493670886,
"grad_norm": 0.0034858768340200186,
"learning_rate": 4.3881856540084394e-06,
"loss": 0.0001,
"step": 4780
},
{
"epoch": 59.00886075949367,
"grad_norm": 0.00406244769692421,
"learning_rate": 4.374120956399438e-06,
"loss": 0.0,
"step": 4790
},
{
"epoch": 59.01012658227848,
"grad_norm": 0.0017109077889472246,
"learning_rate": 4.360056258790436e-06,
"loss": 0.0001,
"step": 4800
},
{
"epoch": 59.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.089483737945557,
"eval_runtime": 8.467,
"eval_samples_per_second": 5.551,
"eval_steps_per_second": 1.417,
"step": 4800
},
{
"epoch": 60.00126582278481,
"grad_norm": 0.0024315589107573032,
"learning_rate": 4.345991561181435e-06,
"loss": 0.0,
"step": 4810
},
{
"epoch": 60.00253164556962,
"grad_norm": 0.0012313745683059096,
"learning_rate": 4.331926863572434e-06,
"loss": 0.0001,
"step": 4820
},
{
"epoch": 60.00379746835443,
"grad_norm": 0.0019479021430015564,
"learning_rate": 4.317862165963433e-06,
"loss": 0.0004,
"step": 4830
},
{
"epoch": 60.00506329113924,
"grad_norm": 0.00494040735065937,
"learning_rate": 4.303797468354431e-06,
"loss": 0.0872,
"step": 4840
},
{
"epoch": 60.00632911392405,
"grad_norm": 0.012567605823278427,
"learning_rate": 4.2897327707454295e-06,
"loss": 0.0011,
"step": 4850
},
{
"epoch": 60.00759493670886,
"grad_norm": 0.002357608638703823,
"learning_rate": 4.275668073136428e-06,
"loss": 0.0296,
"step": 4860
},
{
"epoch": 60.00886075949367,
"grad_norm": 0.0030609623063355684,
"learning_rate": 4.261603375527426e-06,
"loss": 0.0243,
"step": 4870
},
{
"epoch": 60.01012658227848,
"grad_norm": 0.0016012099804356694,
"learning_rate": 4.247538677918425e-06,
"loss": 0.0713,
"step": 4880
},
{
"epoch": 60.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 5.042915344238281,
"eval_runtime": 8.491,
"eval_samples_per_second": 5.535,
"eval_steps_per_second": 1.413,
"step": 4880
},
{
"epoch": 61.00126582278481,
"grad_norm": 0.004251533187925816,
"learning_rate": 4.233473980309424e-06,
"loss": 0.0766,
"step": 4890
},
{
"epoch": 61.00253164556962,
"grad_norm": 0.0019293460063636303,
"learning_rate": 4.219409282700423e-06,
"loss": 0.0006,
"step": 4900
},
{
"epoch": 61.00379746835443,
"grad_norm": 0.004420694895088673,
"learning_rate": 4.205344585091421e-06,
"loss": 0.0567,
"step": 4910
},
{
"epoch": 61.00506329113924,
"grad_norm": 0.2990714907646179,
"learning_rate": 4.1912798874824196e-06,
"loss": 0.0274,
"step": 4920
},
{
"epoch": 61.00632911392405,
"grad_norm": 0.004468689672648907,
"learning_rate": 4.177215189873418e-06,
"loss": 0.0001,
"step": 4930
},
{
"epoch": 61.00759493670886,
"grad_norm": 0.004564017057418823,
"learning_rate": 4.163150492264416e-06,
"loss": 0.2423,
"step": 4940
},
{
"epoch": 61.00886075949367,
"grad_norm": 93.79319763183594,
"learning_rate": 4.149085794655415e-06,
"loss": 0.0517,
"step": 4950
},
{
"epoch": 61.01012658227848,
"grad_norm": 0.004615637473762035,
"learning_rate": 4.135021097046414e-06,
"loss": 0.0017,
"step": 4960
},
{
"epoch": 61.01012658227848,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.786965370178223,
"eval_runtime": 8.4752,
"eval_samples_per_second": 5.546,
"eval_steps_per_second": 1.416,
"step": 4960
},
{
"epoch": 62.00126582278481,
"grad_norm": 0.061868444085121155,
"learning_rate": 4.120956399437413e-06,
"loss": 0.0001,
"step": 4970
},
{
"epoch": 62.00253164556962,
"grad_norm": 0.006057640537619591,
"learning_rate": 4.106891701828411e-06,
"loss": 0.0013,
"step": 4980
},
{
"epoch": 62.00379746835443,
"grad_norm": 4.027284145355225,
"learning_rate": 4.09282700421941e-06,
"loss": 0.0004,
"step": 4990
},
{
"epoch": 62.00506329113924,
"grad_norm": 0.00944253709167242,
"learning_rate": 4.0787623066104084e-06,
"loss": 0.0001,
"step": 5000
},
{
"epoch": 62.00632911392405,
"grad_norm": 0.0035694832913577557,
"learning_rate": 4.064697609001406e-06,
"loss": 0.0001,
"step": 5010
},
{
"epoch": 62.00759493670886,
"grad_norm": 0.0015398276736959815,
"learning_rate": 4.050632911392405e-06,
"loss": 0.0671,
"step": 5020
},
{
"epoch": 62.00886075949367,
"grad_norm": 0.002066017361357808,
"learning_rate": 4.036568213783404e-06,
"loss": 0.0009,
"step": 5030
},
{
"epoch": 62.01012658227848,
"grad_norm": 0.003685934003442526,
"learning_rate": 4.022503516174403e-06,
"loss": 0.0676,
"step": 5040
},
{
"epoch": 62.01012658227848,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 5.095708847045898,
"eval_runtime": 8.4925,
"eval_samples_per_second": 5.534,
"eval_steps_per_second": 1.413,
"step": 5040
},
{
"epoch": 63.00126582278481,
"grad_norm": 0.0065714651718735695,
"learning_rate": 4.008438818565401e-06,
"loss": 0.0001,
"step": 5050
},
{
"epoch": 63.00253164556962,
"grad_norm": 0.003956567496061325,
"learning_rate": 3.9943741209564e-06,
"loss": 0.0001,
"step": 5060
},
{
"epoch": 63.00379746835443,
"grad_norm": 0.008157577365636826,
"learning_rate": 3.9803094233473985e-06,
"loss": 0.0,
"step": 5070
},
{
"epoch": 63.00506329113924,
"grad_norm": 0.0031191923189908266,
"learning_rate": 3.9662447257383965e-06,
"loss": 0.0,
"step": 5080
},
{
"epoch": 63.00632911392405,
"grad_norm": 0.0020041377283632755,
"learning_rate": 3.952180028129395e-06,
"loss": 0.0001,
"step": 5090
},
{
"epoch": 63.00759493670886,
"grad_norm": 0.004067094065248966,
"learning_rate": 3.938115330520394e-06,
"loss": 0.0001,
"step": 5100
},
{
"epoch": 63.00886075949367,
"grad_norm": 0.0008815817418508232,
"learning_rate": 3.924050632911393e-06,
"loss": 0.0,
"step": 5110
},
{
"epoch": 63.01012658227848,
"grad_norm": 0.008889904245734215,
"learning_rate": 3.909985935302391e-06,
"loss": 0.0,
"step": 5120
},
{
"epoch": 63.01012658227848,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.606178283691406,
"eval_runtime": 8.459,
"eval_samples_per_second": 5.556,
"eval_steps_per_second": 1.419,
"step": 5120
},
{
"epoch": 64.00126582278482,
"grad_norm": 0.0015669207787141204,
"learning_rate": 3.89592123769339e-06,
"loss": 0.0,
"step": 5130
},
{
"epoch": 64.00253164556962,
"grad_norm": 0.0011807240080088377,
"learning_rate": 3.8818565400843886e-06,
"loss": 0.0,
"step": 5140
},
{
"epoch": 64.00379746835443,
"grad_norm": 0.0015742299146950245,
"learning_rate": 3.8677918424753865e-06,
"loss": 0.0,
"step": 5150
},
{
"epoch": 64.00506329113924,
"grad_norm": 0.004820580128580332,
"learning_rate": 3.853727144866385e-06,
"loss": 0.0,
"step": 5160
},
{
"epoch": 64.00632911392405,
"grad_norm": 0.0032741015311330557,
"learning_rate": 3.839662447257384e-06,
"loss": 0.0008,
"step": 5170
},
{
"epoch": 64.00759493670886,
"grad_norm": 0.0036417359951883554,
"learning_rate": 3.825597749648383e-06,
"loss": 0.0,
"step": 5180
},
{
"epoch": 64.00886075949367,
"grad_norm": 0.008372402749955654,
"learning_rate": 3.811533052039382e-06,
"loss": 0.0001,
"step": 5190
},
{
"epoch": 64.01012658227847,
"grad_norm": 0.003397272201254964,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.0045,
"step": 5200
},
{
"epoch": 64.01012658227847,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 5.245887756347656,
"eval_runtime": 8.4656,
"eval_samples_per_second": 5.552,
"eval_steps_per_second": 1.418,
"step": 5200
},
{
"epoch": 65.00126582278482,
"grad_norm": 0.0019356166012585163,
"learning_rate": 3.783403656821379e-06,
"loss": 0.0014,
"step": 5210
},
{
"epoch": 65.00253164556962,
"grad_norm": 0.0013496861793100834,
"learning_rate": 3.7693389592123775e-06,
"loss": 0.0267,
"step": 5220
},
{
"epoch": 65.00379746835443,
"grad_norm": 0.04248461872339249,
"learning_rate": 3.755274261603376e-06,
"loss": 0.0002,
"step": 5230
},
{
"epoch": 65.00506329113924,
"grad_norm": 0.004868203774094582,
"learning_rate": 3.7412095639943747e-06,
"loss": 0.002,
"step": 5240
},
{
"epoch": 65.00632911392405,
"grad_norm": 0.0008925410802476108,
"learning_rate": 3.727144866385373e-06,
"loss": 0.0,
"step": 5250
},
{
"epoch": 65.00759493670886,
"grad_norm": 0.0019144342513754964,
"learning_rate": 3.713080168776372e-06,
"loss": 0.0,
"step": 5260
},
{
"epoch": 65.00886075949367,
"grad_norm": 0.001448463648557663,
"learning_rate": 3.6990154711673703e-06,
"loss": 0.0001,
"step": 5270
},
{
"epoch": 65.01012658227847,
"grad_norm": 0.0024687196128070354,
"learning_rate": 3.684950773558369e-06,
"loss": 0.0943,
"step": 5280
},
{
"epoch": 65.01012658227847,
"eval_accuracy": 0.3617021276595745,
"eval_loss": 5.0856242179870605,
"eval_runtime": 8.4623,
"eval_samples_per_second": 5.554,
"eval_steps_per_second": 1.418,
"step": 5280
},
{
"epoch": 66.00126582278482,
"grad_norm": 0.0016155457124114037,
"learning_rate": 3.6708860759493675e-06,
"loss": 0.0001,
"step": 5290
},
{
"epoch": 66.00253164556962,
"grad_norm": 0.17640484869480133,
"learning_rate": 3.656821378340366e-06,
"loss": 0.0001,
"step": 5300
},
{
"epoch": 66.00379746835443,
"grad_norm": 0.0015737387584522367,
"learning_rate": 3.6427566807313647e-06,
"loss": 0.0,
"step": 5310
},
{
"epoch": 66.00506329113924,
"grad_norm": 0.015487028285861015,
"learning_rate": 3.628691983122363e-06,
"loss": 0.0001,
"step": 5320
},
{
"epoch": 66.00632911392405,
"grad_norm": 0.0009026491898111999,
"learning_rate": 3.614627285513362e-06,
"loss": 0.0143,
"step": 5330
},
{
"epoch": 66.00759493670886,
"grad_norm": 0.0015520367305725813,
"learning_rate": 3.6005625879043604e-06,
"loss": 0.0048,
"step": 5340
},
{
"epoch": 66.00886075949367,
"grad_norm": 0.007421619724482298,
"learning_rate": 3.586497890295359e-06,
"loss": 0.0,
"step": 5350
},
{
"epoch": 66.01012658227847,
"grad_norm": 0.0009921834571287036,
"learning_rate": 3.5724331926863576e-06,
"loss": 0.0002,
"step": 5360
},
{
"epoch": 66.01012658227847,
"eval_accuracy": 0.48936170212765956,
"eval_loss": 4.449216365814209,
"eval_runtime": 8.455,
"eval_samples_per_second": 5.559,
"eval_steps_per_second": 1.419,
"step": 5360
},
{
"epoch": 67.00126582278482,
"grad_norm": 0.009655151516199112,
"learning_rate": 3.558368495077356e-06,
"loss": 0.0001,
"step": 5370
},
{
"epoch": 67.00253164556962,
"grad_norm": 0.004027712158858776,
"learning_rate": 3.544303797468355e-06,
"loss": 0.0001,
"step": 5380
},
{
"epoch": 67.00379746835443,
"grad_norm": 0.002234363229945302,
"learning_rate": 3.530239099859353e-06,
"loss": 0.0001,
"step": 5390
},
{
"epoch": 67.00506329113924,
"grad_norm": 0.001890279003418982,
"learning_rate": 3.516174402250352e-06,
"loss": 0.0001,
"step": 5400
},
{
"epoch": 67.00632911392405,
"grad_norm": 0.0018905351171270013,
"learning_rate": 3.5021097046413504e-06,
"loss": 0.0,
"step": 5410
},
{
"epoch": 67.00759493670886,
"grad_norm": 0.001428403309546411,
"learning_rate": 3.4880450070323492e-06,
"loss": 0.0001,
"step": 5420
},
{
"epoch": 67.00886075949367,
"grad_norm": 0.001419195905327797,
"learning_rate": 3.4739803094233476e-06,
"loss": 0.0924,
"step": 5430
},
{
"epoch": 67.01012658227847,
"grad_norm": 0.0011965942103415728,
"learning_rate": 3.459915611814346e-06,
"loss": 0.0002,
"step": 5440
},
{
"epoch": 67.01012658227847,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 5.17952299118042,
"eval_runtime": 8.4394,
"eval_samples_per_second": 5.569,
"eval_steps_per_second": 1.422,
"step": 5440
},
{
"epoch": 68.00126582278482,
"grad_norm": 0.004659404046833515,
"learning_rate": 3.445850914205345e-06,
"loss": 0.0001,
"step": 5450
},
{
"epoch": 68.00253164556962,
"grad_norm": 282.0872497558594,
"learning_rate": 3.4317862165963433e-06,
"loss": 0.0246,
"step": 5460
},
{
"epoch": 68.00379746835443,
"grad_norm": 0.19597108662128448,
"learning_rate": 3.417721518987342e-06,
"loss": 0.0001,
"step": 5470
},
{
"epoch": 68.00506329113924,
"grad_norm": 0.0020114402286708355,
"learning_rate": 3.4036568213783405e-06,
"loss": 0.0449,
"step": 5480
},
{
"epoch": 68.00632911392405,
"grad_norm": 0.0017703929916024208,
"learning_rate": 3.3895921237693393e-06,
"loss": 0.0001,
"step": 5490
},
{
"epoch": 68.00759493670886,
"grad_norm": 0.005612295586615801,
"learning_rate": 3.3755274261603377e-06,
"loss": 0.0,
"step": 5500
},
{
"epoch": 68.00886075949367,
"grad_norm": 0.002703710226342082,
"learning_rate": 3.361462728551336e-06,
"loss": 0.0001,
"step": 5510
},
{
"epoch": 68.01012658227847,
"grad_norm": 0.0033236260060220957,
"learning_rate": 3.347398030942335e-06,
"loss": 0.0007,
"step": 5520
},
{
"epoch": 68.01012658227847,
"eval_accuracy": 0.46808510638297873,
"eval_loss": 4.3201751708984375,
"eval_runtime": 8.4512,
"eval_samples_per_second": 5.561,
"eval_steps_per_second": 1.42,
"step": 5520
},
{
"epoch": 69.00126582278482,
"grad_norm": 0.13362517952919006,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0001,
"step": 5530
},
{
"epoch": 69.00253164556962,
"grad_norm": 0.0022546211257576942,
"learning_rate": 3.319268635724332e-06,
"loss": 0.1001,
"step": 5540
},
{
"epoch": 69.00379746835443,
"grad_norm": 0.002193002263084054,
"learning_rate": 3.3052039381153305e-06,
"loss": 0.0,
"step": 5550
},
{
"epoch": 69.00506329113924,
"grad_norm": 0.0027829715982079506,
"learning_rate": 3.2911392405063294e-06,
"loss": 0.0061,
"step": 5560
},
{
"epoch": 69.00632911392405,
"grad_norm": 0.0031120802741497755,
"learning_rate": 3.2770745428973278e-06,
"loss": 0.0001,
"step": 5570
},
{
"epoch": 69.00759493670886,
"grad_norm": 0.001309010898694396,
"learning_rate": 3.263009845288326e-06,
"loss": 0.0001,
"step": 5580
},
{
"epoch": 69.00886075949367,
"grad_norm": 0.028802473098039627,
"learning_rate": 3.248945147679325e-06,
"loss": 0.004,
"step": 5590
},
{
"epoch": 69.01012658227847,
"grad_norm": 0.0005848377477377653,
"learning_rate": 3.2348804500703242e-06,
"loss": 0.1678,
"step": 5600
},
{
"epoch": 69.01012658227847,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.868789196014404,
"eval_runtime": 8.4887,
"eval_samples_per_second": 5.537,
"eval_steps_per_second": 1.414,
"step": 5600
},
{
"epoch": 70.00126582278482,
"grad_norm": 0.0012933706166222692,
"learning_rate": 3.2208157524613226e-06,
"loss": 0.0002,
"step": 5610
},
{
"epoch": 70.00253164556962,
"grad_norm": 0.02926361933350563,
"learning_rate": 3.206751054852321e-06,
"loss": 0.0041,
"step": 5620
},
{
"epoch": 70.00379746835443,
"grad_norm": 0.001349663594737649,
"learning_rate": 3.19268635724332e-06,
"loss": 0.0003,
"step": 5630
},
{
"epoch": 70.00506329113924,
"grad_norm": 0.002003163332119584,
"learning_rate": 3.1786216596343183e-06,
"loss": 0.0,
"step": 5640
},
{
"epoch": 70.00632911392405,
"grad_norm": 0.011114409193396568,
"learning_rate": 3.164556962025317e-06,
"loss": 0.0001,
"step": 5650
},
{
"epoch": 70.00759493670886,
"grad_norm": 0.004937044810503721,
"learning_rate": 3.1504922644163155e-06,
"loss": 0.0,
"step": 5660
},
{
"epoch": 70.00886075949367,
"grad_norm": 0.01026509702205658,
"learning_rate": 3.1364275668073143e-06,
"loss": 0.0001,
"step": 5670
},
{
"epoch": 70.01012658227847,
"grad_norm": 0.00430481368675828,
"learning_rate": 3.1223628691983127e-06,
"loss": 0.0001,
"step": 5680
},
{
"epoch": 70.01012658227847,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 5.288034439086914,
"eval_runtime": 8.477,
"eval_samples_per_second": 5.544,
"eval_steps_per_second": 1.416,
"step": 5680
},
{
"epoch": 71.00126582278482,
"grad_norm": 0.0036116482224315405,
"learning_rate": 3.108298171589311e-06,
"loss": 0.0,
"step": 5690
},
{
"epoch": 71.00253164556962,
"grad_norm": 0.015287871472537518,
"learning_rate": 3.09423347398031e-06,
"loss": 0.0001,
"step": 5700
},
{
"epoch": 71.00379746835443,
"grad_norm": 0.008669455535709858,
"learning_rate": 3.0801687763713083e-06,
"loss": 0.0,
"step": 5710
},
{
"epoch": 71.00506329113924,
"grad_norm": 0.019757656380534172,
"learning_rate": 3.066104078762307e-06,
"loss": 0.0001,
"step": 5720
},
{
"epoch": 71.00632911392405,
"grad_norm": 0.012890863232314587,
"learning_rate": 3.0520393811533055e-06,
"loss": 0.0,
"step": 5730
},
{
"epoch": 71.00759493670886,
"grad_norm": 0.0019587704446166754,
"learning_rate": 3.037974683544304e-06,
"loss": 0.0,
"step": 5740
},
{
"epoch": 71.00886075949367,
"grad_norm": 0.0018708609277382493,
"learning_rate": 3.0239099859353028e-06,
"loss": 0.0,
"step": 5750
},
{
"epoch": 71.01012658227847,
"grad_norm": 0.003253462491557002,
"learning_rate": 3.009845288326301e-06,
"loss": 0.0,
"step": 5760
},
{
"epoch": 71.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 5.115118503570557,
"eval_runtime": 8.4566,
"eval_samples_per_second": 5.558,
"eval_steps_per_second": 1.419,
"step": 5760
},
{
"epoch": 72.00126582278482,
"grad_norm": 0.001443624496459961,
"learning_rate": 2.9957805907173e-06,
"loss": 0.0,
"step": 5770
},
{
"epoch": 72.00253164556962,
"grad_norm": 0.0025624725967645645,
"learning_rate": 2.9817158931082984e-06,
"loss": 0.0026,
"step": 5780
},
{
"epoch": 72.00379746835443,
"grad_norm": 0.00680403271690011,
"learning_rate": 2.967651195499297e-06,
"loss": 0.0,
"step": 5790
},
{
"epoch": 72.00506329113924,
"grad_norm": 0.0030975525733083487,
"learning_rate": 2.9535864978902956e-06,
"loss": 0.0,
"step": 5800
},
{
"epoch": 72.00632911392405,
"grad_norm": 0.019320061430335045,
"learning_rate": 2.939521800281294e-06,
"loss": 0.2163,
"step": 5810
},
{
"epoch": 72.00759493670886,
"grad_norm": 0.07163013517856598,
"learning_rate": 2.925457102672293e-06,
"loss": 0.0001,
"step": 5820
},
{
"epoch": 72.00886075949367,
"grad_norm": 0.0038794104475528,
"learning_rate": 2.9113924050632912e-06,
"loss": 0.1065,
"step": 5830
},
{
"epoch": 72.01012658227847,
"grad_norm": 0.0027189133688807487,
"learning_rate": 2.89732770745429e-06,
"loss": 0.0005,
"step": 5840
},
{
"epoch": 72.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.566655158996582,
"eval_runtime": 8.4516,
"eval_samples_per_second": 5.561,
"eval_steps_per_second": 1.42,
"step": 5840
},
{
"epoch": 73.00126582278482,
"grad_norm": 0.0017171819927170873,
"learning_rate": 2.8832630098452884e-06,
"loss": 0.0001,
"step": 5850
},
{
"epoch": 73.00253164556962,
"grad_norm": 0.0033329855650663376,
"learning_rate": 2.8691983122362873e-06,
"loss": 0.0,
"step": 5860
},
{
"epoch": 73.00379746835443,
"grad_norm": 0.008366705849766731,
"learning_rate": 2.8551336146272857e-06,
"loss": 0.0001,
"step": 5870
},
{
"epoch": 73.00506329113924,
"grad_norm": 0.0013916816096752882,
"learning_rate": 2.841068917018284e-06,
"loss": 0.0001,
"step": 5880
},
{
"epoch": 73.00632911392405,
"grad_norm": 0.001828556414693594,
"learning_rate": 2.827004219409283e-06,
"loss": 0.0,
"step": 5890
},
{
"epoch": 73.00759493670886,
"grad_norm": 0.002120325807482004,
"learning_rate": 2.8129395218002813e-06,
"loss": 0.0,
"step": 5900
},
{
"epoch": 73.00886075949367,
"grad_norm": 0.0018465804168954492,
"learning_rate": 2.79887482419128e-06,
"loss": 0.0,
"step": 5910
},
{
"epoch": 73.01012658227847,
"grad_norm": 0.0017947384621948004,
"learning_rate": 2.7848101265822785e-06,
"loss": 0.0,
"step": 5920
},
{
"epoch": 73.01012658227847,
"eval_accuracy": 0.46808510638297873,
"eval_loss": 4.288333415985107,
"eval_runtime": 8.4999,
"eval_samples_per_second": 5.529,
"eval_steps_per_second": 1.412,
"step": 5920
},
{
"epoch": 74.00126582278482,
"grad_norm": 0.0011507548624649644,
"learning_rate": 2.7707454289732773e-06,
"loss": 0.0001,
"step": 5930
},
{
"epoch": 74.00253164556962,
"grad_norm": 0.001057163462974131,
"learning_rate": 2.7566807313642757e-06,
"loss": 0.0002,
"step": 5940
},
{
"epoch": 74.00379746835443,
"grad_norm": 0.001940654474310577,
"learning_rate": 2.742616033755274e-06,
"loss": 0.0001,
"step": 5950
},
{
"epoch": 74.00506329113924,
"grad_norm": 0.013309543952345848,
"learning_rate": 2.728551336146273e-06,
"loss": 0.1043,
"step": 5960
},
{
"epoch": 74.00632911392405,
"grad_norm": 0.003933705855160952,
"learning_rate": 2.7144866385372713e-06,
"loss": 0.0,
"step": 5970
},
{
"epoch": 74.00759493670886,
"grad_norm": 0.0016960457433015108,
"learning_rate": 2.70042194092827e-06,
"loss": 0.0011,
"step": 5980
},
{
"epoch": 74.00886075949367,
"grad_norm": 0.0025782466400414705,
"learning_rate": 2.6863572433192686e-06,
"loss": 0.0001,
"step": 5990
},
{
"epoch": 74.01012658227847,
"grad_norm": 0.0025307261385023594,
"learning_rate": 2.6722925457102674e-06,
"loss": 0.0,
"step": 6000
},
{
"epoch": 74.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.684779167175293,
"eval_runtime": 8.5153,
"eval_samples_per_second": 5.519,
"eval_steps_per_second": 1.409,
"step": 6000
},
{
"epoch": 75.00126582278482,
"grad_norm": 0.0016025023069232702,
"learning_rate": 2.6582278481012658e-06,
"loss": 0.1562,
"step": 6010
},
{
"epoch": 75.00253164556962,
"grad_norm": 0.0006585910450667143,
"learning_rate": 2.644163150492265e-06,
"loss": 0.0001,
"step": 6020
},
{
"epoch": 75.00379746835443,
"grad_norm": 0.0031663491390645504,
"learning_rate": 2.6300984528832634e-06,
"loss": 0.0,
"step": 6030
},
{
"epoch": 75.00506329113924,
"grad_norm": 0.015673287212848663,
"learning_rate": 2.6160337552742622e-06,
"loss": 0.0,
"step": 6040
},
{
"epoch": 75.00632911392405,
"grad_norm": 0.003231970127671957,
"learning_rate": 2.6019690576652606e-06,
"loss": 0.0,
"step": 6050
},
{
"epoch": 75.00759493670886,
"grad_norm": 0.0015489223878830671,
"learning_rate": 2.587904360056259e-06,
"loss": 0.0,
"step": 6060
},
{
"epoch": 75.00886075949367,
"grad_norm": 18.648025512695312,
"learning_rate": 2.573839662447258e-06,
"loss": 0.0019,
"step": 6070
},
{
"epoch": 75.01012658227847,
"grad_norm": 0.00114185712300241,
"learning_rate": 2.5597749648382563e-06,
"loss": 0.0,
"step": 6080
},
{
"epoch": 75.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.815650939941406,
"eval_runtime": 8.5452,
"eval_samples_per_second": 5.5,
"eval_steps_per_second": 1.404,
"step": 6080
},
{
"epoch": 76.00126582278482,
"grad_norm": 0.20001061260700226,
"learning_rate": 2.545710267229255e-06,
"loss": 0.0001,
"step": 6090
},
{
"epoch": 76.00253164556962,
"grad_norm": 0.002338194055482745,
"learning_rate": 2.5316455696202535e-06,
"loss": 0.0,
"step": 6100
},
{
"epoch": 76.00379746835443,
"grad_norm": 0.004149795509874821,
"learning_rate": 2.5175808720112523e-06,
"loss": 0.0,
"step": 6110
},
{
"epoch": 76.00506329113924,
"grad_norm": 0.0017416217597201467,
"learning_rate": 2.5035161744022507e-06,
"loss": 0.0003,
"step": 6120
},
{
"epoch": 76.00632911392405,
"grad_norm": 0.0011654688278213143,
"learning_rate": 2.489451476793249e-06,
"loss": 0.0,
"step": 6130
},
{
"epoch": 76.00759493670886,
"grad_norm": 0.0034060347825288773,
"learning_rate": 2.475386779184248e-06,
"loss": 0.0,
"step": 6140
},
{
"epoch": 76.00886075949367,
"grad_norm": 0.0008188936626538634,
"learning_rate": 2.4613220815752463e-06,
"loss": 0.0,
"step": 6150
},
{
"epoch": 76.01012658227847,
"grad_norm": 0.0015720352530479431,
"learning_rate": 2.447257383966245e-06,
"loss": 0.0,
"step": 6160
},
{
"epoch": 76.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.8248186111450195,
"eval_runtime": 8.5505,
"eval_samples_per_second": 5.497,
"eval_steps_per_second": 1.403,
"step": 6160
},
{
"epoch": 77.00126582278482,
"grad_norm": 0.007548962719738483,
"learning_rate": 2.4331926863572436e-06,
"loss": 0.0,
"step": 6170
},
{
"epoch": 77.00253164556962,
"grad_norm": 0.00578302051872015,
"learning_rate": 2.4191279887482424e-06,
"loss": 0.0,
"step": 6180
},
{
"epoch": 77.00379746835443,
"grad_norm": 0.0033245827071368694,
"learning_rate": 2.4050632911392408e-06,
"loss": 0.0,
"step": 6190
},
{
"epoch": 77.00506329113924,
"grad_norm": 0.0008535035303793848,
"learning_rate": 2.390998593530239e-06,
"loss": 0.0015,
"step": 6200
},
{
"epoch": 77.00632911392405,
"grad_norm": 0.0016984603134915233,
"learning_rate": 2.376933895921238e-06,
"loss": 0.0001,
"step": 6210
},
{
"epoch": 77.00759493670886,
"grad_norm": 0.0014544576406478882,
"learning_rate": 2.3628691983122364e-06,
"loss": 0.0045,
"step": 6220
},
{
"epoch": 77.00886075949367,
"grad_norm": 0.0007344476762227714,
"learning_rate": 2.3488045007032352e-06,
"loss": 0.0,
"step": 6230
},
{
"epoch": 77.01012658227847,
"grad_norm": 0.014698284678161144,
"learning_rate": 2.3347398030942336e-06,
"loss": 0.0,
"step": 6240
},
{
"epoch": 77.01012658227847,
"eval_accuracy": 0.48936170212765956,
"eval_loss": 4.5635786056518555,
"eval_runtime": 8.4512,
"eval_samples_per_second": 5.561,
"eval_steps_per_second": 1.42,
"step": 6240
},
{
"epoch": 78.00126582278482,
"grad_norm": 0.0009909283835440874,
"learning_rate": 2.3206751054852324e-06,
"loss": 0.0019,
"step": 6250
},
{
"epoch": 78.00253164556962,
"grad_norm": 0.020323097705841064,
"learning_rate": 2.306610407876231e-06,
"loss": 0.0,
"step": 6260
},
{
"epoch": 78.00379746835443,
"grad_norm": 0.0027961665764451027,
"learning_rate": 2.2925457102672292e-06,
"loss": 0.0001,
"step": 6270
},
{
"epoch": 78.00506329113924,
"grad_norm": 0.0009632346336729825,
"learning_rate": 2.278481012658228e-06,
"loss": 0.0,
"step": 6280
},
{
"epoch": 78.00632911392405,
"grad_norm": 0.005322067067027092,
"learning_rate": 2.2644163150492265e-06,
"loss": 0.1958,
"step": 6290
},
{
"epoch": 78.00759493670886,
"grad_norm": 0.002847396768629551,
"learning_rate": 2.2503516174402253e-06,
"loss": 0.0,
"step": 6300
},
{
"epoch": 78.00886075949367,
"grad_norm": 0.002737447852268815,
"learning_rate": 2.2362869198312237e-06,
"loss": 0.0,
"step": 6310
},
{
"epoch": 78.01012658227847,
"grad_norm": 0.036222778260707855,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0,
"step": 6320
},
{
"epoch": 78.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.581666469573975,
"eval_runtime": 8.4752,
"eval_samples_per_second": 5.546,
"eval_steps_per_second": 1.416,
"step": 6320
},
{
"epoch": 79.00126582278482,
"grad_norm": 0.001555649214424193,
"learning_rate": 2.208157524613221e-06,
"loss": 0.0,
"step": 6330
},
{
"epoch": 79.00253164556962,
"grad_norm": 0.0009841909632086754,
"learning_rate": 2.1940928270042197e-06,
"loss": 0.0,
"step": 6340
},
{
"epoch": 79.00379746835443,
"grad_norm": 0.0069242678582668304,
"learning_rate": 2.180028129395218e-06,
"loss": 0.0,
"step": 6350
},
{
"epoch": 79.00506329113924,
"grad_norm": 0.0031804998870939016,
"learning_rate": 2.165963431786217e-06,
"loss": 0.0,
"step": 6360
},
{
"epoch": 79.00632911392405,
"grad_norm": 0.0010907890973612666,
"learning_rate": 2.1518987341772153e-06,
"loss": 0.0,
"step": 6370
},
{
"epoch": 79.00759493670886,
"grad_norm": 0.0008229253580793738,
"learning_rate": 2.137834036568214e-06,
"loss": 0.0001,
"step": 6380
},
{
"epoch": 79.00886075949367,
"grad_norm": 0.004569868091493845,
"learning_rate": 2.1237693389592126e-06,
"loss": 0.0,
"step": 6390
},
{
"epoch": 79.01012658227847,
"grad_norm": 0.0017135925590991974,
"learning_rate": 2.1097046413502114e-06,
"loss": 0.0001,
"step": 6400
},
{
"epoch": 79.01012658227847,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 4.774336338043213,
"eval_runtime": 8.469,
"eval_samples_per_second": 5.55,
"eval_steps_per_second": 1.417,
"step": 6400
},
{
"epoch": 80.00126582278482,
"grad_norm": 0.0009415835957042873,
"learning_rate": 2.0956399437412098e-06,
"loss": 0.0,
"step": 6410
},
{
"epoch": 80.00253164556962,
"grad_norm": 0.0011497698724269867,
"learning_rate": 2.081575246132208e-06,
"loss": 0.0,
"step": 6420
},
{
"epoch": 80.00379746835443,
"grad_norm": 0.0016221057157963514,
"learning_rate": 2.067510548523207e-06,
"loss": 0.0,
"step": 6430
},
{
"epoch": 80.00506329113924,
"grad_norm": 0.002268084790557623,
"learning_rate": 2.0534458509142054e-06,
"loss": 0.0,
"step": 6440
},
{
"epoch": 80.00632911392405,
"grad_norm": 0.0011354024754837155,
"learning_rate": 2.0393811533052042e-06,
"loss": 0.0,
"step": 6450
},
{
"epoch": 80.00759493670886,
"grad_norm": 0.002358433324843645,
"learning_rate": 2.0253164556962026e-06,
"loss": 0.0004,
"step": 6460
},
{
"epoch": 80.00886075949367,
"grad_norm": 0.0013280572602525353,
"learning_rate": 2.0112517580872014e-06,
"loss": 0.0,
"step": 6470
},
{
"epoch": 80.01012658227847,
"grad_norm": 0.006725861690938473,
"learning_rate": 1.9971870604782e-06,
"loss": 0.0001,
"step": 6480
},
{
"epoch": 80.01012658227847,
"eval_accuracy": 0.40425531914893614,
"eval_loss": 4.900009632110596,
"eval_runtime": 8.4183,
"eval_samples_per_second": 5.583,
"eval_steps_per_second": 1.425,
"step": 6480
},
{
"epoch": 81.00126582278482,
"grad_norm": 0.0010699324775487185,
"learning_rate": 1.9831223628691982e-06,
"loss": 0.1826,
"step": 6490
},
{
"epoch": 81.00253164556962,
"grad_norm": 0.006973781157284975,
"learning_rate": 1.969057665260197e-06,
"loss": 0.0,
"step": 6500
},
{
"epoch": 81.00379746835443,
"grad_norm": 0.003398946486413479,
"learning_rate": 1.9549929676511955e-06,
"loss": 0.0,
"step": 6510
},
{
"epoch": 81.00506329113924,
"grad_norm": 0.009173160418868065,
"learning_rate": 1.9409282700421943e-06,
"loss": 0.0005,
"step": 6520
},
{
"epoch": 81.00632911392405,
"grad_norm": 0.07392571866512299,
"learning_rate": 1.9268635724331927e-06,
"loss": 0.0361,
"step": 6530
},
{
"epoch": 81.00759493670886,
"grad_norm": 0.001574037130922079,
"learning_rate": 1.9127988748241915e-06,
"loss": 0.0,
"step": 6540
},
{
"epoch": 81.00886075949367,
"grad_norm": 0.0032889668364077806,
"learning_rate": 1.8987341772151901e-06,
"loss": 0.0,
"step": 6550
},
{
"epoch": 81.01012658227847,
"grad_norm": 0.002083021914586425,
"learning_rate": 1.8846694796061887e-06,
"loss": 0.0002,
"step": 6560
},
{
"epoch": 81.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.766859531402588,
"eval_runtime": 8.4247,
"eval_samples_per_second": 5.579,
"eval_steps_per_second": 1.424,
"step": 6560
},
{
"epoch": 82.00126582278482,
"grad_norm": 0.001750531722791493,
"learning_rate": 1.8706047819971873e-06,
"loss": 0.0,
"step": 6570
},
{
"epoch": 82.00253164556962,
"grad_norm": 0.0014307881938293576,
"learning_rate": 1.856540084388186e-06,
"loss": 0.0,
"step": 6580
},
{
"epoch": 82.00379746835443,
"grad_norm": 0.0012166056549176574,
"learning_rate": 1.8424753867791846e-06,
"loss": 0.0,
"step": 6590
},
{
"epoch": 82.00506329113924,
"grad_norm": 0.0018584979698061943,
"learning_rate": 1.828410689170183e-06,
"loss": 0.0,
"step": 6600
},
{
"epoch": 82.00632911392405,
"grad_norm": 0.0007850803667679429,
"learning_rate": 1.8143459915611816e-06,
"loss": 0.0,
"step": 6610
},
{
"epoch": 82.00759493670886,
"grad_norm": 0.00409714225679636,
"learning_rate": 1.8002812939521802e-06,
"loss": 0.0,
"step": 6620
},
{
"epoch": 82.00886075949367,
"grad_norm": 0.0014620538568124175,
"learning_rate": 1.7862165963431788e-06,
"loss": 0.0,
"step": 6630
},
{
"epoch": 82.01012658227847,
"grad_norm": 0.0011849668808281422,
"learning_rate": 1.7721518987341774e-06,
"loss": 0.0,
"step": 6640
},
{
"epoch": 82.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.8224687576293945,
"eval_runtime": 8.4754,
"eval_samples_per_second": 5.545,
"eval_steps_per_second": 1.416,
"step": 6640
},
{
"epoch": 83.00126582278482,
"grad_norm": 0.0014004989061504602,
"learning_rate": 1.758087201125176e-06,
"loss": 0.0,
"step": 6650
},
{
"epoch": 83.00253164556962,
"grad_norm": 0.0015136294532567263,
"learning_rate": 1.7440225035161746e-06,
"loss": 0.0,
"step": 6660
},
{
"epoch": 83.00379746835443,
"grad_norm": 0.00371897267177701,
"learning_rate": 1.729957805907173e-06,
"loss": 0.0,
"step": 6670
},
{
"epoch": 83.00506329113924,
"grad_norm": 0.0008714126888662577,
"learning_rate": 1.7158931082981716e-06,
"loss": 0.0,
"step": 6680
},
{
"epoch": 83.00632911392405,
"grad_norm": 0.003846103325486183,
"learning_rate": 1.7018284106891702e-06,
"loss": 0.0,
"step": 6690
},
{
"epoch": 83.00759493670886,
"grad_norm": 0.0013878681929782033,
"learning_rate": 1.6877637130801689e-06,
"loss": 0.0,
"step": 6700
},
{
"epoch": 83.00886075949367,
"grad_norm": 0.0011597294360399246,
"learning_rate": 1.6736990154711675e-06,
"loss": 0.0,
"step": 6710
},
{
"epoch": 83.01012658227847,
"grad_norm": 0.0006404958548955619,
"learning_rate": 1.659634317862166e-06,
"loss": 0.0,
"step": 6720
},
{
"epoch": 83.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.833099842071533,
"eval_runtime": 8.4872,
"eval_samples_per_second": 5.538,
"eval_steps_per_second": 1.414,
"step": 6720
},
{
"epoch": 84.00126582278482,
"grad_norm": 0.0019264252623543143,
"learning_rate": 1.6455696202531647e-06,
"loss": 0.0,
"step": 6730
},
{
"epoch": 84.00253164556962,
"grad_norm": 0.001029732171446085,
"learning_rate": 1.631504922644163e-06,
"loss": 0.0005,
"step": 6740
},
{
"epoch": 84.00379746835443,
"grad_norm": 0.0017922447295859456,
"learning_rate": 1.6174402250351621e-06,
"loss": 0.0,
"step": 6750
},
{
"epoch": 84.00506329113924,
"grad_norm": 0.0012487670173868537,
"learning_rate": 1.6033755274261605e-06,
"loss": 0.0,
"step": 6760
},
{
"epoch": 84.00632911392405,
"grad_norm": 0.0014119717525318265,
"learning_rate": 1.5893108298171591e-06,
"loss": 0.0001,
"step": 6770
},
{
"epoch": 84.00759493670886,
"grad_norm": 0.0006965138600207865,
"learning_rate": 1.5752461322081577e-06,
"loss": 0.0,
"step": 6780
},
{
"epoch": 84.00886075949367,
"grad_norm": 0.0011428669095039368,
"learning_rate": 1.5611814345991563e-06,
"loss": 0.0,
"step": 6790
},
{
"epoch": 84.01012658227847,
"grad_norm": 0.002268004696816206,
"learning_rate": 1.547116736990155e-06,
"loss": 0.0,
"step": 6800
},
{
"epoch": 84.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.715381145477295,
"eval_runtime": 8.3979,
"eval_samples_per_second": 5.597,
"eval_steps_per_second": 1.429,
"step": 6800
},
{
"epoch": 85.00126582278482,
"grad_norm": 0.002592705423012376,
"learning_rate": 1.5330520393811536e-06,
"loss": 0.0,
"step": 6810
},
{
"epoch": 85.00253164556962,
"grad_norm": 0.04101519286632538,
"learning_rate": 1.518987341772152e-06,
"loss": 0.0,
"step": 6820
},
{
"epoch": 85.00379746835443,
"grad_norm": 0.0005958875990472734,
"learning_rate": 1.5049226441631506e-06,
"loss": 0.0,
"step": 6830
},
{
"epoch": 85.00506329113924,
"grad_norm": 0.0025226089637726545,
"learning_rate": 1.4908579465541492e-06,
"loss": 0.0,
"step": 6840
},
{
"epoch": 85.00632911392405,
"grad_norm": 0.0008681220351718366,
"learning_rate": 1.4767932489451478e-06,
"loss": 0.0,
"step": 6850
},
{
"epoch": 85.00759493670886,
"grad_norm": 0.0013401606120169163,
"learning_rate": 1.4627285513361464e-06,
"loss": 0.0,
"step": 6860
},
{
"epoch": 85.00886075949367,
"grad_norm": 0.0010737047996371984,
"learning_rate": 1.448663853727145e-06,
"loss": 0.0,
"step": 6870
},
{
"epoch": 85.01012658227847,
"grad_norm": 0.001385514042340219,
"learning_rate": 1.4345991561181436e-06,
"loss": 0.0,
"step": 6880
},
{
"epoch": 85.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.716861248016357,
"eval_runtime": 8.4473,
"eval_samples_per_second": 5.564,
"eval_steps_per_second": 1.421,
"step": 6880
},
{
"epoch": 86.00126582278482,
"grad_norm": 0.0020737305749207735,
"learning_rate": 1.420534458509142e-06,
"loss": 0.0,
"step": 6890
},
{
"epoch": 86.00253164556962,
"grad_norm": 0.0013663348508998752,
"learning_rate": 1.4064697609001406e-06,
"loss": 0.0001,
"step": 6900
},
{
"epoch": 86.00379746835443,
"grad_norm": 0.01127583533525467,
"learning_rate": 1.3924050632911392e-06,
"loss": 0.0,
"step": 6910
},
{
"epoch": 86.00506329113924,
"grad_norm": 0.002084192121401429,
"learning_rate": 1.3783403656821379e-06,
"loss": 0.0,
"step": 6920
},
{
"epoch": 86.00632911392405,
"grad_norm": 0.0009935207199305296,
"learning_rate": 1.3642756680731365e-06,
"loss": 0.0,
"step": 6930
},
{
"epoch": 86.00759493670886,
"grad_norm": 0.0008619350846856833,
"learning_rate": 1.350210970464135e-06,
"loss": 0.0,
"step": 6940
},
{
"epoch": 86.00886075949367,
"grad_norm": 0.000807570235338062,
"learning_rate": 1.3361462728551337e-06,
"loss": 0.0,
"step": 6950
},
{
"epoch": 86.01012658227847,
"grad_norm": 0.001797909731976688,
"learning_rate": 1.3220815752461325e-06,
"loss": 0.0,
"step": 6960
},
{
"epoch": 86.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.900410175323486,
"eval_runtime": 8.4167,
"eval_samples_per_second": 5.584,
"eval_steps_per_second": 1.426,
"step": 6960
},
{
"epoch": 87.00126582278482,
"grad_norm": 0.0008401426021009684,
"learning_rate": 1.3080168776371311e-06,
"loss": 0.0,
"step": 6970
},
{
"epoch": 87.00253164556962,
"grad_norm": 0.0014646663330495358,
"learning_rate": 1.2939521800281295e-06,
"loss": 0.0,
"step": 6980
},
{
"epoch": 87.00379746835443,
"grad_norm": 0.0010633817873895168,
"learning_rate": 1.2798874824191281e-06,
"loss": 0.0,
"step": 6990
},
{
"epoch": 87.00506329113924,
"grad_norm": 0.0017103515565395355,
"learning_rate": 1.2658227848101267e-06,
"loss": 0.0,
"step": 7000
},
{
"epoch": 87.00632911392405,
"grad_norm": 0.001976841827854514,
"learning_rate": 1.2517580872011254e-06,
"loss": 0.0,
"step": 7010
},
{
"epoch": 87.00759493670886,
"grad_norm": 0.000657711352687329,
"learning_rate": 1.237693389592124e-06,
"loss": 0.0,
"step": 7020
},
{
"epoch": 87.00886075949367,
"grad_norm": 0.0006206512916833162,
"learning_rate": 1.2236286919831226e-06,
"loss": 0.0,
"step": 7030
},
{
"epoch": 87.01012658227847,
"grad_norm": 0.0030793757177889347,
"learning_rate": 1.2095639943741212e-06,
"loss": 0.0,
"step": 7040
},
{
"epoch": 87.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.909188270568848,
"eval_runtime": 8.4164,
"eval_samples_per_second": 5.584,
"eval_steps_per_second": 1.426,
"step": 7040
},
{
"epoch": 88.00126582278482,
"grad_norm": 0.0011876953067258,
"learning_rate": 1.1954992967651196e-06,
"loss": 0.0,
"step": 7050
},
{
"epoch": 88.00253164556962,
"grad_norm": 0.0019371965900063515,
"learning_rate": 1.1814345991561182e-06,
"loss": 0.0,
"step": 7060
},
{
"epoch": 88.00379746835443,
"grad_norm": 0.001290988875553012,
"learning_rate": 1.1673699015471168e-06,
"loss": 0.0,
"step": 7070
},
{
"epoch": 88.00506329113924,
"grad_norm": 0.003862161422148347,
"learning_rate": 1.1533052039381154e-06,
"loss": 0.0,
"step": 7080
},
{
"epoch": 88.00632911392405,
"grad_norm": 0.0007267307373695076,
"learning_rate": 1.139240506329114e-06,
"loss": 0.0,
"step": 7090
},
{
"epoch": 88.00759493670886,
"grad_norm": 0.0007938898052088916,
"learning_rate": 1.1251758087201126e-06,
"loss": 0.0,
"step": 7100
},
{
"epoch": 88.00886075949367,
"grad_norm": 0.0006018871208652854,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0,
"step": 7110
},
{
"epoch": 88.01012658227847,
"grad_norm": 0.0017778057372197509,
"learning_rate": 1.0970464135021099e-06,
"loss": 0.0,
"step": 7120
},
{
"epoch": 88.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.89414119720459,
"eval_runtime": 8.438,
"eval_samples_per_second": 5.57,
"eval_steps_per_second": 1.422,
"step": 7120
},
{
"epoch": 89.00126582278482,
"grad_norm": 0.0007234832737594843,
"learning_rate": 1.0829817158931085e-06,
"loss": 0.0,
"step": 7130
},
{
"epoch": 89.00253164556962,
"grad_norm": 0.0015409559709951282,
"learning_rate": 1.068917018284107e-06,
"loss": 0.0,
"step": 7140
},
{
"epoch": 89.00379746835443,
"grad_norm": 0.0008910736651159823,
"learning_rate": 1.0548523206751057e-06,
"loss": 0.0,
"step": 7150
},
{
"epoch": 89.00506329113924,
"grad_norm": 0.0020937921945005655,
"learning_rate": 1.040787623066104e-06,
"loss": 0.0,
"step": 7160
},
{
"epoch": 89.00632911392405,
"grad_norm": 0.0014372824225574732,
"learning_rate": 1.0267229254571027e-06,
"loss": 0.0,
"step": 7170
},
{
"epoch": 89.00759493670886,
"grad_norm": 0.003179526887834072,
"learning_rate": 1.0126582278481013e-06,
"loss": 0.0,
"step": 7180
},
{
"epoch": 89.00886075949367,
"grad_norm": 0.0012057056883350015,
"learning_rate": 9.985935302391e-07,
"loss": 0.0001,
"step": 7190
},
{
"epoch": 89.01012658227847,
"grad_norm": 0.005369286518543959,
"learning_rate": 9.845288326300985e-07,
"loss": 0.0,
"step": 7200
},
{
"epoch": 89.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.789796352386475,
"eval_runtime": 8.4427,
"eval_samples_per_second": 5.567,
"eval_steps_per_second": 1.421,
"step": 7200
},
{
"epoch": 90.00126582278482,
"grad_norm": 0.0017162526492029428,
"learning_rate": 9.704641350210971e-07,
"loss": 0.0,
"step": 7210
},
{
"epoch": 90.00253164556962,
"grad_norm": 0.0009838847909122705,
"learning_rate": 9.563994374120958e-07,
"loss": 0.0,
"step": 7220
},
{
"epoch": 90.00379746835443,
"grad_norm": 0.015449059195816517,
"learning_rate": 9.423347398030944e-07,
"loss": 0.0,
"step": 7230
},
{
"epoch": 90.00506329113924,
"grad_norm": 0.0017991637578234076,
"learning_rate": 9.28270042194093e-07,
"loss": 0.0,
"step": 7240
},
{
"epoch": 90.00632911392405,
"grad_norm": 0.0010769497603178024,
"learning_rate": 9.142053445850915e-07,
"loss": 0.0,
"step": 7250
},
{
"epoch": 90.00759493670886,
"grad_norm": 0.0007212001946754754,
"learning_rate": 9.001406469760901e-07,
"loss": 0.0,
"step": 7260
},
{
"epoch": 90.00886075949367,
"grad_norm": 0.000739375944249332,
"learning_rate": 8.860759493670887e-07,
"loss": 0.0,
"step": 7270
},
{
"epoch": 90.01012658227847,
"grad_norm": 0.002124297898262739,
"learning_rate": 8.720112517580873e-07,
"loss": 0.0,
"step": 7280
},
{
"epoch": 90.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.827134132385254,
"eval_runtime": 8.4657,
"eval_samples_per_second": 5.552,
"eval_steps_per_second": 1.417,
"step": 7280
},
{
"epoch": 91.00126582278482,
"grad_norm": 0.00743636442348361,
"learning_rate": 8.579465541490858e-07,
"loss": 0.0,
"step": 7290
},
{
"epoch": 91.00253164556962,
"grad_norm": 0.001242807717062533,
"learning_rate": 8.438818565400844e-07,
"loss": 0.0,
"step": 7300
},
{
"epoch": 91.00379746835443,
"grad_norm": 0.017530538141727448,
"learning_rate": 8.29817158931083e-07,
"loss": 0.0,
"step": 7310
},
{
"epoch": 91.00506329113924,
"grad_norm": 0.0027876682579517365,
"learning_rate": 8.157524613220815e-07,
"loss": 0.0,
"step": 7320
},
{
"epoch": 91.00632911392405,
"grad_norm": 0.001038099406287074,
"learning_rate": 8.016877637130803e-07,
"loss": 0.0,
"step": 7330
},
{
"epoch": 91.00759493670886,
"grad_norm": 0.0012997939484193921,
"learning_rate": 7.876230661040789e-07,
"loss": 0.0,
"step": 7340
},
{
"epoch": 91.00886075949367,
"grad_norm": 0.00228080153465271,
"learning_rate": 7.735583684950775e-07,
"loss": 0.0,
"step": 7350
},
{
"epoch": 91.01012658227847,
"grad_norm": 0.0014501850819215178,
"learning_rate": 7.59493670886076e-07,
"loss": 0.0,
"step": 7360
},
{
"epoch": 91.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.832017421722412,
"eval_runtime": 8.6255,
"eval_samples_per_second": 5.449,
"eval_steps_per_second": 1.391,
"step": 7360
},
{
"epoch": 92.00126582278482,
"grad_norm": 0.0007885429658927023,
"learning_rate": 7.454289732770746e-07,
"loss": 0.0,
"step": 7370
},
{
"epoch": 92.00253164556962,
"grad_norm": 0.0009592593996785581,
"learning_rate": 7.313642756680732e-07,
"loss": 0.0,
"step": 7380
},
{
"epoch": 92.00379746835443,
"grad_norm": 0.004812302067875862,
"learning_rate": 7.172995780590718e-07,
"loss": 0.0,
"step": 7390
},
{
"epoch": 92.00506329113924,
"grad_norm": 0.0012065304908901453,
"learning_rate": 7.032348804500703e-07,
"loss": 0.0,
"step": 7400
},
{
"epoch": 92.00632911392405,
"grad_norm": 0.0025038751773536205,
"learning_rate": 6.891701828410689e-07,
"loss": 0.0,
"step": 7410
},
{
"epoch": 92.00759493670886,
"grad_norm": 0.0007439328473992646,
"learning_rate": 6.751054852320675e-07,
"loss": 0.0,
"step": 7420
},
{
"epoch": 92.00886075949367,
"grad_norm": 0.040091563016176224,
"learning_rate": 6.610407876230663e-07,
"loss": 0.0,
"step": 7430
},
{
"epoch": 92.01012658227847,
"grad_norm": 0.00362952146679163,
"learning_rate": 6.469760900140648e-07,
"loss": 0.0,
"step": 7440
},
{
"epoch": 92.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.827417850494385,
"eval_runtime": 8.4772,
"eval_samples_per_second": 5.544,
"eval_steps_per_second": 1.416,
"step": 7440
},
{
"epoch": 93.00126582278482,
"grad_norm": 0.0022241012193262577,
"learning_rate": 6.329113924050634e-07,
"loss": 0.0,
"step": 7450
},
{
"epoch": 93.00253164556962,
"grad_norm": 0.025551510974764824,
"learning_rate": 6.18846694796062e-07,
"loss": 0.0,
"step": 7460
},
{
"epoch": 93.00379746835443,
"grad_norm": 0.00078696379205212,
"learning_rate": 6.047819971870606e-07,
"loss": 0.0,
"step": 7470
},
{
"epoch": 93.00506329113924,
"grad_norm": 0.0017261310713365674,
"learning_rate": 5.907172995780591e-07,
"loss": 0.0,
"step": 7480
},
{
"epoch": 93.00632911392405,
"grad_norm": 0.003345273435115814,
"learning_rate": 5.766526019690577e-07,
"loss": 0.0,
"step": 7490
},
{
"epoch": 93.00759493670886,
"grad_norm": 0.0011764048831537366,
"learning_rate": 5.625879043600563e-07,
"loss": 0.0,
"step": 7500
},
{
"epoch": 93.00886075949367,
"grad_norm": 0.0013756465632468462,
"learning_rate": 5.485232067510549e-07,
"loss": 0.0,
"step": 7510
},
{
"epoch": 93.01012658227847,
"grad_norm": 0.0011709831887856126,
"learning_rate": 5.344585091420535e-07,
"loss": 0.0,
"step": 7520
},
{
"epoch": 93.01012658227847,
"eval_accuracy": 0.44680851063829785,
"eval_loss": 4.826868057250977,
"eval_runtime": 170.4429,
"eval_samples_per_second": 0.276,
"eval_steps_per_second": 0.07,
"step": 7520
},
{
"epoch": 94.00126582278482,
"grad_norm": 0.002271972130984068,
"learning_rate": 5.20393811533052e-07,
"loss": 0.0001,
"step": 7530
},
{
"epoch": 94.00253164556962,
"grad_norm": 12.584663391113281,
"learning_rate": 5.063291139240507e-07,
"loss": 0.0007,
"step": 7540
},
{
"epoch": 94.00379746835443,
"grad_norm": 0.0010153332259505987,
"learning_rate": 4.922644163150493e-07,
"loss": 0.0,
"step": 7550
},
{
"epoch": 94.00506329113924,
"grad_norm": 0.0019480427727103233,
"learning_rate": 4.781997187060479e-07,
"loss": 0.0,
"step": 7560
},
{
"epoch": 94.00632911392405,
"grad_norm": 0.005996455904096365,
"learning_rate": 4.641350210970465e-07,
"loss": 0.0,
"step": 7570
},
{
"epoch": 94.00759493670886,
"grad_norm": 0.0005868688458576798,
"learning_rate": 4.5007032348804504e-07,
"loss": 0.0,
"step": 7580
},
{
"epoch": 94.00886075949367,
"grad_norm": 0.0008807959966361523,
"learning_rate": 4.3600562587904366e-07,
"loss": 0.0,
"step": 7590
},
{
"epoch": 94.01012658227847,
"grad_norm": 0.0008403842803090811,
"learning_rate": 4.219409282700422e-07,
"loss": 0.0,
"step": 7600
},
{
"epoch": 94.01012658227847,
"eval_accuracy": 0.3829787234042553,
"eval_loss": 4.878473281860352,
"eval_runtime": 8.411,
"eval_samples_per_second": 5.588,
"eval_steps_per_second": 1.427,
"step": 7600
},
{
"epoch": 95.00126582278482,
"grad_norm": 0.005562290083616972,
"learning_rate": 4.0787623066104077e-07,
"loss": 0.0001,
"step": 7610
},
{
"epoch": 95.00253164556962,
"grad_norm": 0.004410718102008104,
"learning_rate": 3.9381153305203943e-07,
"loss": 0.0,
"step": 7620
},
{
"epoch": 95.00379746835443,
"grad_norm": 0.0042534684762358665,
"learning_rate": 3.79746835443038e-07,
"loss": 0.0,
"step": 7630
},
{
"epoch": 95.00506329113924,
"grad_norm": 0.0012142916675657034,
"learning_rate": 3.656821378340366e-07,
"loss": 0.0,
"step": 7640
},
{
"epoch": 95.00632911392405,
"grad_norm": 0.0007107236888259649,
"learning_rate": 3.5161744022503516e-07,
"loss": 0.0,
"step": 7650
},
{
"epoch": 95.00759493670886,
"grad_norm": 0.0018182717030867934,
"learning_rate": 3.3755274261603377e-07,
"loss": 0.0,
"step": 7660
},
{
"epoch": 95.00886075949367,
"grad_norm": 0.003002484329044819,
"learning_rate": 3.234880450070324e-07,
"loss": 0.0001,
"step": 7670
},
{
"epoch": 95.01012658227847,
"grad_norm": 0.0012020288268104196,
"learning_rate": 3.09423347398031e-07,
"loss": 0.0,
"step": 7680
},
{
"epoch": 95.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.963972568511963,
"eval_runtime": 8.4253,
"eval_samples_per_second": 5.578,
"eval_steps_per_second": 1.424,
"step": 7680
},
{
"epoch": 96.00126582278482,
"grad_norm": 0.0024408556055277586,
"learning_rate": 2.9535864978902955e-07,
"loss": 0.0,
"step": 7690
},
{
"epoch": 96.00253164556962,
"grad_norm": 329.71331787109375,
"learning_rate": 2.8129395218002816e-07,
"loss": 0.132,
"step": 7700
},
{
"epoch": 96.00379746835443,
"grad_norm": 0.0019197100773453712,
"learning_rate": 2.6722925457102677e-07,
"loss": 0.0005,
"step": 7710
},
{
"epoch": 96.00506329113924,
"grad_norm": 0.0013428219826892018,
"learning_rate": 2.5316455696202533e-07,
"loss": 0.0,
"step": 7720
},
{
"epoch": 96.00632911392405,
"grad_norm": 0.0015475323889404535,
"learning_rate": 2.3909985935302394e-07,
"loss": 0.0,
"step": 7730
},
{
"epoch": 96.00759493670886,
"grad_norm": 0.0013204860733821988,
"learning_rate": 2.2503516174402252e-07,
"loss": 0.0,
"step": 7740
},
{
"epoch": 96.00886075949367,
"grad_norm": 0.0007807817310094833,
"learning_rate": 2.109704641350211e-07,
"loss": 0.0,
"step": 7750
},
{
"epoch": 96.01012658227847,
"grad_norm": 0.0009276365744881332,
"learning_rate": 1.9690576652601972e-07,
"loss": 0.0,
"step": 7760
},
{
"epoch": 96.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.947977066040039,
"eval_runtime": 8.4147,
"eval_samples_per_second": 5.585,
"eval_steps_per_second": 1.426,
"step": 7760
},
{
"epoch": 97.00126582278482,
"grad_norm": 0.0013080050703138113,
"learning_rate": 1.828410689170183e-07,
"loss": 0.0,
"step": 7770
},
{
"epoch": 97.00253164556962,
"grad_norm": 0.0018693221500143409,
"learning_rate": 1.6877637130801689e-07,
"loss": 0.0,
"step": 7780
},
{
"epoch": 97.00379746835443,
"grad_norm": 0.0018208841793239117,
"learning_rate": 1.547116736990155e-07,
"loss": 0.0,
"step": 7790
},
{
"epoch": 97.00506329113924,
"grad_norm": 0.0018955061677843332,
"learning_rate": 1.4064697609001408e-07,
"loss": 0.0,
"step": 7800
},
{
"epoch": 97.00632911392405,
"grad_norm": 0.0007756951963528991,
"learning_rate": 1.2658227848101266e-07,
"loss": 0.0,
"step": 7810
},
{
"epoch": 97.00759493670886,
"grad_norm": 0.0009716827771626413,
"learning_rate": 1.1251758087201126e-07,
"loss": 0.0,
"step": 7820
},
{
"epoch": 97.00886075949367,
"grad_norm": 0.003705208422616124,
"learning_rate": 9.845288326300986e-08,
"loss": 0.0,
"step": 7830
},
{
"epoch": 97.01012658227847,
"grad_norm": 0.013106240890920162,
"learning_rate": 8.438818565400844e-08,
"loss": 0.0,
"step": 7840
},
{
"epoch": 97.01012658227847,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.940354824066162,
"eval_runtime": 8.4561,
"eval_samples_per_second": 5.558,
"eval_steps_per_second": 1.419,
"step": 7840
},
{
"epoch": 98.00126582278482,
"grad_norm": 0.0024029065389186144,
"learning_rate": 7.032348804500704e-08,
"loss": 0.0,
"step": 7850
},
{
"epoch": 98.00253164556962,
"grad_norm": 0.0011833859607577324,
"learning_rate": 5.625879043600563e-08,
"loss": 0.0,
"step": 7860
},
{
"epoch": 98.00379746835443,
"grad_norm": 0.0013356610434129834,
"learning_rate": 4.219409282700422e-08,
"loss": 0.0,
"step": 7870
},
{
"epoch": 98.00506329113924,
"grad_norm": 0.0007605087594129145,
"learning_rate": 2.8129395218002815e-08,
"loss": 0.0,
"step": 7880
},
{
"epoch": 98.00632911392405,
"grad_norm": 0.0008561754948459566,
"learning_rate": 1.4064697609001408e-08,
"loss": 0.0011,
"step": 7890
},
{
"epoch": 98.00759493670886,
"grad_norm": 0.0006674563628621399,
"learning_rate": 0.0,
"loss": 0.0,
"step": 7900
},
{
"epoch": 98.00759493670886,
"eval_accuracy": 0.425531914893617,
"eval_loss": 4.9351420402526855,
"eval_runtime": 9.2926,
"eval_samples_per_second": 5.058,
"eval_steps_per_second": 1.291,
"step": 7900
},
{
"epoch": 98.00759493670886,
"step": 7900,
"total_flos": 1.378962555602208e+20,
"train_loss": 0.1596812904629944,
"train_runtime": 13780.8931,
"train_samples_per_second": 2.293,
"train_steps_per_second": 0.573
},
{
"epoch": 98.00759493670886,
"eval_accuracy": 0.3076923076923077,
"eval_loss": 5.42232084274292,
"eval_runtime": 8.1011,
"eval_samples_per_second": 4.814,
"eval_steps_per_second": 1.234,
"step": 7900
},
{
"epoch": 98.00759493670886,
"eval_accuracy": 0.3076923076923077,
"eval_loss": 5.422321319580078,
"eval_runtime": 7.0644,
"eval_samples_per_second": 5.521,
"eval_steps_per_second": 1.416,
"step": 7900
}
],
"logging_steps": 10,
"max_steps": 7900,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.378962555602208e+20,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}