{ "best_metric": 2.4058215618133545, "best_model_checkpoint": "miner_id_24/checkpoint-3600", "epoch": 0.15513817640244912, "eval_steps": 50, "global_step": 3750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.137018037398643e-05, "eval_loss": 3.1065964698791504, "eval_runtime": 195.4528, "eval_samples_per_second": 109.934, "eval_steps_per_second": 13.742, "step": 1 }, { "epoch": 0.0004137018037398643, "grad_norm": 2.176201581954956, "learning_rate": 1.3238895876083937e-07, "loss": 3.0714, "step": 10 }, { "epoch": 0.0008274036074797286, "grad_norm": 1.626641869544983, "learning_rate": 2.6477791752167873e-07, "loss": 3.1136, "step": 20 }, { "epoch": 0.0012411054112195928, "grad_norm": 2.843463659286499, "learning_rate": 3.9716687628251804e-07, "loss": 3.0861, "step": 30 }, { "epoch": 0.0016548072149594572, "grad_norm": 2.288268804550171, "learning_rate": 5.295558350433575e-07, "loss": 3.1362, "step": 40 }, { "epoch": 0.0020685090186993217, "grad_norm": 4.716451644897461, "learning_rate": 6.619447938041968e-07, "loss": 3.0737, "step": 50 }, { "epoch": 0.0020685090186993217, "eval_loss": 3.094273090362549, "eval_runtime": 192.6539, "eval_samples_per_second": 111.532, "eval_steps_per_second": 13.942, "step": 50 }, { "epoch": 0.0024822108224391857, "grad_norm": 2.3416748046875, "learning_rate": 7.943337525650361e-07, "loss": 3.0291, "step": 60 }, { "epoch": 0.00289591262617905, "grad_norm": 1.7577191591262817, "learning_rate": 9.267227113258755e-07, "loss": 3.0965, "step": 70 }, { "epoch": 0.0033096144299189145, "grad_norm": 2.6903438568115234, "learning_rate": 1.059111670086715e-06, "loss": 3.0055, "step": 80 }, { "epoch": 0.003723316233658779, "grad_norm": 1.737985610961914, "learning_rate": 1.1915006288475542e-06, "loss": 3.0698, "step": 90 }, { "epoch": 0.004137018037398643, "grad_norm": 6.027029991149902, "learning_rate": 1.3238895876083937e-06, "loss": 3.2193, "step": 100 }, { "epoch": 0.004137018037398643, "eval_loss": 3.005723237991333, "eval_runtime": 192.0294, "eval_samples_per_second": 111.894, "eval_steps_per_second": 13.987, "step": 100 }, { "epoch": 0.004550719841138508, "grad_norm": 2.31941819190979, "learning_rate": 1.456278546369233e-06, "loss": 2.8305, "step": 110 }, { "epoch": 0.004964421644878371, "grad_norm": 1.7663012742996216, "learning_rate": 1.5886675051300722e-06, "loss": 3.0105, "step": 120 }, { "epoch": 0.005378123448618236, "grad_norm": 4.188333988189697, "learning_rate": 1.7210564638909114e-06, "loss": 2.9189, "step": 130 }, { "epoch": 0.0057918252523581, "grad_norm": 1.6050326824188232, "learning_rate": 1.853445422651751e-06, "loss": 3.0359, "step": 140 }, { "epoch": 0.0062055270560979646, "grad_norm": 3.5903775691986084, "learning_rate": 1.9858343814125904e-06, "loss": 2.9091, "step": 150 }, { "epoch": 0.0062055270560979646, "eval_loss": 2.8280489444732666, "eval_runtime": 191.6387, "eval_samples_per_second": 112.122, "eval_steps_per_second": 14.016, "step": 150 }, { "epoch": 0.006619228859837829, "grad_norm": 2.433257579803467, "learning_rate": 2.11822334017343e-06, "loss": 2.772, "step": 160 }, { "epoch": 0.007032930663577693, "grad_norm": 1.546025037765503, "learning_rate": 2.250612298934269e-06, "loss": 2.7859, "step": 170 }, { "epoch": 0.007446632467317558, "grad_norm": 2.9423036575317383, "learning_rate": 2.3830012576951084e-06, "loss": 2.7472, "step": 180 }, { "epoch": 0.007860334271057421, "grad_norm": 1.7298214435577393, "learning_rate": 2.515390216455948e-06, "loss": 2.6401, 
"step": 190 }, { "epoch": 0.008274036074797287, "grad_norm": 7.6301140785217285, "learning_rate": 2.6477791752167873e-06, "loss": 2.8518, "step": 200 }, { "epoch": 0.008274036074797287, "eval_loss": 2.691387891769409, "eval_runtime": 191.6791, "eval_samples_per_second": 112.099, "eval_steps_per_second": 14.013, "step": 200 }, { "epoch": 0.00868773787853715, "grad_norm": 1.8872076272964478, "learning_rate": 2.7801681339776264e-06, "loss": 2.5464, "step": 210 }, { "epoch": 0.009101439682277015, "grad_norm": 1.6354514360427856, "learning_rate": 2.912557092738466e-06, "loss": 2.7549, "step": 220 }, { "epoch": 0.009515141486016879, "grad_norm": 2.8152554035186768, "learning_rate": 3.044946051499305e-06, "loss": 2.6623, "step": 230 }, { "epoch": 0.009928843289756743, "grad_norm": 1.6499367952346802, "learning_rate": 3.1773350102601444e-06, "loss": 2.7109, "step": 240 }, { "epoch": 0.010342545093496608, "grad_norm": 6.550207138061523, "learning_rate": 3.3097239690209834e-06, "loss": 2.7049, "step": 250 }, { "epoch": 0.010342545093496608, "eval_loss": 2.596395969390869, "eval_runtime": 191.5625, "eval_samples_per_second": 112.167, "eval_steps_per_second": 14.022, "step": 250 }, { "epoch": 0.010756246897236471, "grad_norm": 2.1888961791992188, "learning_rate": 3.442112927781823e-06, "loss": 2.5695, "step": 260 }, { "epoch": 0.011169948700976337, "grad_norm": 1.707841157913208, "learning_rate": 3.5745018865426623e-06, "loss": 2.6986, "step": 270 }, { "epoch": 0.0115836505047162, "grad_norm": 2.4801785945892334, "learning_rate": 3.706890845303502e-06, "loss": 2.5655, "step": 280 }, { "epoch": 0.011997352308456066, "grad_norm": 1.716589093208313, "learning_rate": 3.839279804064341e-06, "loss": 2.5959, "step": 290 }, { "epoch": 0.012411054112195929, "grad_norm": 7.755068302154541, "learning_rate": 3.971668762825181e-06, "loss": 2.5077, "step": 300 }, { "epoch": 0.012411054112195929, "eval_loss": 2.562431573867798, "eval_runtime": 191.6318, "eval_samples_per_second": 112.126, "eval_steps_per_second": 14.016, "step": 300 }, { "epoch": 0.012824755915935793, "grad_norm": 2.1086246967315674, "learning_rate": 4.10405772158602e-06, "loss": 2.4752, "step": 310 }, { "epoch": 0.013238457719675658, "grad_norm": 1.591207504272461, "learning_rate": 4.23644668034686e-06, "loss": 2.5712, "step": 320 }, { "epoch": 0.013652159523415522, "grad_norm": 2.639002561569214, "learning_rate": 4.368835639107698e-06, "loss": 2.5321, "step": 330 }, { "epoch": 0.014065861327155387, "grad_norm": 1.8787785768508911, "learning_rate": 4.501224597868538e-06, "loss": 2.5703, "step": 340 }, { "epoch": 0.01447956313089525, "grad_norm": 7.730015754699707, "learning_rate": 4.633613556629377e-06, "loss": 2.5767, "step": 350 }, { "epoch": 0.01447956313089525, "eval_loss": 2.543426990509033, "eval_runtime": 191.9866, "eval_samples_per_second": 111.919, "eval_steps_per_second": 13.991, "step": 350 }, { "epoch": 0.014893264934635116, "grad_norm": 2.009415864944458, "learning_rate": 4.766002515390217e-06, "loss": 2.6492, "step": 360 }, { "epoch": 0.01530696673837498, "grad_norm": 1.7757397890090942, "learning_rate": 4.898391474151056e-06, "loss": 2.4928, "step": 370 }, { "epoch": 0.015720668542114843, "grad_norm": 2.7528040409088135, "learning_rate": 5.030780432911896e-06, "loss": 2.5713, "step": 380 }, { "epoch": 0.016134370345854708, "grad_norm": 1.725705862045288, "learning_rate": 5.163169391672735e-06, "loss": 2.4606, "step": 390 }, { "epoch": 0.016548072149594573, "grad_norm": 4.424041748046875, "learning_rate": 5.295558350433575e-06, 
"loss": 2.4882, "step": 400 }, { "epoch": 0.016548072149594573, "eval_loss": 2.5288844108581543, "eval_runtime": 191.9718, "eval_samples_per_second": 111.928, "eval_steps_per_second": 13.992, "step": 400 }, { "epoch": 0.016961773953334435, "grad_norm": 2.2628228664398193, "learning_rate": 5.427947309194414e-06, "loss": 2.5639, "step": 410 }, { "epoch": 0.0173754757570743, "grad_norm": 1.7356669902801514, "learning_rate": 5.560336267955253e-06, "loss": 2.4799, "step": 420 }, { "epoch": 0.017789177560814166, "grad_norm": 2.5851871967315674, "learning_rate": 5.692725226716092e-06, "loss": 2.4444, "step": 430 }, { "epoch": 0.01820287936455403, "grad_norm": 1.77878999710083, "learning_rate": 5.825114185476932e-06, "loss": 2.5227, "step": 440 }, { "epoch": 0.018616581168293893, "grad_norm": 3.740952968597412, "learning_rate": 5.957503144237771e-06, "loss": 2.5446, "step": 450 }, { "epoch": 0.018616581168293893, "eval_loss": 2.5211548805236816, "eval_runtime": 192.0432, "eval_samples_per_second": 111.886, "eval_steps_per_second": 13.986, "step": 450 }, { "epoch": 0.019030282972033758, "grad_norm": 1.886467456817627, "learning_rate": 6.08989210299861e-06, "loss": 2.4978, "step": 460 }, { "epoch": 0.019443984775773623, "grad_norm": 1.640499234199524, "learning_rate": 6.222281061759449e-06, "loss": 2.5732, "step": 470 }, { "epoch": 0.019857686579513485, "grad_norm": 2.688105344772339, "learning_rate": 6.354670020520289e-06, "loss": 2.4776, "step": 480 }, { "epoch": 0.02027138838325335, "grad_norm": 1.7561728954315186, "learning_rate": 6.487058979281128e-06, "loss": 2.5655, "step": 490 }, { "epoch": 0.020685090186993216, "grad_norm": 4.519068241119385, "learning_rate": 6.619447938041967e-06, "loss": 2.5746, "step": 500 }, { "epoch": 0.020685090186993216, "eval_loss": 2.5129964351654053, "eval_runtime": 192.1233, "eval_samples_per_second": 111.84, "eval_steps_per_second": 13.981, "step": 500 }, { "epoch": 0.02109879199073308, "grad_norm": 2.033447742462158, "learning_rate": 6.751836896802807e-06, "loss": 2.4859, "step": 510 }, { "epoch": 0.021512493794472943, "grad_norm": 1.5632065534591675, "learning_rate": 6.884225855563646e-06, "loss": 2.4457, "step": 520 }, { "epoch": 0.021926195598212808, "grad_norm": 2.681281805038452, "learning_rate": 7.016614814324486e-06, "loss": 2.5608, "step": 530 }, { "epoch": 0.022339897401952673, "grad_norm": 1.8857308626174927, "learning_rate": 7.149003773085325e-06, "loss": 2.4159, "step": 540 }, { "epoch": 0.022753599205692535, "grad_norm": 2.5732996463775635, "learning_rate": 7.281392731846165e-06, "loss": 2.552, "step": 550 }, { "epoch": 0.022753599205692535, "eval_loss": 2.506718158721924, "eval_runtime": 191.8473, "eval_samples_per_second": 112.001, "eval_steps_per_second": 14.001, "step": 550 }, { "epoch": 0.0231673010094324, "grad_norm": 2.0375943183898926, "learning_rate": 7.413781690607004e-06, "loss": 2.4045, "step": 560 }, { "epoch": 0.023581002813172266, "grad_norm": 1.7082266807556152, "learning_rate": 7.546170649367842e-06, "loss": 2.5122, "step": 570 }, { "epoch": 0.02399470461691213, "grad_norm": 3.083179235458374, "learning_rate": 7.678559608128683e-06, "loss": 2.4468, "step": 580 }, { "epoch": 0.024408406420651993, "grad_norm": 2.036375045776367, "learning_rate": 7.810948566889522e-06, "loss": 2.3977, "step": 590 }, { "epoch": 0.024822108224391858, "grad_norm": 5.537284851074219, "learning_rate": 7.943337525650362e-06, "loss": 2.5758, "step": 600 }, { "epoch": 0.024822108224391858, "eval_loss": 2.5001819133758545, "eval_runtime": 192.0232, 
"eval_samples_per_second": 111.898, "eval_steps_per_second": 13.988, "step": 600 }, { "epoch": 0.025235810028131724, "grad_norm": 1.991523027420044, "learning_rate": 8.075726484411201e-06, "loss": 2.4625, "step": 610 }, { "epoch": 0.025649511831871585, "grad_norm": 1.6775861978530884, "learning_rate": 8.20811544317204e-06, "loss": 2.544, "step": 620 }, { "epoch": 0.02606321363561145, "grad_norm": 3.812007188796997, "learning_rate": 8.340504401932878e-06, "loss": 2.512, "step": 630 }, { "epoch": 0.026476915439351316, "grad_norm": 1.9208543300628662, "learning_rate": 8.47289336069372e-06, "loss": 2.5037, "step": 640 }, { "epoch": 0.02689061724309118, "grad_norm": 4.224174976348877, "learning_rate": 8.605282319454557e-06, "loss": 2.5321, "step": 650 }, { "epoch": 0.02689061724309118, "eval_loss": 2.49426531791687, "eval_runtime": 191.5704, "eval_samples_per_second": 112.162, "eval_steps_per_second": 14.021, "step": 650 }, { "epoch": 0.027304319046831043, "grad_norm": 2.186660051345825, "learning_rate": 8.737671278215397e-06, "loss": 2.5514, "step": 660 }, { "epoch": 0.02771802085057091, "grad_norm": 1.7968275547027588, "learning_rate": 8.870060236976236e-06, "loss": 2.5907, "step": 670 }, { "epoch": 0.028131722654310774, "grad_norm": 2.399308443069458, "learning_rate": 9.002449195737076e-06, "loss": 2.4944, "step": 680 }, { "epoch": 0.028545424458050635, "grad_norm": 1.9562846422195435, "learning_rate": 9.134838154497915e-06, "loss": 2.5399, "step": 690 }, { "epoch": 0.0289591262617905, "grad_norm": 8.584575653076172, "learning_rate": 9.267227113258755e-06, "loss": 2.5634, "step": 700 }, { "epoch": 0.0289591262617905, "eval_loss": 2.491830587387085, "eval_runtime": 191.7155, "eval_samples_per_second": 112.078, "eval_steps_per_second": 14.01, "step": 700 }, { "epoch": 0.029372828065530366, "grad_norm": 2.4395720958709717, "learning_rate": 9.399616072019594e-06, "loss": 2.4985, "step": 710 }, { "epoch": 0.02978652986927023, "grad_norm": 1.7639998197555542, "learning_rate": 9.532005030780433e-06, "loss": 2.4504, "step": 720 }, { "epoch": 0.030200231673010093, "grad_norm": 3.084226131439209, "learning_rate": 9.664393989541273e-06, "loss": 2.5218, "step": 730 }, { "epoch": 0.03061393347674996, "grad_norm": 1.8147473335266113, "learning_rate": 9.796782948302112e-06, "loss": 2.469, "step": 740 }, { "epoch": 0.031027635280489824, "grad_norm": 3.9339489936828613, "learning_rate": 9.92917190706295e-06, "loss": 2.4308, "step": 750 }, { "epoch": 0.031027635280489824, "eval_loss": 2.487623691558838, "eval_runtime": 192.0131, "eval_samples_per_second": 111.904, "eval_steps_per_second": 13.989, "step": 750 }, { "epoch": 0.031441337084229685, "grad_norm": 2.745901346206665, "learning_rate": 1.0061560865823791e-05, "loss": 2.4403, "step": 760 }, { "epoch": 0.031855038887969554, "grad_norm": 1.6878538131713867, "learning_rate": 1.019394982458463e-05, "loss": 2.5143, "step": 770 }, { "epoch": 0.032268740691709416, "grad_norm": 2.9260056018829346, "learning_rate": 1.032633878334547e-05, "loss": 2.495, "step": 780 }, { "epoch": 0.03268244249544928, "grad_norm": 1.7049219608306885, "learning_rate": 1.0458727742106308e-05, "loss": 2.5727, "step": 790 }, { "epoch": 0.03309614429918915, "grad_norm": 6.6026611328125, "learning_rate": 1.059111670086715e-05, "loss": 2.5713, "step": 800 }, { "epoch": 0.03309614429918915, "eval_loss": 2.4831435680389404, "eval_runtime": 191.7737, "eval_samples_per_second": 112.044, "eval_steps_per_second": 14.006, "step": 800 }, { "epoch": 0.03350984610292901, "grad_norm": 
1.8980331420898438, "learning_rate": 1.0723505659627987e-05, "loss": 2.4341, "step": 810 }, { "epoch": 0.03392354790666887, "grad_norm": 1.5614609718322754, "learning_rate": 1.0855894618388828e-05, "loss": 2.5319, "step": 820 }, { "epoch": 0.03433724971040874, "grad_norm": 2.465698480606079, "learning_rate": 1.0988283577149666e-05, "loss": 2.5726, "step": 830 }, { "epoch": 0.0347509515141486, "grad_norm": 1.8372745513916016, "learning_rate": 1.1120672535910505e-05, "loss": 2.4916, "step": 840 }, { "epoch": 0.03516465331788846, "grad_norm": 5.947615146636963, "learning_rate": 1.1253061494671345e-05, "loss": 2.3993, "step": 850 }, { "epoch": 0.03516465331788846, "eval_loss": 2.481991767883301, "eval_runtime": 191.9466, "eval_samples_per_second": 111.943, "eval_steps_per_second": 13.993, "step": 850 }, { "epoch": 0.03557835512162833, "grad_norm": 2.2666265964508057, "learning_rate": 1.1385450453432184e-05, "loss": 2.443, "step": 860 }, { "epoch": 0.03599205692536819, "grad_norm": 1.4873089790344238, "learning_rate": 1.1517839412193024e-05, "loss": 2.475, "step": 870 }, { "epoch": 0.03640575872910806, "grad_norm": 2.3272297382354736, "learning_rate": 1.1650228370953863e-05, "loss": 2.5476, "step": 880 }, { "epoch": 0.036819460532847924, "grad_norm": 1.6611170768737793, "learning_rate": 1.1782617329714703e-05, "loss": 2.5148, "step": 890 }, { "epoch": 0.037233162336587786, "grad_norm": 4.772570610046387, "learning_rate": 1.1915006288475542e-05, "loss": 2.4609, "step": 900 }, { "epoch": 0.037233162336587786, "eval_loss": 2.476624011993408, "eval_runtime": 192.1516, "eval_samples_per_second": 111.823, "eval_steps_per_second": 13.979, "step": 900 }, { "epoch": 0.037646864140327654, "grad_norm": 1.8993462324142456, "learning_rate": 1.2047395247236382e-05, "loss": 2.4464, "step": 910 }, { "epoch": 0.038060565944067516, "grad_norm": 1.7211616039276123, "learning_rate": 1.217978420599722e-05, "loss": 2.4218, "step": 920 }, { "epoch": 0.03847426774780738, "grad_norm": 2.31164288520813, "learning_rate": 1.231217316475806e-05, "loss": 2.4613, "step": 930 }, { "epoch": 0.03888796955154725, "grad_norm": 1.8366121053695679, "learning_rate": 1.2444562123518898e-05, "loss": 2.5939, "step": 940 }, { "epoch": 0.03930167135528711, "grad_norm": 5.815886497497559, "learning_rate": 1.257695108227974e-05, "loss": 2.4981, "step": 950 }, { "epoch": 0.03930167135528711, "eval_loss": 2.473769426345825, "eval_runtime": 191.9148, "eval_samples_per_second": 111.961, "eval_steps_per_second": 13.996, "step": 950 }, { "epoch": 0.03971537315902697, "grad_norm": 1.901371955871582, "learning_rate": 1.2709340041040577e-05, "loss": 2.3919, "step": 960 }, { "epoch": 0.04012907496276684, "grad_norm": 1.5490262508392334, "learning_rate": 1.2841728999801417e-05, "loss": 2.4137, "step": 970 }, { "epoch": 0.0405427767665067, "grad_norm": 2.4386820793151855, "learning_rate": 1.2974117958562256e-05, "loss": 2.4258, "step": 980 }, { "epoch": 0.04095647857024657, "grad_norm": 1.8617912530899048, "learning_rate": 1.3106506917323098e-05, "loss": 2.4652, "step": 990 }, { "epoch": 0.04137018037398643, "grad_norm": 4.586647033691406, "learning_rate": 1.3238895876083934e-05, "loss": 2.5594, "step": 1000 }, { "epoch": 0.04137018037398643, "eval_loss": 2.470456838607788, "eval_runtime": 191.9811, "eval_samples_per_second": 111.922, "eval_steps_per_second": 13.991, "step": 1000 }, { "epoch": 0.04178388217772629, "grad_norm": 2.402545690536499, "learning_rate": 1.3371284834844775e-05, "loss": 2.4016, "step": 1010 }, { "epoch": 0.04219758398146616, 
"grad_norm": 1.6968059539794922, "learning_rate": 1.3503673793605614e-05, "loss": 2.5863, "step": 1020 }, { "epoch": 0.042611285785206024, "grad_norm": 2.5629961490631104, "learning_rate": 1.3636062752366452e-05, "loss": 2.5449, "step": 1030 }, { "epoch": 0.043024987588945886, "grad_norm": 1.868660807609558, "learning_rate": 1.3768451711127292e-05, "loss": 2.5221, "step": 1040 }, { "epoch": 0.043438689392685755, "grad_norm": 7.510336399078369, "learning_rate": 1.3900840669888133e-05, "loss": 2.5697, "step": 1050 }, { "epoch": 0.043438689392685755, "eval_loss": 2.4701919555664062, "eval_runtime": 191.9303, "eval_samples_per_second": 111.952, "eval_steps_per_second": 13.995, "step": 1050 }, { "epoch": 0.043852391196425616, "grad_norm": 1.8021678924560547, "learning_rate": 1.4033229628648972e-05, "loss": 2.4508, "step": 1060 }, { "epoch": 0.04426609300016548, "grad_norm": 1.5612075328826904, "learning_rate": 1.416561858740981e-05, "loss": 2.4669, "step": 1070 }, { "epoch": 0.04467979480390535, "grad_norm": 2.181138515472412, "learning_rate": 1.429800754617065e-05, "loss": 2.4372, "step": 1080 }, { "epoch": 0.04509349660764521, "grad_norm": 1.6941418647766113, "learning_rate": 1.443039650493149e-05, "loss": 2.4752, "step": 1090 }, { "epoch": 0.04550719841138507, "grad_norm": 4.783656120300293, "learning_rate": 1.456278546369233e-05, "loss": 2.5192, "step": 1100 }, { "epoch": 0.04550719841138507, "eval_loss": 2.4676802158355713, "eval_runtime": 191.766, "eval_samples_per_second": 112.048, "eval_steps_per_second": 14.007, "step": 1100 }, { "epoch": 0.04592090021512494, "grad_norm": 1.9872281551361084, "learning_rate": 1.4695174422453168e-05, "loss": 2.4922, "step": 1110 }, { "epoch": 0.0463346020188648, "grad_norm": 1.4942958354949951, "learning_rate": 1.4827563381214007e-05, "loss": 2.4805, "step": 1120 }, { "epoch": 0.04674830382260467, "grad_norm": 3.1316111087799072, "learning_rate": 1.4959952339974847e-05, "loss": 2.4435, "step": 1130 }, { "epoch": 0.04716200562634453, "grad_norm": 1.6889320611953735, "learning_rate": 1.5092341298735685e-05, "loss": 2.4423, "step": 1140 }, { "epoch": 0.047575707430084394, "grad_norm": 4.084187984466553, "learning_rate": 1.5224730257496526e-05, "loss": 2.5156, "step": 1150 }, { "epoch": 0.047575707430084394, "eval_loss": 2.4648633003234863, "eval_runtime": 191.7304, "eval_samples_per_second": 112.069, "eval_steps_per_second": 14.009, "step": 1150 }, { "epoch": 0.04798940923382426, "grad_norm": 2.0002176761627197, "learning_rate": 1.5357119216257365e-05, "loss": 2.4802, "step": 1160 }, { "epoch": 0.048403111037564124, "grad_norm": 1.587095022201538, "learning_rate": 1.5489508175018206e-05, "loss": 2.4569, "step": 1170 }, { "epoch": 0.048816812841303986, "grad_norm": 2.3453493118286133, "learning_rate": 1.5621897133779044e-05, "loss": 2.4507, "step": 1180 }, { "epoch": 0.049230514645043855, "grad_norm": 1.5499389171600342, "learning_rate": 1.5754286092539882e-05, "loss": 2.4855, "step": 1190 }, { "epoch": 0.049644216448783716, "grad_norm": 5.546605110168457, "learning_rate": 1.5886675051300723e-05, "loss": 2.5819, "step": 1200 }, { "epoch": 0.049644216448783716, "eval_loss": 2.463754892349243, "eval_runtime": 192.2671, "eval_samples_per_second": 111.756, "eval_steps_per_second": 13.97, "step": 1200 }, { "epoch": 0.05005791825252358, "grad_norm": 1.8684651851654053, "learning_rate": 1.6019064010061564e-05, "loss": 2.4518, "step": 1210 }, { "epoch": 0.05047162005626345, "grad_norm": 1.4557489156723022, "learning_rate": 1.6151452968822402e-05, "loss": 2.3933, 
"step": 1220 }, { "epoch": 0.05088532186000331, "grad_norm": 2.2704148292541504, "learning_rate": 1.628384192758324e-05, "loss": 2.5085, "step": 1230 }, { "epoch": 0.05129902366374317, "grad_norm": 1.5222177505493164, "learning_rate": 1.641623088634408e-05, "loss": 2.413, "step": 1240 }, { "epoch": 0.05171272546748304, "grad_norm": 5.1939921379089355, "learning_rate": 1.654861984510492e-05, "loss": 2.5288, "step": 1250 }, { "epoch": 0.05171272546748304, "eval_loss": 2.4594507217407227, "eval_runtime": 192.0651, "eval_samples_per_second": 111.874, "eval_steps_per_second": 13.985, "step": 1250 }, { "epoch": 0.0521264272712229, "grad_norm": 1.7127642631530762, "learning_rate": 1.6681008803865757e-05, "loss": 2.4378, "step": 1260 }, { "epoch": 0.05254012907496277, "grad_norm": 1.5611684322357178, "learning_rate": 1.6813397762626598e-05, "loss": 2.5137, "step": 1270 }, { "epoch": 0.05295383087870263, "grad_norm": 2.775508403778076, "learning_rate": 1.694578672138744e-05, "loss": 2.5229, "step": 1280 }, { "epoch": 0.053367532682442494, "grad_norm": 1.8678507804870605, "learning_rate": 1.7078175680148277e-05, "loss": 2.4307, "step": 1290 }, { "epoch": 0.05378123448618236, "grad_norm": 6.799563884735107, "learning_rate": 1.7210564638909114e-05, "loss": 2.4565, "step": 1300 }, { "epoch": 0.05378123448618236, "eval_loss": 2.4585297107696533, "eval_runtime": 192.2996, "eval_samples_per_second": 111.737, "eval_steps_per_second": 13.968, "step": 1300 }, { "epoch": 0.054194936289922224, "grad_norm": 1.9326202869415283, "learning_rate": 1.7342953597669956e-05, "loss": 2.5295, "step": 1310 }, { "epoch": 0.054608638093662086, "grad_norm": 1.4933080673217773, "learning_rate": 1.7475342556430793e-05, "loss": 2.47, "step": 1320 }, { "epoch": 0.055022339897401955, "grad_norm": 2.226086378097534, "learning_rate": 1.7607731515191634e-05, "loss": 2.4618, "step": 1330 }, { "epoch": 0.05543604170114182, "grad_norm": 1.5279550552368164, "learning_rate": 1.7740120473952472e-05, "loss": 2.4417, "step": 1340 }, { "epoch": 0.05584974350488168, "grad_norm": 5.200754642486572, "learning_rate": 1.7872509432713313e-05, "loss": 2.4487, "step": 1350 }, { "epoch": 0.05584974350488168, "eval_loss": 2.455735445022583, "eval_runtime": 192.1091, "eval_samples_per_second": 111.848, "eval_steps_per_second": 13.982, "step": 1350 }, { "epoch": 0.05626344530862155, "grad_norm": 1.7944433689117432, "learning_rate": 1.800489839147415e-05, "loss": 2.4703, "step": 1360 }, { "epoch": 0.05667714711236141, "grad_norm": 1.4252562522888184, "learning_rate": 1.8137287350234992e-05, "loss": 2.3916, "step": 1370 }, { "epoch": 0.05709084891610127, "grad_norm": 2.8883371353149414, "learning_rate": 1.826967630899583e-05, "loss": 2.4727, "step": 1380 }, { "epoch": 0.05750455071984114, "grad_norm": 1.4811818599700928, "learning_rate": 1.840206526775667e-05, "loss": 2.4215, "step": 1390 }, { "epoch": 0.057918252523581, "grad_norm": 4.289738655090332, "learning_rate": 1.853445422651751e-05, "loss": 2.5059, "step": 1400 }, { "epoch": 0.057918252523581, "eval_loss": 2.453115940093994, "eval_runtime": 192.2724, "eval_samples_per_second": 111.753, "eval_steps_per_second": 13.97, "step": 1400 }, { "epoch": 0.05833195432732087, "grad_norm": 1.7383557558059692, "learning_rate": 1.866684318527835e-05, "loss": 2.4347, "step": 1410 }, { "epoch": 0.05874565613106073, "grad_norm": 1.3924076557159424, "learning_rate": 1.8799232144039188e-05, "loss": 2.5528, "step": 1420 }, { "epoch": 0.059159357934800594, "grad_norm": 2.3464672565460205, "learning_rate": 
1.8931621102800026e-05, "loss": 2.4477, "step": 1430 }, { "epoch": 0.05957305973854046, "grad_norm": 1.5505847930908203, "learning_rate": 1.9064010061560867e-05, "loss": 2.5231, "step": 1440 }, { "epoch": 0.059986761542280324, "grad_norm": 3.678102731704712, "learning_rate": 1.9196399020321705e-05, "loss": 2.4266, "step": 1450 }, { "epoch": 0.059986761542280324, "eval_loss": 2.4536619186401367, "eval_runtime": 192.0128, "eval_samples_per_second": 111.904, "eval_steps_per_second": 13.989, "step": 1450 }, { "epoch": 0.060400463346020186, "grad_norm": 1.829718828201294, "learning_rate": 1.9328787979082546e-05, "loss": 2.3626, "step": 1460 }, { "epoch": 0.060814165149760055, "grad_norm": 1.4227572679519653, "learning_rate": 1.9461176937843384e-05, "loss": 2.4631, "step": 1470 }, { "epoch": 0.06122786695349992, "grad_norm": 2.187938690185547, "learning_rate": 1.9593565896604225e-05, "loss": 2.369, "step": 1480 }, { "epoch": 0.06164156875723978, "grad_norm": 1.507803201675415, "learning_rate": 1.9725954855365063e-05, "loss": 2.5326, "step": 1490 }, { "epoch": 0.06205527056097965, "grad_norm": 4.287981033325195, "learning_rate": 1.98583438141259e-05, "loss": 2.4951, "step": 1500 }, { "epoch": 0.06205527056097965, "eval_loss": 2.45443058013916, "eval_runtime": 192.2543, "eval_samples_per_second": 111.763, "eval_steps_per_second": 13.971, "step": 1500 }, { "epoch": 0.06246897236471951, "grad_norm": 1.8807919025421143, "learning_rate": 1.999073277288674e-05, "loss": 2.5014, "step": 1510 }, { "epoch": 0.06288267416845937, "grad_norm": 1.365456223487854, "learning_rate": 2.0123121731647583e-05, "loss": 2.528, "step": 1520 }, { "epoch": 0.06329637597219924, "grad_norm": 1.8896796703338623, "learning_rate": 2.025551069040842e-05, "loss": 2.4115, "step": 1530 }, { "epoch": 0.06371007777593911, "grad_norm": 1.3959323167800903, "learning_rate": 2.038789964916926e-05, "loss": 2.4861, "step": 1540 }, { "epoch": 0.06412377957967896, "grad_norm": 2.8032724857330322, "learning_rate": 2.05202886079301e-05, "loss": 2.4606, "step": 1550 }, { "epoch": 0.06412377957967896, "eval_loss": 2.446666717529297, "eval_runtime": 192.1822, "eval_samples_per_second": 111.805, "eval_steps_per_second": 13.976, "step": 1550 }, { "epoch": 0.06453748138341883, "grad_norm": 1.7319939136505127, "learning_rate": 2.065267756669094e-05, "loss": 2.4814, "step": 1560 }, { "epoch": 0.0649511831871587, "grad_norm": 1.5144038200378418, "learning_rate": 2.078506652545178e-05, "loss": 2.5466, "step": 1570 }, { "epoch": 0.06536488499089856, "grad_norm": 2.343106746673584, "learning_rate": 2.0917455484212616e-05, "loss": 2.4458, "step": 1580 }, { "epoch": 0.06577858679463842, "grad_norm": 1.3685145378112793, "learning_rate": 2.1049844442973457e-05, "loss": 2.437, "step": 1590 }, { "epoch": 0.0661922885983783, "grad_norm": 3.443723440170288, "learning_rate": 2.11822334017343e-05, "loss": 2.3836, "step": 1600 }, { "epoch": 0.0661922885983783, "eval_loss": 2.445317268371582, "eval_runtime": 192.3906, "eval_samples_per_second": 111.684, "eval_steps_per_second": 13.961, "step": 1600 }, { "epoch": 0.06660599040211815, "grad_norm": 1.615563154220581, "learning_rate": 2.1314622360495133e-05, "loss": 2.485, "step": 1610 }, { "epoch": 0.06701969220585802, "grad_norm": 1.3848172426223755, "learning_rate": 2.1447011319255974e-05, "loss": 2.4662, "step": 1620 }, { "epoch": 0.06743339400959789, "grad_norm": 2.333655834197998, "learning_rate": 2.1579400278016815e-05, "loss": 2.4428, "step": 1630 }, { "epoch": 0.06784709581333774, "grad_norm": 
1.4733355045318604, "learning_rate": 2.1711789236777656e-05, "loss": 2.4355, "step": 1640 }, { "epoch": 0.06826079761707761, "grad_norm": 3.7136733531951904, "learning_rate": 2.184417819553849e-05, "loss": 2.4641, "step": 1650 }, { "epoch": 0.06826079761707761, "eval_loss": 2.4460644721984863, "eval_runtime": 192.2823, "eval_samples_per_second": 111.747, "eval_steps_per_second": 13.969, "step": 1650 }, { "epoch": 0.06867449942081748, "grad_norm": 1.6279494762420654, "learning_rate": 2.1976567154299332e-05, "loss": 2.4104, "step": 1660 }, { "epoch": 0.06908820122455733, "grad_norm": 1.2883050441741943, "learning_rate": 2.2108956113060173e-05, "loss": 2.4437, "step": 1670 }, { "epoch": 0.0695019030282972, "grad_norm": 1.9009106159210205, "learning_rate": 2.224134507182101e-05, "loss": 2.4805, "step": 1680 }, { "epoch": 0.06991560483203707, "grad_norm": 1.559876561164856, "learning_rate": 2.237373403058185e-05, "loss": 2.4597, "step": 1690 }, { "epoch": 0.07032930663577693, "grad_norm": 5.1238908767700195, "learning_rate": 2.250612298934269e-05, "loss": 2.4473, "step": 1700 }, { "epoch": 0.07032930663577693, "eval_loss": 2.443244695663452, "eval_runtime": 192.4163, "eval_samples_per_second": 111.669, "eval_steps_per_second": 13.959, "step": 1700 }, { "epoch": 0.0707430084395168, "grad_norm": 1.5885063409805298, "learning_rate": 2.263851194810353e-05, "loss": 2.4338, "step": 1710 }, { "epoch": 0.07115671024325666, "grad_norm": 1.3484504222869873, "learning_rate": 2.277090090686437e-05, "loss": 2.4246, "step": 1720 }, { "epoch": 0.07157041204699653, "grad_norm": 2.1982786655426025, "learning_rate": 2.2903289865625207e-05, "loss": 2.3924, "step": 1730 }, { "epoch": 0.07198411385073639, "grad_norm": 1.4460484981536865, "learning_rate": 2.3035678824386048e-05, "loss": 2.3709, "step": 1740 }, { "epoch": 0.07239781565447626, "grad_norm": 3.882939577102661, "learning_rate": 2.316806778314689e-05, "loss": 2.3924, "step": 1750 }, { "epoch": 0.07239781565447626, "eval_loss": 2.4417777061462402, "eval_runtime": 192.6175, "eval_samples_per_second": 111.553, "eval_steps_per_second": 13.945, "step": 1750 }, { "epoch": 0.07281151745821612, "grad_norm": 1.7614493370056152, "learning_rate": 2.3300456741907727e-05, "loss": 2.3751, "step": 1760 }, { "epoch": 0.07322521926195598, "grad_norm": 1.3580434322357178, "learning_rate": 2.3432845700668564e-05, "loss": 2.5205, "step": 1770 }, { "epoch": 0.07363892106569585, "grad_norm": 1.8556549549102783, "learning_rate": 2.3565234659429406e-05, "loss": 2.3609, "step": 1780 }, { "epoch": 0.07405262286943572, "grad_norm": 1.334720492362976, "learning_rate": 2.3697623618190243e-05, "loss": 2.3737, "step": 1790 }, { "epoch": 0.07446632467317557, "grad_norm": 4.6519036293029785, "learning_rate": 2.3830012576951085e-05, "loss": 2.4956, "step": 1800 }, { "epoch": 0.07446632467317557, "eval_loss": 2.4415478706359863, "eval_runtime": 192.2742, "eval_samples_per_second": 111.752, "eval_steps_per_second": 13.97, "step": 1800 }, { "epoch": 0.07488002647691544, "grad_norm": 1.7479881048202515, "learning_rate": 2.3962401535711922e-05, "loss": 2.4674, "step": 1810 }, { "epoch": 0.07529372828065531, "grad_norm": 1.2697054147720337, "learning_rate": 2.4094790494472764e-05, "loss": 2.4265, "step": 1820 }, { "epoch": 0.07570743008439516, "grad_norm": 2.329758405685425, "learning_rate": 2.42271794532336e-05, "loss": 2.4019, "step": 1830 }, { "epoch": 0.07612113188813503, "grad_norm": 1.4872639179229736, "learning_rate": 2.435956841199444e-05, "loss": 2.4432, "step": 1840 }, { "epoch": 
0.0765348336918749, "grad_norm": 5.384171485900879, "learning_rate": 2.449195737075528e-05, "loss": 2.5065, "step": 1850 }, { "epoch": 0.0765348336918749, "eval_loss": 2.43766450881958, "eval_runtime": 192.1785, "eval_samples_per_second": 111.808, "eval_steps_per_second": 13.977, "step": 1850 }, { "epoch": 0.07694853549561476, "grad_norm": 1.7890139818191528, "learning_rate": 2.462434632951612e-05, "loss": 2.5044, "step": 1860 }, { "epoch": 0.07736223729935462, "grad_norm": 1.1488181352615356, "learning_rate": 2.475673528827696e-05, "loss": 2.4178, "step": 1870 }, { "epoch": 0.0777759391030945, "grad_norm": 1.9832926988601685, "learning_rate": 2.4889124247037797e-05, "loss": 2.4288, "step": 1880 }, { "epoch": 0.07818964090683435, "grad_norm": 1.3308862447738647, "learning_rate": 2.5021513205798635e-05, "loss": 2.4807, "step": 1890 }, { "epoch": 0.07860334271057422, "grad_norm": 4.683075428009033, "learning_rate": 2.515390216455948e-05, "loss": 2.57, "step": 1900 }, { "epoch": 0.07860334271057422, "eval_loss": 2.4399499893188477, "eval_runtime": 192.4133, "eval_samples_per_second": 111.671, "eval_steps_per_second": 13.96, "step": 1900 }, { "epoch": 0.07901704451431409, "grad_norm": 1.6687976121902466, "learning_rate": 2.5286291123320317e-05, "loss": 2.477, "step": 1910 }, { "epoch": 0.07943074631805394, "grad_norm": 1.240014910697937, "learning_rate": 2.5418680082081155e-05, "loss": 2.4024, "step": 1920 }, { "epoch": 0.07984444812179381, "grad_norm": 2.4139554500579834, "learning_rate": 2.5551069040841996e-05, "loss": 2.4779, "step": 1930 }, { "epoch": 0.08025814992553368, "grad_norm": 1.3807213306427002, "learning_rate": 2.5683457999602834e-05, "loss": 2.5232, "step": 1940 }, { "epoch": 0.08067185172927353, "grad_norm": 3.208155393600464, "learning_rate": 2.5815846958363675e-05, "loss": 2.4057, "step": 1950 }, { "epoch": 0.08067185172927353, "eval_loss": 2.435680866241455, "eval_runtime": 192.0232, "eval_samples_per_second": 111.898, "eval_steps_per_second": 13.988, "step": 1950 }, { "epoch": 0.0810855535330134, "grad_norm": 1.622277855873108, "learning_rate": 2.5948235917124513e-05, "loss": 2.4458, "step": 1960 }, { "epoch": 0.08149925533675327, "grad_norm": 1.298906683921814, "learning_rate": 2.608062487588535e-05, "loss": 2.5103, "step": 1970 }, { "epoch": 0.08191295714049314, "grad_norm": 1.9455851316452026, "learning_rate": 2.6213013834646195e-05, "loss": 2.381, "step": 1980 }, { "epoch": 0.082326658944233, "grad_norm": 1.4825599193572998, "learning_rate": 2.6345402793407033e-05, "loss": 2.3797, "step": 1990 }, { "epoch": 0.08274036074797286, "grad_norm": 3.7889599800109863, "learning_rate": 2.6477791752167867e-05, "loss": 2.4555, "step": 2000 }, { "epoch": 0.08274036074797286, "eval_loss": 2.4349894523620605, "eval_runtime": 192.2531, "eval_samples_per_second": 111.764, "eval_steps_per_second": 13.971, "step": 2000 }, { "epoch": 0.08315406255171273, "grad_norm": 1.5155397653579712, "learning_rate": 2.6610180710928712e-05, "loss": 2.4048, "step": 2010 }, { "epoch": 0.08356776435545259, "grad_norm": 1.2312376499176025, "learning_rate": 2.674256966968955e-05, "loss": 2.3897, "step": 2020 }, { "epoch": 0.08398146615919246, "grad_norm": 1.872260332107544, "learning_rate": 2.6874958628450387e-05, "loss": 2.4613, "step": 2030 }, { "epoch": 0.08439516796293232, "grad_norm": 1.3909080028533936, "learning_rate": 2.700734758721123e-05, "loss": 2.41, "step": 2040 }, { "epoch": 0.08480886976667218, "grad_norm": 4.282350540161133, "learning_rate": 2.7139736545972066e-05, "loss": 2.5578, "step": 
2050 }, { "epoch": 0.08480886976667218, "eval_loss": 2.43388295173645, "eval_runtime": 192.2578, "eval_samples_per_second": 111.761, "eval_steps_per_second": 13.971, "step": 2050 }, { "epoch": 0.08522257157041205, "grad_norm": 1.5022432804107666, "learning_rate": 2.7272125504732904e-05, "loss": 2.4, "step": 2060 }, { "epoch": 0.08563627337415192, "grad_norm": 1.1862342357635498, "learning_rate": 2.7404514463493745e-05, "loss": 2.4851, "step": 2070 }, { "epoch": 0.08604997517789177, "grad_norm": 2.135618209838867, "learning_rate": 2.7536903422254583e-05, "loss": 2.5404, "step": 2080 }, { "epoch": 0.08646367698163164, "grad_norm": 1.5129780769348145, "learning_rate": 2.7669292381015428e-05, "loss": 2.4569, "step": 2090 }, { "epoch": 0.08687737878537151, "grad_norm": 3.3164620399475098, "learning_rate": 2.7801681339776265e-05, "loss": 2.4314, "step": 2100 }, { "epoch": 0.08687737878537151, "eval_loss": 2.433966636657715, "eval_runtime": 192.2774, "eval_samples_per_second": 111.75, "eval_steps_per_second": 13.969, "step": 2100 }, { "epoch": 0.08729108058911136, "grad_norm": 1.7532408237457275, "learning_rate": 2.7934070298537103e-05, "loss": 2.493, "step": 2110 }, { "epoch": 0.08770478239285123, "grad_norm": 1.2657155990600586, "learning_rate": 2.8066459257297944e-05, "loss": 2.4338, "step": 2120 }, { "epoch": 0.0881184841965911, "grad_norm": 1.486377239227295, "learning_rate": 2.8198848216058782e-05, "loss": 2.4833, "step": 2130 }, { "epoch": 0.08853218600033096, "grad_norm": 1.3843668699264526, "learning_rate": 2.833123717481962e-05, "loss": 2.444, "step": 2140 }, { "epoch": 0.08894588780407083, "grad_norm": 4.9271745681762695, "learning_rate": 2.846362613358046e-05, "loss": 2.4294, "step": 2150 }, { "epoch": 0.08894588780407083, "eval_loss": 2.431682825088501, "eval_runtime": 192.1644, "eval_samples_per_second": 111.816, "eval_steps_per_second": 13.978, "step": 2150 }, { "epoch": 0.0893595896078107, "grad_norm": 1.6303709745407104, "learning_rate": 2.85960150923413e-05, "loss": 2.4843, "step": 2160 }, { "epoch": 0.08977329141155055, "grad_norm": 1.255014181137085, "learning_rate": 2.8728404051102137e-05, "loss": 2.4619, "step": 2170 }, { "epoch": 0.09018699321529042, "grad_norm": 2.0804553031921387, "learning_rate": 2.886079300986298e-05, "loss": 2.4025, "step": 2180 }, { "epoch": 0.09060069501903029, "grad_norm": 1.3440794944763184, "learning_rate": 2.8993181968623816e-05, "loss": 2.3668, "step": 2190 }, { "epoch": 0.09101439682277014, "grad_norm": 4.05084228515625, "learning_rate": 2.912557092738466e-05, "loss": 2.4092, "step": 2200 }, { "epoch": 0.09101439682277014, "eval_loss": 2.4323976039886475, "eval_runtime": 192.1923, "eval_samples_per_second": 111.799, "eval_steps_per_second": 13.976, "step": 2200 }, { "epoch": 0.09142809862651001, "grad_norm": 1.741429328918457, "learning_rate": 2.9257959886145498e-05, "loss": 2.403, "step": 2210 }, { "epoch": 0.09184180043024988, "grad_norm": 1.1738437414169312, "learning_rate": 2.9390348844906336e-05, "loss": 2.4762, "step": 2220 }, { "epoch": 0.09225550223398973, "grad_norm": 2.2958755493164062, "learning_rate": 2.9522737803667177e-05, "loss": 2.4725, "step": 2230 }, { "epoch": 0.0926692040377296, "grad_norm": 1.3816258907318115, "learning_rate": 2.9655126762428015e-05, "loss": 2.4265, "step": 2240 }, { "epoch": 0.09308290584146947, "grad_norm": 4.2315449714660645, "learning_rate": 2.9787515721188852e-05, "loss": 2.5031, "step": 2250 }, { "epoch": 0.09308290584146947, "eval_loss": 2.428933620452881, "eval_runtime": 192.4257, 
"eval_samples_per_second": 111.664, "eval_steps_per_second": 13.959, "step": 2250 }, { "epoch": 0.09349660764520934, "grad_norm": 1.4175034761428833, "learning_rate": 2.9919904679949694e-05, "loss": 2.4373, "step": 2260 }, { "epoch": 0.0939103094489492, "grad_norm": 1.317726969718933, "learning_rate": 3.005229363871053e-05, "loss": 2.4553, "step": 2270 }, { "epoch": 0.09432401125268906, "grad_norm": 1.6487064361572266, "learning_rate": 3.018468259747137e-05, "loss": 2.4697, "step": 2280 }, { "epoch": 0.09473771305642893, "grad_norm": 1.2023123502731323, "learning_rate": 3.0317071556232214e-05, "loss": 2.4394, "step": 2290 }, { "epoch": 0.09515141486016879, "grad_norm": 2.6498024463653564, "learning_rate": 3.044946051499305e-05, "loss": 2.3989, "step": 2300 }, { "epoch": 0.09515141486016879, "eval_loss": 2.4275782108306885, "eval_runtime": 192.4281, "eval_samples_per_second": 111.662, "eval_steps_per_second": 13.958, "step": 2300 }, { "epoch": 0.09556511666390866, "grad_norm": 1.5007858276367188, "learning_rate": 3.0581849473753896e-05, "loss": 2.4096, "step": 2310 }, { "epoch": 0.09597881846764852, "grad_norm": 1.0859999656677246, "learning_rate": 3.071423843251473e-05, "loss": 2.4666, "step": 2320 }, { "epoch": 0.09639252027138838, "grad_norm": 1.9067802429199219, "learning_rate": 3.0846627391275565e-05, "loss": 2.4082, "step": 2330 }, { "epoch": 0.09680622207512825, "grad_norm": 1.3921682834625244, "learning_rate": 3.097901635003641e-05, "loss": 2.3922, "step": 2340 }, { "epoch": 0.09721992387886812, "grad_norm": 4.302435874938965, "learning_rate": 3.111140530879725e-05, "loss": 2.4823, "step": 2350 }, { "epoch": 0.09721992387886812, "eval_loss": 2.425855875015259, "eval_runtime": 192.3695, "eval_samples_per_second": 111.696, "eval_steps_per_second": 13.963, "step": 2350 }, { "epoch": 0.09763362568260797, "grad_norm": 1.5660197734832764, "learning_rate": 3.124379426755809e-05, "loss": 2.3543, "step": 2360 }, { "epoch": 0.09804732748634784, "grad_norm": 1.0670582056045532, "learning_rate": 3.137618322631893e-05, "loss": 2.463, "step": 2370 }, { "epoch": 0.09846102929008771, "grad_norm": 1.8397210836410522, "learning_rate": 3.1508572185079764e-05, "loss": 2.3398, "step": 2380 }, { "epoch": 0.09887473109382756, "grad_norm": 1.3415971994400024, "learning_rate": 3.1640961143840605e-05, "loss": 2.5236, "step": 2390 }, { "epoch": 0.09928843289756743, "grad_norm": 3.6568195819854736, "learning_rate": 3.1773350102601446e-05, "loss": 2.4884, "step": 2400 }, { "epoch": 0.09928843289756743, "eval_loss": 2.424231767654419, "eval_runtime": 192.4379, "eval_samples_per_second": 111.657, "eval_steps_per_second": 13.958, "step": 2400 }, { "epoch": 0.0997021347013073, "grad_norm": 1.3887779712677002, "learning_rate": 3.190573906136228e-05, "loss": 2.3346, "step": 2410 }, { "epoch": 0.10011583650504716, "grad_norm": 1.2054352760314941, "learning_rate": 3.203812802012313e-05, "loss": 2.4657, "step": 2420 }, { "epoch": 0.10052953830878703, "grad_norm": 2.0938591957092285, "learning_rate": 3.217051697888396e-05, "loss": 2.4636, "step": 2430 }, { "epoch": 0.1009432401125269, "grad_norm": 1.3192065954208374, "learning_rate": 3.2302905937644804e-05, "loss": 2.3815, "step": 2440 }, { "epoch": 0.10135694191626675, "grad_norm": 2.894458055496216, "learning_rate": 3.2435294896405645e-05, "loss": 2.3923, "step": 2450 }, { "epoch": 0.10135694191626675, "eval_loss": 2.4255168437957764, "eval_runtime": 192.1, "eval_samples_per_second": 111.853, "eval_steps_per_second": 13.982, "step": 2450 }, { "epoch": 
0.10177064372000662, "grad_norm": 1.536455512046814, "learning_rate": 3.256768385516648e-05, "loss": 2.3386, "step": 2460 }, { "epoch": 0.10218434552374649, "grad_norm": 1.1271722316741943, "learning_rate": 3.270007281392732e-05, "loss": 2.3759, "step": 2470 }, { "epoch": 0.10259804732748634, "grad_norm": 1.591378927230835, "learning_rate": 3.283246177268816e-05, "loss": 2.3984, "step": 2480 }, { "epoch": 0.10301174913122621, "grad_norm": 1.177627682685852, "learning_rate": 3.2964850731448996e-05, "loss": 2.3915, "step": 2490 }, { "epoch": 0.10342545093496608, "grad_norm": 3.4692955017089844, "learning_rate": 3.309723969020984e-05, "loss": 2.4107, "step": 2500 }, { "epoch": 0.10342545093496608, "eval_loss": 2.4271774291992188, "eval_runtime": 192.0196, "eval_samples_per_second": 111.9, "eval_steps_per_second": 13.988, "step": 2500 }, { "epoch": 0.10383915273870595, "grad_norm": 1.9192875623703003, "learning_rate": 3.322962864897068e-05, "loss": 2.4599, "step": 2510 }, { "epoch": 0.1042528545424458, "grad_norm": 1.1955854892730713, "learning_rate": 3.336201760773151e-05, "loss": 2.4953, "step": 2520 }, { "epoch": 0.10466655634618567, "grad_norm": 2.000783681869507, "learning_rate": 3.3494406566492354e-05, "loss": 2.4035, "step": 2530 }, { "epoch": 0.10508025814992554, "grad_norm": 1.265498399734497, "learning_rate": 3.3626795525253195e-05, "loss": 2.4769, "step": 2540 }, { "epoch": 0.1054939599536654, "grad_norm": 3.203890085220337, "learning_rate": 3.3759184484014037e-05, "loss": 2.4565, "step": 2550 }, { "epoch": 0.1054939599536654, "eval_loss": 2.4234707355499268, "eval_runtime": 192.027, "eval_samples_per_second": 111.896, "eval_steps_per_second": 13.988, "step": 2550 }, { "epoch": 0.10590766175740526, "grad_norm": 1.5027508735656738, "learning_rate": 3.389157344277488e-05, "loss": 2.4131, "step": 2560 }, { "epoch": 0.10632136356114513, "grad_norm": 1.1371029615402222, "learning_rate": 3.402396240153571e-05, "loss": 2.4704, "step": 2570 }, { "epoch": 0.10673506536488499, "grad_norm": 1.9199362993240356, "learning_rate": 3.415635136029655e-05, "loss": 2.4057, "step": 2580 }, { "epoch": 0.10714876716862486, "grad_norm": 1.340593934059143, "learning_rate": 3.4288740319057394e-05, "loss": 2.448, "step": 2590 }, { "epoch": 0.10756246897236472, "grad_norm": 4.329082489013672, "learning_rate": 3.442112927781823e-05, "loss": 2.3695, "step": 2600 }, { "epoch": 0.10756246897236472, "eval_loss": 2.4227752685546875, "eval_runtime": 191.9343, "eval_samples_per_second": 111.95, "eval_steps_per_second": 13.994, "step": 2600 }, { "epoch": 0.10797617077610458, "grad_norm": 1.6098721027374268, "learning_rate": 3.455351823657907e-05, "loss": 2.4109, "step": 2610 }, { "epoch": 0.10838987257984445, "grad_norm": 1.108033299446106, "learning_rate": 3.468590719533991e-05, "loss": 2.3429, "step": 2620 }, { "epoch": 0.10880357438358432, "grad_norm": 1.7498970031738281, "learning_rate": 3.481829615410075e-05, "loss": 2.4312, "step": 2630 }, { "epoch": 0.10921727618732417, "grad_norm": 1.2808787822723389, "learning_rate": 3.495068511286159e-05, "loss": 2.3522, "step": 2640 }, { "epoch": 0.10963097799106404, "grad_norm": 3.2506847381591797, "learning_rate": 3.508307407162243e-05, "loss": 2.4399, "step": 2650 }, { "epoch": 0.10963097799106404, "eval_loss": 2.4229462146759033, "eval_runtime": 191.7802, "eval_samples_per_second": 112.04, "eval_steps_per_second": 14.006, "step": 2650 }, { "epoch": 0.11004467979480391, "grad_norm": 1.5602270364761353, "learning_rate": 3.521546303038327e-05, "loss": 2.3953, "step": 2660 
}, { "epoch": 0.11045838159854376, "grad_norm": 1.2033346891403198, "learning_rate": 3.534785198914411e-05, "loss": 2.478, "step": 2670 }, { "epoch": 0.11087208340228363, "grad_norm": 1.848679780960083, "learning_rate": 3.5480240947904945e-05, "loss": 2.4274, "step": 2680 }, { "epoch": 0.1112857852060235, "grad_norm": 1.2948276996612549, "learning_rate": 3.5612629906665786e-05, "loss": 2.3832, "step": 2690 }, { "epoch": 0.11169948700976336, "grad_norm": 2.2465500831604004, "learning_rate": 3.574501886542663e-05, "loss": 2.4686, "step": 2700 }, { "epoch": 0.11169948700976336, "eval_loss": 2.419696092605591, "eval_runtime": 192.0038, "eval_samples_per_second": 111.909, "eval_steps_per_second": 13.989, "step": 2700 }, { "epoch": 0.11211318881350323, "grad_norm": 1.3335747718811035, "learning_rate": 3.587740782418746e-05, "loss": 2.3928, "step": 2710 }, { "epoch": 0.1125268906172431, "grad_norm": 1.230947732925415, "learning_rate": 3.60097967829483e-05, "loss": 2.3551, "step": 2720 }, { "epoch": 0.11294059242098295, "grad_norm": 1.938184380531311, "learning_rate": 3.6142185741709144e-05, "loss": 2.4604, "step": 2730 }, { "epoch": 0.11335429422472282, "grad_norm": 1.2202266454696655, "learning_rate": 3.6274574700469985e-05, "loss": 2.4213, "step": 2740 }, { "epoch": 0.11376799602846269, "grad_norm": 5.06790828704834, "learning_rate": 3.640696365923082e-05, "loss": 2.4199, "step": 2750 }, { "epoch": 0.11376799602846269, "eval_loss": 2.417276382446289, "eval_runtime": 192.1448, "eval_samples_per_second": 111.827, "eval_steps_per_second": 13.979, "step": 2750 }, { "epoch": 0.11418169783220254, "grad_norm": 1.4237877130508423, "learning_rate": 3.653935261799166e-05, "loss": 2.3292, "step": 2760 }, { "epoch": 0.11459539963594241, "grad_norm": 1.0992203950881958, "learning_rate": 3.66717415767525e-05, "loss": 2.4039, "step": 2770 }, { "epoch": 0.11500910143968228, "grad_norm": 1.9876927137374878, "learning_rate": 3.680413053551334e-05, "loss": 2.4894, "step": 2780 }, { "epoch": 0.11542280324342215, "grad_norm": 1.0931942462921143, "learning_rate": 3.693651949427418e-05, "loss": 2.4246, "step": 2790 }, { "epoch": 0.115836505047162, "grad_norm": 2.594099998474121, "learning_rate": 3.706890845303502e-05, "loss": 2.3615, "step": 2800 }, { "epoch": 0.115836505047162, "eval_loss": 2.4184815883636475, "eval_runtime": 192.196, "eval_samples_per_second": 111.797, "eval_steps_per_second": 13.975, "step": 2800 }, { "epoch": 0.11625020685090187, "grad_norm": 1.609725832939148, "learning_rate": 3.720129741179586e-05, "loss": 2.3932, "step": 2810 }, { "epoch": 0.11666390865464174, "grad_norm": 1.1549509763717651, "learning_rate": 3.73336863705567e-05, "loss": 2.3866, "step": 2820 }, { "epoch": 0.1170776104583816, "grad_norm": 1.6155941486358643, "learning_rate": 3.7466075329317535e-05, "loss": 2.4339, "step": 2830 }, { "epoch": 0.11749131226212146, "grad_norm": 1.222001314163208, "learning_rate": 3.7598464288078376e-05, "loss": 2.3729, "step": 2840 }, { "epoch": 0.11790501406586133, "grad_norm": 2.7472927570343018, "learning_rate": 3.773085324683922e-05, "loss": 2.4635, "step": 2850 }, { "epoch": 0.11790501406586133, "eval_loss": 2.419049024581909, "eval_runtime": 192.0307, "eval_samples_per_second": 111.894, "eval_steps_per_second": 13.987, "step": 2850 }, { "epoch": 0.11831871586960119, "grad_norm": 1.4578399658203125, "learning_rate": 3.786324220560005e-05, "loss": 2.3201, "step": 2860 }, { "epoch": 0.11873241767334106, "grad_norm": 1.1061253547668457, "learning_rate": 3.799563116436089e-05, "loss": 2.3709, 
"step": 2870 }, { "epoch": 0.11914611947708093, "grad_norm": 1.784773826599121, "learning_rate": 3.8128020123121734e-05, "loss": 2.4057, "step": 2880 }, { "epoch": 0.11955982128082078, "grad_norm": 1.2136852741241455, "learning_rate": 3.8260409081882575e-05, "loss": 2.4394, "step": 2890 }, { "epoch": 0.11997352308456065, "grad_norm": 3.511291265487671, "learning_rate": 3.839279804064341e-05, "loss": 2.4492, "step": 2900 }, { "epoch": 0.11997352308456065, "eval_loss": 2.415731191635132, "eval_runtime": 192.2162, "eval_samples_per_second": 111.786, "eval_steps_per_second": 13.974, "step": 2900 }, { "epoch": 0.12038722488830052, "grad_norm": 1.2510019540786743, "learning_rate": 3.852518699940425e-05, "loss": 2.4108, "step": 2910 }, { "epoch": 0.12080092669204037, "grad_norm": 1.142307162284851, "learning_rate": 3.865757595816509e-05, "loss": 2.4522, "step": 2920 }, { "epoch": 0.12121462849578024, "grad_norm": 2.044956922531128, "learning_rate": 3.878996491692593e-05, "loss": 2.3445, "step": 2930 }, { "epoch": 0.12162833029952011, "grad_norm": 1.1978434324264526, "learning_rate": 3.892235387568677e-05, "loss": 2.4609, "step": 2940 }, { "epoch": 0.12204203210325996, "grad_norm": 2.3457064628601074, "learning_rate": 3.905474283444761e-05, "loss": 2.4444, "step": 2950 }, { "epoch": 0.12204203210325996, "eval_loss": 2.416599988937378, "eval_runtime": 192.2199, "eval_samples_per_second": 111.783, "eval_steps_per_second": 13.974, "step": 2950 }, { "epoch": 0.12245573390699983, "grad_norm": 1.6423189640045166, "learning_rate": 3.918713179320845e-05, "loss": 2.3361, "step": 2960 }, { "epoch": 0.1228694357107397, "grad_norm": 1.1515411138534546, "learning_rate": 3.9319520751969284e-05, "loss": 2.3867, "step": 2970 }, { "epoch": 0.12328313751447956, "grad_norm": 1.924850344657898, "learning_rate": 3.9451909710730125e-05, "loss": 2.4643, "step": 2980 }, { "epoch": 0.12369683931821943, "grad_norm": 1.2111527919769287, "learning_rate": 3.9584298669490966e-05, "loss": 2.4908, "step": 2990 }, { "epoch": 0.1241105411219593, "grad_norm": 3.314215660095215, "learning_rate": 3.97166876282518e-05, "loss": 2.4057, "step": 3000 }, { "epoch": 0.1241105411219593, "eval_loss": 2.4141523838043213, "eval_runtime": 192.2433, "eval_samples_per_second": 111.77, "eval_steps_per_second": 13.972, "step": 3000 }, { "epoch": 0.12452424292569915, "grad_norm": 1.6337625980377197, "learning_rate": 3.984907658701265e-05, "loss": 2.4341, "step": 3010 }, { "epoch": 0.12493794472943902, "grad_norm": 1.1137609481811523, "learning_rate": 3.998146554577348e-05, "loss": 2.4426, "step": 3020 }, { "epoch": 0.12535164653317887, "grad_norm": 1.6660860776901245, "learning_rate": 4.0113854504534324e-05, "loss": 2.3384, "step": 3030 }, { "epoch": 0.12576534833691874, "grad_norm": 1.2971585988998413, "learning_rate": 4.0246243463295166e-05, "loss": 2.3972, "step": 3040 }, { "epoch": 0.1261790501406586, "grad_norm": 2.1300132274627686, "learning_rate": 4.0378632422056e-05, "loss": 2.3822, "step": 3050 }, { "epoch": 0.1261790501406586, "eval_loss": 2.4136855602264404, "eval_runtime": 192.0899, "eval_samples_per_second": 111.859, "eval_steps_per_second": 13.983, "step": 3050 }, { "epoch": 0.12659275194439848, "grad_norm": 1.2941290140151978, "learning_rate": 4.051102138081684e-05, "loss": 2.3648, "step": 3060 }, { "epoch": 0.12700645374813835, "grad_norm": 1.0182734727859497, "learning_rate": 4.064341033957768e-05, "loss": 2.397, "step": 3070 }, { "epoch": 0.12742015555187822, "grad_norm": 1.5782710313796997, "learning_rate": 4.077579929833852e-05, 
"loss": 2.4181, "step": 3080 }, { "epoch": 0.12783385735561806, "grad_norm": 1.1129857301712036, "learning_rate": 4.0908188257099365e-05, "loss": 2.5093, "step": 3090 }, { "epoch": 0.12824755915935793, "grad_norm": 3.005021810531616, "learning_rate": 4.10405772158602e-05, "loss": 2.3831, "step": 3100 }, { "epoch": 0.12824755915935793, "eval_loss": 2.412200927734375, "eval_runtime": 192.0361, "eval_samples_per_second": 111.89, "eval_steps_per_second": 13.987, "step": 3100 }, { "epoch": 0.1286612609630978, "grad_norm": 1.411031723022461, "learning_rate": 4.117296617462103e-05, "loss": 2.4292, "step": 3110 }, { "epoch": 0.12907496276683766, "grad_norm": 1.1242040395736694, "learning_rate": 4.130535513338188e-05, "loss": 2.4665, "step": 3120 }, { "epoch": 0.12948866457057753, "grad_norm": 1.8342890739440918, "learning_rate": 4.1437744092142716e-05, "loss": 2.4071, "step": 3130 }, { "epoch": 0.1299023663743174, "grad_norm": 1.177221417427063, "learning_rate": 4.157013305090356e-05, "loss": 2.4843, "step": 3140 }, { "epoch": 0.13031606817805724, "grad_norm": 2.7261977195739746, "learning_rate": 4.17025220096644e-05, "loss": 2.376, "step": 3150 }, { "epoch": 0.13031606817805724, "eval_loss": 2.4139814376831055, "eval_runtime": 192.1227, "eval_samples_per_second": 111.84, "eval_steps_per_second": 13.981, "step": 3150 }, { "epoch": 0.1307297699817971, "grad_norm": 1.5093187093734741, "learning_rate": 4.183491096842523e-05, "loss": 2.2857, "step": 3160 }, { "epoch": 0.13114347178553698, "grad_norm": 1.1134452819824219, "learning_rate": 4.1967299927186074e-05, "loss": 2.3132, "step": 3170 }, { "epoch": 0.13155717358927685, "grad_norm": 1.7363109588623047, "learning_rate": 4.2099688885946915e-05, "loss": 2.3669, "step": 3180 }, { "epoch": 0.13197087539301672, "grad_norm": 1.1606100797653198, "learning_rate": 4.223207784470775e-05, "loss": 2.4524, "step": 3190 }, { "epoch": 0.1323845771967566, "grad_norm": 3.1138908863067627, "learning_rate": 4.23644668034686e-05, "loss": 2.4278, "step": 3200 }, { "epoch": 0.1323845771967566, "eval_loss": 2.4108588695526123, "eval_runtime": 192.1595, "eval_samples_per_second": 111.819, "eval_steps_per_second": 13.978, "step": 3200 }, { "epoch": 0.13279827900049646, "grad_norm": 1.4554942846298218, "learning_rate": 4.249685576222943e-05, "loss": 2.4582, "step": 3210 }, { "epoch": 0.1332119808042363, "grad_norm": 1.002738118171692, "learning_rate": 4.2629244720990266e-05, "loss": 2.3749, "step": 3220 }, { "epoch": 0.13362568260797617, "grad_norm": 1.7151089906692505, "learning_rate": 4.2761633679751114e-05, "loss": 2.4267, "step": 3230 }, { "epoch": 0.13403938441171603, "grad_norm": 1.282315731048584, "learning_rate": 4.289402263851195e-05, "loss": 2.4196, "step": 3240 }, { "epoch": 0.1344530862154559, "grad_norm": 2.6666760444641113, "learning_rate": 4.302641159727279e-05, "loss": 2.3976, "step": 3250 }, { "epoch": 0.1344530862154559, "eval_loss": 2.412097930908203, "eval_runtime": 191.9903, "eval_samples_per_second": 111.917, "eval_steps_per_second": 13.99, "step": 3250 }, { "epoch": 0.13486678801919577, "grad_norm": 1.4049510955810547, "learning_rate": 4.315880055603363e-05, "loss": 2.39, "step": 3260 }, { "epoch": 0.13528048982293564, "grad_norm": 1.1045989990234375, "learning_rate": 4.3291189514794465e-05, "loss": 2.414, "step": 3270 }, { "epoch": 0.13569419162667548, "grad_norm": 1.7965985536575317, "learning_rate": 4.342357847355531e-05, "loss": 2.3464, "step": 3280 }, { "epoch": 0.13610789343041535, "grad_norm": 1.3104205131530762, "learning_rate": 
4.355596743231615e-05, "loss": 2.4777, "step": 3290 }, { "epoch": 0.13652159523415522, "grad_norm": 3.072674512863159, "learning_rate": 4.368835639107698e-05, "loss": 2.3883, "step": 3300 }, { "epoch": 0.13652159523415522, "eval_loss": 2.4098899364471436, "eval_runtime": 191.8234, "eval_samples_per_second": 112.015, "eval_steps_per_second": 14.002, "step": 3300 }, { "epoch": 0.1369352970378951, "grad_norm": 1.380400538444519, "learning_rate": 4.382074534983783e-05, "loss": 2.3192, "step": 3310 }, { "epoch": 0.13734899884163496, "grad_norm": 1.0491138696670532, "learning_rate": 4.3953134308598664e-05, "loss": 2.468, "step": 3320 }, { "epoch": 0.13776270064537482, "grad_norm": 1.5807843208312988, "learning_rate": 4.4085523267359505e-05, "loss": 2.3982, "step": 3330 }, { "epoch": 0.13817640244911467, "grad_norm": 1.2569433450698853, "learning_rate": 4.4217912226120346e-05, "loss": 2.3795, "step": 3340 }, { "epoch": 0.13859010425285453, "grad_norm": 3.245999813079834, "learning_rate": 4.435030118488118e-05, "loss": 2.4337, "step": 3350 }, { "epoch": 0.13859010425285453, "eval_loss": 2.4094996452331543, "eval_runtime": 192.1185, "eval_samples_per_second": 111.842, "eval_steps_per_second": 13.981, "step": 3350 }, { "epoch": 0.1390038060565944, "grad_norm": 1.3025696277618408, "learning_rate": 4.448269014364202e-05, "loss": 2.4963, "step": 3360 }, { "epoch": 0.13941750786033427, "grad_norm": 1.0986889600753784, "learning_rate": 4.461507910240286e-05, "loss": 2.4034, "step": 3370 }, { "epoch": 0.13983120966407414, "grad_norm": 1.683685064315796, "learning_rate": 4.47474680611637e-05, "loss": 2.3835, "step": 3380 }, { "epoch": 0.140244911467814, "grad_norm": 1.1629588603973389, "learning_rate": 4.4879857019924545e-05, "loss": 2.3337, "step": 3390 }, { "epoch": 0.14065861327155385, "grad_norm": 2.2829859256744385, "learning_rate": 4.501224597868538e-05, "loss": 2.3364, "step": 3400 }, { "epoch": 0.14065861327155385, "eval_loss": 2.4066433906555176, "eval_runtime": 192.2065, "eval_samples_per_second": 111.791, "eval_steps_per_second": 13.975, "step": 3400 }, { "epoch": 0.14107231507529372, "grad_norm": 1.37264883518219, "learning_rate": 4.5144634937446214e-05, "loss": 2.3832, "step": 3410 }, { "epoch": 0.1414860168790336, "grad_norm": 1.1494956016540527, "learning_rate": 4.527702389620706e-05, "loss": 2.4552, "step": 3420 }, { "epoch": 0.14189971868277346, "grad_norm": 1.7282670736312866, "learning_rate": 4.5409412854967896e-05, "loss": 2.3983, "step": 3430 }, { "epoch": 0.14231342048651333, "grad_norm": 1.3106037378311157, "learning_rate": 4.554180181372874e-05, "loss": 2.3335, "step": 3440 }, { "epoch": 0.1427271222902532, "grad_norm": 2.7251391410827637, "learning_rate": 4.567419077248958e-05, "loss": 2.3768, "step": 3450 }, { "epoch": 0.1427271222902532, "eval_loss": 2.4065420627593994, "eval_runtime": 192.1546, "eval_samples_per_second": 111.821, "eval_steps_per_second": 13.978, "step": 3450 }, { "epoch": 0.14314082409399306, "grad_norm": 1.519156813621521, "learning_rate": 4.580657973125041e-05, "loss": 2.3361, "step": 3460 }, { "epoch": 0.1435545258977329, "grad_norm": 1.0513484477996826, "learning_rate": 4.5938968690011254e-05, "loss": 2.3188, "step": 3470 }, { "epoch": 0.14396822770147277, "grad_norm": 1.7401469945907593, "learning_rate": 4.6071357648772096e-05, "loss": 2.4063, "step": 3480 }, { "epoch": 0.14438192950521264, "grad_norm": 1.1648222208023071, "learning_rate": 4.620374660753293e-05, "loss": 2.3839, "step": 3490 }, { "epoch": 0.1447956313089525, "grad_norm": 3.3850860595703125, 
"learning_rate": 4.633613556629378e-05, "loss": 2.4395, "step": 3500 }, { "epoch": 0.1447956313089525, "eval_loss": 2.4080898761749268, "eval_runtime": 192.0663, "eval_samples_per_second": 111.873, "eval_steps_per_second": 13.985, "step": 3500 }, { "epoch": 0.14520933311269238, "grad_norm": 1.410775899887085, "learning_rate": 4.646852452505461e-05, "loss": 2.3726, "step": 3510 }, { "epoch": 0.14562303491643225, "grad_norm": 1.1001288890838623, "learning_rate": 4.6600913483815453e-05, "loss": 2.3611, "step": 3520 }, { "epoch": 0.1460367367201721, "grad_norm": 2.0212314128875732, "learning_rate": 4.6733302442576295e-05, "loss": 2.4736, "step": 3530 }, { "epoch": 0.14645043852391196, "grad_norm": 1.198148250579834, "learning_rate": 4.686569140133713e-05, "loss": 2.473, "step": 3540 }, { "epoch": 0.14686414032765183, "grad_norm": 3.012451648712158, "learning_rate": 4.699808036009797e-05, "loss": 2.2957, "step": 3550 }, { "epoch": 0.14686414032765183, "eval_loss": 2.4069294929504395, "eval_runtime": 191.954, "eval_samples_per_second": 111.938, "eval_steps_per_second": 13.993, "step": 3550 }, { "epoch": 0.1472778421313917, "grad_norm": 1.3353909254074097, "learning_rate": 4.713046931885881e-05, "loss": 2.3832, "step": 3560 }, { "epoch": 0.14769154393513156, "grad_norm": 1.046620488166809, "learning_rate": 4.7262858277619646e-05, "loss": 2.357, "step": 3570 }, { "epoch": 0.14810524573887143, "grad_norm": 1.5267986059188843, "learning_rate": 4.739524723638049e-05, "loss": 2.3542, "step": 3580 }, { "epoch": 0.14851894754261127, "grad_norm": 1.165702223777771, "learning_rate": 4.752763619514133e-05, "loss": 2.4514, "step": 3590 }, { "epoch": 0.14893264934635114, "grad_norm": 2.9726755619049072, "learning_rate": 4.766002515390217e-05, "loss": 2.396, "step": 3600 }, { "epoch": 0.14893264934635114, "eval_loss": 2.4058215618133545, "eval_runtime": 192.0029, "eval_samples_per_second": 111.91, "eval_steps_per_second": 13.989, "step": 3600 }, { "epoch": 0.149346351150091, "grad_norm": 1.3360719680786133, "learning_rate": 4.779241411266301e-05, "loss": 2.3427, "step": 3610 }, { "epoch": 0.14976005295383088, "grad_norm": 1.145864725112915, "learning_rate": 4.7924803071423845e-05, "loss": 2.4329, "step": 3620 }, { "epoch": 0.15017375475757075, "grad_norm": 1.6864769458770752, "learning_rate": 4.8057192030184686e-05, "loss": 2.3972, "step": 3630 }, { "epoch": 0.15058745656131062, "grad_norm": 1.061402678489685, "learning_rate": 4.818958098894553e-05, "loss": 2.3986, "step": 3640 }, { "epoch": 0.15100115836505046, "grad_norm": 3.768049955368042, "learning_rate": 4.832196994770636e-05, "loss": 2.4117, "step": 3650 }, { "epoch": 0.15100115836505046, "eval_loss": 2.407238245010376, "eval_runtime": 192.2125, "eval_samples_per_second": 111.788, "eval_steps_per_second": 13.974, "step": 3650 }, { "epoch": 0.15141486016879033, "grad_norm": 1.5922744274139404, "learning_rate": 4.84543589064672e-05, "loss": 2.4517, "step": 3660 }, { "epoch": 0.1518285619725302, "grad_norm": 1.0235862731933594, "learning_rate": 4.8586747865228044e-05, "loss": 2.4191, "step": 3670 }, { "epoch": 0.15224226377627006, "grad_norm": 1.8338124752044678, "learning_rate": 4.871913682398888e-05, "loss": 2.3864, "step": 3680 }, { "epoch": 0.15265596558000993, "grad_norm": 1.1432383060455322, "learning_rate": 4.885152578274972e-05, "loss": 2.3625, "step": 3690 }, { "epoch": 0.1530696673837498, "grad_norm": 2.982774257659912, "learning_rate": 4.898391474151056e-05, "loss": 2.3691, "step": 3700 }, { "epoch": 0.1530696673837498, "eval_loss": 
2.4091320037841797, "eval_runtime": 192.1143, "eval_samples_per_second": 111.845, "eval_steps_per_second": 13.981, "step": 3700 }, { "epoch": 0.15348336918748967, "grad_norm": 1.2390708923339844, "learning_rate": 4.91163037002714e-05, "loss": 2.3731, "step": 3710 }, { "epoch": 0.1538970709912295, "grad_norm": 1.0039831399917603, "learning_rate": 4.924869265903224e-05, "loss": 2.4012, "step": 3720 }, { "epoch": 0.15431077279496938, "grad_norm": 1.6033133268356323, "learning_rate": 4.938108161779308e-05, "loss": 2.4, "step": 3730 }, { "epoch": 0.15472447459870925, "grad_norm": 1.278464674949646, "learning_rate": 4.951347057655392e-05, "loss": 2.4183, "step": 3740 }, { "epoch": 0.15513817640244912, "grad_norm": 2.8587698936462402, "learning_rate": 4.964585953531476e-05, "loss": 2.3721, "step": 3750 }, { "epoch": 0.15513817640244912, "eval_loss": 2.40727162361145, "eval_runtime": 192.3745, "eval_samples_per_second": 111.694, "eval_steps_per_second": 13.962, "step": 3750 } ], "logging_steps": 10, "max_steps": 2417200, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.37516682344661e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }
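
Note (not part of the JSON above): a minimal Python sketch for reading a trainer state like this one and re-deriving, approximately, why the EarlyStoppingCallback (patience=3, threshold=0.0, counter=3) halted the run at global step 3750. The file name "trainer_state.json" is assumed here as the conventional location; the loop below is a simplified stand-in for the callback's bookkeeping, not the Trainer's exact implementation.

# Sketch only: assumes the JSON above is saved as "trainer_state.json" next to the checkpoints.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation entries (logged every 50 steps in this run).
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

best_step, best_loss = min(evals, key=lambda x: x[1])
print(f"best eval_loss {best_loss:.4f} at step {best_step}")  # 2.4058 at step 3600 for this log

# Approximate early-stopping bookkeeping: an eval that fails to improve on the running
# best by more than the threshold (0.0) increments the counter; training stops once the
# counter reaches the patience (3). For this log that happens at the step-3750 eval,
# after the non-improving evals at steps 3650, 3700, and 3750.
patience, threshold, counter = 3, 0.0, 0
running_best = float("inf")
for step, loss in evals:
    if loss < running_best - threshold:
        running_best, counter = loss, 0
    else:
        counter += 1
    if counter >= patience:
        print(f"early stopping triggers at step {step}")
        break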