{ "epoch": 1.0, "global_step": 15617, "max_steps": 15617, "logging_steps": 5, "eval_steps": 200, "save_steps": 0, "train_batch_size": 8, "num_train_epochs": 1, "num_input_tokens_seen": 0, "total_flos": 5.2748789856731136e+17, "log_history": [ { "loss": 2.6996, "grad_norm": 2.458204984664917, "learning_rate": 1.6005121638924457e-07, "epoch": 0.0003201639239290517, "step": 5 }, { "loss": 2.6896, "grad_norm": 2.7179408073425293, "learning_rate": 3.2010243277848913e-07, "epoch": 0.0006403278478581034, "step": 10 }, { "loss": 2.6812, "grad_norm": 2.403977870941162, "learning_rate": 4.801536491677337e-07, "epoch": 0.0009604917717871551, "step": 15 }, { "loss": 2.6922, "grad_norm": 2.4200124740600586, "learning_rate": 6.402048655569783e-07, "epoch": 0.0012806556957162068, "step": 20 }, { "loss": 2.6932, "grad_norm": 2.451019048690796, "learning_rate": 8.002560819462229e-07, "epoch": 0.0016008196196452584, "step": 25 }, { "loss": 2.6732, "grad_norm": 2.2547831535339355, "learning_rate": 9.603072983354673e-07, "epoch": 0.0019209835435743102, "step": 30 }, { "loss": 2.7105, "grad_norm": 2.2860751152038574, "learning_rate": 1.120358514724712e-06, "epoch": 0.0022411474675033617, "step": 35 }, { "loss": 2.6946, "grad_norm": 2.1596930027008057, "learning_rate": 1.2804097311139565e-06, "epoch": 0.0025613113914324135, "step": 40 }, { "loss": 2.697, "grad_norm": 2.220602512359619, "learning_rate": 1.4404609475032012e-06, "epoch": 0.002881475315361465, "step": 45 }, { "loss": 2.6972, "grad_norm": 2.0607404708862305, "learning_rate": 1.6005121638924457e-06, "epoch": 0.0032016392392905167, "step": 50 }, { "loss": 2.6861, "grad_norm": 2.138319492340088, "learning_rate": 1.7605633802816904e-06, "epoch": 0.0035218031632195685, "step": 55 }, { "loss": 2.6478, "grad_norm": 2.244121789932251, "learning_rate": 1.9206145966709347e-06, "epoch": 0.0038419670871486203, "step": 60 }, { "loss": 2.6676, "grad_norm": 2.0738816261291504, "learning_rate": 2.0806658130601794e-06, "epoch": 0.004162131011077672, "step": 65 }, { "loss": 2.6839, "grad_norm": 2.0179197788238525, "learning_rate": 2.240717029449424e-06, "epoch": 0.0044822949350067235, "step": 70 }, { "loss": 2.6514, "grad_norm": 1.970077633857727, "learning_rate": 2.4007682458386688e-06, "epoch": 0.004802458858935775, "step": 75 }, { "loss": 2.6685, "grad_norm": 1.9740138053894043, "learning_rate": 2.560819462227913e-06, "epoch": 0.005122622782864827, "step": 80 }, { "loss": 2.6835, "grad_norm": 2.0440573692321777, "learning_rate": 2.7208706786171577e-06, "epoch": 0.0054427867067938784, "step": 85 }, { "loss": 2.6463, "grad_norm": 1.9804933071136475, "learning_rate": 2.8809218950064024e-06, "epoch": 0.00576295063072293, "step": 90 }, { "loss": 2.6683, "grad_norm": 1.9438477754592896, "learning_rate": 3.0409731113956467e-06, "epoch": 0.006083114554651982, "step": 95 }, { "loss": 2.6281, "grad_norm": 2.0071446895599365, "learning_rate": 3.2010243277848914e-06, "epoch": 0.006403278478581033, "step": 100 }, { "loss": 2.6636, "grad_norm": 1.9696836471557617, "learning_rate": 3.361075544174136e-06, "epoch": 0.006723442402510085, "step": 105 }, { "loss": 2.6637, "grad_norm": 2.0100696086883545, "learning_rate": 3.521126760563381e-06, "epoch": 0.007043606326439137, "step": 110 }, { "loss": 2.6533, "grad_norm": 1.9581183195114136, "learning_rate": 3.681177976952625e-06, "epoch": 0.007363770250368188, "step": 115 }, { "loss": 2.6481, "grad_norm": 1.921207070350647, "learning_rate": 3.841229193341869e-06, "epoch": 0.007683934174297241, "step": 120 }, { "loss": 2.6398, "grad_norm": 1.9562214612960815, "learning_rate": 4.001280409731114e-06, "epoch": 0.008004098098226291, "step": 125 }, { "loss": 2.6512, "grad_norm": 2.040691614151001, "learning_rate": 4.161331626120359e-06, "epoch": 0.008324262022155344, "step": 130 }, { "loss": 2.6453, "grad_norm": 2.003002643585205, "learning_rate": 4.321382842509603e-06, "epoch": 0.008644425946084396, "step": 135 }, { "loss": 2.6442, "grad_norm": 1.9360718727111816, "learning_rate": 4.481434058898848e-06, "epoch": 0.008964589870013447, "step": 140 }, { "loss": 2.6393, "grad_norm": 2.054797410964966, "learning_rate": 4.641485275288092e-06, "epoch": 0.009284753793942498, "step": 145 }, { "loss": 2.6241, "grad_norm": 1.9937611818313599, "learning_rate": 4.8015364916773375e-06, "epoch": 0.00960491771787155, "step": 150 }, { "loss": 2.6592, "grad_norm": 2.0823230743408203, "learning_rate": 4.961587708066581e-06, "epoch": 0.009925081641800603, "step": 155 }, { "loss": 2.6511, "grad_norm": 2.1629271507263184, "learning_rate": 5.121638924455826e-06, "epoch": 0.010245245565729654, "step": 160 }, { "loss": 2.6291, "grad_norm": 1.968585729598999, "learning_rate": 5.28169014084507e-06, "epoch": 0.010565409489658706, "step": 165 }, { "loss": 2.632, "grad_norm": 2.0315310955047607, "learning_rate": 5.4417413572343155e-06, "epoch": 0.010885573413587757, "step": 170 }, { "loss": 2.6488, "grad_norm": 2.0342774391174316, "learning_rate": 5.60179257362356e-06, "epoch": 0.011205737337516808, "step": 175 }, { "loss": 2.6346, "grad_norm": 1.9805340766906738, "learning_rate": 5.761843790012805e-06, "epoch": 0.01152590126144586, "step": 180 }, { "loss": 2.6129, "grad_norm": 2.0554709434509277, "learning_rate": 5.921895006402049e-06, "epoch": 0.011846065185374913, "step": 185 }, { "loss": 2.5925, "grad_norm": 1.9576430320739746, "learning_rate": 6.0819462227912934e-06, "epoch": 0.012166229109303964, "step": 190 }, { "loss": 2.6392, "grad_norm": 2.046264410018921, "learning_rate": 6.241997439180538e-06, "epoch": 0.012486393033233015, "step": 195 }, { "loss": 2.6122, "grad_norm": 1.9459033012390137, "learning_rate": 6.402048655569783e-06, "epoch": 0.012806556957162067, "step": 200 }, { "eval_loss": 2.469697952270508, "eval_runtime": 13.9916, "eval_samples_per_second": 146.373, "eval_steps_per_second": 18.297, "epoch": 0.012806556957162067, "step": 200 }, { "loss": 2.6529, "grad_norm": 1.9425408840179443, "learning_rate": 6.562099871959026e-06, "epoch": 0.013126720881091118, "step": 205 }, { "loss": 2.624, "grad_norm": 1.9245175123214722, "learning_rate": 6.722151088348272e-06, "epoch": 0.01344688480502017, "step": 210 }, { "loss": 2.6275, "grad_norm": 2.075512647628784, "learning_rate": 6.882202304737516e-06, "epoch": 0.013767048728949223, "step": 215 }, { "loss": 2.6339, "grad_norm": 2.0762271881103516, "learning_rate": 7.042253521126762e-06, "epoch": 0.014087212652878274, "step": 220 }, { "loss": 2.615, "grad_norm": 1.9588446617126465, "learning_rate": 7.202304737516005e-06, "epoch": 0.014407376576807325, "step": 225 }, { "loss": 2.615, "grad_norm": 2.0142860412597656, "learning_rate": 7.36235595390525e-06, "epoch": 0.014727540500736377, "step": 230 }, { "loss": 2.6257, "grad_norm": 2.0105693340301514, "learning_rate": 7.5224071702944944e-06, "epoch": 0.015047704424665428, "step": 235 }, { "loss": 2.6421, "grad_norm": 2.0586354732513428, "learning_rate": 7.682458386683739e-06, "epoch": 0.015367868348594481, "step": 240 }, { "loss": 2.6181, "grad_norm": 1.9821962118148804, "learning_rate": 7.842509603072984e-06, "epoch": 0.015688032272523533, "step": 245 }, { "loss": 2.593, "grad_norm": 2.0167949199676514, "learning_rate": 8.002560819462227e-06, "epoch": 0.016008196196452582, "step": 250 }, { "loss": 2.6102, "grad_norm": 2.1412761211395264, "learning_rate": 8.162612035851472e-06, "epoch": 0.016328360120381635, "step": 255 }, { "loss": 2.6243, "grad_norm": 1.926281213760376, "learning_rate": 8.322663252240718e-06, "epoch": 0.01664852404431069, "step": 260 }, { "loss": 2.6286, "grad_norm": 1.9839813709259033, "learning_rate": 8.482714468629963e-06, "epoch": 0.016968687968239738, "step": 265 }, { "loss": 2.5883, "grad_norm": 2.0237507820129395, "learning_rate": 8.642765685019206e-06, "epoch": 0.01728885189216879, "step": 270 }, { "loss": 2.6162, "grad_norm": 1.937261939048767, "learning_rate": 8.802816901408451e-06, "epoch": 0.01760901581609784, "step": 275 }, { "loss": 2.6302, "grad_norm": 2.0588278770446777, "learning_rate": 8.962868117797696e-06, "epoch": 0.017929179740026894, "step": 280 }, { "loss": 2.6054, "grad_norm": 2.097682476043701, "learning_rate": 9.12291933418694e-06, "epoch": 0.018249343663955947, "step": 285 }, { "loss": 2.6023, "grad_norm": 2.114814043045044, "learning_rate": 9.282970550576185e-06, "epoch": 0.018569507587884997, "step": 290 }, { "loss": 2.5885, "grad_norm": 1.9583766460418701, "learning_rate": 9.44302176696543e-06, "epoch": 0.01888967151181405, "step": 295 }, { "loss": 2.6237, "grad_norm": 2.047853708267212, "learning_rate": 9.603072983354675e-06, "epoch": 0.0192098354357431, "step": 300 }, { "loss": 2.5634, "grad_norm": 2.0519795417785645, "learning_rate": 9.763124199743919e-06, "epoch": 0.019529999359672152, "step": 305 }, { "loss": 2.624, "grad_norm": 2.057596445083618, "learning_rate": 9.923175416133162e-06, "epoch": 0.019850163283601206, "step": 310 }, { "loss": 2.6016, "grad_norm": 2.060136079788208, "learning_rate": 1.0083226632522407e-05, "epoch": 0.020170327207530255, "step": 315 }, { "loss": 2.6114, "grad_norm": 2.0268754959106445, "learning_rate": 1.0243277848911652e-05, "epoch": 0.02049049113145931, "step": 320 }, { "loss": 2.5783, "grad_norm": 2.0680699348449707, "learning_rate": 1.0403329065300897e-05, "epoch": 0.020810655055388358, "step": 325 }, { "loss": 2.5616, "grad_norm": 1.9991204738616943, "learning_rate": 1.056338028169014e-05, "epoch": 0.02113081897931741, "step": 330 }, { "loss": 2.6128, "grad_norm": 2.09833025932312, "learning_rate": 1.0723431498079386e-05, "epoch": 0.02145098290324646, "step": 335 }, { "loss": 2.6025, "grad_norm": 1.9758498668670654, "learning_rate": 1.0883482714468631e-05, "epoch": 0.021771146827175514, "step": 340 }, { "loss": 2.5805, "grad_norm": 2.0436413288116455, "learning_rate": 1.1043533930857874e-05, "epoch": 0.022091310751104567, "step": 345 }, { "loss": 2.5944, "grad_norm": 2.0317132472991943, "learning_rate": 1.120358514724712e-05, "epoch": 0.022411474675033616, "step": 350 }, { "loss": 2.5963, "grad_norm": 2.199904680252075, "learning_rate": 1.1363636363636365e-05, "epoch": 0.02273163859896267, "step": 355 }, { "loss": 2.5799, "grad_norm": 1.9787744283676147, "learning_rate": 1.152368758002561e-05, "epoch": 0.02305180252289172, "step": 360 }, { "loss": 2.6212, "grad_norm": 1.890915870666504, "learning_rate": 1.1683738796414853e-05, "epoch": 0.023371966446820772, "step": 365 }, { "loss": 2.5704, "grad_norm": 2.013899087905884, "learning_rate": 1.1843790012804098e-05, "epoch": 0.023692130370749825, "step": 370 }, { "loss": 2.6101, "grad_norm": 2.0689878463745117, "learning_rate": 1.2003841229193342e-05, "epoch": 0.024012294294678875, "step": 375 }, { "loss": 2.599, "grad_norm": 2.028207540512085, "learning_rate": 1.2163892445582587e-05, "epoch": 0.024332458218607928, "step": 380 }, { "loss": 2.6125, "grad_norm": 1.9936445951461792, "learning_rate": 1.2323943661971832e-05, "epoch": 0.024652622142536978, "step": 385 }, { "loss": 2.5983, "grad_norm": 2.112257719039917, "learning_rate": 1.2483994878361075e-05, "epoch": 0.02497278606646603, "step": 390 }, { "loss": 2.6074, "grad_norm": 2.079145669937134, "learning_rate": 1.264404609475032e-05, "epoch": 0.025292949990395084, "step": 395 }, { "loss": 2.5697, "grad_norm": 2.074415922164917, "learning_rate": 1.2804097311139566e-05, "epoch": 0.025613113914324134, "step": 400 }, { "eval_loss": 2.420924663543701, "eval_runtime": 14.9165, "eval_samples_per_second": 137.297, "eval_steps_per_second": 17.162, "epoch": 0.025613113914324134, "step": 400 }, { "loss": 2.5931, "grad_norm": 1.9997014999389648, "learning_rate": 1.296414852752881e-05, "epoch": 0.025933277838253187, "step": 405 }, { "loss": 2.5684, "grad_norm": 2.0531606674194336, "learning_rate": 1.3124199743918053e-05, "epoch": 0.026253441762182236, "step": 410 }, { "loss": 2.5935, "grad_norm": 2.0724613666534424, "learning_rate": 1.3284250960307298e-05, "epoch": 0.02657360568611129, "step": 415 }, { "loss": 2.5658, "grad_norm": 2.071101188659668, "learning_rate": 1.3444302176696544e-05, "epoch": 0.02689376961004034, "step": 420 }, { "loss": 2.5874, "grad_norm": 1.9947307109832764, "learning_rate": 1.360435339308579e-05, "epoch": 0.027213933533969392, "step": 425 }, { "loss": 2.5713, "grad_norm": 2.066715717315674, "learning_rate": 1.3764404609475031e-05, "epoch": 0.027534097457898445, "step": 430 }, { "loss": 2.5831, "grad_norm": 2.0885956287384033, "learning_rate": 1.3924455825864276e-05, "epoch": 0.027854261381827495, "step": 435 }, { "loss": 2.5972, "grad_norm": 2.0391199588775635, "learning_rate": 1.4084507042253523e-05, "epoch": 0.028174425305756548, "step": 440 }, { "loss": 2.5874, "grad_norm": 1.9865524768829346, "learning_rate": 1.4244558258642765e-05, "epoch": 0.028494589229685598, "step": 445 }, { "loss": 2.5465, "grad_norm": 2.0678815841674805, "learning_rate": 1.440460947503201e-05, "epoch": 0.02881475315361465, "step": 450 }, { "loss": 2.5946, "grad_norm": 2.057670831680298, "learning_rate": 1.4564660691421255e-05, "epoch": 0.029134917077543704, "step": 455 }, { "loss": 2.6092, "grad_norm": 2.101698875427246, "learning_rate": 1.47247119078105e-05, "epoch": 0.029455081001472753, "step": 460 }, { "loss": 2.5654, "grad_norm": 2.0881927013397217, "learning_rate": 1.4884763124199744e-05, "epoch": 0.029775244925401807, "step": 465 }, { "loss": 2.5596, "grad_norm": 2.091878890991211, "learning_rate": 1.5044814340588989e-05, "epoch": 0.030095408849330856, "step": 470 }, { "loss": 2.5909, "grad_norm": 2.0245919227600098, "learning_rate": 1.5204865556978234e-05, "epoch": 0.03041557277325991, "step": 475 }, { "loss": 2.5902, "grad_norm": 2.0498111248016357, "learning_rate": 1.5364916773367477e-05, "epoch": 0.030735736697188962, "step": 480 }, { "loss": 2.5611, "grad_norm": 2.0705480575561523, "learning_rate": 1.5524967989756723e-05, "epoch": 0.031055900621118012, "step": 485 }, { "loss": 2.5305, "grad_norm": 2.0246481895446777, "learning_rate": 1.5685019206145968e-05, "epoch": 0.031376064545047065, "step": 490 }, { "loss": 2.5752, "grad_norm": 1.9090466499328613, "learning_rate": 1.5845070422535213e-05, "epoch": 0.031696228468976115, "step": 495 }, { "loss": 2.5908, "grad_norm": 2.044546604156494, "learning_rate": 1.6005121638924455e-05, "epoch": 0.032016392392905164, "step": 500 }, { "loss": 2.5695, "grad_norm": 2.096444845199585, "learning_rate": 1.61651728553137e-05, "epoch": 0.03233655631683422, "step": 505 }, { "loss": 2.5942, "grad_norm": 2.1051011085510254, "learning_rate": 1.6325224071702945e-05, "epoch": 0.03265672024076327, "step": 510 }, { "loss": 2.5629, "grad_norm": 1.9321959018707275, "learning_rate": 1.648527528809219e-05, "epoch": 0.03297688416469232, "step": 515 }, { "loss": 2.5745, "grad_norm": 2.038756847381592, "learning_rate": 1.6645326504481435e-05, "epoch": 0.03329704808862138, "step": 520 }, { "loss": 2.5618, "grad_norm": 2.018385410308838, "learning_rate": 1.680537772087068e-05, "epoch": 0.033617212012550426, "step": 525 }, { "loss": 2.5884, "grad_norm": 2.459459066390991, "learning_rate": 1.6965428937259925e-05, "epoch": 0.033937375936479476, "step": 530 }, { "loss": 2.5639, "grad_norm": 1.96848726272583, "learning_rate": 1.7125480153649167e-05, "epoch": 0.034257539860408526, "step": 535 }, { "loss": 2.56, "grad_norm": 2.0582144260406494, "learning_rate": 1.7285531370038412e-05, "epoch": 0.03457770378433758, "step": 540 }, { "loss": 2.5354, "grad_norm": 2.1106033325195312, "learning_rate": 1.7445582586427657e-05, "epoch": 0.03489786770826663, "step": 545 }, { "loss": 2.5768, "grad_norm": 2.137942314147949, "learning_rate": 1.7605633802816902e-05, "epoch": 0.03521803163219568, "step": 550 }, { "loss": 2.5491, "grad_norm": 2.1352152824401855, "learning_rate": 1.7765685019206147e-05, "epoch": 0.03553819555612474, "step": 555 }, { "loss": 2.5545, "grad_norm": 2.0633294582366943, "learning_rate": 1.7925736235595393e-05, "epoch": 0.03585835948005379, "step": 560 }, { "loss": 2.5579, "grad_norm": 2.02809476852417, "learning_rate": 1.8085787451984638e-05, "epoch": 0.03617852340398284, "step": 565 }, { "loss": 2.5585, "grad_norm": 2.1206002235412598, "learning_rate": 1.824583866837388e-05, "epoch": 0.036498687327911894, "step": 570 }, { "loss": 2.5356, "grad_norm": 2.072930335998535, "learning_rate": 1.8405889884763125e-05, "epoch": 0.036818851251840944, "step": 575 }, { "loss": 2.5238, "grad_norm": 1.958531379699707, "learning_rate": 1.856594110115237e-05, "epoch": 0.03713901517576999, "step": 580 }, { "loss": 2.5486, "grad_norm": 2.0069925785064697, "learning_rate": 1.872599231754161e-05, "epoch": 0.03745917909969904, "step": 585 }, { "loss": 2.5567, "grad_norm": 2.0546441078186035, "learning_rate": 1.888604353393086e-05, "epoch": 0.0377793430236281, "step": 590 }, { "loss": 2.5517, "grad_norm": 2.076535940170288, "learning_rate": 1.9046094750320105e-05, "epoch": 0.03809950694755715, "step": 595 }, { "loss": 2.5513, "grad_norm": 2.17091703414917, "learning_rate": 1.920614596670935e-05, "epoch": 0.0384196708714862, "step": 600 }, { "eval_loss": 2.400339126586914, "eval_runtime": 10.2049, "eval_samples_per_second": 200.687, "eval_steps_per_second": 25.086, "epoch": 0.0384196708714862, "step": 600 }, { "loss": 2.5444, "grad_norm": 1.9744805097579956, "learning_rate": 1.9366197183098592e-05, "epoch": 0.038739834795415255, "step": 605 }, { "loss": 2.5378, "grad_norm": 1.9619749784469604, "learning_rate": 1.9526248399487837e-05, "epoch": 0.039059998719344305, "step": 610 }, { "loss": 2.5362, "grad_norm": 2.023552656173706, "learning_rate": 1.9686299615877082e-05, "epoch": 0.039380162643273355, "step": 615 }, { "loss": 2.563, "grad_norm": 2.148352861404419, "learning_rate": 1.9846350832266324e-05, "epoch": 0.03970032656720241, "step": 620 }, { "loss": 2.5317, "grad_norm": 2.052964925765991, "learning_rate": 2.000640204865557e-05, "epoch": 0.04002049049113146, "step": 625 }, { "loss": 2.5463, "grad_norm": 2.0874581336975098, "learning_rate": 2.0166453265044814e-05, "epoch": 0.04034065441506051, "step": 630 }, { "loss": 2.5432, "grad_norm": 2.1276135444641113, "learning_rate": 2.0326504481434063e-05, "epoch": 0.04066081833898956, "step": 635 }, { "loss": 2.5441, "grad_norm": 2.120331048965454, "learning_rate": 2.0486555697823304e-05, "epoch": 0.04098098226291862, "step": 640 }, { "loss": 2.5594, "grad_norm": 1.9808118343353271, "learning_rate": 2.064660691421255e-05, "epoch": 0.041301146186847666, "step": 645 }, { "loss": 2.5496, "grad_norm": 2.133842706680298, "learning_rate": 2.0806658130601795e-05, "epoch": 0.041621310110776716, "step": 650 }, { "loss": 2.5293, "grad_norm": 2.060401439666748, "learning_rate": 2.0966709346991036e-05, "epoch": 0.04194147403470577, "step": 655 }, { "loss": 2.5548, "grad_norm": 2.127145767211914, "learning_rate": 2.112676056338028e-05, "epoch": 0.04226163795863482, "step": 660 }, { "loss": 2.5335, "grad_norm": 2.0574846267700195, "learning_rate": 2.1286811779769527e-05, "epoch": 0.04258180188256387, "step": 665 }, { "loss": 2.569, "grad_norm": 2.057927131652832, "learning_rate": 2.1446862996158772e-05, "epoch": 0.04290196580649292, "step": 670 }, { "loss": 2.5401, "grad_norm": 2.064457654953003, "learning_rate": 2.1606914212548017e-05, "epoch": 0.04322212973042198, "step": 675 }, { "loss": 2.5279, "grad_norm": 2.1458826065063477, "learning_rate": 2.1766965428937262e-05, "epoch": 0.04354229365435103, "step": 680 }, { "loss": 2.5387, "grad_norm": 1.951903223991394, "learning_rate": 2.1927016645326507e-05, "epoch": 0.04386245757828008, "step": 685 }, { "loss": 2.4947, "grad_norm": 2.019552707672119, "learning_rate": 2.208706786171575e-05, "epoch": 0.044182621502209134, "step": 690 }, { "loss": 2.5334, "grad_norm": 2.1926770210266113, "learning_rate": 2.2247119078104994e-05, "epoch": 0.04450278542613818, "step": 695 }, { "loss": 2.5333, "grad_norm": 2.108576774597168, "learning_rate": 2.240717029449424e-05, "epoch": 0.04482294935006723, "step": 700 }, { "loss": 2.5361, "grad_norm": 2.0758907794952393, "learning_rate": 2.2567221510883484e-05, "epoch": 0.04514311327399629, "step": 705 }, { "loss": 2.5382, "grad_norm": 2.120901107788086, "learning_rate": 2.272727272727273e-05, "epoch": 0.04546327719792534, "step": 710 }, { "loss": 2.5182, "grad_norm": 2.098022699356079, "learning_rate": 2.2887323943661974e-05, "epoch": 0.04578344112185439, "step": 715 }, { "loss": 2.5062, "grad_norm": 2.1501033306121826, "learning_rate": 2.304737516005122e-05, "epoch": 0.04610360504578344, "step": 720 }, { "loss": 2.5476, "grad_norm": 2.16194748878479, "learning_rate": 2.320742637644046e-05, "epoch": 0.046423768969712495, "step": 725 }, { "loss": 2.5253, "grad_norm": 2.200193405151367, "learning_rate": 2.3367477592829706e-05, "epoch": 0.046743932893641545, "step": 730 }, { "loss": 2.5261, "grad_norm": 2.143402099609375, "learning_rate": 2.352752880921895e-05, "epoch": 0.047064096817570594, "step": 735 }, { "loss": 2.5514, "grad_norm": 2.0588419437408447, "learning_rate": 2.3687580025608197e-05, "epoch": 0.04738426074149965, "step": 740 }, { "loss": 2.5248, "grad_norm": 2.039383888244629, "learning_rate": 2.384763124199744e-05, "epoch": 0.0477044246654287, "step": 745 }, { "loss": 2.5464, "grad_norm": 2.1919689178466797, "learning_rate": 2.4007682458386683e-05, "epoch": 0.04802458858935775, "step": 750 }, { "loss": 2.5142, "grad_norm": 2.0702781677246094, "learning_rate": 2.4167733674775932e-05, "epoch": 0.0483447525132868, "step": 755 }, { "loss": 2.5399, "grad_norm": 2.0351169109344482, "learning_rate": 2.4327784891165174e-05, "epoch": 0.048664916437215856, "step": 760 }, { "loss": 2.5392, "grad_norm": 2.276874542236328, "learning_rate": 2.448783610755442e-05, "epoch": 0.048985080361144906, "step": 765 }, { "loss": 2.5221, "grad_norm": 2.1543283462524414, "learning_rate": 2.4647887323943664e-05, "epoch": 0.049305244285073956, "step": 770 }, { "loss": 2.5136, "grad_norm": 2.0453758239746094, "learning_rate": 2.480793854033291e-05, "epoch": 0.04962540820900301, "step": 775 }, { "loss": 2.5164, "grad_norm": 2.0810751914978027, "learning_rate": 2.496798975672215e-05, "epoch": 0.04994557213293206, "step": 780 }, { "loss": 2.5324, "grad_norm": 2.0781326293945312, "learning_rate": 2.5128040973111393e-05, "epoch": 0.05026573605686111, "step": 785 }, { "loss": 2.4946, "grad_norm": 1.9677348136901855, "learning_rate": 2.528809218950064e-05, "epoch": 0.05058589998079017, "step": 790 }, { "loss": 2.5258, "grad_norm": 2.114290475845337, "learning_rate": 2.5448143405889886e-05, "epoch": 0.05090606390471922, "step": 795 }, { "loss": 2.5322, "grad_norm": 2.0765063762664795, "learning_rate": 2.560819462227913e-05, "epoch": 0.05122622782864827, "step": 800 }, { "eval_loss": 2.371161460876465, "eval_runtime": 12.4134, "eval_samples_per_second": 164.983, "eval_steps_per_second": 20.623, "epoch": 0.05122622782864827, "step": 800 }, { "loss": 2.5383, "grad_norm": 2.069668769836426, "learning_rate": 2.5768245838668376e-05, "epoch": 0.05154639175257732, "step": 805 }, { "loss": 2.5004, "grad_norm": 2.1110806465148926, "learning_rate": 2.592829705505762e-05, "epoch": 0.05186655567650637, "step": 810 }, { "loss": 2.5093, "grad_norm": 2.0620675086975098, "learning_rate": 2.6088348271446867e-05, "epoch": 0.05218671960043542, "step": 815 }, { "loss": 2.4999, "grad_norm": 1.9995859861373901, "learning_rate": 2.6248399487836105e-05, "epoch": 0.05250688352436447, "step": 820 }, { "loss": 2.5001, "grad_norm": 2.050431489944458, "learning_rate": 2.640845070422535e-05, "epoch": 0.05282704744829353, "step": 825 }, { "loss": 2.5435, "grad_norm": 2.142716646194458, "learning_rate": 2.6568501920614595e-05, "epoch": 0.05314721137222258, "step": 830 }, { "loss": 2.5513, "grad_norm": 2.053705930709839, "learning_rate": 2.6728553137003844e-05, "epoch": 0.05346737529615163, "step": 835 }, { "loss": 2.5534, "grad_norm": 1.9456514120101929, "learning_rate": 2.688860435339309e-05, "epoch": 0.05378753922008068, "step": 840 }, { "loss": 2.5351, "grad_norm": 2.099213123321533, "learning_rate": 2.7048655569782334e-05, "epoch": 0.054107703144009735, "step": 845 }, { "loss": 2.5077, "grad_norm": 2.0750980377197266, "learning_rate": 2.720870678617158e-05, "epoch": 0.054427867067938784, "step": 850 }, { "loss": 2.4956, "grad_norm": 2.1118557453155518, "learning_rate": 2.7368758002560817e-05, "epoch": 0.054748030991867834, "step": 855 }, { "loss": 2.5411, "grad_norm": 2.021570920944214, "learning_rate": 2.7528809218950063e-05, "epoch": 0.05506819491579689, "step": 860 }, { "loss": 2.5294, "grad_norm": 2.095647096633911, "learning_rate": 2.7688860435339308e-05, "epoch": 0.05538835883972594, "step": 865 }, { "loss": 2.5224, "grad_norm": 2.172281503677368, "learning_rate": 2.7848911651728553e-05, "epoch": 0.05570852276365499, "step": 870 }, { "loss": 2.51, "grad_norm": 2.0782620906829834, "learning_rate": 2.8008962868117798e-05, "epoch": 0.056028686687584046, "step": 875 }, { "loss": 2.5158, "grad_norm": 2.1138572692871094, "learning_rate": 2.8169014084507046e-05, "epoch": 0.056348850611513096, "step": 880 }, { "loss": 2.5186, "grad_norm": 1.9800282716751099, "learning_rate": 2.832906530089629e-05, "epoch": 0.056669014535442146, "step": 885 }, { "loss": 2.4717, "grad_norm": 2.043164014816284, "learning_rate": 2.848911651728553e-05, "epoch": 0.056989178459371195, "step": 890 }, { "loss": 2.5061, "grad_norm": 2.0079712867736816, "learning_rate": 2.8649167733674775e-05, "epoch": 0.05730934238330025, "step": 895 }, { "loss": 2.4861, "grad_norm": 2.0949175357818604, "learning_rate": 2.880921895006402e-05, "epoch": 0.0576295063072293, "step": 900 }, { "loss": 2.5082, "grad_norm": 2.1614928245544434, "learning_rate": 2.8969270166453265e-05, "epoch": 0.05794967023115835, "step": 905 }, { "loss": 2.5003, "grad_norm": 2.0658435821533203, "learning_rate": 2.912932138284251e-05, "epoch": 0.05826983415508741, "step": 910 }, { "loss": 2.5126, "grad_norm": 2.056453227996826, "learning_rate": 2.9289372599231756e-05, "epoch": 0.05858999807901646, "step": 915 }, { "loss": 2.5079, "grad_norm": 2.0968542098999023, "learning_rate": 2.9449423815621e-05, "epoch": 0.05891016200294551, "step": 920 }, { "loss": 2.5189, "grad_norm": 2.0200068950653076, "learning_rate": 2.9609475032010242e-05, "epoch": 0.05923032592687456, "step": 925 }, { "loss": 2.531, "grad_norm": 2.081430435180664, "learning_rate": 2.9769526248399488e-05, "epoch": 0.05955048985080361, "step": 930 }, { "loss": 2.4866, "grad_norm": 2.0429458618164062, "learning_rate": 2.9929577464788733e-05, "epoch": 0.05987065377473266, "step": 935 }, { "loss": 2.51, "grad_norm": 2.0854263305664062, "learning_rate": 3.0089628681177978e-05, "epoch": 0.06019081769866171, "step": 940 }, { "loss": 2.5245, "grad_norm": 2.192448854446411, "learning_rate": 3.0249679897567223e-05, "epoch": 0.06051098162259077, "step": 945 }, { "loss": 2.504, "grad_norm": 2.0920021533966064, "learning_rate": 3.0409731113956468e-05, "epoch": 0.06083114554651982, "step": 950 }, { "loss": 2.4903, "grad_norm": 2.0515296459198, "learning_rate": 3.056978233034571e-05, "epoch": 0.06115130947044887, "step": 955 }, { "loss": 2.5079, "grad_norm": 2.128044605255127, "learning_rate": 3.0729833546734955e-05, "epoch": 0.061471473394377925, "step": 960 }, { "loss": 2.4882, "grad_norm": 2.0168440341949463, "learning_rate": 3.0889884763124197e-05, "epoch": 0.061791637318306974, "step": 965 }, { "loss": 2.5079, "grad_norm": 2.090272903442383, "learning_rate": 3.1049935979513445e-05, "epoch": 0.062111801242236024, "step": 970 }, { "loss": 2.5153, "grad_norm": 2.0438127517700195, "learning_rate": 3.120998719590269e-05, "epoch": 0.062431965166165074, "step": 975 }, { "loss": 2.4814, "grad_norm": 2.051866292953491, "learning_rate": 3.1370038412291935e-05, "epoch": 0.06275212909009413, "step": 980 }, { "loss": 2.5105, "grad_norm": 2.1288902759552, "learning_rate": 3.1530089628681184e-05, "epoch": 0.06307229301402317, "step": 985 }, { "loss": 2.5006, "grad_norm": 2.1590287685394287, "learning_rate": 3.1690140845070426e-05, "epoch": 0.06339245693795223, "step": 990 }, { "loss": 2.5005, "grad_norm": 2.182297945022583, "learning_rate": 3.185019206145967e-05, "epoch": 0.06371262086188129, "step": 995 }, { "loss": 2.5104, "grad_norm": 1.9559736251831055, "learning_rate": 3.201024327784891e-05, "epoch": 0.06403278478581033, "step": 1000 }, { "eval_loss": 2.3531105518341064, "eval_runtime": 14.4683, "eval_samples_per_second": 141.551, "eval_steps_per_second": 17.694, "epoch": 0.06403278478581033, "step": 1000 }, { "loss": 2.4771, "grad_norm": 2.445621967315674, "learning_rate": 3.217029449423816e-05, "epoch": 0.06435294870973939, "step": 1005 }, { "loss": 2.4956, "grad_norm": 2.1934397220611572, "learning_rate": 3.23303457106274e-05, "epoch": 0.06467311263366844, "step": 1010 }, { "loss": 2.4993, "grad_norm": 2.0829265117645264, "learning_rate": 3.249039692701665e-05, "epoch": 0.06499327655759748, "step": 1015 }, { "loss": 2.5011, "grad_norm": 2.163093328475952, "learning_rate": 3.265044814340589e-05, "epoch": 0.06531344048152654, "step": 1020 }, { "loss": 2.4802, "grad_norm": 2.2062385082244873, "learning_rate": 3.281049935979514e-05, "epoch": 0.0656336044054556, "step": 1025 }, { "loss": 2.5182, "grad_norm": 2.199197769165039, "learning_rate": 3.297055057618438e-05, "epoch": 0.06595376832938464, "step": 1030 }, { "loss": 2.4656, "grad_norm": 2.0707991123199463, "learning_rate": 3.313060179257362e-05, "epoch": 0.0662739322533137, "step": 1035 }, { "loss": 2.5002, "grad_norm": 2.182140588760376, "learning_rate": 3.329065300896287e-05, "epoch": 0.06659409617724275, "step": 1040 }, { "loss": 2.5177, "grad_norm": 1.9117063283920288, "learning_rate": 3.345070422535211e-05, "epoch": 0.0669142601011718, "step": 1045 }, { "loss": 2.4955, "grad_norm": 2.077578067779541, "learning_rate": 3.361075544174136e-05, "epoch": 0.06723442402510085, "step": 1050 }, { "loss": 2.5032, "grad_norm": 2.046825408935547, "learning_rate": 3.37708066581306e-05, "epoch": 0.06755458794902991, "step": 1055 }, { "loss": 2.51, "grad_norm": 2.127065420150757, "learning_rate": 3.393085787451985e-05, "epoch": 0.06787475187295895, "step": 1060 }, { "loss": 2.4983, "grad_norm": 2.076838493347168, "learning_rate": 3.409090909090909e-05, "epoch": 0.06819491579688801, "step": 1065 }, { "loss": 2.4759, "grad_norm": 1.964388370513916, "learning_rate": 3.4250960307298334e-05, "epoch": 0.06851507972081705, "step": 1070 }, { "loss": 2.4878, "grad_norm": 2.0563740730285645, "learning_rate": 3.441101152368758e-05, "epoch": 0.06883524364474611, "step": 1075 }, { "loss": 2.5031, "grad_norm": 2.236644744873047, "learning_rate": 3.4571062740076824e-05, "epoch": 0.06915540756867516, "step": 1080 }, { "loss": 2.5151, "grad_norm": 2.1143178939819336, "learning_rate": 3.473111395646607e-05, "epoch": 0.06947557149260421, "step": 1085 }, { "loss": 2.4976, "grad_norm": 2.1225712299346924, "learning_rate": 3.4891165172855314e-05, "epoch": 0.06979573541653326, "step": 1090 }, { "loss": 2.4847, "grad_norm": 2.148134231567383, "learning_rate": 3.505121638924456e-05, "epoch": 0.07011589934046232, "step": 1095 }, { "loss": 2.5044, "grad_norm": 2.070889949798584, "learning_rate": 3.5211267605633805e-05, "epoch": 0.07043606326439136, "step": 1100 }, { "loss": 2.4932, "grad_norm": 2.0513927936553955, "learning_rate": 3.5371318822023046e-05, "epoch": 0.07075622718832042, "step": 1105 }, { "loss": 2.4801, "grad_norm": 2.2668299674987793, "learning_rate": 3.5531370038412295e-05, "epoch": 0.07107639111224948, "step": 1110 }, { "loss": 2.4892, "grad_norm": 2.1977014541625977, "learning_rate": 3.569142125480154e-05, "epoch": 0.07139655503617852, "step": 1115 }, { "loss": 2.4956, "grad_norm": 2.03102707862854, "learning_rate": 3.5851472471190785e-05, "epoch": 0.07171671896010758, "step": 1120 }, { "loss": 2.4646, "grad_norm": 2.058772325515747, "learning_rate": 3.601152368758003e-05, "epoch": 0.07203688288403663, "step": 1125 }, { "loss": 2.4888, "grad_norm": 2.179579019546509, "learning_rate": 3.6171574903969275e-05, "epoch": 0.07235704680796567, "step": 1130 }, { "loss": 2.4983, "grad_norm": 2.1978416442871094, "learning_rate": 3.633162612035852e-05, "epoch": 0.07267721073189473, "step": 1135 }, { "loss": 2.4855, "grad_norm": 1.9466743469238281, "learning_rate": 3.649167733674776e-05, "epoch": 0.07299737465582379, "step": 1140 }, { "loss": 2.4661, "grad_norm": 2.139958143234253, "learning_rate": 3.665172855313701e-05, "epoch": 0.07331753857975283, "step": 1145 }, { "loss": 2.4822, "grad_norm": 2.0397911071777344, "learning_rate": 3.681177976952625e-05, "epoch": 0.07363770250368189, "step": 1150 }, { "loss": 2.5217, "grad_norm": 2.008603811264038, "learning_rate": 3.69718309859155e-05, "epoch": 0.07395786642761093, "step": 1155 }, { "loss": 2.4926, "grad_norm": 2.2945339679718018, "learning_rate": 3.713188220230474e-05, "epoch": 0.07427803035153999, "step": 1160 }, { "loss": 2.5163, "grad_norm": 2.1897239685058594, "learning_rate": 3.729193341869399e-05, "epoch": 0.07459819427546904, "step": 1165 }, { "loss": 2.4827, "grad_norm": 2.0864007472991943, "learning_rate": 3.745198463508322e-05, "epoch": 0.07491835819939809, "step": 1170 }, { "loss": 2.4885, "grad_norm": 2.1951565742492676, "learning_rate": 3.761203585147247e-05, "epoch": 0.07523852212332714, "step": 1175 }, { "loss": 2.4403, "grad_norm": 2.176609516143799, "learning_rate": 3.777208706786172e-05, "epoch": 0.0755586860472562, "step": 1180 }, { "loss": 2.4691, "grad_norm": 2.2460696697235107, "learning_rate": 3.793213828425096e-05, "epoch": 0.07587884997118524, "step": 1185 }, { "loss": 2.4858, "grad_norm": 2.059447765350342, "learning_rate": 3.809218950064021e-05, "epoch": 0.0761990138951143, "step": 1190 }, { "loss": 2.4878, "grad_norm": 2.062699794769287, "learning_rate": 3.825224071702945e-05, "epoch": 0.07651917781904335, "step": 1195 }, { "loss": 2.483, "grad_norm": 2.070650815963745, "learning_rate": 3.84122919334187e-05, "epoch": 0.0768393417429724, "step": 1200 }, { "eval_loss": 2.3345463275909424, "eval_runtime": 11.3977, "eval_samples_per_second": 179.686, "eval_steps_per_second": 22.461, "epoch": 0.0768393417429724, "step": 1200 }, { "loss": 2.4838, "grad_norm": 2.119915723800659, "learning_rate": 3.8572343149807935e-05, "epoch": 0.07715950566690145, "step": 1205 }, { "loss": 2.4591, "grad_norm": 2.0714828968048096, "learning_rate": 3.8732394366197184e-05, "epoch": 0.07747966959083051, "step": 1210 }, { "loss": 2.472, "grad_norm": 2.186169385910034, "learning_rate": 3.8892445582586426e-05, "epoch": 0.07779983351475955, "step": 1215 }, { "loss": 2.4773, "grad_norm": 2.181817054748535, "learning_rate": 3.9052496798975674e-05, "epoch": 0.07811999743868861, "step": 1220 }, { "loss": 2.4565, "grad_norm": 2.1009023189544678, "learning_rate": 3.921254801536492e-05, "epoch": 0.07844016136261767, "step": 1225 }, { "loss": 2.4827, "grad_norm": 1.9727615118026733, "learning_rate": 3.9372599231754164e-05, "epoch": 0.07876032528654671, "step": 1230 }, { "loss": 2.494, "grad_norm": 2.114440679550171, "learning_rate": 3.953265044814341e-05, "epoch": 0.07908048921047577, "step": 1235 }, { "loss": 2.4776, "grad_norm": 2.6610660552978516, "learning_rate": 3.969270166453265e-05, "epoch": 0.07940065313440482, "step": 1240 }, { "loss": 2.4643, "grad_norm": 2.1695549488067627, "learning_rate": 3.9852752880921896e-05, "epoch": 0.07972081705833386, "step": 1245 }, { "loss": 2.4708, "grad_norm": 2.2169156074523926, "learning_rate": 4.001280409731114e-05, "epoch": 0.08004098098226292, "step": 1250 }, { "loss": 2.4964, "grad_norm": 2.0154330730438232, "learning_rate": 4.0172855313700387e-05, "epoch": 0.08036114490619196, "step": 1255 }, { "loss": 2.4564, "grad_norm": 2.2851929664611816, "learning_rate": 4.033290653008963e-05, "epoch": 0.08068130883012102, "step": 1260 }, { "loss": 2.4764, "grad_norm": 2.229935646057129, "learning_rate": 4.049295774647888e-05, "epoch": 0.08100147275405008, "step": 1265 }, { "loss": 2.4936, "grad_norm": 2.0593361854553223, "learning_rate": 4.0653008962868125e-05, "epoch": 0.08132163667797912, "step": 1270 }, { "loss": 2.4756, "grad_norm": 1.9973433017730713, "learning_rate": 4.081306017925736e-05, "epoch": 0.08164180060190818, "step": 1275 }, { "loss": 2.4594, "grad_norm": 2.005742073059082, "learning_rate": 4.097311139564661e-05, "epoch": 0.08196196452583723, "step": 1280 }, { "loss": 2.4543, "grad_norm": 2.015453577041626, "learning_rate": 4.113316261203585e-05, "epoch": 0.08228212844976628, "step": 1285 }, { "loss": 2.4474, "grad_norm": 2.0268640518188477, "learning_rate": 4.12932138284251e-05, "epoch": 0.08260229237369533, "step": 1290 }, { "loss": 2.4646, "grad_norm": 2.077282428741455, "learning_rate": 4.145326504481434e-05, "epoch": 0.08292245629762439, "step": 1295 }, { "loss": 2.4894, "grad_norm": 2.0141093730926514, "learning_rate": 4.161331626120359e-05, "epoch": 0.08324262022155343, "step": 1300 }, { "loss": 2.46, "grad_norm": 2.0732734203338623, "learning_rate": 4.177336747759283e-05, "epoch": 0.08356278414548249, "step": 1305 }, { "loss": 2.4604, "grad_norm": 2.012782573699951, "learning_rate": 4.193341869398207e-05, "epoch": 0.08388294806941154, "step": 1310 }, { "loss": 2.439, "grad_norm": 2.024240255355835, "learning_rate": 4.209346991037132e-05, "epoch": 0.08420311199334059, "step": 1315 }, { "loss": 2.4635, "grad_norm": 2.0818872451782227, "learning_rate": 4.225352112676056e-05, "epoch": 0.08452327591726964, "step": 1320 }, { "loss": 2.4801, "grad_norm": 2.048849105834961, "learning_rate": 4.241357234314981e-05, "epoch": 0.0848434398411987, "step": 1325 }, { "loss": 2.4625, "grad_norm": 2.284207344055176, "learning_rate": 4.257362355953905e-05, "epoch": 0.08516360376512774, "step": 1330 }, { "loss": 2.4771, "grad_norm": 2.018928050994873, "learning_rate": 4.27336747759283e-05, "epoch": 0.0854837676890568, "step": 1335 }, { "loss": 2.4864, "grad_norm": 2.1332316398620605, "learning_rate": 4.2893725992317543e-05, "epoch": 0.08580393161298584, "step": 1340 }, { "loss": 2.4726, "grad_norm": 2.0433480739593506, "learning_rate": 4.3053777208706785e-05, "epoch": 0.0861240955369149, "step": 1345 }, { "loss": 2.4597, "grad_norm": 2.058560609817505, "learning_rate": 4.3213828425096034e-05, "epoch": 0.08644425946084396, "step": 1350 }, { "loss": 2.4773, "grad_norm": 2.096250534057617, "learning_rate": 4.3373879641485275e-05, "epoch": 0.086764423384773, "step": 1355 }, { "loss": 2.4619, "grad_norm": 2.168686866760254, "learning_rate": 4.3533930857874524e-05, "epoch": 0.08708458730870205, "step": 1360 }, { "loss": 2.4256, "grad_norm": 2.0486621856689453, "learning_rate": 4.3693982074263766e-05, "epoch": 0.08740475123263111, "step": 1365 }, { "loss": 2.4488, "grad_norm": 2.1706786155700684, "learning_rate": 4.3854033290653014e-05, "epoch": 0.08772491515656015, "step": 1370 }, { "loss": 2.4556, "grad_norm": 1.9638718366622925, "learning_rate": 4.4014084507042256e-05, "epoch": 0.08804507908048921, "step": 1375 }, { "loss": 2.4687, "grad_norm": 2.0920019149780273, "learning_rate": 4.41741357234315e-05, "epoch": 0.08836524300441827, "step": 1380 }, { "loss": 2.4431, "grad_norm": 2.1053900718688965, "learning_rate": 4.4334186939820746e-05, "epoch": 0.08868540692834731, "step": 1385 }, { "loss": 2.4907, "grad_norm": 2.1533970832824707, "learning_rate": 4.449423815620999e-05, "epoch": 0.08900557085227637, "step": 1390 }, { "loss": 2.4612, "grad_norm": 2.0936789512634277, "learning_rate": 4.4654289372599236e-05, "epoch": 0.08932573477620542, "step": 1395 }, { "loss": 2.4923, "grad_norm": 2.1903157234191895, "learning_rate": 4.481434058898848e-05, "epoch": 0.08964589870013447, "step": 1400 }, { "eval_loss": 2.323389768600464, "eval_runtime": 12.3833, "eval_samples_per_second": 165.383, "eval_steps_per_second": 20.673, "epoch": 0.08964589870013447, "step": 1400 }, { "loss": 2.4615, "grad_norm": 2.1091010570526123, "learning_rate": 4.4974391805377727e-05, "epoch": 0.08996606262406352, "step": 1405 }, { "loss": 2.4428, "grad_norm": 2.227038621902466, "learning_rate": 4.513444302176697e-05, "epoch": 0.09028622654799258, "step": 1410 }, { "loss": 2.4297, "grad_norm": 2.059403419494629, "learning_rate": 4.529449423815621e-05, "epoch": 0.09060639047192162, "step": 1415 }, { "loss": 2.4364, "grad_norm": 2.005385398864746, "learning_rate": 4.545454545454546e-05, "epoch": 0.09092655439585068, "step": 1420 }, { "loss": 2.4764, "grad_norm": 2.142878532409668, "learning_rate": 4.56145966709347e-05, "epoch": 0.09124671831977972, "step": 1425 }, { "loss": 2.4698, "grad_norm": 2.005213499069214, "learning_rate": 4.577464788732395e-05, "epoch": 0.09156688224370878, "step": 1430 }, { "loss": 2.4566, "grad_norm": 2.094695568084717, "learning_rate": 4.593469910371319e-05, "epoch": 0.09188704616763783, "step": 1435 }, { "loss": 2.4363, "grad_norm": 2.0781939029693604, "learning_rate": 4.609475032010244e-05, "epoch": 0.09220721009156688, "step": 1440 }, { "loss": 2.4561, "grad_norm": 2.1999306678771973, "learning_rate": 4.625480153649168e-05, "epoch": 0.09252737401549593, "step": 1445 }, { "loss": 2.456, "grad_norm": 2.0999979972839355, "learning_rate": 4.641485275288092e-05, "epoch": 0.09284753793942499, "step": 1450 }, { "loss": 2.4783, "grad_norm": 2.1072137355804443, "learning_rate": 4.6574903969270164e-05, "epoch": 0.09316770186335403, "step": 1455 }, { "loss": 2.4956, "grad_norm": 1.932655692100525, "learning_rate": 4.673495518565941e-05, "epoch": 0.09348786578728309, "step": 1460 }, { "loss": 2.49, "grad_norm": 2.29823637008667, "learning_rate": 4.689500640204866e-05, "epoch": 0.09380802971121215, "step": 1465 }, { "loss": 2.4669, "grad_norm": 2.6139848232269287, "learning_rate": 4.70550576184379e-05, "epoch": 0.09412819363514119, "step": 1470 }, { "loss": 2.4432, "grad_norm": 2.2109243869781494, "learning_rate": 4.721510883482715e-05, "epoch": 0.09444835755907025, "step": 1475 }, { "loss": 2.4578, "grad_norm": 2.114405870437622, "learning_rate": 4.737516005121639e-05, "epoch": 0.0947685214829993, "step": 1480 }, { "loss": 2.4617, "grad_norm": 2.0485222339630127, "learning_rate": 4.7535211267605635e-05, "epoch": 0.09508868540692834, "step": 1485 }, { "loss": 2.4296, "grad_norm": 2.0749807357788086, "learning_rate": 4.769526248399488e-05, "epoch": 0.0954088493308574, "step": 1490 }, { "loss": 2.4569, "grad_norm": 2.2009363174438477, "learning_rate": 4.7855313700384125e-05, "epoch": 0.09572901325478646, "step": 1495 }, { "loss": 2.4203, "grad_norm": 2.4653289318084717, "learning_rate": 4.801536491677337e-05, "epoch": 0.0960491771787155, "step": 1500 }, { "loss": 2.445, "grad_norm": 2.145634889602661, "learning_rate": 4.8175416133162615e-05, "epoch": 0.09636934110264456, "step": 1505 }, { "loss": 2.4232, "grad_norm": 2.214996337890625, "learning_rate": 4.8335467349551864e-05, "epoch": 0.0966895050265736, "step": 1510 }, { "loss": 2.5058, "grad_norm": 2.039727210998535, "learning_rate": 4.8495518565941106e-05, "epoch": 0.09700966895050266, "step": 1515 }, { "loss": 2.4503, "grad_norm": 2.134812593460083, "learning_rate": 4.865556978233035e-05, "epoch": 0.09732983287443171, "step": 1520 }, { "loss": 2.4561, "grad_norm": 2.0128939151763916, "learning_rate": 4.881562099871959e-05, "epoch": 0.09764999679836076, "step": 1525 }, { "loss": 2.4869, "grad_norm": 1.9132862091064453, "learning_rate": 4.897567221510884e-05, "epoch": 0.09797016072228981, "step": 1530 }, { "loss": 2.4637, "grad_norm": 2.2746827602386475, "learning_rate": 4.913572343149808e-05, "epoch": 0.09829032464621887, "step": 1535 }, { "loss": 2.4481, "grad_norm": 2.09806489944458, "learning_rate": 4.929577464788733e-05, "epoch": 0.09861048857014791, "step": 1540 }, { "loss": 2.4761, "grad_norm": 2.1433379650115967, "learning_rate": 4.945582586427657e-05, "epoch": 0.09893065249407697, "step": 1545 }, { "loss": 2.4404, "grad_norm": 2.127873659133911, "learning_rate": 4.961587708066582e-05, "epoch": 0.09925081641800602, "step": 1550 }, { "loss": 2.4531, "grad_norm": 2.067396879196167, "learning_rate": 4.977592829705506e-05, "epoch": 0.09957098034193507, "step": 1555 }, { "loss": 2.4531, "grad_norm": 2.0335302352905273, "learning_rate": 4.99359795134443e-05, "epoch": 0.09989114426586412, "step": 1560 }, { "loss": 2.456, "grad_norm": 2.0103564262390137, "learning_rate": 5e-05, "epoch": 0.10021130818979318, "step": 1565 }, { "loss": 2.4837, "grad_norm": 1.9280204772949219, "learning_rate": 5e-05, "epoch": 0.10053147211372222, "step": 1570 }, { "loss": 2.4348, "grad_norm": 2.0677709579467773, "learning_rate": 5e-05, "epoch": 0.10085163603765128, "step": 1575 }, { "loss": 2.4423, "grad_norm": 2.088454484939575, "learning_rate": 5e-05, "epoch": 0.10117179996158034, "step": 1580 }, { "loss": 2.4472, "grad_norm": 2.0001513957977295, "learning_rate": 5e-05, "epoch": 0.10149196388550938, "step": 1585 }, { "loss": 2.4643, "grad_norm": 2.0158650875091553, "learning_rate": 5e-05, "epoch": 0.10181212780943844, "step": 1590 }, { "loss": 2.4517, "grad_norm": 2.062638282775879, "learning_rate": 5e-05, "epoch": 0.10213229173336748, "step": 1595 }, { "loss": 2.4274, "grad_norm": 2.11297345161438, "learning_rate": 5e-05, "epoch": 0.10245245565729653, "step": 1600 }, { "eval_loss": 2.3070931434631348, "eval_runtime": 9.4351, "eval_samples_per_second": 217.062, "eval_steps_per_second": 27.133, "epoch": 0.10245245565729653, "step": 1600 }, { "loss": 2.4551, "grad_norm": 2.169626235961914, "learning_rate": 5e-05, "epoch": 0.10277261958122559, "step": 1605 }, { "loss": 2.4464, "grad_norm": 2.102466344833374, "learning_rate": 5e-05, "epoch": 0.10309278350515463, "step": 1610 }, { "loss": 2.438, "grad_norm": 1.9966940879821777, "learning_rate": 5e-05, "epoch": 0.10341294742908369, "step": 1615 }, { "loss": 2.4323, "grad_norm": 2.103325605392456, "learning_rate": 5e-05, "epoch": 0.10373311135301275, "step": 1620 }, { "loss": 2.4293, "grad_norm": 2.05993390083313, "learning_rate": 5e-05, "epoch": 0.10405327527694179, "step": 1625 }, { "loss": 2.428, "grad_norm": 1.9764646291732788, "learning_rate": 5e-05, "epoch": 0.10437343920087085, "step": 1630 }, { "loss": 2.4515, "grad_norm": 1.9260586500167847, "learning_rate": 5e-05, "epoch": 0.1046936031247999, "step": 1635 }, { "loss": 2.4378, "grad_norm": 1.9698050022125244, "learning_rate": 5e-05, "epoch": 0.10501376704872895, "step": 1640 }, { "loss": 2.4413, "grad_norm": 2.1451985836029053, "learning_rate": 5e-05, "epoch": 0.105333930972658, "step": 1645 }, { "loss": 2.4739, "grad_norm": 2.0343995094299316, "learning_rate": 5e-05, "epoch": 0.10565409489658706, "step": 1650 }, { "loss": 2.4383, "grad_norm": 2.035264253616333, "learning_rate": 5e-05, "epoch": 0.1059742588205161, "step": 1655 }, { "loss": 2.412, "grad_norm": 1.9604747295379639, "learning_rate": 5e-05, "epoch": 0.10629442274444516, "step": 1660 }, { "loss": 2.4449, "grad_norm": 2.0956430435180664, "learning_rate": 5e-05, "epoch": 0.10661458666837421, "step": 1665 }, { "loss": 2.4662, "grad_norm": 2.05611252784729, "learning_rate": 5e-05, "epoch": 0.10693475059230326, "step": 1670 }, { "loss": 2.4423, "grad_norm": 2.157836437225342, "learning_rate": 5e-05, "epoch": 0.10725491451623231, "step": 1675 }, { "loss": 2.4285, "grad_norm": 1.9412627220153809, "learning_rate": 5e-05, "epoch": 0.10757507844016136, "step": 1680 }, { "loss": 2.4448, "grad_norm": 2.1207661628723145, "learning_rate": 5e-05, "epoch": 0.10789524236409041, "step": 1685 }, { "loss": 2.4737, "grad_norm": 2.0780351161956787, "learning_rate": 5e-05, "epoch": 0.10821540628801947, "step": 1690 }, { "loss": 2.4195, "grad_norm": 1.9629524946212769, "learning_rate": 5e-05, "epoch": 0.10853557021194851, "step": 1695 }, { "loss": 2.4542, "grad_norm": 2.141195774078369, "learning_rate": 5e-05, "epoch": 0.10885573413587757, "step": 1700 }, { "loss": 2.4617, "grad_norm": 2.042581081390381, "learning_rate": 5e-05, "epoch": 0.10917589805980663, "step": 1705 }, { "loss": 2.4604, "grad_norm": 2.0919344425201416, "learning_rate": 5e-05, "epoch": 0.10949606198373567, "step": 1710 }, { "loss": 2.438, "grad_norm": 2.0356478691101074, "learning_rate": 5e-05, "epoch": 0.10981622590766472, "step": 1715 }, { "loss": 2.4202, "grad_norm": 1.9953988790512085, "learning_rate": 5e-05, "epoch": 0.11013638983159378, "step": 1720 }, { "loss": 2.4372, "grad_norm": 2.033454656600952, "learning_rate": 5e-05, "epoch": 0.11045655375552282, "step": 1725 }, { "loss": 2.4564, "grad_norm": 1.898619532585144, "learning_rate": 5e-05, "epoch": 0.11077671767945188, "step": 1730 }, { "loss": 2.4104, "grad_norm": 1.9848005771636963, "learning_rate": 5e-05, "epoch": 0.11109688160338094, "step": 1735 }, { "loss": 2.4339, "grad_norm": 2.142657518386841, "learning_rate": 5e-05, "epoch": 0.11141704552730998, "step": 1740 }, { "loss": 2.4583, "grad_norm": 1.9848843812942505, "learning_rate": 5e-05, "epoch": 0.11173720945123904, "step": 1745 }, { "loss": 2.4431, "grad_norm": 1.9729015827178955, "learning_rate": 5e-05, "epoch": 0.11205737337516809, "step": 1750 }, { "loss": 2.4312, "grad_norm": 8.698593139648438, "learning_rate": 5e-05, "epoch": 0.11237753729909714, "step": 1755 }, { "loss": 2.4592, "grad_norm": 2.070530414581299, "learning_rate": 5e-05, "epoch": 0.11269770122302619, "step": 1760 }, { "loss": 2.4533, "grad_norm": 2.034292697906494, "learning_rate": 5e-05, "epoch": 0.11301786514695523, "step": 1765 }, { "loss": 2.4622, "grad_norm": 2.0932867527008057, "learning_rate": 5e-05, "epoch": 0.11333802907088429, "step": 1770 }, { "loss": 2.4165, "grad_norm": 1.9923781156539917, "learning_rate": 5e-05, "epoch": 0.11365819299481335, "step": 1775 }, { "loss": 2.4384, "grad_norm": 2.063328504562378, "learning_rate": 5e-05, "epoch": 0.11397835691874239, "step": 1780 }, { "loss": 2.4038, "grad_norm": 2.021510124206543, "learning_rate": 5e-05, "epoch": 0.11429852084267145, "step": 1785 }, { "loss": 2.4164, "grad_norm": 1.953681468963623, "learning_rate": 5e-05, "epoch": 0.1146186847666005, "step": 1790 }, { "loss": 2.4103, "grad_norm": 1.976102590560913, "learning_rate": 5e-05, "epoch": 0.11493884869052955, "step": 1795 }, { "loss": 2.4182, "grad_norm": 1.886995792388916, "learning_rate": 5e-05, "epoch": 0.1152590126144586, "step": 1800 }, { "eval_loss": 2.283639430999756, "eval_runtime": 12.6341, "eval_samples_per_second": 162.101, "eval_steps_per_second": 20.263, "epoch": 0.1152590126144586, "step": 1800 }, { "loss": 2.4338, "grad_norm": 2.145838975906372, "learning_rate": 5e-05, "epoch": 0.11557917653838766, "step": 1805 }, { "loss": 2.4621, "grad_norm": 2.145569324493408, "learning_rate": 5e-05, "epoch": 0.1158993404623167, "step": 1810 }, { "loss": 2.4088, "grad_norm": 2.0806474685668945, "learning_rate": 5e-05, "epoch": 0.11621950438624576, "step": 1815 }, { "loss": 2.4203, "grad_norm": 2.0347843170166016, "learning_rate": 5e-05, "epoch": 0.11653966831017482, "step": 1820 }, { "loss": 2.4236, "grad_norm": 1.9410957098007202, "learning_rate": 5e-05, "epoch": 0.11685983223410386, "step": 1825 }, { "loss": 2.4548, "grad_norm": 1.9339467287063599, "learning_rate": 5e-05, "epoch": 0.11717999615803291, "step": 1830 }, { "loss": 2.4241, "grad_norm": 2.0596871376037598, "learning_rate": 5e-05, "epoch": 0.11750016008196197, "step": 1835 }, { "loss": 2.4443, "grad_norm": 2.01784610748291, "learning_rate": 5e-05, "epoch": 0.11782032400589101, "step": 1840 }, { "loss": 2.4297, "grad_norm": 1.915007472038269, "learning_rate": 5e-05, "epoch": 0.11814048792982007, "step": 1845 }, { "loss": 2.412, "grad_norm": 2.025275945663452, "learning_rate": 5e-05, "epoch": 0.11846065185374911, "step": 1850 }, { "loss": 2.4118, "grad_norm": 1.9844189882278442, "learning_rate": 5e-05, "epoch": 0.11878081577767817, "step": 1855 }, { "loss": 2.4055, "grad_norm": 2.0724167823791504, "learning_rate": 5e-05, "epoch": 0.11910097970160723, "step": 1860 }, { "loss": 2.4241, "grad_norm": 2.0442521572113037, "learning_rate": 5e-05, "epoch": 0.11942114362553627, "step": 1865 }, { "loss": 2.4462, "grad_norm": 1.9685866832733154, "learning_rate": 5e-05, "epoch": 0.11974130754946533, "step": 1870 }, { "loss": 2.4232, "grad_norm": 1.9572803974151611, "learning_rate": 5e-05, "epoch": 0.12006147147339438, "step": 1875 }, { "loss": 2.4565, "grad_norm": 2.105123519897461, "learning_rate": 5e-05, "epoch": 0.12038163539732342, "step": 1880 }, { "loss": 2.4146, "grad_norm": 2.017563581466675, "learning_rate": 5e-05, "epoch": 0.12070179932125248, "step": 1885 }, { "loss": 2.4295, "grad_norm": 1.9783453941345215, "learning_rate": 5e-05, "epoch": 0.12102196324518154, "step": 1890 }, { "loss": 2.4148, "grad_norm": 2.017634868621826, "learning_rate": 5e-05, "epoch": 0.12134212716911058, "step": 1895 }, { "loss": 2.4214, "grad_norm": 2.16438627243042, "learning_rate": 5e-05, "epoch": 0.12166229109303964, "step": 1900 }, { "loss": 2.42, "grad_norm": 1.980455994606018, "learning_rate": 5e-05, "epoch": 0.1219824550169687, "step": 1905 }, { "loss": 2.3945, "grad_norm": 1.9849518537521362, "learning_rate": 5e-05, "epoch": 0.12230261894089774, "step": 1910 }, { "loss": 2.459, "grad_norm": 1.9081141948699951, "learning_rate": 5e-05, "epoch": 0.1226227828648268, "step": 1915 }, { "loss": 2.4555, "grad_norm": 1.9519824981689453, "learning_rate": 5e-05, "epoch": 0.12294294678875585, "step": 1920 }, { "loss": 2.4165, "grad_norm": 1.8904905319213867, "learning_rate": 5e-05, "epoch": 0.12326311071268489, "step": 1925 }, { "loss": 2.3908, "grad_norm": 2.1023762226104736, "learning_rate": 5e-05, "epoch": 0.12358327463661395, "step": 1930 }, { "loss": 2.4155, "grad_norm": 1.937259554862976, "learning_rate": 5e-05, "epoch": 0.12390343856054299, "step": 1935 }, { "loss": 2.3992, "grad_norm": 2.0456533432006836, "learning_rate": 5e-05, "epoch": 0.12422360248447205, "step": 1940 }, { "loss": 2.4287, "grad_norm": 2.0166923999786377, "learning_rate": 5e-05, "epoch": 0.1245437664084011, "step": 1945 }, { "loss": 2.4264, "grad_norm": 2.064141273498535, "learning_rate": 5e-05, "epoch": 0.12486393033233015, "step": 1950 }, { "loss": 2.4268, "grad_norm": 1.9742683172225952, "learning_rate": 5e-05, "epoch": 0.1251840942562592, "step": 1955 }, { "loss": 2.4056, "grad_norm": 1.9377827644348145, "learning_rate": 5e-05, "epoch": 0.12550425818018826, "step": 1960 }, { "loss": 2.4298, "grad_norm": 1.92035710811615, "learning_rate": 5e-05, "epoch": 0.12582442210411732, "step": 1965 }, { "loss": 2.4565, "grad_norm": 1.9152588844299316, "learning_rate": 5e-05, "epoch": 0.12614458602804635, "step": 1970 }, { "loss": 2.4223, "grad_norm": 1.9562370777130127, "learning_rate": 5e-05, "epoch": 0.1264647499519754, "step": 1975 }, { "loss": 2.3804, "grad_norm": 1.9528794288635254, "learning_rate": 5e-05, "epoch": 0.12678491387590446, "step": 1980 }, { "loss": 2.4475, "grad_norm": 1.9754786491394043, "learning_rate": 5e-05, "epoch": 0.12710507779983352, "step": 1985 }, { "loss": 2.4395, "grad_norm": 1.9041407108306885, "learning_rate": 5e-05, "epoch": 0.12742524172376257, "step": 1990 }, { "loss": 2.4235, "grad_norm": 2.0152764320373535, "learning_rate": 5e-05, "epoch": 0.12774540564769163, "step": 1995 }, { "loss": 2.4227, "grad_norm": 1.8605611324310303, "learning_rate": 5e-05, "epoch": 0.12806556957162066, "step": 2000 }, { "eval_loss": 2.2688512802124023, "eval_runtime": 9.5336, "eval_samples_per_second": 214.818, "eval_steps_per_second": 26.852, "epoch": 0.12806556957162066, "step": 2000 }, { "loss": 2.4202, "grad_norm": 2.040646553039551, "learning_rate": 5e-05, "epoch": 0.12838573349554971, "step": 2005 }, { "loss": 2.4003, "grad_norm": 2.212007522583008, "learning_rate": 5e-05, "epoch": 0.12870589741947877, "step": 2010 }, { "loss": 2.4121, "grad_norm": 1.964044451713562, "learning_rate": 5e-05, "epoch": 0.12902606134340783, "step": 2015 }, { "loss": 2.3939, "grad_norm": 1.9541124105453491, "learning_rate": 5e-05, "epoch": 0.12934622526733688, "step": 2020 }, { "loss": 2.4335, "grad_norm": 1.8698004484176636, "learning_rate": 5e-05, "epoch": 0.12966638919126594, "step": 2025 }, { "loss": 2.4246, "grad_norm": 1.956687092781067, "learning_rate": 5e-05, "epoch": 0.12998655311519497, "step": 2030 }, { "loss": 2.4097, "grad_norm": 2.137901544570923, "learning_rate": 5e-05, "epoch": 0.13030671703912403, "step": 2035 }, { "loss": 2.4066, "grad_norm": 1.9962667226791382, "learning_rate": 5e-05, "epoch": 0.13062688096305308, "step": 2040 }, { "loss": 2.404, "grad_norm": 1.9223984479904175, "learning_rate": 5e-05, "epoch": 0.13094704488698214, "step": 2045 }, { "loss": 2.3836, "grad_norm": 1.956534504890442, "learning_rate": 5e-05, "epoch": 0.1312672088109112, "step": 2050 }, { "loss": 2.4043, "grad_norm": 1.9311598539352417, "learning_rate": 5e-05, "epoch": 0.13158737273484022, "step": 2055 }, { "loss": 2.4129, "grad_norm": 2.080878496170044, "learning_rate": 5e-05, "epoch": 0.13190753665876928, "step": 2060 }, { "loss": 2.4115, "grad_norm": 2.0229105949401855, "learning_rate": 5e-05, "epoch": 0.13222770058269834, "step": 2065 }, { "loss": 2.3804, "grad_norm": 2.0059521198272705, "learning_rate": 5e-05, "epoch": 0.1325478645066274, "step": 2070 }, { "loss": 2.402, "grad_norm": 2.0359392166137695, "learning_rate": 5e-05, "epoch": 0.13286802843055645, "step": 2075 }, { "loss": 2.3843, "grad_norm": 1.9484703540802002, "learning_rate": 5e-05, "epoch": 0.1331881923544855, "step": 2080 }, { "loss": 2.4186, "grad_norm": 2.008492946624756, "learning_rate": 5e-05, "epoch": 0.13350835627841454, "step": 2085 }, { "loss": 2.4124, "grad_norm": 2.0623104572296143, "learning_rate": 5e-05, "epoch": 0.1338285202023436, "step": 2090 }, { "loss": 2.3599, "grad_norm": 1.8425260782241821, "learning_rate": 5e-05, "epoch": 0.13414868412627265, "step": 2095 }, { "loss": 2.3984, "grad_norm": 2.0146563053131104, "learning_rate": 5e-05, "epoch": 0.1344688480502017, "step": 2100 }, { "loss": 2.3858, "grad_norm": 2.082679033279419, "learning_rate": 5e-05, "epoch": 0.13478901197413076, "step": 2105 }, { "loss": 2.4171, "grad_norm": 2.029128074645996, "learning_rate": 5e-05, "epoch": 0.13510917589805982, "step": 2110 }, { "loss": 2.3959, "grad_norm": 1.987762212753296, "learning_rate": 5e-05, "epoch": 0.13542933982198885, "step": 2115 }, { "loss": 2.3947, "grad_norm": 2.0950815677642822, "learning_rate": 5e-05, "epoch": 0.1357495037459179, "step": 2120 }, { "loss": 2.4035, "grad_norm": 2.036588668823242, "learning_rate": 5e-05, "epoch": 0.13606966766984696, "step": 2125 }, { "loss": 2.4071, "grad_norm": 2.120378017425537, "learning_rate": 5e-05, "epoch": 0.13638983159377602, "step": 2130 }, { "loss": 2.404, "grad_norm": 1.9909882545471191, "learning_rate": 5e-05, "epoch": 0.13670999551770507, "step": 2135 }, { "loss": 2.4018, "grad_norm": 2.004340887069702, "learning_rate": 5e-05, "epoch": 0.1370301594416341, "step": 2140 }, { "loss": 2.4058, "grad_norm": 1.8815783262252808, "learning_rate": 5e-05, "epoch": 0.13735032336556316, "step": 2145 }, { "loss": 2.3965, "grad_norm": 1.9335230588912964, "learning_rate": 5e-05, "epoch": 0.13767048728949222, "step": 2150 }, { "loss": 2.4259, "grad_norm": 2.0381062030792236, "learning_rate": 5e-05, "epoch": 0.13799065121342127, "step": 2155 }, { "loss": 2.406, "grad_norm": 1.9996901750564575, "learning_rate": 5e-05, "epoch": 0.13831081513735033, "step": 2160 }, { "loss": 2.4039, "grad_norm": 1.899214267730713, "learning_rate": 5e-05, "epoch": 0.13863097906127939, "step": 2165 }, { "loss": 2.4298, "grad_norm": 1.9462037086486816, "learning_rate": 5e-05, "epoch": 0.13895114298520841, "step": 2170 }, { "loss": 2.4035, "grad_norm": 1.935436487197876, "learning_rate": 5e-05, "epoch": 0.13927130690913747, "step": 2175 }, { "loss": 2.3991, "grad_norm": 1.9040533304214478, "learning_rate": 5e-05, "epoch": 0.13959147083306653, "step": 2180 }, { "loss": 2.4263, "grad_norm": 1.9670405387878418, "learning_rate": 5e-05, "epoch": 0.13991163475699558, "step": 2185 }, { "loss": 2.4282, "grad_norm": 2.0874922275543213, "learning_rate": 5e-05, "epoch": 0.14023179868092464, "step": 2190 }, { "loss": 2.4144, "grad_norm": 1.953114628791809, "learning_rate": 5e-05, "epoch": 0.1405519626048537, "step": 2195 }, { "loss": 2.3966, "grad_norm": 2.0168471336364746, "learning_rate": 5e-05, "epoch": 0.14087212652878273, "step": 2200 }, { "eval_loss": 2.253744125366211, "eval_runtime": 10.1134, "eval_samples_per_second": 202.504, "eval_steps_per_second": 25.313, "epoch": 0.14087212652878273, "step": 2200 }, { "loss": 2.3911, "grad_norm": 1.9229140281677246, "learning_rate": 5e-05, "epoch": 0.14119229045271178, "step": 2205 }, { "loss": 2.4168, "grad_norm": 1.9505460262298584, "learning_rate": 5e-05, "epoch": 0.14151245437664084, "step": 2210 }, { "loss": 2.4122, "grad_norm": 1.9959040880203247, "learning_rate": 5e-05, "epoch": 0.1418326183005699, "step": 2215 }, { "loss": 2.4155, "grad_norm": 2.070401191711426, "learning_rate": 5e-05, "epoch": 0.14215278222449895, "step": 2220 }, { "loss": 2.3864, "grad_norm": 1.980580449104309, "learning_rate": 5e-05, "epoch": 0.14247294614842798, "step": 2225 }, { "loss": 2.4002, "grad_norm": 1.9218393564224243, "learning_rate": 5e-05, "epoch": 0.14279311007235704, "step": 2230 }, { "loss": 2.3819, "grad_norm": 2.057966709136963, "learning_rate": 5e-05, "epoch": 0.1431132739962861, "step": 2235 }, { "loss": 2.4171, "grad_norm": 2.0004384517669678, "learning_rate": 5e-05, "epoch": 0.14343343792021515, "step": 2240 }, { "loss": 2.3866, "grad_norm": 2.0221750736236572, "learning_rate": 5e-05, "epoch": 0.1437536018441442, "step": 2245 }, { "loss": 2.4216, "grad_norm": 2.0313234329223633, "learning_rate": 5e-05, "epoch": 0.14407376576807326, "step": 2250 }, { "loss": 2.3814, "grad_norm": 2.1582136154174805, "learning_rate": 5e-05, "epoch": 0.1443939296920023, "step": 2255 }, { "loss": 2.4123, "grad_norm": 2.0171945095062256, "learning_rate": 5e-05, "epoch": 0.14471409361593135, "step": 2260 }, { "loss": 2.3892, "grad_norm": 1.9124054908752441, "learning_rate": 5e-05, "epoch": 0.1450342575398604, "step": 2265 }, { "loss": 2.4326, "grad_norm": 1.9355947971343994, "learning_rate": 5e-05, "epoch": 0.14535442146378946, "step": 2270 }, { "loss": 2.4224, "grad_norm": 2.030381679534912, "learning_rate": 5e-05, "epoch": 0.14567458538771852, "step": 2275 }, { "loss": 2.4016, "grad_norm": 2.0123848915100098, "learning_rate": 5e-05, "epoch": 0.14599474931164758, "step": 2280 }, { "loss": 2.3838, "grad_norm": 2.062603712081909, "learning_rate": 5e-05, "epoch": 0.1463149132355766, "step": 2285 }, { "loss": 2.3896, "grad_norm": 2.0680532455444336, "learning_rate": 5e-05, "epoch": 0.14663507715950566, "step": 2290 }, { "loss": 2.4275, "grad_norm": 1.846703290939331, "learning_rate": 5e-05, "epoch": 0.14695524108343472, "step": 2295 }, { "loss": 2.3873, "grad_norm": 1.9122810363769531, "learning_rate": 5e-05, "epoch": 0.14727540500736377, "step": 2300 }, { "loss": 2.4004, "grad_norm": 1.9584786891937256, "learning_rate": 5e-05, "epoch": 0.14759556893129283, "step": 2305 }, { "loss": 2.3838, "grad_norm": 1.8874859809875488, "learning_rate": 5e-05, "epoch": 0.14791573285522186, "step": 2310 }, { "loss": 2.3581, "grad_norm": 1.8960413932800293, "learning_rate": 5e-05, "epoch": 0.14823589677915092, "step": 2315 }, { "loss": 2.3889, "grad_norm": 1.8831268548965454, "learning_rate": 5e-05, "epoch": 0.14855606070307997, "step": 2320 }, { "loss": 2.3608, "grad_norm": 1.910288691520691, "learning_rate": 5e-05, "epoch": 0.14887622462700903, "step": 2325 }, { "loss": 2.4205, "grad_norm": 1.9343372583389282, "learning_rate": 5e-05, "epoch": 0.14919638855093809, "step": 2330 }, { "loss": 2.3924, "grad_norm": 1.953525424003601, "learning_rate": 5e-05, "epoch": 0.14951655247486714, "step": 2335 }, { "loss": 2.3593, "grad_norm": 1.9582774639129639, "learning_rate": 5e-05, "epoch": 0.14983671639879617, "step": 2340 }, { "loss": 2.3897, "grad_norm": 1.9290440082550049, "learning_rate": 5e-05, "epoch": 0.15015688032272523, "step": 2345 }, { "loss": 2.389, "grad_norm": 1.846218228340149, "learning_rate": 5e-05, "epoch": 0.15047704424665428, "step": 2350 }, { "loss": 2.4196, "grad_norm": 1.9014278650283813, "learning_rate": 5e-05, "epoch": 0.15079720817058334, "step": 2355 }, { "loss": 2.3938, "grad_norm": 1.9431865215301514, "learning_rate": 5e-05, "epoch": 0.1511173720945124, "step": 2360 }, { "loss": 2.3931, "grad_norm": 1.8423478603363037, "learning_rate": 5e-05, "epoch": 0.15143753601844145, "step": 2365 }, { "loss": 2.3864, "grad_norm": 1.8788933753967285, "learning_rate": 5e-05, "epoch": 0.15175769994237048, "step": 2370 }, { "loss": 2.3934, "grad_norm": 1.9234330654144287, "learning_rate": 5e-05, "epoch": 0.15207786386629954, "step": 2375 }, { "loss": 2.3706, "grad_norm": 1.8926013708114624, "learning_rate": 5e-05, "epoch": 0.1523980277902286, "step": 2380 }, { "loss": 2.3912, "grad_norm": 2.009702682495117, "learning_rate": 5e-05, "epoch": 0.15271819171415765, "step": 2385 }, { "loss": 2.4004, "grad_norm": 2.00524640083313, "learning_rate": 5e-05, "epoch": 0.1530383556380867, "step": 2390 }, { "loss": 2.3845, "grad_norm": 1.8410059213638306, "learning_rate": 5e-05, "epoch": 0.15335851956201574, "step": 2395 }, { "loss": 2.3933, "grad_norm": 1.9169642925262451, "learning_rate": 5e-05, "epoch": 0.1536786834859448, "step": 2400 }, { "eval_loss": 2.2529916763305664, "eval_runtime": 9.2707, "eval_samples_per_second": 220.912, "eval_steps_per_second": 27.614, "epoch": 0.1536786834859448, "step": 2400 }, { "loss": 2.3935, "grad_norm": 1.948587417602539, "learning_rate": 5e-05, "epoch": 0.15399884740987385, "step": 2405 }, { "loss": 2.4017, "grad_norm": 2.016439914703369, "learning_rate": 5e-05, "epoch": 0.1543190113338029, "step": 2410 }, { "loss": 2.3605, "grad_norm": 2.007875680923462, "learning_rate": 5e-05, "epoch": 0.15463917525773196, "step": 2415 }, { "loss": 2.3663, "grad_norm": 1.9632021188735962, "learning_rate": 5e-05, "epoch": 0.15495933918166102, "step": 2420 }, { "loss": 2.3814, "grad_norm": 1.9989489316940308, "learning_rate": 5e-05, "epoch": 0.15527950310559005, "step": 2425 }, { "loss": 2.3695, "grad_norm": 2.0570878982543945, "learning_rate": 5e-05, "epoch": 0.1555996670295191, "step": 2430 }, { "loss": 2.3995, "grad_norm": 2.0144171714782715, "learning_rate": 5e-05, "epoch": 0.15591983095344816, "step": 2435 }, { "loss": 2.4044, "grad_norm": 1.8741555213928223, "learning_rate": 5e-05, "epoch": 0.15623999487737722, "step": 2440 }, { "loss": 2.3671, "grad_norm": 1.8373762369155884, "learning_rate": 5e-05, "epoch": 0.15656015880130628, "step": 2445 }, { "loss": 2.3604, "grad_norm": 1.9890358448028564, "learning_rate": 5e-05, "epoch": 0.15688032272523533, "step": 2450 }, { "loss": 2.3899, "grad_norm": 2.008896827697754, "learning_rate": 5e-05, "epoch": 0.15720048664916436, "step": 2455 }, { "loss": 2.3925, "grad_norm": 1.98505699634552, "learning_rate": 5e-05, "epoch": 0.15752065057309342, "step": 2460 }, { "loss": 2.3917, "grad_norm": 1.958855152130127, "learning_rate": 5e-05, "epoch": 0.15784081449702247, "step": 2465 }, { "loss": 2.3918, "grad_norm": 1.9323810338974, "learning_rate": 5e-05, "epoch": 0.15816097842095153, "step": 2470 }, { "loss": 2.398, "grad_norm": 2.0385091304779053, "learning_rate": 5e-05, "epoch": 0.1584811423448806, "step": 2475 }, { "loss": 2.3735, "grad_norm": 1.877455711364746, "learning_rate": 5e-05, "epoch": 0.15880130626880964, "step": 2480 }, { "loss": 2.4105, "grad_norm": 1.8172228336334229, "learning_rate": 5e-05, "epoch": 0.15912147019273867, "step": 2485 }, { "loss": 2.3432, "grad_norm": 1.9281107187271118, "learning_rate": 5e-05, "epoch": 0.15944163411666773, "step": 2490 }, { "loss": 2.4057, "grad_norm": 2.0879902839660645, "learning_rate": 5e-05, "epoch": 0.1597617980405968, "step": 2495 }, { "loss": 2.3725, "grad_norm": 1.9170490503311157, "learning_rate": 5e-05, "epoch": 0.16008196196452584, "step": 2500 }, { "loss": 2.3963, "grad_norm": 1.973979115486145, "learning_rate": 5e-05, "epoch": 0.1604021258884549, "step": 2505 }, { "loss": 2.3596, "grad_norm": 1.9528260231018066, "learning_rate": 5e-05, "epoch": 0.16072228981238393, "step": 2510 }, { "loss": 2.3587, "grad_norm": 1.9395289421081543, "learning_rate": 5e-05, "epoch": 0.16104245373631298, "step": 2515 }, { "loss": 2.3813, "grad_norm": 1.9283883571624756, "learning_rate": 5e-05, "epoch": 0.16136261766024204, "step": 2520 }, { "loss": 2.3981, "grad_norm": 1.9027310609817505, "learning_rate": 5e-05, "epoch": 0.1616827815841711, "step": 2525 }, { "loss": 2.3892, "grad_norm": 1.9189422130584717, "learning_rate": 5e-05, "epoch": 0.16200294550810015, "step": 2530 }, { "loss": 2.3914, "grad_norm": 1.9815748929977417, "learning_rate": 5e-05, "epoch": 0.1623231094320292, "step": 2535 }, { "loss": 2.3935, "grad_norm": 1.9846832752227783, "learning_rate": 5e-05, "epoch": 0.16264327335595824, "step": 2540 }, { "loss": 2.3638, "grad_norm": 1.8738396167755127, "learning_rate": 5e-05, "epoch": 0.1629634372798873, "step": 2545 }, { "loss": 2.402, "grad_norm": 1.9183320999145508, "learning_rate": 5e-05, "epoch": 0.16328360120381635, "step": 2550 }, { "loss": 2.3585, "grad_norm": 1.9421886205673218, "learning_rate": 5e-05, "epoch": 0.1636037651277454, "step": 2555 }, { "loss": 2.372, "grad_norm": 1.8870490789413452, "learning_rate": 5e-05, "epoch": 0.16392392905167447, "step": 2560 }, { "loss": 2.3954, "grad_norm": 1.9618778228759766, "learning_rate": 5e-05, "epoch": 0.16424409297560352, "step": 2565 }, { "loss": 2.3422, "grad_norm": 2.0231478214263916, "learning_rate": 5e-05, "epoch": 0.16456425689953255, "step": 2570 }, { "loss": 2.375, "grad_norm": 2.0343830585479736, "learning_rate": 5e-05, "epoch": 0.1648844208234616, "step": 2575 }, { "loss": 2.3795, "grad_norm": 1.8410435914993286, "learning_rate": 5e-05, "epoch": 0.16520458474739066, "step": 2580 }, { "loss": 2.3509, "grad_norm": 1.8402718305587769, "learning_rate": 5e-05, "epoch": 0.16552474867131972, "step": 2585 }, { "loss": 2.3851, "grad_norm": 1.9031217098236084, "learning_rate": 5e-05, "epoch": 0.16584491259524878, "step": 2590 }, { "loss": 2.3865, "grad_norm": 1.8732967376708984, "learning_rate": 5e-05, "epoch": 0.1661650765191778, "step": 2595 }, { "loss": 2.3787, "grad_norm": 1.8325210809707642, "learning_rate": 5e-05, "epoch": 0.16648524044310686, "step": 2600 }, { "eval_loss": 2.2488222122192383, "eval_runtime": 9.2735, "eval_samples_per_second": 220.844, "eval_steps_per_second": 27.606, "epoch": 0.16648524044310686, "step": 2600 }, { "loss": 2.3569, "grad_norm": 1.9319522380828857, "learning_rate": 5e-05, "epoch": 0.16680540436703592, "step": 2605 }, { "loss": 2.3725, "grad_norm": 1.997213363647461, "learning_rate": 5e-05, "epoch": 0.16712556829096498, "step": 2610 }, { "loss": 2.4022, "grad_norm": 2.0918712615966797, "learning_rate": 5e-05, "epoch": 0.16744573221489403, "step": 2615 }, { "loss": 2.369, "grad_norm": 2.015212297439575, "learning_rate": 5e-05, "epoch": 0.1677658961388231, "step": 2620 }, { "loss": 2.3813, "grad_norm": 1.9604344367980957, "learning_rate": 5e-05, "epoch": 0.16808606006275212, "step": 2625 }, { "loss": 2.3668, "grad_norm": 1.8466747999191284, "learning_rate": 5e-05, "epoch": 0.16840622398668117, "step": 2630 }, { "loss": 2.3645, "grad_norm": 1.8700547218322754, "learning_rate": 5e-05, "epoch": 0.16872638791061023, "step": 2635 }, { "loss": 2.3847, "grad_norm": 1.9464409351348877, "learning_rate": 5e-05, "epoch": 0.1690465518345393, "step": 2640 }, { "loss": 2.3664, "grad_norm": 1.9160685539245605, "learning_rate": 5e-05, "epoch": 0.16936671575846834, "step": 2645 }, { "loss": 2.3949, "grad_norm": 1.993518590927124, "learning_rate": 5e-05, "epoch": 0.1696868796823974, "step": 2650 }, { "loss": 2.3986, "grad_norm": 1.8965723514556885, "learning_rate": 5e-05, "epoch": 0.17000704360632643, "step": 2655 }, { "loss": 2.3931, "grad_norm": 1.8633148670196533, "learning_rate": 5e-05, "epoch": 0.1703272075302555, "step": 2660 }, { "loss": 2.3934, "grad_norm": 1.8948819637298584, "learning_rate": 5e-05, "epoch": 0.17064737145418454, "step": 2665 }, { "loss": 2.3787, "grad_norm": 1.9217243194580078, "learning_rate": 5e-05, "epoch": 0.1709675353781136, "step": 2670 }, { "loss": 2.3821, "grad_norm": 1.774686574935913, "learning_rate": 5e-05, "epoch": 0.17128769930204266, "step": 2675 }, { "loss": 2.364, "grad_norm": 1.9000036716461182, "learning_rate": 5e-05, "epoch": 0.17160786322597169, "step": 2680 }, { "loss": 2.3718, "grad_norm": 1.8330143690109253, "learning_rate": 5e-05, "epoch": 0.17192802714990074, "step": 2685 }, { "loss": 2.4015, "grad_norm": 1.9639065265655518, "learning_rate": 5e-05, "epoch": 0.1722481910738298, "step": 2690 }, { "loss": 2.403, "grad_norm": 1.8496508598327637, "learning_rate": 5e-05, "epoch": 0.17256835499775885, "step": 2695 }, { "loss": 2.3548, "grad_norm": 1.8958038091659546, "learning_rate": 5e-05, "epoch": 0.1728885189216879, "step": 2700 }, { "loss": 2.3648, "grad_norm": 2.150702476501465, "learning_rate": 5e-05, "epoch": 0.17320868284561697, "step": 2705 }, { "loss": 2.3544, "grad_norm": 2.0381345748901367, "learning_rate": 5e-05, "epoch": 0.173528846769546, "step": 2710 }, { "loss": 2.3726, "grad_norm": 1.9142519235610962, "learning_rate": 5e-05, "epoch": 0.17384901069347505, "step": 2715 }, { "loss": 2.3721, "grad_norm": 1.9022127389907837, "learning_rate": 5e-05, "epoch": 0.1741691746174041, "step": 2720 }, { "loss": 2.3595, "grad_norm": 2.145447254180908, "learning_rate": 5e-05, "epoch": 0.17448933854133317, "step": 2725 }, { "loss": 2.3423, "grad_norm": 1.8730753660202026, "learning_rate": 5e-05, "epoch": 0.17480950246526222, "step": 2730 }, { "loss": 2.3756, "grad_norm": 1.9949947595596313, "learning_rate": 5e-05, "epoch": 0.17512966638919128, "step": 2735 }, { "loss": 2.3446, "grad_norm": 1.9902111291885376, "learning_rate": 5e-05, "epoch": 0.1754498303131203, "step": 2740 }, { "loss": 2.3616, "grad_norm": 1.9357950687408447, "learning_rate": 5e-05, "epoch": 0.17576999423704937, "step": 2745 }, { "loss": 2.3666, "grad_norm": 1.9442518949508667, "learning_rate": 5e-05, "epoch": 0.17609015816097842, "step": 2750 }, { "loss": 2.3623, "grad_norm": 1.8661247491836548, "learning_rate": 5e-05, "epoch": 0.17641032208490748, "step": 2755 }, { "loss": 2.368, "grad_norm": 1.8535679578781128, "learning_rate": 5e-05, "epoch": 0.17673048600883653, "step": 2760 }, { "loss": 2.3872, "grad_norm": 1.8927170038223267, "learning_rate": 5e-05, "epoch": 0.17705064993276556, "step": 2765 }, { "loss": 2.3464, "grad_norm": 1.9512662887573242, "learning_rate": 5e-05, "epoch": 0.17737081385669462, "step": 2770 }, { "loss": 2.3607, "grad_norm": 1.8841359615325928, "learning_rate": 5e-05, "epoch": 0.17769097778062368, "step": 2775 }, { "loss": 2.3779, "grad_norm": 1.9043680429458618, "learning_rate": 5e-05, "epoch": 0.17801114170455273, "step": 2780 }, { "loss": 2.3836, "grad_norm": 1.8490782976150513, "learning_rate": 5e-05, "epoch": 0.1783313056284818, "step": 2785 }, { "loss": 2.3739, "grad_norm": 1.9332350492477417, "learning_rate": 5e-05, "epoch": 0.17865146955241085, "step": 2790 }, { "loss": 2.3407, "grad_norm": 1.8520585298538208, "learning_rate": 5e-05, "epoch": 0.17897163347633988, "step": 2795 }, { "loss": 2.3733, "grad_norm": 1.8878172636032104, "learning_rate": 5e-05, "epoch": 0.17929179740026893, "step": 2800 }, { "eval_loss": 2.2236533164978027, "eval_runtime": 9.4962, "eval_samples_per_second": 215.665, "eval_steps_per_second": 26.958, "epoch": 0.17929179740026893, "step": 2800 }, { "loss": 2.3377, "grad_norm": 1.994707465171814, "learning_rate": 5e-05, "epoch": 0.179611961324198, "step": 2805 }, { "loss": 2.3754, "grad_norm": 1.7752844095230103, "learning_rate": 5e-05, "epoch": 0.17993212524812705, "step": 2810 }, { "loss": 2.376, "grad_norm": 1.9438122510910034, "learning_rate": 5e-05, "epoch": 0.1802522891720561, "step": 2815 }, { "loss": 2.3698, "grad_norm": 1.9461045265197754, "learning_rate": 5e-05, "epoch": 0.18057245309598516, "step": 2820 }, { "loss": 2.3587, "grad_norm": 1.92300283908844, "learning_rate": 5e-05, "epoch": 0.1808926170199142, "step": 2825 }, { "loss": 2.395, "grad_norm": 2.004666328430176, "learning_rate": 5e-05, "epoch": 0.18121278094384324, "step": 2830 }, { "loss": 2.3882, "grad_norm": 1.8876590728759766, "learning_rate": 5e-05, "epoch": 0.1815329448677723, "step": 2835 }, { "loss": 2.337, "grad_norm": 1.9001890420913696, "learning_rate": 5e-05, "epoch": 0.18185310879170136, "step": 2840 }, { "loss": 2.3796, "grad_norm": 1.876528024673462, "learning_rate": 5e-05, "epoch": 0.1821732727156304, "step": 2845 }, { "loss": 2.366, "grad_norm": 1.977066159248352, "learning_rate": 5e-05, "epoch": 0.18249343663955944, "step": 2850 }, { "loss": 2.3681, "grad_norm": 1.8940303325653076, "learning_rate": 5e-05, "epoch": 0.1828136005634885, "step": 2855 }, { "loss": 2.3721, "grad_norm": 1.9290567636489868, "learning_rate": 5e-05, "epoch": 0.18313376448741756, "step": 2860 }, { "loss": 2.3555, "grad_norm": 1.8955270051956177, "learning_rate": 5e-05, "epoch": 0.1834539284113466, "step": 2865 }, { "loss": 2.3675, "grad_norm": 1.916295051574707, "learning_rate": 5e-05, "epoch": 0.18377409233527567, "step": 2870 }, { "loss": 2.3718, "grad_norm": 2.090623617172241, "learning_rate": 5e-05, "epoch": 0.18409425625920472, "step": 2875 }, { "loss": 2.3617, "grad_norm": 1.9669326543807983, "learning_rate": 5e-05, "epoch": 0.18441442018313375, "step": 2880 }, { "loss": 2.3693, "grad_norm": 1.856131911277771, "learning_rate": 5e-05, "epoch": 0.1847345841070628, "step": 2885 }, { "loss": 2.3714, "grad_norm": 1.9637651443481445, "learning_rate": 5e-05, "epoch": 0.18505474803099187, "step": 2890 }, { "loss": 2.3669, "grad_norm": 2.030195951461792, "learning_rate": 5e-05, "epoch": 0.18537491195492092, "step": 2895 }, { "loss": 2.364, "grad_norm": 1.9500765800476074, "learning_rate": 5e-05, "epoch": 0.18569507587884998, "step": 2900 }, { "loss": 2.3647, "grad_norm": 1.9658929109573364, "learning_rate": 5e-05, "epoch": 0.18601523980277904, "step": 2905 }, { "loss": 2.3541, "grad_norm": 1.9205372333526611, "learning_rate": 5e-05, "epoch": 0.18633540372670807, "step": 2910 }, { "loss": 2.3631, "grad_norm": 1.9161553382873535, "learning_rate": 5e-05, "epoch": 0.18665556765063712, "step": 2915 }, { "loss": 2.3526, "grad_norm": 1.9861873388290405, "learning_rate": 5e-05, "epoch": 0.18697573157456618, "step": 2920 }, { "loss": 2.3579, "grad_norm": 1.9132648706436157, "learning_rate": 5e-05, "epoch": 0.18729589549849524, "step": 2925 }, { "loss": 2.358, "grad_norm": 1.9785979986190796, "learning_rate": 5e-05, "epoch": 0.1876160594224243, "step": 2930 }, { "loss": 2.3736, "grad_norm": 1.9504092931747437, "learning_rate": 5e-05, "epoch": 0.18793622334635332, "step": 2935 }, { "loss": 2.3665, "grad_norm": 1.8407647609710693, "learning_rate": 5e-05, "epoch": 0.18825638727028238, "step": 2940 }, { "loss": 2.3351, "grad_norm": 2.004072666168213, "learning_rate": 5e-05, "epoch": 0.18857655119421143, "step": 2945 }, { "loss": 2.3649, "grad_norm": 1.8799525499343872, "learning_rate": 5e-05, "epoch": 0.1888967151181405, "step": 2950 }, { "loss": 2.351, "grad_norm": 2.0074257850646973, "learning_rate": 5e-05, "epoch": 0.18921687904206955, "step": 2955 }, { "loss": 2.3236, "grad_norm": 1.8301315307617188, "learning_rate": 5e-05, "epoch": 0.1895370429659986, "step": 2960 }, { "loss": 2.3545, "grad_norm": 1.9088454246520996, "learning_rate": 5e-05, "epoch": 0.18985720688992763, "step": 2965 }, { "loss": 2.3666, "grad_norm": 1.9445098638534546, "learning_rate": 5e-05, "epoch": 0.1901773708138567, "step": 2970 }, { "loss": 2.3593, "grad_norm": 1.8038558959960938, "learning_rate": 5e-05, "epoch": 0.19049753473778575, "step": 2975 }, { "loss": 2.3387, "grad_norm": 1.8952257633209229, "learning_rate": 5e-05, "epoch": 0.1908176986617148, "step": 2980 }, { "loss": 2.3742, "grad_norm": 1.8767812252044678, "learning_rate": 5e-05, "epoch": 0.19113786258564386, "step": 2985 }, { "loss": 2.3786, "grad_norm": 1.771638035774231, "learning_rate": 5e-05, "epoch": 0.19145802650957292, "step": 2990 }, { "loss": 2.3528, "grad_norm": 1.9153273105621338, "learning_rate": 5e-05, "epoch": 0.19177819043350194, "step": 2995 }, { "loss": 2.3781, "grad_norm": 1.9172663688659668, "learning_rate": 5e-05, "epoch": 0.192098354357431, "step": 3000 }, { "eval_loss": 2.209613800048828, "eval_runtime": 12.6666, "eval_samples_per_second": 161.685, "eval_steps_per_second": 20.211, "epoch": 0.192098354357431, "step": 3000 }, { "loss": 2.3762, "grad_norm": 1.8722891807556152, "learning_rate": 5e-05, "epoch": 0.19241851828136006, "step": 3005 }, { "loss": 2.3861, "grad_norm": 1.907089352607727, "learning_rate": 5e-05, "epoch": 0.1927386822052891, "step": 3010 }, { "loss": 2.3477, "grad_norm": 1.982266902923584, "learning_rate": 5e-05, "epoch": 0.19305884612921817, "step": 3015 }, { "loss": 2.3555, "grad_norm": 1.9882186651229858, "learning_rate": 5e-05, "epoch": 0.1933790100531472, "step": 3020 }, { "loss": 2.327, "grad_norm": 1.8406589031219482, "learning_rate": 5e-05, "epoch": 0.19369917397707626, "step": 3025 }, { "loss": 2.3411, "grad_norm": 1.9217675924301147, "learning_rate": 5e-05, "epoch": 0.1940193379010053, "step": 3030 }, { "loss": 2.3554, "grad_norm": 1.9655086994171143, "learning_rate": 5e-05, "epoch": 0.19433950182493437, "step": 3035 }, { "loss": 2.373, "grad_norm": 2.0582704544067383, "learning_rate": 5e-05, "epoch": 0.19465966574886343, "step": 3040 }, { "loss": 2.3424, "grad_norm": 1.8587092161178589, "learning_rate": 5e-05, "epoch": 0.19497982967279248, "step": 3045 }, { "loss": 2.3319, "grad_norm": 1.8777543306350708, "learning_rate": 5e-05, "epoch": 0.1952999935967215, "step": 3050 }, { "loss": 2.3365, "grad_norm": 1.9621491432189941, "learning_rate": 5e-05, "epoch": 0.19562015752065057, "step": 3055 }, { "loss": 2.3459, "grad_norm": 1.8719767332077026, "learning_rate": 5e-05, "epoch": 0.19594032144457962, "step": 3060 }, { "loss": 2.3487, "grad_norm": 1.970376968383789, "learning_rate": 5e-05, "epoch": 0.19626048536850868, "step": 3065 }, { "loss": 2.3692, "grad_norm": 1.8878765106201172, "learning_rate": 5e-05, "epoch": 0.19658064929243774, "step": 3070 }, { "loss": 2.3335, "grad_norm": 1.9047449827194214, "learning_rate": 5e-05, "epoch": 0.1969008132163668, "step": 3075 }, { "loss": 2.3679, "grad_norm": 1.8994492292404175, "learning_rate": 5e-05, "epoch": 0.19722097714029582, "step": 3080 }, { "loss": 2.3786, "grad_norm": 1.8764537572860718, "learning_rate": 5e-05, "epoch": 0.19754114106422488, "step": 3085 }, { "loss": 2.3656, "grad_norm": 1.937950849533081, "learning_rate": 5e-05, "epoch": 0.19786130498815394, "step": 3090 }, { "loss": 2.3501, "grad_norm": 1.9067658185958862, "learning_rate": 5e-05, "epoch": 0.198181468912083, "step": 3095 }, { "loss": 2.3382, "grad_norm": 1.9059247970581055, "learning_rate": 5e-05, "epoch": 0.19850163283601205, "step": 3100 }, { "loss": 2.3374, "grad_norm": 1.9067003726959229, "learning_rate": 5e-05, "epoch": 0.19882179675994108, "step": 3105 }, { "loss": 2.3787, "grad_norm": 1.860835075378418, "learning_rate": 5e-05, "epoch": 0.19914196068387013, "step": 3110 }, { "loss": 2.3622, "grad_norm": 1.9090162515640259, "learning_rate": 5e-05, "epoch": 0.1994621246077992, "step": 3115 }, { "loss": 2.3465, "grad_norm": 2.014218330383301, "learning_rate": 5e-05, "epoch": 0.19978228853172825, "step": 3120 }, { "loss": 2.3767, "grad_norm": 1.9965459108352661, "learning_rate": 5e-05, "epoch": 0.2001024524556573, "step": 3125 }, { "loss": 2.3674, "grad_norm": 1.9212764501571655, "learning_rate": 5e-05, "epoch": 0.20042261637958636, "step": 3130 }, { "loss": 2.3373, "grad_norm": 1.9215936660766602, "learning_rate": 5e-05, "epoch": 0.2007427803035154, "step": 3135 }, { "loss": 2.3155, "grad_norm": 1.9446462392807007, "learning_rate": 5e-05, "epoch": 0.20106294422744445, "step": 3140 }, { "loss": 2.3644, "grad_norm": 1.8948733806610107, "learning_rate": 5e-05, "epoch": 0.2013831081513735, "step": 3145 }, { "loss": 2.3357, "grad_norm": 1.919724702835083, "learning_rate": 5e-05, "epoch": 0.20170327207530256, "step": 3150 }, { "loss": 2.3539, "grad_norm": 1.8508602380752563, "learning_rate": 5e-05, "epoch": 0.20202343599923162, "step": 3155 }, { "loss": 2.3601, "grad_norm": 2.0129966735839844, "learning_rate": 5e-05, "epoch": 0.20234359992316067, "step": 3160 }, { "loss": 2.3601, "grad_norm": 1.693015456199646, "learning_rate": 5e-05, "epoch": 0.2026637638470897, "step": 3165 }, { "loss": 2.3558, "grad_norm": 1.956392526626587, "learning_rate": 5e-05, "epoch": 0.20298392777101876, "step": 3170 }, { "loss": 2.338, "grad_norm": 1.818519949913025, "learning_rate": 5e-05, "epoch": 0.2033040916949478, "step": 3175 }, { "loss": 2.3717, "grad_norm": 1.8295650482177734, "learning_rate": 5e-05, "epoch": 0.20362425561887687, "step": 3180 }, { "loss": 2.3375, "grad_norm": 2.044961929321289, "learning_rate": 5e-05, "epoch": 0.20394441954280593, "step": 3185 }, { "loss": 2.3358, "grad_norm": 2.061805248260498, "learning_rate": 5e-05, "epoch": 0.20426458346673496, "step": 3190 }, { "loss": 2.3462, "grad_norm": 1.870125651359558, "learning_rate": 5e-05, "epoch": 0.204584747390664, "step": 3195 }, { "loss": 2.3482, "grad_norm": 1.9724948406219482, "learning_rate": 5e-05, "epoch": 0.20490491131459307, "step": 3200 }, { "eval_loss": 2.2023611068725586, "eval_runtime": 12.9284, "eval_samples_per_second": 158.411, "eval_steps_per_second": 19.801, "epoch": 0.20490491131459307, "step": 3200 }, { "loss": 2.3575, "grad_norm": 1.840522289276123, "learning_rate": 5e-05, "epoch": 0.20522507523852213, "step": 3205 }, { "loss": 2.35, "grad_norm": 1.815750241279602, "learning_rate": 5e-05, "epoch": 0.20554523916245118, "step": 3210 }, { "loss": 2.3852, "grad_norm": 1.8655439615249634, "learning_rate": 5e-05, "epoch": 0.20586540308638024, "step": 3215 }, { "loss": 2.3345, "grad_norm": 1.8097730875015259, "learning_rate": 5e-05, "epoch": 0.20618556701030927, "step": 3220 }, { "loss": 2.3516, "grad_norm": 1.7540837526321411, "learning_rate": 5e-05, "epoch": 0.20650573093423832, "step": 3225 }, { "loss": 2.3371, "grad_norm": 1.8675291538238525, "learning_rate": 5e-05, "epoch": 0.20682589485816738, "step": 3230 }, { "loss": 2.352, "grad_norm": 2.067999839782715, "learning_rate": 5e-05, "epoch": 0.20714605878209644, "step": 3235 }, { "loss": 2.3439, "grad_norm": 1.8403364419937134, "learning_rate": 5e-05, "epoch": 0.2074662227060255, "step": 3240 }, { "loss": 2.3299, "grad_norm": 1.8896580934524536, "learning_rate": 5e-05, "epoch": 0.20778638662995455, "step": 3245 }, { "loss": 2.3492, "grad_norm": 1.941513180732727, "learning_rate": 5e-05, "epoch": 0.20810655055388358, "step": 3250 }, { "loss": 2.3744, "grad_norm": 1.916695237159729, "learning_rate": 5e-05, "epoch": 0.20842671447781264, "step": 3255 }, { "loss": 2.3438, "grad_norm": 1.907235026359558, "learning_rate": 5e-05, "epoch": 0.2087468784017417, "step": 3260 }, { "loss": 2.3503, "grad_norm": 1.864814281463623, "learning_rate": 5e-05, "epoch": 0.20906704232567075, "step": 3265 }, { "loss": 2.3345, "grad_norm": 1.8227028846740723, "learning_rate": 5e-05, "epoch": 0.2093872062495998, "step": 3270 }, { "loss": 2.3648, "grad_norm": 1.9340879917144775, "learning_rate": 5e-05, "epoch": 0.20970737017352883, "step": 3275 }, { "loss": 2.3289, "grad_norm": 1.8352042436599731, "learning_rate": 5e-05, "epoch": 0.2100275340974579, "step": 3280 }, { "loss": 2.3582, "grad_norm": 1.7997504472732544, "learning_rate": 5e-05, "epoch": 0.21034769802138695, "step": 3285 }, { "loss": 2.3383, "grad_norm": 1.8354027271270752, "learning_rate": 5e-05, "epoch": 0.210667861945316, "step": 3290 }, { "loss": 2.3495, "grad_norm": 1.9071873426437378, "learning_rate": 5e-05, "epoch": 0.21098802586924506, "step": 3295 }, { "loss": 2.3667, "grad_norm": 1.8804770708084106, "learning_rate": 5e-05, "epoch": 0.21130818979317412, "step": 3300 }, { "loss": 2.348, "grad_norm": 1.8979647159576416, "learning_rate": 5e-05, "epoch": 0.21162835371710315, "step": 3305 }, { "loss": 2.3424, "grad_norm": 1.9298757314682007, "learning_rate": 5e-05, "epoch": 0.2119485176410322, "step": 3310 }, { "loss": 2.3626, "grad_norm": 2.027535915374756, "learning_rate": 5e-05, "epoch": 0.21226868156496126, "step": 3315 }, { "loss": 2.3596, "grad_norm": 1.896079421043396, "learning_rate": 5e-05, "epoch": 0.21258884548889032, "step": 3320 }, { "loss": 2.3383, "grad_norm": 1.798487901687622, "learning_rate": 5e-05, "epoch": 0.21290900941281937, "step": 3325 }, { "loss": 2.3482, "grad_norm": 1.9177759885787964, "learning_rate": 5e-05, "epoch": 0.21322917333674843, "step": 3330 }, { "loss": 2.3124, "grad_norm": 1.9621219635009766, "learning_rate": 5e-05, "epoch": 0.21354933726067746, "step": 3335 }, { "loss": 2.3662, "grad_norm": 1.9927774667739868, "learning_rate": 5e-05, "epoch": 0.21386950118460651, "step": 3340 }, { "loss": 2.3203, "grad_norm": 1.8306477069854736, "learning_rate": 5e-05, "epoch": 0.21418966510853557, "step": 3345 }, { "loss": 2.3709, "grad_norm": 1.922379732131958, "learning_rate": 5e-05, "epoch": 0.21450982903246463, "step": 3350 }, { "loss": 2.3456, "grad_norm": 1.8414316177368164, "learning_rate": 5e-05, "epoch": 0.21482999295639368, "step": 3355 }, { "loss": 2.331, "grad_norm": 1.847821831703186, "learning_rate": 5e-05, "epoch": 0.2151501568803227, "step": 3360 }, { "loss": 2.3428, "grad_norm": 1.8807631731033325, "learning_rate": 5e-05, "epoch": 0.21547032080425177, "step": 3365 }, { "loss": 2.335, "grad_norm": 1.9684065580368042, "learning_rate": 5e-05, "epoch": 0.21579048472818083, "step": 3370 }, { "loss": 2.3302, "grad_norm": 1.9757294654846191, "learning_rate": 5e-05, "epoch": 0.21611064865210988, "step": 3375 }, { "loss": 2.3542, "grad_norm": 2.087277412414551, "learning_rate": 5e-05, "epoch": 0.21643081257603894, "step": 3380 }, { "loss": 2.3651, "grad_norm": 2.1057560443878174, "learning_rate": 5e-05, "epoch": 0.216750976499968, "step": 3385 }, { "loss": 2.3234, "grad_norm": 1.805690884590149, "learning_rate": 5e-05, "epoch": 0.21707114042389702, "step": 3390 }, { "loss": 2.3306, "grad_norm": 1.82710862159729, "learning_rate": 5e-05, "epoch": 0.21739130434782608, "step": 3395 }, { "loss": 2.3316, "grad_norm": 1.8435773849487305, "learning_rate": 5e-05, "epoch": 0.21771146827175514, "step": 3400 }, { "eval_loss": 2.198260545730591, "eval_runtime": 12.4233, "eval_samples_per_second": 164.852, "eval_steps_per_second": 20.606, "epoch": 0.21771146827175514, "step": 3400 }, { "loss": 2.3341, "grad_norm": 1.901654839515686, "learning_rate": 5e-05, "epoch": 0.2180316321956842, "step": 3405 }, { "loss": 2.3537, "grad_norm": 1.9241116046905518, "learning_rate": 5e-05, "epoch": 0.21835179611961325, "step": 3410 }, { "loss": 2.3616, "grad_norm": 1.9076368808746338, "learning_rate": 5e-05, "epoch": 0.2186719600435423, "step": 3415 }, { "loss": 2.3516, "grad_norm": 1.940434217453003, "learning_rate": 5e-05, "epoch": 0.21899212396747134, "step": 3420 }, { "loss": 2.3462, "grad_norm": 1.9202423095703125, "learning_rate": 5e-05, "epoch": 0.2193122878914004, "step": 3425 }, { "loss": 2.3302, "grad_norm": 1.8307172060012817, "learning_rate": 5e-05, "epoch": 0.21963245181532945, "step": 3430 }, { "loss": 2.3675, "grad_norm": 1.825799584388733, "learning_rate": 5e-05, "epoch": 0.2199526157392585, "step": 3435 }, { "loss": 2.3283, "grad_norm": 1.83090341091156, "learning_rate": 5e-05, "epoch": 0.22027277966318756, "step": 3440 }, { "loss": 2.3698, "grad_norm": 1.8049718141555786, "learning_rate": 5e-05, "epoch": 0.2205929435871166, "step": 3445 }, { "loss": 2.3352, "grad_norm": 2.047487735748291, "learning_rate": 5e-05, "epoch": 0.22091310751104565, "step": 3450 }, { "loss": 2.3209, "grad_norm": 1.8851560354232788, "learning_rate": 5e-05, "epoch": 0.2212332714349747, "step": 3455 }, { "loss": 2.3278, "grad_norm": 1.781719446182251, "learning_rate": 5e-05, "epoch": 0.22155343535890376, "step": 3460 }, { "loss": 2.3379, "grad_norm": 1.822160243988037, "learning_rate": 5e-05, "epoch": 0.22187359928283282, "step": 3465 }, { "loss": 2.3341, "grad_norm": 1.7682366371154785, "learning_rate": 5e-05, "epoch": 0.22219376320676187, "step": 3470 }, { "loss": 2.3617, "grad_norm": 2.0239579677581787, "learning_rate": 5e-05, "epoch": 0.2225139271306909, "step": 3475 }, { "loss": 2.3376, "grad_norm": 1.9589248895645142, "learning_rate": 5e-05, "epoch": 0.22283409105461996, "step": 3480 }, { "loss": 2.325, "grad_norm": 1.8296499252319336, "learning_rate": 5e-05, "epoch": 0.22315425497854902, "step": 3485 }, { "loss": 2.3318, "grad_norm": 1.7005376815795898, "learning_rate": 5e-05, "epoch": 0.22347441890247807, "step": 3490 }, { "loss": 2.3311, "grad_norm": 1.8033257722854614, "learning_rate": 5e-05, "epoch": 0.22379458282640713, "step": 3495 }, { "loss": 2.3697, "grad_norm": 1.813000202178955, "learning_rate": 5e-05, "epoch": 0.22411474675033619, "step": 3500 }, { "loss": 2.3146, "grad_norm": 1.8140153884887695, "learning_rate": 5e-05, "epoch": 0.22443491067426521, "step": 3505 }, { "loss": 2.3511, "grad_norm": 1.8404661417007446, "learning_rate": 5e-05, "epoch": 0.22475507459819427, "step": 3510 }, { "loss": 2.3413, "grad_norm": 1.7392330169677734, "learning_rate": 5e-05, "epoch": 0.22507523852212333, "step": 3515 }, { "loss": 2.382, "grad_norm": 1.833692193031311, "learning_rate": 5e-05, "epoch": 0.22539540244605238, "step": 3520 }, { "loss": 2.3425, "grad_norm": 1.841347575187683, "learning_rate": 5e-05, "epoch": 0.22571556636998144, "step": 3525 }, { "loss": 2.3298, "grad_norm": 1.789963960647583, "learning_rate": 5e-05, "epoch": 0.22603573029391047, "step": 3530 }, { "loss": 2.3307, "grad_norm": 1.7983555793762207, "learning_rate": 5e-05, "epoch": 0.22635589421783953, "step": 3535 }, { "loss": 2.3397, "grad_norm": 2.009568452835083, "learning_rate": 5e-05, "epoch": 0.22667605814176858, "step": 3540 }, { "loss": 2.314, "grad_norm": 1.8560316562652588, "learning_rate": 5e-05, "epoch": 0.22699622206569764, "step": 3545 }, { "loss": 2.3296, "grad_norm": 1.8345584869384766, "learning_rate": 5e-05, "epoch": 0.2273163859896267, "step": 3550 }, { "loss": 2.3488, "grad_norm": 1.8176889419555664, "learning_rate": 5e-05, "epoch": 0.22763654991355575, "step": 3555 }, { "loss": 2.3174, "grad_norm": 1.8083289861679077, "learning_rate": 5e-05, "epoch": 0.22795671383748478, "step": 3560 }, { "loss": 2.3544, "grad_norm": 1.8767235279083252, "learning_rate": 5e-05, "epoch": 0.22827687776141384, "step": 3565 }, { "loss": 2.3371, "grad_norm": 1.7118626832962036, "learning_rate": 5e-05, "epoch": 0.2285970416853429, "step": 3570 }, { "loss": 2.3282, "grad_norm": 1.8422014713287354, "learning_rate": 5e-05, "epoch": 0.22891720560927195, "step": 3575 }, { "loss": 2.3305, "grad_norm": 1.787657380104065, "learning_rate": 5e-05, "epoch": 0.229237369533201, "step": 3580 }, { "loss": 2.3791, "grad_norm": 1.8549391031265259, "learning_rate": 5e-05, "epoch": 0.22955753345713006, "step": 3585 }, { "loss": 2.3312, "grad_norm": 1.923996090888977, "learning_rate": 5e-05, "epoch": 0.2298776973810591, "step": 3590 }, { "loss": 2.3529, "grad_norm": 1.9095416069030762, "learning_rate": 5e-05, "epoch": 0.23019786130498815, "step": 3595 }, { "loss": 2.3178, "grad_norm": 1.8591622114181519, "learning_rate": 5e-05, "epoch": 0.2305180252289172, "step": 3600 }, { "eval_loss": 2.183711528778076, "eval_runtime": 9.2433, "eval_samples_per_second": 221.566, "eval_steps_per_second": 27.696, "epoch": 0.2305180252289172, "step": 3600 }, { "loss": 2.3165, "grad_norm": 1.8757052421569824, "learning_rate": 5e-05, "epoch": 0.23083818915284626, "step": 3605 }, { "loss": 2.3444, "grad_norm": 1.7773499488830566, "learning_rate": 5e-05, "epoch": 0.23115835307677532, "step": 3610 }, { "loss": 2.3176, "grad_norm": 1.9299156665802002, "learning_rate": 5e-05, "epoch": 0.23147851700070435, "step": 3615 }, { "loss": 2.3268, "grad_norm": 1.8555759191513062, "learning_rate": 5e-05, "epoch": 0.2317986809246334, "step": 3620 }, { "loss": 2.3094, "grad_norm": 1.8463343381881714, "learning_rate": 5e-05, "epoch": 0.23211884484856246, "step": 3625 }, { "loss": 2.3227, "grad_norm": 1.9021217823028564, "learning_rate": 5e-05, "epoch": 0.23243900877249152, "step": 3630 }, { "loss": 2.33, "grad_norm": 1.8267807960510254, "learning_rate": 5e-05, "epoch": 0.23275917269642057, "step": 3635 }, { "loss": 2.3597, "grad_norm": 1.7418160438537598, "learning_rate": 5e-05, "epoch": 0.23307933662034963, "step": 3640 }, { "loss": 2.3096, "grad_norm": 1.9065451622009277, "learning_rate": 5e-05, "epoch": 0.23339950054427866, "step": 3645 }, { "loss": 2.3189, "grad_norm": 1.8539282083511353, "learning_rate": 5e-05, "epoch": 0.23371966446820772, "step": 3650 }, { "loss": 2.3683, "grad_norm": 1.8925061225891113, "learning_rate": 5e-05, "epoch": 0.23403982839213677, "step": 3655 }, { "loss": 2.3056, "grad_norm": 1.8763203620910645, "learning_rate": 5e-05, "epoch": 0.23435999231606583, "step": 3660 }, { "loss": 2.3628, "grad_norm": 1.9830697774887085, "learning_rate": 5e-05, "epoch": 0.23468015623999489, "step": 3665 }, { "loss": 2.3377, "grad_norm": 1.957559585571289, "learning_rate": 5e-05, "epoch": 0.23500032016392394, "step": 3670 }, { "loss": 2.3105, "grad_norm": 1.8156100511550903, "learning_rate": 5e-05, "epoch": 0.23532048408785297, "step": 3675 }, { "loss": 2.3339, "grad_norm": 1.8064128160476685, "learning_rate": 5e-05, "epoch": 0.23564064801178203, "step": 3680 }, { "loss": 2.3292, "grad_norm": 1.7643941640853882, "learning_rate": 5e-05, "epoch": 0.23596081193571108, "step": 3685 }, { "loss": 2.3173, "grad_norm": 1.9001303911209106, "learning_rate": 5e-05, "epoch": 0.23628097585964014, "step": 3690 }, { "loss": 2.319, "grad_norm": 1.7739763259887695, "learning_rate": 5e-05, "epoch": 0.2366011397835692, "step": 3695 }, { "loss": 2.3218, "grad_norm": 1.7484267950057983, "learning_rate": 5e-05, "epoch": 0.23692130370749823, "step": 3700 }, { "loss": 2.3221, "grad_norm": 1.8267314434051514, "learning_rate": 5e-05, "epoch": 0.23724146763142728, "step": 3705 }, { "loss": 2.3376, "grad_norm": 1.9269976615905762, "learning_rate": 5e-05, "epoch": 0.23756163155535634, "step": 3710 }, { "loss": 2.3206, "grad_norm": 1.820557951927185, "learning_rate": 5e-05, "epoch": 0.2378817954792854, "step": 3715 }, { "loss": 2.3379, "grad_norm": 1.788172960281372, "learning_rate": 5e-05, "epoch": 0.23820195940321445, "step": 3720 }, { "loss": 2.339, "grad_norm": 1.866925835609436, "learning_rate": 5e-05, "epoch": 0.2385221233271435, "step": 3725 }, { "loss": 2.3227, "grad_norm": 1.9489960670471191, "learning_rate": 5e-05, "epoch": 0.23884228725107254, "step": 3730 }, { "loss": 2.3115, "grad_norm": 1.8640096187591553, "learning_rate": 5e-05, "epoch": 0.2391624511750016, "step": 3735 }, { "loss": 2.3011, "grad_norm": 1.8240951299667358, "learning_rate": 5e-05, "epoch": 0.23948261509893065, "step": 3740 }, { "loss": 2.3257, "grad_norm": 1.8693873882293701, "learning_rate": 5e-05, "epoch": 0.2398027790228597, "step": 3745 }, { "loss": 2.3027, "grad_norm": 1.879884958267212, "learning_rate": 5e-05, "epoch": 0.24012294294678876, "step": 3750 }, { "loss": 2.3408, "grad_norm": 1.8470027446746826, "learning_rate": 5e-05, "epoch": 0.24044310687071782, "step": 3755 }, { "loss": 2.2935, "grad_norm": 1.8507801294326782, "learning_rate": 5e-05, "epoch": 0.24076327079464685, "step": 3760 }, { "loss": 2.3283, "grad_norm": 1.8579989671707153, "learning_rate": 5e-05, "epoch": 0.2410834347185759, "step": 3765 }, { "loss": 2.3095, "grad_norm": 1.8650803565979004, "learning_rate": 5e-05, "epoch": 0.24140359864250496, "step": 3770 }, { "loss": 2.3032, "grad_norm": 1.8193062543869019, "learning_rate": 5e-05, "epoch": 0.24172376256643402, "step": 3775 }, { "loss": 2.3259, "grad_norm": 2.0027434825897217, "learning_rate": 5e-05, "epoch": 0.24204392649036308, "step": 3780 }, { "loss": 2.3105, "grad_norm": 1.9422210454940796, "learning_rate": 5e-05, "epoch": 0.2423640904142921, "step": 3785 }, { "loss": 2.3438, "grad_norm": 1.9346174001693726, "learning_rate": 5e-05, "epoch": 0.24268425433822116, "step": 3790 }, { "loss": 2.317, "grad_norm": 1.8896723985671997, "learning_rate": 5e-05, "epoch": 0.24300441826215022, "step": 3795 }, { "loss": 2.3141, "grad_norm": 1.8078259229660034, "learning_rate": 5e-05, "epoch": 0.24332458218607927, "step": 3800 }, { "eval_loss": 2.184438705444336, "eval_runtime": 10.614, "eval_samples_per_second": 192.952, "eval_steps_per_second": 24.119, "epoch": 0.24332458218607927, "step": 3800 }, { "loss": 2.3237, "grad_norm": 1.9326097965240479, "learning_rate": 5e-05, "epoch": 0.24364474611000833, "step": 3805 }, { "loss": 2.3287, "grad_norm": 1.8264923095703125, "learning_rate": 5e-05, "epoch": 0.2439649100339374, "step": 3810 }, { "loss": 2.3117, "grad_norm": 1.9702720642089844, "learning_rate": 5e-05, "epoch": 0.24428507395786642, "step": 3815 }, { "loss": 2.306, "grad_norm": 1.810170292854309, "learning_rate": 5e-05, "epoch": 0.24460523788179547, "step": 3820 }, { "loss": 2.3408, "grad_norm": 1.8189213275909424, "learning_rate": 5e-05, "epoch": 0.24492540180572453, "step": 3825 }, { "loss": 2.3131, "grad_norm": 1.8908005952835083, "learning_rate": 5e-05, "epoch": 0.2452455657296536, "step": 3830 }, { "loss": 2.339, "grad_norm": 1.8011490106582642, "learning_rate": 5e-05, "epoch": 0.24556572965358264, "step": 3835 }, { "loss": 2.3175, "grad_norm": 1.7648205757141113, "learning_rate": 5e-05, "epoch": 0.2458858935775117, "step": 3840 }, { "loss": 2.325, "grad_norm": 1.8377348184585571, "learning_rate": 5e-05, "epoch": 0.24620605750144073, "step": 3845 }, { "loss": 2.2859, "grad_norm": 1.8196038007736206, "learning_rate": 5e-05, "epoch": 0.24652622142536978, "step": 3850 }, { "loss": 2.331, "grad_norm": 1.7730222940444946, "learning_rate": 5e-05, "epoch": 0.24684638534929884, "step": 3855 }, { "loss": 2.3507, "grad_norm": 1.8816814422607422, "learning_rate": 5e-05, "epoch": 0.2471665492732279, "step": 3860 }, { "loss": 2.3348, "grad_norm": 1.842856526374817, "learning_rate": 5e-05, "epoch": 0.24748671319715695, "step": 3865 }, { "loss": 2.3169, "grad_norm": 1.792005181312561, "learning_rate": 5e-05, "epoch": 0.24780687712108598, "step": 3870 }, { "loss": 2.3106, "grad_norm": 1.7900562286376953, "learning_rate": 5e-05, "epoch": 0.24812704104501504, "step": 3875 }, { "loss": 2.3323, "grad_norm": 1.7834545373916626, "learning_rate": 5e-05, "epoch": 0.2484472049689441, "step": 3880 }, { "loss": 2.3434, "grad_norm": 1.8184958696365356, "learning_rate": 5e-05, "epoch": 0.24876736889287315, "step": 3885 }, { "loss": 2.3054, "grad_norm": 1.8056672811508179, "learning_rate": 5e-05, "epoch": 0.2490875328168022, "step": 3890 }, { "loss": 2.3238, "grad_norm": 1.8434021472930908, "learning_rate": 5e-05, "epoch": 0.24940769674073127, "step": 3895 }, { "loss": 2.3059, "grad_norm": 1.7343634366989136, "learning_rate": 5e-05, "epoch": 0.2497278606646603, "step": 3900 }, { "loss": 2.325, "grad_norm": 1.7279424667358398, "learning_rate": 5e-05, "epoch": 0.2500480245885894, "step": 3905 }, { "loss": 2.323, "grad_norm": 1.7715774774551392, "learning_rate": 5e-05, "epoch": 0.2503681885125184, "step": 3910 }, { "loss": 2.2872, "grad_norm": 1.7786765098571777, "learning_rate": 5e-05, "epoch": 0.25068835243644744, "step": 3915 }, { "loss": 2.3408, "grad_norm": 1.8894507884979248, "learning_rate": 5e-05, "epoch": 0.2510085163603765, "step": 3920 }, { "loss": 2.3193, "grad_norm": 1.8058632612228394, "learning_rate": 5e-05, "epoch": 0.25132868028430555, "step": 3925 }, { "loss": 2.3168, "grad_norm": 1.7818254232406616, "learning_rate": 5e-05, "epoch": 0.25164884420823463, "step": 3930 }, { "loss": 2.3393, "grad_norm": 1.7840033769607544, "learning_rate": 5e-05, "epoch": 0.25196900813216366, "step": 3935 }, { "loss": 2.3405, "grad_norm": 1.8498218059539795, "learning_rate": 5e-05, "epoch": 0.2522891720560927, "step": 3940 }, { "loss": 2.3186, "grad_norm": 1.827964425086975, "learning_rate": 5e-05, "epoch": 0.2526093359800218, "step": 3945 }, { "loss": 2.3069, "grad_norm": 1.8498241901397705, "learning_rate": 5e-05, "epoch": 0.2529294999039508, "step": 3950 }, { "loss": 2.3419, "grad_norm": 1.7726775407791138, "learning_rate": 5e-05, "epoch": 0.2532496638278799, "step": 3955 }, { "loss": 2.3047, "grad_norm": 1.9088823795318604, "learning_rate": 5e-05, "epoch": 0.2535698277518089, "step": 3960 }, { "loss": 2.334, "grad_norm": 1.8803976774215698, "learning_rate": 5e-05, "epoch": 0.253889991675738, "step": 3965 }, { "loss": 2.2902, "grad_norm": 1.914686679840088, "learning_rate": 5e-05, "epoch": 0.25421015559966703, "step": 3970 }, { "loss": 2.3233, "grad_norm": 1.8192038536071777, "learning_rate": 5e-05, "epoch": 0.25453031952359606, "step": 3975 }, { "loss": 2.3367, "grad_norm": 1.8976246118545532, "learning_rate": 5e-05, "epoch": 0.25485048344752514, "step": 3980 }, { "loss": 2.2671, "grad_norm": 1.7621504068374634, "learning_rate": 5e-05, "epoch": 0.2551706473714542, "step": 3985 }, { "loss": 2.3326, "grad_norm": 1.912398338317871, "learning_rate": 5e-05, "epoch": 0.25549081129538326, "step": 3990 }, { "loss": 2.3008, "grad_norm": 2.027517557144165, "learning_rate": 5e-05, "epoch": 0.2558109752193123, "step": 3995 }, { "loss": 2.3146, "grad_norm": 1.934006690979004, "learning_rate": 5e-05, "epoch": 0.2561311391432413, "step": 4000 }, { "eval_loss": 2.177450180053711, "eval_runtime": 9.6248, "eval_samples_per_second": 212.784, "eval_steps_per_second": 26.598, "epoch": 0.2561311391432413, "step": 4000 }, { "loss": 2.2979, "grad_norm": 1.8951339721679688, "learning_rate": 5e-05, "epoch": 0.2564513030671704, "step": 4005 }, { "loss": 2.3171, "grad_norm": 1.7967989444732666, "learning_rate": 5e-05, "epoch": 0.25677146699109943, "step": 4010 }, { "loss": 2.3077, "grad_norm": 1.878688097000122, "learning_rate": 5e-05, "epoch": 0.2570916309150285, "step": 4015 }, { "loss": 2.2935, "grad_norm": 1.855326771736145, "learning_rate": 5e-05, "epoch": 0.25741179483895754, "step": 4020 }, { "loss": 2.3404, "grad_norm": 1.797782063484192, "learning_rate": 5e-05, "epoch": 0.25773195876288657, "step": 4025 }, { "loss": 2.2913, "grad_norm": 1.9316190481185913, "learning_rate": 5e-05, "epoch": 0.25805212268681565, "step": 4030 }, { "loss": 2.306, "grad_norm": 1.7924227714538574, "learning_rate": 5e-05, "epoch": 0.2583722866107447, "step": 4035 }, { "loss": 2.3185, "grad_norm": 1.8619980812072754, "learning_rate": 5e-05, "epoch": 0.25869245053467377, "step": 4040 }, { "loss": 2.3279, "grad_norm": 1.931126594543457, "learning_rate": 5e-05, "epoch": 0.2590126144586028, "step": 4045 }, { "loss": 2.3288, "grad_norm": 1.8355220556259155, "learning_rate": 5e-05, "epoch": 0.2593327783825319, "step": 4050 }, { "loss": 2.3015, "grad_norm": 1.8821378946304321, "learning_rate": 5e-05, "epoch": 0.2596529423064609, "step": 4055 }, { "loss": 2.3181, "grad_norm": 1.929376482963562, "learning_rate": 5e-05, "epoch": 0.25997310623038994, "step": 4060 }, { "loss": 2.3188, "grad_norm": 1.8176177740097046, "learning_rate": 5e-05, "epoch": 0.260293270154319, "step": 4065 }, { "loss": 2.3164, "grad_norm": 1.7493705749511719, "learning_rate": 5e-05, "epoch": 0.26061343407824805, "step": 4070 }, { "loss": 2.2848, "grad_norm": 1.8640022277832031, "learning_rate": 5e-05, "epoch": 0.26093359800217714, "step": 4075 }, { "loss": 2.3192, "grad_norm": 1.8012224435806274, "learning_rate": 5e-05, "epoch": 0.26125376192610616, "step": 4080 }, { "loss": 2.3191, "grad_norm": 1.9000436067581177, "learning_rate": 5e-05, "epoch": 0.2615739258500352, "step": 4085 }, { "loss": 2.3032, "grad_norm": 1.8532963991165161, "learning_rate": 5e-05, "epoch": 0.2618940897739643, "step": 4090 }, { "loss": 2.326, "grad_norm": 1.8395898342132568, "learning_rate": 5e-05, "epoch": 0.2622142536978933, "step": 4095 }, { "loss": 2.3354, "grad_norm": 1.7998299598693848, "learning_rate": 5e-05, "epoch": 0.2625344176218224, "step": 4100 }, { "loss": 2.3093, "grad_norm": 1.8509643077850342, "learning_rate": 5e-05, "epoch": 0.2628545815457514, "step": 4105 }, { "loss": 2.3287, "grad_norm": 1.9016660451889038, "learning_rate": 5e-05, "epoch": 0.26317474546968045, "step": 4110 }, { "loss": 2.2871, "grad_norm": 1.8604185581207275, "learning_rate": 5e-05, "epoch": 0.26349490939360953, "step": 4115 }, { "loss": 2.3072, "grad_norm": 1.842264175415039, "learning_rate": 5e-05, "epoch": 0.26381507331753856, "step": 4120 }, { "loss": 2.3312, "grad_norm": 1.7420934438705444, "learning_rate": 5e-05, "epoch": 0.26413523724146765, "step": 4125 }, { "loss": 2.3131, "grad_norm": 1.7676818370819092, "learning_rate": 5e-05, "epoch": 0.2644554011653967, "step": 4130 }, { "loss": 2.3026, "grad_norm": 1.7885444164276123, "learning_rate": 5e-05, "epoch": 0.26477556508932576, "step": 4135 }, { "loss": 2.3201, "grad_norm": 1.6721593141555786, "learning_rate": 5e-05, "epoch": 0.2650957290132548, "step": 4140 }, { "loss": 2.3047, "grad_norm": 1.8331459760665894, "learning_rate": 5e-05, "epoch": 0.2654158929371838, "step": 4145 }, { "loss": 2.3259, "grad_norm": 1.735121250152588, "learning_rate": 5e-05, "epoch": 0.2657360568611129, "step": 4150 }, { "loss": 2.2825, "grad_norm": 1.8103950023651123, "learning_rate": 5e-05, "epoch": 0.26605622078504193, "step": 4155 }, { "loss": 2.3094, "grad_norm": 1.833533525466919, "learning_rate": 5e-05, "epoch": 0.266376384708971, "step": 4160 }, { "loss": 2.3258, "grad_norm": 1.7850996255874634, "learning_rate": 5e-05, "epoch": 0.26669654863290004, "step": 4165 }, { "loss": 2.314, "grad_norm": 1.8073853254318237, "learning_rate": 5e-05, "epoch": 0.2670167125568291, "step": 4170 }, { "loss": 2.33, "grad_norm": 1.7849806547164917, "learning_rate": 5e-05, "epoch": 0.26733687648075816, "step": 4175 }, { "loss": 2.3151, "grad_norm": 1.7261276245117188, "learning_rate": 5e-05, "epoch": 0.2676570404046872, "step": 4180 }, { "loss": 2.3036, "grad_norm": 1.763243556022644, "learning_rate": 5e-05, "epoch": 0.26797720432861627, "step": 4185 }, { "loss": 2.3306, "grad_norm": 1.8771343231201172, "learning_rate": 5e-05, "epoch": 0.2682973682525453, "step": 4190 }, { "loss": 2.3093, "grad_norm": 1.8824447393417358, "learning_rate": 5e-05, "epoch": 0.2686175321764743, "step": 4195 }, { "loss": 2.3243, "grad_norm": 1.757983684539795, "learning_rate": 5e-05, "epoch": 0.2689376961004034, "step": 4200 }, { "eval_loss": 2.162459135055542, "eval_runtime": 12.641, "eval_samples_per_second": 162.012, "eval_steps_per_second": 20.252, "epoch": 0.2689376961004034, "step": 4200 }, { "loss": 2.3164, "grad_norm": 1.7963429689407349, "learning_rate": 5e-05, "epoch": 0.26925786002433244, "step": 4205 }, { "loss": 2.3378, "grad_norm": 1.9342796802520752, "learning_rate": 5e-05, "epoch": 0.2695780239482615, "step": 4210 }, { "loss": 2.3121, "grad_norm": 1.863183856010437, "learning_rate": 5e-05, "epoch": 0.26989818787219055, "step": 4215 }, { "loss": 2.2946, "grad_norm": 1.7715400457382202, "learning_rate": 5e-05, "epoch": 0.27021835179611964, "step": 4220 }, { "loss": 2.2986, "grad_norm": 1.806707739830017, "learning_rate": 5e-05, "epoch": 0.27053851572004867, "step": 4225 }, { "loss": 2.3095, "grad_norm": 1.8026810884475708, "learning_rate": 5e-05, "epoch": 0.2708586796439777, "step": 4230 }, { "loss": 2.3165, "grad_norm": 1.7845863103866577, "learning_rate": 5e-05, "epoch": 0.2711788435679068, "step": 4235 }, { "loss": 2.3014, "grad_norm": 1.864893913269043, "learning_rate": 5e-05, "epoch": 0.2714990074918358, "step": 4240 }, { "loss": 2.3238, "grad_norm": 1.7305742502212524, "learning_rate": 5e-05, "epoch": 0.2718191714157649, "step": 4245 }, { "loss": 2.291, "grad_norm": 1.7948355674743652, "learning_rate": 5e-05, "epoch": 0.2721393353396939, "step": 4250 }, { "loss": 2.3144, "grad_norm": 1.7819257974624634, "learning_rate": 5e-05, "epoch": 0.27245949926362295, "step": 4255 }, { "loss": 2.327, "grad_norm": 1.8523086309432983, "learning_rate": 5e-05, "epoch": 0.27277966318755204, "step": 4260 }, { "loss": 2.3293, "grad_norm": 1.7722643613815308, "learning_rate": 5e-05, "epoch": 0.27309982711148106, "step": 4265 }, { "loss": 2.303, "grad_norm": 1.8291378021240234, "learning_rate": 5e-05, "epoch": 0.27341999103541015, "step": 4270 }, { "loss": 2.2893, "grad_norm": 1.8277583122253418, "learning_rate": 5e-05, "epoch": 0.2737401549593392, "step": 4275 }, { "loss": 2.3348, "grad_norm": 1.8024441003799438, "learning_rate": 5e-05, "epoch": 0.2740603188832682, "step": 4280 }, { "loss": 2.3024, "grad_norm": 1.8651007413864136, "learning_rate": 5e-05, "epoch": 0.2743804828071973, "step": 4285 }, { "loss": 2.327, "grad_norm": 1.8744381666183472, "learning_rate": 5e-05, "epoch": 0.2747006467311263, "step": 4290 }, { "loss": 2.3225, "grad_norm": 1.8396573066711426, "learning_rate": 5e-05, "epoch": 0.2750208106550554, "step": 4295 }, { "loss": 2.2918, "grad_norm": 1.7585549354553223, "learning_rate": 5e-05, "epoch": 0.27534097457898443, "step": 4300 }, { "loss": 2.3135, "grad_norm": 1.8332717418670654, "learning_rate": 5e-05, "epoch": 0.2756611385029135, "step": 4305 }, { "loss": 2.3176, "grad_norm": 1.8986752033233643, "learning_rate": 5e-05, "epoch": 0.27598130242684255, "step": 4310 }, { "loss": 2.2758, "grad_norm": 1.7375805377960205, "learning_rate": 5e-05, "epoch": 0.2763014663507716, "step": 4315 }, { "loss": 2.2786, "grad_norm": 1.838408350944519, "learning_rate": 5e-05, "epoch": 0.27662163027470066, "step": 4320 }, { "loss": 2.3119, "grad_norm": 1.9200383424758911, "learning_rate": 5e-05, "epoch": 0.2769417941986297, "step": 4325 }, { "loss": 2.3082, "grad_norm": 1.8573769330978394, "learning_rate": 5e-05, "epoch": 0.27726195812255877, "step": 4330 }, { "loss": 2.2946, "grad_norm": 1.819273829460144, "learning_rate": 5e-05, "epoch": 0.2775821220464878, "step": 4335 }, { "loss": 2.28, "grad_norm": 1.8978952169418335, "learning_rate": 5e-05, "epoch": 0.27790228597041683, "step": 4340 }, { "loss": 2.3058, "grad_norm": 1.7782193422317505, "learning_rate": 5e-05, "epoch": 0.2782224498943459, "step": 4345 }, { "loss": 2.3189, "grad_norm": 1.831231951713562, "learning_rate": 5e-05, "epoch": 0.27854261381827494, "step": 4350 }, { "loss": 2.314, "grad_norm": 1.830064296722412, "learning_rate": 5e-05, "epoch": 0.278862777742204, "step": 4355 }, { "loss": 2.2983, "grad_norm": 1.8492834568023682, "learning_rate": 5e-05, "epoch": 0.27918294166613306, "step": 4360 }, { "loss": 2.3379, "grad_norm": 1.841322898864746, "learning_rate": 5e-05, "epoch": 0.2795031055900621, "step": 4365 }, { "loss": 2.3309, "grad_norm": 1.8109886646270752, "learning_rate": 5e-05, "epoch": 0.27982326951399117, "step": 4370 }, { "loss": 2.2967, "grad_norm": 1.9388337135314941, "learning_rate": 5e-05, "epoch": 0.2801434334379202, "step": 4375 }, { "loss": 2.3163, "grad_norm": 1.8596948385238647, "learning_rate": 5e-05, "epoch": 0.2804635973618493, "step": 4380 }, { "loss": 2.2899, "grad_norm": 1.8299187421798706, "learning_rate": 5e-05, "epoch": 0.2807837612857783, "step": 4385 }, { "loss": 2.3007, "grad_norm": 1.819220781326294, "learning_rate": 5e-05, "epoch": 0.2811039252097074, "step": 4390 }, { "loss": 2.312, "grad_norm": 1.8518681526184082, "learning_rate": 5e-05, "epoch": 0.2814240891336364, "step": 4395 }, { "loss": 2.3215, "grad_norm": 1.8841506242752075, "learning_rate": 5e-05, "epoch": 0.28174425305756545, "step": 4400 }, { "eval_loss": 2.1684622764587402, "eval_runtime": 9.6418, "eval_samples_per_second": 212.408, "eval_steps_per_second": 26.551, "epoch": 0.28174425305756545, "step": 4400 }, { "loss": 2.2986, "grad_norm": 1.71231210231781, "learning_rate": 5e-05, "epoch": 0.28206441698149454, "step": 4405 }, { "loss": 2.2829, "grad_norm": 1.7305104732513428, "learning_rate": 5e-05, "epoch": 0.28238458090542357, "step": 4410 }, { "loss": 2.2749, "grad_norm": 1.8442025184631348, "learning_rate": 5e-05, "epoch": 0.28270474482935265, "step": 4415 }, { "loss": 2.3002, "grad_norm": 1.8370575904846191, "learning_rate": 5e-05, "epoch": 0.2830249087532817, "step": 4420 }, { "loss": 2.2997, "grad_norm": 1.8042954206466675, "learning_rate": 5e-05, "epoch": 0.2833450726772107, "step": 4425 }, { "loss": 2.3224, "grad_norm": 1.7841765880584717, "learning_rate": 5e-05, "epoch": 0.2836652366011398, "step": 4430 }, { "loss": 2.3205, "grad_norm": 1.6575603485107422, "learning_rate": 5e-05, "epoch": 0.2839854005250688, "step": 4435 }, { "loss": 2.301, "grad_norm": 1.8698128461837769, "learning_rate": 5e-05, "epoch": 0.2843055644489979, "step": 4440 }, { "loss": 2.3192, "grad_norm": 1.8466641902923584, "learning_rate": 5e-05, "epoch": 0.28462572837292693, "step": 4445 }, { "loss": 2.2704, "grad_norm": 1.7934186458587646, "learning_rate": 5e-05, "epoch": 0.28494589229685596, "step": 4450 }, { "loss": 2.3225, "grad_norm": 1.770643949508667, "learning_rate": 5e-05, "epoch": 0.28526605622078505, "step": 4455 }, { "loss": 2.3238, "grad_norm": 1.7914665937423706, "learning_rate": 5e-05, "epoch": 0.2855862201447141, "step": 4460 }, { "loss": 2.3193, "grad_norm": 1.7819799184799194, "learning_rate": 5e-05, "epoch": 0.28590638406864316, "step": 4465 }, { "loss": 2.2642, "grad_norm": 1.7854515314102173, "learning_rate": 5e-05, "epoch": 0.2862265479925722, "step": 4470 }, { "loss": 2.3097, "grad_norm": 1.703332543373108, "learning_rate": 5e-05, "epoch": 0.2865467119165013, "step": 4475 }, { "loss": 2.3122, "grad_norm": 1.7654129266738892, "learning_rate": 5e-05, "epoch": 0.2868668758404303, "step": 4480 }, { "loss": 2.3142, "grad_norm": 1.8920791149139404, "learning_rate": 5e-05, "epoch": 0.28718703976435933, "step": 4485 }, { "loss": 2.3208, "grad_norm": 1.824573278427124, "learning_rate": 5e-05, "epoch": 0.2875072036882884, "step": 4490 }, { "loss": 2.3027, "grad_norm": 1.7249481678009033, "learning_rate": 5e-05, "epoch": 0.28782736761221744, "step": 4495 }, { "loss": 2.2931, "grad_norm": 1.8988478183746338, "learning_rate": 5e-05, "epoch": 0.28814753153614653, "step": 4500 }, { "loss": 2.3057, "grad_norm": 1.8929831981658936, "learning_rate": 5e-05, "epoch": 0.28846769546007556, "step": 4505 }, { "loss": 2.313, "grad_norm": 1.8110840320587158, "learning_rate": 5e-05, "epoch": 0.2887878593840046, "step": 4510 }, { "loss": 2.2986, "grad_norm": 1.7756503820419312, "learning_rate": 5e-05, "epoch": 0.28910802330793367, "step": 4515 }, { "loss": 2.2993, "grad_norm": 1.8048218488693237, "learning_rate": 5e-05, "epoch": 0.2894281872318627, "step": 4520 }, { "loss": 2.283, "grad_norm": 1.7407152652740479, "learning_rate": 5e-05, "epoch": 0.2897483511557918, "step": 4525 }, { "loss": 2.3091, "grad_norm": 1.909650206565857, "learning_rate": 5e-05, "epoch": 0.2900685150797208, "step": 4530 }, { "loss": 2.3102, "grad_norm": 1.803214430809021, "learning_rate": 5e-05, "epoch": 0.29038867900364984, "step": 4535 }, { "loss": 2.2942, "grad_norm": 1.9347703456878662, "learning_rate": 5e-05, "epoch": 0.2907088429275789, "step": 4540 }, { "loss": 2.2967, "grad_norm": 1.8654407262802124, "learning_rate": 5e-05, "epoch": 0.29102900685150795, "step": 4545 }, { "loss": 2.3208, "grad_norm": 1.7373706102371216, "learning_rate": 5e-05, "epoch": 0.29134917077543704, "step": 4550 }, { "loss": 2.3188, "grad_norm": 1.7621229887008667, "learning_rate": 5e-05, "epoch": 0.29166933469936607, "step": 4555 }, { "loss": 2.2971, "grad_norm": 1.7987569570541382, "learning_rate": 5e-05, "epoch": 0.29198949862329515, "step": 4560 }, { "loss": 2.2926, "grad_norm": 1.8752938508987427, "learning_rate": 5e-05, "epoch": 0.2923096625472242, "step": 4565 }, { "loss": 2.294, "grad_norm": 1.809169888496399, "learning_rate": 5e-05, "epoch": 0.2926298264711532, "step": 4570 }, { "loss": 2.3059, "grad_norm": 1.8496021032333374, "learning_rate": 5e-05, "epoch": 0.2929499903950823, "step": 4575 }, { "loss": 2.2714, "grad_norm": 1.8275306224822998, "learning_rate": 5e-05, "epoch": 0.2932701543190113, "step": 4580 }, { "loss": 2.2797, "grad_norm": 1.8231137990951538, "learning_rate": 5e-05, "epoch": 0.2935903182429404, "step": 4585 }, { "loss": 2.2901, "grad_norm": 1.7881653308868408, "learning_rate": 5e-05, "epoch": 0.29391048216686944, "step": 4590 }, { "loss": 2.3043, "grad_norm": 1.9115880727767944, "learning_rate": 5e-05, "epoch": 0.29423064609079846, "step": 4595 }, { "loss": 2.3292, "grad_norm": 1.8070696592330933, "learning_rate": 5e-05, "epoch": 0.29455081001472755, "step": 4600 }, { "eval_loss": 2.1569859981536865, "eval_runtime": 9.5207, "eval_samples_per_second": 215.111, "eval_steps_per_second": 26.889, "epoch": 0.29455081001472755, "step": 4600 }, { "loss": 2.32, "grad_norm": 1.7979247570037842, "learning_rate": 5e-05, "epoch": 0.2948709739386566, "step": 4605 }, { "loss": 2.2949, "grad_norm": 1.7743096351623535, "learning_rate": 5e-05, "epoch": 0.29519113786258566, "step": 4610 }, { "loss": 2.2916, "grad_norm": 1.7690064907073975, "learning_rate": 5e-05, "epoch": 0.2955113017865147, "step": 4615 }, { "loss": 2.3084, "grad_norm": 1.9324722290039062, "learning_rate": 5e-05, "epoch": 0.2958314657104437, "step": 4620 }, { "loss": 2.2975, "grad_norm": 1.7818751335144043, "learning_rate": 5e-05, "epoch": 0.2961516296343728, "step": 4625 }, { "loss": 2.2689, "grad_norm": 1.7577718496322632, "learning_rate": 5e-05, "epoch": 0.29647179355830183, "step": 4630 }, { "loss": 2.2863, "grad_norm": 1.7863922119140625, "learning_rate": 5e-05, "epoch": 0.2967919574822309, "step": 4635 }, { "loss": 2.2954, "grad_norm": 1.8004027605056763, "learning_rate": 5e-05, "epoch": 0.29711212140615995, "step": 4640 }, { "loss": 2.3267, "grad_norm": 1.7635235786437988, "learning_rate": 5e-05, "epoch": 0.29743228533008903, "step": 4645 }, { "loss": 2.3047, "grad_norm": 1.814304232597351, "learning_rate": 5e-05, "epoch": 0.29775244925401806, "step": 4650 }, { "loss": 2.2948, "grad_norm": 1.8464570045471191, "learning_rate": 5e-05, "epoch": 0.2980726131779471, "step": 4655 }, { "loss": 2.2944, "grad_norm": 1.7262645959854126, "learning_rate": 5e-05, "epoch": 0.29839277710187617, "step": 4660 }, { "loss": 2.3012, "grad_norm": 1.7750794887542725, "learning_rate": 5e-05, "epoch": 0.2987129410258052, "step": 4665 }, { "loss": 2.2965, "grad_norm": 1.7386796474456787, "learning_rate": 5e-05, "epoch": 0.2990331049497343, "step": 4670 }, { "loss": 2.2986, "grad_norm": 1.8167015314102173, "learning_rate": 5e-05, "epoch": 0.2993532688736633, "step": 4675 }, { "loss": 2.2928, "grad_norm": 1.7372899055480957, "learning_rate": 5e-05, "epoch": 0.29967343279759234, "step": 4680 }, { "loss": 2.2767, "grad_norm": 1.793278694152832, "learning_rate": 5e-05, "epoch": 0.2999935967215214, "step": 4685 }, { "loss": 2.2887, "grad_norm": 1.93364679813385, "learning_rate": 5e-05, "epoch": 0.30031376064545046, "step": 4690 }, { "loss": 2.3015, "grad_norm": 1.7167513370513916, "learning_rate": 5e-05, "epoch": 0.30063392456937954, "step": 4695 }, { "loss": 2.2635, "grad_norm": 1.7310161590576172, "learning_rate": 5e-05, "epoch": 0.30095408849330857, "step": 4700 }, { "loss": 2.2942, "grad_norm": 1.7478691339492798, "learning_rate": 5e-05, "epoch": 0.3012742524172376, "step": 4705 }, { "loss": 2.2552, "grad_norm": 1.8167970180511475, "learning_rate": 5e-05, "epoch": 0.3015944163411667, "step": 4710 }, { "loss": 2.2812, "grad_norm": 1.6952241659164429, "learning_rate": 5e-05, "epoch": 0.3019145802650957, "step": 4715 }, { "loss": 2.2908, "grad_norm": 1.7196714878082275, "learning_rate": 5e-05, "epoch": 0.3022347441890248, "step": 4720 }, { "loss": 2.2867, "grad_norm": 1.7747132778167725, "learning_rate": 5e-05, "epoch": 0.3025549081129538, "step": 4725 }, { "loss": 2.2695, "grad_norm": 1.8552742004394531, "learning_rate": 5e-05, "epoch": 0.3028750720368829, "step": 4730 }, { "loss": 2.264, "grad_norm": 1.719187617301941, "learning_rate": 5e-05, "epoch": 0.30319523596081194, "step": 4735 }, { "loss": 2.2794, "grad_norm": 1.8311821222305298, "learning_rate": 5e-05, "epoch": 0.30351539988474097, "step": 4740 }, { "loss": 2.2649, "grad_norm": 1.7115150690078735, "learning_rate": 5e-05, "epoch": 0.30383556380867005, "step": 4745 }, { "loss": 2.299, "grad_norm": 1.7218992710113525, "learning_rate": 5e-05, "epoch": 0.3041557277325991, "step": 4750 }, { "loss": 2.3031, "grad_norm": 1.7692986726760864, "learning_rate": 5e-05, "epoch": 0.30447589165652816, "step": 4755 }, { "loss": 2.3133, "grad_norm": 1.7613261938095093, "learning_rate": 5e-05, "epoch": 0.3047960555804572, "step": 4760 }, { "loss": 2.3096, "grad_norm": 1.762600064277649, "learning_rate": 5e-05, "epoch": 0.3051162195043862, "step": 4765 }, { "loss": 2.2789, "grad_norm": 1.7768152952194214, "learning_rate": 5e-05, "epoch": 0.3054363834283153, "step": 4770 }, { "loss": 2.3094, "grad_norm": 1.8207039833068848, "learning_rate": 5e-05, "epoch": 0.30575654735224433, "step": 4775 }, { "loss": 2.2842, "grad_norm": 1.7720569372177124, "learning_rate": 5e-05, "epoch": 0.3060767112761734, "step": 4780 }, { "loss": 2.3167, "grad_norm": 1.7781318426132202, "learning_rate": 5e-05, "epoch": 0.30639687520010245, "step": 4785 }, { "loss": 2.2924, "grad_norm": 1.797167181968689, "learning_rate": 5e-05, "epoch": 0.3067170391240315, "step": 4790 }, { "loss": 2.2818, "grad_norm": 1.7862792015075684, "learning_rate": 5e-05, "epoch": 0.30703720304796056, "step": 4795 }, { "loss": 2.2911, "grad_norm": 1.913051724433899, "learning_rate": 5e-05, "epoch": 0.3073573669718896, "step": 4800 }, { "eval_loss": 2.1567511558532715, "eval_runtime": 9.5969, "eval_samples_per_second": 213.403, "eval_steps_per_second": 26.675, "epoch": 0.3073573669718896, "step": 4800 }, { "loss": 2.3157, "grad_norm": 1.9030219316482544, "learning_rate": 5e-05, "epoch": 0.3076775308958187, "step": 4805 }, { "loss": 2.2967, "grad_norm": 1.981708288192749, "learning_rate": 5e-05, "epoch": 0.3079976948197477, "step": 4810 }, { "loss": 2.2553, "grad_norm": 1.7999526262283325, "learning_rate": 5e-05, "epoch": 0.3083178587436768, "step": 4815 }, { "loss": 2.2993, "grad_norm": 1.7089029550552368, "learning_rate": 5e-05, "epoch": 0.3086380226676058, "step": 4820 }, { "loss": 2.2782, "grad_norm": 1.7940775156021118, "learning_rate": 5e-05, "epoch": 0.30895818659153484, "step": 4825 }, { "loss": 2.2651, "grad_norm": 1.896036148071289, "learning_rate": 5e-05, "epoch": 0.30927835051546393, "step": 4830 }, { "loss": 2.2781, "grad_norm": 1.7181426286697388, "learning_rate": 5e-05, "epoch": 0.30959851443939296, "step": 4835 }, { "loss": 2.3079, "grad_norm": 1.7568175792694092, "learning_rate": 5e-05, "epoch": 0.30991867836332204, "step": 4840 }, { "loss": 2.3256, "grad_norm": 1.7117818593978882, "learning_rate": 5e-05, "epoch": 0.31023884228725107, "step": 4845 }, { "loss": 2.2652, "grad_norm": 1.8535692691802979, "learning_rate": 5e-05, "epoch": 0.3105590062111801, "step": 4850 }, { "loss": 2.3082, "grad_norm": 1.8518201112747192, "learning_rate": 5e-05, "epoch": 0.3108791701351092, "step": 4855 }, { "loss": 2.2893, "grad_norm": 1.875934362411499, "learning_rate": 5e-05, "epoch": 0.3111993340590382, "step": 4860 }, { "loss": 2.2672, "grad_norm": 1.743920087814331, "learning_rate": 5e-05, "epoch": 0.3115194979829673, "step": 4865 }, { "loss": 2.2895, "grad_norm": 1.7549186944961548, "learning_rate": 5e-05, "epoch": 0.3118396619068963, "step": 4870 }, { "loss": 2.282, "grad_norm": 1.7128772735595703, "learning_rate": 5e-05, "epoch": 0.31215982583082535, "step": 4875 }, { "loss": 2.3079, "grad_norm": 1.7349681854248047, "learning_rate": 5e-05, "epoch": 0.31247998975475444, "step": 4880 }, { "loss": 2.2719, "grad_norm": 1.8261305093765259, "learning_rate": 5e-05, "epoch": 0.31280015367868347, "step": 4885 }, { "loss": 2.2801, "grad_norm": 1.7922636270523071, "learning_rate": 5e-05, "epoch": 0.31312031760261255, "step": 4890 }, { "loss": 2.2963, "grad_norm": 1.8621406555175781, "learning_rate": 5e-05, "epoch": 0.3134404815265416, "step": 4895 }, { "loss": 2.2842, "grad_norm": 1.7960196733474731, "learning_rate": 5e-05, "epoch": 0.31376064545047067, "step": 4900 }, { "loss": 2.2535, "grad_norm": 1.7535030841827393, "learning_rate": 5e-05, "epoch": 0.3140808093743997, "step": 4905 }, { "loss": 2.2706, "grad_norm": 1.7128777503967285, "learning_rate": 5e-05, "epoch": 0.3144009732983287, "step": 4910 }, { "loss": 2.2963, "grad_norm": 1.7324950695037842, "learning_rate": 5e-05, "epoch": 0.3147211372222578, "step": 4915 }, { "loss": 2.2996, "grad_norm": 1.759783387184143, "learning_rate": 5e-05, "epoch": 0.31504130114618684, "step": 4920 }, { "loss": 2.2934, "grad_norm": 1.799742579460144, "learning_rate": 5e-05, "epoch": 0.3153614650701159, "step": 4925 }, { "loss": 2.2862, "grad_norm": 1.726730465888977, "learning_rate": 5e-05, "epoch": 0.31568162899404495, "step": 4930 }, { "loss": 2.2912, "grad_norm": 1.8253145217895508, "learning_rate": 5e-05, "epoch": 0.316001792917974, "step": 4935 }, { "loss": 2.3154, "grad_norm": 1.7888239622116089, "learning_rate": 5e-05, "epoch": 0.31632195684190306, "step": 4940 }, { "loss": 2.291, "grad_norm": 1.818763256072998, "learning_rate": 5e-05, "epoch": 0.3166421207658321, "step": 4945 }, { "loss": 2.2879, "grad_norm": 1.679724097251892, "learning_rate": 5e-05, "epoch": 0.3169622846897612, "step": 4950 }, { "loss": 2.276, "grad_norm": 1.7187193632125854, "learning_rate": 5e-05, "epoch": 0.3172824486136902, "step": 4955 }, { "loss": 2.3043, "grad_norm": 1.8776874542236328, "learning_rate": 5e-05, "epoch": 0.3176026125376193, "step": 4960 }, { "loss": 2.299, "grad_norm": 1.8387751579284668, "learning_rate": 5e-05, "epoch": 0.3179227764615483, "step": 4965 }, { "loss": 2.2942, "grad_norm": 1.8348480463027954, "learning_rate": 5e-05, "epoch": 0.31824294038547735, "step": 4970 }, { "loss": 2.2789, "grad_norm": 1.76790189743042, "learning_rate": 5e-05, "epoch": 0.31856310430940643, "step": 4975 }, { "loss": 2.2821, "grad_norm": 1.7413114309310913, "learning_rate": 5e-05, "epoch": 0.31888326823333546, "step": 4980 }, { "loss": 2.2802, "grad_norm": 1.720826268196106, "learning_rate": 5e-05, "epoch": 0.31920343215726454, "step": 4985 }, { "loss": 2.3128, "grad_norm": 1.6995984315872192, "learning_rate": 5e-05, "epoch": 0.3195235960811936, "step": 4990 }, { "loss": 2.3075, "grad_norm": 1.8378366231918335, "learning_rate": 5e-05, "epoch": 0.3198437600051226, "step": 4995 }, { "loss": 2.3002, "grad_norm": 1.812118411064148, "learning_rate": 5e-05, "epoch": 0.3201639239290517, "step": 5000 }, { "eval_loss": 2.1544718742370605, "eval_runtime": 9.2883, "eval_samples_per_second": 220.493, "eval_steps_per_second": 27.562, "epoch": 0.3201639239290517, "step": 5000 }, { "loss": 2.2662, "grad_norm": 1.87315833568573, "learning_rate": 5e-05, "epoch": 0.3204840878529807, "step": 5005 }, { "loss": 2.3016, "grad_norm": 1.7631300687789917, "learning_rate": 5e-05, "epoch": 0.3208042517769098, "step": 5010 }, { "loss": 2.3032, "grad_norm": 1.8889778852462769, "learning_rate": 5e-05, "epoch": 0.32112441570083883, "step": 5015 }, { "loss": 2.3028, "grad_norm": 1.7224068641662598, "learning_rate": 5e-05, "epoch": 0.32144457962476786, "step": 5020 }, { "loss": 2.2573, "grad_norm": 1.7411279678344727, "learning_rate": 5e-05, "epoch": 0.32176474354869694, "step": 5025 }, { "loss": 2.2797, "grad_norm": 2.0215229988098145, "learning_rate": 5e-05, "epoch": 0.32208490747262597, "step": 5030 }, { "loss": 2.2683, "grad_norm": 1.7024788856506348, "learning_rate": 5e-05, "epoch": 0.32240507139655505, "step": 5035 }, { "loss": 2.2709, "grad_norm": 1.871773600578308, "learning_rate": 5e-05, "epoch": 0.3227252353204841, "step": 5040 }, { "loss": 2.28, "grad_norm": 1.672973394393921, "learning_rate": 5e-05, "epoch": 0.32304539924441317, "step": 5045 }, { "loss": 2.2829, "grad_norm": 1.955171823501587, "learning_rate": 5e-05, "epoch": 0.3233655631683422, "step": 5050 }, { "loss": 2.2957, "grad_norm": 1.820365071296692, "learning_rate": 5e-05, "epoch": 0.3236857270922712, "step": 5055 }, { "loss": 2.2904, "grad_norm": 1.8486545085906982, "learning_rate": 5e-05, "epoch": 0.3240058910162003, "step": 5060 }, { "loss": 2.297, "grad_norm": 1.7132619619369507, "learning_rate": 5e-05, "epoch": 0.32432605494012934, "step": 5065 }, { "loss": 2.2782, "grad_norm": 1.8040876388549805, "learning_rate": 5e-05, "epoch": 0.3246462188640584, "step": 5070 }, { "loss": 2.2734, "grad_norm": 1.7798943519592285, "learning_rate": 5e-05, "epoch": 0.32496638278798745, "step": 5075 }, { "loss": 2.2636, "grad_norm": 1.7690693140029907, "learning_rate": 5e-05, "epoch": 0.3252865467119165, "step": 5080 }, { "loss": 2.2689, "grad_norm": 1.793588638305664, "learning_rate": 5e-05, "epoch": 0.32560671063584556, "step": 5085 }, { "loss": 2.3057, "grad_norm": 1.7516108751296997, "learning_rate": 5e-05, "epoch": 0.3259268745597746, "step": 5090 }, { "loss": 2.2766, "grad_norm": 1.7585774660110474, "learning_rate": 5e-05, "epoch": 0.3262470384837037, "step": 5095 }, { "loss": 2.3102, "grad_norm": 1.7661858797073364, "learning_rate": 5e-05, "epoch": 0.3265672024076327, "step": 5100 }, { "loss": 2.3072, "grad_norm": 1.7506427764892578, "learning_rate": 5e-05, "epoch": 0.32688736633156174, "step": 5105 }, { "loss": 2.2912, "grad_norm": 1.7459840774536133, "learning_rate": 5e-05, "epoch": 0.3272075302554908, "step": 5110 }, { "loss": 2.2875, "grad_norm": 1.7619469165802002, "learning_rate": 5e-05, "epoch": 0.32752769417941985, "step": 5115 }, { "loss": 2.275, "grad_norm": 1.7539411783218384, "learning_rate": 5e-05, "epoch": 0.32784785810334893, "step": 5120 }, { "loss": 2.2907, "grad_norm": 1.726323127746582, "learning_rate": 5e-05, "epoch": 0.32816802202727796, "step": 5125 }, { "loss": 2.2491, "grad_norm": 1.6898913383483887, "learning_rate": 5e-05, "epoch": 0.32848818595120705, "step": 5130 }, { "loss": 2.3008, "grad_norm": 1.7721654176712036, "learning_rate": 5e-05, "epoch": 0.3288083498751361, "step": 5135 }, { "loss": 2.2871, "grad_norm": 1.859742522239685, "learning_rate": 5e-05, "epoch": 0.3291285137990651, "step": 5140 }, { "loss": 2.2713, "grad_norm": 1.8325496912002563, "learning_rate": 5e-05, "epoch": 0.3294486777229942, "step": 5145 }, { "loss": 2.2931, "grad_norm": 1.845747709274292, "learning_rate": 5e-05, "epoch": 0.3297688416469232, "step": 5150 }, { "loss": 2.2746, "grad_norm": 1.8495094776153564, "learning_rate": 5e-05, "epoch": 0.3300890055708523, "step": 5155 }, { "loss": 2.2889, "grad_norm": 1.7559887170791626, "learning_rate": 5e-05, "epoch": 0.33040916949478133, "step": 5160 }, { "loss": 2.3083, "grad_norm": 1.7825977802276611, "learning_rate": 5e-05, "epoch": 0.33072933341871036, "step": 5165 }, { "loss": 2.2705, "grad_norm": 1.788145899772644, "learning_rate": 5e-05, "epoch": 0.33104949734263944, "step": 5170 }, { "loss": 2.2735, "grad_norm": 1.755177617073059, "learning_rate": 5e-05, "epoch": 0.33136966126656847, "step": 5175 }, { "loss": 2.2677, "grad_norm": 1.7675113677978516, "learning_rate": 5e-05, "epoch": 0.33168982519049756, "step": 5180 }, { "loss": 2.3125, "grad_norm": 1.7566906213760376, "learning_rate": 5e-05, "epoch": 0.3320099891144266, "step": 5185 }, { "loss": 2.2741, "grad_norm": 1.8251054286956787, "learning_rate": 5e-05, "epoch": 0.3323301530383556, "step": 5190 }, { "loss": 2.255, "grad_norm": 1.815388560295105, "learning_rate": 5e-05, "epoch": 0.3326503169622847, "step": 5195 }, { "loss": 2.3014, "grad_norm": 1.8509902954101562, "learning_rate": 5e-05, "epoch": 0.3329704808862137, "step": 5200 }, { "eval_loss": 2.1533737182617188, "eval_runtime": 13.3544, "eval_samples_per_second": 153.358, "eval_steps_per_second": 19.17, "epoch": 0.3329704808862137, "step": 5200 }, { "loss": 2.2773, "grad_norm": 1.8529142141342163, "learning_rate": 5e-05, "epoch": 0.3332906448101428, "step": 5205 }, { "loss": 2.2882, "grad_norm": 1.8580735921859741, "learning_rate": 5e-05, "epoch": 0.33361080873407184, "step": 5210 }, { "loss": 2.2652, "grad_norm": 1.8027414083480835, "learning_rate": 5e-05, "epoch": 0.3339309726580009, "step": 5215 }, { "loss": 2.2714, "grad_norm": 1.7679603099822998, "learning_rate": 5e-05, "epoch": 0.33425113658192995, "step": 5220 }, { "loss": 2.2793, "grad_norm": 1.730897068977356, "learning_rate": 5e-05, "epoch": 0.334571300505859, "step": 5225 }, { "loss": 2.275, "grad_norm": 1.7817909717559814, "learning_rate": 5e-05, "epoch": 0.33489146442978807, "step": 5230 }, { "loss": 2.27, "grad_norm": 1.763421893119812, "learning_rate": 5e-05, "epoch": 0.3352116283537171, "step": 5235 }, { "loss": 2.2856, "grad_norm": 1.7344940900802612, "learning_rate": 5e-05, "epoch": 0.3355317922776462, "step": 5240 }, { "loss": 2.2903, "grad_norm": 1.8662790060043335, "learning_rate": 5e-05, "epoch": 0.3358519562015752, "step": 5245 }, { "loss": 2.2768, "grad_norm": 1.7917147874832153, "learning_rate": 5e-05, "epoch": 0.33617212012550424, "step": 5250 }, { "loss": 2.2903, "grad_norm": 1.734060525894165, "learning_rate": 5e-05, "epoch": 0.3364922840494333, "step": 5255 }, { "loss": 2.268, "grad_norm": 1.7482142448425293, "learning_rate": 5e-05, "epoch": 0.33681244797336235, "step": 5260 }, { "loss": 2.2548, "grad_norm": 1.819955825805664, "learning_rate": 5e-05, "epoch": 0.33713261189729143, "step": 5265 }, { "loss": 2.2723, "grad_norm": 1.7646571397781372, "learning_rate": 5e-05, "epoch": 0.33745277582122046, "step": 5270 }, { "loss": 2.2785, "grad_norm": 1.7525885105133057, "learning_rate": 5e-05, "epoch": 0.3377729397451495, "step": 5275 }, { "loss": 2.2877, "grad_norm": 1.8543511629104614, "learning_rate": 5e-05, "epoch": 0.3380931036690786, "step": 5280 }, { "loss": 2.2604, "grad_norm": 1.848857045173645, "learning_rate": 5e-05, "epoch": 0.3384132675930076, "step": 5285 }, { "loss": 2.2763, "grad_norm": 1.8210421800613403, "learning_rate": 5e-05, "epoch": 0.3387334315169367, "step": 5290 }, { "loss": 2.2962, "grad_norm": 1.717044472694397, "learning_rate": 5e-05, "epoch": 0.3390535954408657, "step": 5295 }, { "loss": 2.2777, "grad_norm": 1.7110569477081299, "learning_rate": 5e-05, "epoch": 0.3393737593647948, "step": 5300 }, { "loss": 2.2395, "grad_norm": 1.7508400678634644, "learning_rate": 5e-05, "epoch": 0.33969392328872383, "step": 5305 }, { "loss": 2.2731, "grad_norm": 1.8925203084945679, "learning_rate": 5e-05, "epoch": 0.34001408721265286, "step": 5310 }, { "loss": 2.3024, "grad_norm": 1.7870714664459229, "learning_rate": 5e-05, "epoch": 0.34033425113658194, "step": 5315 }, { "loss": 2.2829, "grad_norm": 1.744795799255371, "learning_rate": 5e-05, "epoch": 0.340654415060511, "step": 5320 }, { "loss": 2.2815, "grad_norm": 1.7675684690475464, "learning_rate": 5e-05, "epoch": 0.34097457898444006, "step": 5325 }, { "loss": 2.2803, "grad_norm": 1.8785274028778076, "learning_rate": 5e-05, "epoch": 0.3412947429083691, "step": 5330 }, { "loss": 2.2804, "grad_norm": 1.818994402885437, "learning_rate": 5e-05, "epoch": 0.3416149068322981, "step": 5335 }, { "loss": 2.3124, "grad_norm": 1.9585684537887573, "learning_rate": 5e-05, "epoch": 0.3419350707562272, "step": 5340 }, { "loss": 2.2672, "grad_norm": 1.770952820777893, "learning_rate": 5e-05, "epoch": 0.34225523468015623, "step": 5345 }, { "loss": 2.2677, "grad_norm": 1.6577550172805786, "learning_rate": 5e-05, "epoch": 0.3425753986040853, "step": 5350 }, { "loss": 2.2676, "grad_norm": 1.9661815166473389, "learning_rate": 5e-05, "epoch": 0.34289556252801434, "step": 5355 }, { "loss": 2.2868, "grad_norm": 1.7929357290267944, "learning_rate": 5e-05, "epoch": 0.34321572645194337, "step": 5360 }, { "loss": 2.2842, "grad_norm": 1.8188828229904175, "learning_rate": 5e-05, "epoch": 0.34353589037587245, "step": 5365 }, { "loss": 2.2513, "grad_norm": 1.781710147857666, "learning_rate": 5e-05, "epoch": 0.3438560542998015, "step": 5370 }, { "loss": 2.2664, "grad_norm": 1.8323997259140015, "learning_rate": 5e-05, "epoch": 0.34417621822373057, "step": 5375 }, { "loss": 2.2698, "grad_norm": 1.8200249671936035, "learning_rate": 5e-05, "epoch": 0.3444963821476596, "step": 5380 }, { "loss": 2.2822, "grad_norm": 1.749212622642517, "learning_rate": 5e-05, "epoch": 0.3448165460715887, "step": 5385 }, { "loss": 2.2588, "grad_norm": 2.007263422012329, "learning_rate": 5e-05, "epoch": 0.3451367099955177, "step": 5390 }, { "loss": 2.2757, "grad_norm": 1.8585344552993774, "learning_rate": 5e-05, "epoch": 0.34545687391944674, "step": 5395 }, { "loss": 2.2768, "grad_norm": 1.923609733581543, "learning_rate": 5e-05, "epoch": 0.3457770378433758, "step": 5400 }, { "eval_loss": 2.1260647773742676, "eval_runtime": 9.2705, "eval_samples_per_second": 220.916, "eval_steps_per_second": 27.615, "epoch": 0.3457770378433758, "step": 5400 }, { "loss": 2.291, "grad_norm": 1.8628069162368774, "learning_rate": 5e-05, "epoch": 0.34609720176730485, "step": 5405 }, { "loss": 2.276, "grad_norm": 1.712012529373169, "learning_rate": 5e-05, "epoch": 0.34641736569123394, "step": 5410 }, { "loss": 2.2774, "grad_norm": 1.8341697454452515, "learning_rate": 5e-05, "epoch": 0.34673752961516296, "step": 5415 }, { "loss": 2.2809, "grad_norm": 1.7650182247161865, "learning_rate": 5e-05, "epoch": 0.347057693539092, "step": 5420 }, { "loss": 2.2893, "grad_norm": 1.7278627157211304, "learning_rate": 5e-05, "epoch": 0.3473778574630211, "step": 5425 }, { "loss": 2.2507, "grad_norm": 1.6427825689315796, "learning_rate": 5e-05, "epoch": 0.3476980213869501, "step": 5430 }, { "loss": 2.2821, "grad_norm": 1.7547065019607544, "learning_rate": 5e-05, "epoch": 0.3480181853108792, "step": 5435 }, { "loss": 2.2893, "grad_norm": 1.7182886600494385, "learning_rate": 5e-05, "epoch": 0.3483383492348082, "step": 5440 }, { "loss": 2.2876, "grad_norm": 1.84799063205719, "learning_rate": 5e-05, "epoch": 0.34865851315873725, "step": 5445 }, { "loss": 2.2712, "grad_norm": 1.6814558506011963, "learning_rate": 5e-05, "epoch": 0.34897867708266633, "step": 5450 }, { "loss": 2.29, "grad_norm": 1.881182074546814, "learning_rate": 5e-05, "epoch": 0.34929884100659536, "step": 5455 }, { "loss": 2.2206, "grad_norm": 1.8951886892318726, "learning_rate": 5e-05, "epoch": 0.34961900493052445, "step": 5460 }, { "loss": 2.2433, "grad_norm": 1.8026858568191528, "learning_rate": 5e-05, "epoch": 0.3499391688544535, "step": 5465 }, { "loss": 2.2556, "grad_norm": 1.7248677015304565, "learning_rate": 5e-05, "epoch": 0.35025933277838256, "step": 5470 }, { "loss": 2.2637, "grad_norm": 1.7217531204223633, "learning_rate": 5e-05, "epoch": 0.3505794967023116, "step": 5475 }, { "loss": 2.2854, "grad_norm": 1.7552876472473145, "learning_rate": 5e-05, "epoch": 0.3508996606262406, "step": 5480 }, { "loss": 2.262, "grad_norm": 1.7518340349197388, "learning_rate": 5e-05, "epoch": 0.3512198245501697, "step": 5485 }, { "loss": 2.2613, "grad_norm": 1.8185194730758667, "learning_rate": 5e-05, "epoch": 0.35153998847409873, "step": 5490 }, { "loss": 2.2867, "grad_norm": 1.7345361709594727, "learning_rate": 5e-05, "epoch": 0.3518601523980278, "step": 5495 }, { "loss": 2.2498, "grad_norm": 1.7367279529571533, "learning_rate": 5e-05, "epoch": 0.35218031632195684, "step": 5500 }, { "loss": 2.3032, "grad_norm": 1.7460354566574097, "learning_rate": 5e-05, "epoch": 0.35250048024588587, "step": 5505 }, { "loss": 2.279, "grad_norm": 1.6836531162261963, "learning_rate": 5e-05, "epoch": 0.35282064416981496, "step": 5510 }, { "loss": 2.2791, "grad_norm": 1.7619463205337524, "learning_rate": 5e-05, "epoch": 0.353140808093744, "step": 5515 }, { "loss": 2.2974, "grad_norm": 1.8177152872085571, "learning_rate": 5e-05, "epoch": 0.35346097201767307, "step": 5520 }, { "loss": 2.3094, "grad_norm": 1.7507604360580444, "learning_rate": 5e-05, "epoch": 0.3537811359416021, "step": 5525 }, { "loss": 2.2745, "grad_norm": 1.7359153032302856, "learning_rate": 5e-05, "epoch": 0.3541012998655311, "step": 5530 }, { "loss": 2.2713, "grad_norm": 1.7324638366699219, "learning_rate": 5e-05, "epoch": 0.3544214637894602, "step": 5535 }, { "loss": 2.3063, "grad_norm": 1.7245142459869385, "learning_rate": 5e-05, "epoch": 0.35474162771338924, "step": 5540 }, { "loss": 2.2258, "grad_norm": 1.699273943901062, "learning_rate": 5e-05, "epoch": 0.3550617916373183, "step": 5545 }, { "loss": 2.2459, "grad_norm": 1.653936505317688, "learning_rate": 5e-05, "epoch": 0.35538195556124735, "step": 5550 }, { "loss": 2.2984, "grad_norm": 1.7689787149429321, "learning_rate": 5e-05, "epoch": 0.35570211948517644, "step": 5555 }, { "loss": 2.2585, "grad_norm": 1.693535327911377, "learning_rate": 5e-05, "epoch": 0.35602228340910547, "step": 5560 }, { "loss": 2.2832, "grad_norm": 1.801584243774414, "learning_rate": 5e-05, "epoch": 0.3563424473330345, "step": 5565 }, { "loss": 2.3029, "grad_norm": 1.8008770942687988, "learning_rate": 5e-05, "epoch": 0.3566626112569636, "step": 5570 }, { "loss": 2.3078, "grad_norm": 1.7314320802688599, "learning_rate": 5e-05, "epoch": 0.3569827751808926, "step": 5575 }, { "loss": 2.2484, "grad_norm": 1.7883455753326416, "learning_rate": 5e-05, "epoch": 0.3573029391048217, "step": 5580 }, { "loss": 2.2453, "grad_norm": 1.7167059183120728, "learning_rate": 5e-05, "epoch": 0.3576231030287507, "step": 5585 }, { "loss": 2.2577, "grad_norm": 1.7459754943847656, "learning_rate": 5e-05, "epoch": 0.35794326695267975, "step": 5590 }, { "loss": 2.2555, "grad_norm": 1.783430576324463, "learning_rate": 5e-05, "epoch": 0.35826343087660883, "step": 5595 }, { "loss": 2.2625, "grad_norm": 1.8094017505645752, "learning_rate": 5e-05, "epoch": 0.35858359480053786, "step": 5600 }, { "eval_loss": 2.1376986503601074, "eval_runtime": 9.2886, "eval_samples_per_second": 220.484, "eval_steps_per_second": 27.561, "epoch": 0.35858359480053786, "step": 5600 }, { "loss": 2.2669, "grad_norm": 1.796750783920288, "learning_rate": 5e-05, "epoch": 0.35890375872446695, "step": 5605 }, { "loss": 2.2834, "grad_norm": 1.7852609157562256, "learning_rate": 5e-05, "epoch": 0.359223922648396, "step": 5610 }, { "loss": 2.2566, "grad_norm": 1.7483196258544922, "learning_rate": 5e-05, "epoch": 0.359544086572325, "step": 5615 }, { "loss": 2.2819, "grad_norm": 1.7469184398651123, "learning_rate": 5e-05, "epoch": 0.3598642504962541, "step": 5620 }, { "loss": 2.253, "grad_norm": 1.787428855895996, "learning_rate": 5e-05, "epoch": 0.3601844144201831, "step": 5625 }, { "loss": 2.2558, "grad_norm": 1.7420175075531006, "learning_rate": 5e-05, "epoch": 0.3605045783441122, "step": 5630 }, { "loss": 2.2465, "grad_norm": 1.676102638244629, "learning_rate": 5e-05, "epoch": 0.36082474226804123, "step": 5635 }, { "loss": 2.2594, "grad_norm": 1.754003882408142, "learning_rate": 5e-05, "epoch": 0.3611449061919703, "step": 5640 }, { "loss": 2.2558, "grad_norm": 1.7780991792678833, "learning_rate": 5e-05, "epoch": 0.36146507011589935, "step": 5645 }, { "loss": 2.2613, "grad_norm": 1.7494131326675415, "learning_rate": 5e-05, "epoch": 0.3617852340398284, "step": 5650 }, { "loss": 2.2511, "grad_norm": 1.8119771480560303, "learning_rate": 5e-05, "epoch": 0.36210539796375746, "step": 5655 }, { "loss": 2.2891, "grad_norm": 1.8024489879608154, "learning_rate": 5e-05, "epoch": 0.3624255618876865, "step": 5660 }, { "loss": 2.2737, "grad_norm": 1.7026606798171997, "learning_rate": 5e-05, "epoch": 0.36274572581161557, "step": 5665 }, { "loss": 2.3008, "grad_norm": 1.7064659595489502, "learning_rate": 5e-05, "epoch": 0.3630658897355446, "step": 5670 }, { "loss": 2.2796, "grad_norm": 1.7445411682128906, "learning_rate": 5e-05, "epoch": 0.36338605365947363, "step": 5675 }, { "loss": 2.2406, "grad_norm": 1.7404433488845825, "learning_rate": 5e-05, "epoch": 0.3637062175834027, "step": 5680 }, { "loss": 2.2742, "grad_norm": 1.6843476295471191, "learning_rate": 5e-05, "epoch": 0.36402638150733174, "step": 5685 }, { "loss": 2.2763, "grad_norm": 1.8461291790008545, "learning_rate": 5e-05, "epoch": 0.3643465454312608, "step": 5690 }, { "loss": 2.2594, "grad_norm": 1.7500439882278442, "learning_rate": 5e-05, "epoch": 0.36466670935518986, "step": 5695 }, { "loss": 2.2495, "grad_norm": 1.7546688318252563, "learning_rate": 5e-05, "epoch": 0.3649868732791189, "step": 5700 }, { "loss": 2.268, "grad_norm": 1.7128827571868896, "learning_rate": 5e-05, "epoch": 0.36530703720304797, "step": 5705 }, { "loss": 2.2716, "grad_norm": 1.8002029657363892, "learning_rate": 5e-05, "epoch": 0.365627201126977, "step": 5710 }, { "loss": 2.2689, "grad_norm": 1.7871887683868408, "learning_rate": 5e-05, "epoch": 0.3659473650509061, "step": 5715 }, { "loss": 2.2444, "grad_norm": 1.801291584968567, "learning_rate": 5e-05, "epoch": 0.3662675289748351, "step": 5720 }, { "loss": 2.2788, "grad_norm": 1.8185909986495972, "learning_rate": 5e-05, "epoch": 0.3665876928987642, "step": 5725 }, { "loss": 2.2746, "grad_norm": 1.7295774221420288, "learning_rate": 5e-05, "epoch": 0.3669078568226932, "step": 5730 }, { "loss": 2.2953, "grad_norm": 1.7250750064849854, "learning_rate": 5e-05, "epoch": 0.36722802074662225, "step": 5735 }, { "loss": 2.2689, "grad_norm": 1.7358938455581665, "learning_rate": 5e-05, "epoch": 0.36754818467055134, "step": 5740 }, { "loss": 2.2594, "grad_norm": 1.7297829389572144, "learning_rate": 5e-05, "epoch": 0.36786834859448037, "step": 5745 }, { "loss": 2.2729, "grad_norm": 1.788424015045166, "learning_rate": 5e-05, "epoch": 0.36818851251840945, "step": 5750 }, { "loss": 2.2442, "grad_norm": 1.873340368270874, "learning_rate": 5e-05, "epoch": 0.3685086764423385, "step": 5755 }, { "loss": 2.249, "grad_norm": 1.7489144802093506, "learning_rate": 5e-05, "epoch": 0.3688288403662675, "step": 5760 }, { "loss": 2.2845, "grad_norm": 1.7094459533691406, "learning_rate": 5e-05, "epoch": 0.3691490042901966, "step": 5765 }, { "loss": 2.2626, "grad_norm": 1.7959952354431152, "learning_rate": 5e-05, "epoch": 0.3694691682141256, "step": 5770 }, { "loss": 2.261, "grad_norm": 1.7271146774291992, "learning_rate": 5e-05, "epoch": 0.3697893321380547, "step": 5775 }, { "loss": 2.2694, "grad_norm": 1.7205613851547241, "learning_rate": 5e-05, "epoch": 0.37010949606198373, "step": 5780 }, { "loss": 2.2703, "grad_norm": 1.6520004272460938, "learning_rate": 5e-05, "epoch": 0.37042965998591276, "step": 5785 }, { "loss": 2.2531, "grad_norm": 1.6878688335418701, "learning_rate": 5e-05, "epoch": 0.37074982390984185, "step": 5790 }, { "loss": 2.2859, "grad_norm": 1.7531139850616455, "learning_rate": 5e-05, "epoch": 0.3710699878337709, "step": 5795 }, { "loss": 2.2609, "grad_norm": 1.7352375984191895, "learning_rate": 5e-05, "epoch": 0.37139015175769996, "step": 5800 }, { "eval_loss": 2.1418533325195312, "eval_runtime": 12.9494, "eval_samples_per_second": 158.154, "eval_steps_per_second": 19.769, "epoch": 0.37139015175769996, "step": 5800 }, { "loss": 2.278, "grad_norm": 1.7465990781784058, "learning_rate": 5e-05, "epoch": 0.371710315681629, "step": 5805 }, { "loss": 2.2576, "grad_norm": 1.6427454948425293, "learning_rate": 5e-05, "epoch": 0.3720304796055581, "step": 5810 }, { "loss": 2.2879, "grad_norm": 1.6827012300491333, "learning_rate": 5e-05, "epoch": 0.3723506435294871, "step": 5815 }, { "loss": 2.2817, "grad_norm": 1.6802785396575928, "learning_rate": 5e-05, "epoch": 0.37267080745341613, "step": 5820 }, { "loss": 2.2624, "grad_norm": 1.764146089553833, "learning_rate": 5e-05, "epoch": 0.3729909713773452, "step": 5825 }, { "loss": 2.2748, "grad_norm": 1.7563925981521606, "learning_rate": 5e-05, "epoch": 0.37331113530127424, "step": 5830 }, { "loss": 2.2679, "grad_norm": 1.7826206684112549, "learning_rate": 5e-05, "epoch": 0.37363129922520333, "step": 5835 }, { "loss": 2.2631, "grad_norm": 1.7394565343856812, "learning_rate": 5e-05, "epoch": 0.37395146314913236, "step": 5840 }, { "loss": 2.2668, "grad_norm": 1.702976942062378, "learning_rate": 5e-05, "epoch": 0.3742716270730614, "step": 5845 }, { "loss": 2.2615, "grad_norm": 1.8101780414581299, "learning_rate": 5e-05, "epoch": 0.37459179099699047, "step": 5850 }, { "loss": 2.2914, "grad_norm": 1.7696033716201782, "learning_rate": 5e-05, "epoch": 0.3749119549209195, "step": 5855 }, { "loss": 2.2536, "grad_norm": 1.7243146896362305, "learning_rate": 5e-05, "epoch": 0.3752321188448486, "step": 5860 }, { "loss": 2.2839, "grad_norm": 1.707695722579956, "learning_rate": 5e-05, "epoch": 0.3755522827687776, "step": 5865 }, { "loss": 2.2604, "grad_norm": 1.650211215019226, "learning_rate": 5e-05, "epoch": 0.37587244669270664, "step": 5870 }, { "loss": 2.258, "grad_norm": 1.6194339990615845, "learning_rate": 5e-05, "epoch": 0.3761926106166357, "step": 5875 }, { "loss": 2.2542, "grad_norm": 1.7091882228851318, "learning_rate": 5e-05, "epoch": 0.37651277454056475, "step": 5880 }, { "loss": 2.2661, "grad_norm": 1.733975887298584, "learning_rate": 5e-05, "epoch": 0.37683293846449384, "step": 5885 }, { "loss": 2.2536, "grad_norm": 1.7769482135772705, "learning_rate": 5e-05, "epoch": 0.37715310238842287, "step": 5890 }, { "loss": 2.2515, "grad_norm": 1.7663599252700806, "learning_rate": 5e-05, "epoch": 0.37747326631235195, "step": 5895 }, { "loss": 2.2979, "grad_norm": 1.6576106548309326, "learning_rate": 5e-05, "epoch": 0.377793430236281, "step": 5900 }, { "loss": 2.2524, "grad_norm": 1.838011384010315, "learning_rate": 5e-05, "epoch": 0.37811359416021, "step": 5905 }, { "loss": 2.2539, "grad_norm": 1.7713699340820312, "learning_rate": 5e-05, "epoch": 0.3784337580841391, "step": 5910 }, { "loss": 2.2644, "grad_norm": 1.765184760093689, "learning_rate": 5e-05, "epoch": 0.3787539220080681, "step": 5915 }, { "loss": 2.2128, "grad_norm": 1.70463228225708, "learning_rate": 5e-05, "epoch": 0.3790740859319972, "step": 5920 }, { "loss": 2.2651, "grad_norm": 1.689228892326355, "learning_rate": 5e-05, "epoch": 0.37939424985592624, "step": 5925 }, { "loss": 2.2368, "grad_norm": 1.7535511255264282, "learning_rate": 5e-05, "epoch": 0.37971441377985526, "step": 5930 }, { "loss": 2.2507, "grad_norm": 1.768235445022583, "learning_rate": 5e-05, "epoch": 0.38003457770378435, "step": 5935 }, { "loss": 2.2313, "grad_norm": 1.7316277027130127, "learning_rate": 5e-05, "epoch": 0.3803547416277134, "step": 5940 }, { "loss": 2.2901, "grad_norm": 1.7372463941574097, "learning_rate": 5e-05, "epoch": 0.38067490555164246, "step": 5945 }, { "loss": 2.2767, "grad_norm": 1.8195472955703735, "learning_rate": 5e-05, "epoch": 0.3809950694755715, "step": 5950 }, { "loss": 2.2819, "grad_norm": 1.7599300146102905, "learning_rate": 5e-05, "epoch": 0.3813152333995005, "step": 5955 }, { "loss": 2.2495, "grad_norm": 1.763772964477539, "learning_rate": 5e-05, "epoch": 0.3816353973234296, "step": 5960 }, { "loss": 2.2689, "grad_norm": 1.6997100114822388, "learning_rate": 5e-05, "epoch": 0.38195556124735863, "step": 5965 }, { "loss": 2.2784, "grad_norm": 1.6891993284225464, "learning_rate": 5e-05, "epoch": 0.3822757251712877, "step": 5970 }, { "loss": 2.289, "grad_norm": 1.7187187671661377, "learning_rate": 5e-05, "epoch": 0.38259588909521675, "step": 5975 }, { "loss": 2.2425, "grad_norm": 1.6827722787857056, "learning_rate": 5e-05, "epoch": 0.38291605301914583, "step": 5980 }, { "loss": 2.2499, "grad_norm": 1.6614289283752441, "learning_rate": 5e-05, "epoch": 0.38323621694307486, "step": 5985 }, { "loss": 2.2559, "grad_norm": 1.7182048559188843, "learning_rate": 5e-05, "epoch": 0.3835563808670039, "step": 5990 }, { "loss": 2.2781, "grad_norm": 1.7572559118270874, "learning_rate": 5e-05, "epoch": 0.38387654479093297, "step": 5995 }, { "loss": 2.26, "grad_norm": 1.8257755041122437, "learning_rate": 5e-05, "epoch": 0.384196708714862, "step": 6000 }, { "eval_loss": 2.11156964302063, "eval_runtime": 9.0951, "eval_samples_per_second": 225.177, "eval_steps_per_second": 28.147, "epoch": 0.384196708714862, "step": 6000 }, { "loss": 2.235, "grad_norm": 1.7780883312225342, "learning_rate": 5e-05, "epoch": 0.3845168726387911, "step": 6005 }, { "loss": 2.2434, "grad_norm": 1.7068132162094116, "learning_rate": 5e-05, "epoch": 0.3848370365627201, "step": 6010 }, { "loss": 2.2748, "grad_norm": 1.6712257862091064, "learning_rate": 5e-05, "epoch": 0.38515720048664914, "step": 6015 }, { "loss": 2.2457, "grad_norm": 1.7343010902404785, "learning_rate": 5e-05, "epoch": 0.3854773644105782, "step": 6020 }, { "loss": 2.2694, "grad_norm": 1.8106725215911865, "learning_rate": 5e-05, "epoch": 0.38579752833450726, "step": 6025 }, { "loss": 2.2828, "grad_norm": 1.7711716890335083, "learning_rate": 5e-05, "epoch": 0.38611769225843634, "step": 6030 }, { "loss": 2.259, "grad_norm": 1.7112571001052856, "learning_rate": 5e-05, "epoch": 0.38643785618236537, "step": 6035 }, { "loss": 2.2748, "grad_norm": 1.7668615579605103, "learning_rate": 5e-05, "epoch": 0.3867580201062944, "step": 6040 }, { "loss": 2.2853, "grad_norm": 1.67672598361969, "learning_rate": 5e-05, "epoch": 0.3870781840302235, "step": 6045 }, { "loss": 2.2871, "grad_norm": 1.7809470891952515, "learning_rate": 5e-05, "epoch": 0.3873983479541525, "step": 6050 }, { "loss": 2.2709, "grad_norm": 1.785502552986145, "learning_rate": 5e-05, "epoch": 0.3877185118780816, "step": 6055 }, { "loss": 2.2826, "grad_norm": 1.725252389907837, "learning_rate": 5e-05, "epoch": 0.3880386758020106, "step": 6060 }, { "loss": 2.2652, "grad_norm": 1.7655749320983887, "learning_rate": 5e-05, "epoch": 0.3883588397259397, "step": 6065 }, { "loss": 2.2745, "grad_norm": 1.845263123512268, "learning_rate": 5e-05, "epoch": 0.38867900364986874, "step": 6070 }, { "loss": 2.2355, "grad_norm": 1.7035220861434937, "learning_rate": 5e-05, "epoch": 0.38899916757379777, "step": 6075 }, { "loss": 2.2324, "grad_norm": 1.7222847938537598, "learning_rate": 5e-05, "epoch": 0.38931933149772685, "step": 6080 }, { "loss": 2.2685, "grad_norm": 1.8165398836135864, "learning_rate": 5e-05, "epoch": 0.3896394954216559, "step": 6085 }, { "loss": 2.2682, "grad_norm": 1.6270705461502075, "learning_rate": 5e-05, "epoch": 0.38995965934558496, "step": 6090 }, { "loss": 2.2508, "grad_norm": 1.7037124633789062, "learning_rate": 5e-05, "epoch": 0.390279823269514, "step": 6095 }, { "loss": 2.2543, "grad_norm": 1.648330569267273, "learning_rate": 5e-05, "epoch": 0.390599987193443, "step": 6100 }, { "loss": 2.2578, "grad_norm": 1.7594226598739624, "learning_rate": 5e-05, "epoch": 0.3909201511173721, "step": 6105 }, { "loss": 2.223, "grad_norm": 1.8519033193588257, "learning_rate": 5e-05, "epoch": 0.39124031504130113, "step": 6110 }, { "loss": 2.239, "grad_norm": 1.7203348875045776, "learning_rate": 5e-05, "epoch": 0.3915604789652302, "step": 6115 }, { "loss": 2.2772, "grad_norm": 1.6320827007293701, "learning_rate": 5e-05, "epoch": 0.39188064288915925, "step": 6120 }, { "loss": 2.2555, "grad_norm": 1.7894231081008911, "learning_rate": 5e-05, "epoch": 0.3922008068130883, "step": 6125 }, { "loss": 2.2534, "grad_norm": 1.8454432487487793, "learning_rate": 5e-05, "epoch": 0.39252097073701736, "step": 6130 }, { "loss": 2.2645, "grad_norm": 1.7246161699295044, "learning_rate": 5e-05, "epoch": 0.3928411346609464, "step": 6135 }, { "loss": 2.2586, "grad_norm": 1.765830159187317, "learning_rate": 5e-05, "epoch": 0.3931612985848755, "step": 6140 }, { "loss": 2.2669, "grad_norm": 1.6727474927902222, "learning_rate": 5e-05, "epoch": 0.3934814625088045, "step": 6145 }, { "loss": 2.2517, "grad_norm": 1.69596529006958, "learning_rate": 5e-05, "epoch": 0.3938016264327336, "step": 6150 }, { "loss": 2.2718, "grad_norm": 1.6889044046401978, "learning_rate": 5e-05, "epoch": 0.3941217903566626, "step": 6155 }, { "loss": 2.2556, "grad_norm": 1.627752661705017, "learning_rate": 5e-05, "epoch": 0.39444195428059164, "step": 6160 }, { "loss": 2.2455, "grad_norm": 1.7724320888519287, "learning_rate": 5e-05, "epoch": 0.39476211820452073, "step": 6165 }, { "loss": 2.2798, "grad_norm": 1.7310431003570557, "learning_rate": 5e-05, "epoch": 0.39508228212844976, "step": 6170 }, { "loss": 2.24, "grad_norm": 1.7662495374679565, "learning_rate": 5e-05, "epoch": 0.39540244605237884, "step": 6175 }, { "loss": 2.2278, "grad_norm": 1.6548069715499878, "learning_rate": 5e-05, "epoch": 0.39572260997630787, "step": 6180 }, { "loss": 2.2691, "grad_norm": 1.7314773797988892, "learning_rate": 5e-05, "epoch": 0.3960427739002369, "step": 6185 }, { "loss": 2.2485, "grad_norm": 1.737084984779358, "learning_rate": 5e-05, "epoch": 0.396362937824166, "step": 6190 }, { "loss": 2.2649, "grad_norm": 1.7657032012939453, "learning_rate": 5e-05, "epoch": 0.396683101748095, "step": 6195 }, { "loss": 2.2166, "grad_norm": 1.7243016958236694, "learning_rate": 5e-05, "epoch": 0.3970032656720241, "step": 6200 }, { "eval_loss": 2.1302952766418457, "eval_runtime": 13.3134, "eval_samples_per_second": 153.83, "eval_steps_per_second": 19.229, "epoch": 0.3970032656720241, "step": 6200 }, { "loss": 2.2638, "grad_norm": 1.847778081893921, "learning_rate": 5e-05, "epoch": 0.3973234295959531, "step": 6205 }, { "loss": 2.2624, "grad_norm": 1.7355600595474243, "learning_rate": 5e-05, "epoch": 0.39764359351988215, "step": 6210 }, { "loss": 2.2611, "grad_norm": 1.7112786769866943, "learning_rate": 5e-05, "epoch": 0.39796375744381124, "step": 6215 }, { "loss": 2.2695, "grad_norm": 1.7076542377471924, "learning_rate": 5e-05, "epoch": 0.39828392136774027, "step": 6220 }, { "loss": 2.2448, "grad_norm": 1.7070058584213257, "learning_rate": 5e-05, "epoch": 0.39860408529166935, "step": 6225 }, { "loss": 2.2445, "grad_norm": 1.7254059314727783, "learning_rate": 5e-05, "epoch": 0.3989242492155984, "step": 6230 }, { "loss": 2.2435, "grad_norm": 1.7010166645050049, "learning_rate": 5e-05, "epoch": 0.39924441313952747, "step": 6235 }, { "loss": 2.2417, "grad_norm": 1.7349189519882202, "learning_rate": 5e-05, "epoch": 0.3995645770634565, "step": 6240 }, { "loss": 2.2544, "grad_norm": 1.812296748161316, "learning_rate": 5e-05, "epoch": 0.3998847409873855, "step": 6245 }, { "loss": 2.2425, "grad_norm": 1.7517497539520264, "learning_rate": 5e-05, "epoch": 0.4002049049113146, "step": 6250 }, { "loss": 2.2405, "grad_norm": 1.7381399869918823, "learning_rate": 5e-05, "epoch": 0.40052506883524364, "step": 6255 }, { "loss": 2.2862, "grad_norm": 1.7130184173583984, "learning_rate": 5e-05, "epoch": 0.4008452327591727, "step": 6260 }, { "loss": 2.2525, "grad_norm": 1.766489028930664, "learning_rate": 5e-05, "epoch": 0.40116539668310175, "step": 6265 }, { "loss": 2.2372, "grad_norm": 1.6739976406097412, "learning_rate": 5e-05, "epoch": 0.4014855606070308, "step": 6270 }, { "loss": 2.2145, "grad_norm": 1.6726338863372803, "learning_rate": 5e-05, "epoch": 0.40180572453095986, "step": 6275 }, { "loss": 2.2622, "grad_norm": 1.8587597608566284, "learning_rate": 5e-05, "epoch": 0.4021258884548889, "step": 6280 }, { "loss": 2.252, "grad_norm": 1.7975720167160034, "learning_rate": 5e-05, "epoch": 0.402446052378818, "step": 6285 }, { "loss": 2.2645, "grad_norm": 1.6847093105316162, "learning_rate": 5e-05, "epoch": 0.402766216302747, "step": 6290 }, { "loss": 2.213, "grad_norm": 1.6540334224700928, "learning_rate": 5e-05, "epoch": 0.40308638022667603, "step": 6295 }, { "loss": 2.2244, "grad_norm": 1.7274028062820435, "learning_rate": 5e-05, "epoch": 0.4034065441506051, "step": 6300 }, { "loss": 2.2298, "grad_norm": 1.738688588142395, "learning_rate": 5e-05, "epoch": 0.40372670807453415, "step": 6305 }, { "loss": 2.2655, "grad_norm": 1.7383339405059814, "learning_rate": 5e-05, "epoch": 0.40404687199846323, "step": 6310 }, { "loss": 2.2595, "grad_norm": 1.6844770908355713, "learning_rate": 5e-05, "epoch": 0.40436703592239226, "step": 6315 }, { "loss": 2.2474, "grad_norm": 1.6770930290222168, "learning_rate": 5e-05, "epoch": 0.40468719984632134, "step": 6320 }, { "loss": 2.2467, "grad_norm": 1.775538682937622, "learning_rate": 5e-05, "epoch": 0.4050073637702504, "step": 6325 }, { "loss": 2.2238, "grad_norm": 1.7302964925765991, "learning_rate": 5e-05, "epoch": 0.4053275276941794, "step": 6330 }, { "loss": 2.2545, "grad_norm": 1.7631667852401733, "learning_rate": 5e-05, "epoch": 0.4056476916181085, "step": 6335 }, { "loss": 2.2462, "grad_norm": 1.704640507698059, "learning_rate": 5e-05, "epoch": 0.4059678555420375, "step": 6340 }, { "loss": 2.2356, "grad_norm": 1.849134087562561, "learning_rate": 5e-05, "epoch": 0.4062880194659666, "step": 6345 }, { "loss": 2.2713, "grad_norm": 1.764115810394287, "learning_rate": 5e-05, "epoch": 0.4066081833898956, "step": 6350 }, { "loss": 2.268, "grad_norm": 1.653106451034546, "learning_rate": 5e-05, "epoch": 0.40692834731382466, "step": 6355 }, { "loss": 2.2242, "grad_norm": 1.816038727760315, "learning_rate": 5e-05, "epoch": 0.40724851123775374, "step": 6360 }, { "loss": 2.2568, "grad_norm": 1.6303045749664307, "learning_rate": 5e-05, "epoch": 0.40756867516168277, "step": 6365 }, { "loss": 2.2756, "grad_norm": 1.6868770122528076, "learning_rate": 5e-05, "epoch": 0.40788883908561185, "step": 6370 }, { "loss": 2.2556, "grad_norm": 1.711913824081421, "learning_rate": 5e-05, "epoch": 0.4082090030095409, "step": 6375 }, { "loss": 2.2355, "grad_norm": 1.6835819482803345, "learning_rate": 5e-05, "epoch": 0.4085291669334699, "step": 6380 }, { "loss": 2.2559, "grad_norm": 1.7608588933944702, "learning_rate": 5e-05, "epoch": 0.408849330857399, "step": 6385 }, { "loss": 2.252, "grad_norm": 1.6766396760940552, "learning_rate": 5e-05, "epoch": 0.409169494781328, "step": 6390 }, { "loss": 2.2475, "grad_norm": 1.6831003427505493, "learning_rate": 5e-05, "epoch": 0.4094896587052571, "step": 6395 }, { "loss": 2.2639, "grad_norm": 1.7193617820739746, "learning_rate": 5e-05, "epoch": 0.40980982262918614, "step": 6400 }, { "eval_loss": 2.11531400680542, "eval_runtime": 9.5033, "eval_samples_per_second": 215.505, "eval_steps_per_second": 26.938, "epoch": 0.40980982262918614, "step": 6400 }, { "loss": 2.2536, "grad_norm": 1.6074965000152588, "learning_rate": 5e-05, "epoch": 0.4101299865531152, "step": 6405 }, { "loss": 2.2417, "grad_norm": 1.6199990510940552, "learning_rate": 5e-05, "epoch": 0.41045015047704425, "step": 6410 }, { "loss": 2.2629, "grad_norm": 1.6224853992462158, "learning_rate": 5e-05, "epoch": 0.4107703144009733, "step": 6415 }, { "loss": 2.2491, "grad_norm": 1.779128909111023, "learning_rate": 5e-05, "epoch": 0.41109047832490236, "step": 6420 }, { "loss": 2.2301, "grad_norm": 1.7013006210327148, "learning_rate": 5e-05, "epoch": 0.4114106422488314, "step": 6425 }, { "loss": 2.2517, "grad_norm": 1.745300531387329, "learning_rate": 5e-05, "epoch": 0.4117308061727605, "step": 6430 }, { "loss": 2.2743, "grad_norm": 1.670337438583374, "learning_rate": 5e-05, "epoch": 0.4120509700966895, "step": 6435 }, { "loss": 2.2511, "grad_norm": 1.7760534286499023, "learning_rate": 5e-05, "epoch": 0.41237113402061853, "step": 6440 }, { "loss": 2.2577, "grad_norm": 1.7097136974334717, "learning_rate": 5e-05, "epoch": 0.4126912979445476, "step": 6445 }, { "loss": 2.2357, "grad_norm": 1.738032341003418, "learning_rate": 5e-05, "epoch": 0.41301146186847665, "step": 6450 }, { "loss": 2.2212, "grad_norm": 1.6849381923675537, "learning_rate": 5e-05, "epoch": 0.41333162579240573, "step": 6455 }, { "loss": 2.233, "grad_norm": 1.8453466892242432, "learning_rate": 5e-05, "epoch": 0.41365178971633476, "step": 6460 }, { "loss": 2.2687, "grad_norm": 1.7124505043029785, "learning_rate": 5e-05, "epoch": 0.4139719536402638, "step": 6465 }, { "loss": 2.2342, "grad_norm": 1.730618953704834, "learning_rate": 5e-05, "epoch": 0.4142921175641929, "step": 6470 }, { "loss": 2.2618, "grad_norm": 1.7143526077270508, "learning_rate": 5e-05, "epoch": 0.4146122814881219, "step": 6475 }, { "loss": 2.267, "grad_norm": 1.7568591833114624, "learning_rate": 5e-05, "epoch": 0.414932445412051, "step": 6480 }, { "loss": 2.2626, "grad_norm": 1.663020372390747, "learning_rate": 5e-05, "epoch": 0.41525260933598, "step": 6485 }, { "loss": 2.2408, "grad_norm": 1.6989688873291016, "learning_rate": 5e-05, "epoch": 0.4155727732599091, "step": 6490 }, { "loss": 2.2769, "grad_norm": 1.648116946220398, "learning_rate": 5e-05, "epoch": 0.41589293718383813, "step": 6495 }, { "loss": 2.2234, "grad_norm": 1.7310383319854736, "learning_rate": 5e-05, "epoch": 0.41621310110776716, "step": 6500 }, { "loss": 2.2395, "grad_norm": 1.7397419214248657, "learning_rate": 5e-05, "epoch": 0.41653326503169624, "step": 6505 }, { "loss": 2.237, "grad_norm": 1.7233692407608032, "learning_rate": 5e-05, "epoch": 0.41685342895562527, "step": 6510 }, { "loss": 2.2458, "grad_norm": 1.7460954189300537, "learning_rate": 5e-05, "epoch": 0.41717359287955436, "step": 6515 }, { "loss": 2.2398, "grad_norm": 1.770958423614502, "learning_rate": 5e-05, "epoch": 0.4174937568034834, "step": 6520 }, { "loss": 2.2217, "grad_norm": 1.7674636840820312, "learning_rate": 5e-05, "epoch": 0.4178139207274124, "step": 6525 }, { "loss": 2.2629, "grad_norm": 1.7418832778930664, "learning_rate": 5e-05, "epoch": 0.4181340846513415, "step": 6530 }, { "loss": 2.2547, "grad_norm": 1.6848324537277222, "learning_rate": 5e-05, "epoch": 0.4184542485752705, "step": 6535 }, { "loss": 2.2029, "grad_norm": 1.698730707168579, "learning_rate": 5e-05, "epoch": 0.4187744124991996, "step": 6540 }, { "loss": 2.2331, "grad_norm": 1.7850102186203003, "learning_rate": 5e-05, "epoch": 0.41909457642312864, "step": 6545 }, { "loss": 2.2741, "grad_norm": 1.8512533903121948, "learning_rate": 5e-05, "epoch": 0.41941474034705767, "step": 6550 }, { "loss": 2.2612, "grad_norm": 1.7491039037704468, "learning_rate": 5e-05, "epoch": 0.41973490427098675, "step": 6555 }, { "loss": 2.2617, "grad_norm": 1.7620813846588135, "learning_rate": 5e-05, "epoch": 0.4200550681949158, "step": 6560 }, { "loss": 2.2338, "grad_norm": 1.7340549230575562, "learning_rate": 5e-05, "epoch": 0.42037523211884487, "step": 6565 }, { "loss": 2.2702, "grad_norm": 1.6962077617645264, "learning_rate": 5e-05, "epoch": 0.4206953960427739, "step": 6570 }, { "loss": 2.2338, "grad_norm": 1.6991527080535889, "learning_rate": 5e-05, "epoch": 0.421015559966703, "step": 6575 }, { "loss": 2.2522, "grad_norm": 1.74476158618927, "learning_rate": 5e-05, "epoch": 0.421335723890632, "step": 6580 }, { "loss": 2.2406, "grad_norm": 1.763519287109375, "learning_rate": 5e-05, "epoch": 0.42165588781456104, "step": 6585 }, { "loss": 2.2464, "grad_norm": 1.675957202911377, "learning_rate": 5e-05, "epoch": 0.4219760517384901, "step": 6590 }, { "loss": 2.2329, "grad_norm": 1.7178364992141724, "learning_rate": 5e-05, "epoch": 0.42229621566241915, "step": 6595 }, { "loss": 2.25, "grad_norm": 1.843867301940918, "learning_rate": 5e-05, "epoch": 0.42261637958634823, "step": 6600 }, { "eval_loss": 2.110640048980713, "eval_runtime": 9.6155, "eval_samples_per_second": 212.989, "eval_steps_per_second": 26.624, "epoch": 0.42261637958634823, "step": 6600 }, { "loss": 2.2297, "grad_norm": 1.850877046585083, "learning_rate": 5e-05, "epoch": 0.42293654351027726, "step": 6605 }, { "loss": 2.2479, "grad_norm": 1.7398591041564941, "learning_rate": 5e-05, "epoch": 0.4232567074342063, "step": 6610 }, { "loss": 2.2746, "grad_norm": 1.7509093284606934, "learning_rate": 5e-05, "epoch": 0.4235768713581354, "step": 6615 }, { "loss": 2.2588, "grad_norm": 1.7495396137237549, "learning_rate": 5e-05, "epoch": 0.4238970352820644, "step": 6620 }, { "loss": 2.2274, "grad_norm": 1.6394826173782349, "learning_rate": 5e-05, "epoch": 0.4242171992059935, "step": 6625 }, { "loss": 2.2229, "grad_norm": 1.7712039947509766, "learning_rate": 5e-05, "epoch": 0.4245373631299225, "step": 6630 }, { "loss": 2.2761, "grad_norm": 1.6803395748138428, "learning_rate": 5e-05, "epoch": 0.42485752705385155, "step": 6635 }, { "loss": 2.2622, "grad_norm": 1.7649556398391724, "learning_rate": 5e-05, "epoch": 0.42517769097778063, "step": 6640 }, { "loss": 2.246, "grad_norm": 1.8420475721359253, "learning_rate": 5e-05, "epoch": 0.42549785490170966, "step": 6645 }, { "loss": 2.2621, "grad_norm": 1.8346716165542603, "learning_rate": 5e-05, "epoch": 0.42581801882563874, "step": 6650 }, { "loss": 2.2548, "grad_norm": 1.6930170059204102, "learning_rate": 5e-05, "epoch": 0.4261381827495678, "step": 6655 }, { "loss": 2.2308, "grad_norm": 1.7325392961502075, "learning_rate": 5e-05, "epoch": 0.42645834667349686, "step": 6660 }, { "loss": 2.2261, "grad_norm": 1.6914280652999878, "learning_rate": 5e-05, "epoch": 0.4267785105974259, "step": 6665 }, { "loss": 2.2445, "grad_norm": 1.7235634326934814, "learning_rate": 5e-05, "epoch": 0.4270986745213549, "step": 6670 }, { "loss": 2.2277, "grad_norm": 1.6718071699142456, "learning_rate": 5e-05, "epoch": 0.427418838445284, "step": 6675 }, { "loss": 2.245, "grad_norm": 1.6824864149093628, "learning_rate": 5e-05, "epoch": 0.42773900236921303, "step": 6680 }, { "loss": 2.2075, "grad_norm": 1.6548774242401123, "learning_rate": 5e-05, "epoch": 0.4280591662931421, "step": 6685 }, { "loss": 2.2419, "grad_norm": 1.6627106666564941, "learning_rate": 5e-05, "epoch": 0.42837933021707114, "step": 6690 }, { "loss": 2.2613, "grad_norm": 1.6999781131744385, "learning_rate": 5e-05, "epoch": 0.42869949414100017, "step": 6695 }, { "loss": 2.2318, "grad_norm": 1.8439428806304932, "learning_rate": 5e-05, "epoch": 0.42901965806492925, "step": 6700 }, { "loss": 2.2611, "grad_norm": 1.7762128114700317, "learning_rate": 5e-05, "epoch": 0.4293398219888583, "step": 6705 }, { "loss": 2.2322, "grad_norm": 1.7934534549713135, "learning_rate": 5e-05, "epoch": 0.42965998591278737, "step": 6710 }, { "loss": 2.2503, "grad_norm": 1.6476908922195435, "learning_rate": 5e-05, "epoch": 0.4299801498367164, "step": 6715 }, { "loss": 2.2723, "grad_norm": 1.757597804069519, "learning_rate": 5e-05, "epoch": 0.4303003137606454, "step": 6720 }, { "loss": 2.2294, "grad_norm": 1.6976431608200073, "learning_rate": 5e-05, "epoch": 0.4306204776845745, "step": 6725 }, { "loss": 2.2551, "grad_norm": 1.81328284740448, "learning_rate": 5e-05, "epoch": 0.43094064160850354, "step": 6730 }, { "loss": 2.2393, "grad_norm": 1.7719358205795288, "learning_rate": 5e-05, "epoch": 0.4312608055324326, "step": 6735 }, { "loss": 2.2426, "grad_norm": 1.8044530153274536, "learning_rate": 5e-05, "epoch": 0.43158096945636165, "step": 6740 }, { "loss": 2.2435, "grad_norm": 1.760985255241394, "learning_rate": 5e-05, "epoch": 0.43190113338029074, "step": 6745 }, { "loss": 2.2433, "grad_norm": 1.7239030599594116, "learning_rate": 5e-05, "epoch": 0.43222129730421976, "step": 6750 }, { "loss": 2.2013, "grad_norm": 1.7211287021636963, "learning_rate": 5e-05, "epoch": 0.4325414612281488, "step": 6755 }, { "loss": 2.2658, "grad_norm": 1.7456564903259277, "learning_rate": 5e-05, "epoch": 0.4328616251520779, "step": 6760 }, { "loss": 2.2225, "grad_norm": 1.7644805908203125, "learning_rate": 5e-05, "epoch": 0.4331817890760069, "step": 6765 }, { "loss": 2.2455, "grad_norm": 1.6574170589447021, "learning_rate": 5e-05, "epoch": 0.433501952999936, "step": 6770 }, { "loss": 2.2565, "grad_norm": 1.673085331916809, "learning_rate": 5e-05, "epoch": 0.433822116923865, "step": 6775 }, { "loss": 2.2561, "grad_norm": 1.7815220355987549, "learning_rate": 5e-05, "epoch": 0.43414228084779405, "step": 6780 }, { "loss": 2.2375, "grad_norm": 1.7830764055252075, "learning_rate": 5e-05, "epoch": 0.43446244477172313, "step": 6785 }, { "loss": 2.2547, "grad_norm": 1.7634798288345337, "learning_rate": 5e-05, "epoch": 0.43478260869565216, "step": 6790 }, { "loss": 2.2365, "grad_norm": 1.771299123764038, "learning_rate": 5e-05, "epoch": 0.43510277261958125, "step": 6795 }, { "loss": 2.2699, "grad_norm": 1.708333134651184, "learning_rate": 5e-05, "epoch": 0.4354229365435103, "step": 6800 }, { "eval_loss": 2.0987234115600586, "eval_runtime": 9.3179, "eval_samples_per_second": 219.792, "eval_steps_per_second": 27.474, "epoch": 0.4354229365435103, "step": 6800 }, { "loss": 2.2188, "grad_norm": 1.8107043504714966, "learning_rate": 5e-05, "epoch": 0.4357431004674393, "step": 6805 }, { "loss": 2.2213, "grad_norm": 1.707737922668457, "learning_rate": 5e-05, "epoch": 0.4360632643913684, "step": 6810 }, { "loss": 2.261, "grad_norm": 1.8159151077270508, "learning_rate": 5e-05, "epoch": 0.4363834283152974, "step": 6815 }, { "loss": 2.2857, "grad_norm": 1.6932034492492676, "learning_rate": 5e-05, "epoch": 0.4367035922392265, "step": 6820 }, { "loss": 2.222, "grad_norm": 1.8024814128875732, "learning_rate": 5e-05, "epoch": 0.43702375616315553, "step": 6825 }, { "loss": 2.2387, "grad_norm": 1.7472243309020996, "learning_rate": 5e-05, "epoch": 0.4373439200870846, "step": 6830 }, { "loss": 2.2135, "grad_norm": 1.7393558025360107, "learning_rate": 5e-05, "epoch": 0.43766408401101364, "step": 6835 }, { "loss": 2.2406, "grad_norm": 1.8635519742965698, "learning_rate": 5e-05, "epoch": 0.43798424793494267, "step": 6840 }, { "loss": 2.2176, "grad_norm": 1.757818579673767, "learning_rate": 5e-05, "epoch": 0.43830441185887176, "step": 6845 }, { "loss": 2.2769, "grad_norm": 1.670522928237915, "learning_rate": 5e-05, "epoch": 0.4386245757828008, "step": 6850 }, { "loss": 2.2687, "grad_norm": 1.708299160003662, "learning_rate": 5e-05, "epoch": 0.43894473970672987, "step": 6855 }, { "loss": 2.2174, "grad_norm": 1.6819125413894653, "learning_rate": 5e-05, "epoch": 0.4392649036306589, "step": 6860 }, { "loss": 2.2336, "grad_norm": 1.7067598104476929, "learning_rate": 5e-05, "epoch": 0.4395850675545879, "step": 6865 }, { "loss": 2.2244, "grad_norm": 1.6839826107025146, "learning_rate": 5e-05, "epoch": 0.439905231478517, "step": 6870 }, { "loss": 2.2274, "grad_norm": 1.8001630306243896, "learning_rate": 5e-05, "epoch": 0.44022539540244604, "step": 6875 }, { "loss": 2.2402, "grad_norm": 1.7565686702728271, "learning_rate": 5e-05, "epoch": 0.4405455593263751, "step": 6880 }, { "loss": 2.2296, "grad_norm": 1.6886423826217651, "learning_rate": 5e-05, "epoch": 0.44086572325030415, "step": 6885 }, { "loss": 2.2304, "grad_norm": 1.713010311126709, "learning_rate": 5e-05, "epoch": 0.4411858871742332, "step": 6890 }, { "loss": 2.2443, "grad_norm": 1.6640557050704956, "learning_rate": 5e-05, "epoch": 0.44150605109816227, "step": 6895 }, { "loss": 2.223, "grad_norm": 1.6528607606887817, "learning_rate": 5e-05, "epoch": 0.4418262150220913, "step": 6900 }, { "loss": 2.2264, "grad_norm": 1.6679795980453491, "learning_rate": 5e-05, "epoch": 0.4421463789460204, "step": 6905 }, { "loss": 2.2393, "grad_norm": 1.7212443351745605, "learning_rate": 5e-05, "epoch": 0.4424665428699494, "step": 6910 }, { "loss": 2.236, "grad_norm": 1.671025037765503, "learning_rate": 5e-05, "epoch": 0.4427867067938785, "step": 6915 }, { "loss": 2.2512, "grad_norm": 1.6580772399902344, "learning_rate": 5e-05, "epoch": 0.4431068707178075, "step": 6920 }, { "loss": 2.2237, "grad_norm": 1.8094478845596313, "learning_rate": 5e-05, "epoch": 0.44342703464173655, "step": 6925 }, { "loss": 2.2487, "grad_norm": 1.6925034523010254, "learning_rate": 5e-05, "epoch": 0.44374719856566563, "step": 6930 }, { "loss": 2.2323, "grad_norm": 1.6939678192138672, "learning_rate": 5e-05, "epoch": 0.44406736248959466, "step": 6935 }, { "loss": 2.1992, "grad_norm": 1.750412940979004, "learning_rate": 5e-05, "epoch": 0.44438752641352375, "step": 6940 }, { "loss": 2.2182, "grad_norm": 1.6810964345932007, "learning_rate": 5e-05, "epoch": 0.4447076903374528, "step": 6945 }, { "loss": 2.26, "grad_norm": 1.6222447156906128, "learning_rate": 5e-05, "epoch": 0.4450278542613818, "step": 6950 }, { "loss": 2.237, "grad_norm": 1.71504545211792, "learning_rate": 5e-05, "epoch": 0.4453480181853109, "step": 6955 }, { "loss": 2.2384, "grad_norm": 1.7647539377212524, "learning_rate": 5e-05, "epoch": 0.4456681821092399, "step": 6960 }, { "loss": 2.2321, "grad_norm": 1.815050482749939, "learning_rate": 5e-05, "epoch": 0.445988346033169, "step": 6965 }, { "loss": 2.2339, "grad_norm": 1.8233994245529175, "learning_rate": 5e-05, "epoch": 0.44630850995709803, "step": 6970 }, { "loss": 2.2443, "grad_norm": 1.7368268966674805, "learning_rate": 5e-05, "epoch": 0.44662867388102706, "step": 6975 }, { "loss": 2.2406, "grad_norm": 1.6867866516113281, "learning_rate": 5e-05, "epoch": 0.44694883780495615, "step": 6980 }, { "loss": 2.2212, "grad_norm": 1.633429765701294, "learning_rate": 5e-05, "epoch": 0.4472690017288852, "step": 6985 }, { "loss": 2.2543, "grad_norm": 1.6579304933547974, "learning_rate": 5e-05, "epoch": 0.44758916565281426, "step": 6990 }, { "loss": 2.2339, "grad_norm": 1.6452136039733887, "learning_rate": 5e-05, "epoch": 0.4479093295767433, "step": 6995 }, { "loss": 2.23, "grad_norm": 1.670894980430603, "learning_rate": 5e-05, "epoch": 0.44822949350067237, "step": 7000 }, { "eval_loss": 2.09128475189209, "eval_runtime": 9.5706, "eval_samples_per_second": 213.988, "eval_steps_per_second": 26.749, "epoch": 0.44822949350067237, "step": 7000 }, { "loss": 2.2297, "grad_norm": 1.6882041692733765, "learning_rate": 5e-05, "epoch": 0.4485496574246014, "step": 7005 }, { "loss": 2.217, "grad_norm": 1.721706748008728, "learning_rate": 5e-05, "epoch": 0.44886982134853043, "step": 7010 }, { "loss": 2.2618, "grad_norm": 1.7233175039291382, "learning_rate": 5e-05, "epoch": 0.4491899852724595, "step": 7015 }, { "loss": 2.2284, "grad_norm": 1.7620608806610107, "learning_rate": 5e-05, "epoch": 0.44951014919638854, "step": 7020 }, { "loss": 2.2098, "grad_norm": 1.692572832107544, "learning_rate": 5e-05, "epoch": 0.4498303131203176, "step": 7025 }, { "loss": 2.2384, "grad_norm": 1.8099894523620605, "learning_rate": 5e-05, "epoch": 0.45015047704424666, "step": 7030 }, { "loss": 2.2435, "grad_norm": 1.7223204374313354, "learning_rate": 5e-05, "epoch": 0.4504706409681757, "step": 7035 }, { "loss": 2.2161, "grad_norm": 1.692357063293457, "learning_rate": 5e-05, "epoch": 0.45079080489210477, "step": 7040 }, { "loss": 2.2449, "grad_norm": 1.6669970750808716, "learning_rate": 5e-05, "epoch": 0.4511109688160338, "step": 7045 }, { "loss": 2.2364, "grad_norm": 1.719010353088379, "learning_rate": 5e-05, "epoch": 0.4514311327399629, "step": 7050 }, { "loss": 2.2415, "grad_norm": 1.6553248167037964, "learning_rate": 5e-05, "epoch": 0.4517512966638919, "step": 7055 }, { "loss": 2.2294, "grad_norm": 1.6735188961029053, "learning_rate": 5e-05, "epoch": 0.45207146058782094, "step": 7060 }, { "loss": 2.2371, "grad_norm": 1.6721162796020508, "learning_rate": 5e-05, "epoch": 0.45239162451175, "step": 7065 }, { "loss": 2.2304, "grad_norm": 1.7263718843460083, "learning_rate": 5e-05, "epoch": 0.45271178843567905, "step": 7070 }, { "loss": 2.2442, "grad_norm": 1.7064590454101562, "learning_rate": 5e-05, "epoch": 0.45303195235960814, "step": 7075 }, { "loss": 2.2804, "grad_norm": 1.7297579050064087, "learning_rate": 5e-05, "epoch": 0.45335211628353717, "step": 7080 }, { "loss": 2.213, "grad_norm": 1.6997263431549072, "learning_rate": 5e-05, "epoch": 0.45367228020746625, "step": 7085 }, { "loss": 2.2262, "grad_norm": 1.6889290809631348, "learning_rate": 5e-05, "epoch": 0.4539924441313953, "step": 7090 }, { "loss": 2.2396, "grad_norm": 1.6912401914596558, "learning_rate": 5e-05, "epoch": 0.4543126080553243, "step": 7095 }, { "loss": 2.2471, "grad_norm": 1.6478922367095947, "learning_rate": 5e-05, "epoch": 0.4546327719792534, "step": 7100 }, { "loss": 2.1881, "grad_norm": 1.6519252061843872, "learning_rate": 5e-05, "epoch": 0.4549529359031824, "step": 7105 }, { "loss": 2.2228, "grad_norm": 1.7075591087341309, "learning_rate": 5e-05, "epoch": 0.4552730998271115, "step": 7110 }, { "loss": 2.2266, "grad_norm": 1.64700448513031, "learning_rate": 5e-05, "epoch": 0.45559326375104053, "step": 7115 }, { "loss": 2.222, "grad_norm": 1.6712502241134644, "learning_rate": 5e-05, "epoch": 0.45591342767496956, "step": 7120 }, { "loss": 2.2108, "grad_norm": 1.7444020509719849, "learning_rate": 5e-05, "epoch": 0.45623359159889865, "step": 7125 }, { "loss": 2.2235, "grad_norm": 1.739864468574524, "learning_rate": 5e-05, "epoch": 0.4565537555228277, "step": 7130 }, { "loss": 2.2242, "grad_norm": 1.6936887502670288, "learning_rate": 5e-05, "epoch": 0.45687391944675676, "step": 7135 }, { "loss": 2.2379, "grad_norm": 1.6458464860916138, "learning_rate": 5e-05, "epoch": 0.4571940833706858, "step": 7140 }, { "loss": 2.2267, "grad_norm": 1.6865026950836182, "learning_rate": 5e-05, "epoch": 0.4575142472946148, "step": 7145 }, { "loss": 2.2165, "grad_norm": 1.769681453704834, "learning_rate": 5e-05, "epoch": 0.4578344112185439, "step": 7150 }, { "loss": 2.2495, "grad_norm": 1.7259198427200317, "learning_rate": 5e-05, "epoch": 0.45815457514247293, "step": 7155 }, { "loss": 2.2434, "grad_norm": 1.6952046155929565, "learning_rate": 5e-05, "epoch": 0.458474739066402, "step": 7160 }, { "loss": 2.2313, "grad_norm": 1.7676249742507935, "learning_rate": 5e-05, "epoch": 0.45879490299033104, "step": 7165 }, { "loss": 2.2408, "grad_norm": 1.6167271137237549, "learning_rate": 5e-05, "epoch": 0.45911506691426013, "step": 7170 }, { "loss": 2.2076, "grad_norm": 1.7571128606796265, "learning_rate": 5e-05, "epoch": 0.45943523083818916, "step": 7175 }, { "loss": 2.2307, "grad_norm": 1.619318962097168, "learning_rate": 5e-05, "epoch": 0.4597553947621182, "step": 7180 }, { "loss": 2.2399, "grad_norm": 1.6977887153625488, "learning_rate": 5e-05, "epoch": 0.46007555868604727, "step": 7185 }, { "loss": 2.2279, "grad_norm": 1.7176746129989624, "learning_rate": 5e-05, "epoch": 0.4603957226099763, "step": 7190 }, { "loss": 2.2263, "grad_norm": 1.7717937231063843, "learning_rate": 5e-05, "epoch": 0.4607158865339054, "step": 7195 }, { "loss": 2.2539, "grad_norm": 1.7532376050949097, "learning_rate": 5e-05, "epoch": 0.4610360504578344, "step": 7200 }, { "eval_loss": 2.105297565460205, "eval_runtime": 9.2265, "eval_samples_per_second": 221.969, "eval_steps_per_second": 27.746, "epoch": 0.4610360504578344, "step": 7200 }, { "loss": 2.2402, "grad_norm": 1.6974916458129883, "learning_rate": 5e-05, "epoch": 0.46135621438176344, "step": 7205 }, { "loss": 2.205, "grad_norm": 1.703689455986023, "learning_rate": 5e-05, "epoch": 0.4616763783056925, "step": 7210 }, { "loss": 2.2214, "grad_norm": 1.6790088415145874, "learning_rate": 5e-05, "epoch": 0.46199654222962155, "step": 7215 }, { "loss": 2.2013, "grad_norm": 1.7056132555007935, "learning_rate": 5e-05, "epoch": 0.46231670615355064, "step": 7220 }, { "loss": 2.2226, "grad_norm": 1.7162805795669556, "learning_rate": 5e-05, "epoch": 0.46263687007747967, "step": 7225 }, { "loss": 2.2465, "grad_norm": 1.6653958559036255, "learning_rate": 5e-05, "epoch": 0.4629570340014087, "step": 7230 }, { "loss": 2.2545, "grad_norm": 1.7309342622756958, "learning_rate": 5e-05, "epoch": 0.4632771979253378, "step": 7235 }, { "loss": 2.2474, "grad_norm": 1.6637336015701294, "learning_rate": 5e-05, "epoch": 0.4635973618492668, "step": 7240 }, { "loss": 2.227, "grad_norm": 1.6410720348358154, "learning_rate": 5e-05, "epoch": 0.4639175257731959, "step": 7245 }, { "loss": 2.2268, "grad_norm": 1.7074912786483765, "learning_rate": 5e-05, "epoch": 0.4642376896971249, "step": 7250 }, { "loss": 2.212, "grad_norm": 1.7265816926956177, "learning_rate": 5e-05, "epoch": 0.464557853621054, "step": 7255 }, { "loss": 2.2419, "grad_norm": 1.7796136140823364, "learning_rate": 5e-05, "epoch": 0.46487801754498304, "step": 7260 }, { "loss": 2.2388, "grad_norm": 1.748008370399475, "learning_rate": 5e-05, "epoch": 0.46519818146891206, "step": 7265 }, { "loss": 2.2236, "grad_norm": 1.7407780885696411, "learning_rate": 5e-05, "epoch": 0.46551834539284115, "step": 7270 }, { "loss": 2.2306, "grad_norm": 1.7564735412597656, "learning_rate": 5e-05, "epoch": 0.4658385093167702, "step": 7275 }, { "loss": 2.246, "grad_norm": 1.8281434774398804, "learning_rate": 5e-05, "epoch": 0.46615867324069926, "step": 7280 }, { "loss": 2.2127, "grad_norm": 1.7006824016571045, "learning_rate": 5e-05, "epoch": 0.4664788371646283, "step": 7285 }, { "loss": 2.2081, "grad_norm": 1.7190055847167969, "learning_rate": 5e-05, "epoch": 0.4667990010885573, "step": 7290 }, { "loss": 2.2096, "grad_norm": 1.6515896320343018, "learning_rate": 5e-05, "epoch": 0.4671191650124864, "step": 7295 }, { "loss": 2.2523, "grad_norm": 1.7050222158432007, "learning_rate": 5e-05, "epoch": 0.46743932893641543, "step": 7300 }, { "loss": 2.257, "grad_norm": 1.8344449996948242, "learning_rate": 5e-05, "epoch": 0.4677594928603445, "step": 7305 }, { "loss": 2.2389, "grad_norm": 1.762683391571045, "learning_rate": 5e-05, "epoch": 0.46807965678427355, "step": 7310 }, { "loss": 2.2384, "grad_norm": 1.7812387943267822, "learning_rate": 5e-05, "epoch": 0.4683998207082026, "step": 7315 }, { "loss": 2.2468, "grad_norm": 1.6442910432815552, "learning_rate": 5e-05, "epoch": 0.46871998463213166, "step": 7320 }, { "loss": 2.1976, "grad_norm": 1.7651413679122925, "learning_rate": 5e-05, "epoch": 0.4690401485560607, "step": 7325 }, { "loss": 2.2286, "grad_norm": 1.6484975814819336, "learning_rate": 5e-05, "epoch": 0.46936031247998977, "step": 7330 }, { "loss": 2.2328, "grad_norm": 1.7871202230453491, "learning_rate": 5e-05, "epoch": 0.4696804764039188, "step": 7335 }, { "loss": 2.2273, "grad_norm": 1.7313917875289917, "learning_rate": 5e-05, "epoch": 0.4700006403278479, "step": 7340 }, { "loss": 2.2665, "grad_norm": 1.7805308103561401, "learning_rate": 5e-05, "epoch": 0.4703208042517769, "step": 7345 }, { "loss": 2.2815, "grad_norm": 1.734679937362671, "learning_rate": 5e-05, "epoch": 0.47064096817570594, "step": 7350 }, { "loss": 2.2212, "grad_norm": 1.696459412574768, "learning_rate": 5e-05, "epoch": 0.470961132099635, "step": 7355 }, { "loss": 2.2609, "grad_norm": 1.767733097076416, "learning_rate": 5e-05, "epoch": 0.47128129602356406, "step": 7360 }, { "loss": 2.2074, "grad_norm": 1.703615665435791, "learning_rate": 5e-05, "epoch": 0.47160145994749314, "step": 7365 }, { "loss": 2.2239, "grad_norm": 1.6856378316879272, "learning_rate": 5e-05, "epoch": 0.47192162387142217, "step": 7370 }, { "loss": 2.2218, "grad_norm": 1.75584876537323, "learning_rate": 5e-05, "epoch": 0.4722417877953512, "step": 7375 }, { "loss": 2.2304, "grad_norm": 1.733469843864441, "learning_rate": 5e-05, "epoch": 0.4725619517192803, "step": 7380 }, { "loss": 2.242, "grad_norm": 1.6238700151443481, "learning_rate": 5e-05, "epoch": 0.4728821156432093, "step": 7385 }, { "loss": 2.2392, "grad_norm": 1.7466235160827637, "learning_rate": 5e-05, "epoch": 0.4732022795671384, "step": 7390 }, { "loss": 2.2455, "grad_norm": 1.8809025287628174, "learning_rate": 5e-05, "epoch": 0.4735224434910674, "step": 7395 }, { "loss": 2.2431, "grad_norm": 1.705507516860962, "learning_rate": 5e-05, "epoch": 0.47384260741499645, "step": 7400 }, { "eval_loss": 2.098146915435791, "eval_runtime": 9.2837, "eval_samples_per_second": 220.601, "eval_steps_per_second": 27.575, "epoch": 0.47384260741499645, "step": 7400 }, { "loss": 2.1957, "grad_norm": 1.7113065719604492, "learning_rate": 5e-05, "epoch": 0.47416277133892554, "step": 7405 }, { "loss": 2.2073, "grad_norm": 1.6644835472106934, "learning_rate": 5e-05, "epoch": 0.47448293526285457, "step": 7410 }, { "loss": 2.2312, "grad_norm": 1.6765412092208862, "learning_rate": 5e-05, "epoch": 0.47480309918678365, "step": 7415 }, { "loss": 2.2219, "grad_norm": 1.6999036073684692, "learning_rate": 5e-05, "epoch": 0.4751232631107127, "step": 7420 }, { "loss": 2.2325, "grad_norm": 1.6527864933013916, "learning_rate": 5e-05, "epoch": 0.47544342703464176, "step": 7425 }, { "loss": 2.2194, "grad_norm": 1.6665046215057373, "learning_rate": 5e-05, "epoch": 0.4757635909585708, "step": 7430 }, { "loss": 2.2436, "grad_norm": 1.6418559551239014, "learning_rate": 5e-05, "epoch": 0.4760837548824998, "step": 7435 }, { "loss": 2.2442, "grad_norm": 1.6663761138916016, "learning_rate": 5e-05, "epoch": 0.4764039188064289, "step": 7440 }, { "loss": 2.2229, "grad_norm": 1.7055283784866333, "learning_rate": 5e-05, "epoch": 0.47672408273035793, "step": 7445 }, { "loss": 2.233, "grad_norm": 1.8025060892105103, "learning_rate": 5e-05, "epoch": 0.477044246654287, "step": 7450 }, { "loss": 2.2237, "grad_norm": 1.688681721687317, "learning_rate": 5e-05, "epoch": 0.47736441057821605, "step": 7455 }, { "loss": 2.2231, "grad_norm": 1.7080329656600952, "learning_rate": 5e-05, "epoch": 0.4776845745021451, "step": 7460 }, { "loss": 2.2249, "grad_norm": 1.6620713472366333, "learning_rate": 5e-05, "epoch": 0.47800473842607416, "step": 7465 }, { "loss": 2.2358, "grad_norm": 1.7365187406539917, "learning_rate": 5e-05, "epoch": 0.4783249023500032, "step": 7470 }, { "loss": 2.2135, "grad_norm": 1.6564573049545288, "learning_rate": 5e-05, "epoch": 0.4786450662739323, "step": 7475 }, { "loss": 2.2008, "grad_norm": 1.6541259288787842, "learning_rate": 5e-05, "epoch": 0.4789652301978613, "step": 7480 }, { "loss": 2.1807, "grad_norm": 1.7617090940475464, "learning_rate": 5e-05, "epoch": 0.47928539412179033, "step": 7485 }, { "loss": 2.2205, "grad_norm": 1.6854571104049683, "learning_rate": 5e-05, "epoch": 0.4796055580457194, "step": 7490 }, { "loss": 2.2363, "grad_norm": 1.7207971811294556, "learning_rate": 5e-05, "epoch": 0.47992572196964844, "step": 7495 }, { "loss": 2.2369, "grad_norm": 1.7152527570724487, "learning_rate": 5e-05, "epoch": 0.48024588589357753, "step": 7500 }, { "loss": 2.2348, "grad_norm": 1.6942616701126099, "learning_rate": 5e-05, "epoch": 0.48056604981750656, "step": 7505 }, { "loss": 2.2159, "grad_norm": 1.7268720865249634, "learning_rate": 5e-05, "epoch": 0.48088621374143564, "step": 7510 }, { "loss": 2.2252, "grad_norm": 1.7245421409606934, "learning_rate": 5e-05, "epoch": 0.48120637766536467, "step": 7515 }, { "loss": 2.235, "grad_norm": 1.8024479150772095, "learning_rate": 5e-05, "epoch": 0.4815265415892937, "step": 7520 }, { "loss": 2.2198, "grad_norm": 1.7535227537155151, "learning_rate": 5e-05, "epoch": 0.4818467055132228, "step": 7525 }, { "loss": 2.2312, "grad_norm": 1.6243641376495361, "learning_rate": 5e-05, "epoch": 0.4821668694371518, "step": 7530 }, { "loss": 2.2252, "grad_norm": 1.7475508451461792, "learning_rate": 5e-05, "epoch": 0.4824870333610809, "step": 7535 }, { "loss": 2.2565, "grad_norm": 1.6990846395492554, "learning_rate": 5e-05, "epoch": 0.4828071972850099, "step": 7540 }, { "loss": 2.2315, "grad_norm": 1.758632779121399, "learning_rate": 5e-05, "epoch": 0.48312736120893895, "step": 7545 }, { "loss": 2.2011, "grad_norm": 1.6367809772491455, "learning_rate": 5e-05, "epoch": 0.48344752513286804, "step": 7550 }, { "loss": 2.2523, "grad_norm": 1.667632818222046, "learning_rate": 5e-05, "epoch": 0.48376768905679707, "step": 7555 }, { "loss": 2.2309, "grad_norm": 1.674497365951538, "learning_rate": 5e-05, "epoch": 0.48408785298072615, "step": 7560 }, { "loss": 2.2254, "grad_norm": 1.6481330394744873, "learning_rate": 5e-05, "epoch": 0.4844080169046552, "step": 7565 }, { "loss": 2.2089, "grad_norm": 1.6713910102844238, "learning_rate": 5e-05, "epoch": 0.4847281808285842, "step": 7570 }, { "loss": 2.2481, "grad_norm": 1.807297945022583, "learning_rate": 5e-05, "epoch": 0.4850483447525133, "step": 7575 }, { "loss": 2.227, "grad_norm": 1.6840758323669434, "learning_rate": 5e-05, "epoch": 0.4853685086764423, "step": 7580 }, { "loss": 2.2443, "grad_norm": 1.7010893821716309, "learning_rate": 5e-05, "epoch": 0.4856886726003714, "step": 7585 }, { "loss": 2.2416, "grad_norm": 1.6932690143585205, "learning_rate": 5e-05, "epoch": 0.48600883652430044, "step": 7590 }, { "loss": 2.2301, "grad_norm": 1.7321279048919678, "learning_rate": 5e-05, "epoch": 0.4863290004482295, "step": 7595 }, { "loss": 2.2394, "grad_norm": 1.7051780223846436, "learning_rate": 5e-05, "epoch": 0.48664916437215855, "step": 7600 }, { "eval_loss": 2.1021814346313477, "eval_runtime": 9.0753, "eval_samples_per_second": 225.668, "eval_steps_per_second": 28.208, "epoch": 0.48664916437215855, "step": 7600 }, { "loss": 2.2093, "grad_norm": 1.7261319160461426, "learning_rate": 5e-05, "epoch": 0.4869693282960876, "step": 7605 }, { "loss": 2.2148, "grad_norm": 1.7780207395553589, "learning_rate": 5e-05, "epoch": 0.48728949222001666, "step": 7610 }, { "loss": 2.217, "grad_norm": 1.7456703186035156, "learning_rate": 5e-05, "epoch": 0.4876096561439457, "step": 7615 }, { "loss": 2.1884, "grad_norm": 1.8097208738327026, "learning_rate": 5e-05, "epoch": 0.4879298200678748, "step": 7620 }, { "loss": 2.2064, "grad_norm": 1.7063500881195068, "learning_rate": 5e-05, "epoch": 0.4882499839918038, "step": 7625 }, { "loss": 2.2059, "grad_norm": 1.7562440633773804, "learning_rate": 5e-05, "epoch": 0.48857014791573283, "step": 7630 }, { "loss": 2.2136, "grad_norm": 1.7514058351516724, "learning_rate": 5e-05, "epoch": 0.4888903118396619, "step": 7635 }, { "loss": 2.2369, "grad_norm": 1.7494462728500366, "learning_rate": 5e-05, "epoch": 0.48921047576359095, "step": 7640 }, { "loss": 2.2129, "grad_norm": 1.7088241577148438, "learning_rate": 5e-05, "epoch": 0.48953063968752003, "step": 7645 }, { "loss": 2.2257, "grad_norm": 1.6516956090927124, "learning_rate": 5e-05, "epoch": 0.48985080361144906, "step": 7650 }, { "loss": 2.246, "grad_norm": 1.7086745500564575, "learning_rate": 5e-05, "epoch": 0.4901709675353781, "step": 7655 }, { "loss": 2.2448, "grad_norm": 1.748744010925293, "learning_rate": 5e-05, "epoch": 0.4904911314593072, "step": 7660 }, { "loss": 2.2262, "grad_norm": 1.612053632736206, "learning_rate": 5e-05, "epoch": 0.4908112953832362, "step": 7665 }, { "loss": 2.2449, "grad_norm": 1.7028324604034424, "learning_rate": 5e-05, "epoch": 0.4911314593071653, "step": 7670 }, { "loss": 2.2583, "grad_norm": 1.735455870628357, "learning_rate": 5e-05, "epoch": 0.4914516232310943, "step": 7675 }, { "loss": 2.209, "grad_norm": 1.806516170501709, "learning_rate": 5e-05, "epoch": 0.4917717871550234, "step": 7680 }, { "loss": 2.226, "grad_norm": 1.6990715265274048, "learning_rate": 5e-05, "epoch": 0.4920919510789524, "step": 7685 }, { "loss": 2.2201, "grad_norm": 1.6955472230911255, "learning_rate": 5e-05, "epoch": 0.49241211500288146, "step": 7690 }, { "loss": 2.2677, "grad_norm": 1.6222796440124512, "learning_rate": 5e-05, "epoch": 0.49273227892681054, "step": 7695 }, { "loss": 2.2376, "grad_norm": 1.7312852144241333, "learning_rate": 5e-05, "epoch": 0.49305244285073957, "step": 7700 }, { "loss": 2.2389, "grad_norm": 1.6851000785827637, "learning_rate": 5e-05, "epoch": 0.49337260677466865, "step": 7705 }, { "loss": 2.2027, "grad_norm": 1.6476107835769653, "learning_rate": 5e-05, "epoch": 0.4936927706985977, "step": 7710 }, { "loss": 2.2448, "grad_norm": 1.6625196933746338, "learning_rate": 5e-05, "epoch": 0.4940129346225267, "step": 7715 }, { "loss": 2.1801, "grad_norm": 1.7839092016220093, "learning_rate": 5e-05, "epoch": 0.4943330985464558, "step": 7720 }, { "loss": 2.2286, "grad_norm": 1.6703824996948242, "learning_rate": 5e-05, "epoch": 0.4946532624703848, "step": 7725 }, { "loss": 2.1893, "grad_norm": 1.825608253479004, "learning_rate": 5e-05, "epoch": 0.4949734263943139, "step": 7730 }, { "loss": 2.2228, "grad_norm": 1.6634159088134766, "learning_rate": 5e-05, "epoch": 0.49529359031824294, "step": 7735 }, { "loss": 2.203, "grad_norm": 1.6320128440856934, "learning_rate": 5e-05, "epoch": 0.49561375424217197, "step": 7740 }, { "loss": 2.2567, "grad_norm": 1.6982113122940063, "learning_rate": 5e-05, "epoch": 0.49593391816610105, "step": 7745 }, { "loss": 2.2155, "grad_norm": 1.5984007120132446, "learning_rate": 5e-05, "epoch": 0.4962540820900301, "step": 7750 }, { "loss": 2.2294, "grad_norm": 1.7036561965942383, "learning_rate": 5e-05, "epoch": 0.49657424601395916, "step": 7755 }, { "loss": 2.2139, "grad_norm": 1.7015743255615234, "learning_rate": 5e-05, "epoch": 0.4968944099378882, "step": 7760 }, { "loss": 2.2231, "grad_norm": 1.7515677213668823, "learning_rate": 5e-05, "epoch": 0.4972145738618173, "step": 7765 }, { "loss": 2.2211, "grad_norm": 1.7809141874313354, "learning_rate": 5e-05, "epoch": 0.4975347377857463, "step": 7770 }, { "loss": 2.2251, "grad_norm": 1.6579275131225586, "learning_rate": 5e-05, "epoch": 0.49785490170967533, "step": 7775 }, { "loss": 2.2045, "grad_norm": 1.6126208305358887, "learning_rate": 5e-05, "epoch": 0.4981750656336044, "step": 7780 }, { "loss": 2.1937, "grad_norm": 1.7193243503570557, "learning_rate": 5e-05, "epoch": 0.49849522955753345, "step": 7785 }, { "loss": 2.2378, "grad_norm": 1.7689208984375, "learning_rate": 5e-05, "epoch": 0.49881539348146253, "step": 7790 }, { "loss": 2.2021, "grad_norm": 1.7081634998321533, "learning_rate": 5e-05, "epoch": 0.49913555740539156, "step": 7795 }, { "loss": 2.2286, "grad_norm": 1.682572841644287, "learning_rate": 5e-05, "epoch": 0.4994557213293206, "step": 7800 }, { "eval_loss": 2.093745231628418, "eval_runtime": 9.0651, "eval_samples_per_second": 225.921, "eval_steps_per_second": 28.24, "epoch": 0.4994557213293206, "step": 7800 }, { "loss": 2.2273, "grad_norm": 1.7163746356964111, "learning_rate": 5e-05, "epoch": 0.4997758852532497, "step": 7805 }, { "loss": 2.2204, "grad_norm": 1.7658613920211792, "learning_rate": 5e-05, "epoch": 0.5000960491771788, "step": 7810 }, { "loss": 2.2302, "grad_norm": 1.7190687656402588, "learning_rate": 5e-05, "epoch": 0.5004162131011077, "step": 7815 }, { "loss": 2.2075, "grad_norm": 1.7589188814163208, "learning_rate": 5e-05, "epoch": 0.5007363770250368, "step": 7820 }, { "loss": 2.2345, "grad_norm": 1.6196938753128052, "learning_rate": 5e-05, "epoch": 0.5010565409489659, "step": 7825 }, { "loss": 2.2037, "grad_norm": 1.5979619026184082, "learning_rate": 5e-05, "epoch": 0.5013767048728949, "step": 7830 }, { "loss": 2.2133, "grad_norm": 1.748252511024475, "learning_rate": 5e-05, "epoch": 0.501696868796824, "step": 7835 }, { "loss": 2.1874, "grad_norm": 1.6827281713485718, "learning_rate": 5e-05, "epoch": 0.502017032720753, "step": 7840 }, { "loss": 2.2298, "grad_norm": 1.7303760051727295, "learning_rate": 5e-05, "epoch": 0.5023371966446821, "step": 7845 }, { "loss": 2.2331, "grad_norm": 1.724907398223877, "learning_rate": 5e-05, "epoch": 0.5026573605686111, "step": 7850 }, { "loss": 2.2103, "grad_norm": 1.6769534349441528, "learning_rate": 5e-05, "epoch": 0.5029775244925402, "step": 7855 }, { "loss": 2.2251, "grad_norm": 1.7141268253326416, "learning_rate": 5e-05, "epoch": 0.5032976884164693, "step": 7860 }, { "loss": 2.2383, "grad_norm": 1.691082239151001, "learning_rate": 5e-05, "epoch": 0.5036178523403982, "step": 7865 }, { "loss": 2.2198, "grad_norm": 1.7353357076644897, "learning_rate": 5e-05, "epoch": 0.5039380162643273, "step": 7870 }, { "loss": 2.2259, "grad_norm": 1.6525827646255493, "learning_rate": 5e-05, "epoch": 0.5042581801882564, "step": 7875 }, { "loss": 2.1898, "grad_norm": 1.592120885848999, "learning_rate": 5e-05, "epoch": 0.5045783441121854, "step": 7880 }, { "loss": 2.2348, "grad_norm": 1.6837745904922485, "learning_rate": 5e-05, "epoch": 0.5048985080361145, "step": 7885 }, { "loss": 2.1958, "grad_norm": 1.5835824012756348, "learning_rate": 5e-05, "epoch": 0.5052186719600436, "step": 7890 }, { "loss": 2.218, "grad_norm": 1.701811671257019, "learning_rate": 5e-05, "epoch": 0.5055388358839726, "step": 7895 }, { "loss": 2.236, "grad_norm": 1.6456730365753174, "learning_rate": 5e-05, "epoch": 0.5058589998079016, "step": 7900 }, { "loss": 2.2219, "grad_norm": 1.8244099617004395, "learning_rate": 5e-05, "epoch": 0.5061791637318307, "step": 7905 }, { "loss": 2.2081, "grad_norm": 1.7030236721038818, "learning_rate": 5e-05, "epoch": 0.5064993276557598, "step": 7910 }, { "loss": 2.2244, "grad_norm": 1.7620062828063965, "learning_rate": 5e-05, "epoch": 0.5068194915796888, "step": 7915 }, { "loss": 2.2201, "grad_norm": 1.6482844352722168, "learning_rate": 5e-05, "epoch": 0.5071396555036178, "step": 7920 }, { "loss": 2.2033, "grad_norm": 1.6990326642990112, "learning_rate": 5e-05, "epoch": 0.5074598194275469, "step": 7925 }, { "loss": 2.2165, "grad_norm": 1.8352545499801636, "learning_rate": 5e-05, "epoch": 0.507779983351476, "step": 7930 }, { "loss": 2.2163, "grad_norm": 1.757162094116211, "learning_rate": 5e-05, "epoch": 0.508100147275405, "step": 7935 }, { "loss": 2.2313, "grad_norm": 1.762620210647583, "learning_rate": 5e-05, "epoch": 0.5084203111993341, "step": 7940 }, { "loss": 2.2162, "grad_norm": 1.82100510597229, "learning_rate": 5e-05, "epoch": 0.5087404751232631, "step": 7945 }, { "loss": 2.2278, "grad_norm": 1.6846513748168945, "learning_rate": 5e-05, "epoch": 0.5090606390471921, "step": 7950 }, { "loss": 2.1958, "grad_norm": 1.769995093345642, "learning_rate": 5e-05, "epoch": 0.5093808029711212, "step": 7955 }, { "loss": 2.1942, "grad_norm": 1.7381685972213745, "learning_rate": 5e-05, "epoch": 0.5097009668950503, "step": 7960 }, { "loss": 2.2312, "grad_norm": 1.723332166671753, "learning_rate": 5e-05, "epoch": 0.5100211308189793, "step": 7965 }, { "loss": 2.2185, "grad_norm": 1.7073355913162231, "learning_rate": 5e-05, "epoch": 0.5103412947429083, "step": 7970 }, { "loss": 2.2389, "grad_norm": 1.625958800315857, "learning_rate": 5e-05, "epoch": 0.5106614586668374, "step": 7975 }, { "loss": 2.2267, "grad_norm": 1.673528790473938, "learning_rate": 5e-05, "epoch": 0.5109816225907665, "step": 7980 }, { "loss": 2.224, "grad_norm": 1.6753426790237427, "learning_rate": 5e-05, "epoch": 0.5113017865146955, "step": 7985 }, { "loss": 2.2299, "grad_norm": 1.6783485412597656, "learning_rate": 5e-05, "epoch": 0.5116219504386246, "step": 7990 }, { "loss": 2.2078, "grad_norm": 1.6852103471755981, "learning_rate": 5e-05, "epoch": 0.5119421143625537, "step": 7995 }, { "loss": 2.2307, "grad_norm": 1.7385365962982178, "learning_rate": 5e-05, "epoch": 0.5122622782864826, "step": 8000 }, { "eval_loss": 2.0892796516418457, "eval_runtime": 9.339, "eval_samples_per_second": 219.294, "eval_steps_per_second": 27.412, "epoch": 0.5122622782864826, "step": 8000 }, { "loss": 2.2291, "grad_norm": 1.5970470905303955, "learning_rate": 5e-05, "epoch": 0.5125824422104117, "step": 8005 }, { "loss": 2.2039, "grad_norm": 1.7087442874908447, "learning_rate": 5e-05, "epoch": 0.5129026061343408, "step": 8010 }, { "loss": 2.204, "grad_norm": 1.634369969367981, "learning_rate": 5e-05, "epoch": 0.5132227700582699, "step": 8015 }, { "loss": 2.2052, "grad_norm": 1.6249363422393799, "learning_rate": 5e-05, "epoch": 0.5135429339821989, "step": 8020 }, { "loss": 2.227, "grad_norm": 1.75498366355896, "learning_rate": 5e-05, "epoch": 0.5138630979061279, "step": 8025 }, { "loss": 2.229, "grad_norm": 1.7273918390274048, "learning_rate": 5e-05, "epoch": 0.514183261830057, "step": 8030 }, { "loss": 2.2165, "grad_norm": 1.7545193433761597, "learning_rate": 5e-05, "epoch": 0.514503425753986, "step": 8035 }, { "loss": 2.224, "grad_norm": 1.73219895362854, "learning_rate": 5e-05, "epoch": 0.5148235896779151, "step": 8040 }, { "loss": 2.2239, "grad_norm": 1.7364838123321533, "learning_rate": 5e-05, "epoch": 0.5151437536018442, "step": 8045 }, { "loss": 2.2069, "grad_norm": 1.626757025718689, "learning_rate": 5e-05, "epoch": 0.5154639175257731, "step": 8050 }, { "loss": 2.1969, "grad_norm": 1.7197450399398804, "learning_rate": 5e-05, "epoch": 0.5157840814497022, "step": 8055 }, { "loss": 2.1923, "grad_norm": 1.7638471126556396, "learning_rate": 5e-05, "epoch": 0.5161042453736313, "step": 8060 }, { "loss": 2.218, "grad_norm": 1.6772651672363281, "learning_rate": 5e-05, "epoch": 0.5164244092975604, "step": 8065 }, { "loss": 2.2105, "grad_norm": 1.707062005996704, "learning_rate": 5e-05, "epoch": 0.5167445732214894, "step": 8070 }, { "loss": 2.235, "grad_norm": 1.679762601852417, "learning_rate": 5e-05, "epoch": 0.5170647371454185, "step": 8075 }, { "loss": 2.2301, "grad_norm": 1.6003955602645874, "learning_rate": 5e-05, "epoch": 0.5173849010693475, "step": 8080 }, { "loss": 2.2387, "grad_norm": 1.7114651203155518, "learning_rate": 5e-05, "epoch": 0.5177050649932765, "step": 8085 }, { "loss": 2.1936, "grad_norm": 1.6603319644927979, "learning_rate": 5e-05, "epoch": 0.5180252289172056, "step": 8090 }, { "loss": 2.2455, "grad_norm": 1.806725263595581, "learning_rate": 5e-05, "epoch": 0.5183453928411347, "step": 8095 }, { "loss": 2.2437, "grad_norm": 1.642220377922058, "learning_rate": 5e-05, "epoch": 0.5186655567650638, "step": 8100 }, { "loss": 2.2011, "grad_norm": 1.641445279121399, "learning_rate": 5e-05, "epoch": 0.5189857206889927, "step": 8105 }, { "loss": 2.2033, "grad_norm": 1.741835594177246, "learning_rate": 5e-05, "epoch": 0.5193058846129218, "step": 8110 }, { "loss": 2.2164, "grad_norm": 1.6442292928695679, "learning_rate": 5e-05, "epoch": 0.5196260485368509, "step": 8115 }, { "loss": 2.1894, "grad_norm": 1.6900848150253296, "learning_rate": 5e-05, "epoch": 0.5199462124607799, "step": 8120 }, { "loss": 2.1935, "grad_norm": 1.69651198387146, "learning_rate": 5e-05, "epoch": 0.520266376384709, "step": 8125 }, { "loss": 2.2244, "grad_norm": 1.6706749200820923, "learning_rate": 5e-05, "epoch": 0.520586540308638, "step": 8130 }, { "loss": 2.2044, "grad_norm": 1.7208715677261353, "learning_rate": 5e-05, "epoch": 0.520906704232567, "step": 8135 }, { "loss": 2.2312, "grad_norm": 1.554226040840149, "learning_rate": 5e-05, "epoch": 0.5212268681564961, "step": 8140 }, { "loss": 2.2347, "grad_norm": 1.7291901111602783, "learning_rate": 5e-05, "epoch": 0.5215470320804252, "step": 8145 }, { "loss": 2.2104, "grad_norm": 1.6915407180786133, "learning_rate": 5e-05, "epoch": 0.5218671960043543, "step": 8150 }, { "loss": 2.1875, "grad_norm": 1.6418696641921997, "learning_rate": 5e-05, "epoch": 0.5221873599282832, "step": 8155 }, { "loss": 2.2015, "grad_norm": 1.656497597694397, "learning_rate": 5e-05, "epoch": 0.5225075238522123, "step": 8160 }, { "loss": 2.2165, "grad_norm": 1.78023362159729, "learning_rate": 5e-05, "epoch": 0.5228276877761414, "step": 8165 }, { "loss": 2.2254, "grad_norm": 1.690218448638916, "learning_rate": 5e-05, "epoch": 0.5231478517000704, "step": 8170 }, { "loss": 2.2104, "grad_norm": 1.6391229629516602, "learning_rate": 5e-05, "epoch": 0.5234680156239995, "step": 8175 }, { "loss": 2.194, "grad_norm": 1.6620936393737793, "learning_rate": 5e-05, "epoch": 0.5237881795479286, "step": 8180 }, { "loss": 2.1981, "grad_norm": 1.6374820470809937, "learning_rate": 5e-05, "epoch": 0.5241083434718576, "step": 8185 }, { "loss": 2.2037, "grad_norm": 1.6494983434677124, "learning_rate": 5e-05, "epoch": 0.5244285073957866, "step": 8190 }, { "loss": 2.2126, "grad_norm": 1.6629283428192139, "learning_rate": 5e-05, "epoch": 0.5247486713197157, "step": 8195 }, { "loss": 2.1914, "grad_norm": 1.7156426906585693, "learning_rate": 5e-05, "epoch": 0.5250688352436448, "step": 8200 }, { "eval_loss": 2.0739922523498535, "eval_runtime": 9.5483, "eval_samples_per_second": 214.489, "eval_steps_per_second": 26.811, "epoch": 0.5250688352436448, "step": 8200 }, { "loss": 2.2416, "grad_norm": 1.7021827697753906, "learning_rate": 5e-05, "epoch": 0.5253889991675738, "step": 8205 }, { "loss": 2.1897, "grad_norm": 1.7273354530334473, "learning_rate": 5e-05, "epoch": 0.5257091630915028, "step": 8210 }, { "loss": 2.2093, "grad_norm": 1.6649446487426758, "learning_rate": 5e-05, "epoch": 0.5260293270154319, "step": 8215 }, { "loss": 2.2231, "grad_norm": 1.7139078378677368, "learning_rate": 5e-05, "epoch": 0.5263494909393609, "step": 8220 }, { "loss": 2.212, "grad_norm": 1.7619116306304932, "learning_rate": 5e-05, "epoch": 0.52666965486329, "step": 8225 }, { "loss": 2.2043, "grad_norm": 1.537295937538147, "learning_rate": 5e-05, "epoch": 0.5269898187872191, "step": 8230 }, { "loss": 2.1928, "grad_norm": 1.6285382509231567, "learning_rate": 5e-05, "epoch": 0.5273099827111482, "step": 8235 }, { "loss": 2.217, "grad_norm": 1.673884630203247, "learning_rate": 5e-05, "epoch": 0.5276301466350771, "step": 8240 }, { "loss": 2.2091, "grad_norm": 1.7069264650344849, "learning_rate": 5e-05, "epoch": 0.5279503105590062, "step": 8245 }, { "loss": 2.2122, "grad_norm": 1.6686755418777466, "learning_rate": 5e-05, "epoch": 0.5282704744829353, "step": 8250 }, { "loss": 2.2098, "grad_norm": 1.7338638305664062, "learning_rate": 5e-05, "epoch": 0.5285906384068643, "step": 8255 }, { "loss": 2.1981, "grad_norm": 1.7347779273986816, "learning_rate": 5e-05, "epoch": 0.5289108023307934, "step": 8260 }, { "loss": 2.2123, "grad_norm": 1.6788371801376343, "learning_rate": 5e-05, "epoch": 0.5292309662547224, "step": 8265 }, { "loss": 2.2125, "grad_norm": 1.7118918895721436, "learning_rate": 5e-05, "epoch": 0.5295511301786515, "step": 8270 }, { "loss": 2.2498, "grad_norm": 1.6705480813980103, "learning_rate": 5e-05, "epoch": 0.5298712941025805, "step": 8275 }, { "loss": 2.2, "grad_norm": 1.6503225564956665, "learning_rate": 5e-05, "epoch": 0.5301914580265096, "step": 8280 }, { "loss": 2.2383, "grad_norm": 1.7550666332244873, "learning_rate": 5e-05, "epoch": 0.5305116219504387, "step": 8285 }, { "loss": 2.1962, "grad_norm": 1.9004778861999512, "learning_rate": 5e-05, "epoch": 0.5308317858743676, "step": 8290 }, { "loss": 2.19, "grad_norm": 1.6682841777801514, "learning_rate": 5e-05, "epoch": 0.5311519497982967, "step": 8295 }, { "loss": 2.2168, "grad_norm": 1.801561951637268, "learning_rate": 5e-05, "epoch": 0.5314721137222258, "step": 8300 }, { "loss": 2.1998, "grad_norm": 1.621793270111084, "learning_rate": 5e-05, "epoch": 0.5317922776461548, "step": 8305 }, { "loss": 2.2359, "grad_norm": 1.597138524055481, "learning_rate": 5e-05, "epoch": 0.5321124415700839, "step": 8310 }, { "loss": 2.2064, "grad_norm": 1.5918503999710083, "learning_rate": 5e-05, "epoch": 0.532432605494013, "step": 8315 }, { "loss": 2.2203, "grad_norm": 1.6550569534301758, "learning_rate": 5e-05, "epoch": 0.532752769417942, "step": 8320 }, { "loss": 2.2105, "grad_norm": 1.5995575189590454, "learning_rate": 5e-05, "epoch": 0.533072933341871, "step": 8325 }, { "loss": 2.1901, "grad_norm": 1.7386360168457031, "learning_rate": 5e-05, "epoch": 0.5333930972658001, "step": 8330 }, { "loss": 2.2097, "grad_norm": 1.672532320022583, "learning_rate": 5e-05, "epoch": 0.5337132611897292, "step": 8335 }, { "loss": 2.2103, "grad_norm": 1.6559290885925293, "learning_rate": 5e-05, "epoch": 0.5340334251136581, "step": 8340 }, { "loss": 2.1924, "grad_norm": 1.7685126066207886, "learning_rate": 5e-05, "epoch": 0.5343535890375872, "step": 8345 }, { "loss": 2.2295, "grad_norm": 1.6063467264175415, "learning_rate": 5e-05, "epoch": 0.5346737529615163, "step": 8350 }, { "loss": 2.21, "grad_norm": 1.6099108457565308, "learning_rate": 5e-05, "epoch": 0.5349939168854454, "step": 8355 }, { "loss": 2.2266, "grad_norm": 1.751085877418518, "learning_rate": 5e-05, "epoch": 0.5353140808093744, "step": 8360 }, { "loss": 2.229, "grad_norm": 1.7449450492858887, "learning_rate": 5e-05, "epoch": 0.5356342447333035, "step": 8365 }, { "loss": 2.2268, "grad_norm": 1.6309188604354858, "learning_rate": 5e-05, "epoch": 0.5359544086572325, "step": 8370 }, { "loss": 2.1899, "grad_norm": 1.632546305656433, "learning_rate": 5e-05, "epoch": 0.5362745725811615, "step": 8375 }, { "loss": 2.2096, "grad_norm": 1.6440316438674927, "learning_rate": 5e-05, "epoch": 0.5365947365050906, "step": 8380 }, { "loss": 2.2389, "grad_norm": 1.7283806800842285, "learning_rate": 5e-05, "epoch": 0.5369149004290197, "step": 8385 }, { "loss": 2.2445, "grad_norm": 1.6345056295394897, "learning_rate": 5e-05, "epoch": 0.5372350643529487, "step": 8390 }, { "loss": 2.1955, "grad_norm": 1.7141886949539185, "learning_rate": 5e-05, "epoch": 0.5375552282768777, "step": 8395 }, { "loss": 2.2052, "grad_norm": 1.652571678161621, "learning_rate": 5e-05, "epoch": 0.5378753922008068, "step": 8400 }, { "eval_loss": 2.0771679878234863, "eval_runtime": 9.1156, "eval_samples_per_second": 224.67, "eval_steps_per_second": 28.084, "epoch": 0.5378753922008068, "step": 8400 }, { "loss": 2.2219, "grad_norm": 1.7584346532821655, "learning_rate": 5e-05, "epoch": 0.5381955561247359, "step": 8405 }, { "loss": 2.2288, "grad_norm": 1.6449029445648193, "learning_rate": 5e-05, "epoch": 0.5385157200486649, "step": 8410 }, { "loss": 2.2211, "grad_norm": 1.7171015739440918, "learning_rate": 5e-05, "epoch": 0.538835883972594, "step": 8415 }, { "loss": 2.2383, "grad_norm": 1.8249092102050781, "learning_rate": 5e-05, "epoch": 0.539156047896523, "step": 8420 }, { "loss": 2.2288, "grad_norm": 1.7422677278518677, "learning_rate": 5e-05, "epoch": 0.539476211820452, "step": 8425 }, { "loss": 2.2267, "grad_norm": 1.7028205394744873, "learning_rate": 5e-05, "epoch": 0.5397963757443811, "step": 8430 }, { "loss": 2.2037, "grad_norm": 1.6526613235473633, "learning_rate": 5e-05, "epoch": 0.5401165396683102, "step": 8435 }, { "loss": 2.2046, "grad_norm": 1.6816647052764893, "learning_rate": 5e-05, "epoch": 0.5404367035922393, "step": 8440 }, { "loss": 2.2139, "grad_norm": 1.7449307441711426, "learning_rate": 5e-05, "epoch": 0.5407568675161682, "step": 8445 }, { "loss": 2.2185, "grad_norm": 1.6552678346633911, "learning_rate": 5e-05, "epoch": 0.5410770314400973, "step": 8450 }, { "loss": 2.2082, "grad_norm": 1.6898199319839478, "learning_rate": 5e-05, "epoch": 0.5413971953640264, "step": 8455 }, { "loss": 2.2229, "grad_norm": 1.6607279777526855, "learning_rate": 5e-05, "epoch": 0.5417173592879554, "step": 8460 }, { "loss": 2.2008, "grad_norm": 1.650549054145813, "learning_rate": 5e-05, "epoch": 0.5420375232118845, "step": 8465 }, { "loss": 2.1981, "grad_norm": 1.6187115907669067, "learning_rate": 5e-05, "epoch": 0.5423576871358136, "step": 8470 }, { "loss": 2.1946, "grad_norm": 1.6210685968399048, "learning_rate": 5e-05, "epoch": 0.5426778510597425, "step": 8475 }, { "loss": 2.1881, "grad_norm": 1.6523209810256958, "learning_rate": 5e-05, "epoch": 0.5429980149836716, "step": 8480 }, { "loss": 2.1864, "grad_norm": 1.6581562757492065, "learning_rate": 5e-05, "epoch": 0.5433181789076007, "step": 8485 }, { "loss": 2.223, "grad_norm": 1.628360629081726, "learning_rate": 5e-05, "epoch": 0.5436383428315298, "step": 8490 }, { "loss": 2.1894, "grad_norm": 1.696500301361084, "learning_rate": 5e-05, "epoch": 0.5439585067554588, "step": 8495 }, { "loss": 2.2349, "grad_norm": 1.7601076364517212, "learning_rate": 5e-05, "epoch": 0.5442786706793878, "step": 8500 }, { "loss": 2.2428, "grad_norm": 1.718100666999817, "learning_rate": 5e-05, "epoch": 0.5445988346033169, "step": 8505 }, { "loss": 2.2134, "grad_norm": 1.6576154232025146, "learning_rate": 5e-05, "epoch": 0.5449189985272459, "step": 8510 }, { "loss": 2.1908, "grad_norm": 1.724191665649414, "learning_rate": 5e-05, "epoch": 0.545239162451175, "step": 8515 }, { "loss": 2.2296, "grad_norm": 1.724888801574707, "learning_rate": 5e-05, "epoch": 0.5455593263751041, "step": 8520 }, { "loss": 2.211, "grad_norm": 1.753718614578247, "learning_rate": 5e-05, "epoch": 0.5458794902990332, "step": 8525 }, { "loss": 2.1835, "grad_norm": 1.7066841125488281, "learning_rate": 5e-05, "epoch": 0.5461996542229621, "step": 8530 }, { "loss": 2.2209, "grad_norm": 1.7294259071350098, "learning_rate": 5e-05, "epoch": 0.5465198181468912, "step": 8535 }, { "loss": 2.1839, "grad_norm": 1.827268123626709, "learning_rate": 5e-05, "epoch": 0.5468399820708203, "step": 8540 }, { "loss": 2.2227, "grad_norm": 1.6551259756088257, "learning_rate": 5e-05, "epoch": 0.5471601459947493, "step": 8545 }, { "loss": 2.2131, "grad_norm": 1.7133010625839233, "learning_rate": 5e-05, "epoch": 0.5474803099186784, "step": 8550 }, { "loss": 2.2248, "grad_norm": 1.6199853420257568, "learning_rate": 5e-05, "epoch": 0.5478004738426074, "step": 8555 }, { "loss": 2.1953, "grad_norm": 1.6595536470413208, "learning_rate": 5e-05, "epoch": 0.5481206377665364, "step": 8560 }, { "loss": 2.2208, "grad_norm": 1.6121689081192017, "learning_rate": 5e-05, "epoch": 0.5484408016904655, "step": 8565 }, { "loss": 2.2003, "grad_norm": 1.670493721961975, "learning_rate": 5e-05, "epoch": 0.5487609656143946, "step": 8570 }, { "loss": 2.2055, "grad_norm": 1.6377112865447998, "learning_rate": 5e-05, "epoch": 0.5490811295383237, "step": 8575 }, { "loss": 2.1795, "grad_norm": 1.755138635635376, "learning_rate": 5e-05, "epoch": 0.5494012934622526, "step": 8580 }, { "loss": 2.2184, "grad_norm": 1.746940016746521, "learning_rate": 5e-05, "epoch": 0.5497214573861817, "step": 8585 }, { "loss": 2.1888, "grad_norm": 1.7026208639144897, "learning_rate": 5e-05, "epoch": 0.5500416213101108, "step": 8590 }, { "loss": 2.1954, "grad_norm": 1.622263789176941, "learning_rate": 5e-05, "epoch": 0.5503617852340398, "step": 8595 }, { "loss": 2.1978, "grad_norm": 1.7794569730758667, "learning_rate": 5e-05, "epoch": 0.5506819491579689, "step": 8600 }, { "eval_loss": 2.068195343017578, "eval_runtime": 8.9579, "eval_samples_per_second": 228.625, "eval_steps_per_second": 28.578, "epoch": 0.5506819491579689, "step": 8600 }, { "loss": 2.1963, "grad_norm": 1.67849862575531, "learning_rate": 5e-05, "epoch": 0.551002113081898, "step": 8605 }, { "loss": 2.2099, "grad_norm": 1.6759015321731567, "learning_rate": 5e-05, "epoch": 0.551322277005827, "step": 8610 }, { "loss": 2.2053, "grad_norm": 1.6613909006118774, "learning_rate": 5e-05, "epoch": 0.551642440929756, "step": 8615 }, { "loss": 2.2083, "grad_norm": 1.719401478767395, "learning_rate": 5e-05, "epoch": 0.5519626048536851, "step": 8620 }, { "loss": 2.1983, "grad_norm": 1.6846554279327393, "learning_rate": 5e-05, "epoch": 0.5522827687776142, "step": 8625 }, { "loss": 2.1777, "grad_norm": 1.585060715675354, "learning_rate": 5e-05, "epoch": 0.5526029327015431, "step": 8630 }, { "loss": 2.2142, "grad_norm": 1.5889817476272583, "learning_rate": 5e-05, "epoch": 0.5529230966254722, "step": 8635 }, { "loss": 2.2204, "grad_norm": 1.8110606670379639, "learning_rate": 5e-05, "epoch": 0.5532432605494013, "step": 8640 }, { "loss": 2.212, "grad_norm": 1.6412723064422607, "learning_rate": 5e-05, "epoch": 0.5535634244733303, "step": 8645 }, { "loss": 2.2182, "grad_norm": 1.7718569040298462, "learning_rate": 5e-05, "epoch": 0.5538835883972594, "step": 8650 }, { "loss": 2.193, "grad_norm": 1.6897401809692383, "learning_rate": 5e-05, "epoch": 0.5542037523211885, "step": 8655 }, { "loss": 2.1971, "grad_norm": 1.5787633657455444, "learning_rate": 5e-05, "epoch": 0.5545239162451175, "step": 8660 }, { "loss": 2.1995, "grad_norm": 1.791604995727539, "learning_rate": 5e-05, "epoch": 0.5548440801690465, "step": 8665 }, { "loss": 2.242, "grad_norm": 1.6345185041427612, "learning_rate": 5e-05, "epoch": 0.5551642440929756, "step": 8670 }, { "loss": 2.2313, "grad_norm": 1.6359039545059204, "learning_rate": 5e-05, "epoch": 0.5554844080169047, "step": 8675 }, { "loss": 2.2167, "grad_norm": 1.5969913005828857, "learning_rate": 5e-05, "epoch": 0.5558045719408337, "step": 8680 }, { "loss": 2.1846, "grad_norm": 1.6926162242889404, "learning_rate": 5e-05, "epoch": 0.5561247358647627, "step": 8685 }, { "loss": 2.2072, "grad_norm": 1.5990930795669556, "learning_rate": 5e-05, "epoch": 0.5564448997886918, "step": 8690 }, { "loss": 2.2057, "grad_norm": 1.6216379404067993, "learning_rate": 5e-05, "epoch": 0.5567650637126209, "step": 8695 }, { "loss": 2.1902, "grad_norm": 1.7529386281967163, "learning_rate": 5e-05, "epoch": 0.5570852276365499, "step": 8700 }, { "loss": 2.2107, "grad_norm": 1.7439981698989868, "learning_rate": 5e-05, "epoch": 0.557405391560479, "step": 8705 }, { "loss": 2.2051, "grad_norm": 1.6986141204833984, "learning_rate": 5e-05, "epoch": 0.557725555484408, "step": 8710 }, { "loss": 2.2122, "grad_norm": 1.678336501121521, "learning_rate": 5e-05, "epoch": 0.558045719408337, "step": 8715 }, { "loss": 2.2034, "grad_norm": 1.6236997842788696, "learning_rate": 5e-05, "epoch": 0.5583658833322661, "step": 8720 }, { "loss": 2.2108, "grad_norm": 1.568988561630249, "learning_rate": 5e-05, "epoch": 0.5586860472561952, "step": 8725 }, { "loss": 2.1891, "grad_norm": 1.6444505453109741, "learning_rate": 5e-05, "epoch": 0.5590062111801242, "step": 8730 }, { "loss": 2.2248, "grad_norm": 1.6724077463150024, "learning_rate": 5e-05, "epoch": 0.5593263751040533, "step": 8735 }, { "loss": 2.214, "grad_norm": 1.6417819261550903, "learning_rate": 5e-05, "epoch": 0.5596465390279823, "step": 8740 }, { "loss": 2.1809, "grad_norm": 1.6727244853973389, "learning_rate": 5e-05, "epoch": 0.5599667029519114, "step": 8745 }, { "loss": 2.2277, "grad_norm": 1.6233677864074707, "learning_rate": 5e-05, "epoch": 0.5602868668758404, "step": 8750 }, { "loss": 2.2074, "grad_norm": 1.6876188516616821, "learning_rate": 5e-05, "epoch": 0.5606070307997695, "step": 8755 }, { "loss": 2.2446, "grad_norm": 1.6457571983337402, "learning_rate": 5e-05, "epoch": 0.5609271947236986, "step": 8760 }, { "loss": 2.2219, "grad_norm": 1.6713467836380005, "learning_rate": 5e-05, "epoch": 0.5612473586476275, "step": 8765 }, { "loss": 2.1961, "grad_norm": 1.6506388187408447, "learning_rate": 5e-05, "epoch": 0.5615675225715566, "step": 8770 }, { "loss": 2.2156, "grad_norm": 1.7466049194335938, "learning_rate": 5e-05, "epoch": 0.5618876864954857, "step": 8775 }, { "loss": 2.2071, "grad_norm": 1.702660322189331, "learning_rate": 5e-05, "epoch": 0.5622078504194148, "step": 8780 }, { "loss": 2.1891, "grad_norm": 1.733842134475708, "learning_rate": 5e-05, "epoch": 0.5625280143433438, "step": 8785 }, { "loss": 2.1982, "grad_norm": 1.6536738872528076, "learning_rate": 5e-05, "epoch": 0.5628481782672728, "step": 8790 }, { "loss": 2.2137, "grad_norm": 1.6220111846923828, "learning_rate": 5e-05, "epoch": 0.5631683421912019, "step": 8795 }, { "loss": 2.2096, "grad_norm": 1.7264735698699951, "learning_rate": 5e-05, "epoch": 0.5634885061151309, "step": 8800 }, { "eval_loss": 2.054811477661133, "eval_runtime": 9.6124, "eval_samples_per_second": 213.059, "eval_steps_per_second": 26.632, "epoch": 0.5634885061151309, "step": 8800 }, { "loss": 2.207, "grad_norm": 1.7159490585327148, "learning_rate": 5e-05, "epoch": 0.56380867003906, "step": 8805 }, { "loss": 2.2107, "grad_norm": 1.6418389081954956, "learning_rate": 5e-05, "epoch": 0.5641288339629891, "step": 8810 }, { "loss": 2.2117, "grad_norm": 1.6357439756393433, "learning_rate": 5e-05, "epoch": 0.564448997886918, "step": 8815 }, { "loss": 2.2005, "grad_norm": 1.6727110147476196, "learning_rate": 5e-05, "epoch": 0.5647691618108471, "step": 8820 }, { "loss": 2.209, "grad_norm": 1.7824372053146362, "learning_rate": 5e-05, "epoch": 0.5650893257347762, "step": 8825 }, { "loss": 2.2069, "grad_norm": 1.6073055267333984, "learning_rate": 5e-05, "epoch": 0.5654094896587053, "step": 8830 }, { "loss": 2.2006, "grad_norm": 1.6197354793548584, "learning_rate": 5e-05, "epoch": 0.5657296535826343, "step": 8835 }, { "loss": 2.187, "grad_norm": 1.6128679513931274, "learning_rate": 5e-05, "epoch": 0.5660498175065634, "step": 8840 }, { "loss": 2.186, "grad_norm": 1.6642301082611084, "learning_rate": 5e-05, "epoch": 0.5663699814304924, "step": 8845 }, { "loss": 2.2474, "grad_norm": 1.6685996055603027, "learning_rate": 5e-05, "epoch": 0.5666901453544214, "step": 8850 }, { "loss": 2.2148, "grad_norm": 1.7215569019317627, "learning_rate": 5e-05, "epoch": 0.5670103092783505, "step": 8855 }, { "loss": 2.2341, "grad_norm": 1.8265643119812012, "learning_rate": 5e-05, "epoch": 0.5673304732022796, "step": 8860 }, { "loss": 2.1853, "grad_norm": 1.7127398252487183, "learning_rate": 5e-05, "epoch": 0.5676506371262087, "step": 8865 }, { "loss": 2.1994, "grad_norm": 1.6666933298110962, "learning_rate": 5e-05, "epoch": 0.5679708010501376, "step": 8870 }, { "loss": 2.1971, "grad_norm": 1.6345758438110352, "learning_rate": 5e-05, "epoch": 0.5682909649740667, "step": 8875 }, { "loss": 2.1836, "grad_norm": 1.6167995929718018, "learning_rate": 5e-05, "epoch": 0.5686111288979958, "step": 8880 }, { "loss": 2.1954, "grad_norm": 1.57766854763031, "learning_rate": 5e-05, "epoch": 0.5689312928219248, "step": 8885 }, { "loss": 2.2106, "grad_norm": 1.6215426921844482, "learning_rate": 5e-05, "epoch": 0.5692514567458539, "step": 8890 }, { "loss": 2.1891, "grad_norm": 1.6707900762557983, "learning_rate": 5e-05, "epoch": 0.569571620669783, "step": 8895 }, { "loss": 2.1936, "grad_norm": 1.6593118906021118, "learning_rate": 5e-05, "epoch": 0.5698917845937119, "step": 8900 }, { "loss": 2.2129, "grad_norm": 1.666658639907837, "learning_rate": 5e-05, "epoch": 0.570211948517641, "step": 8905 }, { "loss": 2.193, "grad_norm": 1.680935025215149, "learning_rate": 5e-05, "epoch": 0.5705321124415701, "step": 8910 }, { "loss": 2.2007, "grad_norm": 1.6215225458145142, "learning_rate": 5e-05, "epoch": 0.5708522763654992, "step": 8915 }, { "loss": 2.2161, "grad_norm": 1.716860294342041, "learning_rate": 5e-05, "epoch": 0.5711724402894282, "step": 8920 }, { "loss": 2.211, "grad_norm": 1.6454654932022095, "learning_rate": 5e-05, "epoch": 0.5714926042133572, "step": 8925 }, { "loss": 2.1803, "grad_norm": 1.6381767988204956, "learning_rate": 5e-05, "epoch": 0.5718127681372863, "step": 8930 }, { "loss": 2.2293, "grad_norm": 1.6502015590667725, "learning_rate": 5e-05, "epoch": 0.5721329320612153, "step": 8935 }, { "loss": 2.2001, "grad_norm": 1.691019892692566, "learning_rate": 5e-05, "epoch": 0.5724530959851444, "step": 8940 }, { "loss": 2.1875, "grad_norm": 1.6835181713104248, "learning_rate": 5e-05, "epoch": 0.5727732599090735, "step": 8945 }, { "loss": 2.2026, "grad_norm": 1.6180915832519531, "learning_rate": 5e-05, "epoch": 0.5730934238330025, "step": 8950 }, { "loss": 2.2328, "grad_norm": 1.57301926612854, "learning_rate": 5e-05, "epoch": 0.5734135877569315, "step": 8955 }, { "loss": 2.2343, "grad_norm": 1.712069034576416, "learning_rate": 5e-05, "epoch": 0.5737337516808606, "step": 8960 }, { "loss": 2.189, "grad_norm": 1.712024211883545, "learning_rate": 5e-05, "epoch": 0.5740539156047897, "step": 8965 }, { "loss": 2.1816, "grad_norm": 1.7028721570968628, "learning_rate": 5e-05, "epoch": 0.5743740795287187, "step": 8970 }, { "loss": 2.1809, "grad_norm": 1.7358742952346802, "learning_rate": 5e-05, "epoch": 0.5746942434526477, "step": 8975 }, { "loss": 2.2039, "grad_norm": 1.7282836437225342, "learning_rate": 5e-05, "epoch": 0.5750144073765768, "step": 8980 }, { "loss": 2.2086, "grad_norm": 1.6507220268249512, "learning_rate": 5e-05, "epoch": 0.5753345713005058, "step": 8985 }, { "loss": 2.1958, "grad_norm": 1.7284539937973022, "learning_rate": 5e-05, "epoch": 0.5756547352244349, "step": 8990 }, { "loss": 2.1792, "grad_norm": 1.5841361284255981, "learning_rate": 5e-05, "epoch": 0.575974899148364, "step": 8995 }, { "loss": 2.1905, "grad_norm": 1.7276769876480103, "learning_rate": 5e-05, "epoch": 0.5762950630722931, "step": 9000 }, { "eval_loss": 2.0714640617370605, "eval_runtime": 9.0714, "eval_samples_per_second": 225.763, "eval_steps_per_second": 28.22, "epoch": 0.5762950630722931, "step": 9000 }, { "loss": 2.217, "grad_norm": 1.657918095588684, "learning_rate": 5e-05, "epoch": 0.576615226996222, "step": 9005 }, { "loss": 2.2101, "grad_norm": 1.6392052173614502, "learning_rate": 5e-05, "epoch": 0.5769353909201511, "step": 9010 }, { "loss": 2.1809, "grad_norm": 1.600339412689209, "learning_rate": 5e-05, "epoch": 0.5772555548440802, "step": 9015 }, { "loss": 2.1968, "grad_norm": 1.6771150827407837, "learning_rate": 5e-05, "epoch": 0.5775757187680092, "step": 9020 }, { "loss": 2.2139, "grad_norm": 1.615399718284607, "learning_rate": 5e-05, "epoch": 0.5778958826919383, "step": 9025 }, { "loss": 2.1459, "grad_norm": 1.6415457725524902, "learning_rate": 5e-05, "epoch": 0.5782160466158673, "step": 9030 }, { "loss": 2.2141, "grad_norm": 1.626848816871643, "learning_rate": 5e-05, "epoch": 0.5785362105397964, "step": 9035 }, { "loss": 2.1816, "grad_norm": 1.6501692533493042, "learning_rate": 5e-05, "epoch": 0.5788563744637254, "step": 9040 }, { "loss": 2.2044, "grad_norm": 1.6761474609375, "learning_rate": 5e-05, "epoch": 0.5791765383876545, "step": 9045 }, { "loss": 2.2251, "grad_norm": 1.6290276050567627, "learning_rate": 5e-05, "epoch": 0.5794967023115836, "step": 9050 }, { "loss": 2.1981, "grad_norm": 1.680826187133789, "learning_rate": 5e-05, "epoch": 0.5798168662355125, "step": 9055 }, { "loss": 2.2105, "grad_norm": 1.649733066558838, "learning_rate": 5e-05, "epoch": 0.5801370301594416, "step": 9060 }, { "loss": 2.1951, "grad_norm": 1.550475001335144, "learning_rate": 5e-05, "epoch": 0.5804571940833707, "step": 9065 }, { "loss": 2.1964, "grad_norm": 1.6704422235488892, "learning_rate": 5e-05, "epoch": 0.5807773580072997, "step": 9070 }, { "loss": 2.1977, "grad_norm": 1.5833699703216553, "learning_rate": 5e-05, "epoch": 0.5810975219312288, "step": 9075 }, { "loss": 2.2158, "grad_norm": 1.6299668550491333, "learning_rate": 5e-05, "epoch": 0.5814176858551579, "step": 9080 }, { "loss": 2.1806, "grad_norm": 1.6842360496520996, "learning_rate": 5e-05, "epoch": 0.5817378497790869, "step": 9085 }, { "loss": 2.2131, "grad_norm": 1.6969107389450073, "learning_rate": 5e-05, "epoch": 0.5820580137030159, "step": 9090 }, { "loss": 2.2133, "grad_norm": 1.7344791889190674, "learning_rate": 5e-05, "epoch": 0.582378177626945, "step": 9095 }, { "loss": 2.2099, "grad_norm": 1.6542928218841553, "learning_rate": 5e-05, "epoch": 0.5826983415508741, "step": 9100 }, { "loss": 2.1778, "grad_norm": 1.6566765308380127, "learning_rate": 5e-05, "epoch": 0.583018505474803, "step": 9105 }, { "loss": 2.1857, "grad_norm": 1.6014552116394043, "learning_rate": 5e-05, "epoch": 0.5833386693987321, "step": 9110 }, { "loss": 2.2152, "grad_norm": 1.6418790817260742, "learning_rate": 5e-05, "epoch": 0.5836588333226612, "step": 9115 }, { "loss": 2.1682, "grad_norm": 1.601062536239624, "learning_rate": 5e-05, "epoch": 0.5839789972465903, "step": 9120 }, { "loss": 2.179, "grad_norm": 1.721508502960205, "learning_rate": 5e-05, "epoch": 0.5842991611705193, "step": 9125 }, { "loss": 2.1789, "grad_norm": 1.6199660301208496, "learning_rate": 5e-05, "epoch": 0.5846193250944484, "step": 9130 }, { "loss": 2.2258, "grad_norm": 1.7283803224563599, "learning_rate": 5e-05, "epoch": 0.5849394890183774, "step": 9135 }, { "loss": 2.1966, "grad_norm": 1.761349081993103, "learning_rate": 5e-05, "epoch": 0.5852596529423064, "step": 9140 }, { "loss": 2.1639, "grad_norm": 1.6975593566894531, "learning_rate": 5e-05, "epoch": 0.5855798168662355, "step": 9145 }, { "loss": 2.1935, "grad_norm": 1.672235369682312, "learning_rate": 5e-05, "epoch": 0.5858999807901646, "step": 9150 }, { "loss": 2.2, "grad_norm": 1.6695904731750488, "learning_rate": 5e-05, "epoch": 0.5862201447140936, "step": 9155 }, { "loss": 2.2062, "grad_norm": 1.6303057670593262, "learning_rate": 5e-05, "epoch": 0.5865403086380226, "step": 9160 }, { "loss": 2.2062, "grad_norm": 1.704704761505127, "learning_rate": 5e-05, "epoch": 0.5868604725619517, "step": 9165 }, { "loss": 2.183, "grad_norm": 1.5838806629180908, "learning_rate": 5e-05, "epoch": 0.5871806364858808, "step": 9170 }, { "loss": 2.2028, "grad_norm": 1.6641960144042969, "learning_rate": 5e-05, "epoch": 0.5875008004098098, "step": 9175 }, { "loss": 2.2251, "grad_norm": 1.6648904085159302, "learning_rate": 5e-05, "epoch": 0.5878209643337389, "step": 9180 }, { "loss": 2.2126, "grad_norm": 1.7670952081680298, "learning_rate": 5e-05, "epoch": 0.588141128257668, "step": 9185 }, { "loss": 2.2139, "grad_norm": 1.7805712223052979, "learning_rate": 5e-05, "epoch": 0.5884612921815969, "step": 9190 }, { "loss": 2.2055, "grad_norm": 1.7651017904281616, "learning_rate": 5e-05, "epoch": 0.588781456105526, "step": 9195 }, { "loss": 2.1971, "grad_norm": 1.8590534925460815, "learning_rate": 5e-05, "epoch": 0.5891016200294551, "step": 9200 }, { "eval_loss": 2.0600976943969727, "eval_runtime": 9.0846, "eval_samples_per_second": 225.436, "eval_steps_per_second": 28.18, "epoch": 0.5891016200294551, "step": 9200 }, { "loss": 2.1996, "grad_norm": 1.6831485033035278, "learning_rate": 5e-05, "epoch": 0.5894217839533842, "step": 9205 }, { "loss": 2.1846, "grad_norm": 1.6748493909835815, "learning_rate": 5e-05, "epoch": 0.5897419478773132, "step": 9210 }, { "loss": 2.1819, "grad_norm": 1.6625423431396484, "learning_rate": 5e-05, "epoch": 0.5900621118012422, "step": 9215 }, { "loss": 2.2139, "grad_norm": 1.6799538135528564, "learning_rate": 5e-05, "epoch": 0.5903822757251713, "step": 9220 }, { "loss": 2.2051, "grad_norm": 1.723824143409729, "learning_rate": 5e-05, "epoch": 0.5907024396491003, "step": 9225 }, { "loss": 2.1924, "grad_norm": 1.634695291519165, "learning_rate": 5e-05, "epoch": 0.5910226035730294, "step": 9230 }, { "loss": 2.1907, "grad_norm": 1.6800014972686768, "learning_rate": 5e-05, "epoch": 0.5913427674969585, "step": 9235 }, { "loss": 2.1846, "grad_norm": 1.5842480659484863, "learning_rate": 5e-05, "epoch": 0.5916629314208874, "step": 9240 }, { "loss": 2.1945, "grad_norm": 1.6501984596252441, "learning_rate": 5e-05, "epoch": 0.5919830953448165, "step": 9245 }, { "loss": 2.2027, "grad_norm": 1.6772314310073853, "learning_rate": 5e-05, "epoch": 0.5923032592687456, "step": 9250 }, { "loss": 2.1834, "grad_norm": 1.6418979167938232, "learning_rate": 5e-05, "epoch": 0.5926234231926747, "step": 9255 }, { "loss": 2.1975, "grad_norm": 1.6140997409820557, "learning_rate": 5e-05, "epoch": 0.5929435871166037, "step": 9260 }, { "loss": 2.2261, "grad_norm": 1.6440484523773193, "learning_rate": 5e-05, "epoch": 0.5932637510405327, "step": 9265 }, { "loss": 2.1802, "grad_norm": 1.642822027206421, "learning_rate": 5e-05, "epoch": 0.5935839149644618, "step": 9270 }, { "loss": 2.1736, "grad_norm": 1.622124433517456, "learning_rate": 5e-05, "epoch": 0.5939040788883908, "step": 9275 }, { "loss": 2.1894, "grad_norm": 1.6651124954223633, "learning_rate": 5e-05, "epoch": 0.5942242428123199, "step": 9280 }, { "loss": 2.1898, "grad_norm": 1.6245893239974976, "learning_rate": 5e-05, "epoch": 0.594544406736249, "step": 9285 }, { "loss": 2.1863, "grad_norm": 1.5905382633209229, "learning_rate": 5e-05, "epoch": 0.5948645706601781, "step": 9290 }, { "loss": 2.1795, "grad_norm": 1.688947319984436, "learning_rate": 5e-05, "epoch": 0.595184734584107, "step": 9295 }, { "loss": 2.2024, "grad_norm": 1.6358672380447388, "learning_rate": 5e-05, "epoch": 0.5955048985080361, "step": 9300 }, { "loss": 2.1899, "grad_norm": 1.6548181772232056, "learning_rate": 5e-05, "epoch": 0.5958250624319652, "step": 9305 }, { "loss": 2.1889, "grad_norm": 1.612070918083191, "learning_rate": 5e-05, "epoch": 0.5961452263558942, "step": 9310 }, { "loss": 2.2026, "grad_norm": 1.6708488464355469, "learning_rate": 5e-05, "epoch": 0.5964653902798233, "step": 9315 }, { "loss": 2.1895, "grad_norm": 1.5530942678451538, "learning_rate": 5e-05, "epoch": 0.5967855542037523, "step": 9320 }, { "loss": 2.2076, "grad_norm": 1.713191270828247, "learning_rate": 5e-05, "epoch": 0.5971057181276813, "step": 9325 }, { "loss": 2.2008, "grad_norm": 1.6353609561920166, "learning_rate": 5e-05, "epoch": 0.5974258820516104, "step": 9330 }, { "loss": 2.1978, "grad_norm": 1.6699703931808472, "learning_rate": 5e-05, "epoch": 0.5977460459755395, "step": 9335 }, { "loss": 2.2398, "grad_norm": 1.6570907831192017, "learning_rate": 5e-05, "epoch": 0.5980662098994686, "step": 9340 }, { "loss": 2.1825, "grad_norm": 1.5736383199691772, "learning_rate": 5e-05, "epoch": 0.5983863738233975, "step": 9345 }, { "loss": 2.1911, "grad_norm": 1.6274065971374512, "learning_rate": 5e-05, "epoch": 0.5987065377473266, "step": 9350 }, { "loss": 2.2171, "grad_norm": 1.6303297281265259, "learning_rate": 5e-05, "epoch": 0.5990267016712557, "step": 9355 }, { "loss": 2.2006, "grad_norm": 1.5498477220535278, "learning_rate": 5e-05, "epoch": 0.5993468655951847, "step": 9360 }, { "loss": 2.2076, "grad_norm": 1.6340336799621582, "learning_rate": 5e-05, "epoch": 0.5996670295191138, "step": 9365 }, { "loss": 2.197, "grad_norm": 1.693532943725586, "learning_rate": 5e-05, "epoch": 0.5999871934430429, "step": 9370 }, { "loss": 2.1982, "grad_norm": 1.7246029376983643, "learning_rate": 5e-05, "epoch": 0.6003073573669719, "step": 9375 }, { "loss": 2.1855, "grad_norm": 1.6022217273712158, "learning_rate": 5e-05, "epoch": 0.6006275212909009, "step": 9380 }, { "loss": 2.1501, "grad_norm": 1.6127444505691528, "learning_rate": 5e-05, "epoch": 0.60094768521483, "step": 9385 }, { "loss": 2.1776, "grad_norm": 1.6102303266525269, "learning_rate": 5e-05, "epoch": 0.6012678491387591, "step": 9390 }, { "loss": 2.1867, "grad_norm": 1.6318905353546143, "learning_rate": 5e-05, "epoch": 0.601588013062688, "step": 9395 }, { "loss": 2.1573, "grad_norm": 1.6750482320785522, "learning_rate": 5e-05, "epoch": 0.6019081769866171, "step": 9400 }, { "eval_loss": 2.0679101943969727, "eval_runtime": 9.4704, "eval_samples_per_second": 216.253, "eval_steps_per_second": 27.032, "epoch": 0.6019081769866171, "step": 9400 }, { "loss": 2.1869, "grad_norm": 1.6901905536651611, "learning_rate": 5e-05, "epoch": 0.6022283409105462, "step": 9405 }, { "loss": 2.1978, "grad_norm": 1.6613404750823975, "learning_rate": 5e-05, "epoch": 0.6025485048344752, "step": 9410 }, { "loss": 2.2079, "grad_norm": 1.7926342487335205, "learning_rate": 5e-05, "epoch": 0.6028686687584043, "step": 9415 }, { "loss": 2.2059, "grad_norm": 1.624189019203186, "learning_rate": 5e-05, "epoch": 0.6031888326823334, "step": 9420 }, { "loss": 2.1925, "grad_norm": 1.628775715827942, "learning_rate": 5e-05, "epoch": 0.6035089966062624, "step": 9425 }, { "loss": 2.199, "grad_norm": 1.6382794380187988, "learning_rate": 5e-05, "epoch": 0.6038291605301914, "step": 9430 }, { "loss": 2.174, "grad_norm": 1.6540191173553467, "learning_rate": 5e-05, "epoch": 0.6041493244541205, "step": 9435 }, { "loss": 2.2111, "grad_norm": 1.689916729927063, "learning_rate": 5e-05, "epoch": 0.6044694883780496, "step": 9440 }, { "loss": 2.163, "grad_norm": 1.7421029806137085, "learning_rate": 5e-05, "epoch": 0.6047896523019786, "step": 9445 }, { "loss": 2.1926, "grad_norm": 1.6341474056243896, "learning_rate": 5e-05, "epoch": 0.6051098162259076, "step": 9450 }, { "loss": 2.1649, "grad_norm": 1.5881348848342896, "learning_rate": 5e-05, "epoch": 0.6054299801498367, "step": 9455 }, { "loss": 2.199, "grad_norm": 1.727407693862915, "learning_rate": 5e-05, "epoch": 0.6057501440737658, "step": 9460 }, { "loss": 2.1982, "grad_norm": 1.7314342260360718, "learning_rate": 5e-05, "epoch": 0.6060703079976948, "step": 9465 }, { "loss": 2.1971, "grad_norm": 1.656754732131958, "learning_rate": 5e-05, "epoch": 0.6063904719216239, "step": 9470 }, { "loss": 2.1756, "grad_norm": 1.6005347967147827, "learning_rate": 5e-05, "epoch": 0.606710635845553, "step": 9475 }, { "loss": 2.1881, "grad_norm": 1.5883443355560303, "learning_rate": 5e-05, "epoch": 0.6070307997694819, "step": 9480 }, { "loss": 2.1901, "grad_norm": 1.8181809186935425, "learning_rate": 5e-05, "epoch": 0.607350963693411, "step": 9485 }, { "loss": 2.1893, "grad_norm": 1.8270716667175293, "learning_rate": 5e-05, "epoch": 0.6076711276173401, "step": 9490 }, { "loss": 2.2014, "grad_norm": 1.7661374807357788, "learning_rate": 5e-05, "epoch": 0.6079912915412691, "step": 9495 }, { "loss": 2.2077, "grad_norm": 1.6404733657836914, "learning_rate": 5e-05, "epoch": 0.6083114554651982, "step": 9500 }, { "loss": 2.1991, "grad_norm": 1.6874150037765503, "learning_rate": 5e-05, "epoch": 0.6086316193891272, "step": 9505 }, { "loss": 2.212, "grad_norm": 1.7771927118301392, "learning_rate": 5e-05, "epoch": 0.6089517833130563, "step": 9510 }, { "loss": 2.1921, "grad_norm": 1.7601332664489746, "learning_rate": 5e-05, "epoch": 0.6092719472369853, "step": 9515 }, { "loss": 2.1887, "grad_norm": 1.7171956300735474, "learning_rate": 5e-05, "epoch": 0.6095921111609144, "step": 9520 }, { "loss": 2.2107, "grad_norm": 1.5932697057724, "learning_rate": 5e-05, "epoch": 0.6099122750848435, "step": 9525 }, { "loss": 2.1952, "grad_norm": 1.6156558990478516, "learning_rate": 5e-05, "epoch": 0.6102324390087724, "step": 9530 }, { "loss": 2.2223, "grad_norm": 1.643193244934082, "learning_rate": 5e-05, "epoch": 0.6105526029327015, "step": 9535 }, { "loss": 2.1795, "grad_norm": 1.7439320087432861, "learning_rate": 5e-05, "epoch": 0.6108727668566306, "step": 9540 }, { "loss": 2.2021, "grad_norm": 1.6661970615386963, "learning_rate": 5e-05, "epoch": 0.6111929307805597, "step": 9545 }, { "loss": 2.1948, "grad_norm": 1.6624956130981445, "learning_rate": 5e-05, "epoch": 0.6115130947044887, "step": 9550 }, { "loss": 2.191, "grad_norm": 1.6625850200653076, "learning_rate": 5e-05, "epoch": 0.6118332586284178, "step": 9555 }, { "loss": 2.1988, "grad_norm": 1.6776957511901855, "learning_rate": 5e-05, "epoch": 0.6121534225523468, "step": 9560 }, { "loss": 2.2195, "grad_norm": 1.6892213821411133, "learning_rate": 5e-05, "epoch": 0.6124735864762758, "step": 9565 }, { "loss": 2.2082, "grad_norm": 1.7207622528076172, "learning_rate": 5e-05, "epoch": 0.6127937504002049, "step": 9570 }, { "loss": 2.1844, "grad_norm": 1.6284791231155396, "learning_rate": 5e-05, "epoch": 0.613113914324134, "step": 9575 }, { "loss": 2.199, "grad_norm": 1.6319007873535156, "learning_rate": 5e-05, "epoch": 0.613434078248063, "step": 9580 }, { "loss": 2.1955, "grad_norm": 1.634732723236084, "learning_rate": 5e-05, "epoch": 0.613754242171992, "step": 9585 }, { "loss": 2.1924, "grad_norm": 1.7350897789001465, "learning_rate": 5e-05, "epoch": 0.6140744060959211, "step": 9590 }, { "loss": 2.1722, "grad_norm": 1.6487643718719482, "learning_rate": 5e-05, "epoch": 0.6143945700198502, "step": 9595 }, { "loss": 2.1732, "grad_norm": 1.678441047668457, "learning_rate": 5e-05, "epoch": 0.6147147339437792, "step": 9600 }, { "eval_loss": 2.0491390228271484, "eval_runtime": 10.1854, "eval_samples_per_second": 201.071, "eval_steps_per_second": 25.134, "epoch": 0.6147147339437792, "step": 9600 }, { "loss": 2.2301, "grad_norm": 1.7390731573104858, "learning_rate": 5e-05, "epoch": 0.6150348978677083, "step": 9605 }, { "loss": 2.2033, "grad_norm": 1.70026433467865, "learning_rate": 5e-05, "epoch": 0.6153550617916373, "step": 9610 }, { "loss": 2.2143, "grad_norm": 1.6489602327346802, "learning_rate": 5e-05, "epoch": 0.6156752257155663, "step": 9615 }, { "loss": 2.1629, "grad_norm": 1.707454800605774, "learning_rate": 5e-05, "epoch": 0.6159953896394954, "step": 9620 }, { "loss": 2.1867, "grad_norm": 1.7108656167984009, "learning_rate": 5e-05, "epoch": 0.6163155535634245, "step": 9625 }, { "loss": 2.1792, "grad_norm": 1.6427521705627441, "learning_rate": 5e-05, "epoch": 0.6166357174873536, "step": 9630 }, { "loss": 2.193, "grad_norm": 1.6153441667556763, "learning_rate": 5e-05, "epoch": 0.6169558814112825, "step": 9635 }, { "loss": 2.1779, "grad_norm": 1.5934603214263916, "learning_rate": 5e-05, "epoch": 0.6172760453352116, "step": 9640 }, { "loss": 2.1825, "grad_norm": 1.7031588554382324, "learning_rate": 5e-05, "epoch": 0.6175962092591407, "step": 9645 }, { "loss": 2.1919, "grad_norm": 1.7012118101119995, "learning_rate": 5e-05, "epoch": 0.6179163731830697, "step": 9650 }, { "loss": 2.1638, "grad_norm": 1.695015549659729, "learning_rate": 5e-05, "epoch": 0.6182365371069988, "step": 9655 }, { "loss": 2.191, "grad_norm": 1.6796445846557617, "learning_rate": 5e-05, "epoch": 0.6185567010309279, "step": 9660 }, { "loss": 2.2073, "grad_norm": 1.71908700466156, "learning_rate": 5e-05, "epoch": 0.6188768649548568, "step": 9665 }, { "loss": 2.222, "grad_norm": 1.6226277351379395, "learning_rate": 5e-05, "epoch": 0.6191970288787859, "step": 9670 }, { "loss": 2.1851, "grad_norm": 1.6331515312194824, "learning_rate": 5e-05, "epoch": 0.619517192802715, "step": 9675 }, { "loss": 2.2073, "grad_norm": 1.7072153091430664, "learning_rate": 5e-05, "epoch": 0.6198373567266441, "step": 9680 }, { "loss": 2.1987, "grad_norm": 1.7724852561950684, "learning_rate": 5e-05, "epoch": 0.6201575206505731, "step": 9685 }, { "loss": 2.2316, "grad_norm": 1.5883903503417969, "learning_rate": 5e-05, "epoch": 0.6204776845745021, "step": 9690 }, { "loss": 2.1984, "grad_norm": 1.6855353116989136, "learning_rate": 5e-05, "epoch": 0.6207978484984312, "step": 9695 }, { "loss": 2.1934, "grad_norm": 1.6740260124206543, "learning_rate": 5e-05, "epoch": 0.6211180124223602, "step": 9700 }, { "loss": 2.1905, "grad_norm": 1.5806589126586914, "learning_rate": 5e-05, "epoch": 0.6214381763462893, "step": 9705 }, { "loss": 2.1811, "grad_norm": 1.7320712804794312, "learning_rate": 5e-05, "epoch": 0.6217583402702184, "step": 9710 }, { "loss": 2.1561, "grad_norm": 1.6316450834274292, "learning_rate": 5e-05, "epoch": 0.6220785041941475, "step": 9715 }, { "loss": 2.1887, "grad_norm": 1.693595290184021, "learning_rate": 5e-05, "epoch": 0.6223986681180764, "step": 9720 }, { "loss": 2.1818, "grad_norm": 1.6201201677322388, "learning_rate": 5e-05, "epoch": 0.6227188320420055, "step": 9725 }, { "loss": 2.1803, "grad_norm": 1.6515777111053467, "learning_rate": 5e-05, "epoch": 0.6230389959659346, "step": 9730 }, { "loss": 2.2073, "grad_norm": 1.6333870887756348, "learning_rate": 5e-05, "epoch": 0.6233591598898636, "step": 9735 }, { "loss": 2.2036, "grad_norm": 1.6375046968460083, "learning_rate": 5e-05, "epoch": 0.6236793238137927, "step": 9740 }, { "loss": 2.1786, "grad_norm": 1.6964610815048218, "learning_rate": 5e-05, "epoch": 0.6239994877377217, "step": 9745 }, { "loss": 2.1623, "grad_norm": 1.571964144706726, "learning_rate": 5e-05, "epoch": 0.6243196516616507, "step": 9750 }, { "loss": 2.1862, "grad_norm": 1.5701056718826294, "learning_rate": 5e-05, "epoch": 0.6246398155855798, "step": 9755 }, { "loss": 2.2058, "grad_norm": 1.6287676095962524, "learning_rate": 5e-05, "epoch": 0.6249599795095089, "step": 9760 }, { "loss": 2.1764, "grad_norm": 1.6932698488235474, "learning_rate": 5e-05, "epoch": 0.625280143433438, "step": 9765 }, { "loss": 2.1822, "grad_norm": 1.6898211240768433, "learning_rate": 5e-05, "epoch": 0.6256003073573669, "step": 9770 }, { "loss": 2.196, "grad_norm": 1.8175290822982788, "learning_rate": 5e-05, "epoch": 0.625920471281296, "step": 9775 }, { "loss": 2.1976, "grad_norm": 1.6941791772842407, "learning_rate": 5e-05, "epoch": 0.6262406352052251, "step": 9780 }, { "loss": 2.202, "grad_norm": 1.6704522371292114, "learning_rate": 5e-05, "epoch": 0.6265607991291541, "step": 9785 }, { "loss": 2.1804, "grad_norm": 1.6961215734481812, "learning_rate": 5e-05, "epoch": 0.6268809630530832, "step": 9790 }, { "loss": 2.1622, "grad_norm": 1.6616603136062622, "learning_rate": 5e-05, "epoch": 0.6272011269770122, "step": 9795 }, { "loss": 2.2176, "grad_norm": 1.7092373371124268, "learning_rate": 5e-05, "epoch": 0.6275212909009413, "step": 9800 }, { "eval_loss": 2.066584587097168, "eval_runtime": 9.576, "eval_samples_per_second": 213.869, "eval_steps_per_second": 26.734, "epoch": 0.6275212909009413, "step": 9800 }, { "loss": 2.1975, "grad_norm": 1.7031360864639282, "learning_rate": 5e-05, "epoch": 0.6278414548248703, "step": 9805 }, { "loss": 2.1963, "grad_norm": 1.5917502641677856, "learning_rate": 5e-05, "epoch": 0.6281616187487994, "step": 9810 }, { "loss": 2.1941, "grad_norm": 1.6682829856872559, "learning_rate": 5e-05, "epoch": 0.6284817826727285, "step": 9815 }, { "loss": 2.1917, "grad_norm": 1.5641679763793945, "learning_rate": 5e-05, "epoch": 0.6288019465966574, "step": 9820 }, { "loss": 2.1773, "grad_norm": 1.645300269126892, "learning_rate": 5e-05, "epoch": 0.6291221105205865, "step": 9825 }, { "loss": 2.1631, "grad_norm": 1.5623157024383545, "learning_rate": 5e-05, "epoch": 0.6294422744445156, "step": 9830 }, { "loss": 2.1714, "grad_norm": 1.5632286071777344, "learning_rate": 5e-05, "epoch": 0.6297624383684447, "step": 9835 }, { "loss": 2.2008, "grad_norm": 1.5870970487594604, "learning_rate": 5e-05, "epoch": 0.6300826022923737, "step": 9840 }, { "loss": 2.177, "grad_norm": 1.6274374723434448, "learning_rate": 5e-05, "epoch": 0.6304027662163028, "step": 9845 }, { "loss": 2.1598, "grad_norm": 1.6094468832015991, "learning_rate": 5e-05, "epoch": 0.6307229301402318, "step": 9850 }, { "loss": 2.2126, "grad_norm": 1.6894714832305908, "learning_rate": 5e-05, "epoch": 0.6310430940641608, "step": 9855 }, { "loss": 2.182, "grad_norm": 1.6404249668121338, "learning_rate": 5e-05, "epoch": 0.6313632579880899, "step": 9860 }, { "loss": 2.1884, "grad_norm": 1.5737414360046387, "learning_rate": 5e-05, "epoch": 0.631683421912019, "step": 9865 }, { "loss": 2.1913, "grad_norm": 1.6798261404037476, "learning_rate": 5e-05, "epoch": 0.632003585835948, "step": 9870 }, { "loss": 2.1629, "grad_norm": 1.6156235933303833, "learning_rate": 5e-05, "epoch": 0.632323749759877, "step": 9875 }, { "loss": 2.1785, "grad_norm": 1.7109487056732178, "learning_rate": 5e-05, "epoch": 0.6326439136838061, "step": 9880 }, { "loss": 2.211, "grad_norm": 1.6673591136932373, "learning_rate": 5e-05, "epoch": 0.6329640776077352, "step": 9885 }, { "loss": 2.2038, "grad_norm": 1.6591060161590576, "learning_rate": 5e-05, "epoch": 0.6332842415316642, "step": 9890 }, { "loss": 2.1607, "grad_norm": 1.6050101518630981, "learning_rate": 5e-05, "epoch": 0.6336044054555933, "step": 9895 }, { "loss": 2.1672, "grad_norm": 1.6460251808166504, "learning_rate": 5e-05, "epoch": 0.6339245693795224, "step": 9900 }, { "loss": 2.1672, "grad_norm": 1.7413328886032104, "learning_rate": 5e-05, "epoch": 0.6342447333034513, "step": 9905 }, { "loss": 2.2128, "grad_norm": 1.684525966644287, "learning_rate": 5e-05, "epoch": 0.6345648972273804, "step": 9910 }, { "loss": 2.1987, "grad_norm": 1.6904735565185547, "learning_rate": 5e-05, "epoch": 0.6348850611513095, "step": 9915 }, { "loss": 2.1979, "grad_norm": 1.6782430410385132, "learning_rate": 5e-05, "epoch": 0.6352052250752386, "step": 9920 }, { "loss": 2.1839, "grad_norm": 1.7190313339233398, "learning_rate": 5e-05, "epoch": 0.6355253889991676, "step": 9925 }, { "loss": 2.1885, "grad_norm": 1.745063304901123, "learning_rate": 5e-05, "epoch": 0.6358455529230966, "step": 9930 }, { "loss": 2.174, "grad_norm": 1.6339811086654663, "learning_rate": 5e-05, "epoch": 0.6361657168470257, "step": 9935 }, { "loss": 2.197, "grad_norm": 1.6524280309677124, "learning_rate": 5e-05, "epoch": 0.6364858807709547, "step": 9940 }, { "loss": 2.1844, "grad_norm": 1.7359994649887085, "learning_rate": 5e-05, "epoch": 0.6368060446948838, "step": 9945 }, { "loss": 2.1894, "grad_norm": 1.6910420656204224, "learning_rate": 5e-05, "epoch": 0.6371262086188129, "step": 9950 }, { "loss": 2.1981, "grad_norm": 1.6106345653533936, "learning_rate": 5e-05, "epoch": 0.6374463725427418, "step": 9955 }, { "loss": 2.2047, "grad_norm": 1.6369112730026245, "learning_rate": 5e-05, "epoch": 0.6377665364666709, "step": 9960 }, { "loss": 2.2073, "grad_norm": 1.6089766025543213, "learning_rate": 5e-05, "epoch": 0.6380867003906, "step": 9965 }, { "loss": 2.184, "grad_norm": 1.7142517566680908, "learning_rate": 5e-05, "epoch": 0.6384068643145291, "step": 9970 }, { "loss": 2.1588, "grad_norm": 1.6717356443405151, "learning_rate": 5e-05, "epoch": 0.6387270282384581, "step": 9975 }, { "loss": 2.1723, "grad_norm": 1.7235606908798218, "learning_rate": 5e-05, "epoch": 0.6390471921623871, "step": 9980 }, { "loss": 2.201, "grad_norm": 1.6770853996276855, "learning_rate": 5e-05, "epoch": 0.6393673560863162, "step": 9985 }, { "loss": 2.1764, "grad_norm": 1.6714833974838257, "learning_rate": 5e-05, "epoch": 0.6396875200102452, "step": 9990 }, { "loss": 2.1779, "grad_norm": 1.587377905845642, "learning_rate": 5e-05, "epoch": 0.6400076839341743, "step": 9995 }, { "loss": 2.1861, "grad_norm": 1.6190916299819946, "learning_rate": 5e-05, "epoch": 0.6403278478581034, "step": 10000 }, { "eval_loss": 2.057950258255005, "eval_runtime": 9.0347, "eval_samples_per_second": 226.682, "eval_steps_per_second": 28.335, "epoch": 0.6403278478581034, "step": 10000 }, { "loss": 2.2194, "grad_norm": 1.6693397760391235, "learning_rate": 5e-05, "epoch": 0.6406480117820325, "step": 10005 }, { "loss": 2.1777, "grad_norm": 1.6318798065185547, "learning_rate": 5e-05, "epoch": 0.6409681757059614, "step": 10010 }, { "loss": 2.1711, "grad_norm": 1.5991014242172241, "learning_rate": 5e-05, "epoch": 0.6412883396298905, "step": 10015 }, { "loss": 2.1958, "grad_norm": 1.601650595664978, "learning_rate": 5e-05, "epoch": 0.6416085035538196, "step": 10020 }, { "loss": 2.1848, "grad_norm": 1.6435421705245972, "learning_rate": 5e-05, "epoch": 0.6419286674777486, "step": 10025 }, { "loss": 2.1893, "grad_norm": 1.6513253450393677, "learning_rate": 5e-05, "epoch": 0.6422488314016777, "step": 10030 }, { "loss": 2.192, "grad_norm": 1.7683993577957153, "learning_rate": 5e-05, "epoch": 0.6425689953256067, "step": 10035 }, { "loss": 2.1817, "grad_norm": 1.6842045783996582, "learning_rate": 5e-05, "epoch": 0.6428891592495357, "step": 10040 }, { "loss": 2.1972, "grad_norm": 1.6233887672424316, "learning_rate": 5e-05, "epoch": 0.6432093231734648, "step": 10045 }, { "loss": 2.179, "grad_norm": 1.659524917602539, "learning_rate": 5e-05, "epoch": 0.6435294870973939, "step": 10050 }, { "loss": 2.1826, "grad_norm": 1.620766282081604, "learning_rate": 5e-05, "epoch": 0.643849651021323, "step": 10055 }, { "loss": 2.1796, "grad_norm": 1.7828584909439087, "learning_rate": 5e-05, "epoch": 0.6441698149452519, "step": 10060 }, { "loss": 2.1733, "grad_norm": 1.6462182998657227, "learning_rate": 5e-05, "epoch": 0.644489978869181, "step": 10065 }, { "loss": 2.1725, "grad_norm": 1.6294734477996826, "learning_rate": 5e-05, "epoch": 0.6448101427931101, "step": 10070 }, { "loss": 2.1705, "grad_norm": 1.6464056968688965, "learning_rate": 5e-05, "epoch": 0.6451303067170391, "step": 10075 }, { "loss": 2.1945, "grad_norm": 1.6422755718231201, "learning_rate": 5e-05, "epoch": 0.6454504706409682, "step": 10080 }, { "loss": 2.1755, "grad_norm": 1.6333072185516357, "learning_rate": 5e-05, "epoch": 0.6457706345648973, "step": 10085 }, { "loss": 2.1642, "grad_norm": 1.6198492050170898, "learning_rate": 5e-05, "epoch": 0.6460907984888263, "step": 10090 }, { "loss": 2.1547, "grad_norm": 1.7265499830245972, "learning_rate": 5e-05, "epoch": 0.6464109624127553, "step": 10095 }, { "loss": 2.1862, "grad_norm": 1.6740344762802124, "learning_rate": 5e-05, "epoch": 0.6467311263366844, "step": 10100 }, { "loss": 2.1726, "grad_norm": 1.6273069381713867, "learning_rate": 5e-05, "epoch": 0.6470512902606135, "step": 10105 }, { "loss": 2.1927, "grad_norm": 1.6094673871994019, "learning_rate": 5e-05, "epoch": 0.6473714541845424, "step": 10110 }, { "loss": 2.1617, "grad_norm": 1.621466040611267, "learning_rate": 5e-05, "epoch": 0.6476916181084715, "step": 10115 }, { "loss": 2.1788, "grad_norm": 1.7757426500320435, "learning_rate": 5e-05, "epoch": 0.6480117820324006, "step": 10120 }, { "loss": 2.2063, "grad_norm": 1.6837197542190552, "learning_rate": 5e-05, "epoch": 0.6483319459563296, "step": 10125 }, { "loss": 2.2046, "grad_norm": 1.810106635093689, "learning_rate": 5e-05, "epoch": 0.6486521098802587, "step": 10130 }, { "loss": 2.1667, "grad_norm": 1.6960062980651855, "learning_rate": 5e-05, "epoch": 0.6489722738041878, "step": 10135 }, { "loss": 2.1799, "grad_norm": 1.574356198310852, "learning_rate": 5e-05, "epoch": 0.6492924377281168, "step": 10140 }, { "loss": 2.1849, "grad_norm": 1.5934480428695679, "learning_rate": 5e-05, "epoch": 0.6496126016520458, "step": 10145 }, { "loss": 2.1843, "grad_norm": 1.5735338926315308, "learning_rate": 5e-05, "epoch": 0.6499327655759749, "step": 10150 }, { "loss": 2.1852, "grad_norm": 1.611663818359375, "learning_rate": 5e-05, "epoch": 0.650252929499904, "step": 10155 }, { "loss": 2.2181, "grad_norm": 1.699562430381775, "learning_rate": 5e-05, "epoch": 0.650573093423833, "step": 10160 }, { "loss": 2.1745, "grad_norm": 1.612734079360962, "learning_rate": 5e-05, "epoch": 0.650893257347762, "step": 10165 }, { "loss": 2.1816, "grad_norm": 1.620051622390747, "learning_rate": 5e-05, "epoch": 0.6512134212716911, "step": 10170 }, { "loss": 2.193, "grad_norm": 1.772910475730896, "learning_rate": 5e-05, "epoch": 0.6515335851956202, "step": 10175 }, { "loss": 2.2045, "grad_norm": 1.6278610229492188, "learning_rate": 5e-05, "epoch": 0.6518537491195492, "step": 10180 }, { "loss": 2.1702, "grad_norm": 1.6800163984298706, "learning_rate": 5e-05, "epoch": 0.6521739130434783, "step": 10185 }, { "loss": 2.1728, "grad_norm": 1.5924550294876099, "learning_rate": 5e-05, "epoch": 0.6524940769674074, "step": 10190 }, { "loss": 2.1876, "grad_norm": 1.7043887376785278, "learning_rate": 5e-05, "epoch": 0.6528142408913363, "step": 10195 }, { "loss": 2.197, "grad_norm": 1.7267884016036987, "learning_rate": 5e-05, "epoch": 0.6531344048152654, "step": 10200 }, { "eval_loss": 2.0496010780334473, "eval_runtime": 9.1194, "eval_samples_per_second": 224.576, "eval_steps_per_second": 28.072, "epoch": 0.6531344048152654, "step": 10200 }, { "loss": 2.1663, "grad_norm": 1.721843957901001, "learning_rate": 5e-05, "epoch": 0.6534545687391945, "step": 10205 }, { "loss": 2.194, "grad_norm": 1.666116714477539, "learning_rate": 5e-05, "epoch": 0.6537747326631235, "step": 10210 }, { "loss": 2.1648, "grad_norm": 1.6751972436904907, "learning_rate": 5e-05, "epoch": 0.6540948965870526, "step": 10215 }, { "loss": 2.1695, "grad_norm": 1.6365602016448975, "learning_rate": 5e-05, "epoch": 0.6544150605109816, "step": 10220 }, { "loss": 2.1997, "grad_norm": 1.6664323806762695, "learning_rate": 5e-05, "epoch": 0.6547352244349107, "step": 10225 }, { "loss": 2.1405, "grad_norm": 1.6744410991668701, "learning_rate": 5e-05, "epoch": 0.6550553883588397, "step": 10230 }, { "loss": 2.1786, "grad_norm": 1.5833547115325928, "learning_rate": 5e-05, "epoch": 0.6553755522827688, "step": 10235 }, { "loss": 2.1724, "grad_norm": 1.6934590339660645, "learning_rate": 5e-05, "epoch": 0.6556957162066979, "step": 10240 }, { "loss": 2.2072, "grad_norm": 1.6612778902053833, "learning_rate": 5e-05, "epoch": 0.6560158801306268, "step": 10245 }, { "loss": 2.2002, "grad_norm": 1.6952781677246094, "learning_rate": 5e-05, "epoch": 0.6563360440545559, "step": 10250 }, { "loss": 2.2005, "grad_norm": 1.5911494493484497, "learning_rate": 5e-05, "epoch": 0.656656207978485, "step": 10255 }, { "loss": 2.1954, "grad_norm": 1.6101614236831665, "learning_rate": 5e-05, "epoch": 0.6569763719024141, "step": 10260 }, { "loss": 2.2006, "grad_norm": 1.5616638660430908, "learning_rate": 5e-05, "epoch": 0.6572965358263431, "step": 10265 }, { "loss": 2.1962, "grad_norm": 1.6221803426742554, "learning_rate": 5e-05, "epoch": 0.6576166997502721, "step": 10270 }, { "loss": 2.1947, "grad_norm": 1.5761173963546753, "learning_rate": 5e-05, "epoch": 0.6579368636742012, "step": 10275 }, { "loss": 2.171, "grad_norm": 1.6054048538208008, "learning_rate": 5e-05, "epoch": 0.6582570275981302, "step": 10280 }, { "loss": 2.1872, "grad_norm": 1.6371126174926758, "learning_rate": 5e-05, "epoch": 0.6585771915220593, "step": 10285 }, { "loss": 2.1471, "grad_norm": 1.6947263479232788, "learning_rate": 5e-05, "epoch": 0.6588973554459884, "step": 10290 }, { "loss": 2.1586, "grad_norm": 1.6005196571350098, "learning_rate": 5e-05, "epoch": 0.6592175193699173, "step": 10295 }, { "loss": 2.1868, "grad_norm": 1.6588430404663086, "learning_rate": 5e-05, "epoch": 0.6595376832938464, "step": 10300 }, { "loss": 2.1795, "grad_norm": 1.719780683517456, "learning_rate": 5e-05, "epoch": 0.6598578472177755, "step": 10305 }, { "loss": 2.2096, "grad_norm": 1.7076449394226074, "learning_rate": 5e-05, "epoch": 0.6601780111417046, "step": 10310 }, { "loss": 2.1607, "grad_norm": 1.708767294883728, "learning_rate": 5e-05, "epoch": 0.6604981750656336, "step": 10315 }, { "loss": 2.1654, "grad_norm": 1.5694926977157593, "learning_rate": 5e-05, "epoch": 0.6608183389895627, "step": 10320 }, { "loss": 2.1733, "grad_norm": 1.6309330463409424, "learning_rate": 5e-05, "epoch": 0.6611385029134917, "step": 10325 }, { "loss": 2.1714, "grad_norm": 1.545966386795044, "learning_rate": 5e-05, "epoch": 0.6614586668374207, "step": 10330 }, { "loss": 2.1862, "grad_norm": 1.6850484609603882, "learning_rate": 5e-05, "epoch": 0.6617788307613498, "step": 10335 }, { "loss": 2.2057, "grad_norm": 1.7686723470687866, "learning_rate": 5e-05, "epoch": 0.6620989946852789, "step": 10340 }, { "loss": 2.2103, "grad_norm": 1.6861554384231567, "learning_rate": 5e-05, "epoch": 0.662419158609208, "step": 10345 }, { "loss": 2.1704, "grad_norm": 1.6404895782470703, "learning_rate": 5e-05, "epoch": 0.6627393225331369, "step": 10350 }, { "loss": 2.1692, "grad_norm": 1.6215230226516724, "learning_rate": 5e-05, "epoch": 0.663059486457066, "step": 10355 }, { "loss": 2.1937, "grad_norm": 1.6805126667022705, "learning_rate": 5e-05, "epoch": 0.6633796503809951, "step": 10360 }, { "loss": 2.1821, "grad_norm": 1.6423799991607666, "learning_rate": 5e-05, "epoch": 0.6636998143049241, "step": 10365 }, { "loss": 2.1924, "grad_norm": 1.6313560009002686, "learning_rate": 5e-05, "epoch": 0.6640199782288532, "step": 10370 }, { "loss": 2.1803, "grad_norm": 1.6506428718566895, "learning_rate": 5e-05, "epoch": 0.6643401421527823, "step": 10375 }, { "loss": 2.1903, "grad_norm": 1.6371464729309082, "learning_rate": 5e-05, "epoch": 0.6646603060767112, "step": 10380 }, { "loss": 2.1877, "grad_norm": 1.6620516777038574, "learning_rate": 5e-05, "epoch": 0.6649804700006403, "step": 10385 }, { "loss": 2.1649, "grad_norm": 1.6384953260421753, "learning_rate": 5e-05, "epoch": 0.6653006339245694, "step": 10390 }, { "loss": 2.2066, "grad_norm": 1.5989880561828613, "learning_rate": 5e-05, "epoch": 0.6656207978484985, "step": 10395 }, { "loss": 2.1917, "grad_norm": 1.6223093271255493, "learning_rate": 5e-05, "epoch": 0.6659409617724275, "step": 10400 }, { "eval_loss": 2.0479464530944824, "eval_runtime": 9.5359, "eval_samples_per_second": 214.766, "eval_steps_per_second": 26.846, "epoch": 0.6659409617724275, "step": 10400 }, { "loss": 2.1724, "grad_norm": 1.6815154552459717, "learning_rate": 5e-05, "epoch": 0.6662611256963565, "step": 10405 }, { "loss": 2.1865, "grad_norm": 1.6925560235977173, "learning_rate": 5e-05, "epoch": 0.6665812896202856, "step": 10410 }, { "loss": 2.1976, "grad_norm": 1.6829434633255005, "learning_rate": 5e-05, "epoch": 0.6669014535442146, "step": 10415 }, { "loss": 2.1846, "grad_norm": 1.6039493083953857, "learning_rate": 5e-05, "epoch": 0.6672216174681437, "step": 10420 }, { "loss": 2.1652, "grad_norm": 1.5750089883804321, "learning_rate": 5e-05, "epoch": 0.6675417813920728, "step": 10425 }, { "loss": 2.1989, "grad_norm": 1.6804012060165405, "learning_rate": 5e-05, "epoch": 0.6678619453160018, "step": 10430 }, { "loss": 2.1673, "grad_norm": 1.6494613885879517, "learning_rate": 5e-05, "epoch": 0.6681821092399308, "step": 10435 }, { "loss": 2.1598, "grad_norm": 1.5852676630020142, "learning_rate": 5e-05, "epoch": 0.6685022731638599, "step": 10440 }, { "loss": 2.188, "grad_norm": 1.6259963512420654, "learning_rate": 5e-05, "epoch": 0.668822437087789, "step": 10445 }, { "loss": 2.211, "grad_norm": 1.6978424787521362, "learning_rate": 5e-05, "epoch": 0.669142601011718, "step": 10450 }, { "loss": 2.1663, "grad_norm": 1.6471407413482666, "learning_rate": 5e-05, "epoch": 0.669462764935647, "step": 10455 }, { "loss": 2.1957, "grad_norm": 1.5860986709594727, "learning_rate": 5e-05, "epoch": 0.6697829288595761, "step": 10460 }, { "loss": 2.1557, "grad_norm": 1.65492582321167, "learning_rate": 5e-05, "epoch": 0.6701030927835051, "step": 10465 }, { "loss": 2.18, "grad_norm": 1.6820275783538818, "learning_rate": 5e-05, "epoch": 0.6704232567074342, "step": 10470 }, { "loss": 2.1984, "grad_norm": 1.6048341989517212, "learning_rate": 5e-05, "epoch": 0.6707434206313633, "step": 10475 }, { "loss": 2.1651, "grad_norm": 1.6907062530517578, "learning_rate": 5e-05, "epoch": 0.6710635845552924, "step": 10480 }, { "loss": 2.1792, "grad_norm": 1.6291903257369995, "learning_rate": 5e-05, "epoch": 0.6713837484792213, "step": 10485 }, { "loss": 2.1838, "grad_norm": 1.5670826435089111, "learning_rate": 5e-05, "epoch": 0.6717039124031504, "step": 10490 }, { "loss": 2.1827, "grad_norm": 1.6459773778915405, "learning_rate": 5e-05, "epoch": 0.6720240763270795, "step": 10495 }, { "loss": 2.1948, "grad_norm": 1.6180630922317505, "learning_rate": 5e-05, "epoch": 0.6723442402510085, "step": 10500 }, { "loss": 2.1589, "grad_norm": 1.5975276231765747, "learning_rate": 5e-05, "epoch": 0.6726644041749376, "step": 10505 }, { "loss": 2.1806, "grad_norm": 1.6482629776000977, "learning_rate": 5e-05, "epoch": 0.6729845680988666, "step": 10510 }, { "loss": 2.2065, "grad_norm": 1.535201907157898, "learning_rate": 5e-05, "epoch": 0.6733047320227957, "step": 10515 }, { "loss": 2.1731, "grad_norm": 1.6452282667160034, "learning_rate": 5e-05, "epoch": 0.6736248959467247, "step": 10520 }, { "loss": 2.1929, "grad_norm": 1.5697953701019287, "learning_rate": 5e-05, "epoch": 0.6739450598706538, "step": 10525 }, { "loss": 2.1677, "grad_norm": 1.689011812210083, "learning_rate": 5e-05, "epoch": 0.6742652237945829, "step": 10530 }, { "loss": 2.1907, "grad_norm": 1.5526947975158691, "learning_rate": 5e-05, "epoch": 0.6745853877185118, "step": 10535 }, { "loss": 2.1569, "grad_norm": 1.6372793912887573, "learning_rate": 5e-05, "epoch": 0.6749055516424409, "step": 10540 }, { "loss": 2.185, "grad_norm": 1.670624852180481, "learning_rate": 5e-05, "epoch": 0.67522571556637, "step": 10545 }, { "loss": 2.1731, "grad_norm": 1.6175780296325684, "learning_rate": 5e-05, "epoch": 0.675545879490299, "step": 10550 }, { "loss": 2.1483, "grad_norm": 1.5563071966171265, "learning_rate": 5e-05, "epoch": 0.6758660434142281, "step": 10555 }, { "loss": 2.1656, "grad_norm": 1.6369822025299072, "learning_rate": 5e-05, "epoch": 0.6761862073381572, "step": 10560 }, { "loss": 2.1781, "grad_norm": 1.5494641065597534, "learning_rate": 5e-05, "epoch": 0.6765063712620862, "step": 10565 }, { "loss": 2.175, "grad_norm": 1.6584258079528809, "learning_rate": 5e-05, "epoch": 0.6768265351860152, "step": 10570 }, { "loss": 2.1616, "grad_norm": 1.6767727136611938, "learning_rate": 5e-05, "epoch": 0.6771466991099443, "step": 10575 }, { "loss": 2.1636, "grad_norm": 1.555019497871399, "learning_rate": 5e-05, "epoch": 0.6774668630338734, "step": 10580 }, { "loss": 2.1667, "grad_norm": 1.597027063369751, "learning_rate": 5e-05, "epoch": 0.6777870269578024, "step": 10585 }, { "loss": 2.1898, "grad_norm": 1.5965244770050049, "learning_rate": 5e-05, "epoch": 0.6781071908817314, "step": 10590 }, { "loss": 2.1712, "grad_norm": 1.53036630153656, "learning_rate": 5e-05, "epoch": 0.6784273548056605, "step": 10595 }, { "loss": 2.1438, "grad_norm": 1.5849616527557373, "learning_rate": 5e-05, "epoch": 0.6787475187295896, "step": 10600 }, { "eval_loss": 2.038008213043213, "eval_runtime": 9.1765, "eval_samples_per_second": 223.178, "eval_steps_per_second": 27.897, "epoch": 0.6787475187295896, "step": 10600 }, { "loss": 2.1674, "grad_norm": 1.6421003341674805, "learning_rate": 5e-05, "epoch": 0.6790676826535186, "step": 10605 }, { "loss": 2.1808, "grad_norm": 1.602632761001587, "learning_rate": 5e-05, "epoch": 0.6793878465774477, "step": 10610 }, { "loss": 2.1703, "grad_norm": 1.7214833498001099, "learning_rate": 5e-05, "epoch": 0.6797080105013767, "step": 10615 }, { "loss": 2.2029, "grad_norm": 1.5542736053466797, "learning_rate": 5e-05, "epoch": 0.6800281744253057, "step": 10620 }, { "loss": 2.1652, "grad_norm": 1.7546051740646362, "learning_rate": 5e-05, "epoch": 0.6803483383492348, "step": 10625 }, { "loss": 2.1935, "grad_norm": 1.6220976114273071, "learning_rate": 5e-05, "epoch": 0.6806685022731639, "step": 10630 }, { "loss": 2.1631, "grad_norm": 1.7736234664916992, "learning_rate": 5e-05, "epoch": 0.6809886661970929, "step": 10635 }, { "loss": 2.1659, "grad_norm": 1.6324067115783691, "learning_rate": 5e-05, "epoch": 0.681308830121022, "step": 10640 }, { "loss": 2.1806, "grad_norm": 1.6361428499221802, "learning_rate": 5e-05, "epoch": 0.681628994044951, "step": 10645 }, { "loss": 2.1511, "grad_norm": 1.695727825164795, "learning_rate": 5e-05, "epoch": 0.6819491579688801, "step": 10650 }, { "loss": 2.1771, "grad_norm": 1.5814673900604248, "learning_rate": 5e-05, "epoch": 0.6822693218928091, "step": 10655 }, { "loss": 2.156, "grad_norm": 1.5904582738876343, "learning_rate": 5e-05, "epoch": 0.6825894858167382, "step": 10660 }, { "loss": 2.1845, "grad_norm": 1.6580817699432373, "learning_rate": 5e-05, "epoch": 0.6829096497406673, "step": 10665 }, { "loss": 2.1571, "grad_norm": 1.5745543241500854, "learning_rate": 5e-05, "epoch": 0.6832298136645962, "step": 10670 }, { "loss": 2.1504, "grad_norm": 1.5689435005187988, "learning_rate": 5e-05, "epoch": 0.6835499775885253, "step": 10675 }, { "loss": 2.1806, "grad_norm": 1.5868749618530273, "learning_rate": 5e-05, "epoch": 0.6838701415124544, "step": 10680 }, { "loss": 2.1685, "grad_norm": 1.6363295316696167, "learning_rate": 5e-05, "epoch": 0.6841903054363835, "step": 10685 }, { "loss": 2.1508, "grad_norm": 1.5658912658691406, "learning_rate": 5e-05, "epoch": 0.6845104693603125, "step": 10690 }, { "loss": 2.1798, "grad_norm": 1.73909330368042, "learning_rate": 5e-05, "epoch": 0.6848306332842415, "step": 10695 }, { "loss": 2.1349, "grad_norm": 1.5258785486221313, "learning_rate": 5e-05, "epoch": 0.6851507972081706, "step": 10700 }, { "loss": 2.1775, "grad_norm": 1.6583811044692993, "learning_rate": 5e-05, "epoch": 0.6854709611320996, "step": 10705 }, { "loss": 2.1723, "grad_norm": 1.623826503753662, "learning_rate": 5e-05, "epoch": 0.6857911250560287, "step": 10710 }, { "loss": 2.162, "grad_norm": 1.5844995975494385, "learning_rate": 5e-05, "epoch": 0.6861112889799578, "step": 10715 }, { "loss": 2.1719, "grad_norm": 1.5476335287094116, "learning_rate": 5e-05, "epoch": 0.6864314529038867, "step": 10720 }, { "loss": 2.1687, "grad_norm": 1.6284759044647217, "learning_rate": 5e-05, "epoch": 0.6867516168278158, "step": 10725 }, { "loss": 2.1917, "grad_norm": 1.6278412342071533, "learning_rate": 5e-05, "epoch": 0.6870717807517449, "step": 10730 }, { "loss": 2.1692, "grad_norm": 1.6887261867523193, "learning_rate": 5e-05, "epoch": 0.687391944675674, "step": 10735 }, { "loss": 2.1672, "grad_norm": 1.6040128469467163, "learning_rate": 5e-05, "epoch": 0.687712108599603, "step": 10740 }, { "loss": 2.1755, "grad_norm": 1.6104363203048706, "learning_rate": 5e-05, "epoch": 0.688032272523532, "step": 10745 }, { "loss": 2.1828, "grad_norm": 1.5533658266067505, "learning_rate": 5e-05, "epoch": 0.6883524364474611, "step": 10750 }, { "loss": 2.1762, "grad_norm": 1.6379051208496094, "learning_rate": 5e-05, "epoch": 0.6886726003713901, "step": 10755 }, { "loss": 2.1811, "grad_norm": 1.6067684888839722, "learning_rate": 5e-05, "epoch": 0.6889927642953192, "step": 10760 }, { "loss": 2.1711, "grad_norm": 1.6196955442428589, "learning_rate": 5e-05, "epoch": 0.6893129282192483, "step": 10765 }, { "loss": 2.1666, "grad_norm": 1.7209603786468506, "learning_rate": 5e-05, "epoch": 0.6896330921431774, "step": 10770 }, { "loss": 2.1632, "grad_norm": 1.6124151945114136, "learning_rate": 5e-05, "epoch": 0.6899532560671063, "step": 10775 }, { "loss": 2.1812, "grad_norm": 1.6349486112594604, "learning_rate": 5e-05, "epoch": 0.6902734199910354, "step": 10780 }, { "loss": 2.1459, "grad_norm": 1.5787373781204224, "learning_rate": 5e-05, "epoch": 0.6905935839149645, "step": 10785 }, { "loss": 2.1705, "grad_norm": 1.658933401107788, "learning_rate": 5e-05, "epoch": 0.6909137478388935, "step": 10790 }, { "loss": 2.2042, "grad_norm": 1.6257023811340332, "learning_rate": 5e-05, "epoch": 0.6912339117628226, "step": 10795 }, { "loss": 2.156, "grad_norm": 1.6555736064910889, "learning_rate": 5e-05, "epoch": 0.6915540756867516, "step": 10800 }, { "eval_loss": 2.0406501293182373, "eval_runtime": 9.4746, "eval_samples_per_second": 216.157, "eval_steps_per_second": 27.02, "epoch": 0.6915540756867516, "step": 10800 }, { "loss": 2.1915, "grad_norm": 1.5673022270202637, "learning_rate": 5e-05, "epoch": 0.6918742396106806, "step": 10805 }, { "loss": 2.1848, "grad_norm": 1.6205099821090698, "learning_rate": 5e-05, "epoch": 0.6921944035346097, "step": 10810 }, { "loss": 2.1478, "grad_norm": 1.605733871459961, "learning_rate": 5e-05, "epoch": 0.6925145674585388, "step": 10815 }, { "loss": 2.1869, "grad_norm": 1.6565288305282593, "learning_rate": 5e-05, "epoch": 0.6928347313824679, "step": 10820 }, { "loss": 2.1886, "grad_norm": 1.5877238512039185, "learning_rate": 5e-05, "epoch": 0.6931548953063968, "step": 10825 }, { "loss": 2.1702, "grad_norm": 1.612343668937683, "learning_rate": 5e-05, "epoch": 0.6934750592303259, "step": 10830 }, { "loss": 2.1676, "grad_norm": 1.5942809581756592, "learning_rate": 5e-05, "epoch": 0.693795223154255, "step": 10835 }, { "loss": 2.1859, "grad_norm": 1.575735330581665, "learning_rate": 5e-05, "epoch": 0.694115387078184, "step": 10840 }, { "loss": 2.1989, "grad_norm": 1.6750792264938354, "learning_rate": 5e-05, "epoch": 0.6944355510021131, "step": 10845 }, { "loss": 2.188, "grad_norm": 1.6374013423919678, "learning_rate": 5e-05, "epoch": 0.6947557149260422, "step": 10850 }, { "loss": 2.1745, "grad_norm": 1.6327176094055176, "learning_rate": 5e-05, "epoch": 0.6950758788499712, "step": 10855 }, { "loss": 2.1858, "grad_norm": 1.6038893461227417, "learning_rate": 5e-05, "epoch": 0.6953960427739002, "step": 10860 }, { "loss": 2.1917, "grad_norm": 1.6516578197479248, "learning_rate": 5e-05, "epoch": 0.6957162066978293, "step": 10865 }, { "loss": 2.1508, "grad_norm": 1.6180627346038818, "learning_rate": 5e-05, "epoch": 0.6960363706217584, "step": 10870 }, { "loss": 2.155, "grad_norm": 1.7027835845947266, "learning_rate": 5e-05, "epoch": 0.6963565345456874, "step": 10875 }, { "loss": 2.1702, "grad_norm": 1.6208568811416626, "learning_rate": 5e-05, "epoch": 0.6966766984696164, "step": 10880 }, { "loss": 2.1676, "grad_norm": 1.5398364067077637, "learning_rate": 5e-05, "epoch": 0.6969968623935455, "step": 10885 }, { "loss": 2.1617, "grad_norm": 1.579201340675354, "learning_rate": 5e-05, "epoch": 0.6973170263174745, "step": 10890 }, { "loss": 2.2142, "grad_norm": 1.629888653755188, "learning_rate": 5e-05, "epoch": 0.6976371902414036, "step": 10895 }, { "loss": 2.1654, "grad_norm": 1.5622855424880981, "learning_rate": 5e-05, "epoch": 0.6979573541653327, "step": 10900 }, { "loss": 2.1655, "grad_norm": 1.6659269332885742, "learning_rate": 5e-05, "epoch": 0.6982775180892618, "step": 10905 }, { "loss": 2.1779, "grad_norm": 1.5977221727371216, "learning_rate": 5e-05, "epoch": 0.6985976820131907, "step": 10910 }, { "loss": 2.1613, "grad_norm": 1.604508638381958, "learning_rate": 5e-05, "epoch": 0.6989178459371198, "step": 10915 }, { "loss": 2.1914, "grad_norm": 1.6567248106002808, "learning_rate": 5e-05, "epoch": 0.6992380098610489, "step": 10920 }, { "loss": 2.1479, "grad_norm": 1.681601881980896, "learning_rate": 5e-05, "epoch": 0.6995581737849779, "step": 10925 }, { "loss": 2.1561, "grad_norm": 1.6984683275222778, "learning_rate": 5e-05, "epoch": 0.699878337708907, "step": 10930 }, { "loss": 2.1897, "grad_norm": 1.6151689291000366, "learning_rate": 5e-05, "epoch": 0.700198501632836, "step": 10935 }, { "loss": 2.1693, "grad_norm": 1.680700659751892, "learning_rate": 5e-05, "epoch": 0.7005186655567651, "step": 10940 }, { "loss": 2.2056, "grad_norm": 1.6594295501708984, "learning_rate": 5e-05, "epoch": 0.7008388294806941, "step": 10945 }, { "loss": 2.1944, "grad_norm": 1.569491982460022, "learning_rate": 5e-05, "epoch": 0.7011589934046232, "step": 10950 }, { "loss": 2.1825, "grad_norm": 1.6589845418930054, "learning_rate": 5e-05, "epoch": 0.7014791573285523, "step": 10955 }, { "loss": 2.1556, "grad_norm": 1.6408551931381226, "learning_rate": 5e-05, "epoch": 0.7017993212524812, "step": 10960 }, { "loss": 2.1657, "grad_norm": 1.580729365348816, "learning_rate": 5e-05, "epoch": 0.7021194851764103, "step": 10965 }, { "loss": 2.203, "grad_norm": 2.0288913249969482, "learning_rate": 5e-05, "epoch": 0.7024396491003394, "step": 10970 }, { "loss": 2.1871, "grad_norm": 2.0533010959625244, "learning_rate": 5e-05, "epoch": 0.7027598130242684, "step": 10975 }, { "loss": 2.1798, "grad_norm": 1.7345025539398193, "learning_rate": 5e-05, "epoch": 0.7030799769481975, "step": 10980 }, { "loss": 2.1758, "grad_norm": 1.7670248746871948, "learning_rate": 5e-05, "epoch": 0.7034001408721265, "step": 10985 }, { "loss": 2.1745, "grad_norm": 1.7378064393997192, "learning_rate": 5e-05, "epoch": 0.7037203047960556, "step": 10990 }, { "loss": 2.1861, "grad_norm": 1.7046862840652466, "learning_rate": 5e-05, "epoch": 0.7040404687199846, "step": 10995 }, { "loss": 2.1677, "grad_norm": 1.7800571918487549, "learning_rate": 5e-05, "epoch": 0.7043606326439137, "step": 11000 }, { "eval_loss": 2.030477523803711, "eval_runtime": 11.977, "eval_samples_per_second": 170.995, "eval_steps_per_second": 21.374, "epoch": 0.7043606326439137, "step": 11000 }, { "loss": 2.1441, "grad_norm": 1.6233124732971191, "learning_rate": 5e-05, "epoch": 0.7046807965678428, "step": 11005 }, { "loss": 2.1677, "grad_norm": 1.6337502002716064, "learning_rate": 5e-05, "epoch": 0.7050009604917717, "step": 11010 }, { "loss": 2.1558, "grad_norm": 1.602023720741272, "learning_rate": 5e-05, "epoch": 0.7053211244157008, "step": 11015 }, { "loss": 2.2029, "grad_norm": 1.642838478088379, "learning_rate": 5e-05, "epoch": 0.7056412883396299, "step": 11020 }, { "loss": 2.1435, "grad_norm": 1.6262296438217163, "learning_rate": 5e-05, "epoch": 0.705961452263559, "step": 11025 }, { "loss": 2.1539, "grad_norm": 1.6026281118392944, "learning_rate": 5e-05, "epoch": 0.706281616187488, "step": 11030 }, { "loss": 2.1486, "grad_norm": 1.722970962524414, "learning_rate": 5e-05, "epoch": 0.706601780111417, "step": 11035 }, { "loss": 2.1827, "grad_norm": 1.606521487236023, "learning_rate": 5e-05, "epoch": 0.7069219440353461, "step": 11040 }, { "loss": 2.1569, "grad_norm": 1.6021209955215454, "learning_rate": 5e-05, "epoch": 0.7072421079592751, "step": 11045 }, { "loss": 2.1489, "grad_norm": 1.5476322174072266, "learning_rate": 5e-05, "epoch": 0.7075622718832042, "step": 11050 }, { "loss": 2.1817, "grad_norm": 1.5742813348770142, "learning_rate": 5e-05, "epoch": 0.7078824358071333, "step": 11055 }, { "loss": 2.1811, "grad_norm": 1.5461931228637695, "learning_rate": 5e-05, "epoch": 0.7082025997310623, "step": 11060 }, { "loss": 2.1569, "grad_norm": 1.5782997608184814, "learning_rate": 5e-05, "epoch": 0.7085227636549913, "step": 11065 }, { "loss": 2.1863, "grad_norm": 1.652904748916626, "learning_rate": 5e-05, "epoch": 0.7088429275789204, "step": 11070 }, { "loss": 2.1902, "grad_norm": 1.6661841869354248, "learning_rate": 5e-05, "epoch": 0.7091630915028495, "step": 11075 }, { "loss": 2.1641, "grad_norm": 1.5823413133621216, "learning_rate": 5e-05, "epoch": 0.7094832554267785, "step": 11080 }, { "loss": 2.1793, "grad_norm": 1.6849126815795898, "learning_rate": 5e-05, "epoch": 0.7098034193507076, "step": 11085 }, { "loss": 2.1686, "grad_norm": 1.4970881938934326, "learning_rate": 5e-05, "epoch": 0.7101235832746366, "step": 11090 }, { "loss": 2.1678, "grad_norm": 1.5892386436462402, "learning_rate": 5e-05, "epoch": 0.7104437471985656, "step": 11095 }, { "loss": 2.1718, "grad_norm": 1.6064002513885498, "learning_rate": 5e-05, "epoch": 0.7107639111224947, "step": 11100 }, { "loss": 2.1564, "grad_norm": 1.6332510709762573, "learning_rate": 5e-05, "epoch": 0.7110840750464238, "step": 11105 }, { "loss": 2.1548, "grad_norm": 1.5667170286178589, "learning_rate": 5e-05, "epoch": 0.7114042389703529, "step": 11110 }, { "loss": 2.1598, "grad_norm": 1.6204004287719727, "learning_rate": 5e-05, "epoch": 0.7117244028942818, "step": 11115 }, { "loss": 2.1704, "grad_norm": 1.6428627967834473, "learning_rate": 5e-05, "epoch": 0.7120445668182109, "step": 11120 }, { "loss": 2.1742, "grad_norm": 1.6737961769104004, "learning_rate": 5e-05, "epoch": 0.71236473074214, "step": 11125 }, { "loss": 2.1952, "grad_norm": 1.6475317478179932, "learning_rate": 5e-05, "epoch": 0.712684894666069, "step": 11130 }, { "loss": 2.1933, "grad_norm": 1.6665571928024292, "learning_rate": 5e-05, "epoch": 0.7130050585899981, "step": 11135 }, { "loss": 2.1678, "grad_norm": 1.5727708339691162, "learning_rate": 5e-05, "epoch": 0.7133252225139272, "step": 11140 }, { "loss": 2.1553, "grad_norm": 1.5942537784576416, "learning_rate": 5e-05, "epoch": 0.7136453864378561, "step": 11145 }, { "loss": 2.1724, "grad_norm": 1.6233826875686646, "learning_rate": 5e-05, "epoch": 0.7139655503617852, "step": 11150 }, { "loss": 2.17, "grad_norm": 1.6468729972839355, "learning_rate": 5e-05, "epoch": 0.7142857142857143, "step": 11155 }, { "loss": 2.1759, "grad_norm": 1.6635937690734863, "learning_rate": 5e-05, "epoch": 0.7146058782096434, "step": 11160 }, { "loss": 2.1676, "grad_norm": 1.67452073097229, "learning_rate": 5e-05, "epoch": 0.7149260421335724, "step": 11165 }, { "loss": 2.1679, "grad_norm": 1.6762311458587646, "learning_rate": 5e-05, "epoch": 0.7152462060575014, "step": 11170 }, { "loss": 2.2012, "grad_norm": 1.6281007528305054, "learning_rate": 5e-05, "epoch": 0.7155663699814305, "step": 11175 }, { "loss": 2.1857, "grad_norm": 1.6250513792037964, "learning_rate": 5e-05, "epoch": 0.7158865339053595, "step": 11180 }, { "loss": 2.1537, "grad_norm": 1.57022225856781, "learning_rate": 5e-05, "epoch": 0.7162066978292886, "step": 11185 }, { "loss": 2.1607, "grad_norm": 1.6798429489135742, "learning_rate": 5e-05, "epoch": 0.7165268617532177, "step": 11190 }, { "loss": 2.1764, "grad_norm": 1.6130719184875488, "learning_rate": 5e-05, "epoch": 0.7168470256771468, "step": 11195 }, { "loss": 2.1805, "grad_norm": 1.6312038898468018, "learning_rate": 5e-05, "epoch": 0.7171671896010757, "step": 11200 }, { "eval_loss": 2.0418291091918945, "eval_runtime": 9.4156, "eval_samples_per_second": 217.512, "eval_steps_per_second": 27.189, "epoch": 0.7171671896010757, "step": 11200 }, { "loss": 2.1583, "grad_norm": 1.6148508787155151, "learning_rate": 5e-05, "epoch": 0.7174873535250048, "step": 11205 }, { "loss": 2.156, "grad_norm": 1.6483427286148071, "learning_rate": 5e-05, "epoch": 0.7178075174489339, "step": 11210 }, { "loss": 2.1894, "grad_norm": 1.7447383403778076, "learning_rate": 5e-05, "epoch": 0.7181276813728629, "step": 11215 }, { "loss": 2.1905, "grad_norm": 1.605284571647644, "learning_rate": 5e-05, "epoch": 0.718447845296792, "step": 11220 }, { "loss": 2.1626, "grad_norm": 1.5911649465560913, "learning_rate": 5e-05, "epoch": 0.718768009220721, "step": 11225 }, { "loss": 2.1717, "grad_norm": 1.535005807876587, "learning_rate": 5e-05, "epoch": 0.71908817314465, "step": 11230 }, { "loss": 2.1844, "grad_norm": 1.5297882556915283, "learning_rate": 5e-05, "epoch": 0.7194083370685791, "step": 11235 }, { "loss": 2.1602, "grad_norm": 1.6361292600631714, "learning_rate": 5e-05, "epoch": 0.7197285009925082, "step": 11240 }, { "loss": 2.1776, "grad_norm": 1.6264761686325073, "learning_rate": 5e-05, "epoch": 0.7200486649164373, "step": 11245 }, { "loss": 2.1692, "grad_norm": 1.6878418922424316, "learning_rate": 5e-05, "epoch": 0.7203688288403662, "step": 11250 }, { "loss": 2.1613, "grad_norm": 1.6586750745773315, "learning_rate": 5e-05, "epoch": 0.7206889927642953, "step": 11255 }, { "loss": 2.1647, "grad_norm": 1.6286550760269165, "learning_rate": 5e-05, "epoch": 0.7210091566882244, "step": 11260 }, { "loss": 2.1485, "grad_norm": 1.6325013637542725, "learning_rate": 5e-05, "epoch": 0.7213293206121534, "step": 11265 }, { "loss": 2.1846, "grad_norm": 1.5908008813858032, "learning_rate": 5e-05, "epoch": 0.7216494845360825, "step": 11270 }, { "loss": 2.1725, "grad_norm": 1.5738271474838257, "learning_rate": 5e-05, "epoch": 0.7219696484600115, "step": 11275 }, { "loss": 2.1878, "grad_norm": 1.6095255613327026, "learning_rate": 5e-05, "epoch": 0.7222898123839406, "step": 11280 }, { "loss": 2.1599, "grad_norm": 1.5834318399429321, "learning_rate": 5e-05, "epoch": 0.7226099763078696, "step": 11285 }, { "loss": 2.2047, "grad_norm": 1.5938414335250854, "learning_rate": 5e-05, "epoch": 0.7229301402317987, "step": 11290 }, { "loss": 2.1833, "grad_norm": 1.62465238571167, "learning_rate": 5e-05, "epoch": 0.7232503041557278, "step": 11295 }, { "loss": 2.1639, "grad_norm": 1.6351125240325928, "learning_rate": 5e-05, "epoch": 0.7235704680796567, "step": 11300 }, { "loss": 2.1567, "grad_norm": 1.6405686140060425, "learning_rate": 5e-05, "epoch": 0.7238906320035858, "step": 11305 }, { "loss": 2.1425, "grad_norm": 1.6778993606567383, "learning_rate": 5e-05, "epoch": 0.7242107959275149, "step": 11310 }, { "loss": 2.1388, "grad_norm": 1.5974764823913574, "learning_rate": 5e-05, "epoch": 0.7245309598514439, "step": 11315 }, { "loss": 2.1866, "grad_norm": 1.6215441226959229, "learning_rate": 5e-05, "epoch": 0.724851123775373, "step": 11320 }, { "loss": 2.1612, "grad_norm": 1.638525366783142, "learning_rate": 5e-05, "epoch": 0.7251712876993021, "step": 11325 }, { "loss": 2.1588, "grad_norm": 1.677040934562683, "learning_rate": 5e-05, "epoch": 0.7254914516232311, "step": 11330 }, { "loss": 2.1894, "grad_norm": 1.6689181327819824, "learning_rate": 5e-05, "epoch": 0.7258116155471601, "step": 11335 }, { "loss": 2.1748, "grad_norm": 1.6328116655349731, "learning_rate": 5e-05, "epoch": 0.7261317794710892, "step": 11340 }, { "loss": 2.154, "grad_norm": 1.6823488473892212, "learning_rate": 5e-05, "epoch": 0.7264519433950183, "step": 11345 }, { "loss": 2.1432, "grad_norm": 1.6382944583892822, "learning_rate": 5e-05, "epoch": 0.7267721073189473, "step": 11350 }, { "loss": 2.1632, "grad_norm": 1.6119322776794434, "learning_rate": 5e-05, "epoch": 0.7270922712428763, "step": 11355 }, { "loss": 2.1636, "grad_norm": 1.6790704727172852, "learning_rate": 5e-05, "epoch": 0.7274124351668054, "step": 11360 }, { "loss": 2.1905, "grad_norm": 1.6697032451629639, "learning_rate": 5e-05, "epoch": 0.7277325990907345, "step": 11365 }, { "loss": 2.1917, "grad_norm": 1.6095983982086182, "learning_rate": 5e-05, "epoch": 0.7280527630146635, "step": 11370 }, { "loss": 2.1417, "grad_norm": 1.5918678045272827, "learning_rate": 5e-05, "epoch": 0.7283729269385926, "step": 11375 }, { "loss": 2.1782, "grad_norm": 1.5360814332962036, "learning_rate": 5e-05, "epoch": 0.7286930908625217, "step": 11380 }, { "loss": 2.1869, "grad_norm": 1.613787293434143, "learning_rate": 5e-05, "epoch": 0.7290132547864506, "step": 11385 }, { "loss": 2.1888, "grad_norm": 1.5174287557601929, "learning_rate": 5e-05, "epoch": 0.7293334187103797, "step": 11390 }, { "loss": 2.1803, "grad_norm": 1.625603437423706, "learning_rate": 5e-05, "epoch": 0.7296535826343088, "step": 11395 }, { "loss": 2.1788, "grad_norm": 1.572482943534851, "learning_rate": 5e-05, "epoch": 0.7299737465582378, "step": 11400 }, { "eval_loss": 2.03629732131958, "eval_runtime": 9.2282, "eval_samples_per_second": 221.929, "eval_steps_per_second": 27.741, "epoch": 0.7299737465582378, "step": 11400 }, { "loss": 2.1694, "grad_norm": 1.6652473211288452, "learning_rate": 5e-05, "epoch": 0.7302939104821669, "step": 11405 }, { "loss": 2.1589, "grad_norm": 1.5308523178100586, "learning_rate": 5e-05, "epoch": 0.7306140744060959, "step": 11410 }, { "loss": 2.1666, "grad_norm": 1.578356385231018, "learning_rate": 5e-05, "epoch": 0.730934238330025, "step": 11415 }, { "loss": 2.1538, "grad_norm": 1.6113513708114624, "learning_rate": 5e-05, "epoch": 0.731254402253954, "step": 11420 }, { "loss": 2.184, "grad_norm": 1.6025482416152954, "learning_rate": 5e-05, "epoch": 0.7315745661778831, "step": 11425 }, { "loss": 2.1466, "grad_norm": 1.674232006072998, "learning_rate": 5e-05, "epoch": 0.7318947301018122, "step": 11430 }, { "loss": 2.1636, "grad_norm": 1.6667330265045166, "learning_rate": 5e-05, "epoch": 0.7322148940257411, "step": 11435 }, { "loss": 2.1873, "grad_norm": 1.5927730798721313, "learning_rate": 5e-05, "epoch": 0.7325350579496702, "step": 11440 }, { "loss": 2.1408, "grad_norm": 1.6314619779586792, "learning_rate": 5e-05, "epoch": 0.7328552218735993, "step": 11445 }, { "loss": 2.1728, "grad_norm": 1.6256287097930908, "learning_rate": 5e-05, "epoch": 0.7331753857975284, "step": 11450 }, { "loss": 2.1653, "grad_norm": 1.556431531906128, "learning_rate": 5e-05, "epoch": 0.7334955497214574, "step": 11455 }, { "loss": 2.1638, "grad_norm": 1.600176453590393, "learning_rate": 5e-05, "epoch": 0.7338157136453864, "step": 11460 }, { "loss": 2.152, "grad_norm": 1.5737305879592896, "learning_rate": 5e-05, "epoch": 0.7341358775693155, "step": 11465 }, { "loss": 2.1757, "grad_norm": 1.5820810794830322, "learning_rate": 5e-05, "epoch": 0.7344560414932445, "step": 11470 }, { "loss": 2.1806, "grad_norm": 1.7114322185516357, "learning_rate": 5e-05, "epoch": 0.7347762054171736, "step": 11475 }, { "loss": 2.1672, "grad_norm": 1.5795398950576782, "learning_rate": 5e-05, "epoch": 0.7350963693411027, "step": 11480 }, { "loss": 2.178, "grad_norm": 1.6970211267471313, "learning_rate": 5e-05, "epoch": 0.7354165332650316, "step": 11485 }, { "loss": 2.1437, "grad_norm": 1.6389212608337402, "learning_rate": 5e-05, "epoch": 0.7357366971889607, "step": 11490 }, { "loss": 2.1463, "grad_norm": 1.5622590780258179, "learning_rate": 5e-05, "epoch": 0.7360568611128898, "step": 11495 }, { "loss": 2.1763, "grad_norm": 1.6667590141296387, "learning_rate": 5e-05, "epoch": 0.7363770250368189, "step": 11500 }, { "loss": 2.1737, "grad_norm": 1.6279217004776, "learning_rate": 5e-05, "epoch": 0.7366971889607479, "step": 11505 }, { "loss": 2.1736, "grad_norm": 1.5891218185424805, "learning_rate": 5e-05, "epoch": 0.737017352884677, "step": 11510 }, { "loss": 2.1645, "grad_norm": 1.5445228815078735, "learning_rate": 5e-05, "epoch": 0.737337516808606, "step": 11515 }, { "loss": 2.1715, "grad_norm": 1.6425042152404785, "learning_rate": 5e-05, "epoch": 0.737657680732535, "step": 11520 }, { "loss": 2.1548, "grad_norm": 1.6075410842895508, "learning_rate": 5e-05, "epoch": 0.7379778446564641, "step": 11525 }, { "loss": 2.1311, "grad_norm": 1.6689494848251343, "learning_rate": 5e-05, "epoch": 0.7382980085803932, "step": 11530 }, { "loss": 2.1788, "grad_norm": 1.6945607662200928, "learning_rate": 5e-05, "epoch": 0.7386181725043223, "step": 11535 }, { "loss": 2.1646, "grad_norm": 1.7109571695327759, "learning_rate": 5e-05, "epoch": 0.7389383364282512, "step": 11540 }, { "loss": 2.1656, "grad_norm": 1.6874438524246216, "learning_rate": 5e-05, "epoch": 0.7392585003521803, "step": 11545 }, { "loss": 2.1719, "grad_norm": 1.6662932634353638, "learning_rate": 5e-05, "epoch": 0.7395786642761094, "step": 11550 }, { "loss": 2.1489, "grad_norm": 1.5764780044555664, "learning_rate": 5e-05, "epoch": 0.7398988282000384, "step": 11555 }, { "loss": 2.1715, "grad_norm": 1.6159734725952148, "learning_rate": 5e-05, "epoch": 0.7402189921239675, "step": 11560 }, { "loss": 2.1861, "grad_norm": 1.5857573747634888, "learning_rate": 5e-05, "epoch": 0.7405391560478966, "step": 11565 }, { "loss": 2.1622, "grad_norm": 1.5534696578979492, "learning_rate": 5e-05, "epoch": 0.7408593199718255, "step": 11570 }, { "loss": 2.144, "grad_norm": 1.5971225500106812, "learning_rate": 5e-05, "epoch": 0.7411794838957546, "step": 11575 }, { "loss": 2.1669, "grad_norm": 1.6196457147598267, "learning_rate": 5e-05, "epoch": 0.7414996478196837, "step": 11580 }, { "loss": 2.1851, "grad_norm": 1.584476351737976, "learning_rate": 5e-05, "epoch": 0.7418198117436128, "step": 11585 }, { "loss": 2.1444, "grad_norm": 1.6179357767105103, "learning_rate": 5e-05, "epoch": 0.7421399756675418, "step": 11590 }, { "loss": 2.1497, "grad_norm": 1.549419641494751, "learning_rate": 5e-05, "epoch": 0.7424601395914708, "step": 11595 }, { "loss": 2.1902, "grad_norm": 1.6180243492126465, "learning_rate": 5e-05, "epoch": 0.7427803035153999, "step": 11600 }, { "eval_loss": 2.032823085784912, "eval_runtime": 12.5035, "eval_samples_per_second": 163.795, "eval_steps_per_second": 20.474, "epoch": 0.7427803035153999, "step": 11600 }, { "loss": 2.1372, "grad_norm": 1.7094162702560425, "learning_rate": 5e-05, "epoch": 0.7431004674393289, "step": 11605 }, { "loss": 2.169, "grad_norm": 1.5947761535644531, "learning_rate": 5e-05, "epoch": 0.743420631363258, "step": 11610 }, { "loss": 2.1527, "grad_norm": 1.634129524230957, "learning_rate": 5e-05, "epoch": 0.7437407952871871, "step": 11615 }, { "loss": 2.1599, "grad_norm": 1.6361533403396606, "learning_rate": 5e-05, "epoch": 0.7440609592111161, "step": 11620 }, { "loss": 2.1682, "grad_norm": 1.6670771837234497, "learning_rate": 5e-05, "epoch": 0.7443811231350451, "step": 11625 }, { "loss": 2.1661, "grad_norm": 1.6897422075271606, "learning_rate": 5e-05, "epoch": 0.7447012870589742, "step": 11630 }, { "loss": 2.1425, "grad_norm": 1.5701279640197754, "learning_rate": 5e-05, "epoch": 0.7450214509829033, "step": 11635 }, { "loss": 2.1747, "grad_norm": 1.6329246759414673, "learning_rate": 5e-05, "epoch": 0.7453416149068323, "step": 11640 }, { "loss": 2.186, "grad_norm": 1.651111364364624, "learning_rate": 5e-05, "epoch": 0.7456617788307613, "step": 11645 }, { "loss": 2.1843, "grad_norm": 1.5741336345672607, "learning_rate": 5e-05, "epoch": 0.7459819427546904, "step": 11650 }, { "loss": 2.1641, "grad_norm": 1.6159652471542358, "learning_rate": 5e-05, "epoch": 0.7463021066786194, "step": 11655 }, { "loss": 2.1791, "grad_norm": 1.538552165031433, "learning_rate": 5e-05, "epoch": 0.7466222706025485, "step": 11660 }, { "loss": 2.1707, "grad_norm": 1.616956353187561, "learning_rate": 5e-05, "epoch": 0.7469424345264776, "step": 11665 }, { "loss": 2.1778, "grad_norm": 1.530639410018921, "learning_rate": 5e-05, "epoch": 0.7472625984504067, "step": 11670 }, { "loss": 2.1523, "grad_norm": 1.5348337888717651, "learning_rate": 5e-05, "epoch": 0.7475827623743356, "step": 11675 }, { "loss": 2.1787, "grad_norm": 1.6515288352966309, "learning_rate": 5e-05, "epoch": 0.7479029262982647, "step": 11680 }, { "loss": 2.167, "grad_norm": 1.654701828956604, "learning_rate": 5e-05, "epoch": 0.7482230902221938, "step": 11685 }, { "loss": 2.149, "grad_norm": 1.5613808631896973, "learning_rate": 5e-05, "epoch": 0.7485432541461228, "step": 11690 }, { "loss": 2.1814, "grad_norm": 1.6129508018493652, "learning_rate": 5e-05, "epoch": 0.7488634180700519, "step": 11695 }, { "loss": 2.1738, "grad_norm": 1.5563007593154907, "learning_rate": 5e-05, "epoch": 0.7491835819939809, "step": 11700 }, { "loss": 2.1543, "grad_norm": 1.5803391933441162, "learning_rate": 5e-05, "epoch": 0.74950374591791, "step": 11705 }, { "loss": 2.19, "grad_norm": 1.61357581615448, "learning_rate": 5e-05, "epoch": 0.749823909841839, "step": 11710 }, { "loss": 2.1295, "grad_norm": 1.5979573726654053, "learning_rate": 5e-05, "epoch": 0.7501440737657681, "step": 11715 }, { "loss": 2.1445, "grad_norm": 1.6031203269958496, "learning_rate": 5e-05, "epoch": 0.7504642376896972, "step": 11720 }, { "loss": 2.1808, "grad_norm": 1.645012378692627, "learning_rate": 5e-05, "epoch": 0.7507844016136261, "step": 11725 }, { "loss": 2.161, "grad_norm": 1.629136562347412, "learning_rate": 5e-05, "epoch": 0.7511045655375552, "step": 11730 }, { "loss": 2.1554, "grad_norm": 1.5675849914550781, "learning_rate": 5e-05, "epoch": 0.7514247294614843, "step": 11735 }, { "loss": 2.1769, "grad_norm": 1.6842530965805054, "learning_rate": 5e-05, "epoch": 0.7517448933854133, "step": 11740 }, { "loss": 2.1571, "grad_norm": 1.645548939704895, "learning_rate": 5e-05, "epoch": 0.7520650573093424, "step": 11745 }, { "loss": 2.1565, "grad_norm": 1.6353763341903687, "learning_rate": 5e-05, "epoch": 0.7523852212332715, "step": 11750 }, { "loss": 2.1574, "grad_norm": 1.6045830249786377, "learning_rate": 5e-05, "epoch": 0.7527053851572005, "step": 11755 }, { "loss": 2.1557, "grad_norm": 1.6563011407852173, "learning_rate": 5e-05, "epoch": 0.7530255490811295, "step": 11760 }, { "loss": 2.1776, "grad_norm": 1.662995457649231, "learning_rate": 5e-05, "epoch": 0.7533457130050586, "step": 11765 }, { "loss": 2.1611, "grad_norm": 1.6755454540252686, "learning_rate": 5e-05, "epoch": 0.7536658769289877, "step": 11770 }, { "loss": 2.164, "grad_norm": 1.6755766868591309, "learning_rate": 5e-05, "epoch": 0.7539860408529167, "step": 11775 }, { "loss": 2.1476, "grad_norm": 1.637885570526123, "learning_rate": 5e-05, "epoch": 0.7543062047768457, "step": 11780 }, { "loss": 2.1591, "grad_norm": 1.5392520427703857, "learning_rate": 5e-05, "epoch": 0.7546263687007748, "step": 11785 }, { "loss": 2.1723, "grad_norm": 1.702089548110962, "learning_rate": 5e-05, "epoch": 0.7549465326247039, "step": 11790 }, { "loss": 2.1572, "grad_norm": 1.5574604272842407, "learning_rate": 5e-05, "epoch": 0.7552666965486329, "step": 11795 }, { "loss": 2.1696, "grad_norm": 1.565976858139038, "learning_rate": 5e-05, "epoch": 0.755586860472562, "step": 11800 }, { "eval_loss": 2.033466339111328, "eval_runtime": 9.2054, "eval_samples_per_second": 222.478, "eval_steps_per_second": 27.81, "epoch": 0.755586860472562, "step": 11800 }, { "loss": 2.154, "grad_norm": 1.5538219213485718, "learning_rate": 5e-05, "epoch": 0.755907024396491, "step": 11805 }, { "loss": 2.175, "grad_norm": 1.6611093282699585, "learning_rate": 5e-05, "epoch": 0.75622718832042, "step": 11810 }, { "loss": 2.1702, "grad_norm": 1.61954665184021, "learning_rate": 5e-05, "epoch": 0.7565473522443491, "step": 11815 }, { "loss": 2.1554, "grad_norm": 1.5703262090682983, "learning_rate": 5e-05, "epoch": 0.7568675161682782, "step": 11820 }, { "loss": 2.1639, "grad_norm": 1.5392646789550781, "learning_rate": 5e-05, "epoch": 0.7571876800922072, "step": 11825 }, { "loss": 2.1582, "grad_norm": 1.6835620403289795, "learning_rate": 5e-05, "epoch": 0.7575078440161362, "step": 11830 }, { "loss": 2.1526, "grad_norm": 1.6214685440063477, "learning_rate": 5e-05, "epoch": 0.7578280079400653, "step": 11835 }, { "loss": 2.1829, "grad_norm": 1.613537311553955, "learning_rate": 5e-05, "epoch": 0.7581481718639944, "step": 11840 }, { "loss": 2.1452, "grad_norm": 1.5517228841781616, "learning_rate": 5e-05, "epoch": 0.7584683357879234, "step": 11845 }, { "loss": 2.1803, "grad_norm": 1.6012214422225952, "learning_rate": 5e-05, "epoch": 0.7587884997118525, "step": 11850 }, { "loss": 2.1673, "grad_norm": 1.5846041440963745, "learning_rate": 5e-05, "epoch": 0.7591086636357816, "step": 11855 }, { "loss": 2.1348, "grad_norm": 1.588087558746338, "learning_rate": 5e-05, "epoch": 0.7594288275597105, "step": 11860 }, { "loss": 2.1479, "grad_norm": 1.585292100906372, "learning_rate": 5e-05, "epoch": 0.7597489914836396, "step": 11865 }, { "loss": 2.1432, "grad_norm": 1.5393071174621582, "learning_rate": 5e-05, "epoch": 0.7600691554075687, "step": 11870 }, { "loss": 2.1886, "grad_norm": 1.5727068185806274, "learning_rate": 5e-05, "epoch": 0.7603893193314978, "step": 11875 }, { "loss": 2.177, "grad_norm": 1.6157071590423584, "learning_rate": 5e-05, "epoch": 0.7607094832554268, "step": 11880 }, { "loss": 2.1629, "grad_norm": 1.6175097227096558, "learning_rate": 5e-05, "epoch": 0.7610296471793558, "step": 11885 }, { "loss": 2.1848, "grad_norm": 1.7654550075531006, "learning_rate": 5e-05, "epoch": 0.7613498111032849, "step": 11890 }, { "loss": 2.1423, "grad_norm": 1.6491674184799194, "learning_rate": 5e-05, "epoch": 0.7616699750272139, "step": 11895 }, { "loss": 2.1446, "grad_norm": 1.5910719633102417, "learning_rate": 5e-05, "epoch": 0.761990138951143, "step": 11900 }, { "loss": 2.1554, "grad_norm": 1.5420751571655273, "learning_rate": 5e-05, "epoch": 0.7623103028750721, "step": 11905 }, { "loss": 2.1404, "grad_norm": 1.6134288311004639, "learning_rate": 5e-05, "epoch": 0.762630466799001, "step": 11910 }, { "loss": 2.1559, "grad_norm": 1.4922846555709839, "learning_rate": 5e-05, "epoch": 0.7629506307229301, "step": 11915 }, { "loss": 2.152, "grad_norm": 1.5739011764526367, "learning_rate": 5e-05, "epoch": 0.7632707946468592, "step": 11920 }, { "loss": 2.1655, "grad_norm": 1.6074965000152588, "learning_rate": 5e-05, "epoch": 0.7635909585707883, "step": 11925 }, { "loss": 2.1552, "grad_norm": 1.5739712715148926, "learning_rate": 5e-05, "epoch": 0.7639111224947173, "step": 11930 }, { "loss": 2.1671, "grad_norm": 1.595373272895813, "learning_rate": 5e-05, "epoch": 0.7642312864186463, "step": 11935 }, { "loss": 2.1588, "grad_norm": 1.5630054473876953, "learning_rate": 5e-05, "epoch": 0.7645514503425754, "step": 11940 }, { "loss": 2.159, "grad_norm": 1.6202868223190308, "learning_rate": 5e-05, "epoch": 0.7648716142665044, "step": 11945 }, { "loss": 2.2056, "grad_norm": 1.671276330947876, "learning_rate": 5e-05, "epoch": 0.7651917781904335, "step": 11950 }, { "loss": 2.1376, "grad_norm": 1.714341163635254, "learning_rate": 5e-05, "epoch": 0.7655119421143626, "step": 11955 }, { "loss": 2.165, "grad_norm": 1.734993815422058, "learning_rate": 5e-05, "epoch": 0.7658321060382917, "step": 11960 }, { "loss": 2.1515, "grad_norm": 1.7148008346557617, "learning_rate": 5e-05, "epoch": 0.7661522699622206, "step": 11965 }, { "loss": 2.1716, "grad_norm": 1.6618858575820923, "learning_rate": 5e-05, "epoch": 0.7664724338861497, "step": 11970 }, { "loss": 2.1366, "grad_norm": 1.603656530380249, "learning_rate": 5e-05, "epoch": 0.7667925978100788, "step": 11975 }, { "loss": 2.1583, "grad_norm": 1.618535041809082, "learning_rate": 5e-05, "epoch": 0.7671127617340078, "step": 11980 }, { "loss": 2.14, "grad_norm": 1.553106665611267, "learning_rate": 5e-05, "epoch": 0.7674329256579369, "step": 11985 }, { "loss": 2.1562, "grad_norm": 1.5241880416870117, "learning_rate": 5e-05, "epoch": 0.7677530895818659, "step": 11990 }, { "loss": 2.1738, "grad_norm": 1.5153862237930298, "learning_rate": 5e-05, "epoch": 0.7680732535057949, "step": 11995 }, { "loss": 2.1916, "grad_norm": 1.577313780784607, "learning_rate": 5e-05, "epoch": 0.768393417429724, "step": 12000 }, { "eval_loss": 2.0328667163848877, "eval_runtime": 12.6548, "eval_samples_per_second": 161.836, "eval_steps_per_second": 20.23, "epoch": 0.768393417429724, "step": 12000 }, { "loss": 2.1293, "grad_norm": 1.6270264387130737, "learning_rate": 5e-05, "epoch": 0.7687135813536531, "step": 12005 }, { "loss": 2.1822, "grad_norm": 1.679459571838379, "learning_rate": 5e-05, "epoch": 0.7690337452775822, "step": 12010 }, { "loss": 2.1355, "grad_norm": 1.6691720485687256, "learning_rate": 5e-05, "epoch": 0.7693539092015111, "step": 12015 }, { "loss": 2.1708, "grad_norm": 1.6165096759796143, "learning_rate": 5e-05, "epoch": 0.7696740731254402, "step": 12020 }, { "loss": 2.1629, "grad_norm": 1.6087703704833984, "learning_rate": 5e-05, "epoch": 0.7699942370493693, "step": 12025 }, { "loss": 2.169, "grad_norm": 1.5682085752487183, "learning_rate": 5e-05, "epoch": 0.7703144009732983, "step": 12030 }, { "loss": 2.1344, "grad_norm": 1.5505659580230713, "learning_rate": 5e-05, "epoch": 0.7706345648972274, "step": 12035 }, { "loss": 2.1576, "grad_norm": 1.5263081789016724, "learning_rate": 5e-05, "epoch": 0.7709547288211565, "step": 12040 }, { "loss": 2.1756, "grad_norm": 1.700647234916687, "learning_rate": 5e-05, "epoch": 0.7712748927450855, "step": 12045 }, { "loss": 2.1783, "grad_norm": 1.6649285554885864, "learning_rate": 5e-05, "epoch": 0.7715950566690145, "step": 12050 }, { "loss": 2.1548, "grad_norm": 1.6890583038330078, "learning_rate": 5e-05, "epoch": 0.7719152205929436, "step": 12055 }, { "loss": 2.1775, "grad_norm": 1.7615318298339844, "learning_rate": 5e-05, "epoch": 0.7722353845168727, "step": 12060 }, { "loss": 2.1643, "grad_norm": 1.7467012405395508, "learning_rate": 5e-05, "epoch": 0.7725555484408017, "step": 12065 }, { "loss": 2.158, "grad_norm": 1.6698206663131714, "learning_rate": 5e-05, "epoch": 0.7728757123647307, "step": 12070 }, { "loss": 2.1472, "grad_norm": 1.592130422592163, "learning_rate": 5e-05, "epoch": 0.7731958762886598, "step": 12075 }, { "loss": 2.1408, "grad_norm": 1.6456536054611206, "learning_rate": 5e-05, "epoch": 0.7735160402125888, "step": 12080 }, { "loss": 2.1562, "grad_norm": 1.637117862701416, "learning_rate": 5e-05, "epoch": 0.7738362041365179, "step": 12085 }, { "loss": 2.1518, "grad_norm": 1.6462478637695312, "learning_rate": 5e-05, "epoch": 0.774156368060447, "step": 12090 }, { "loss": 2.1468, "grad_norm": 1.6634985208511353, "learning_rate": 5e-05, "epoch": 0.774476531984376, "step": 12095 }, { "loss": 2.1616, "grad_norm": 1.602612853050232, "learning_rate": 5e-05, "epoch": 0.774796695908305, "step": 12100 }, { "loss": 2.141, "grad_norm": 1.580141305923462, "learning_rate": 5e-05, "epoch": 0.7751168598322341, "step": 12105 }, { "loss": 2.1513, "grad_norm": 1.6583237648010254, "learning_rate": 5e-05, "epoch": 0.7754370237561632, "step": 12110 }, { "loss": 2.1497, "grad_norm": 1.6343486309051514, "learning_rate": 5e-05, "epoch": 0.7757571876800922, "step": 12115 }, { "loss": 2.1309, "grad_norm": 1.5499143600463867, "learning_rate": 5e-05, "epoch": 0.7760773516040212, "step": 12120 }, { "loss": 2.1546, "grad_norm": 1.6126841306686401, "learning_rate": 5e-05, "epoch": 0.7763975155279503, "step": 12125 }, { "loss": 2.1419, "grad_norm": 1.5695915222167969, "learning_rate": 5e-05, "epoch": 0.7767176794518794, "step": 12130 }, { "loss": 2.165, "grad_norm": 1.7308309078216553, "learning_rate": 5e-05, "epoch": 0.7770378433758084, "step": 12135 }, { "loss": 2.1722, "grad_norm": 1.6199977397918701, "learning_rate": 5e-05, "epoch": 0.7773580072997375, "step": 12140 }, { "loss": 2.1572, "grad_norm": 1.6317228078842163, "learning_rate": 5e-05, "epoch": 0.7776781712236666, "step": 12145 }, { "loss": 2.163, "grad_norm": 1.6066442728042603, "learning_rate": 5e-05, "epoch": 0.7779983351475955, "step": 12150 }, { "loss": 2.1893, "grad_norm": 1.6254138946533203, "learning_rate": 5e-05, "epoch": 0.7783184990715246, "step": 12155 }, { "loss": 2.1504, "grad_norm": 1.650006890296936, "learning_rate": 5e-05, "epoch": 0.7786386629954537, "step": 12160 }, { "loss": 2.1524, "grad_norm": 1.605966567993164, "learning_rate": 5e-05, "epoch": 0.7789588269193827, "step": 12165 }, { "loss": 2.1521, "grad_norm": 1.5684596300125122, "learning_rate": 5e-05, "epoch": 0.7792789908433118, "step": 12170 }, { "loss": 2.1581, "grad_norm": 1.5677785873413086, "learning_rate": 5e-05, "epoch": 0.7795991547672408, "step": 12175 }, { "loss": 2.1275, "grad_norm": 1.6513335704803467, "learning_rate": 5e-05, "epoch": 0.7799193186911699, "step": 12180 }, { "loss": 2.1446, "grad_norm": 1.650525450706482, "learning_rate": 5e-05, "epoch": 0.7802394826150989, "step": 12185 }, { "loss": 2.152, "grad_norm": 1.576680302619934, "learning_rate": 5e-05, "epoch": 0.780559646539028, "step": 12190 }, { "loss": 2.1527, "grad_norm": 1.634871244430542, "learning_rate": 5e-05, "epoch": 0.7808798104629571, "step": 12195 }, { "loss": 2.1764, "grad_norm": 1.6193811893463135, "learning_rate": 5e-05, "epoch": 0.781199974386886, "step": 12200 }, { "eval_loss": 2.030806303024292, "eval_runtime": 10.0212, "eval_samples_per_second": 204.366, "eval_steps_per_second": 25.546, "epoch": 0.781199974386886, "step": 12200 }, { "loss": 2.1441, "grad_norm": 1.6034654378890991, "learning_rate": 5e-05, "epoch": 0.7815201383108151, "step": 12205 }, { "loss": 2.1311, "grad_norm": 1.5852737426757812, "learning_rate": 5e-05, "epoch": 0.7818403022347442, "step": 12210 }, { "loss": 2.1637, "grad_norm": 1.6145589351654053, "learning_rate": 5e-05, "epoch": 0.7821604661586733, "step": 12215 }, { "loss": 2.1387, "grad_norm": 1.6558140516281128, "learning_rate": 5e-05, "epoch": 0.7824806300826023, "step": 12220 }, { "loss": 2.1663, "grad_norm": 1.5966055393218994, "learning_rate": 5e-05, "epoch": 0.7828007940065314, "step": 12225 }, { "loss": 2.1405, "grad_norm": 1.6753573417663574, "learning_rate": 5e-05, "epoch": 0.7831209579304604, "step": 12230 }, { "loss": 2.1673, "grad_norm": 1.6698678731918335, "learning_rate": 5e-05, "epoch": 0.7834411218543894, "step": 12235 }, { "loss": 2.1487, "grad_norm": 1.6776996850967407, "learning_rate": 5e-05, "epoch": 0.7837612857783185, "step": 12240 }, { "loss": 2.1743, "grad_norm": 1.6791410446166992, "learning_rate": 5e-05, "epoch": 0.7840814497022476, "step": 12245 }, { "loss": 2.2024, "grad_norm": 1.6838544607162476, "learning_rate": 5e-05, "epoch": 0.7844016136261766, "step": 12250 }, { "loss": 2.1468, "grad_norm": 1.572723627090454, "learning_rate": 5e-05, "epoch": 0.7847217775501056, "step": 12255 }, { "loss": 2.1581, "grad_norm": 1.5989056825637817, "learning_rate": 5e-05, "epoch": 0.7850419414740347, "step": 12260 }, { "loss": 2.1342, "grad_norm": 1.5742263793945312, "learning_rate": 5e-05, "epoch": 0.7853621053979638, "step": 12265 }, { "loss": 2.1455, "grad_norm": 1.6334121227264404, "learning_rate": 5e-05, "epoch": 0.7856822693218928, "step": 12270 }, { "loss": 2.1602, "grad_norm": 1.6674188375473022, "learning_rate": 5e-05, "epoch": 0.7860024332458219, "step": 12275 }, { "loss": 2.1542, "grad_norm": 1.6382502317428589, "learning_rate": 5e-05, "epoch": 0.786322597169751, "step": 12280 }, { "loss": 2.1901, "grad_norm": 1.642731785774231, "learning_rate": 5e-05, "epoch": 0.7866427610936799, "step": 12285 }, { "loss": 2.1802, "grad_norm": 1.6009544134140015, "learning_rate": 5e-05, "epoch": 0.786962925017609, "step": 12290 }, { "loss": 2.1362, "grad_norm": 1.54159414768219, "learning_rate": 5e-05, "epoch": 0.7872830889415381, "step": 12295 }, { "loss": 2.1792, "grad_norm": 1.5700483322143555, "learning_rate": 5e-05, "epoch": 0.7876032528654672, "step": 12300 }, { "loss": 2.1434, "grad_norm": 1.6987546682357788, "learning_rate": 5e-05, "epoch": 0.7879234167893961, "step": 12305 }, { "loss": 2.1475, "grad_norm": 1.595435380935669, "learning_rate": 5e-05, "epoch": 0.7882435807133252, "step": 12310 }, { "loss": 2.1381, "grad_norm": 1.5674047470092773, "learning_rate": 5e-05, "epoch": 0.7885637446372543, "step": 12315 }, { "loss": 2.186, "grad_norm": 1.5283838510513306, "learning_rate": 5e-05, "epoch": 0.7888839085611833, "step": 12320 }, { "loss": 2.1748, "grad_norm": 1.6678552627563477, "learning_rate": 5e-05, "epoch": 0.7892040724851124, "step": 12325 }, { "loss": 2.1484, "grad_norm": 1.5990204811096191, "learning_rate": 5e-05, "epoch": 0.7895242364090415, "step": 12330 }, { "loss": 2.1156, "grad_norm": 1.5516057014465332, "learning_rate": 5e-05, "epoch": 0.7898444003329704, "step": 12335 }, { "loss": 2.1274, "grad_norm": 1.4775924682617188, "learning_rate": 5e-05, "epoch": 0.7901645642568995, "step": 12340 }, { "loss": 2.1675, "grad_norm": 1.6825110912322998, "learning_rate": 5e-05, "epoch": 0.7904847281808286, "step": 12345 }, { "loss": 2.144, "grad_norm": 1.600658655166626, "learning_rate": 5e-05, "epoch": 0.7908048921047577, "step": 12350 }, { "loss": 2.1516, "grad_norm": 1.5016052722930908, "learning_rate": 5e-05, "epoch": 0.7911250560286867, "step": 12355 }, { "loss": 2.1524, "grad_norm": 1.5548697710037231, "learning_rate": 5e-05, "epoch": 0.7914452199526157, "step": 12360 }, { "loss": 2.1653, "grad_norm": 1.5483704805374146, "learning_rate": 5e-05, "epoch": 0.7917653838765448, "step": 12365 }, { "loss": 2.1624, "grad_norm": 1.5267717838287354, "learning_rate": 5e-05, "epoch": 0.7920855478004738, "step": 12370 }, { "loss": 2.1488, "grad_norm": 1.738539457321167, "learning_rate": 5e-05, "epoch": 0.7924057117244029, "step": 12375 }, { "loss": 2.1473, "grad_norm": 1.6237080097198486, "learning_rate": 5e-05, "epoch": 0.792725875648332, "step": 12380 }, { "loss": 2.166, "grad_norm": 1.6005204916000366, "learning_rate": 5e-05, "epoch": 0.793046039572261, "step": 12385 }, { "loss": 2.1519, "grad_norm": 1.5341179370880127, "learning_rate": 5e-05, "epoch": 0.79336620349619, "step": 12390 }, { "loss": 2.1747, "grad_norm": 1.566355586051941, "learning_rate": 5e-05, "epoch": 0.7936863674201191, "step": 12395 }, { "loss": 2.1942, "grad_norm": 1.706238865852356, "learning_rate": 5e-05, "epoch": 0.7940065313440482, "step": 12400 }, { "eval_loss": 2.0258755683898926, "eval_runtime": 11.8297, "eval_samples_per_second": 173.124, "eval_steps_per_second": 21.64, "epoch": 0.7940065313440482, "step": 12400 }, { "loss": 2.1421, "grad_norm": 1.6088368892669678, "learning_rate": 5e-05, "epoch": 0.7943266952679772, "step": 12405 }, { "loss": 2.1716, "grad_norm": 1.6068791151046753, "learning_rate": 5e-05, "epoch": 0.7946468591919063, "step": 12410 }, { "loss": 2.1201, "grad_norm": 1.5572909116744995, "learning_rate": 5e-05, "epoch": 0.7949670231158353, "step": 12415 }, { "loss": 2.1499, "grad_norm": 1.5897774696350098, "learning_rate": 5e-05, "epoch": 0.7952871870397643, "step": 12420 }, { "loss": 2.1673, "grad_norm": 1.5571075677871704, "learning_rate": 5e-05, "epoch": 0.7956073509636934, "step": 12425 }, { "loss": 2.1511, "grad_norm": 1.6140809059143066, "learning_rate": 5e-05, "epoch": 0.7959275148876225, "step": 12430 }, { "loss": 2.1885, "grad_norm": 1.743863821029663, "learning_rate": 5e-05, "epoch": 0.7962476788115516, "step": 12435 }, { "loss": 2.1563, "grad_norm": 1.5255663394927979, "learning_rate": 5e-05, "epoch": 0.7965678427354805, "step": 12440 }, { "loss": 2.1199, "grad_norm": 1.540199875831604, "learning_rate": 5e-05, "epoch": 0.7968880066594096, "step": 12445 }, { "loss": 2.1226, "grad_norm": 1.5515542030334473, "learning_rate": 5e-05, "epoch": 0.7972081705833387, "step": 12450 }, { "loss": 2.1358, "grad_norm": 1.60235595703125, "learning_rate": 5e-05, "epoch": 0.7975283345072677, "step": 12455 }, { "loss": 2.1982, "grad_norm": 1.604455590248108, "learning_rate": 5e-05, "epoch": 0.7978484984311968, "step": 12460 }, { "loss": 2.148, "grad_norm": 1.5921660661697388, "learning_rate": 5e-05, "epoch": 0.7981686623551258, "step": 12465 }, { "loss": 2.1461, "grad_norm": 1.583845615386963, "learning_rate": 5e-05, "epoch": 0.7984888262790549, "step": 12470 }, { "loss": 2.1552, "grad_norm": 1.599627137184143, "learning_rate": 5e-05, "epoch": 0.7988089902029839, "step": 12475 }, { "loss": 2.1764, "grad_norm": 1.6735512018203735, "learning_rate": 5e-05, "epoch": 0.799129154126913, "step": 12480 }, { "loss": 2.1477, "grad_norm": 1.571876049041748, "learning_rate": 5e-05, "epoch": 0.7994493180508421, "step": 12485 }, { "loss": 2.1418, "grad_norm": 1.580018162727356, "learning_rate": 5e-05, "epoch": 0.799769481974771, "step": 12490 }, { "loss": 2.1611, "grad_norm": 1.6157264709472656, "learning_rate": 5e-05, "epoch": 0.8000896458987001, "step": 12495 }, { "loss": 2.1363, "grad_norm": 1.5641580820083618, "learning_rate": 5e-05, "epoch": 0.8004098098226292, "step": 12500 }, { "loss": 2.1749, "grad_norm": 1.6098390817642212, "learning_rate": 5e-05, "epoch": 0.8007299737465582, "step": 12505 }, { "loss": 2.1095, "grad_norm": 1.5591392517089844, "learning_rate": 5e-05, "epoch": 0.8010501376704873, "step": 12510 }, { "loss": 2.1488, "grad_norm": 1.5563864707946777, "learning_rate": 5e-05, "epoch": 0.8013703015944164, "step": 12515 }, { "loss": 2.1685, "grad_norm": 1.564354419708252, "learning_rate": 5e-05, "epoch": 0.8016904655183454, "step": 12520 }, { "loss": 2.1467, "grad_norm": 1.567794680595398, "learning_rate": 5e-05, "epoch": 0.8020106294422744, "step": 12525 }, { "loss": 2.1129, "grad_norm": 1.5561848878860474, "learning_rate": 5e-05, "epoch": 0.8023307933662035, "step": 12530 }, { "loss": 2.1407, "grad_norm": 1.5778709650039673, "learning_rate": 5e-05, "epoch": 0.8026509572901326, "step": 12535 }, { "loss": 2.1594, "grad_norm": 1.5555260181427002, "learning_rate": 5e-05, "epoch": 0.8029711212140616, "step": 12540 }, { "loss": 2.1593, "grad_norm": 1.5886701345443726, "learning_rate": 5e-05, "epoch": 0.8032912851379906, "step": 12545 }, { "loss": 2.1298, "grad_norm": 1.5618678331375122, "learning_rate": 5e-05, "epoch": 0.8036114490619197, "step": 12550 }, { "loss": 2.1498, "grad_norm": 1.7027949094772339, "learning_rate": 5e-05, "epoch": 0.8039316129858488, "step": 12555 }, { "loss": 2.1509, "grad_norm": 1.625081181526184, "learning_rate": 5e-05, "epoch": 0.8042517769097778, "step": 12560 }, { "loss": 2.1231, "grad_norm": 1.631125569343567, "learning_rate": 5e-05, "epoch": 0.8045719408337069, "step": 12565 }, { "loss": 2.165, "grad_norm": 1.5852395296096802, "learning_rate": 5e-05, "epoch": 0.804892104757636, "step": 12570 }, { "loss": 2.1538, "grad_norm": 1.7171579599380493, "learning_rate": 5e-05, "epoch": 0.8052122686815649, "step": 12575 }, { "loss": 2.1685, "grad_norm": 1.660180687904358, "learning_rate": 5e-05, "epoch": 0.805532432605494, "step": 12580 }, { "loss": 2.1729, "grad_norm": 1.589772343635559, "learning_rate": 5e-05, "epoch": 0.8058525965294231, "step": 12585 }, { "loss": 2.1709, "grad_norm": 1.6301699876785278, "learning_rate": 5e-05, "epoch": 0.8061727604533521, "step": 12590 }, { "loss": 2.1447, "grad_norm": 1.6717979907989502, "learning_rate": 5e-05, "epoch": 0.8064929243772812, "step": 12595 }, { "loss": 2.1402, "grad_norm": 1.5903136730194092, "learning_rate": 5e-05, "epoch": 0.8068130883012102, "step": 12600 }, { "eval_loss": 2.0219223499298096, "eval_runtime": 9.19, "eval_samples_per_second": 222.851, "eval_steps_per_second": 27.856, "epoch": 0.8068130883012102, "step": 12600 }, { "loss": 2.1761, "grad_norm": 1.5364222526550293, "learning_rate": 5e-05, "epoch": 0.8071332522251393, "step": 12605 }, { "loss": 2.1475, "grad_norm": 1.5884590148925781, "learning_rate": 5e-05, "epoch": 0.8074534161490683, "step": 12610 }, { "loss": 2.1438, "grad_norm": 1.6371623277664185, "learning_rate": 5e-05, "epoch": 0.8077735800729974, "step": 12615 }, { "loss": 2.1583, "grad_norm": 1.5629013776779175, "learning_rate": 5e-05, "epoch": 0.8080937439969265, "step": 12620 }, { "loss": 2.1505, "grad_norm": 1.6306496858596802, "learning_rate": 5e-05, "epoch": 0.8084139079208554, "step": 12625 }, { "loss": 2.1618, "grad_norm": 1.6803566217422485, "learning_rate": 5e-05, "epoch": 0.8087340718447845, "step": 12630 }, { "loss": 2.155, "grad_norm": 1.5759366750717163, "learning_rate": 5e-05, "epoch": 0.8090542357687136, "step": 12635 }, { "loss": 2.1685, "grad_norm": 1.5736042261123657, "learning_rate": 5e-05, "epoch": 0.8093743996926427, "step": 12640 }, { "loss": 2.1334, "grad_norm": 1.639487862586975, "learning_rate": 5e-05, "epoch": 0.8096945636165717, "step": 12645 }, { "loss": 2.1347, "grad_norm": 1.5598664283752441, "learning_rate": 5e-05, "epoch": 0.8100147275405007, "step": 12650 }, { "loss": 2.163, "grad_norm": 1.6298346519470215, "learning_rate": 5e-05, "epoch": 0.8103348914644298, "step": 12655 }, { "loss": 2.1457, "grad_norm": 1.593445062637329, "learning_rate": 5e-05, "epoch": 0.8106550553883588, "step": 12660 }, { "loss": 2.199, "grad_norm": 1.5960693359375, "learning_rate": 5e-05, "epoch": 0.8109752193122879, "step": 12665 }, { "loss": 2.1398, "grad_norm": 1.595033884048462, "learning_rate": 5e-05, "epoch": 0.811295383236217, "step": 12670 }, { "loss": 2.1474, "grad_norm": 1.6295945644378662, "learning_rate": 5e-05, "epoch": 0.811615547160146, "step": 12675 }, { "loss": 2.1516, "grad_norm": 1.6431686878204346, "learning_rate": 5e-05, "epoch": 0.811935711084075, "step": 12680 }, { "loss": 2.1303, "grad_norm": 1.6344900131225586, "learning_rate": 5e-05, "epoch": 0.8122558750080041, "step": 12685 }, { "loss": 2.1446, "grad_norm": 1.6356950998306274, "learning_rate": 5e-05, "epoch": 0.8125760389319332, "step": 12690 }, { "loss": 2.1369, "grad_norm": 1.6093062162399292, "learning_rate": 5e-05, "epoch": 0.8128962028558622, "step": 12695 }, { "loss": 2.1676, "grad_norm": 1.7104160785675049, "learning_rate": 5e-05, "epoch": 0.8132163667797913, "step": 12700 }, { "loss": 2.1573, "grad_norm": 1.6704941987991333, "learning_rate": 5e-05, "epoch": 0.8135365307037203, "step": 12705 }, { "loss": 2.1416, "grad_norm": 1.568360686302185, "learning_rate": 5e-05, "epoch": 0.8138566946276493, "step": 12710 }, { "loss": 2.1544, "grad_norm": 1.5690499544143677, "learning_rate": 5e-05, "epoch": 0.8141768585515784, "step": 12715 }, { "loss": 2.1495, "grad_norm": 1.6691917181015015, "learning_rate": 5e-05, "epoch": 0.8144970224755075, "step": 12720 }, { "loss": 2.1458, "grad_norm": 1.5928118228912354, "learning_rate": 5e-05, "epoch": 0.8148171863994366, "step": 12725 }, { "loss": 2.1354, "grad_norm": 1.573602557182312, "learning_rate": 5e-05, "epoch": 0.8151373503233655, "step": 12730 }, { "loss": 2.1671, "grad_norm": 1.608485221862793, "learning_rate": 5e-05, "epoch": 0.8154575142472946, "step": 12735 }, { "loss": 2.1242, "grad_norm": 1.6055208444595337, "learning_rate": 5e-05, "epoch": 0.8157776781712237, "step": 12740 }, { "loss": 2.1676, "grad_norm": 1.4978792667388916, "learning_rate": 5e-05, "epoch": 0.8160978420951527, "step": 12745 }, { "loss": 2.1317, "grad_norm": 1.5075442790985107, "learning_rate": 5e-05, "epoch": 0.8164180060190818, "step": 12750 }, { "loss": 2.153, "grad_norm": 1.5905604362487793, "learning_rate": 5e-05, "epoch": 0.8167381699430108, "step": 12755 }, { "loss": 2.1718, "grad_norm": 1.6042873859405518, "learning_rate": 5e-05, "epoch": 0.8170583338669398, "step": 12760 }, { "loss": 2.1638, "grad_norm": 1.515039086341858, "learning_rate": 5e-05, "epoch": 0.8173784977908689, "step": 12765 }, { "loss": 2.1604, "grad_norm": 1.7022101879119873, "learning_rate": 5e-05, "epoch": 0.817698661714798, "step": 12770 }, { "loss": 2.1893, "grad_norm": 1.5627135038375854, "learning_rate": 5e-05, "epoch": 0.8180188256387271, "step": 12775 }, { "loss": 2.1391, "grad_norm": 1.6236927509307861, "learning_rate": 5e-05, "epoch": 0.818338989562656, "step": 12780 }, { "loss": 2.1661, "grad_norm": 1.627759337425232, "learning_rate": 5e-05, "epoch": 0.8186591534865851, "step": 12785 }, { "loss": 2.1579, "grad_norm": 1.5399779081344604, "learning_rate": 5e-05, "epoch": 0.8189793174105142, "step": 12790 }, { "loss": 2.1772, "grad_norm": 1.5167617797851562, "learning_rate": 5e-05, "epoch": 0.8192994813344432, "step": 12795 }, { "loss": 2.1573, "grad_norm": 1.5164883136749268, "learning_rate": 5e-05, "epoch": 0.8196196452583723, "step": 12800 }, { "eval_loss": 2.017529010772705, "eval_runtime": 10.1774, "eval_samples_per_second": 201.231, "eval_steps_per_second": 25.154, "epoch": 0.8196196452583723, "step": 12800 }, { "loss": 2.1762, "grad_norm": 1.6219836473464966, "learning_rate": 5e-05, "epoch": 0.8199398091823014, "step": 12805 }, { "loss": 2.1406, "grad_norm": 1.5230660438537598, "learning_rate": 5e-05, "epoch": 0.8202599731062304, "step": 12810 }, { "loss": 2.1179, "grad_norm": 1.6092220544815063, "learning_rate": 5e-05, "epoch": 0.8205801370301594, "step": 12815 }, { "loss": 2.157, "grad_norm": 1.5099693536758423, "learning_rate": 5e-05, "epoch": 0.8209003009540885, "step": 12820 }, { "loss": 2.1253, "grad_norm": 1.5344569683074951, "learning_rate": 5e-05, "epoch": 0.8212204648780176, "step": 12825 }, { "loss": 2.1538, "grad_norm": 1.6269365549087524, "learning_rate": 5e-05, "epoch": 0.8215406288019466, "step": 12830 }, { "loss": 2.1528, "grad_norm": 1.5916081666946411, "learning_rate": 5e-05, "epoch": 0.8218607927258756, "step": 12835 }, { "loss": 2.1554, "grad_norm": 1.5954616069793701, "learning_rate": 5e-05, "epoch": 0.8221809566498047, "step": 12840 }, { "loss": 2.1217, "grad_norm": 1.5432181358337402, "learning_rate": 5e-05, "epoch": 0.8225011205737337, "step": 12845 }, { "loss": 2.1388, "grad_norm": 1.579157829284668, "learning_rate": 5e-05, "epoch": 0.8228212844976628, "step": 12850 }, { "loss": 2.1628, "grad_norm": 1.5736826658248901, "learning_rate": 5e-05, "epoch": 0.8231414484215919, "step": 12855 }, { "loss": 2.1405, "grad_norm": 1.5886597633361816, "learning_rate": 5e-05, "epoch": 0.823461612345521, "step": 12860 }, { "loss": 2.1396, "grad_norm": 1.6655110120773315, "learning_rate": 5e-05, "epoch": 0.8237817762694499, "step": 12865 }, { "loss": 2.157, "grad_norm": 1.6713991165161133, "learning_rate": 5e-05, "epoch": 0.824101940193379, "step": 12870 }, { "loss": 2.1416, "grad_norm": 1.5701757669448853, "learning_rate": 5e-05, "epoch": 0.8244221041173081, "step": 12875 }, { "loss": 2.1355, "grad_norm": 1.609489917755127, "learning_rate": 5e-05, "epoch": 0.8247422680412371, "step": 12880 }, { "loss": 2.1341, "grad_norm": 1.637588620185852, "learning_rate": 5e-05, "epoch": 0.8250624319651662, "step": 12885 }, { "loss": 2.1274, "grad_norm": 1.5560768842697144, "learning_rate": 5e-05, "epoch": 0.8253825958890952, "step": 12890 }, { "loss": 2.1296, "grad_norm": 1.547018051147461, "learning_rate": 5e-05, "epoch": 0.8257027598130243, "step": 12895 }, { "loss": 2.1605, "grad_norm": 1.576536774635315, "learning_rate": 5e-05, "epoch": 0.8260229237369533, "step": 12900 }, { "loss": 2.1281, "grad_norm": 1.5521260499954224, "learning_rate": 5e-05, "epoch": 0.8263430876608824, "step": 12905 }, { "loss": 2.1792, "grad_norm": 1.5745978355407715, "learning_rate": 5e-05, "epoch": 0.8266632515848115, "step": 12910 }, { "loss": 2.1682, "grad_norm": 1.5464338064193726, "learning_rate": 5e-05, "epoch": 0.8269834155087404, "step": 12915 }, { "loss": 2.1368, "grad_norm": 1.6071354150772095, "learning_rate": 5e-05, "epoch": 0.8273035794326695, "step": 12920 }, { "loss": 2.1642, "grad_norm": 1.6548957824707031, "learning_rate": 5e-05, "epoch": 0.8276237433565986, "step": 12925 }, { "loss": 2.1636, "grad_norm": 1.6200032234191895, "learning_rate": 5e-05, "epoch": 0.8279439072805276, "step": 12930 }, { "loss": 2.1587, "grad_norm": 1.5933823585510254, "learning_rate": 5e-05, "epoch": 0.8282640712044567, "step": 12935 }, { "loss": 2.1459, "grad_norm": 1.6406372785568237, "learning_rate": 5e-05, "epoch": 0.8285842351283857, "step": 12940 }, { "loss": 2.1455, "grad_norm": 1.5688544511795044, "learning_rate": 5e-05, "epoch": 0.8289043990523148, "step": 12945 }, { "loss": 2.1704, "grad_norm": 1.5574911832809448, "learning_rate": 5e-05, "epoch": 0.8292245629762438, "step": 12950 }, { "loss": 2.1366, "grad_norm": 1.5636203289031982, "learning_rate": 5e-05, "epoch": 0.8295447269001729, "step": 12955 }, { "loss": 2.1216, "grad_norm": 1.5622069835662842, "learning_rate": 5e-05, "epoch": 0.829864890824102, "step": 12960 }, { "loss": 2.1306, "grad_norm": 1.5027891397476196, "learning_rate": 5e-05, "epoch": 0.830185054748031, "step": 12965 }, { "loss": 2.1621, "grad_norm": 1.565948486328125, "learning_rate": 5e-05, "epoch": 0.83050521867196, "step": 12970 }, { "loss": 2.1489, "grad_norm": 1.7101725339889526, "learning_rate": 5e-05, "epoch": 0.8308253825958891, "step": 12975 }, { "loss": 2.1543, "grad_norm": 1.6061644554138184, "learning_rate": 5e-05, "epoch": 0.8311455465198182, "step": 12980 }, { "loss": 2.1695, "grad_norm": 1.5707058906555176, "learning_rate": 5e-05, "epoch": 0.8314657104437472, "step": 12985 }, { "loss": 2.1433, "grad_norm": 1.5309937000274658, "learning_rate": 5e-05, "epoch": 0.8317858743676763, "step": 12990 }, { "loss": 2.1318, "grad_norm": 1.538222074508667, "learning_rate": 5e-05, "epoch": 0.8321060382916053, "step": 12995 }, { "loss": 2.1446, "grad_norm": 1.738500714302063, "learning_rate": 5e-05, "epoch": 0.8324262022155343, "step": 13000 }, { "eval_loss": 2.0216612815856934, "eval_runtime": 9.2438, "eval_samples_per_second": 221.554, "eval_steps_per_second": 27.694, "epoch": 0.8324262022155343, "step": 13000 }, { "loss": 2.1179, "grad_norm": 1.625812292098999, "learning_rate": 5e-05, "epoch": 0.8327463661394634, "step": 13005 }, { "loss": 2.1423, "grad_norm": 1.795238733291626, "learning_rate": 5e-05, "epoch": 0.8330665300633925, "step": 13010 }, { "loss": 2.1532, "grad_norm": 1.6200240850448608, "learning_rate": 5e-05, "epoch": 0.8333866939873215, "step": 13015 }, { "loss": 2.1766, "grad_norm": 1.5638982057571411, "learning_rate": 5e-05, "epoch": 0.8337068579112505, "step": 13020 }, { "loss": 2.1575, "grad_norm": 1.6555575132369995, "learning_rate": 5e-05, "epoch": 0.8340270218351796, "step": 13025 }, { "loss": 2.1378, "grad_norm": 1.560592532157898, "learning_rate": 5e-05, "epoch": 0.8343471857591087, "step": 13030 }, { "loss": 2.144, "grad_norm": 1.5742806196212769, "learning_rate": 5e-05, "epoch": 0.8346673496830377, "step": 13035 }, { "loss": 2.1508, "grad_norm": 1.6514239311218262, "learning_rate": 5e-05, "epoch": 0.8349875136069668, "step": 13040 }, { "loss": 2.1589, "grad_norm": 1.5992977619171143, "learning_rate": 5e-05, "epoch": 0.8353076775308959, "step": 13045 }, { "loss": 2.1343, "grad_norm": 1.783820629119873, "learning_rate": 5e-05, "epoch": 0.8356278414548248, "step": 13050 }, { "loss": 2.1822, "grad_norm": 1.6316050291061401, "learning_rate": 5e-05, "epoch": 0.8359480053787539, "step": 13055 }, { "loss": 2.1543, "grad_norm": 1.723607063293457, "learning_rate": 5e-05, "epoch": 0.836268169302683, "step": 13060 }, { "loss": 2.1511, "grad_norm": 1.5759859085083008, "learning_rate": 5e-05, "epoch": 0.8365883332266121, "step": 13065 }, { "loss": 2.162, "grad_norm": 1.5511044263839722, "learning_rate": 5e-05, "epoch": 0.836908497150541, "step": 13070 }, { "loss": 2.1452, "grad_norm": 1.5764498710632324, "learning_rate": 5e-05, "epoch": 0.8372286610744701, "step": 13075 }, { "loss": 2.1494, "grad_norm": 1.6332306861877441, "learning_rate": 5e-05, "epoch": 0.8375488249983992, "step": 13080 }, { "loss": 2.1495, "grad_norm": 1.5492876768112183, "learning_rate": 5e-05, "epoch": 0.8378689889223282, "step": 13085 }, { "loss": 2.1547, "grad_norm": 1.6958086490631104, "learning_rate": 5e-05, "epoch": 0.8381891528462573, "step": 13090 }, { "loss": 2.1328, "grad_norm": 1.6327171325683594, "learning_rate": 5e-05, "epoch": 0.8385093167701864, "step": 13095 }, { "loss": 2.1196, "grad_norm": 1.4885859489440918, "learning_rate": 5e-05, "epoch": 0.8388294806941153, "step": 13100 }, { "loss": 2.1367, "grad_norm": 1.5167481899261475, "learning_rate": 5e-05, "epoch": 0.8391496446180444, "step": 13105 }, { "loss": 2.1373, "grad_norm": 1.5418943166732788, "learning_rate": 5e-05, "epoch": 0.8394698085419735, "step": 13110 }, { "loss": 2.1341, "grad_norm": 1.5980948209762573, "learning_rate": 5e-05, "epoch": 0.8397899724659026, "step": 13115 }, { "loss": 2.1283, "grad_norm": 1.5357916355133057, "learning_rate": 5e-05, "epoch": 0.8401101363898316, "step": 13120 }, { "loss": 2.1357, "grad_norm": 1.6638524532318115, "learning_rate": 5e-05, "epoch": 0.8404303003137606, "step": 13125 }, { "loss": 2.1465, "grad_norm": 1.622886061668396, "learning_rate": 5e-05, "epoch": 0.8407504642376897, "step": 13130 }, { "loss": 2.1373, "grad_norm": 1.618874430656433, "learning_rate": 5e-05, "epoch": 0.8410706281616187, "step": 13135 }, { "loss": 2.1567, "grad_norm": 1.6146790981292725, "learning_rate": 5e-05, "epoch": 0.8413907920855478, "step": 13140 }, { "loss": 2.1464, "grad_norm": 1.5791168212890625, "learning_rate": 5e-05, "epoch": 0.8417109560094769, "step": 13145 }, { "loss": 2.158, "grad_norm": 1.506654143333435, "learning_rate": 5e-05, "epoch": 0.842031119933406, "step": 13150 }, { "loss": 2.1604, "grad_norm": 1.5425060987472534, "learning_rate": 5e-05, "epoch": 0.8423512838573349, "step": 13155 }, { "loss": 2.156, "grad_norm": 1.5820544958114624, "learning_rate": 5e-05, "epoch": 0.842671447781264, "step": 13160 }, { "loss": 2.136, "grad_norm": 1.6569517850875854, "learning_rate": 5e-05, "epoch": 0.8429916117051931, "step": 13165 }, { "loss": 2.135, "grad_norm": 1.7146352529525757, "learning_rate": 5e-05, "epoch": 0.8433117756291221, "step": 13170 }, { "loss": 2.1443, "grad_norm": 1.6757546663284302, "learning_rate": 5e-05, "epoch": 0.8436319395530512, "step": 13175 }, { "loss": 2.1559, "grad_norm": 1.5445369482040405, "learning_rate": 5e-05, "epoch": 0.8439521034769802, "step": 13180 }, { "loss": 2.1461, "grad_norm": 1.6682908535003662, "learning_rate": 5e-05, "epoch": 0.8442722674009092, "step": 13185 }, { "loss": 2.1599, "grad_norm": 1.5197023153305054, "learning_rate": 5e-05, "epoch": 0.8445924313248383, "step": 13190 }, { "loss": 2.1202, "grad_norm": 1.605768084526062, "learning_rate": 5e-05, "epoch": 0.8449125952487674, "step": 13195 }, { "loss": 2.1589, "grad_norm": 1.7030861377716064, "learning_rate": 5e-05, "epoch": 0.8452327591726965, "step": 13200 }, { "eval_loss": 2.025972843170166, "eval_runtime": 8.9924, "eval_samples_per_second": 227.748, "eval_steps_per_second": 28.469, "epoch": 0.8452327591726965, "step": 13200 }, { "loss": 2.1577, "grad_norm": 1.5597788095474243, "learning_rate": 5e-05, "epoch": 0.8455529230966254, "step": 13205 }, { "loss": 2.1244, "grad_norm": 1.5069013833999634, "learning_rate": 5e-05, "epoch": 0.8458730870205545, "step": 13210 }, { "loss": 2.1457, "grad_norm": 1.4982860088348389, "learning_rate": 5e-05, "epoch": 0.8461932509444836, "step": 13215 }, { "loss": 2.1392, "grad_norm": 1.5416525602340698, "learning_rate": 5e-05, "epoch": 0.8465134148684126, "step": 13220 }, { "loss": 2.1341, "grad_norm": 1.5496692657470703, "learning_rate": 5e-05, "epoch": 0.8468335787923417, "step": 13225 }, { "loss": 2.1419, "grad_norm": 1.55703604221344, "learning_rate": 5e-05, "epoch": 0.8471537427162708, "step": 13230 }, { "loss": 2.1623, "grad_norm": 1.5972099304199219, "learning_rate": 5e-05, "epoch": 0.8474739066401998, "step": 13235 }, { "loss": 2.15, "grad_norm": 1.569321870803833, "learning_rate": 5e-05, "epoch": 0.8477940705641288, "step": 13240 }, { "loss": 2.1483, "grad_norm": 1.549758791923523, "learning_rate": 5e-05, "epoch": 0.8481142344880579, "step": 13245 }, { "loss": 2.156, "grad_norm": 1.6335357427597046, "learning_rate": 5e-05, "epoch": 0.848434398411987, "step": 13250 }, { "loss": 2.1435, "grad_norm": 1.6042168140411377, "learning_rate": 5e-05, "epoch": 0.848754562335916, "step": 13255 }, { "loss": 2.1489, "grad_norm": 1.5949348211288452, "learning_rate": 5e-05, "epoch": 0.849074726259845, "step": 13260 }, { "loss": 2.1586, "grad_norm": 1.5524771213531494, "learning_rate": 5e-05, "epoch": 0.8493948901837741, "step": 13265 }, { "loss": 2.1251, "grad_norm": 1.6360365152359009, "learning_rate": 5e-05, "epoch": 0.8497150541077031, "step": 13270 }, { "loss": 2.1508, "grad_norm": 1.500728964805603, "learning_rate": 5e-05, "epoch": 0.8500352180316322, "step": 13275 }, { "loss": 2.1457, "grad_norm": 1.6618624925613403, "learning_rate": 5e-05, "epoch": 0.8503553819555613, "step": 13280 }, { "loss": 2.1543, "grad_norm": 1.5445833206176758, "learning_rate": 5e-05, "epoch": 0.8506755458794903, "step": 13285 }, { "loss": 2.1769, "grad_norm": 1.5497047901153564, "learning_rate": 5e-05, "epoch": 0.8509957098034193, "step": 13290 }, { "loss": 2.1584, "grad_norm": 1.6006513833999634, "learning_rate": 5e-05, "epoch": 0.8513158737273484, "step": 13295 }, { "loss": 2.1348, "grad_norm": 1.671229600906372, "learning_rate": 5e-05, "epoch": 0.8516360376512775, "step": 13300 }, { "loss": 2.1527, "grad_norm": 1.5722646713256836, "learning_rate": 5e-05, "epoch": 0.8519562015752065, "step": 13305 }, { "loss": 2.156, "grad_norm": 1.5288598537445068, "learning_rate": 5e-05, "epoch": 0.8522763654991355, "step": 13310 }, { "loss": 2.1362, "grad_norm": 1.6392875909805298, "learning_rate": 5e-05, "epoch": 0.8525965294230646, "step": 13315 }, { "loss": 2.1416, "grad_norm": 1.5657522678375244, "learning_rate": 5e-05, "epoch": 0.8529166933469937, "step": 13320 }, { "loss": 2.1231, "grad_norm": 1.6025290489196777, "learning_rate": 5e-05, "epoch": 0.8532368572709227, "step": 13325 }, { "loss": 2.1308, "grad_norm": 1.575181245803833, "learning_rate": 5e-05, "epoch": 0.8535570211948518, "step": 13330 }, { "loss": 2.1447, "grad_norm": 1.7142658233642578, "learning_rate": 5e-05, "epoch": 0.8538771851187809, "step": 13335 }, { "loss": 2.1016, "grad_norm": 1.573530912399292, "learning_rate": 5e-05, "epoch": 0.8541973490427098, "step": 13340 }, { "loss": 2.1634, "grad_norm": 1.64275324344635, "learning_rate": 5e-05, "epoch": 0.8545175129666389, "step": 13345 }, { "loss": 2.1234, "grad_norm": 1.6251364946365356, "learning_rate": 5e-05, "epoch": 0.854837676890568, "step": 13350 }, { "loss": 2.1461, "grad_norm": 1.5351976156234741, "learning_rate": 5e-05, "epoch": 0.855157840814497, "step": 13355 }, { "loss": 2.1326, "grad_norm": 1.557486891746521, "learning_rate": 5e-05, "epoch": 0.8554780047384261, "step": 13360 }, { "loss": 2.1336, "grad_norm": 1.638458013534546, "learning_rate": 5e-05, "epoch": 0.8557981686623551, "step": 13365 }, { "loss": 2.1435, "grad_norm": 1.5943306684494019, "learning_rate": 5e-05, "epoch": 0.8561183325862842, "step": 13370 }, { "loss": 2.1446, "grad_norm": 1.5555797815322876, "learning_rate": 5e-05, "epoch": 0.8564384965102132, "step": 13375 }, { "loss": 2.1498, "grad_norm": 1.5393874645233154, "learning_rate": 5e-05, "epoch": 0.8567586604341423, "step": 13380 }, { "loss": 2.1332, "grad_norm": 1.7171132564544678, "learning_rate": 5e-05, "epoch": 0.8570788243580714, "step": 13385 }, { "loss": 2.1282, "grad_norm": 1.6407911777496338, "learning_rate": 5e-05, "epoch": 0.8573989882820003, "step": 13390 }, { "loss": 2.1315, "grad_norm": 1.6605337858200073, "learning_rate": 5e-05, "epoch": 0.8577191522059294, "step": 13395 }, { "loss": 2.129, "grad_norm": 1.655444860458374, "learning_rate": 5e-05, "epoch": 0.8580393161298585, "step": 13400 }, { "eval_loss": 2.0151822566986084, "eval_runtime": 12.5576, "eval_samples_per_second": 163.088, "eval_steps_per_second": 20.386, "epoch": 0.8580393161298585, "step": 13400 }, { "loss": 2.1229, "grad_norm": 1.6577417850494385, "learning_rate": 5e-05, "epoch": 0.8583594800537876, "step": 13405 }, { "loss": 2.1458, "grad_norm": 1.5821425914764404, "learning_rate": 5e-05, "epoch": 0.8586796439777166, "step": 13410 }, { "loss": 2.1436, "grad_norm": 1.529543161392212, "learning_rate": 5e-05, "epoch": 0.8589998079016457, "step": 13415 }, { "loss": 2.1617, "grad_norm": 1.5528123378753662, "learning_rate": 5e-05, "epoch": 0.8593199718255747, "step": 13420 }, { "loss": 2.1315, "grad_norm": 1.5557422637939453, "learning_rate": 5e-05, "epoch": 0.8596401357495037, "step": 13425 }, { "loss": 2.1229, "grad_norm": 1.7199292182922363, "learning_rate": 5e-05, "epoch": 0.8599602996734328, "step": 13430 }, { "loss": 2.1343, "grad_norm": 1.5561965703964233, "learning_rate": 5e-05, "epoch": 0.8602804635973619, "step": 13435 }, { "loss": 2.1587, "grad_norm": 1.6795167922973633, "learning_rate": 5e-05, "epoch": 0.8606006275212909, "step": 13440 }, { "loss": 2.1469, "grad_norm": 1.5269910097122192, "learning_rate": 5e-05, "epoch": 0.8609207914452199, "step": 13445 }, { "loss": 2.1506, "grad_norm": 1.5669056177139282, "learning_rate": 5e-05, "epoch": 0.861240955369149, "step": 13450 }, { "loss": 2.1485, "grad_norm": 1.6965998411178589, "learning_rate": 5e-05, "epoch": 0.8615611192930781, "step": 13455 }, { "loss": 2.1213, "grad_norm": 1.7064182758331299, "learning_rate": 5e-05, "epoch": 0.8618812832170071, "step": 13460 }, { "loss": 2.1397, "grad_norm": 1.587528944015503, "learning_rate": 5e-05, "epoch": 0.8622014471409362, "step": 13465 }, { "loss": 2.1396, "grad_norm": 1.616944670677185, "learning_rate": 5e-05, "epoch": 0.8625216110648652, "step": 13470 }, { "loss": 2.1513, "grad_norm": 1.6129498481750488, "learning_rate": 5e-05, "epoch": 0.8628417749887942, "step": 13475 }, { "loss": 2.1434, "grad_norm": 1.612741231918335, "learning_rate": 5e-05, "epoch": 0.8631619389127233, "step": 13480 }, { "loss": 2.1399, "grad_norm": 1.6746602058410645, "learning_rate": 5e-05, "epoch": 0.8634821028366524, "step": 13485 }, { "loss": 2.1382, "grad_norm": 1.6259815692901611, "learning_rate": 5e-05, "epoch": 0.8638022667605815, "step": 13490 }, { "loss": 2.1307, "grad_norm": 1.5910407304763794, "learning_rate": 5e-05, "epoch": 0.8641224306845104, "step": 13495 }, { "loss": 2.1576, "grad_norm": 1.6779932975769043, "learning_rate": 5e-05, "epoch": 0.8644425946084395, "step": 13500 }, { "loss": 2.1316, "grad_norm": 1.5591740608215332, "learning_rate": 5e-05, "epoch": 0.8647627585323686, "step": 13505 }, { "loss": 2.1338, "grad_norm": 1.5584458112716675, "learning_rate": 5e-05, "epoch": 0.8650829224562976, "step": 13510 }, { "loss": 2.147, "grad_norm": 1.5230597257614136, "learning_rate": 5e-05, "epoch": 0.8654030863802267, "step": 13515 }, { "loss": 2.1538, "grad_norm": 1.5795865058898926, "learning_rate": 5e-05, "epoch": 0.8657232503041558, "step": 13520 }, { "loss": 2.1305, "grad_norm": 1.5907702445983887, "learning_rate": 5e-05, "epoch": 0.8660434142280847, "step": 13525 }, { "loss": 2.1522, "grad_norm": 1.5211584568023682, "learning_rate": 5e-05, "epoch": 0.8663635781520138, "step": 13530 }, { "loss": 2.1708, "grad_norm": 1.5619218349456787, "learning_rate": 5e-05, "epoch": 0.8666837420759429, "step": 13535 }, { "loss": 2.121, "grad_norm": 1.7085723876953125, "learning_rate": 5e-05, "epoch": 0.867003905999872, "step": 13540 }, { "loss": 2.1772, "grad_norm": 1.5624220371246338, "learning_rate": 5e-05, "epoch": 0.867324069923801, "step": 13545 }, { "loss": 2.1555, "grad_norm": 1.5626081228256226, "learning_rate": 5e-05, "epoch": 0.86764423384773, "step": 13550 }, { "loss": 2.1631, "grad_norm": 1.6283838748931885, "learning_rate": 5e-05, "epoch": 0.8679643977716591, "step": 13555 }, { "loss": 2.1582, "grad_norm": 1.5527262687683105, "learning_rate": 5e-05, "epoch": 0.8682845616955881, "step": 13560 }, { "loss": 2.1323, "grad_norm": 1.5154231786727905, "learning_rate": 5e-05, "epoch": 0.8686047256195172, "step": 13565 }, { "loss": 2.1506, "grad_norm": 1.5625056028366089, "learning_rate": 5e-05, "epoch": 0.8689248895434463, "step": 13570 }, { "loss": 2.1419, "grad_norm": 1.569746494293213, "learning_rate": 5e-05, "epoch": 0.8692450534673754, "step": 13575 }, { "loss": 2.1709, "grad_norm": 1.5288923978805542, "learning_rate": 5e-05, "epoch": 0.8695652173913043, "step": 13580 }, { "loss": 2.149, "grad_norm": 1.5755292177200317, "learning_rate": 5e-05, "epoch": 0.8698853813152334, "step": 13585 }, { "loss": 2.1528, "grad_norm": 1.5749847888946533, "learning_rate": 5e-05, "epoch": 0.8702055452391625, "step": 13590 }, { "loss": 2.1523, "grad_norm": 1.5499904155731201, "learning_rate": 5e-05, "epoch": 0.8705257091630915, "step": 13595 }, { "loss": 2.1692, "grad_norm": 1.6534972190856934, "learning_rate": 5e-05, "epoch": 0.8708458730870205, "step": 13600 }, { "eval_loss": 2.002166748046875, "eval_runtime": 9.601, "eval_samples_per_second": 213.311, "eval_steps_per_second": 26.664, "epoch": 0.8708458730870205, "step": 13600 }, { "loss": 2.1695, "grad_norm": 1.6838092803955078, "learning_rate": 5e-05, "epoch": 0.8711660370109496, "step": 13605 }, { "loss": 2.1425, "grad_norm": 1.5831983089447021, "learning_rate": 5e-05, "epoch": 0.8714862009348786, "step": 13610 }, { "loss": 2.1434, "grad_norm": 1.620985746383667, "learning_rate": 5e-05, "epoch": 0.8718063648588077, "step": 13615 }, { "loss": 2.1263, "grad_norm": 1.5088939666748047, "learning_rate": 5e-05, "epoch": 0.8721265287827368, "step": 13620 }, { "loss": 2.1572, "grad_norm": 1.5807956457138062, "learning_rate": 5e-05, "epoch": 0.8724466927066659, "step": 13625 }, { "loss": 2.1683, "grad_norm": 1.6206260919570923, "learning_rate": 5e-05, "epoch": 0.8727668566305948, "step": 13630 }, { "loss": 2.1358, "grad_norm": 1.5690107345581055, "learning_rate": 5e-05, "epoch": 0.8730870205545239, "step": 13635 }, { "loss": 2.1412, "grad_norm": 1.515356183052063, "learning_rate": 5e-05, "epoch": 0.873407184478453, "step": 13640 }, { "loss": 2.1233, "grad_norm": 1.603214979171753, "learning_rate": 5e-05, "epoch": 0.873727348402382, "step": 13645 }, { "loss": 2.1411, "grad_norm": 1.5593687295913696, "learning_rate": 5e-05, "epoch": 0.8740475123263111, "step": 13650 }, { "loss": 2.1334, "grad_norm": 1.5858949422836304, "learning_rate": 5e-05, "epoch": 0.8743676762502401, "step": 13655 }, { "loss": 2.1357, "grad_norm": 1.5124623775482178, "learning_rate": 5e-05, "epoch": 0.8746878401741692, "step": 13660 }, { "loss": 2.1547, "grad_norm": 1.6396076679229736, "learning_rate": 5e-05, "epoch": 0.8750080040980982, "step": 13665 }, { "loss": 2.1258, "grad_norm": 1.53853440284729, "learning_rate": 5e-05, "epoch": 0.8753281680220273, "step": 13670 }, { "loss": 2.1557, "grad_norm": 1.5771784782409668, "learning_rate": 5e-05, "epoch": 0.8756483319459564, "step": 13675 }, { "loss": 2.1493, "grad_norm": 1.5845115184783936, "learning_rate": 5e-05, "epoch": 0.8759684958698853, "step": 13680 }, { "loss": 2.1492, "grad_norm": 1.4543273448944092, "learning_rate": 5e-05, "epoch": 0.8762886597938144, "step": 13685 }, { "loss": 2.1413, "grad_norm": 1.622414469718933, "learning_rate": 5e-05, "epoch": 0.8766088237177435, "step": 13690 }, { "loss": 2.1324, "grad_norm": 1.5697697401046753, "learning_rate": 5e-05, "epoch": 0.8769289876416725, "step": 13695 }, { "loss": 2.138, "grad_norm": 1.5314058065414429, "learning_rate": 5e-05, "epoch": 0.8772491515656016, "step": 13700 }, { "loss": 2.1417, "grad_norm": 1.576690673828125, "learning_rate": 5e-05, "epoch": 0.8775693154895307, "step": 13705 }, { "loss": 2.1656, "grad_norm": 1.596118450164795, "learning_rate": 5e-05, "epoch": 0.8778894794134597, "step": 13710 }, { "loss": 2.1618, "grad_norm": 1.6418870687484741, "learning_rate": 5e-05, "epoch": 0.8782096433373887, "step": 13715 }, { "loss": 2.1169, "grad_norm": 1.5763425827026367, "learning_rate": 5e-05, "epoch": 0.8785298072613178, "step": 13720 }, { "loss": 2.1293, "grad_norm": 1.6006461381912231, "learning_rate": 5e-05, "epoch": 0.8788499711852469, "step": 13725 }, { "loss": 2.1461, "grad_norm": 1.6010429859161377, "learning_rate": 5e-05, "epoch": 0.8791701351091759, "step": 13730 }, { "loss": 2.1372, "grad_norm": 1.5433728694915771, "learning_rate": 5e-05, "epoch": 0.8794902990331049, "step": 13735 }, { "loss": 2.1334, "grad_norm": 1.6275326013565063, "learning_rate": 5e-05, "epoch": 0.879810462957034, "step": 13740 }, { "loss": 2.1458, "grad_norm": 1.6575015783309937, "learning_rate": 5e-05, "epoch": 0.8801306268809631, "step": 13745 }, { "loss": 2.1217, "grad_norm": 1.591966152191162, "learning_rate": 5e-05, "epoch": 0.8804507908048921, "step": 13750 }, { "loss": 2.1051, "grad_norm": 1.569419503211975, "learning_rate": 5e-05, "epoch": 0.8807709547288212, "step": 13755 }, { "loss": 2.1461, "grad_norm": 1.5138541460037231, "learning_rate": 5e-05, "epoch": 0.8810911186527502, "step": 13760 }, { "loss": 2.1666, "grad_norm": 1.5544683933258057, "learning_rate": 5e-05, "epoch": 0.8814112825766792, "step": 13765 }, { "loss": 2.1572, "grad_norm": 1.6272661685943604, "learning_rate": 5e-05, "epoch": 0.8817314465006083, "step": 13770 }, { "loss": 2.1285, "grad_norm": 1.59254789352417, "learning_rate": 5e-05, "epoch": 0.8820516104245374, "step": 13775 }, { "loss": 2.132, "grad_norm": 1.5939842462539673, "learning_rate": 5e-05, "epoch": 0.8823717743484664, "step": 13780 }, { "loss": 2.1404, "grad_norm": 1.512245774269104, "learning_rate": 5e-05, "epoch": 0.8826919382723954, "step": 13785 }, { "loss": 2.1267, "grad_norm": 1.4971529245376587, "learning_rate": 5e-05, "epoch": 0.8830121021963245, "step": 13790 }, { "loss": 2.1288, "grad_norm": 1.554547667503357, "learning_rate": 5e-05, "epoch": 0.8833322661202536, "step": 13795 }, { "loss": 2.124, "grad_norm": 1.5361340045928955, "learning_rate": 5e-05, "epoch": 0.8836524300441826, "step": 13800 }, { "eval_loss": 2.003274917602539, "eval_runtime": 9.0968, "eval_samples_per_second": 225.134, "eval_steps_per_second": 28.142, "epoch": 0.8836524300441826, "step": 13800 }, { "loss": 2.1452, "grad_norm": 1.6627310514450073, "learning_rate": 5e-05, "epoch": 0.8839725939681117, "step": 13805 }, { "loss": 2.1283, "grad_norm": 1.6165390014648438, "learning_rate": 5e-05, "epoch": 0.8842927578920408, "step": 13810 }, { "loss": 2.1567, "grad_norm": 1.6117839813232422, "learning_rate": 5e-05, "epoch": 0.8846129218159697, "step": 13815 }, { "loss": 2.1299, "grad_norm": 1.6143478155136108, "learning_rate": 5e-05, "epoch": 0.8849330857398988, "step": 13820 }, { "loss": 2.1314, "grad_norm": 1.5183966159820557, "learning_rate": 5e-05, "epoch": 0.8852532496638279, "step": 13825 }, { "loss": 2.1788, "grad_norm": 1.5318809747695923, "learning_rate": 5e-05, "epoch": 0.885573413587757, "step": 13830 }, { "loss": 2.1037, "grad_norm": 1.6234662532806396, "learning_rate": 5e-05, "epoch": 0.885893577511686, "step": 13835 }, { "loss": 2.1407, "grad_norm": 1.6068276166915894, "learning_rate": 5e-05, "epoch": 0.886213741435615, "step": 13840 }, { "loss": 2.1296, "grad_norm": 1.6118892431259155, "learning_rate": 5e-05, "epoch": 0.8865339053595441, "step": 13845 }, { "loss": 2.1491, "grad_norm": 1.5626516342163086, "learning_rate": 5e-05, "epoch": 0.8868540692834731, "step": 13850 }, { "loss": 2.1442, "grad_norm": 1.560748815536499, "learning_rate": 5e-05, "epoch": 0.8871742332074022, "step": 13855 }, { "loss": 2.1449, "grad_norm": 1.6533335447311401, "learning_rate": 5e-05, "epoch": 0.8874943971313313, "step": 13860 }, { "loss": 2.1301, "grad_norm": 1.671207070350647, "learning_rate": 5e-05, "epoch": 0.8878145610552602, "step": 13865 }, { "loss": 2.1246, "grad_norm": 1.6996030807495117, "learning_rate": 5e-05, "epoch": 0.8881347249791893, "step": 13870 }, { "loss": 2.1249, "grad_norm": 1.563635230064392, "learning_rate": 5e-05, "epoch": 0.8884548889031184, "step": 13875 }, { "loss": 2.1062, "grad_norm": 1.5765130519866943, "learning_rate": 5e-05, "epoch": 0.8887750528270475, "step": 13880 }, { "loss": 2.1372, "grad_norm": 1.61628258228302, "learning_rate": 5e-05, "epoch": 0.8890952167509765, "step": 13885 }, { "loss": 2.1548, "grad_norm": 1.6101058721542358, "learning_rate": 5e-05, "epoch": 0.8894153806749056, "step": 13890 }, { "loss": 2.1294, "grad_norm": 1.6023646593093872, "learning_rate": 5e-05, "epoch": 0.8897355445988346, "step": 13895 }, { "loss": 2.1555, "grad_norm": 1.5916895866394043, "learning_rate": 5e-05, "epoch": 0.8900557085227636, "step": 13900 }, { "loss": 2.1373, "grad_norm": 1.6304248571395874, "learning_rate": 5e-05, "epoch": 0.8903758724466927, "step": 13905 }, { "loss": 2.1543, "grad_norm": 1.578668475151062, "learning_rate": 5e-05, "epoch": 0.8906960363706218, "step": 13910 }, { "loss": 2.1368, "grad_norm": 1.7462356090545654, "learning_rate": 5e-05, "epoch": 0.8910162002945509, "step": 13915 }, { "loss": 2.126, "grad_norm": 1.5746142864227295, "learning_rate": 5e-05, "epoch": 0.8913363642184798, "step": 13920 }, { "loss": 2.1295, "grad_norm": 1.598989486694336, "learning_rate": 5e-05, "epoch": 0.8916565281424089, "step": 13925 }, { "loss": 2.1271, "grad_norm": 1.582812786102295, "learning_rate": 5e-05, "epoch": 0.891976692066338, "step": 13930 }, { "loss": 2.1282, "grad_norm": 1.6400415897369385, "learning_rate": 5e-05, "epoch": 0.892296855990267, "step": 13935 }, { "loss": 2.1327, "grad_norm": 1.5493074655532837, "learning_rate": 5e-05, "epoch": 0.8926170199141961, "step": 13940 }, { "loss": 2.1679, "grad_norm": 1.6458313465118408, "learning_rate": 5e-05, "epoch": 0.8929371838381251, "step": 13945 }, { "loss": 2.1539, "grad_norm": 1.5993520021438599, "learning_rate": 5e-05, "epoch": 0.8932573477620541, "step": 13950 }, { "loss": 2.1376, "grad_norm": 1.5654582977294922, "learning_rate": 5e-05, "epoch": 0.8935775116859832, "step": 13955 }, { "loss": 2.1552, "grad_norm": 1.5213743448257446, "learning_rate": 5e-05, "epoch": 0.8938976756099123, "step": 13960 }, { "loss": 2.1289, "grad_norm": 1.5369318723678589, "learning_rate": 5e-05, "epoch": 0.8942178395338414, "step": 13965 }, { "loss": 2.1416, "grad_norm": 1.6027822494506836, "learning_rate": 5e-05, "epoch": 0.8945380034577703, "step": 13970 }, { "loss": 2.1395, "grad_norm": 1.5644904375076294, "learning_rate": 5e-05, "epoch": 0.8948581673816994, "step": 13975 }, { "loss": 2.1543, "grad_norm": 1.6486353874206543, "learning_rate": 5e-05, "epoch": 0.8951783313056285, "step": 13980 }, { "loss": 2.1565, "grad_norm": 1.6289517879486084, "learning_rate": 5e-05, "epoch": 0.8954984952295575, "step": 13985 }, { "loss": 2.1493, "grad_norm": 1.700780987739563, "learning_rate": 5e-05, "epoch": 0.8958186591534866, "step": 13990 }, { "loss": 2.1413, "grad_norm": 1.6621733903884888, "learning_rate": 5e-05, "epoch": 0.8961388230774157, "step": 13995 }, { "loss": 2.1535, "grad_norm": 1.5872445106506348, "learning_rate": 5e-05, "epoch": 0.8964589870013447, "step": 14000 }, { "eval_loss": 2.003190517425537, "eval_runtime": 9.0419, "eval_samples_per_second": 226.5, "eval_steps_per_second": 28.313, "epoch": 0.8964589870013447, "step": 14000 }, { "loss": 2.1151, "grad_norm": 1.5972435474395752, "learning_rate": 5e-05, "epoch": 0.8967791509252737, "step": 14005 }, { "loss": 2.1639, "grad_norm": 1.5537108182907104, "learning_rate": 5e-05, "epoch": 0.8970993148492028, "step": 14010 }, { "loss": 2.1425, "grad_norm": 1.5643787384033203, "learning_rate": 5e-05, "epoch": 0.8974194787731319, "step": 14015 }, { "loss": 2.1339, "grad_norm": 1.5768754482269287, "learning_rate": 5e-05, "epoch": 0.8977396426970609, "step": 14020 }, { "loss": 2.1362, "grad_norm": 1.6462067365646362, "learning_rate": 5e-05, "epoch": 0.8980598066209899, "step": 14025 }, { "loss": 2.1413, "grad_norm": 1.540134072303772, "learning_rate": 5e-05, "epoch": 0.898379970544919, "step": 14030 }, { "loss": 2.1417, "grad_norm": 1.5940558910369873, "learning_rate": 5e-05, "epoch": 0.898700134468848, "step": 14035 }, { "loss": 2.1372, "grad_norm": 1.5724999904632568, "learning_rate": 5e-05, "epoch": 0.8990202983927771, "step": 14040 }, { "loss": 2.1628, "grad_norm": 1.611562728881836, "learning_rate": 5e-05, "epoch": 0.8993404623167062, "step": 14045 }, { "loss": 2.1253, "grad_norm": 1.5111039876937866, "learning_rate": 5e-05, "epoch": 0.8996606262406353, "step": 14050 }, { "loss": 2.17, "grad_norm": 1.5524070262908936, "learning_rate": 5e-05, "epoch": 0.8999807901645642, "step": 14055 }, { "loss": 2.1206, "grad_norm": 1.5320378541946411, "learning_rate": 4.999919096867105e-05, "epoch": 0.9003009540884933, "step": 14060 }, { "loss": 2.1422, "grad_norm": 1.5573452711105347, "learning_rate": 4.99959043686394e-05, "epoch": 0.9006211180124224, "step": 14065 }, { "loss": 2.1634, "grad_norm": 1.5653362274169922, "learning_rate": 4.999008996756062e-05, "epoch": 0.9009412819363514, "step": 14070 }, { "loss": 2.129, "grad_norm": 1.6500377655029297, "learning_rate": 4.998174835343699e-05, "epoch": 0.9012614458602805, "step": 14075 }, { "loss": 2.1201, "grad_norm": 1.5795848369598389, "learning_rate": 4.9970880369844344e-05, "epoch": 0.9015816097842095, "step": 14080 }, { "loss": 2.1409, "grad_norm": 1.48246431350708, "learning_rate": 4.995748711584676e-05, "epoch": 0.9019017737081386, "step": 14085 }, { "loss": 2.1532, "grad_norm": 1.6394940614700317, "learning_rate": 4.9941569945885383e-05, "epoch": 0.9022219376320676, "step": 14090 }, { "loss": 2.1489, "grad_norm": 1.6183077096939087, "learning_rate": 4.992313046964147e-05, "epoch": 0.9025421015559967, "step": 14095 }, { "loss": 2.1459, "grad_norm": 1.5216248035430908, "learning_rate": 4.990217055187362e-05, "epoch": 0.9028622654799258, "step": 14100 }, { "loss": 2.1391, "grad_norm": 1.5191268920898438, "learning_rate": 4.987869231222917e-05, "epoch": 0.9031824294038547, "step": 14105 }, { "loss": 2.1464, "grad_norm": 1.555911660194397, "learning_rate": 4.985269812502983e-05, "epoch": 0.9035025933277838, "step": 14110 }, { "loss": 2.1511, "grad_norm": 1.577181339263916, "learning_rate": 4.9824190619031616e-05, "epoch": 0.9038227572517129, "step": 14115 }, { "loss": 2.1753, "grad_norm": 1.580073595046997, "learning_rate": 4.979317267715895e-05, "epoch": 0.9041429211756419, "step": 14120 }, { "loss": 2.1283, "grad_norm": 1.5809470415115356, "learning_rate": 4.975964743621318e-05, "epoch": 0.904463085099571, "step": 14125 }, { "loss": 2.1146, "grad_norm": 1.4928776025772095, "learning_rate": 4.972361828655526e-05, "epoch": 0.9047832490235, "step": 14130 }, { "loss": 2.1686, "grad_norm": 1.5114343166351318, "learning_rate": 4.968508887176303e-05, "epoch": 0.9051034129474291, "step": 14135 }, { "loss": 2.1386, "grad_norm": 1.6162500381469727, "learning_rate": 4.964406308826261e-05, "epoch": 0.9054235768713581, "step": 14140 }, { "loss": 2.1504, "grad_norm": 1.7190195322036743, "learning_rate": 4.960054508493442e-05, "epoch": 0.9057437407952872, "step": 14145 }, { "loss": 2.0969, "grad_norm": 1.6564409732818604, "learning_rate": 4.9554539262693636e-05, "epoch": 0.9060639047192163, "step": 14150 }, { "loss": 2.1156, "grad_norm": 1.5606110095977783, "learning_rate": 4.9506050274045076e-05, "epoch": 0.9063840686431452, "step": 14155 }, { "loss": 2.133, "grad_norm": 1.4608800411224365, "learning_rate": 4.945508302261271e-05, "epoch": 0.9067042325670743, "step": 14160 }, { "loss": 2.113, "grad_norm": 1.5593559741973877, "learning_rate": 4.940164266264382e-05, "epoch": 0.9070243964910034, "step": 14165 }, { "loss": 2.1405, "grad_norm": 1.5246703624725342, "learning_rate": 4.934573459848768e-05, "epoch": 0.9073445604149325, "step": 14170 }, { "loss": 2.1462, "grad_norm": 1.6130108833312988, "learning_rate": 4.928736448404907e-05, "epoch": 0.9076647243388615, "step": 14175 }, { "loss": 2.1721, "grad_norm": 1.5642423629760742, "learning_rate": 4.9226538222216476e-05, "epoch": 0.9079848882627906, "step": 14180 }, { "loss": 2.1491, "grad_norm": 1.6237996816635132, "learning_rate": 4.9163261964265184e-05, "epoch": 0.9083050521867196, "step": 14185 }, { "loss": 2.1562, "grad_norm": 1.5854694843292236, "learning_rate": 4.909754210923515e-05, "epoch": 0.9086252161106486, "step": 14190 }, { "loss": 2.1569, "grad_norm": 1.578338861465454, "learning_rate": 4.902938530328393e-05, "epoch": 0.9089453800345777, "step": 14195 }, { "loss": 2.119, "grad_norm": 1.6499290466308594, "learning_rate": 4.895879843901451e-05, "epoch": 0.9092655439585068, "step": 14200 }, { "eval_loss": 1.9983034133911133, "eval_runtime": 9.4635, "eval_samples_per_second": 216.41, "eval_steps_per_second": 27.051, "epoch": 0.9092655439585068, "step": 14200 }, { "loss": 2.1602, "grad_norm": 1.5854569673538208, "learning_rate": 4.888578865477831e-05, "epoch": 0.9095857078824358, "step": 14205 }, { "loss": 2.1274, "grad_norm": 1.5863550901412964, "learning_rate": 4.881036333395329e-05, "epoch": 0.9099058718063648, "step": 14210 }, { "loss": 2.1412, "grad_norm": 1.5058035850524902, "learning_rate": 4.873253010419724e-05, "epoch": 0.9102260357302939, "step": 14215 }, { "loss": 2.133, "grad_norm": 1.5665518045425415, "learning_rate": 4.8652296836676435e-05, "epoch": 0.910546199654223, "step": 14220 }, { "loss": 2.1322, "grad_norm": 1.5273241996765137, "learning_rate": 4.856967164526966e-05, "epoch": 0.910866363578152, "step": 14225 }, { "loss": 2.1584, "grad_norm": 2.971830129623413, "learning_rate": 4.84846628857476e-05, "epoch": 0.9111865275020811, "step": 14230 }, { "loss": 2.1475, "grad_norm": 1.5898001194000244, "learning_rate": 4.83972791549279e-05, "epoch": 0.9115066914260102, "step": 14235 }, { "loss": 2.1378, "grad_norm": 1.674157977104187, "learning_rate": 4.8307529289805706e-05, "epoch": 0.9118268553499391, "step": 14240 }, { "loss": 2.1262, "grad_norm": 1.5460021495819092, "learning_rate": 4.821542236666009e-05, "epoch": 0.9121470192738682, "step": 14245 }, { "loss": 2.1465, "grad_norm": 1.5996989011764526, "learning_rate": 4.812096770013609e-05, "epoch": 0.9124671831977973, "step": 14250 }, { "loss": 2.1211, "grad_norm": 1.5312010049819946, "learning_rate": 4.802417484230277e-05, "epoch": 0.9127873471217264, "step": 14255 }, { "loss": 2.1254, "grad_norm": 1.5802662372589111, "learning_rate": 4.792505358168723e-05, "epoch": 0.9131075110456554, "step": 14260 }, { "loss": 2.1521, "grad_norm": 1.5600544214248657, "learning_rate": 4.782361394228472e-05, "epoch": 0.9134276749695844, "step": 14265 }, { "loss": 2.1041, "grad_norm": 1.5304535627365112, "learning_rate": 4.7719866182544894e-05, "epoch": 0.9137478388935135, "step": 14270 }, { "loss": 2.1418, "grad_norm": 1.540677785873413, "learning_rate": 4.761382079433441e-05, "epoch": 0.9140680028174425, "step": 14275 }, { "loss": 2.1354, "grad_norm": 1.5304155349731445, "learning_rate": 4.7505488501875907e-05, "epoch": 0.9143881667413716, "step": 14280 }, { "loss": 2.1853, "grad_norm": 1.581351637840271, "learning_rate": 4.739488026066347e-05, "epoch": 0.9147083306653007, "step": 14285 }, { "loss": 2.1387, "grad_norm": 1.5637761354446411, "learning_rate": 4.728200725635469e-05, "epoch": 0.9150284945892296, "step": 14290 }, { "loss": 2.1256, "grad_norm": 1.5060229301452637, "learning_rate": 4.716688090363953e-05, "epoch": 0.9153486585131587, "step": 14295 }, { "loss": 2.1226, "grad_norm": 1.548572301864624, "learning_rate": 4.7049512845085954e-05, "epoch": 0.9156688224370878, "step": 14300 }, { "loss": 2.1307, "grad_norm": 1.6175365447998047, "learning_rate": 4.692991494996247e-05, "epoch": 0.9159889863610169, "step": 14305 }, { "loss": 2.1273, "grad_norm": 1.5475963354110718, "learning_rate": 4.680809931303792e-05, "epoch": 0.9163091502849459, "step": 14310 }, { "loss": 2.159, "grad_norm": 1.644508719444275, "learning_rate": 4.668407825335823e-05, "epoch": 0.916629314208875, "step": 14315 }, { "loss": 2.1625, "grad_norm": 1.582242727279663, "learning_rate": 4.6557864313000695e-05, "epoch": 0.916949478132804, "step": 14320 }, { "loss": 2.1283, "grad_norm": 1.6576043367385864, "learning_rate": 4.642947025580559e-05, "epoch": 0.917269642056733, "step": 14325 }, { "loss": 2.1409, "grad_norm": 1.511075735092163, "learning_rate": 4.629890906608536e-05, "epoch": 0.9175898059806621, "step": 14330 }, { "loss": 2.1281, "grad_norm": 1.5506491661071777, "learning_rate": 4.6166193947311544e-05, "epoch": 0.9179099699045912, "step": 14335 }, { "loss": 2.1398, "grad_norm": 1.5191504955291748, "learning_rate": 4.6031338320779534e-05, "epoch": 0.9182301338285203, "step": 14340 }, { "loss": 2.1247, "grad_norm": 1.588230848312378, "learning_rate": 4.589435582425131e-05, "epoch": 0.9185502977524492, "step": 14345 }, { "loss": 2.1537, "grad_norm": 1.5594102144241333, "learning_rate": 4.5755260310576234e-05, "epoch": 0.9188704616763783, "step": 14350 }, { "loss": 2.1339, "grad_norm": 1.5035042762756348, "learning_rate": 4.561406584629018e-05, "epoch": 0.9191906256003074, "step": 14355 }, { "loss": 2.1301, "grad_norm": 1.648313283920288, "learning_rate": 4.547078671019294e-05, "epoch": 0.9195107895242364, "step": 14360 }, { "loss": 2.1223, "grad_norm": 1.5859969854354858, "learning_rate": 4.5325437391904316e-05, "epoch": 0.9198309534481655, "step": 14365 }, { "loss": 2.1238, "grad_norm": 1.5037835836410522, "learning_rate": 4.51780325903987e-05, "epoch": 0.9201511173720945, "step": 14370 }, { "loss": 2.1529, "grad_norm": 1.5427237749099731, "learning_rate": 4.5028587212518705e-05, "epoch": 0.9204712812960235, "step": 14375 }, { "loss": 2.1016, "grad_norm": 1.4729586839675903, "learning_rate": 4.487711637146754e-05, "epoch": 0.9207914452199526, "step": 14380 }, { "loss": 2.1257, "grad_norm": 1.5223755836486816, "learning_rate": 4.4723635385280724e-05, "epoch": 0.9211116091438817, "step": 14385 }, { "loss": 2.1473, "grad_norm": 1.5194729566574097, "learning_rate": 4.456815977527694e-05, "epoch": 0.9214317730678108, "step": 14390 }, { "loss": 2.141, "grad_norm": 1.5059860944747925, "learning_rate": 4.4410705264488415e-05, "epoch": 0.9217519369917397, "step": 14395 }, { "loss": 2.1138, "grad_norm": 1.543553113937378, "learning_rate": 4.425128777607084e-05, "epoch": 0.9220721009156688, "step": 14400 }, { "eval_loss": 2.00039005279541, "eval_runtime": 13.1104, "eval_samples_per_second": 156.212, "eval_steps_per_second": 19.526, "epoch": 0.9220721009156688, "step": 14400 }, { "loss": 2.1468, "grad_norm": 1.5583864450454712, "learning_rate": 4.4089923431693136e-05, "epoch": 0.9223922648395979, "step": 14405 }, { "loss": 2.1244, "grad_norm": 1.5649158954620361, "learning_rate": 4.392662854990702e-05, "epoch": 0.9227124287635269, "step": 14410 }, { "loss": 2.1579, "grad_norm": 1.5519460439682007, "learning_rate": 4.376141964449681e-05, "epoch": 0.923032592687456, "step": 14415 }, { "loss": 2.1185, "grad_norm": 1.6381224393844604, "learning_rate": 4.359431342280935e-05, "epoch": 0.923352756611385, "step": 14420 }, { "loss": 2.1166, "grad_norm": 1.5199573040008545, "learning_rate": 4.342532678406444e-05, "epoch": 0.9236729205353141, "step": 14425 }, { "loss": 2.1598, "grad_norm": 1.5303763151168823, "learning_rate": 4.325447681764586e-05, "epoch": 0.9239930844592431, "step": 14430 }, { "loss": 2.1426, "grad_norm": 1.6057510375976562, "learning_rate": 4.3081780801373104e-05, "epoch": 0.9243132483831722, "step": 14435 }, { "loss": 2.0851, "grad_norm": 1.566265344619751, "learning_rate": 4.290725619975413e-05, "epoch": 0.9246334123071013, "step": 14440 }, { "loss": 2.118, "grad_norm": 1.5874476432800293, "learning_rate": 4.27309206622192e-05, "epoch": 0.9249535762310303, "step": 14445 }, { "loss": 2.1075, "grad_norm": 1.531272530555725, "learning_rate": 4.255279202133598e-05, "epoch": 0.9252737401549593, "step": 14450 }, { "loss": 2.1417, "grad_norm": 1.5390820503234863, "learning_rate": 4.237288829100622e-05, "epoch": 0.9255939040788884, "step": 14455 }, { "loss": 2.1287, "grad_norm": 1.5704469680786133, "learning_rate": 4.219122766464396e-05, "epoch": 0.9259140680028174, "step": 14460 }, { "loss": 2.1114, "grad_norm": 1.4786990880966187, "learning_rate": 4.200782851333571e-05, "epoch": 0.9262342319267465, "step": 14465 }, { "loss": 2.1369, "grad_norm": 1.518187403678894, "learning_rate": 4.1822709383982607e-05, "epoch": 0.9265543958506756, "step": 14470 }, { "loss": 2.1398, "grad_norm": 1.5425680875778198, "learning_rate": 4.163588899742474e-05, "epoch": 0.9268745597746046, "step": 14475 }, { "loss": 2.1393, "grad_norm": 1.5290255546569824, "learning_rate": 4.1447386246547995e-05, "epoch": 0.9271947236985336, "step": 14480 }, { "loss": 2.1433, "grad_norm": 1.6013634204864502, "learning_rate": 4.1257220194373424e-05, "epoch": 0.9275148876224627, "step": 14485 }, { "loss": 2.1047, "grad_norm": 1.5584988594055176, "learning_rate": 4.106541007212942e-05, "epoch": 0.9278350515463918, "step": 14490 }, { "loss": 2.1444, "grad_norm": 1.5811387300491333, "learning_rate": 4.0871975277306894e-05, "epoch": 0.9281552154703208, "step": 14495 }, { "loss": 2.1093, "grad_norm": 1.5312178134918213, "learning_rate": 4.067693537169764e-05, "epoch": 0.9284753793942498, "step": 14500 }, { "loss": 2.1307, "grad_norm": 1.6398024559020996, "learning_rate": 4.048031007941607e-05, "epoch": 0.9287955433181789, "step": 14505 }, { "loss": 2.1452, "grad_norm": 1.535327434539795, "learning_rate": 4.028211928490454e-05, "epoch": 0.929115707242108, "step": 14510 }, { "loss": 2.1035, "grad_norm": 1.545807123184204, "learning_rate": 4.008238303092249e-05, "epoch": 0.929435871166037, "step": 14515 }, { "loss": 2.1376, "grad_norm": 1.4855430126190186, "learning_rate": 3.98811215165195e-05, "epoch": 0.9297560350899661, "step": 14520 }, { "loss": 2.1208, "grad_norm": 1.5296348333358765, "learning_rate": 3.9678355094992644e-05, "epoch": 0.9300761990138952, "step": 14525 }, { "loss": 2.1218, "grad_norm": 1.5518286228179932, "learning_rate": 3.9474104271828126e-05, "epoch": 0.9303963629378241, "step": 14530 }, { "loss": 2.1161, "grad_norm": 1.6967196464538574, "learning_rate": 3.926838970262765e-05, "epoch": 0.9307165268617532, "step": 14535 }, { "loss": 2.124, "grad_norm": 1.6289931535720825, "learning_rate": 3.906123219101952e-05, "epoch": 0.9310366907856823, "step": 14540 }, { "loss": 2.1255, "grad_norm": 1.6077406406402588, "learning_rate": 3.885265268655478e-05, "epoch": 0.9313568547096113, "step": 14545 }, { "loss": 2.1284, "grad_norm": 1.5821962356567383, "learning_rate": 3.864267228258866e-05, "epoch": 0.9316770186335404, "step": 14550 }, { "loss": 2.1048, "grad_norm": 1.696872591972351, "learning_rate": 3.843131221414738e-05, "epoch": 0.9319971825574694, "step": 14555 }, { "loss": 2.1074, "grad_norm": 1.5824540853500366, "learning_rate": 3.8218593855780746e-05, "epoch": 0.9323173464813985, "step": 14560 }, { "loss": 2.1297, "grad_norm": 1.553502082824707, "learning_rate": 3.800453871940049e-05, "epoch": 0.9326375104053275, "step": 14565 }, { "loss": 2.106, "grad_norm": 1.4843323230743408, "learning_rate": 3.778916845210487e-05, "epoch": 0.9329576743292566, "step": 14570 }, { "loss": 2.1326, "grad_norm": 1.5960259437561035, "learning_rate": 3.757250483398952e-05, "epoch": 0.9332778382531857, "step": 14575 }, { "loss": 2.1133, "grad_norm": 1.5438355207443237, "learning_rate": 3.735456977594481e-05, "epoch": 0.9335980021771146, "step": 14580 }, { "loss": 2.1361, "grad_norm": 1.5123685598373413, "learning_rate": 3.71353853174401e-05, "epoch": 0.9339181661010437, "step": 14585 }, { "loss": 2.1555, "grad_norm": 1.6334389448165894, "learning_rate": 3.691497362429485e-05, "epoch": 0.9342383300249728, "step": 14590 }, { "loss": 2.1254, "grad_norm": 1.5371829271316528, "learning_rate": 3.669335698643704e-05, "epoch": 0.9345584939489019, "step": 14595 }, { "loss": 2.1017, "grad_norm": 1.5454310178756714, "learning_rate": 3.64705578156491e-05, "epoch": 0.9348786578728309, "step": 14600 }, { "eval_loss": 1.994555950164795, "eval_runtime": 9.3529, "eval_samples_per_second": 218.97, "eval_steps_per_second": 27.371, "epoch": 0.9348786578728309, "step": 14600 }, { "loss": 2.1203, "grad_norm": 1.554782509803772, "learning_rate": 3.624659864330129e-05, "epoch": 0.93519882179676, "step": 14605 }, { "loss": 2.1085, "grad_norm": 1.4939922094345093, "learning_rate": 3.602150211807326e-05, "epoch": 0.935518985720689, "step": 14610 }, { "loss": 2.1079, "grad_norm": 1.5748850107192993, "learning_rate": 3.5795291003663575e-05, "epoch": 0.935839149644618, "step": 14615 }, { "loss": 2.109, "grad_norm": 1.5259861946105957, "learning_rate": 3.556798817648763e-05, "epoch": 0.9361593135685471, "step": 14620 }, { "loss": 2.1514, "grad_norm": 1.5462098121643066, "learning_rate": 3.533961662336424e-05, "epoch": 0.9364794774924762, "step": 14625 }, { "loss": 2.1115, "grad_norm": 1.5656838417053223, "learning_rate": 3.511019943919098e-05, "epoch": 0.9367996414164051, "step": 14630 }, { "loss": 2.123, "grad_norm": 1.528436541557312, "learning_rate": 3.487975982460863e-05, "epoch": 0.9371198053403342, "step": 14635 }, { "loss": 2.1343, "grad_norm": 1.5504084825515747, "learning_rate": 3.4648321083654935e-05, "epoch": 0.9374399692642633, "step": 14640 }, { "loss": 2.1361, "grad_norm": 1.524232029914856, "learning_rate": 3.441590662140792e-05, "epoch": 0.9377601331881924, "step": 14645 }, { "loss": 2.1085, "grad_norm": 1.4713977575302124, "learning_rate": 3.418253994161892e-05, "epoch": 0.9380802971121214, "step": 14650 }, { "loss": 2.115, "grad_norm": 1.5077587366104126, "learning_rate": 3.3948244644335735e-05, "epoch": 0.9384004610360505, "step": 14655 }, { "loss": 2.1197, "grad_norm": 1.5058708190917969, "learning_rate": 3.3713044423515946e-05, "epoch": 0.9387206249599795, "step": 14660 }, { "loss": 2.1231, "grad_norm": 1.5846128463745117, "learning_rate": 3.3476963064630786e-05, "epoch": 0.9390407888839085, "step": 14665 }, { "loss": 2.0968, "grad_norm": 1.511231541633606, "learning_rate": 3.324002444225976e-05, "epoch": 0.9393609528078376, "step": 14670 }, { "loss": 2.1139, "grad_norm": 1.600589632987976, "learning_rate": 3.3002252517676244e-05, "epoch": 0.9396811167317667, "step": 14675 }, { "loss": 2.1021, "grad_norm": 1.4865034818649292, "learning_rate": 3.27636713364243e-05, "epoch": 0.9400012806556958, "step": 14680 }, { "loss": 2.1081, "grad_norm": 1.5039657354354858, "learning_rate": 3.2524305025887e-05, "epoch": 0.9403214445796247, "step": 14685 }, { "loss": 2.1285, "grad_norm": 1.5387564897537231, "learning_rate": 3.228417779284643e-05, "epoch": 0.9406416085035538, "step": 14690 }, { "loss": 2.1274, "grad_norm": 1.550644040107727, "learning_rate": 3.2043313921035743e-05, "epoch": 0.9409617724274829, "step": 14695 }, { "loss": 2.1279, "grad_norm": 1.5030955076217651, "learning_rate": 3.180173776868331e-05, "epoch": 0.9412819363514119, "step": 14700 }, { "loss": 2.1246, "grad_norm": 1.5117207765579224, "learning_rate": 3.155947376604948e-05, "epoch": 0.941602100275341, "step": 14705 }, { "loss": 2.1481, "grad_norm": 1.5794004201889038, "learning_rate": 3.13165464129559e-05, "epoch": 0.94192226419927, "step": 14710 }, { "loss": 2.1415, "grad_norm": 1.4979078769683838, "learning_rate": 3.107298027630797e-05, "epoch": 0.942242428123199, "step": 14715 }, { "loss": 2.1283, "grad_norm": 1.5171302556991577, "learning_rate": 3.082879998761035e-05, "epoch": 0.9425625920471281, "step": 14720 }, { "loss": 2.1219, "grad_norm": 1.5499953031539917, "learning_rate": 3.058403024047607e-05, "epoch": 0.9428827559710572, "step": 14725 }, { "loss": 2.111, "grad_norm": 1.4895777702331543, "learning_rate": 3.033869578812924e-05, "epoch": 0.9432029198949863, "step": 14730 }, { "loss": 2.1037, "grad_norm": 1.5428434610366821, "learning_rate": 3.0092821440901857e-05, "epoch": 0.9435230838189153, "step": 14735 }, { "loss": 2.1368, "grad_norm": 1.5094553232192993, "learning_rate": 2.984643206372471e-05, "epoch": 0.9438432477428443, "step": 14740 }, { "loss": 2.1381, "grad_norm": 1.5818499326705933, "learning_rate": 2.959955257361286e-05, "epoch": 0.9441634116667734, "step": 14745 }, { "loss": 2.0922, "grad_norm": 1.513429880142212, "learning_rate": 2.935220793714582e-05, "epoch": 0.9444835755907024, "step": 14750 }, { "loss": 2.1022, "grad_norm": 1.556632161140442, "learning_rate": 2.9104423167942678e-05, "epoch": 0.9448037395146315, "step": 14755 }, { "loss": 2.1063, "grad_norm": 1.5335612297058105, "learning_rate": 2.885622332413256e-05, "epoch": 0.9451239034385606, "step": 14760 }, { "loss": 2.1308, "grad_norm": 1.5133774280548096, "learning_rate": 2.8607633505820504e-05, "epoch": 0.9454440673624896, "step": 14765 }, { "loss": 2.0859, "grad_norm": 1.4861382246017456, "learning_rate": 2.835867885254912e-05, "epoch": 0.9457642312864186, "step": 14770 }, { "loss": 2.1228, "grad_norm": 1.5895521640777588, "learning_rate": 2.8109384540756267e-05, "epoch": 0.9460843952103477, "step": 14775 }, { "loss": 2.0916, "grad_norm": 1.5556671619415283, "learning_rate": 2.7859775781229013e-05, "epoch": 0.9464045591342768, "step": 14780 }, { "loss": 2.1495, "grad_norm": 1.494764804840088, "learning_rate": 2.7609877816554085e-05, "epoch": 0.9467247230582058, "step": 14785 }, { "loss": 2.1671, "grad_norm": 1.5063127279281616, "learning_rate": 2.7359715918565103e-05, "epoch": 0.9470448869821348, "step": 14790 }, { "loss": 2.1287, "grad_norm": 1.612107515335083, "learning_rate": 2.710931538578692e-05, "epoch": 0.9473650509060639, "step": 14795 }, { "loss": 2.1083, "grad_norm": 1.5210696458816528, "learning_rate": 2.6858701540877185e-05, "epoch": 0.9476852148299929, "step": 14800 }, { "eval_loss": 1.9744480848312378, "eval_runtime": 9.307, "eval_samples_per_second": 220.049, "eval_steps_per_second": 27.506, "epoch": 0.9476852148299929, "step": 14800 }, { "loss": 2.1447, "grad_norm": 1.492371916770935, "learning_rate": 2.660789972806551e-05, "epoch": 0.948005378753922, "step": 14805 }, { "loss": 2.1099, "grad_norm": 1.541387677192688, "learning_rate": 2.635693531059043e-05, "epoch": 0.9483255426778511, "step": 14810 }, { "loss": 2.1105, "grad_norm": 1.4616005420684814, "learning_rate": 2.6105833668134473e-05, "epoch": 0.9486457066017802, "step": 14815 }, { "loss": 2.1132, "grad_norm": 1.4997398853302002, "learning_rate": 2.5854620194257533e-05, "epoch": 0.9489658705257091, "step": 14820 }, { "loss": 2.104, "grad_norm": 1.5401780605316162, "learning_rate": 2.5603320293828866e-05, "epoch": 0.9492860344496382, "step": 14825 }, { "loss": 2.1319, "grad_norm": 1.4212836027145386, "learning_rate": 2.535195938045791e-05, "epoch": 0.9496061983735673, "step": 14830 }, { "loss": 2.1123, "grad_norm": 1.498403549194336, "learning_rate": 2.5100562873924283e-05, "epoch": 0.9499263622974963, "step": 14835 }, { "loss": 2.1251, "grad_norm": 1.5054161548614502, "learning_rate": 2.484915619760707e-05, "epoch": 0.9502465262214254, "step": 14840 }, { "loss": 2.1148, "grad_norm": 1.5165938138961792, "learning_rate": 2.4597764775913813e-05, "epoch": 0.9505666901453544, "step": 14845 }, { "loss": 2.1301, "grad_norm": 1.4614144563674927, "learning_rate": 2.4346414031709386e-05, "epoch": 0.9508868540692835, "step": 14850 }, { "loss": 2.0923, "grad_norm": 1.522401213645935, "learning_rate": 2.409512938374499e-05, "epoch": 0.9512070179932125, "step": 14855 }, { "loss": 2.1349, "grad_norm": 1.5170820951461792, "learning_rate": 2.384393624408761e-05, "epoch": 0.9515271819171416, "step": 14860 }, { "loss": 2.0957, "grad_norm": 1.4755724668502808, "learning_rate": 2.3592860015550146e-05, "epoch": 0.9518473458410707, "step": 14865 }, { "loss": 2.1389, "grad_norm": 1.517958164215088, "learning_rate": 2.334192608912241e-05, "epoch": 0.9521675097649996, "step": 14870 }, { "loss": 2.1245, "grad_norm": 1.4963104724884033, "learning_rate": 2.3091159841403398e-05, "epoch": 0.9524876736889287, "step": 14875 }, { "loss": 2.1243, "grad_norm": 1.4569158554077148, "learning_rate": 2.2840586632035014e-05, "epoch": 0.9528078376128578, "step": 14880 }, { "loss": 2.1134, "grad_norm": 1.542040467262268, "learning_rate": 2.2590231801137447e-05, "epoch": 0.9531280015367868, "step": 14885 }, { "loss": 2.1003, "grad_norm": 1.4777690172195435, "learning_rate": 2.2340120666746577e-05, "epoch": 0.9534481654607159, "step": 14890 }, { "loss": 2.1317, "grad_norm": 1.4963622093200684, "learning_rate": 2.2090278522253604e-05, "epoch": 0.953768329384645, "step": 14895 }, { "loss": 2.1372, "grad_norm": 1.5573043823242188, "learning_rate": 2.1840730633847156e-05, "epoch": 0.954088493308574, "step": 14900 }, { "loss": 2.1405, "grad_norm": 1.5820651054382324, "learning_rate": 2.1591502237958115e-05, "epoch": 0.954408657232503, "step": 14905 }, { "loss": 2.1104, "grad_norm": 1.5473122596740723, "learning_rate": 2.134261853870757e-05, "epoch": 0.9547288211564321, "step": 14910 }, { "loss": 2.1043, "grad_norm": 1.5598843097686768, "learning_rate": 2.1094104705357908e-05, "epoch": 0.9550489850803612, "step": 14915 }, { "loss": 2.1012, "grad_norm": 1.5229307413101196, "learning_rate": 2.0845985869767487e-05, "epoch": 0.9553691490042902, "step": 14920 }, { "loss": 2.0979, "grad_norm": 1.5292434692382812, "learning_rate": 2.0598287123849095e-05, "epoch": 0.9556893129282192, "step": 14925 }, { "loss": 2.1127, "grad_norm": 1.5072777271270752, "learning_rate": 2.0351033517032427e-05, "epoch": 0.9560094768521483, "step": 14930 }, { "loss": 2.1393, "grad_norm": 1.4828133583068848, "learning_rate": 2.0104250053730905e-05, "epoch": 0.9563296407760774, "step": 14935 }, { "loss": 2.1392, "grad_norm": 1.572771668434143, "learning_rate": 1.9857961690812945e-05, "epoch": 0.9566498047000064, "step": 14940 }, { "loss": 2.0975, "grad_norm": 1.4665296077728271, "learning_rate": 1.9612193335078193e-05, "epoch": 0.9569699686239355, "step": 14945 }, { "loss": 2.1348, "grad_norm": 1.47166109085083, "learning_rate": 1.936696984073867e-05, "epoch": 0.9572901325478645, "step": 14950 }, { "loss": 2.0937, "grad_norm": 1.4156910181045532, "learning_rate": 1.9122316006905333e-05, "epoch": 0.9576102964717935, "step": 14955 }, { "loss": 2.1046, "grad_norm": 1.5509635210037231, "learning_rate": 1.887825657508016e-05, "epoch": 0.9579304603957226, "step": 14960 }, { "loss": 2.1508, "grad_norm": 1.4977760314941406, "learning_rate": 1.8634816226654074e-05, "epoch": 0.9582506243196517, "step": 14965 }, { "loss": 2.1104, "grad_norm": 1.4559874534606934, "learning_rate": 1.839201958041096e-05, "epoch": 0.9585707882435807, "step": 14970 }, { "loss": 2.1156, "grad_norm": 1.5095617771148682, "learning_rate": 1.8149891190038e-05, "epoch": 0.9588909521675097, "step": 14975 }, { "loss": 2.0877, "grad_norm": 1.4174879789352417, "learning_rate": 1.7908455541642584e-05, "epoch": 0.9592111160914388, "step": 14980 }, { "loss": 2.1116, "grad_norm": 1.5217187404632568, "learning_rate": 1.7667737051276076e-05, "epoch": 0.9595312800153679, "step": 14985 }, { "loss": 2.1417, "grad_norm": 1.4732062816619873, "learning_rate": 1.742776006246463e-05, "epoch": 0.9598514439392969, "step": 14990 }, { "loss": 2.1204, "grad_norm": 1.4818220138549805, "learning_rate": 1.71885488437474e-05, "epoch": 0.960171607863226, "step": 14995 }, { "loss": 2.0826, "grad_norm": 1.4875595569610596, "learning_rate": 1.695012758622226e-05, "epoch": 0.9604917717871551, "step": 15000 }, { "eval_loss": 1.9857242107391357, "eval_runtime": 15.0438, "eval_samples_per_second": 136.136, "eval_steps_per_second": 17.017, "epoch": 0.9604917717871551, "step": 15000 }, { "loss": 2.0994, "grad_norm": 1.5387358665466309, "learning_rate": 1.6712520401099422e-05, "epoch": 0.960811935711084, "step": 15005 }, { "loss": 2.1318, "grad_norm": 1.4918915033340454, "learning_rate": 1.6475751317263063e-05, "epoch": 0.9611320996350131, "step": 15010 }, { "loss": 2.122, "grad_norm": 1.4722011089324951, "learning_rate": 1.6239844278841366e-05, "epoch": 0.9614522635589422, "step": 15015 }, { "loss": 2.1167, "grad_norm": 1.4381043910980225, "learning_rate": 1.600482314278505e-05, "epoch": 0.9617724274828713, "step": 15020 }, { "loss": 2.0594, "grad_norm": 1.5075242519378662, "learning_rate": 1.5770711676454767e-05, "epoch": 0.9620925914068003, "step": 15025 }, { "loss": 2.1349, "grad_norm": 1.49933660030365, "learning_rate": 1.5537533555217525e-05, "epoch": 0.9624127553307293, "step": 15030 }, { "loss": 2.1218, "grad_norm": 1.4700450897216797, "learning_rate": 1.5305312360052442e-05, "epoch": 0.9627329192546584, "step": 15035 }, { "loss": 2.102, "grad_norm": 1.5126475095748901, "learning_rate": 1.5074071575166057e-05, "epoch": 0.9630530831785874, "step": 15040 }, { "loss": 2.0933, "grad_norm": 1.501659870147705, "learning_rate": 1.4843834585617333e-05, "epoch": 0.9633732471025165, "step": 15045 }, { "loss": 2.1065, "grad_norm": 1.4653816223144531, "learning_rate": 1.4614624674952842e-05, "epoch": 0.9636934110264456, "step": 15050 }, { "loss": 2.1135, "grad_norm": 1.5033470392227173, "learning_rate": 1.4386465022852091e-05, "epoch": 0.9640135749503745, "step": 15055 }, { "loss": 2.1284, "grad_norm": 1.539259672164917, "learning_rate": 1.4159378702783404e-05, "epoch": 0.9643337388743036, "step": 15060 }, { "loss": 2.1003, "grad_norm": 1.4910892248153687, "learning_rate": 1.3933388679670506e-05, "epoch": 0.9646539027982327, "step": 15065 }, { "loss": 2.1035, "grad_norm": 1.4681600332260132, "learning_rate": 1.3708517807570171e-05, "epoch": 0.9649740667221618, "step": 15070 }, { "loss": 2.0992, "grad_norm": 1.4621437788009644, "learning_rate": 1.3484788827360955e-05, "epoch": 0.9652942306460908, "step": 15075 }, { "loss": 2.1292, "grad_norm": 1.438450813293457, "learning_rate": 1.3262224364443493e-05, "epoch": 0.9656143945700199, "step": 15080 }, { "loss": 2.0915, "grad_norm": 1.4840413331985474, "learning_rate": 1.3040846926452386e-05, "epoch": 0.9659345584939489, "step": 15085 }, { "loss": 2.1174, "grad_norm": 1.4679360389709473, "learning_rate": 1.2820678900980093e-05, "epoch": 0.9662547224178779, "step": 15090 }, { "loss": 2.1146, "grad_norm": 1.46363365650177, "learning_rate": 1.260174255331282e-05, "epoch": 0.966574886341807, "step": 15095 }, { "loss": 2.1146, "grad_norm": 1.5261812210083008, "learning_rate": 1.2384060024178956e-05, "epoch": 0.9668950502657361, "step": 15100 }, { "loss": 2.1373, "grad_norm": 1.5290420055389404, "learning_rate": 1.2167653327509926e-05, "epoch": 0.9672152141896652, "step": 15105 }, { "loss": 2.0911, "grad_norm": 1.4681881666183472, "learning_rate": 1.1952544348214028e-05, "epoch": 0.9675353781135941, "step": 15110 }, { "loss": 2.1215, "grad_norm": 1.5597201585769653, "learning_rate": 1.1738754839963159e-05, "epoch": 0.9678555420375232, "step": 15115 }, { "loss": 2.0924, "grad_norm": 1.4340606927871704, "learning_rate": 1.1526306422992994e-05, "epoch": 0.9681757059614523, "step": 15120 }, { "loss": 2.1189, "grad_norm": 1.51260244846344, "learning_rate": 1.1315220581916477e-05, "epoch": 0.9684958698853813, "step": 15125 }, { "loss": 2.117, "grad_norm": 1.4482096433639526, "learning_rate": 1.1105518663551176e-05, "epoch": 0.9688160338093104, "step": 15130 }, { "loss": 2.1359, "grad_norm": 1.4650722742080688, "learning_rate": 1.0897221874760444e-05, "epoch": 0.9691361977332394, "step": 15135 }, { "loss": 2.1113, "grad_norm": 1.5193512439727783, "learning_rate": 1.0690351280308877e-05, "epoch": 0.9694563616571684, "step": 15140 }, { "loss": 2.1186, "grad_norm": 1.5083808898925781, "learning_rate": 1.0484927800731984e-05, "epoch": 0.9697765255810975, "step": 15145 }, { "loss": 2.088, "grad_norm": 1.4598770141601562, "learning_rate": 1.0280972210220578e-05, "epoch": 0.9700966895050266, "step": 15150 }, { "loss": 2.091, "grad_norm": 1.4730453491210938, "learning_rate": 1.0078505134519874e-05, "epoch": 0.9704168534289557, "step": 15155 }, { "loss": 2.0936, "grad_norm": 1.4336142539978027, "learning_rate": 9.87754704884369e-06, "epoch": 0.9707370173528846, "step": 15160 }, { "loss": 2.099, "grad_norm": 1.4537841081619263, "learning_rate": 9.678118275803749e-06, "epoch": 0.9710571812768137, "step": 15165 }, { "loss": 2.0901, "grad_norm": 1.4845715761184692, "learning_rate": 9.480238983354515e-06, "epoch": 0.9713773452007428, "step": 15170 }, { "loss": 2.1275, "grad_norm": 1.5654568672180176, "learning_rate": 9.283929182753659e-06, "epoch": 0.9716975091246718, "step": 15175 }, { "loss": 2.1, "grad_norm": 1.4920252561569214, "learning_rate": 9.089208726538304e-06, "epoch": 0.9720176730486009, "step": 15180 }, { "loss": 2.1293, "grad_norm": 1.5260412693023682, "learning_rate": 8.896097306517388e-06, "epoch": 0.97233783697253, "step": 15185 }, { "loss": 2.1188, "grad_norm": 1.475035548210144, "learning_rate": 8.70461445178025e-06, "epoch": 0.972658000896459, "step": 15190 }, { "loss": 2.1094, "grad_norm": 1.465287208557129, "learning_rate": 8.514779526721713e-06, "epoch": 0.972978164820388, "step": 15195 }, { "loss": 2.1036, "grad_norm": 1.4647490978240967, "learning_rate": 8.32661172908373e-06, "epoch": 0.9732983287443171, "step": 15200 }, { "eval_loss": 1.9738588333129883, "eval_runtime": 13.379, "eval_samples_per_second": 153.076, "eval_steps_per_second": 19.134, "epoch": 0.9732983287443171, "step": 15200 }, { "loss": 2.112, "grad_norm": 1.4496986865997314, "learning_rate": 8.140130088014008e-06, "epoch": 0.9736184926682462, "step": 15205 }, { "loss": 2.0984, "grad_norm": 1.439038634300232, "learning_rate": 7.955353462141554e-06, "epoch": 0.9739386565921752, "step": 15210 }, { "loss": 2.0996, "grad_norm": 1.4874390363693237, "learning_rate": 7.7723005376696e-06, "epoch": 0.9742588205161042, "step": 15215 }, { "loss": 2.0868, "grad_norm": 1.4709018468856812, "learning_rate": 7.5909898264857895e-06, "epoch": 0.9745789844400333, "step": 15220 }, { "loss": 2.1179, "grad_norm": 1.4825295209884644, "learning_rate": 7.411439664290226e-06, "epoch": 0.9748991483639623, "step": 15225 }, { "loss": 2.0792, "grad_norm": 1.4520797729492188, "learning_rate": 7.2336682087410985e-06, "epoch": 0.9752193122878914, "step": 15230 }, { "loss": 2.0882, "grad_norm": 1.4495720863342285, "learning_rate": 7.05769343761849e-06, "epoch": 0.9755394762118205, "step": 15235 }, { "loss": 2.1166, "grad_norm": 1.4639912843704224, "learning_rate": 6.883533147006266e-06, "epoch": 0.9758596401357496, "step": 15240 }, { "loss": 2.0994, "grad_norm": 1.4839041233062744, "learning_rate": 6.7112049494924364e-06, "epoch": 0.9761798040596785, "step": 15245 }, { "loss": 2.0832, "grad_norm": 1.4434202909469604, "learning_rate": 6.540726272387926e-06, "epoch": 0.9764999679836076, "step": 15250 }, { "loss": 2.0947, "grad_norm": 1.4894647598266602, "learning_rate": 6.372114355964293e-06, "epoch": 0.9768201319075367, "step": 15255 }, { "loss": 2.1189, "grad_norm": 1.4748380184173584, "learning_rate": 6.205386251710138e-06, "epoch": 0.9771402958314657, "step": 15260 }, { "loss": 2.0734, "grad_norm": 1.4855523109436035, "learning_rate": 6.040558820606795e-06, "epoch": 0.9774604597553948, "step": 15265 }, { "loss": 2.1203, "grad_norm": 1.4771287441253662, "learning_rate": 5.877648731423133e-06, "epoch": 0.9777806236793238, "step": 15270 }, { "loss": 2.0971, "grad_norm": 1.4301459789276123, "learning_rate": 5.716672459029926e-06, "epoch": 0.9781007876032529, "step": 15275 }, { "loss": 2.0943, "grad_norm": 1.4706257581710815, "learning_rate": 5.557646282733725e-06, "epoch": 0.9784209515271819, "step": 15280 }, { "loss": 2.0998, "grad_norm": 1.485478401184082, "learning_rate": 5.400586284630579e-06, "epoch": 0.978741115451111, "step": 15285 }, { "loss": 2.0991, "grad_norm": 1.4847251176834106, "learning_rate": 5.245508347979675e-06, "epoch": 0.9790612793750401, "step": 15290 }, { "loss": 2.1203, "grad_norm": 1.4288967847824097, "learning_rate": 5.092428155597084e-06, "epoch": 0.979381443298969, "step": 15295 }, { "loss": 2.1178, "grad_norm": 1.4488484859466553, "learning_rate": 4.941361188269775e-06, "epoch": 0.9797016072228981, "step": 15300 }, { "loss": 2.1052, "grad_norm": 1.464503288269043, "learning_rate": 4.792322723190057e-06, "epoch": 0.9800217711468272, "step": 15305 }, { "loss": 2.0986, "grad_norm": 1.5340265035629272, "learning_rate": 4.645327832410648e-06, "epoch": 0.9803419350707562, "step": 15310 }, { "loss": 2.096, "grad_norm": 1.4559099674224854, "learning_rate": 4.500391381320421e-06, "epoch": 0.9806620989946853, "step": 15315 }, { "loss": 2.1186, "grad_norm": 1.4796267747879028, "learning_rate": 4.357528027141117e-06, "epoch": 0.9809822629186143, "step": 15320 }, { "loss": 2.1225, "grad_norm": 1.4935765266418457, "learning_rate": 4.216752217445052e-06, "epoch": 0.9813024268425434, "step": 15325 }, { "loss": 2.0997, "grad_norm": 1.4788649082183838, "learning_rate": 4.078078188694101e-06, "epoch": 0.9816225907664724, "step": 15330 }, { "loss": 2.1163, "grad_norm": 1.52225661277771, "learning_rate": 3.941519964799928e-06, "epoch": 0.9819427546904015, "step": 15335 }, { "loss": 2.1381, "grad_norm": 1.479812502861023, "learning_rate": 3.807091355705811e-06, "epoch": 0.9822629186143306, "step": 15340 }, { "loss": 2.0806, "grad_norm": 1.448622226715088, "learning_rate": 3.674805955990032e-06, "epoch": 0.9825830825382595, "step": 15345 }, { "loss": 2.1262, "grad_norm": 1.514609694480896, "learning_rate": 3.5446771434911096e-06, "epoch": 0.9829032464621886, "step": 15350 }, { "loss": 2.0836, "grad_norm": 1.492081880569458, "learning_rate": 3.416718077954864e-06, "epoch": 0.9832234103861177, "step": 15355 }, { "loss": 2.113, "grad_norm": 1.4686152935028076, "learning_rate": 3.290941699703651e-06, "epoch": 0.9835435743100468, "step": 15360 }, { "loss": 2.1112, "grad_norm": 1.521332859992981, "learning_rate": 3.1673607283276813e-06, "epoch": 0.9838637382339758, "step": 15365 }, { "loss": 2.1045, "grad_norm": 1.4462871551513672, "learning_rate": 3.045987661398736e-06, "epoch": 0.9841839021579049, "step": 15370 }, { "loss": 2.1058, "grad_norm": 1.4568146467208862, "learning_rate": 2.926834773206269e-06, "epoch": 0.9845040660818339, "step": 15375 }, { "loss": 2.123, "grad_norm": 1.486159324645996, "learning_rate": 2.809914113516171e-06, "epoch": 0.9848242300057629, "step": 15380 }, { "loss": 2.1106, "grad_norm": 1.4326839447021484, "learning_rate": 2.6952375063521467e-06, "epoch": 0.985144393929692, "step": 15385 }, { "loss": 2.1136, "grad_norm": 1.4544931650161743, "learning_rate": 2.582816548800002e-06, "epoch": 0.9854645578536211, "step": 15390 }, { "loss": 2.0961, "grad_norm": 1.5159916877746582, "learning_rate": 2.472662609834825e-06, "epoch": 0.98578472177755, "step": 15395 }, { "loss": 2.0906, "grad_norm": 1.4797563552856445, "learning_rate": 2.364786829171281e-06, "epoch": 0.9861048857014791, "step": 15400 }, { "eval_loss": 1.969813346862793, "eval_runtime": 9.427, "eval_samples_per_second": 217.248, "eval_steps_per_second": 27.156, "epoch": 0.9861048857014791, "step": 15400 }, { "loss": 2.1251, "grad_norm": 1.4745688438415527, "learning_rate": 2.2592001161370392e-06, "epoch": 0.9864250496254082, "step": 15405 }, { "loss": 2.0873, "grad_norm": 1.4373728036880493, "learning_rate": 2.155913148569558e-06, "epoch": 0.9867452135493373, "step": 15410 }, { "loss": 2.0997, "grad_norm": 1.454837679862976, "learning_rate": 2.0549363717362215e-06, "epoch": 0.9870653774732663, "step": 15415 }, { "loss": 2.1055, "grad_norm": 1.472625970840454, "learning_rate": 1.956279997278043e-06, "epoch": 0.9873855413971954, "step": 15420 }, { "loss": 2.1318, "grad_norm": 1.3973727226257324, "learning_rate": 1.8599540021769695e-06, "epoch": 0.9877057053211244, "step": 15425 }, { "loss": 2.0958, "grad_norm": 1.4296380281448364, "learning_rate": 1.765968127746928e-06, "epoch": 0.9880258692450534, "step": 15430 }, { "loss": 2.0831, "grad_norm": 1.433281421661377, "learning_rate": 1.6743318786486966e-06, "epoch": 0.9883460331689825, "step": 15435 }, { "loss": 2.0827, "grad_norm": 1.4774094820022583, "learning_rate": 1.5850545219287e-06, "epoch": 0.9886661970929116, "step": 15440 }, { "loss": 2.125, "grad_norm": 1.477018117904663, "learning_rate": 1.4981450860818757e-06, "epoch": 0.9889863610168407, "step": 15445 }, { "loss": 2.076, "grad_norm": 1.5008845329284668, "learning_rate": 1.4136123601385998e-06, "epoch": 0.9893065249407696, "step": 15450 }, { "loss": 2.0926, "grad_norm": 1.450234055519104, "learning_rate": 1.3314648927758966e-06, "epoch": 0.9896266888646987, "step": 15455 }, { "loss": 2.0658, "grad_norm": 1.4952715635299683, "learning_rate": 1.2517109914528841e-06, "epoch": 0.9899468527886278, "step": 15460 }, { "loss": 2.1135, "grad_norm": 1.434744954109192, "learning_rate": 1.1743587215707007e-06, "epoch": 0.9902670167125568, "step": 15465 }, { "loss": 2.0885, "grad_norm": 1.4307448863983154, "learning_rate": 1.0994159056568198e-06, "epoch": 0.9905871806364859, "step": 15470 }, { "loss": 2.0971, "grad_norm": 1.4121356010437012, "learning_rate": 1.026890122573998e-06, "epoch": 0.990907344560415, "step": 15475 }, { "loss": 2.1023, "grad_norm": 1.4584524631500244, "learning_rate": 9.56788706753814e-07, "epoch": 0.9912275084843439, "step": 15480 }, { "loss": 2.1039, "grad_norm": 1.4847089052200317, "learning_rate": 8.891187474549617e-07, "epoch": 0.991547672408273, "step": 15485 }, { "loss": 2.0808, "grad_norm": 1.496368408203125, "learning_rate": 8.238870880463157e-07, "epoch": 0.9918678363322021, "step": 15490 }, { "loss": 2.0938, "grad_norm": 1.4407403469085693, "learning_rate": 7.611003253148757e-07, "epoch": 0.9921880002561312, "step": 15495 }, { "loss": 2.0824, "grad_norm": 1.4371376037597656, "learning_rate": 7.007648087986374e-07, "epoch": 0.9925081641800602, "step": 15500 }, { "loss": 2.0959, "grad_norm": 1.4718334674835205, "learning_rate": 6.428866401444799e-07, "epoch": 0.9928283281039892, "step": 15505 }, { "loss": 2.1124, "grad_norm": 1.5052322149276733, "learning_rate": 5.874716724911078e-07, "epoch": 0.9931484920279183, "step": 15510 }, { "loss": 2.0862, "grad_norm": 1.485822319984436, "learning_rate": 5.345255098771346e-07, "epoch": 0.9934686559518473, "step": 15515 }, { "loss": 2.0994, "grad_norm": 1.4460045099258423, "learning_rate": 4.840535066743506e-07, "epoch": 0.9937888198757764, "step": 15520 }, { "loss": 2.093, "grad_norm": 1.4056727886199951, "learning_rate": 4.360607670462591e-07, "epoch": 0.9941089837997055, "step": 15525 }, { "loss": 2.094, "grad_norm": 1.4093832969665527, "learning_rate": 3.905521444318605e-07, "epoch": 0.9944291477236346, "step": 15530 }, { "loss": 2.1117, "grad_norm": 1.445207953453064, "learning_rate": 3.4753224105488204e-07, "epoch": 0.9947493116475635, "step": 15535 }, { "loss": 2.1148, "grad_norm": 1.4611190557479858, "learning_rate": 3.070054074583162e-07, "epoch": 0.9950694755714926, "step": 15540 }, { "loss": 2.1264, "grad_norm": 1.5278282165527344, "learning_rate": 2.689757420644951e-07, "epoch": 0.9953896394954217, "step": 15545 }, { "loss": 2.1253, "grad_norm": 1.4608923196792603, "learning_rate": 2.334470907605829e-07, "epoch": 0.9957098034193507, "step": 15550 }, { "loss": 2.0896, "grad_norm": 1.4511940479278564, "learning_rate": 2.004230465096818e-07, "epoch": 0.9960299673432798, "step": 15555 }, { "loss": 2.0991, "grad_norm": 1.4206980466842651, "learning_rate": 1.699069489874583e-07, "epoch": 0.9963501312672088, "step": 15560 }, { "loss": 2.0942, "grad_norm": 1.4339975118637085, "learning_rate": 1.419018842444164e-07, "epoch": 0.9966702951911378, "step": 15565 }, { "loss": 2.1004, "grad_norm": 1.4129743576049805, "learning_rate": 1.1641068439380842e-07, "epoch": 0.9969904591150669, "step": 15570 }, { "loss": 2.1244, "grad_norm": 1.4470487833023071, "learning_rate": 9.343592732521944e-08, "epoch": 0.997310623038996, "step": 15575 }, { "loss": 2.0782, "grad_norm": 1.457018256187439, "learning_rate": 7.29799364438899e-08, "epoch": 0.9976307869629251, "step": 15580 }, { "loss": 2.1337, "grad_norm": 1.4190618991851807, "learning_rate": 5.5044780435722923e-08, "epoch": 0.997950950886854, "step": 15585 }, { "loss": 2.1195, "grad_norm": 1.4926364421844482, "learning_rate": 3.963227305810724e-08, "epoch": 0.9982711148107831, "step": 15590 }, { "loss": 2.0792, "grad_norm": 1.4838751554489136, "learning_rate": 2.6743972956475016e-08, "epoch": 0.9985912787347122, "step": 15595 }, { "loss": 2.0793, "grad_norm": 1.4116333723068237, "learning_rate": 1.6381183506697374e-08, "epoch": 0.9989114426586412, "step": 15600 }, { "eval_loss": 1.9771896600723267, "eval_runtime": 12.6984, "eval_samples_per_second": 161.281, "eval_steps_per_second": 20.16, "epoch": 0.9989114426586412, "step": 15600 }, { "loss": 2.1191, "grad_norm": 1.5354454517364502, "learning_rate": 8.544952683253726e-09, "epoch": 0.9992316065825703, "step": 15605 }, { "loss": 2.097, "grad_norm": 1.3956998586654663, "learning_rate": 3.2360729532776578e-09, "epoch": 0.9995517705064993, "step": 15610 }, { "loss": 2.0965, "grad_norm": 1.407570481300354, "learning_rate": 4.550811963849322e-10, "epoch": 0.9998719344304284, "step": 15615 }, { "train_runtime": 71.4624, "train_samples_per_second": 55944.819, "train_steps_per_second": 218.534, "total_flos": 5.2748789856731136e+17, "train_loss": 0.0022892785559331377, "epoch": 1.0, "step": 15617 }, { "eval_loss": 1.9724633693695068, "eval_runtime": 9.3608, "eval_samples_per_second": 218.784, "eval_steps_per_second": 27.348, "epoch": 1.0, "step": 15617 }, { "train_runtime": 0.0087, "train_samples_per_second": 461932896.212, "train_steps_per_second": 1804425.376, "total_flos": 5.2748789856731136e+17, "train_loss": 0.0, "epoch": 1.0, "step": 15617 }, { "eval_loss": 1.9763054847717285, "eval_runtime": 13.0331, "eval_samples_per_second": 157.138, "eval_steps_per_second": 19.642, "epoch": 1.0, "step": 15617 } ], "best_metric": null, "best_model_checkpoint": null, "is_local_process_zero": true, "is_world_process_zero": true, "is_hyper_param_search": false, "trial_name": null, "trial_params": null, "stateful_callbacks": { "TrainerControl": { "args": { "should_training_stop": false, "should_epoch_stop": false, "should_save": false, "should_evaluate": false, "should_log": false }, "attributes": {} } } }