diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,24862 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997885984074414, + "eval_steps": 500, + "global_step": 3547, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00028186879007821857, + "grad_norm": 3201.3330078125, + "learning_rate": 1.0000000000000002e-06, + "loss": 24.9745, + "step": 1 + }, + { + "epoch": 0.0005637375801564371, + "grad_norm": 3121.533203125, + "learning_rate": 2.0000000000000003e-06, + "loss": 26.2865, + "step": 2 + }, + { + "epoch": 0.0008456063702346558, + "grad_norm": 3041.408203125, + "learning_rate": 3e-06, + "loss": 26.1628, + "step": 3 + }, + { + "epoch": 0.0011274751603128743, + "grad_norm": 3249.3740234375, + "learning_rate": 4.000000000000001e-06, + "loss": 24.3271, + "step": 4 + }, + { + "epoch": 0.001409343950391093, + "grad_norm": 3121.421142578125, + "learning_rate": 5e-06, + "loss": 26.3375, + "step": 5 + }, + { + "epoch": 0.0016912127404693116, + "grad_norm": 2865.541748046875, + "learning_rate": 6e-06, + "loss": 24.3371, + "step": 6 + }, + { + "epoch": 0.00197308153054753, + "grad_norm": 3281.491943359375, + "learning_rate": 7.000000000000001e-06, + "loss": 25.4805, + "step": 7 + }, + { + "epoch": 0.0022549503206257485, + "grad_norm": 3361.418212890625, + "learning_rate": 8.000000000000001e-06, + "loss": 24.8545, + "step": 8 + }, + { + "epoch": 0.0025368191107039673, + "grad_norm": 3041.356201171875, + "learning_rate": 9e-06, + "loss": 24.1732, + "step": 9 + }, + { + "epoch": 0.002818687900782186, + "grad_norm": 3425.238037109375, + "learning_rate": 1e-05, + "loss": 24.3092, + "step": 10 + }, + { + "epoch": 0.0031005566908604044, + "grad_norm": 2849.370849609375, + "learning_rate": 1.1000000000000001e-05, + "loss": 22.5827, + "step": 11 + }, + { + "epoch": 0.0033824254809386232, + "grad_norm": 3201.043701171875, + "learning_rate": 1.2e-05, + "loss": 21.3213, + "step": 12 + }, + { + "epoch": 0.0036642942710168416, + "grad_norm": 2785.003173828125, + "learning_rate": 1.3000000000000001e-05, + "loss": 21.0856, + "step": 13 + }, + { + "epoch": 0.00394616306109506, + "grad_norm": 3392.8330078125, + "learning_rate": 1.4000000000000001e-05, + "loss": 20.8247, + "step": 14 + }, + { + "epoch": 0.004228031851173279, + "grad_norm": 2544.8818359375, + "learning_rate": 1.5e-05, + "loss": 18.3088, + "step": 15 + }, + { + "epoch": 0.004509900641251497, + "grad_norm": 2544.896240234375, + "learning_rate": 1.6000000000000003e-05, + "loss": 17.8927, + "step": 16 + }, + { + "epoch": 0.004791769431329716, + "grad_norm": 2464.917724609375, + "learning_rate": 1.7000000000000003e-05, + "loss": 16.6426, + "step": 17 + }, + { + "epoch": 0.005073638221407935, + "grad_norm": 2656.76953125, + "learning_rate": 1.8e-05, + "loss": 15.2539, + "step": 18 + }, + { + "epoch": 0.005355507011486153, + "grad_norm": 2848.775146484375, + "learning_rate": 1.9e-05, + "loss": 13.4845, + "step": 19 + }, + { + "epoch": 0.005637375801564372, + "grad_norm": 2848.8046875, + "learning_rate": 2e-05, + "loss": 12.6589, + "step": 20 + }, + { + "epoch": 0.0059192445916425905, + "grad_norm": 2336.906982421875, + "learning_rate": 2.1e-05, + "loss": 11.4312, + "step": 21 + }, + { + "epoch": 0.006201113381720809, + "grad_norm": 2752.837158203125, + "learning_rate": 2.2000000000000003e-05, + "loss": 10.3123, + "step": 22 + }, + { + "epoch": 0.006482982171799027, + "grad_norm": 2352.88671875, + "learning_rate": 2.3000000000000003e-05, + "loss": 8.2024, + "step": 23 + }, + { + "epoch": 0.0067648509618772465, + "grad_norm": 2336.90478515625, + "learning_rate": 2.4e-05, + "loss": 7.7463, + "step": 24 + }, + { + "epoch": 0.007046719751955465, + "grad_norm": 1480.7607421875, + "learning_rate": 2.5e-05, + "loss": 6.1722, + "step": 25 + }, + { + "epoch": 0.007328588542033683, + "grad_norm": 1408.4468994140625, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6663, + "step": 26 + }, + { + "epoch": 0.0076104573321119015, + "grad_norm": 2288.400390625, + "learning_rate": 2.7000000000000002e-05, + "loss": 6.2052, + "step": 27 + }, + { + "epoch": 0.00789232612219012, + "grad_norm": 2112.26708984375, + "learning_rate": 2.8000000000000003e-05, + "loss": 4.5299, + "step": 28 + }, + { + "epoch": 0.00817419491226834, + "grad_norm": 2080.291015625, + "learning_rate": 2.9e-05, + "loss": 4.5946, + "step": 29 + }, + { + "epoch": 0.008456063702346557, + "grad_norm": 1152.1561279296875, + "learning_rate": 3e-05, + "loss": 3.3207, + "step": 30 + }, + { + "epoch": 0.008737932492424777, + "grad_norm": 652.1309204101562, + "learning_rate": 3.1e-05, + "loss": 2.9112, + "step": 31 + }, + { + "epoch": 0.009019801282502994, + "grad_norm": 378.2425537109375, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.0573, + "step": 32 + }, + { + "epoch": 0.009301670072581213, + "grad_norm": 580.2664184570312, + "learning_rate": 3.3e-05, + "loss": 2.8483, + "step": 33 + }, + { + "epoch": 0.009583538862659433, + "grad_norm": 704.36962890625, + "learning_rate": 3.4000000000000007e-05, + "loss": 4.3029, + "step": 34 + }, + { + "epoch": 0.00986540765273765, + "grad_norm": 1336.138916015625, + "learning_rate": 3.5e-05, + "loss": 2.7256, + "step": 35 + }, + { + "epoch": 0.01014727644281587, + "grad_norm": 444.2579345703125, + "learning_rate": 3.6e-05, + "loss": 2.7225, + "step": 36 + }, + { + "epoch": 0.010429145232894088, + "grad_norm": 1096.1943359375, + "learning_rate": 3.7e-05, + "loss": 2.904, + "step": 37 + }, + { + "epoch": 0.010711014022972306, + "grad_norm": 1664.31689453125, + "learning_rate": 3.8e-05, + "loss": 3.6761, + "step": 38 + }, + { + "epoch": 0.010992882813050525, + "grad_norm": 1088.144775390625, + "learning_rate": 3.9000000000000006e-05, + "loss": 2.938, + "step": 39 + }, + { + "epoch": 0.011274751603128744, + "grad_norm": 936.2603149414062, + "learning_rate": 4e-05, + "loss": 3.131, + "step": 40 + }, + { + "epoch": 0.011556620393206962, + "grad_norm": 596.1867065429688, + "learning_rate": 4.1e-05, + "loss": 2.8774, + "step": 41 + }, + { + "epoch": 0.011838489183285181, + "grad_norm": 1296.11181640625, + "learning_rate": 4.2e-05, + "loss": 3.0997, + "step": 42 + }, + { + "epoch": 0.012120357973363399, + "grad_norm": 390.1418762207031, + "learning_rate": 4.3e-05, + "loss": 3.3588, + "step": 43 + }, + { + "epoch": 0.012402226763441618, + "grad_norm": 1096.264892578125, + "learning_rate": 4.4000000000000006e-05, + "loss": 2.5362, + "step": 44 + }, + { + "epoch": 0.012684095553519837, + "grad_norm": 1856.4984130859375, + "learning_rate": 4.5e-05, + "loss": 3.7973, + "step": 45 + }, + { + "epoch": 0.012965964343598054, + "grad_norm": 860.24169921875, + "learning_rate": 4.600000000000001e-05, + "loss": 3.2442, + "step": 46 + }, + { + "epoch": 0.013247833133676274, + "grad_norm": 1328.1826171875, + "learning_rate": 4.7e-05, + "loss": 3.5489, + "step": 47 + }, + { + "epoch": 0.013529701923754493, + "grad_norm": 776.1255493164062, + "learning_rate": 4.8e-05, + "loss": 2.9624, + "step": 48 + }, + { + "epoch": 0.01381157071383271, + "grad_norm": 1408.1900634765625, + "learning_rate": 4.9e-05, + "loss": 2.5552, + "step": 49 + }, + { + "epoch": 0.01409343950391093, + "grad_norm": 1936.4412841796875, + "learning_rate": 5e-05, + "loss": 3.337, + "step": 50 + }, + { + "epoch": 0.014375308293989149, + "grad_norm": 2384.7138671875, + "learning_rate": 5.1000000000000006e-05, + "loss": 4.1907, + "step": 51 + }, + { + "epoch": 0.014657177084067366, + "grad_norm": 2224.531005859375, + "learning_rate": 5.2000000000000004e-05, + "loss": 4.1515, + "step": 52 + }, + { + "epoch": 0.014939045874145586, + "grad_norm": 2656.57177734375, + "learning_rate": 5.300000000000001e-05, + "loss": 4.1675, + "step": 53 + }, + { + "epoch": 0.015220914664223803, + "grad_norm": 288.10455322265625, + "learning_rate": 5.4000000000000005e-05, + "loss": 2.3403, + "step": 54 + }, + { + "epoch": 0.015502783454302022, + "grad_norm": 1232.16552734375, + "learning_rate": 5.500000000000001e-05, + "loss": 3.374, + "step": 55 + }, + { + "epoch": 0.01578465224438024, + "grad_norm": 1584.2393798828125, + "learning_rate": 5.6000000000000006e-05, + "loss": 3.1683, + "step": 56 + }, + { + "epoch": 0.01606652103445846, + "grad_norm": 1752.1072998046875, + "learning_rate": 5.6999999999999996e-05, + "loss": 3.5206, + "step": 57 + }, + { + "epoch": 0.01634838982453668, + "grad_norm": 2144.36572265625, + "learning_rate": 5.8e-05, + "loss": 3.9018, + "step": 58 + }, + { + "epoch": 0.016630258614614896, + "grad_norm": 756.0360107421875, + "learning_rate": 5.9e-05, + "loss": 2.8526, + "step": 59 + }, + { + "epoch": 0.016912127404693115, + "grad_norm": 1544.2462158203125, + "learning_rate": 6e-05, + "loss": 2.9151, + "step": 60 + }, + { + "epoch": 0.017193996194771334, + "grad_norm": 792.1146240234375, + "learning_rate": 6.1e-05, + "loss": 3.2269, + "step": 61 + }, + { + "epoch": 0.017475864984849553, + "grad_norm": 944.0771484375, + "learning_rate": 6.2e-05, + "loss": 3.0691, + "step": 62 + }, + { + "epoch": 0.017757733774927772, + "grad_norm": 2992.30908203125, + "learning_rate": 6.3e-05, + "loss": 4.3083, + "step": 63 + }, + { + "epoch": 0.018039602565005988, + "grad_norm": 1072.1134033203125, + "learning_rate": 6.400000000000001e-05, + "loss": 2.8794, + "step": 64 + }, + { + "epoch": 0.018321471355084207, + "grad_norm": 1504.101806640625, + "learning_rate": 6.500000000000001e-05, + "loss": 3.2184, + "step": 65 + }, + { + "epoch": 0.018603340145162427, + "grad_norm": 1640.104248046875, + "learning_rate": 6.6e-05, + "loss": 3.5601, + "step": 66 + }, + { + "epoch": 0.018885208935240646, + "grad_norm": 760.0393676757812, + "learning_rate": 6.7e-05, + "loss": 2.7743, + "step": 67 + }, + { + "epoch": 0.019167077725318865, + "grad_norm": 1232.0220947265625, + "learning_rate": 6.800000000000001e-05, + "loss": 2.9969, + "step": 68 + }, + { + "epoch": 0.019448946515397084, + "grad_norm": 784.0505981445312, + "learning_rate": 6.9e-05, + "loss": 2.8653, + "step": 69 + }, + { + "epoch": 0.0197308153054753, + "grad_norm": 1592.080810546875, + "learning_rate": 7e-05, + "loss": 3.5193, + "step": 70 + }, + { + "epoch": 0.02001268409555352, + "grad_norm": 1280.058349609375, + "learning_rate": 7.1e-05, + "loss": 3.3156, + "step": 71 + }, + { + "epoch": 0.02029455288563174, + "grad_norm": 516.025390625, + "learning_rate": 7.2e-05, + "loss": 2.4201, + "step": 72 + }, + { + "epoch": 0.020576421675709958, + "grad_norm": 1544.0926513671875, + "learning_rate": 7.3e-05, + "loss": 2.8335, + "step": 73 + }, + { + "epoch": 0.020858290465788177, + "grad_norm": 488.015869140625, + "learning_rate": 7.4e-05, + "loss": 2.7653, + "step": 74 + }, + { + "epoch": 0.021140159255866393, + "grad_norm": 1936.1307373046875, + "learning_rate": 7.500000000000001e-05, + "loss": 3.3837, + "step": 75 + }, + { + "epoch": 0.021422028045944612, + "grad_norm": 1640.03173828125, + "learning_rate": 7.6e-05, + "loss": 2.7917, + "step": 76 + }, + { + "epoch": 0.02170389683602283, + "grad_norm": 776.02880859375, + "learning_rate": 7.7e-05, + "loss": 3.0215, + "step": 77 + }, + { + "epoch": 0.02198576562610105, + "grad_norm": 1280.0423583984375, + "learning_rate": 7.800000000000001e-05, + "loss": 3.3529, + "step": 78 + }, + { + "epoch": 0.02226763441617927, + "grad_norm": 1592.0833740234375, + "learning_rate": 7.900000000000001e-05, + "loss": 3.7503, + "step": 79 + }, + { + "epoch": 0.02254950320625749, + "grad_norm": 1168.06396484375, + "learning_rate": 8e-05, + "loss": 2.7328, + "step": 80 + }, + { + "epoch": 0.022831371996335705, + "grad_norm": 3120.0478515625, + "learning_rate": 8.1e-05, + "loss": 4.6558, + "step": 81 + }, + { + "epoch": 0.023113240786413924, + "grad_norm": 1568.045654296875, + "learning_rate": 8.2e-05, + "loss": 3.3103, + "step": 82 + }, + { + "epoch": 0.023395109576492143, + "grad_norm": 616.0255126953125, + "learning_rate": 8.3e-05, + "loss": 3.3021, + "step": 83 + }, + { + "epoch": 0.023676978366570362, + "grad_norm": 1528.0614013671875, + "learning_rate": 8.4e-05, + "loss": 3.5196, + "step": 84 + }, + { + "epoch": 0.02395884715664858, + "grad_norm": 2192.08056640625, + "learning_rate": 8.5e-05, + "loss": 3.388, + "step": 85 + }, + { + "epoch": 0.024240715946726797, + "grad_norm": 1464.0775146484375, + "learning_rate": 8.6e-05, + "loss": 3.0918, + "step": 86 + }, + { + "epoch": 0.024522584736805016, + "grad_norm": 1360.0303955078125, + "learning_rate": 8.7e-05, + "loss": 3.0225, + "step": 87 + }, + { + "epoch": 0.024804453526883236, + "grad_norm": 1088.1307373046875, + "learning_rate": 8.800000000000001e-05, + "loss": 3.2328, + "step": 88 + }, + { + "epoch": 0.025086322316961455, + "grad_norm": 398.02947998046875, + "learning_rate": 8.900000000000001e-05, + "loss": 3.1781, + "step": 89 + }, + { + "epoch": 0.025368191107039674, + "grad_norm": 584.0406494140625, + "learning_rate": 9e-05, + "loss": 3.0596, + "step": 90 + }, + { + "epoch": 0.025650059897117893, + "grad_norm": 1232.0458984375, + "learning_rate": 9.1e-05, + "loss": 3.1335, + "step": 91 + }, + { + "epoch": 0.02593192868719611, + "grad_norm": 524.0310668945312, + "learning_rate": 9.200000000000001e-05, + "loss": 2.9502, + "step": 92 + }, + { + "epoch": 0.026213797477274328, + "grad_norm": 1304.051025390625, + "learning_rate": 9.300000000000001e-05, + "loss": 3.3406, + "step": 93 + }, + { + "epoch": 0.026495666267352547, + "grad_norm": 844.023681640625, + "learning_rate": 9.4e-05, + "loss": 3.2569, + "step": 94 + }, + { + "epoch": 0.026777535057430767, + "grad_norm": 1888.04833984375, + "learning_rate": 9.5e-05, + "loss": 3.4175, + "step": 95 + }, + { + "epoch": 0.027059403847508986, + "grad_norm": 1480.038818359375, + "learning_rate": 9.6e-05, + "loss": 2.474, + "step": 96 + }, + { + "epoch": 0.0273412726375872, + "grad_norm": 1240.03125, + "learning_rate": 9.7e-05, + "loss": 3.6212, + "step": 97 + }, + { + "epoch": 0.02762314142766542, + "grad_norm": 1648.06787109375, + "learning_rate": 9.8e-05, + "loss": 3.0212, + "step": 98 + }, + { + "epoch": 0.02790501021774364, + "grad_norm": 2624.08935546875, + "learning_rate": 9.900000000000001e-05, + "loss": 4.1253, + "step": 99 + }, + { + "epoch": 0.02818687900782186, + "grad_norm": 912.013671875, + "learning_rate": 0.0001, + "loss": 2.8806, + "step": 100 + }, + { + "epoch": 0.02846874779790008, + "grad_norm": 1064.0206298828125, + "learning_rate": 9.999997923379404e-05, + "loss": 2.796, + "step": 101 + }, + { + "epoch": 0.028750616587978298, + "grad_norm": 660.0154418945312, + "learning_rate": 9.99999169351934e-05, + "loss": 2.9488, + "step": 102 + }, + { + "epoch": 0.029032485378056513, + "grad_norm": 1008.0136108398438, + "learning_rate": 9.99998131042498e-05, + "loss": 2.9024, + "step": 103 + }, + { + "epoch": 0.029314354168134733, + "grad_norm": 1096.02294921875, + "learning_rate": 9.999966774104954e-05, + "loss": 2.9005, + "step": 104 + }, + { + "epoch": 0.029596222958212952, + "grad_norm": 904.0103759765625, + "learning_rate": 9.99994808457133e-05, + "loss": 2.8191, + "step": 105 + }, + { + "epoch": 0.02987809174829117, + "grad_norm": 1864.079345703125, + "learning_rate": 9.999925241839641e-05, + "loss": 3.3132, + "step": 106 + }, + { + "epoch": 0.03015996053836939, + "grad_norm": 1296.0328369140625, + "learning_rate": 9.999898245928854e-05, + "loss": 3.2192, + "step": 107 + }, + { + "epoch": 0.030441829328447606, + "grad_norm": 968.0171508789062, + "learning_rate": 9.999867096861395e-05, + "loss": 2.8037, + "step": 108 + }, + { + "epoch": 0.030723698118525825, + "grad_norm": 1056.0286865234375, + "learning_rate": 9.99983179466314e-05, + "loss": 2.9572, + "step": 109 + }, + { + "epoch": 0.031005566908604044, + "grad_norm": 748.0323486328125, + "learning_rate": 9.99979233936341e-05, + "loss": 3.2365, + "step": 110 + }, + { + "epoch": 0.03128743569868226, + "grad_norm": 1020.0203247070312, + "learning_rate": 9.999748730994978e-05, + "loss": 2.8062, + "step": 111 + }, + { + "epoch": 0.03156930448876048, + "grad_norm": 1224.019775390625, + "learning_rate": 9.999700969594073e-05, + "loss": 2.7543, + "step": 112 + }, + { + "epoch": 0.0318511732788387, + "grad_norm": 796.014404296875, + "learning_rate": 9.99964905520036e-05, + "loss": 3.0047, + "step": 113 + }, + { + "epoch": 0.03213304206891692, + "grad_norm": 1840.020751953125, + "learning_rate": 9.999592987856967e-05, + "loss": 3.0398, + "step": 114 + }, + { + "epoch": 0.03241491085899514, + "grad_norm": 1232.019775390625, + "learning_rate": 9.999532767610464e-05, + "loss": 2.7813, + "step": 115 + }, + { + "epoch": 0.03269677964907336, + "grad_norm": 1376.0625, + "learning_rate": 9.999468394510875e-05, + "loss": 3.2069, + "step": 116 + }, + { + "epoch": 0.032978648439151576, + "grad_norm": 1000.0181274414062, + "learning_rate": 9.999399868611669e-05, + "loss": 3.3771, + "step": 117 + }, + { + "epoch": 0.03326051722922979, + "grad_norm": 740.0157470703125, + "learning_rate": 9.999327189969767e-05, + "loss": 3.0834, + "step": 118 + }, + { + "epoch": 0.033542386019308014, + "grad_norm": 1384.0299072265625, + "learning_rate": 9.999250358645543e-05, + "loss": 3.0395, + "step": 119 + }, + { + "epoch": 0.03382425480938623, + "grad_norm": 800.0287475585938, + "learning_rate": 9.99916937470281e-05, + "loss": 2.6387, + "step": 120 + }, + { + "epoch": 0.03410612359946445, + "grad_norm": 944.014892578125, + "learning_rate": 9.999084238208844e-05, + "loss": 2.8689, + "step": 121 + }, + { + "epoch": 0.03438799238954267, + "grad_norm": 1200.0296630859375, + "learning_rate": 9.998994949234359e-05, + "loss": 3.2579, + "step": 122 + }, + { + "epoch": 0.034669861179620884, + "grad_norm": 324.0240478515625, + "learning_rate": 9.998901507853526e-05, + "loss": 3.0594, + "step": 123 + }, + { + "epoch": 0.03495172996969911, + "grad_norm": 952.022705078125, + "learning_rate": 9.99880391414396e-05, + "loss": 2.7347, + "step": 124 + }, + { + "epoch": 0.03523359875977732, + "grad_norm": 912.029541015625, + "learning_rate": 9.998702168186726e-05, + "loss": 2.4565, + "step": 125 + }, + { + "epoch": 0.035515467549855545, + "grad_norm": 1600.04833984375, + "learning_rate": 9.998596270066341e-05, + "loss": 3.687, + "step": 126 + }, + { + "epoch": 0.03579733633993376, + "grad_norm": 964.0154418945312, + "learning_rate": 9.998486219870769e-05, + "loss": 3.341, + "step": 127 + }, + { + "epoch": 0.036079205130011976, + "grad_norm": 880.0108032226562, + "learning_rate": 9.998372017691422e-05, + "loss": 2.6546, + "step": 128 + }, + { + "epoch": 0.0363610739200902, + "grad_norm": 964.0204467773438, + "learning_rate": 9.998253663623162e-05, + "loss": 2.9644, + "step": 129 + }, + { + "epoch": 0.036642942710168415, + "grad_norm": 1104.0260009765625, + "learning_rate": 9.9981311577643e-05, + "loss": 3.057, + "step": 130 + }, + { + "epoch": 0.03692481150024664, + "grad_norm": 644.0272827148438, + "learning_rate": 9.998004500216596e-05, + "loss": 3.167, + "step": 131 + }, + { + "epoch": 0.03720668029032485, + "grad_norm": 1128.048095703125, + "learning_rate": 9.997873691085257e-05, + "loss": 2.6348, + "step": 132 + }, + { + "epoch": 0.03748854908040307, + "grad_norm": 1072.017822265625, + "learning_rate": 9.997738730478938e-05, + "loss": 2.5675, + "step": 133 + }, + { + "epoch": 0.03777041787048129, + "grad_norm": 768.0225830078125, + "learning_rate": 9.997599618509747e-05, + "loss": 3.2898, + "step": 134 + }, + { + "epoch": 0.03805228666055951, + "grad_norm": 988.035400390625, + "learning_rate": 9.997456355293235e-05, + "loss": 3.3013, + "step": 135 + }, + { + "epoch": 0.03833415545063773, + "grad_norm": 860.0079345703125, + "learning_rate": 9.997308940948405e-05, + "loss": 2.4569, + "step": 136 + }, + { + "epoch": 0.038616024240715946, + "grad_norm": 1280.042724609375, + "learning_rate": 9.997157375597704e-05, + "loss": 3.1722, + "step": 137 + }, + { + "epoch": 0.03889789303079417, + "grad_norm": 1672.041015625, + "learning_rate": 9.997001659367029e-05, + "loss": 2.7695, + "step": 138 + }, + { + "epoch": 0.039179761820872384, + "grad_norm": 1304.032470703125, + "learning_rate": 9.996841792385728e-05, + "loss": 3.3587, + "step": 139 + }, + { + "epoch": 0.0394616306109506, + "grad_norm": 832.01318359375, + "learning_rate": 9.996677774786593e-05, + "loss": 3.2345, + "step": 140 + }, + { + "epoch": 0.03974349940102882, + "grad_norm": 1360.021484375, + "learning_rate": 9.996509606705867e-05, + "loss": 3.0717, + "step": 141 + }, + { + "epoch": 0.04002536819110704, + "grad_norm": 2304.018310546875, + "learning_rate": 9.996337288283236e-05, + "loss": 3.4219, + "step": 142 + }, + { + "epoch": 0.04030723698118526, + "grad_norm": 1696.01611328125, + "learning_rate": 9.996160819661837e-05, + "loss": 2.9167, + "step": 143 + }, + { + "epoch": 0.04058910577126348, + "grad_norm": 1400.0169677734375, + "learning_rate": 9.995980200988252e-05, + "loss": 2.7753, + "step": 144 + }, + { + "epoch": 0.04087097456134169, + "grad_norm": 1784.03857421875, + "learning_rate": 9.995795432412514e-05, + "loss": 3.8663, + "step": 145 + }, + { + "epoch": 0.041152843351419915, + "grad_norm": 416.0094909667969, + "learning_rate": 9.995606514088099e-05, + "loss": 2.9116, + "step": 146 + }, + { + "epoch": 0.04143471214149813, + "grad_norm": 836.0244750976562, + "learning_rate": 9.99541344617193e-05, + "loss": 2.8228, + "step": 147 + }, + { + "epoch": 0.041716580931576354, + "grad_norm": 748.013427734375, + "learning_rate": 9.995216228824383e-05, + "loss": 2.9738, + "step": 148 + }, + { + "epoch": 0.04199844972165457, + "grad_norm": 1608.0435791015625, + "learning_rate": 9.995014862209273e-05, + "loss": 3.3352, + "step": 149 + }, + { + "epoch": 0.042280318511732785, + "grad_norm": 1504.015625, + "learning_rate": 9.994809346493867e-05, + "loss": 3.335, + "step": 150 + }, + { + "epoch": 0.04256218730181101, + "grad_norm": 864.0142211914062, + "learning_rate": 9.994599681848873e-05, + "loss": 3.2762, + "step": 151 + }, + { + "epoch": 0.042844056091889224, + "grad_norm": 1392.0233154296875, + "learning_rate": 9.994385868448451e-05, + "loss": 3.0056, + "step": 152 + }, + { + "epoch": 0.043125924881967447, + "grad_norm": 1144.0101318359375, + "learning_rate": 9.994167906470204e-05, + "loss": 3.1925, + "step": 153 + }, + { + "epoch": 0.04340779367204566, + "grad_norm": 1976.0264892578125, + "learning_rate": 9.993945796095183e-05, + "loss": 4.0391, + "step": 154 + }, + { + "epoch": 0.04368966246212388, + "grad_norm": 1184.0137939453125, + "learning_rate": 9.993719537507882e-05, + "loss": 2.9539, + "step": 155 + }, + { + "epoch": 0.0439715312522021, + "grad_norm": 1288.0863037109375, + "learning_rate": 9.993489130896244e-05, + "loss": 2.7753, + "step": 156 + }, + { + "epoch": 0.044253400042280316, + "grad_norm": 1264.0220947265625, + "learning_rate": 9.993254576451651e-05, + "loss": 3.1394, + "step": 157 + }, + { + "epoch": 0.04453526883235854, + "grad_norm": 908.0206298828125, + "learning_rate": 9.993015874368942e-05, + "loss": 3.1762, + "step": 158 + }, + { + "epoch": 0.044817137622436755, + "grad_norm": 972.0093994140625, + "learning_rate": 9.99277302484639e-05, + "loss": 2.9491, + "step": 159 + }, + { + "epoch": 0.04509900641251498, + "grad_norm": 576.0098876953125, + "learning_rate": 9.99252602808572e-05, + "loss": 2.9285, + "step": 160 + }, + { + "epoch": 0.04538087520259319, + "grad_norm": 748.0140380859375, + "learning_rate": 9.992274884292098e-05, + "loss": 3.1622, + "step": 161 + }, + { + "epoch": 0.04566274399267141, + "grad_norm": 1320.0299072265625, + "learning_rate": 9.992019593674135e-05, + "loss": 3.5382, + "step": 162 + }, + { + "epoch": 0.04594461278274963, + "grad_norm": 2024.0355224609375, + "learning_rate": 9.991760156443893e-05, + "loss": 4.0823, + "step": 163 + }, + { + "epoch": 0.04622648157282785, + "grad_norm": 728.00732421875, + "learning_rate": 9.991496572816865e-05, + "loss": 2.6446, + "step": 164 + }, + { + "epoch": 0.04650835036290607, + "grad_norm": 1504.0096435546875, + "learning_rate": 9.991228843012003e-05, + "loss": 3.0649, + "step": 165 + }, + { + "epoch": 0.046790219152984286, + "grad_norm": 1648.0419921875, + "learning_rate": 9.990956967251692e-05, + "loss": 3.7365, + "step": 166 + }, + { + "epoch": 0.0470720879430625, + "grad_norm": 1528.0145263671875, + "learning_rate": 9.990680945761768e-05, + "loss": 2.8897, + "step": 167 + }, + { + "epoch": 0.047353956733140724, + "grad_norm": 1872.0111083984375, + "learning_rate": 9.990400778771506e-05, + "loss": 2.7716, + "step": 168 + }, + { + "epoch": 0.04763582552321894, + "grad_norm": 1200.02880859375, + "learning_rate": 9.990116466513628e-05, + "loss": 2.916, + "step": 169 + }, + { + "epoch": 0.04791769431329716, + "grad_norm": 1360.013427734375, + "learning_rate": 9.989828009224296e-05, + "loss": 3.1576, + "step": 170 + }, + { + "epoch": 0.04819956310337538, + "grad_norm": 848.0136108398438, + "learning_rate": 9.989535407143118e-05, + "loss": 2.5964, + "step": 171 + }, + { + "epoch": 0.048481431893453594, + "grad_norm": 836.0110473632812, + "learning_rate": 9.989238660513141e-05, + "loss": 3.0209, + "step": 172 + }, + { + "epoch": 0.04876330068353182, + "grad_norm": 1184.02001953125, + "learning_rate": 9.98893776958086e-05, + "loss": 2.7208, + "step": 173 + }, + { + "epoch": 0.04904516947361003, + "grad_norm": 1648.0079345703125, + "learning_rate": 9.988632734596206e-05, + "loss": 3.2865, + "step": 174 + }, + { + "epoch": 0.049327038263688255, + "grad_norm": 1632.00830078125, + "learning_rate": 9.988323555812558e-05, + "loss": 3.1352, + "step": 175 + }, + { + "epoch": 0.04960890705376647, + "grad_norm": 956.0103149414062, + "learning_rate": 9.988010233486736e-05, + "loss": 2.4896, + "step": 176 + }, + { + "epoch": 0.04989077584384469, + "grad_norm": 1600.0140380859375, + "learning_rate": 9.987692767878996e-05, + "loss": 2.8297, + "step": 177 + }, + { + "epoch": 0.05017264463392291, + "grad_norm": 1144.2337646484375, + "learning_rate": 9.987371159253046e-05, + "loss": 2.9894, + "step": 178 + }, + { + "epoch": 0.050454513424001125, + "grad_norm": 952.0095825195312, + "learning_rate": 9.987045407876026e-05, + "loss": 2.7511, + "step": 179 + }, + { + "epoch": 0.05073638221407935, + "grad_norm": 1656.0137939453125, + "learning_rate": 9.986715514018523e-05, + "loss": 2.8883, + "step": 180 + }, + { + "epoch": 0.051018251004157564, + "grad_norm": 1312.012939453125, + "learning_rate": 9.98638147795456e-05, + "loss": 2.9077, + "step": 181 + }, + { + "epoch": 0.051300119794235786, + "grad_norm": 1280.0126953125, + "learning_rate": 9.986043299961607e-05, + "loss": 3.1157, + "step": 182 + }, + { + "epoch": 0.051581988584314, + "grad_norm": 820.0106201171875, + "learning_rate": 9.985700980320567e-05, + "loss": 2.6123, + "step": 183 + }, + { + "epoch": 0.05186385737439222, + "grad_norm": 748.0186767578125, + "learning_rate": 9.98535451931579e-05, + "loss": 2.6663, + "step": 184 + }, + { + "epoch": 0.05214572616447044, + "grad_norm": 976.009033203125, + "learning_rate": 9.985003917235063e-05, + "loss": 3.2953, + "step": 185 + }, + { + "epoch": 0.052427594954548656, + "grad_norm": 1296.0125732421875, + "learning_rate": 9.984649174369613e-05, + "loss": 3.0502, + "step": 186 + }, + { + "epoch": 0.05270946374462688, + "grad_norm": 604.0127563476562, + "learning_rate": 9.984290291014105e-05, + "loss": 2.949, + "step": 187 + }, + { + "epoch": 0.052991332534705095, + "grad_norm": 1512.01708984375, + "learning_rate": 9.983927267466645e-05, + "loss": 3.1583, + "step": 188 + }, + { + "epoch": 0.05327320132478331, + "grad_norm": 1176.01318359375, + "learning_rate": 9.98356010402878e-05, + "loss": 3.2654, + "step": 189 + }, + { + "epoch": 0.05355507011486153, + "grad_norm": 1296.020263671875, + "learning_rate": 9.983188801005492e-05, + "loss": 3.698, + "step": 190 + }, + { + "epoch": 0.05383693890493975, + "grad_norm": 2080.026123046875, + "learning_rate": 9.982813358705203e-05, + "loss": 2.8911, + "step": 191 + }, + { + "epoch": 0.05411880769501797, + "grad_norm": 552.013427734375, + "learning_rate": 9.982433777439775e-05, + "loss": 2.8669, + "step": 192 + }, + { + "epoch": 0.05440067648509619, + "grad_norm": 692.0086669921875, + "learning_rate": 9.982050057524505e-05, + "loss": 2.9594, + "step": 193 + }, + { + "epoch": 0.0546825452751744, + "grad_norm": 912.0121459960938, + "learning_rate": 9.98166219927813e-05, + "loss": 2.6993, + "step": 194 + }, + { + "epoch": 0.054964414065252626, + "grad_norm": 1448.0205078125, + "learning_rate": 9.981270203022823e-05, + "loss": 3.3888, + "step": 195 + }, + { + "epoch": 0.05524628285533084, + "grad_norm": 2040.016845703125, + "learning_rate": 9.980874069084196e-05, + "loss": 3.3993, + "step": 196 + }, + { + "epoch": 0.055528151645409064, + "grad_norm": 552.010498046875, + "learning_rate": 9.980473797791296e-05, + "loss": 3.335, + "step": 197 + }, + { + "epoch": 0.05581002043548728, + "grad_norm": 2064.0283203125, + "learning_rate": 9.98006938947661e-05, + "loss": 3.6622, + "step": 198 + }, + { + "epoch": 0.0560918892255655, + "grad_norm": 1480.0189208984375, + "learning_rate": 9.979660844476055e-05, + "loss": 3.5823, + "step": 199 + }, + { + "epoch": 0.05637375801564372, + "grad_norm": 1344.02587890625, + "learning_rate": 9.979248163128991e-05, + "loss": 3.6052, + "step": 200 + }, + { + "epoch": 0.056655626805721934, + "grad_norm": 1520.019775390625, + "learning_rate": 9.978831345778212e-05, + "loss": 3.2443, + "step": 201 + }, + { + "epoch": 0.05693749559580016, + "grad_norm": 1744.013916015625, + "learning_rate": 9.978410392769943e-05, + "loss": 3.5345, + "step": 202 + }, + { + "epoch": 0.05721936438587837, + "grad_norm": 2024.03369140625, + "learning_rate": 9.977985304453851e-05, + "loss": 3.2863, + "step": 203 + }, + { + "epoch": 0.057501233175956595, + "grad_norm": 2000.0101318359375, + "learning_rate": 9.977556081183035e-05, + "loss": 3.1629, + "step": 204 + }, + { + "epoch": 0.05778310196603481, + "grad_norm": 1080.010986328125, + "learning_rate": 9.977122723314025e-05, + "loss": 3.2283, + "step": 205 + }, + { + "epoch": 0.05806497075611303, + "grad_norm": 856.0095825195312, + "learning_rate": 9.976685231206792e-05, + "loss": 2.6466, + "step": 206 + }, + { + "epoch": 0.05834683954619125, + "grad_norm": 189.01341247558594, + "learning_rate": 9.976243605224738e-05, + "loss": 2.701, + "step": 207 + }, + { + "epoch": 0.058628708336269465, + "grad_norm": 1088.0081787109375, + "learning_rate": 9.975797845734698e-05, + "loss": 3.5568, + "step": 208 + }, + { + "epoch": 0.05891057712634769, + "grad_norm": 1152.00927734375, + "learning_rate": 9.975347953106941e-05, + "loss": 2.7412, + "step": 209 + }, + { + "epoch": 0.059192445916425904, + "grad_norm": 1096.0172119140625, + "learning_rate": 9.974893927715171e-05, + "loss": 3.1794, + "step": 210 + }, + { + "epoch": 0.05947431470650412, + "grad_norm": 364.01409912109375, + "learning_rate": 9.974435769936522e-05, + "loss": 2.5934, + "step": 211 + }, + { + "epoch": 0.05975618349658234, + "grad_norm": 1256.025146484375, + "learning_rate": 9.973973480151562e-05, + "loss": 3.0958, + "step": 212 + }, + { + "epoch": 0.06003805228666056, + "grad_norm": 2176.01953125, + "learning_rate": 9.973507058744292e-05, + "loss": 3.544, + "step": 213 + }, + { + "epoch": 0.06031992107673878, + "grad_norm": 872.0086669921875, + "learning_rate": 9.973036506102143e-05, + "loss": 2.7159, + "step": 214 + }, + { + "epoch": 0.060601789866816996, + "grad_norm": 624.0116577148438, + "learning_rate": 9.97256182261598e-05, + "loss": 3.094, + "step": 215 + }, + { + "epoch": 0.06088365865689521, + "grad_norm": 860.015869140625, + "learning_rate": 9.972083008680097e-05, + "loss": 2.8694, + "step": 216 + }, + { + "epoch": 0.061165527446973435, + "grad_norm": 1040.007568359375, + "learning_rate": 9.971600064692222e-05, + "loss": 3.0431, + "step": 217 + }, + { + "epoch": 0.06144739623705165, + "grad_norm": 1912.03369140625, + "learning_rate": 9.971112991053508e-05, + "loss": 3.671, + "step": 218 + }, + { + "epoch": 0.06172926502712987, + "grad_norm": 988.0165405273438, + "learning_rate": 9.970621788168546e-05, + "loss": 2.9387, + "step": 219 + }, + { + "epoch": 0.06201113381720809, + "grad_norm": 1384.01318359375, + "learning_rate": 9.970126456445347e-05, + "loss": 3.6176, + "step": 220 + }, + { + "epoch": 0.06229300260728631, + "grad_norm": 736.0071411132812, + "learning_rate": 9.969626996295365e-05, + "loss": 3.1484, + "step": 221 + }, + { + "epoch": 0.06257487139736452, + "grad_norm": 836.0046997070312, + "learning_rate": 9.96912340813347e-05, + "loss": 2.7384, + "step": 222 + }, + { + "epoch": 0.06285674018744275, + "grad_norm": 1536.01416015625, + "learning_rate": 9.968615692377968e-05, + "loss": 3.2882, + "step": 223 + }, + { + "epoch": 0.06313860897752097, + "grad_norm": 560.0055541992188, + "learning_rate": 9.968103849450593e-05, + "loss": 2.9888, + "step": 224 + }, + { + "epoch": 0.06342047776759918, + "grad_norm": 1392.025634765625, + "learning_rate": 9.967587879776505e-05, + "loss": 2.5862, + "step": 225 + }, + { + "epoch": 0.0637023465576774, + "grad_norm": 1704.0128173828125, + "learning_rate": 9.967067783784296e-05, + "loss": 2.6804, + "step": 226 + }, + { + "epoch": 0.06398421534775561, + "grad_norm": 1232.019775390625, + "learning_rate": 9.966543561905982e-05, + "loss": 3.678, + "step": 227 + }, + { + "epoch": 0.06426608413783384, + "grad_norm": 1224.009521484375, + "learning_rate": 9.966015214577003e-05, + "loss": 2.8695, + "step": 228 + }, + { + "epoch": 0.06454795292791206, + "grad_norm": 1032.00732421875, + "learning_rate": 9.965482742236233e-05, + "loss": 3.1655, + "step": 229 + }, + { + "epoch": 0.06482982171799027, + "grad_norm": 1568.013671875, + "learning_rate": 9.964946145325971e-05, + "loss": 3.7086, + "step": 230 + }, + { + "epoch": 0.06511169050806849, + "grad_norm": 2096.027587890625, + "learning_rate": 9.964405424291938e-05, + "loss": 3.6194, + "step": 231 + }, + { + "epoch": 0.06539355929814672, + "grad_norm": 1080.0123291015625, + "learning_rate": 9.963860579583283e-05, + "loss": 2.993, + "step": 232 + }, + { + "epoch": 0.06567542808822494, + "grad_norm": 1768.0328369140625, + "learning_rate": 9.963311611652582e-05, + "loss": 3.6154, + "step": 233 + }, + { + "epoch": 0.06595729687830315, + "grad_norm": 1592.009521484375, + "learning_rate": 9.96275852095583e-05, + "loss": 2.3096, + "step": 234 + }, + { + "epoch": 0.06623916566838137, + "grad_norm": 2464.031494140625, + "learning_rate": 9.962201307952455e-05, + "loss": 3.3855, + "step": 235 + }, + { + "epoch": 0.06652103445845958, + "grad_norm": 512.0086669921875, + "learning_rate": 9.961639973105306e-05, + "loss": 3.3746, + "step": 236 + }, + { + "epoch": 0.06680290324853781, + "grad_norm": 330.00665283203125, + "learning_rate": 9.961074516880649e-05, + "loss": 2.7502, + "step": 237 + }, + { + "epoch": 0.06708477203861603, + "grad_norm": 668.0062255859375, + "learning_rate": 9.960504939748184e-05, + "loss": 3.3791, + "step": 238 + }, + { + "epoch": 0.06736664082869424, + "grad_norm": 988.0054321289062, + "learning_rate": 9.959931242181028e-05, + "loss": 3.1521, + "step": 239 + }, + { + "epoch": 0.06764850961877246, + "grad_norm": 1744.0166015625, + "learning_rate": 9.959353424655722e-05, + "loss": 3.9929, + "step": 240 + }, + { + "epoch": 0.06793037840885068, + "grad_norm": 1744.01611328125, + "learning_rate": 9.95877148765223e-05, + "loss": 3.4849, + "step": 241 + }, + { + "epoch": 0.0682122471989289, + "grad_norm": 1240.0184326171875, + "learning_rate": 9.958185431653935e-05, + "loss": 3.1818, + "step": 242 + }, + { + "epoch": 0.06849411598900712, + "grad_norm": 1144.0079345703125, + "learning_rate": 9.957595257147643e-05, + "loss": 2.5822, + "step": 243 + }, + { + "epoch": 0.06877598477908534, + "grad_norm": 1224.0166015625, + "learning_rate": 9.957000964623583e-05, + "loss": 3.6024, + "step": 244 + }, + { + "epoch": 0.06905785356916355, + "grad_norm": 868.0048828125, + "learning_rate": 9.956402554575404e-05, + "loss": 3.1835, + "step": 245 + }, + { + "epoch": 0.06933972235924177, + "grad_norm": 1272.009765625, + "learning_rate": 9.955800027500173e-05, + "loss": 2.7265, + "step": 246 + }, + { + "epoch": 0.06962159114932, + "grad_norm": 796.0093994140625, + "learning_rate": 9.955193383898376e-05, + "loss": 2.6884, + "step": 247 + }, + { + "epoch": 0.06990345993939821, + "grad_norm": 872.0044555664062, + "learning_rate": 9.954582624273925e-05, + "loss": 3.021, + "step": 248 + }, + { + "epoch": 0.07018532872947643, + "grad_norm": 1448.010986328125, + "learning_rate": 9.953967749134143e-05, + "loss": 3.1117, + "step": 249 + }, + { + "epoch": 0.07046719751955464, + "grad_norm": 1488.01171875, + "learning_rate": 9.953348758989775e-05, + "loss": 3.0371, + "step": 250 + }, + { + "epoch": 0.07074906630963286, + "grad_norm": 704.00537109375, + "learning_rate": 9.952725654354986e-05, + "loss": 3.2669, + "step": 251 + }, + { + "epoch": 0.07103093509971109, + "grad_norm": 1880.007080078125, + "learning_rate": 9.952098435747354e-05, + "loss": 3.136, + "step": 252 + }, + { + "epoch": 0.0713128038897893, + "grad_norm": 736.0087890625, + "learning_rate": 9.95146710368788e-05, + "loss": 2.7951, + "step": 253 + }, + { + "epoch": 0.07159467267986752, + "grad_norm": 1368.008056640625, + "learning_rate": 9.950831658700976e-05, + "loss": 3.3312, + "step": 254 + }, + { + "epoch": 0.07187654146994574, + "grad_norm": 868.0150756835938, + "learning_rate": 9.950192101314477e-05, + "loss": 2.8603, + "step": 255 + }, + { + "epoch": 0.07215841026002395, + "grad_norm": 1560.0213623046875, + "learning_rate": 9.949548432059628e-05, + "loss": 3.2821, + "step": 256 + }, + { + "epoch": 0.07244027905010218, + "grad_norm": 1112.0252685546875, + "learning_rate": 9.94890065147109e-05, + "loss": 3.6, + "step": 257 + }, + { + "epoch": 0.0727221478401804, + "grad_norm": 1416.0059814453125, + "learning_rate": 9.948248760086944e-05, + "loss": 2.8726, + "step": 258 + }, + { + "epoch": 0.07300401663025861, + "grad_norm": 211.00765991210938, + "learning_rate": 9.94759275844868e-05, + "loss": 2.6796, + "step": 259 + }, + { + "epoch": 0.07328588542033683, + "grad_norm": 1200.0108642578125, + "learning_rate": 9.946932647101205e-05, + "loss": 3.2164, + "step": 260 + }, + { + "epoch": 0.07356775421041505, + "grad_norm": 1888.008056640625, + "learning_rate": 9.946268426592841e-05, + "loss": 2.8853, + "step": 261 + }, + { + "epoch": 0.07384962300049328, + "grad_norm": 1552.0113525390625, + "learning_rate": 9.945600097475321e-05, + "loss": 2.8843, + "step": 262 + }, + { + "epoch": 0.07413149179057149, + "grad_norm": 1392.007080078125, + "learning_rate": 9.944927660303791e-05, + "loss": 2.674, + "step": 263 + }, + { + "epoch": 0.0744133605806497, + "grad_norm": 1648.008544921875, + "learning_rate": 9.944251115636809e-05, + "loss": 2.8589, + "step": 264 + }, + { + "epoch": 0.07469522937072792, + "grad_norm": 1696.00732421875, + "learning_rate": 9.943570464036347e-05, + "loss": 3.4651, + "step": 265 + }, + { + "epoch": 0.07497709816080614, + "grad_norm": 2176.027587890625, + "learning_rate": 9.942885706067786e-05, + "loss": 3.3993, + "step": 266 + }, + { + "epoch": 0.07525896695088437, + "grad_norm": 1968.011474609375, + "learning_rate": 9.942196842299919e-05, + "loss": 3.2937, + "step": 267 + }, + { + "epoch": 0.07554083574096258, + "grad_norm": 2624.011474609375, + "learning_rate": 9.941503873304949e-05, + "loss": 3.0907, + "step": 268 + }, + { + "epoch": 0.0758227045310408, + "grad_norm": 1088.0074462890625, + "learning_rate": 9.940806799658492e-05, + "loss": 3.3619, + "step": 269 + }, + { + "epoch": 0.07610457332111902, + "grad_norm": 768.0106811523438, + "learning_rate": 9.940105621939568e-05, + "loss": 3.2139, + "step": 270 + }, + { + "epoch": 0.07638644211119723, + "grad_norm": 1168.0081787109375, + "learning_rate": 9.939400340730611e-05, + "loss": 2.7424, + "step": 271 + }, + { + "epoch": 0.07666831090127546, + "grad_norm": 1368.0074462890625, + "learning_rate": 9.938690956617463e-05, + "loss": 2.7837, + "step": 272 + }, + { + "epoch": 0.07695017969135368, + "grad_norm": 644.0048828125, + "learning_rate": 9.937977470189368e-05, + "loss": 2.7814, + "step": 273 + }, + { + "epoch": 0.07723204848143189, + "grad_norm": 860.0054931640625, + "learning_rate": 9.937259882038985e-05, + "loss": 2.7815, + "step": 274 + }, + { + "epoch": 0.07751391727151011, + "grad_norm": 832.00537109375, + "learning_rate": 9.936538192762377e-05, + "loss": 2.6158, + "step": 275 + }, + { + "epoch": 0.07779578606158834, + "grad_norm": 812.0056762695312, + "learning_rate": 9.935812402959015e-05, + "loss": 2.7814, + "step": 276 + }, + { + "epoch": 0.07807765485166655, + "grad_norm": 928.0069580078125, + "learning_rate": 9.935082513231775e-05, + "loss": 2.6391, + "step": 277 + }, + { + "epoch": 0.07835952364174477, + "grad_norm": 764.0093383789062, + "learning_rate": 9.934348524186936e-05, + "loss": 2.5998, + "step": 278 + }, + { + "epoch": 0.07864139243182298, + "grad_norm": 1552.0086669921875, + "learning_rate": 9.933610436434186e-05, + "loss": 2.9766, + "step": 279 + }, + { + "epoch": 0.0789232612219012, + "grad_norm": 1664.009033203125, + "learning_rate": 9.932868250586619e-05, + "loss": 3.4543, + "step": 280 + }, + { + "epoch": 0.07920513001197943, + "grad_norm": 2240.0068359375, + "learning_rate": 9.932121967260727e-05, + "loss": 4.2736, + "step": 281 + }, + { + "epoch": 0.07948699880205765, + "grad_norm": 1216.0054931640625, + "learning_rate": 9.93137158707641e-05, + "loss": 3.0386, + "step": 282 + }, + { + "epoch": 0.07976886759213586, + "grad_norm": 1096.0096435546875, + "learning_rate": 9.93061711065697e-05, + "loss": 2.8758, + "step": 283 + }, + { + "epoch": 0.08005073638221408, + "grad_norm": 944.0118408203125, + "learning_rate": 9.929858538629111e-05, + "loss": 2.6805, + "step": 284 + }, + { + "epoch": 0.08033260517229229, + "grad_norm": 1128.00732421875, + "learning_rate": 9.929095871622942e-05, + "loss": 2.8791, + "step": 285 + }, + { + "epoch": 0.08061447396237052, + "grad_norm": 1024.0089111328125, + "learning_rate": 9.928329110271968e-05, + "loss": 2.952, + "step": 286 + }, + { + "epoch": 0.08089634275244874, + "grad_norm": 1744.008544921875, + "learning_rate": 9.927558255213098e-05, + "loss": 2.7007, + "step": 287 + }, + { + "epoch": 0.08117821154252695, + "grad_norm": 1176.0054931640625, + "learning_rate": 9.926783307086644e-05, + "loss": 2.6887, + "step": 288 + }, + { + "epoch": 0.08146008033260517, + "grad_norm": 900.0033569335938, + "learning_rate": 9.926004266536313e-05, + "loss": 2.5497, + "step": 289 + }, + { + "epoch": 0.08174194912268339, + "grad_norm": 1064.0068359375, + "learning_rate": 9.925221134209213e-05, + "loss": 2.6804, + "step": 290 + }, + { + "epoch": 0.08202381791276162, + "grad_norm": 1216.005615234375, + "learning_rate": 9.924433910755855e-05, + "loss": 3.7049, + "step": 291 + }, + { + "epoch": 0.08230568670283983, + "grad_norm": 1712.005126953125, + "learning_rate": 9.923642596830141e-05, + "loss": 3.0197, + "step": 292 + }, + { + "epoch": 0.08258755549291805, + "grad_norm": 1704.0086669921875, + "learning_rate": 9.922847193089377e-05, + "loss": 3.2649, + "step": 293 + }, + { + "epoch": 0.08286942428299626, + "grad_norm": 808.006591796875, + "learning_rate": 9.922047700194264e-05, + "loss": 3.0914, + "step": 294 + }, + { + "epoch": 0.08315129307307448, + "grad_norm": 600.0067749023438, + "learning_rate": 9.921244118808896e-05, + "loss": 2.941, + "step": 295 + }, + { + "epoch": 0.08343316186315271, + "grad_norm": 2352.011962890625, + "learning_rate": 9.920436449600771e-05, + "loss": 4.0959, + "step": 296 + }, + { + "epoch": 0.08371503065323092, + "grad_norm": 1632.0120849609375, + "learning_rate": 9.919624693240774e-05, + "loss": 3.1232, + "step": 297 + }, + { + "epoch": 0.08399689944330914, + "grad_norm": 620.0032958984375, + "learning_rate": 9.918808850403192e-05, + "loss": 2.8176, + "step": 298 + }, + { + "epoch": 0.08427876823338736, + "grad_norm": 1864.007568359375, + "learning_rate": 9.917988921765702e-05, + "loss": 3.0051, + "step": 299 + }, + { + "epoch": 0.08456063702346557, + "grad_norm": 1192.0057373046875, + "learning_rate": 9.917164908009375e-05, + "loss": 3.0327, + "step": 300 + }, + { + "epoch": 0.0848425058135438, + "grad_norm": 2080.029052734375, + "learning_rate": 9.916336809818678e-05, + "loss": 3.2499, + "step": 301 + }, + { + "epoch": 0.08512437460362202, + "grad_norm": 2112.02001953125, + "learning_rate": 9.91550462788147e-05, + "loss": 2.9172, + "step": 302 + }, + { + "epoch": 0.08540624339370023, + "grad_norm": 1440.010498046875, + "learning_rate": 9.914668362889002e-05, + "loss": 3.9365, + "step": 303 + }, + { + "epoch": 0.08568811218377845, + "grad_norm": 1272.0047607421875, + "learning_rate": 9.913828015535913e-05, + "loss": 2.6525, + "step": 304 + }, + { + "epoch": 0.08596998097385666, + "grad_norm": 1312.0128173828125, + "learning_rate": 9.912983586520239e-05, + "loss": 3.5568, + "step": 305 + }, + { + "epoch": 0.08625184976393489, + "grad_norm": 1424.0076904296875, + "learning_rate": 9.912135076543401e-05, + "loss": 3.0154, + "step": 306 + }, + { + "epoch": 0.08653371855401311, + "grad_norm": 1888.020751953125, + "learning_rate": 9.911282486310213e-05, + "loss": 3.2831, + "step": 307 + }, + { + "epoch": 0.08681558734409132, + "grad_norm": 394.0055847167969, + "learning_rate": 9.910425816528881e-05, + "loss": 2.9569, + "step": 308 + }, + { + "epoch": 0.08709745613416954, + "grad_norm": 764.0032958984375, + "learning_rate": 9.909565067910992e-05, + "loss": 3.0038, + "step": 309 + }, + { + "epoch": 0.08737932492424776, + "grad_norm": 1200.0062255859375, + "learning_rate": 9.908700241171527e-05, + "loss": 3.006, + "step": 310 + }, + { + "epoch": 0.08766119371432599, + "grad_norm": 108.51055908203125, + "learning_rate": 9.907831337028853e-05, + "loss": 2.422, + "step": 311 + }, + { + "epoch": 0.0879430625044042, + "grad_norm": 2000.024169921875, + "learning_rate": 9.906958356204722e-05, + "loss": 3.3242, + "step": 312 + }, + { + "epoch": 0.08822493129448242, + "grad_norm": 2320.006591796875, + "learning_rate": 9.906081299424276e-05, + "loss": 3.5269, + "step": 313 + }, + { + "epoch": 0.08850680008456063, + "grad_norm": 1072.0074462890625, + "learning_rate": 9.90520016741604e-05, + "loss": 3.3849, + "step": 314 + }, + { + "epoch": 0.08878866887463886, + "grad_norm": 458.00787353515625, + "learning_rate": 9.904314960911925e-05, + "loss": 2.6863, + "step": 315 + }, + { + "epoch": 0.08907053766471708, + "grad_norm": 1304.011962890625, + "learning_rate": 9.903425680647225e-05, + "loss": 2.8595, + "step": 316 + }, + { + "epoch": 0.0893524064547953, + "grad_norm": 1120.0054931640625, + "learning_rate": 9.90253232736062e-05, + "loss": 2.7411, + "step": 317 + }, + { + "epoch": 0.08963427524487351, + "grad_norm": 1304.00830078125, + "learning_rate": 9.901634901794172e-05, + "loss": 2.7814, + "step": 318 + }, + { + "epoch": 0.08991614403495173, + "grad_norm": 832.0069580078125, + "learning_rate": 9.900733404693327e-05, + "loss": 3.2007, + "step": 319 + }, + { + "epoch": 0.09019801282502996, + "grad_norm": 624.0113525390625, + "learning_rate": 9.899827836806912e-05, + "loss": 3.0886, + "step": 320 + }, + { + "epoch": 0.09047988161510817, + "grad_norm": 1472.0166015625, + "learning_rate": 9.898918198887133e-05, + "loss": 2.7502, + "step": 321 + }, + { + "epoch": 0.09076175040518639, + "grad_norm": 1080.013427734375, + "learning_rate": 9.898004491689582e-05, + "loss": 3.176, + "step": 322 + }, + { + "epoch": 0.0910436191952646, + "grad_norm": 1432.0057373046875, + "learning_rate": 9.897086715973227e-05, + "loss": 2.6461, + "step": 323 + }, + { + "epoch": 0.09132548798534282, + "grad_norm": 1168.009033203125, + "learning_rate": 9.896164872500417e-05, + "loss": 2.8779, + "step": 324 + }, + { + "epoch": 0.09160735677542105, + "grad_norm": 1024.0118408203125, + "learning_rate": 9.895238962036878e-05, + "loss": 3.5176, + "step": 325 + }, + { + "epoch": 0.09188922556549926, + "grad_norm": 1024.015625, + "learning_rate": 9.89430898535172e-05, + "loss": 3.1278, + "step": 326 + }, + { + "epoch": 0.09217109435557748, + "grad_norm": 1080.0111083984375, + "learning_rate": 9.893374943217422e-05, + "loss": 3.3753, + "step": 327 + }, + { + "epoch": 0.0924529631456557, + "grad_norm": 1392.0198974609375, + "learning_rate": 9.892436836409845e-05, + "loss": 3.1897, + "step": 328 + }, + { + "epoch": 0.09273483193573391, + "grad_norm": 1704.0084228515625, + "learning_rate": 9.89149466570823e-05, + "loss": 3.3362, + "step": 329 + }, + { + "epoch": 0.09301670072581214, + "grad_norm": 1408.0067138671875, + "learning_rate": 9.890548431895183e-05, + "loss": 3.6424, + "step": 330 + }, + { + "epoch": 0.09329856951589036, + "grad_norm": 1048.0096435546875, + "learning_rate": 9.8895981357567e-05, + "loss": 3.3582, + "step": 331 + }, + { + "epoch": 0.09358043830596857, + "grad_norm": 1048.0052490234375, + "learning_rate": 9.888643778082133e-05, + "loss": 2.6863, + "step": 332 + }, + { + "epoch": 0.09386230709604679, + "grad_norm": 2024.0157470703125, + "learning_rate": 9.887685359664226e-05, + "loss": 3.7032, + "step": 333 + }, + { + "epoch": 0.094144175886125, + "grad_norm": 414.00421142578125, + "learning_rate": 9.886722881299081e-05, + "loss": 2.8606, + "step": 334 + }, + { + "epoch": 0.09442604467620323, + "grad_norm": 1020.0057373046875, + "learning_rate": 9.885756343786183e-05, + "loss": 3.6095, + "step": 335 + }, + { + "epoch": 0.09470791346628145, + "grad_norm": 1264.0093994140625, + "learning_rate": 9.884785747928384e-05, + "loss": 3.0046, + "step": 336 + }, + { + "epoch": 0.09498978225635966, + "grad_norm": 944.0106811523438, + "learning_rate": 9.883811094531906e-05, + "loss": 2.9376, + "step": 337 + }, + { + "epoch": 0.09527165104643788, + "grad_norm": 520.0059814453125, + "learning_rate": 9.882832384406345e-05, + "loss": 3.058, + "step": 338 + }, + { + "epoch": 0.0955535198365161, + "grad_norm": 972.0081176757812, + "learning_rate": 9.881849618364665e-05, + "loss": 3.1114, + "step": 339 + }, + { + "epoch": 0.09583538862659433, + "grad_norm": 1232.005126953125, + "learning_rate": 9.880862797223197e-05, + "loss": 2.9952, + "step": 340 + }, + { + "epoch": 0.09611725741667254, + "grad_norm": 1648.013916015625, + "learning_rate": 9.879871921801644e-05, + "loss": 2.5567, + "step": 341 + }, + { + "epoch": 0.09639912620675076, + "grad_norm": 2032.011474609375, + "learning_rate": 9.878876992923074e-05, + "loss": 4.1814, + "step": 342 + }, + { + "epoch": 0.09668099499682897, + "grad_norm": 1120.0133056640625, + "learning_rate": 9.877878011413923e-05, + "loss": 3.0895, + "step": 343 + }, + { + "epoch": 0.09696286378690719, + "grad_norm": 2176.0263671875, + "learning_rate": 9.876874978103995e-05, + "loss": 3.254, + "step": 344 + }, + { + "epoch": 0.09724473257698542, + "grad_norm": 1632.0115966796875, + "learning_rate": 9.875867893826454e-05, + "loss": 3.8725, + "step": 345 + }, + { + "epoch": 0.09752660136706363, + "grad_norm": 1048.0107421875, + "learning_rate": 9.874856759417836e-05, + "loss": 2.8319, + "step": 346 + }, + { + "epoch": 0.09780847015714185, + "grad_norm": 900.0040893554688, + "learning_rate": 9.873841575718038e-05, + "loss": 3.0015, + "step": 347 + }, + { + "epoch": 0.09809033894722007, + "grad_norm": 920.0099487304688, + "learning_rate": 9.872822343570319e-05, + "loss": 2.9483, + "step": 348 + }, + { + "epoch": 0.09837220773729828, + "grad_norm": 1960.009521484375, + "learning_rate": 9.871799063821303e-05, + "loss": 3.2247, + "step": 349 + }, + { + "epoch": 0.09865407652737651, + "grad_norm": 2128.00732421875, + "learning_rate": 9.870771737320976e-05, + "loss": 2.9484, + "step": 350 + }, + { + "epoch": 0.09893594531745473, + "grad_norm": 330.0079345703125, + "learning_rate": 9.869740364922684e-05, + "loss": 2.7146, + "step": 351 + }, + { + "epoch": 0.09921781410753294, + "grad_norm": 190.01210021972656, + "learning_rate": 9.868704947483134e-05, + "loss": 2.9674, + "step": 352 + }, + { + "epoch": 0.09949968289761116, + "grad_norm": 1560.018310546875, + "learning_rate": 9.867665485862397e-05, + "loss": 3.057, + "step": 353 + }, + { + "epoch": 0.09978155168768937, + "grad_norm": 1432.00634765625, + "learning_rate": 9.866621980923896e-05, + "loss": 2.5479, + "step": 354 + }, + { + "epoch": 0.1000634204777676, + "grad_norm": 1664.0076904296875, + "learning_rate": 9.86557443353442e-05, + "loss": 3.0492, + "step": 355 + }, + { + "epoch": 0.10034528926784582, + "grad_norm": 1512.0074462890625, + "learning_rate": 9.86452284456411e-05, + "loss": 2.7502, + "step": 356 + }, + { + "epoch": 0.10062715805792403, + "grad_norm": 1112.009033203125, + "learning_rate": 9.863467214886466e-05, + "loss": 2.7698, + "step": 357 + }, + { + "epoch": 0.10090902684800225, + "grad_norm": 788.007080078125, + "learning_rate": 9.862407545378348e-05, + "loss": 2.8294, + "step": 358 + }, + { + "epoch": 0.10119089563808048, + "grad_norm": 2048.00537109375, + "learning_rate": 9.861343836919965e-05, + "loss": 2.7579, + "step": 359 + }, + { + "epoch": 0.1014727644281587, + "grad_norm": 1456.0169677734375, + "learning_rate": 9.860276090394889e-05, + "loss": 2.9658, + "step": 360 + }, + { + "epoch": 0.10175463321823691, + "grad_norm": 1120.0069580078125, + "learning_rate": 9.859204306690037e-05, + "loss": 3.0882, + "step": 361 + }, + { + "epoch": 0.10203650200831513, + "grad_norm": 1012.00537109375, + "learning_rate": 9.858128486695687e-05, + "loss": 2.7085, + "step": 362 + }, + { + "epoch": 0.10231837079839334, + "grad_norm": 664.0088500976562, + "learning_rate": 9.857048631305466e-05, + "loss": 3.021, + "step": 363 + }, + { + "epoch": 0.10260023958847157, + "grad_norm": 1064.0069580078125, + "learning_rate": 9.855964741416354e-05, + "loss": 3.5172, + "step": 364 + }, + { + "epoch": 0.10288210837854979, + "grad_norm": 1264.0166015625, + "learning_rate": 9.854876817928684e-05, + "loss": 2.9387, + "step": 365 + }, + { + "epoch": 0.103163977168628, + "grad_norm": 584.0070190429688, + "learning_rate": 9.853784861746136e-05, + "loss": 2.7815, + "step": 366 + }, + { + "epoch": 0.10344584595870622, + "grad_norm": 1328.00439453125, + "learning_rate": 9.85268887377574e-05, + "loss": 2.9699, + "step": 367 + }, + { + "epoch": 0.10372771474878444, + "grad_norm": 520.0093383789062, + "learning_rate": 9.85158885492788e-05, + "loss": 2.6877, + "step": 368 + }, + { + "epoch": 0.10400958353886267, + "grad_norm": 772.0089721679688, + "learning_rate": 9.850484806116283e-05, + "loss": 2.7502, + "step": 369 + }, + { + "epoch": 0.10429145232894088, + "grad_norm": 1768.011962890625, + "learning_rate": 9.849376728258024e-05, + "loss": 2.7814, + "step": 370 + }, + { + "epoch": 0.1045733211190191, + "grad_norm": 1512.0123291015625, + "learning_rate": 9.848264622273527e-05, + "loss": 2.7501, + "step": 371 + }, + { + "epoch": 0.10485518990909731, + "grad_norm": 1128.013671875, + "learning_rate": 9.847148489086562e-05, + "loss": 3.073, + "step": 372 + }, + { + "epoch": 0.10513705869917553, + "grad_norm": 1176.00732421875, + "learning_rate": 9.846028329624241e-05, + "loss": 2.7813, + "step": 373 + }, + { + "epoch": 0.10541892748925376, + "grad_norm": 1688.0167236328125, + "learning_rate": 9.844904144817026e-05, + "loss": 3.1381, + "step": 374 + }, + { + "epoch": 0.10570079627933197, + "grad_norm": 668.01123046875, + "learning_rate": 9.843775935598714e-05, + "loss": 3.0197, + "step": 375 + }, + { + "epoch": 0.10598266506941019, + "grad_norm": 612.0072631835938, + "learning_rate": 9.842643702906453e-05, + "loss": 2.8585, + "step": 376 + }, + { + "epoch": 0.1062645338594884, + "grad_norm": 1504.0162353515625, + "learning_rate": 9.841507447680732e-05, + "loss": 3.0262, + "step": 377 + }, + { + "epoch": 0.10654640264956662, + "grad_norm": 1344.0096435546875, + "learning_rate": 9.840367170865375e-05, + "loss": 4.0079, + "step": 378 + }, + { + "epoch": 0.10682827143964485, + "grad_norm": 159.00929260253906, + "learning_rate": 9.839222873407553e-05, + "loss": 2.8897, + "step": 379 + }, + { + "epoch": 0.10711014022972307, + "grad_norm": 2176.011474609375, + "learning_rate": 9.838074556257774e-05, + "loss": 3.1231, + "step": 380 + }, + { + "epoch": 0.10739200901980128, + "grad_norm": 1408.010986328125, + "learning_rate": 9.836922220369889e-05, + "loss": 2.6646, + "step": 381 + }, + { + "epoch": 0.1076738778098795, + "grad_norm": 1976.017578125, + "learning_rate": 9.835765866701079e-05, + "loss": 3.1854, + "step": 382 + }, + { + "epoch": 0.10795574659995771, + "grad_norm": 688.0047607421875, + "learning_rate": 9.83460549621187e-05, + "loss": 2.7813, + "step": 383 + }, + { + "epoch": 0.10823761539003594, + "grad_norm": 616.0049438476562, + "learning_rate": 9.833441109866121e-05, + "loss": 2.8898, + "step": 384 + }, + { + "epoch": 0.10851948418011416, + "grad_norm": 764.0093383789062, + "learning_rate": 9.832272708631028e-05, + "loss": 3.1381, + "step": 385 + }, + { + "epoch": 0.10880135297019237, + "grad_norm": 1232.0052490234375, + "learning_rate": 9.83110029347712e-05, + "loss": 2.7501, + "step": 386 + }, + { + "epoch": 0.10908322176027059, + "grad_norm": 2080.005126953125, + "learning_rate": 9.829923865378264e-05, + "loss": 3.0801, + "step": 387 + }, + { + "epoch": 0.1093650905503488, + "grad_norm": 688.0045166015625, + "learning_rate": 9.828743425311653e-05, + "loss": 2.7813, + "step": 388 + }, + { + "epoch": 0.10964695934042704, + "grad_norm": 1224.01123046875, + "learning_rate": 9.827558974257823e-05, + "loss": 3.1095, + "step": 389 + }, + { + "epoch": 0.10992882813050525, + "grad_norm": 1680.0076904296875, + "learning_rate": 9.826370513200635e-05, + "loss": 2.7025, + "step": 390 + }, + { + "epoch": 0.11021069692058347, + "grad_norm": 1856.01123046875, + "learning_rate": 9.82517804312728e-05, + "loss": 3.0209, + "step": 391 + }, + { + "epoch": 0.11049256571066168, + "grad_norm": 1712.0189208984375, + "learning_rate": 9.823981565028282e-05, + "loss": 2.8275, + "step": 392 + }, + { + "epoch": 0.1107744345007399, + "grad_norm": 832.0068969726562, + "learning_rate": 9.822781079897494e-05, + "loss": 2.5821, + "step": 393 + }, + { + "epoch": 0.11105630329081813, + "grad_norm": 1040.0087890625, + "learning_rate": 9.821576588732095e-05, + "loss": 3.2947, + "step": 394 + }, + { + "epoch": 0.11133817208089634, + "grad_norm": 1392.0091552734375, + "learning_rate": 9.820368092532597e-05, + "loss": 3.5176, + "step": 395 + }, + { + "epoch": 0.11162004087097456, + "grad_norm": 1376.0047607421875, + "learning_rate": 9.819155592302834e-05, + "loss": 3.0314, + "step": 396 + }, + { + "epoch": 0.11190190966105278, + "grad_norm": 640.0049438476562, + "learning_rate": 9.817939089049964e-05, + "loss": 3.5255, + "step": 397 + }, + { + "epoch": 0.112183778451131, + "grad_norm": 908.0057983398438, + "learning_rate": 9.816718583784477e-05, + "loss": 2.7934, + "step": 398 + }, + { + "epoch": 0.11246564724120922, + "grad_norm": 2048.005126953125, + "learning_rate": 9.815494077520184e-05, + "loss": 2.9881, + "step": 399 + }, + { + "epoch": 0.11274751603128744, + "grad_norm": 1856.005859375, + "learning_rate": 9.814265571274214e-05, + "loss": 2.8784, + "step": 400 + }, + { + "epoch": 0.11302938482136565, + "grad_norm": 1576.0076904296875, + "learning_rate": 9.813033066067028e-05, + "loss": 2.9067, + "step": 401 + }, + { + "epoch": 0.11331125361144387, + "grad_norm": 776.0106201171875, + "learning_rate": 9.811796562922404e-05, + "loss": 2.473, + "step": 402 + }, + { + "epoch": 0.1135931224015221, + "grad_norm": 1256.018798828125, + "learning_rate": 9.81055606286744e-05, + "loss": 3.3988, + "step": 403 + }, + { + "epoch": 0.11387499119160031, + "grad_norm": 1064.0078125, + "learning_rate": 9.809311566932556e-05, + "loss": 3.0804, + "step": 404 + }, + { + "epoch": 0.11415685998167853, + "grad_norm": 1248.0150146484375, + "learning_rate": 9.808063076151487e-05, + "loss": 3.0211, + "step": 405 + }, + { + "epoch": 0.11443872877175675, + "grad_norm": 632.0076904296875, + "learning_rate": 9.806810591561295e-05, + "loss": 2.7814, + "step": 406 + }, + { + "epoch": 0.11472059756183496, + "grad_norm": 648.0088500976562, + "learning_rate": 9.805554114202351e-05, + "loss": 2.8899, + "step": 407 + }, + { + "epoch": 0.11500246635191319, + "grad_norm": 820.0057373046875, + "learning_rate": 9.804293645118345e-05, + "loss": 3.0002, + "step": 408 + }, + { + "epoch": 0.1152843351419914, + "grad_norm": 1192.0068359375, + "learning_rate": 9.803029185356286e-05, + "loss": 2.7945, + "step": 409 + }, + { + "epoch": 0.11556620393206962, + "grad_norm": 1592.0093994140625, + "learning_rate": 9.801760735966494e-05, + "loss": 3.4299, + "step": 410 + }, + { + "epoch": 0.11584807272214784, + "grad_norm": 700.0099487304688, + "learning_rate": 9.800488298002604e-05, + "loss": 2.7535, + "step": 411 + }, + { + "epoch": 0.11612994151222605, + "grad_norm": 1968.0103759765625, + "learning_rate": 9.799211872521565e-05, + "loss": 3.2964, + "step": 412 + }, + { + "epoch": 0.11641181030230428, + "grad_norm": 1416.010498046875, + "learning_rate": 9.797931460583636e-05, + "loss": 3.3127, + "step": 413 + }, + { + "epoch": 0.1166936790923825, + "grad_norm": 864.0117797851562, + "learning_rate": 9.796647063252391e-05, + "loss": 3.5063, + "step": 414 + }, + { + "epoch": 0.11697554788246071, + "grad_norm": 1288.0146484375, + "learning_rate": 9.79535868159471e-05, + "loss": 3.007, + "step": 415 + }, + { + "epoch": 0.11725741667253893, + "grad_norm": 1120.0062255859375, + "learning_rate": 9.794066316680787e-05, + "loss": 2.5937, + "step": 416 + }, + { + "epoch": 0.11753928546261715, + "grad_norm": 2288.025146484375, + "learning_rate": 9.792769969584124e-05, + "loss": 3.1076, + "step": 417 + }, + { + "epoch": 0.11782115425269538, + "grad_norm": 620.01025390625, + "learning_rate": 9.791469641381527e-05, + "loss": 2.7144, + "step": 418 + }, + { + "epoch": 0.11810302304277359, + "grad_norm": 1616.0123291015625, + "learning_rate": 9.79016533315311e-05, + "loss": 4.3888, + "step": 419 + }, + { + "epoch": 0.11838489183285181, + "grad_norm": 816.0095825195312, + "learning_rate": 9.788857045982297e-05, + "loss": 2.6051, + "step": 420 + }, + { + "epoch": 0.11866676062293002, + "grad_norm": 2608.019287109375, + "learning_rate": 9.787544780955815e-05, + "loss": 3.5803, + "step": 421 + }, + { + "epoch": 0.11894862941300824, + "grad_norm": 350.0066833496094, + "learning_rate": 9.786228539163691e-05, + "loss": 3.0142, + "step": 422 + }, + { + "epoch": 0.11923049820308647, + "grad_norm": 1072.004150390625, + "learning_rate": 9.784908321699263e-05, + "loss": 3.2159, + "step": 423 + }, + { + "epoch": 0.11951236699316468, + "grad_norm": 884.006591796875, + "learning_rate": 9.783584129659162e-05, + "loss": 3.0604, + "step": 424 + }, + { + "epoch": 0.1197942357832429, + "grad_norm": 1688.0206298828125, + "learning_rate": 9.782255964143332e-05, + "loss": 3.0675, + "step": 425 + }, + { + "epoch": 0.12007610457332112, + "grad_norm": 940.0105590820312, + "learning_rate": 9.780923826255008e-05, + "loss": 3.1459, + "step": 426 + }, + { + "epoch": 0.12035797336339933, + "grad_norm": 1264.0167236328125, + "learning_rate": 9.779587717100729e-05, + "loss": 3.0731, + "step": 427 + }, + { + "epoch": 0.12063984215347756, + "grad_norm": 1360.01708984375, + "learning_rate": 9.778247637790332e-05, + "loss": 2.7814, + "step": 428 + }, + { + "epoch": 0.12092171094355578, + "grad_norm": 2304.023193359375, + "learning_rate": 9.77690358943695e-05, + "loss": 3.5317, + "step": 429 + }, + { + "epoch": 0.12120357973363399, + "grad_norm": 1112.0120849609375, + "learning_rate": 9.775555573157015e-05, + "loss": 3.1766, + "step": 430 + }, + { + "epoch": 0.12148544852371221, + "grad_norm": 1112.0057373046875, + "learning_rate": 9.774203590070255e-05, + "loss": 2.8377, + "step": 431 + }, + { + "epoch": 0.12176731731379042, + "grad_norm": 1720.0084228515625, + "learning_rate": 9.772847641299691e-05, + "loss": 3.8491, + "step": 432 + }, + { + "epoch": 0.12204918610386865, + "grad_norm": 1448.00732421875, + "learning_rate": 9.771487727971641e-05, + "loss": 3.5193, + "step": 433 + }, + { + "epoch": 0.12233105489394687, + "grad_norm": 2048.013427734375, + "learning_rate": 9.770123851215713e-05, + "loss": 3.0851, + "step": 434 + }, + { + "epoch": 0.12261292368402509, + "grad_norm": 888.0106201171875, + "learning_rate": 9.768756012164811e-05, + "loss": 3.0196, + "step": 435 + }, + { + "epoch": 0.1228947924741033, + "grad_norm": 556.00390625, + "learning_rate": 9.767384211955127e-05, + "loss": 3.4245, + "step": 436 + }, + { + "epoch": 0.12317666126418152, + "grad_norm": 796.0034790039062, + "learning_rate": 9.766008451726145e-05, + "loss": 2.8299, + "step": 437 + }, + { + "epoch": 0.12345853005425975, + "grad_norm": 592.00732421875, + "learning_rate": 9.764628732620636e-05, + "loss": 3.2006, + "step": 438 + }, + { + "epoch": 0.12374039884433796, + "grad_norm": 1704.0079345703125, + "learning_rate": 9.763245055784662e-05, + "loss": 3.2976, + "step": 439 + }, + { + "epoch": 0.12402226763441618, + "grad_norm": 1504.0074462890625, + "learning_rate": 9.761857422367573e-05, + "loss": 3.2143, + "step": 440 + }, + { + "epoch": 0.1243041364244944, + "grad_norm": 1488.0072021484375, + "learning_rate": 9.760465833522002e-05, + "loss": 3.0565, + "step": 441 + }, + { + "epoch": 0.12458600521457262, + "grad_norm": 1016.0110473632812, + "learning_rate": 9.759070290403872e-05, + "loss": 2.8501, + "step": 442 + }, + { + "epoch": 0.12486787400465084, + "grad_norm": 1096.01123046875, + "learning_rate": 9.75767079417239e-05, + "loss": 3.6723, + "step": 443 + }, + { + "epoch": 0.12514974279472904, + "grad_norm": 2016.02001953125, + "learning_rate": 9.75626734599004e-05, + "loss": 3.373, + "step": 444 + }, + { + "epoch": 0.12543161158480728, + "grad_norm": 1136.0047607421875, + "learning_rate": 9.754859947022596e-05, + "loss": 3.1036, + "step": 445 + }, + { + "epoch": 0.1257134803748855, + "grad_norm": 434.0086364746094, + "learning_rate": 9.753448598439112e-05, + "loss": 2.9509, + "step": 446 + }, + { + "epoch": 0.12599534916496372, + "grad_norm": 524.00390625, + "learning_rate": 9.752033301411925e-05, + "loss": 2.4796, + "step": 447 + }, + { + "epoch": 0.12627721795504193, + "grad_norm": 1688.0150146484375, + "learning_rate": 9.750614057116643e-05, + "loss": 3.4865, + "step": 448 + }, + { + "epoch": 0.12655908674512015, + "grad_norm": 512.0062255859375, + "learning_rate": 9.749190866732164e-05, + "loss": 3.0644, + "step": 449 + }, + { + "epoch": 0.12684095553519836, + "grad_norm": 520.0073852539062, + "learning_rate": 9.747763731440656e-05, + "loss": 3.1349, + "step": 450 + }, + { + "epoch": 0.12712282432527658, + "grad_norm": 1168.0115966796875, + "learning_rate": 9.746332652427566e-05, + "loss": 3.1281, + "step": 451 + }, + { + "epoch": 0.1274046931153548, + "grad_norm": 1040.0064697265625, + "learning_rate": 9.744897630881619e-05, + "loss": 2.7248, + "step": 452 + }, + { + "epoch": 0.127686561905433, + "grad_norm": 1808.00830078125, + "learning_rate": 9.743458667994811e-05, + "loss": 3.4248, + "step": 453 + }, + { + "epoch": 0.12796843069551123, + "grad_norm": 1192.014892578125, + "learning_rate": 9.742015764962416e-05, + "loss": 3.6449, + "step": 454 + }, + { + "epoch": 0.12825029948558947, + "grad_norm": 772.0075073242188, + "learning_rate": 9.74056892298298e-05, + "loss": 2.9513, + "step": 455 + }, + { + "epoch": 0.12853216827566769, + "grad_norm": 1496.0162353515625, + "learning_rate": 9.739118143258317e-05, + "loss": 2.8682, + "step": 456 + }, + { + "epoch": 0.1288140370657459, + "grad_norm": 1024.0101318359375, + "learning_rate": 9.737663426993513e-05, + "loss": 2.9807, + "step": 457 + }, + { + "epoch": 0.12909590585582412, + "grad_norm": 1592.005126953125, + "learning_rate": 9.736204775396931e-05, + "loss": 2.8074, + "step": 458 + }, + { + "epoch": 0.12937777464590233, + "grad_norm": 752.0069580078125, + "learning_rate": 9.734742189680193e-05, + "loss": 2.1964, + "step": 459 + }, + { + "epoch": 0.12965964343598055, + "grad_norm": 1336.010009765625, + "learning_rate": 9.733275671058195e-05, + "loss": 3.0458, + "step": 460 + }, + { + "epoch": 0.12994151222605876, + "grad_norm": 2464.0263671875, + "learning_rate": 9.731805220749097e-05, + "loss": 3.4776, + "step": 461 + }, + { + "epoch": 0.13022338101613698, + "grad_norm": 1624.013427734375, + "learning_rate": 9.730330839974328e-05, + "loss": 3.0707, + "step": 462 + }, + { + "epoch": 0.1305052498062152, + "grad_norm": 1080.0068359375, + "learning_rate": 9.728852529958579e-05, + "loss": 3.2425, + "step": 463 + }, + { + "epoch": 0.13078711859629344, + "grad_norm": 1384.0054931640625, + "learning_rate": 9.727370291929803e-05, + "loss": 2.6357, + "step": 464 + }, + { + "epoch": 0.13106898738637165, + "grad_norm": 1904.0118408203125, + "learning_rate": 9.725884127119223e-05, + "loss": 2.7838, + "step": 465 + }, + { + "epoch": 0.13135085617644987, + "grad_norm": 848.0050659179688, + "learning_rate": 9.724394036761315e-05, + "loss": 2.8259, + "step": 466 + }, + { + "epoch": 0.1316327249665281, + "grad_norm": 394.0042724609375, + "learning_rate": 9.722900022093822e-05, + "loss": 2.9826, + "step": 467 + }, + { + "epoch": 0.1319145937566063, + "grad_norm": 1312.0052490234375, + "learning_rate": 9.721402084357744e-05, + "loss": 2.7085, + "step": 468 + }, + { + "epoch": 0.13219646254668452, + "grad_norm": 1792.0140380859375, + "learning_rate": 9.71990022479734e-05, + "loss": 3.2007, + "step": 469 + }, + { + "epoch": 0.13247833133676273, + "grad_norm": 932.003173828125, + "learning_rate": 9.718394444660128e-05, + "loss": 2.6151, + "step": 470 + }, + { + "epoch": 0.13276020012684095, + "grad_norm": 956.00390625, + "learning_rate": 9.716884745196883e-05, + "loss": 2.8898, + "step": 471 + }, + { + "epoch": 0.13304206891691917, + "grad_norm": 416.0052490234375, + "learning_rate": 9.715371127661631e-05, + "loss": 2.7813, + "step": 472 + }, + { + "epoch": 0.13332393770699738, + "grad_norm": 592.0042724609375, + "learning_rate": 9.713853593311657e-05, + "loss": 2.8898, + "step": 473 + }, + { + "epoch": 0.13360580649707562, + "grad_norm": 1672.007568359375, + "learning_rate": 9.712332143407499e-05, + "loss": 2.9402, + "step": 474 + }, + { + "epoch": 0.13388767528715384, + "grad_norm": 748.0035400390625, + "learning_rate": 9.710806779212946e-05, + "loss": 2.6446, + "step": 475 + }, + { + "epoch": 0.13416954407723206, + "grad_norm": 764.0061645507812, + "learning_rate": 9.70927750199504e-05, + "loss": 2.493, + "step": 476 + }, + { + "epoch": 0.13445141286731027, + "grad_norm": 1432.0128173828125, + "learning_rate": 9.707744313024069e-05, + "loss": 2.6958, + "step": 477 + }, + { + "epoch": 0.1347332816573885, + "grad_norm": 1864.0079345703125, + "learning_rate": 9.70620721357358e-05, + "loss": 3.8647, + "step": 478 + }, + { + "epoch": 0.1350151504474667, + "grad_norm": 656.0078125, + "learning_rate": 9.704666204920356e-05, + "loss": 3.0099, + "step": 479 + }, + { + "epoch": 0.13529701923754492, + "grad_norm": 968.0064086914062, + "learning_rate": 9.703121288344437e-05, + "loss": 2.9503, + "step": 480 + }, + { + "epoch": 0.13557888802762313, + "grad_norm": 1144.010009765625, + "learning_rate": 9.7015724651291e-05, + "loss": 3.1348, + "step": 481 + }, + { + "epoch": 0.13586075681770135, + "grad_norm": 1288.007568359375, + "learning_rate": 9.700019736560879e-05, + "loss": 3.8959, + "step": 482 + }, + { + "epoch": 0.13614262560777957, + "grad_norm": 1224.0079345703125, + "learning_rate": 9.698463103929542e-05, + "loss": 2.8194, + "step": 483 + }, + { + "epoch": 0.1364244943978578, + "grad_norm": 848.0071411132812, + "learning_rate": 9.696902568528104e-05, + "loss": 2.6363, + "step": 484 + }, + { + "epoch": 0.13670636318793603, + "grad_norm": 1776.0037841796875, + "learning_rate": 9.695338131652818e-05, + "loss": 2.7813, + "step": 485 + }, + { + "epoch": 0.13698823197801424, + "grad_norm": 648.00439453125, + "learning_rate": 9.693769794603184e-05, + "loss": 2.7813, + "step": 486 + }, + { + "epoch": 0.13727010076809246, + "grad_norm": 600.0034790039062, + "learning_rate": 9.69219755868194e-05, + "loss": 2.7813, + "step": 487 + }, + { + "epoch": 0.13755196955817067, + "grad_norm": 1456.010009765625, + "learning_rate": 9.690621425195054e-05, + "loss": 2.8588, + "step": 488 + }, + { + "epoch": 0.1378338383482489, + "grad_norm": 928.0029907226562, + "learning_rate": 9.689041395451745e-05, + "loss": 3.128, + "step": 489 + }, + { + "epoch": 0.1381157071383271, + "grad_norm": 498.0049743652344, + "learning_rate": 9.68745747076446e-05, + "loss": 2.7693, + "step": 490 + }, + { + "epoch": 0.13839757592840532, + "grad_norm": 948.004150390625, + "learning_rate": 9.685869652448884e-05, + "loss": 3.0456, + "step": 491 + }, + { + "epoch": 0.13867944471848354, + "grad_norm": 668.0029296875, + "learning_rate": 9.684277941823933e-05, + "loss": 2.6862, + "step": 492 + }, + { + "epoch": 0.13896131350856175, + "grad_norm": 812.006591796875, + "learning_rate": 9.682682340211762e-05, + "loss": 2.905, + "step": 493 + }, + { + "epoch": 0.13924318229864, + "grad_norm": 944.004638671875, + "learning_rate": 9.681082848937752e-05, + "loss": 3.0782, + "step": 494 + }, + { + "epoch": 0.1395250510887182, + "grad_norm": 828.0071411132812, + "learning_rate": 9.679479469330518e-05, + "loss": 2.6765, + "step": 495 + }, + { + "epoch": 0.13980691987879643, + "grad_norm": 2608.008544921875, + "learning_rate": 9.677872202721905e-05, + "loss": 3.3826, + "step": 496 + }, + { + "epoch": 0.14008878866887464, + "grad_norm": 446.005126953125, + "learning_rate": 9.676261050446986e-05, + "loss": 2.8657, + "step": 497 + }, + { + "epoch": 0.14037065745895286, + "grad_norm": 872.008056640625, + "learning_rate": 9.674646013844064e-05, + "loss": 3.3262, + "step": 498 + }, + { + "epoch": 0.14065252624903107, + "grad_norm": 956.0062866210938, + "learning_rate": 9.673027094254663e-05, + "loss": 3.3236, + "step": 499 + }, + { + "epoch": 0.1409343950391093, + "grad_norm": 1624.0113525390625, + "learning_rate": 9.671404293023536e-05, + "loss": 2.8331, + "step": 500 + }, + { + "epoch": 0.1412162638291875, + "grad_norm": 544.0059204101562, + "learning_rate": 9.669777611498662e-05, + "loss": 2.8546, + "step": 501 + }, + { + "epoch": 0.14149813261926572, + "grad_norm": 932.0029907226562, + "learning_rate": 9.66814705103124e-05, + "loss": 2.9611, + "step": 502 + }, + { + "epoch": 0.14178000140934396, + "grad_norm": 1264.006591796875, + "learning_rate": 9.66651261297569e-05, + "loss": 2.488, + "step": 503 + }, + { + "epoch": 0.14206187019942218, + "grad_norm": 1792.011474609375, + "learning_rate": 9.664874298689662e-05, + "loss": 3.4363, + "step": 504 + }, + { + "epoch": 0.1423437389895004, + "grad_norm": 1504.0106201171875, + "learning_rate": 9.66323210953401e-05, + "loss": 3.128, + "step": 505 + }, + { + "epoch": 0.1426256077795786, + "grad_norm": 1680.0089111328125, + "learning_rate": 9.661586046872822e-05, + "loss": 3.3621, + "step": 506 + }, + { + "epoch": 0.14290747656965683, + "grad_norm": 768.0060424804688, + "learning_rate": 9.659936112073393e-05, + "loss": 3.1517, + "step": 507 + }, + { + "epoch": 0.14318934535973504, + "grad_norm": 552.0060424804688, + "learning_rate": 9.658282306506243e-05, + "loss": 2.8849, + "step": 508 + }, + { + "epoch": 0.14347121414981326, + "grad_norm": 1128.00439453125, + "learning_rate": 9.656624631545098e-05, + "loss": 3.0001, + "step": 509 + }, + { + "epoch": 0.14375308293989147, + "grad_norm": 944.0043334960938, + "learning_rate": 9.654963088566905e-05, + "loss": 3.0565, + "step": 510 + }, + { + "epoch": 0.1440349517299697, + "grad_norm": 1232.004150390625, + "learning_rate": 9.653297678951821e-05, + "loss": 3.2876, + "step": 511 + }, + { + "epoch": 0.1443168205200479, + "grad_norm": 346.00433349609375, + "learning_rate": 9.651628404083218e-05, + "loss": 3.2042, + "step": 512 + }, + { + "epoch": 0.14459868931012615, + "grad_norm": 396.00506591796875, + "learning_rate": 9.649955265347672e-05, + "loss": 2.8058, + "step": 513 + }, + { + "epoch": 0.14488055810020437, + "grad_norm": 1472.005126953125, + "learning_rate": 9.648278264134976e-05, + "loss": 3.588, + "step": 514 + }, + { + "epoch": 0.14516242689028258, + "grad_norm": 1448.005126953125, + "learning_rate": 9.646597401838127e-05, + "loss": 3.7214, + "step": 515 + }, + { + "epoch": 0.1454442956803608, + "grad_norm": 1936.0048828125, + "learning_rate": 9.644912679853331e-05, + "loss": 3.6984, + "step": 516 + }, + { + "epoch": 0.145726164470439, + "grad_norm": 1304.0067138671875, + "learning_rate": 9.643224099579998e-05, + "loss": 3.396, + "step": 517 + }, + { + "epoch": 0.14600803326051723, + "grad_norm": 1440.0059814453125, + "learning_rate": 9.641531662420745e-05, + "loss": 2.5461, + "step": 518 + }, + { + "epoch": 0.14628990205059544, + "grad_norm": 692.0057983398438, + "learning_rate": 9.639835369781391e-05, + "loss": 2.948, + "step": 519 + }, + { + "epoch": 0.14657177084067366, + "grad_norm": 1624.0067138671875, + "learning_rate": 9.63813522307096e-05, + "loss": 3.2364, + "step": 520 + }, + { + "epoch": 0.14685363963075188, + "grad_norm": 1424.0079345703125, + "learning_rate": 9.636431223701676e-05, + "loss": 2.853, + "step": 521 + }, + { + "epoch": 0.1471355084208301, + "grad_norm": 792.00341796875, + "learning_rate": 9.634723373088963e-05, + "loss": 2.8172, + "step": 522 + }, + { + "epoch": 0.14741737721090833, + "grad_norm": 776.0035400390625, + "learning_rate": 9.633011672651443e-05, + "loss": 2.8693, + "step": 523 + }, + { + "epoch": 0.14769924600098655, + "grad_norm": 836.0042724609375, + "learning_rate": 9.631296123810938e-05, + "loss": 2.6071, + "step": 524 + }, + { + "epoch": 0.14798111479106477, + "grad_norm": 402.0040283203125, + "learning_rate": 9.629576727992463e-05, + "loss": 2.6066, + "step": 525 + }, + { + "epoch": 0.14826298358114298, + "grad_norm": 1496.006103515625, + "learning_rate": 9.627853486624234e-05, + "loss": 3.2996, + "step": 526 + }, + { + "epoch": 0.1485448523712212, + "grad_norm": 900.0068969726562, + "learning_rate": 9.626126401137658e-05, + "loss": 2.8692, + "step": 527 + }, + { + "epoch": 0.1488267211612994, + "grad_norm": 1568.00634765625, + "learning_rate": 9.624395472967336e-05, + "loss": 3.0077, + "step": 528 + }, + { + "epoch": 0.14910858995137763, + "grad_norm": 800.0057373046875, + "learning_rate": 9.622660703551059e-05, + "loss": 3.1847, + "step": 529 + }, + { + "epoch": 0.14939045874145584, + "grad_norm": 892.0039672851562, + "learning_rate": 9.62092209432981e-05, + "loss": 2.5552, + "step": 530 + }, + { + "epoch": 0.14967232753153406, + "grad_norm": 1248.0052490234375, + "learning_rate": 9.619179646747762e-05, + "loss": 2.7813, + "step": 531 + }, + { + "epoch": 0.14995419632161228, + "grad_norm": 1232.0045166015625, + "learning_rate": 9.617433362252277e-05, + "loss": 2.9402, + "step": 532 + }, + { + "epoch": 0.15023606511169052, + "grad_norm": 932.0064086914062, + "learning_rate": 9.615683242293903e-05, + "loss": 2.6788, + "step": 533 + }, + { + "epoch": 0.15051793390176874, + "grad_norm": 1088.0037841796875, + "learning_rate": 9.613929288326373e-05, + "loss": 3.3184, + "step": 534 + }, + { + "epoch": 0.15079980269184695, + "grad_norm": 880.0042724609375, + "learning_rate": 9.612171501806606e-05, + "loss": 3.2781, + "step": 535 + }, + { + "epoch": 0.15108167148192517, + "grad_norm": 1048.0089111328125, + "learning_rate": 9.610409884194704e-05, + "loss": 2.7381, + "step": 536 + }, + { + "epoch": 0.15136354027200338, + "grad_norm": 458.0058288574219, + "learning_rate": 9.608644436953955e-05, + "loss": 3.0209, + "step": 537 + }, + { + "epoch": 0.1516454090620816, + "grad_norm": 568.003662109375, + "learning_rate": 9.606875161550818e-05, + "loss": 2.8897, + "step": 538 + }, + { + "epoch": 0.15192727785215981, + "grad_norm": 1160.0028076171875, + "learning_rate": 9.605102059454947e-05, + "loss": 2.961, + "step": 539 + }, + { + "epoch": 0.15220914664223803, + "grad_norm": 516.0048828125, + "learning_rate": 9.603325132139159e-05, + "loss": 2.9004, + "step": 540 + }, + { + "epoch": 0.15249101543231625, + "grad_norm": 1352.00830078125, + "learning_rate": 9.601544381079457e-05, + "loss": 3.3425, + "step": 541 + }, + { + "epoch": 0.15277288422239446, + "grad_norm": 2240.0078125, + "learning_rate": 9.599759807755021e-05, + "loss": 3.6908, + "step": 542 + }, + { + "epoch": 0.1530547530124727, + "grad_norm": 1704.0103759765625, + "learning_rate": 9.597971413648202e-05, + "loss": 3.5593, + "step": 543 + }, + { + "epoch": 0.15333662180255092, + "grad_norm": 1304.002685546875, + "learning_rate": 9.596179200244527e-05, + "loss": 3.0046, + "step": 544 + }, + { + "epoch": 0.15361849059262914, + "grad_norm": 1264.008056640625, + "learning_rate": 9.594383169032695e-05, + "loss": 2.7813, + "step": 545 + }, + { + "epoch": 0.15390035938270735, + "grad_norm": 688.0040283203125, + "learning_rate": 9.592583321504576e-05, + "loss": 2.6377, + "step": 546 + }, + { + "epoch": 0.15418222817278557, + "grad_norm": 668.0037841796875, + "learning_rate": 9.59077965915521e-05, + "loss": 2.8225, + "step": 547 + }, + { + "epoch": 0.15446409696286378, + "grad_norm": 1480.009765625, + "learning_rate": 9.588972183482805e-05, + "loss": 2.7117, + "step": 548 + }, + { + "epoch": 0.154745965752942, + "grad_norm": 2008.0186767578125, + "learning_rate": 9.58716089598874e-05, + "loss": 3.6768, + "step": 549 + }, + { + "epoch": 0.15502783454302022, + "grad_norm": 1408.004638671875, + "learning_rate": 9.585345798177557e-05, + "loss": 3.4414, + "step": 550 + }, + { + "epoch": 0.15530970333309843, + "grad_norm": 660.007568359375, + "learning_rate": 9.58352689155696e-05, + "loss": 3.0002, + "step": 551 + }, + { + "epoch": 0.15559157212317667, + "grad_norm": 1888.0054931640625, + "learning_rate": 9.581704177637826e-05, + "loss": 2.6147, + "step": 552 + }, + { + "epoch": 0.1558734409132549, + "grad_norm": 1088.0050048828125, + "learning_rate": 9.579877657934187e-05, + "loss": 3.0522, + "step": 553 + }, + { + "epoch": 0.1561553097033331, + "grad_norm": 924.0023803710938, + "learning_rate": 9.578047333963238e-05, + "loss": 2.7085, + "step": 554 + }, + { + "epoch": 0.15643717849341132, + "grad_norm": 1128.00439453125, + "learning_rate": 9.576213207245332e-05, + "loss": 2.8647, + "step": 555 + }, + { + "epoch": 0.15671904728348954, + "grad_norm": 708.0021362304688, + "learning_rate": 9.574375279303988e-05, + "loss": 3.0369, + "step": 556 + }, + { + "epoch": 0.15700091607356775, + "grad_norm": 1984.0169677734375, + "learning_rate": 9.572533551665876e-05, + "loss": 3.8934, + "step": 557 + }, + { + "epoch": 0.15728278486364597, + "grad_norm": 888.0040893554688, + "learning_rate": 9.570688025860823e-05, + "loss": 3.4234, + "step": 558 + }, + { + "epoch": 0.15756465365372418, + "grad_norm": 406.0031433105469, + "learning_rate": 9.568838703421809e-05, + "loss": 2.9497, + "step": 559 + }, + { + "epoch": 0.1578465224438024, + "grad_norm": 648.0021362304688, + "learning_rate": 9.566985585884977e-05, + "loss": 2.6743, + "step": 560 + }, + { + "epoch": 0.15812839123388062, + "grad_norm": 716.0048217773438, + "learning_rate": 9.56512867478961e-05, + "loss": 3.0314, + "step": 561 + }, + { + "epoch": 0.15841026002395886, + "grad_norm": 584.0037841796875, + "learning_rate": 9.563267971678151e-05, + "loss": 2.7814, + "step": 562 + }, + { + "epoch": 0.15869212881403708, + "grad_norm": 1288.0108642578125, + "learning_rate": 9.561403478096188e-05, + "loss": 2.9299, + "step": 563 + }, + { + "epoch": 0.1589739976041153, + "grad_norm": 628.0034790039062, + "learning_rate": 9.55953519559246e-05, + "loss": 3.0991, + "step": 564 + }, + { + "epoch": 0.1592558663941935, + "grad_norm": 2112.01025390625, + "learning_rate": 9.557663125718854e-05, + "loss": 3.0991, + "step": 565 + }, + { + "epoch": 0.15953773518427172, + "grad_norm": 780.0059814453125, + "learning_rate": 9.555787270030398e-05, + "loss": 3.0802, + "step": 566 + }, + { + "epoch": 0.15981960397434994, + "grad_norm": 1024.007080078125, + "learning_rate": 9.553907630085273e-05, + "loss": 2.7244, + "step": 567 + }, + { + "epoch": 0.16010147276442815, + "grad_norm": 1728.0137939453125, + "learning_rate": 9.552024207444794e-05, + "loss": 3.3374, + "step": 568 + }, + { + "epoch": 0.16038334155450637, + "grad_norm": 1688.0042724609375, + "learning_rate": 9.550137003673427e-05, + "loss": 3.0672, + "step": 569 + }, + { + "epoch": 0.16066521034458459, + "grad_norm": 1896.007080078125, + "learning_rate": 9.54824602033877e-05, + "loss": 3.4966, + "step": 570 + }, + { + "epoch": 0.1609470791346628, + "grad_norm": 996.0037841796875, + "learning_rate": 9.546351259011568e-05, + "loss": 2.5192, + "step": 571 + }, + { + "epoch": 0.16122894792474105, + "grad_norm": 2144.011962890625, + "learning_rate": 9.544452721265701e-05, + "loss": 3.2446, + "step": 572 + }, + { + "epoch": 0.16151081671481926, + "grad_norm": 1080.0030517578125, + "learning_rate": 9.542550408678185e-05, + "loss": 2.6066, + "step": 573 + }, + { + "epoch": 0.16179268550489748, + "grad_norm": 1496.0057373046875, + "learning_rate": 9.540644322829173e-05, + "loss": 2.7792, + "step": 574 + }, + { + "epoch": 0.1620745542949757, + "grad_norm": 1184.0062255859375, + "learning_rate": 9.538734465301952e-05, + "loss": 2.7804, + "step": 575 + }, + { + "epoch": 0.1623564230850539, + "grad_norm": 616.0038452148438, + "learning_rate": 9.536820837682941e-05, + "loss": 2.9402, + "step": 576 + }, + { + "epoch": 0.16263829187513212, + "grad_norm": 1544.0068359375, + "learning_rate": 9.534903441561693e-05, + "loss": 2.7804, + "step": 577 + }, + { + "epoch": 0.16292016066521034, + "grad_norm": 852.0056762695312, + "learning_rate": 9.532982278530889e-05, + "loss": 2.5182, + "step": 578 + }, + { + "epoch": 0.16320202945528856, + "grad_norm": 2000.0096435546875, + "learning_rate": 9.531057350186337e-05, + "loss": 3.0196, + "step": 579 + }, + { + "epoch": 0.16348389824536677, + "grad_norm": 1072.00244140625, + "learning_rate": 9.52912865812698e-05, + "loss": 2.7694, + "step": 580 + }, + { + "epoch": 0.163765767035445, + "grad_norm": 1992.008056640625, + "learning_rate": 9.52719620395488e-05, + "loss": 2.7502, + "step": 581 + }, + { + "epoch": 0.16404763582552323, + "grad_norm": 980.0035400390625, + "learning_rate": 9.525259989275228e-05, + "loss": 2.8565, + "step": 582 + }, + { + "epoch": 0.16432950461560145, + "grad_norm": 1888.0155029296875, + "learning_rate": 9.523320015696335e-05, + "loss": 3.4048, + "step": 583 + }, + { + "epoch": 0.16461137340567966, + "grad_norm": 916.00439453125, + "learning_rate": 9.52137628482964e-05, + "loss": 3.0295, + "step": 584 + }, + { + "epoch": 0.16489324219575788, + "grad_norm": 556.0028076171875, + "learning_rate": 9.519428798289695e-05, + "loss": 3.0489, + "step": 585 + }, + { + "epoch": 0.1651751109858361, + "grad_norm": 1728.0050048828125, + "learning_rate": 9.517477557694182e-05, + "loss": 2.9271, + "step": 586 + }, + { + "epoch": 0.1654569797759143, + "grad_norm": 1208.0052490234375, + "learning_rate": 9.51552256466389e-05, + "loss": 2.7813, + "step": 587 + }, + { + "epoch": 0.16573884856599252, + "grad_norm": 716.004150390625, + "learning_rate": 9.513563820822734e-05, + "loss": 2.7813, + "step": 588 + }, + { + "epoch": 0.16602071735607074, + "grad_norm": 928.0067749023438, + "learning_rate": 9.51160132779774e-05, + "loss": 2.7813, + "step": 589 + }, + { + "epoch": 0.16630258614614896, + "grad_norm": 1232.0064697265625, + "learning_rate": 9.50963508721905e-05, + "loss": 2.9004, + "step": 590 + }, + { + "epoch": 0.1665844549362272, + "grad_norm": 904.0032958984375, + "learning_rate": 9.507665100719917e-05, + "loss": 2.905, + "step": 591 + }, + { + "epoch": 0.16686632372630542, + "grad_norm": 1864.00537109375, + "learning_rate": 9.50569136993671e-05, + "loss": 3.2696, + "step": 592 + }, + { + "epoch": 0.16714819251638363, + "grad_norm": 732.0064697265625, + "learning_rate": 9.5037138965089e-05, + "loss": 2.7693, + "step": 593 + }, + { + "epoch": 0.16743006130646185, + "grad_norm": 1040.002197265625, + "learning_rate": 9.501732682079074e-05, + "loss": 3.2578, + "step": 594 + }, + { + "epoch": 0.16771193009654006, + "grad_norm": 1496.0048828125, + "learning_rate": 9.499747728292927e-05, + "loss": 3.3662, + "step": 595 + }, + { + "epoch": 0.16799379888661828, + "grad_norm": 1672.01025390625, + "learning_rate": 9.497759036799254e-05, + "loss": 3.1753, + "step": 596 + }, + { + "epoch": 0.1682756676766965, + "grad_norm": 820.005615234375, + "learning_rate": 9.495766609249959e-05, + "loss": 2.893, + "step": 597 + }, + { + "epoch": 0.1685575364667747, + "grad_norm": 1672.0108642578125, + "learning_rate": 9.493770447300049e-05, + "loss": 3.0593, + "step": 598 + }, + { + "epoch": 0.16883940525685293, + "grad_norm": 1864.0079345703125, + "learning_rate": 9.491770552607631e-05, + "loss": 2.9576, + "step": 599 + }, + { + "epoch": 0.16912127404693114, + "grad_norm": 592.0045166015625, + "learning_rate": 9.489766926833916e-05, + "loss": 3.1678, + "step": 600 + }, + { + "epoch": 0.16940314283700939, + "grad_norm": 768.0078735351562, + "learning_rate": 9.487759571643211e-05, + "loss": 2.6772, + "step": 601 + }, + { + "epoch": 0.1696850116270876, + "grad_norm": 1584.0054931640625, + "learning_rate": 9.485748488702924e-05, + "loss": 2.909, + "step": 602 + }, + { + "epoch": 0.16996688041716582, + "grad_norm": 1184.0057373046875, + "learning_rate": 9.483733679683552e-05, + "loss": 2.7398, + "step": 603 + }, + { + "epoch": 0.17024874920724403, + "grad_norm": 1680.005859375, + "learning_rate": 9.481715146258699e-05, + "loss": 2.6086, + "step": 604 + }, + { + "epoch": 0.17053061799732225, + "grad_norm": 692.0042724609375, + "learning_rate": 9.479692890105054e-05, + "loss": 2.7814, + "step": 605 + }, + { + "epoch": 0.17081248678740046, + "grad_norm": 1080.0047607421875, + "learning_rate": 9.477666912902399e-05, + "loss": 2.9005, + "step": 606 + }, + { + "epoch": 0.17109435557747868, + "grad_norm": 1320.0040283203125, + "learning_rate": 9.47563721633361e-05, + "loss": 2.5956, + "step": 607 + }, + { + "epoch": 0.1713762243675569, + "grad_norm": 748.0076904296875, + "learning_rate": 9.473603802084649e-05, + "loss": 2.4481, + "step": 608 + }, + { + "epoch": 0.1716580931576351, + "grad_norm": 2176.00390625, + "learning_rate": 9.471566671844571e-05, + "loss": 3.3458, + "step": 609 + }, + { + "epoch": 0.17193996194771333, + "grad_norm": 1048.00439453125, + "learning_rate": 9.469525827305514e-05, + "loss": 2.796, + "step": 610 + }, + { + "epoch": 0.17222183073779157, + "grad_norm": 1728.0079345703125, + "learning_rate": 9.467481270162699e-05, + "loss": 3.1381, + "step": 611 + }, + { + "epoch": 0.17250369952786979, + "grad_norm": 984.0035400390625, + "learning_rate": 9.465433002114437e-05, + "loss": 2.6062, + "step": 612 + }, + { + "epoch": 0.172785568317948, + "grad_norm": 836.0082397460938, + "learning_rate": 9.463381024862114e-05, + "loss": 2.6029, + "step": 613 + }, + { + "epoch": 0.17306743710802622, + "grad_norm": 988.0074462890625, + "learning_rate": 9.461325340110207e-05, + "loss": 3.0738, + "step": 614 + }, + { + "epoch": 0.17334930589810443, + "grad_norm": 292.0051574707031, + "learning_rate": 9.459265949566263e-05, + "loss": 2.6837, + "step": 615 + }, + { + "epoch": 0.17363117468818265, + "grad_norm": 1808.012939453125, + "learning_rate": 9.457202854940914e-05, + "loss": 3.2664, + "step": 616 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 392.004638671875, + "learning_rate": 9.455136057947863e-05, + "loss": 2.5191, + "step": 617 + }, + { + "epoch": 0.17419491226833908, + "grad_norm": 908.0045166015625, + "learning_rate": 9.453065560303893e-05, + "loss": 2.6388, + "step": 618 + }, + { + "epoch": 0.1744767810584173, + "grad_norm": 804.0067138671875, + "learning_rate": 9.450991363728857e-05, + "loss": 3.1086, + "step": 619 + }, + { + "epoch": 0.1747586498484955, + "grad_norm": 1144.004150390625, + "learning_rate": 9.448913469945687e-05, + "loss": 2.9936, + "step": 620 + }, + { + "epoch": 0.17504051863857376, + "grad_norm": 2032.004638671875, + "learning_rate": 9.446831880680378e-05, + "loss": 3.5187, + "step": 621 + }, + { + "epoch": 0.17532238742865197, + "grad_norm": 310.00592041015625, + "learning_rate": 9.444746597661999e-05, + "loss": 2.6067, + "step": 622 + }, + { + "epoch": 0.1756042562187302, + "grad_norm": 1376.0040283203125, + "learning_rate": 9.442657622622688e-05, + "loss": 3.5261, + "step": 623 + }, + { + "epoch": 0.1758861250088084, + "grad_norm": 1528.012451171875, + "learning_rate": 9.440564957297648e-05, + "loss": 2.9731, + "step": 624 + }, + { + "epoch": 0.17616799379888662, + "grad_norm": 1408.005615234375, + "learning_rate": 9.438468603425147e-05, + "loss": 3.1863, + "step": 625 + }, + { + "epoch": 0.17644986258896483, + "grad_norm": 2016.0137939453125, + "learning_rate": 9.436368562746518e-05, + "loss": 4.1121, + "step": 626 + }, + { + "epoch": 0.17673173137904305, + "grad_norm": 1328.0078125, + "learning_rate": 9.434264837006156e-05, + "loss": 2.6932, + "step": 627 + }, + { + "epoch": 0.17701360016912127, + "grad_norm": 852.0068359375, + "learning_rate": 9.43215742795152e-05, + "loss": 2.5321, + "step": 628 + }, + { + "epoch": 0.17729546895919948, + "grad_norm": 215.00625610351562, + "learning_rate": 9.430046337333122e-05, + "loss": 2.4402, + "step": 629 + }, + { + "epoch": 0.17757733774927772, + "grad_norm": 1880.005859375, + "learning_rate": 9.427931566904536e-05, + "loss": 2.8182, + "step": 630 + }, + { + "epoch": 0.17785920653935594, + "grad_norm": 1336.0078125, + "learning_rate": 9.425813118422393e-05, + "loss": 2.7611, + "step": 631 + }, + { + "epoch": 0.17814107532943416, + "grad_norm": 856.0049438476562, + "learning_rate": 9.423690993646378e-05, + "loss": 2.4172, + "step": 632 + }, + { + "epoch": 0.17842294411951237, + "grad_norm": 1192.0045166015625, + "learning_rate": 9.421565194339233e-05, + "loss": 2.9021, + "step": 633 + }, + { + "epoch": 0.1787048129095906, + "grad_norm": 900.0032348632812, + "learning_rate": 9.419435722266745e-05, + "loss": 2.9344, + "step": 634 + }, + { + "epoch": 0.1789866816996688, + "grad_norm": 1904.0142822265625, + "learning_rate": 9.417302579197757e-05, + "loss": 3.5254, + "step": 635 + }, + { + "epoch": 0.17926855048974702, + "grad_norm": 1576.0101318359375, + "learning_rate": 9.415165766904166e-05, + "loss": 3.1706, + "step": 636 + }, + { + "epoch": 0.17955041927982524, + "grad_norm": 1456.0067138671875, + "learning_rate": 9.413025287160905e-05, + "loss": 3.0, + "step": 637 + }, + { + "epoch": 0.17983228806990345, + "grad_norm": 868.0060424804688, + "learning_rate": 9.410881141745962e-05, + "loss": 2.545, + "step": 638 + }, + { + "epoch": 0.18011415685998167, + "grad_norm": 1496.010986328125, + "learning_rate": 9.408733332440369e-05, + "loss": 2.8738, + "step": 639 + }, + { + "epoch": 0.1803960256500599, + "grad_norm": 1520.00390625, + "learning_rate": 9.406581861028198e-05, + "loss": 2.6863, + "step": 640 + }, + { + "epoch": 0.18067789444013813, + "grad_norm": 1520.00439453125, + "learning_rate": 9.404426729296564e-05, + "loss": 2.6788, + "step": 641 + }, + { + "epoch": 0.18095976323021634, + "grad_norm": 474.0049743652344, + "learning_rate": 9.402267939035627e-05, + "loss": 2.9688, + "step": 642 + }, + { + "epoch": 0.18124163202029456, + "grad_norm": 1512.0072021484375, + "learning_rate": 9.40010549203858e-05, + "loss": 3.5043, + "step": 643 + }, + { + "epoch": 0.18152350081037277, + "grad_norm": 1128.005859375, + "learning_rate": 9.397939390101657e-05, + "loss": 3.1277, + "step": 644 + }, + { + "epoch": 0.181805369600451, + "grad_norm": 1760.0089111328125, + "learning_rate": 9.395769635024126e-05, + "loss": 2.9089, + "step": 645 + }, + { + "epoch": 0.1820872383905292, + "grad_norm": 876.0042114257812, + "learning_rate": 9.393596228608288e-05, + "loss": 2.5753, + "step": 646 + }, + { + "epoch": 0.18236910718060742, + "grad_norm": 976.0053100585938, + "learning_rate": 9.391419172659481e-05, + "loss": 2.8692, + "step": 647 + }, + { + "epoch": 0.18265097597068564, + "grad_norm": 536.0048828125, + "learning_rate": 9.389238468986075e-05, + "loss": 2.9688, + "step": 648 + }, + { + "epoch": 0.18293284476076385, + "grad_norm": 1416.01123046875, + "learning_rate": 9.387054119399466e-05, + "loss": 2.9689, + "step": 649 + }, + { + "epoch": 0.1832147135508421, + "grad_norm": 1616.00341796875, + "learning_rate": 9.384866125714079e-05, + "loss": 2.9402, + "step": 650 + }, + { + "epoch": 0.1834965823409203, + "grad_norm": 748.01123046875, + "learning_rate": 9.382674489747368e-05, + "loss": 3.3957, + "step": 651 + }, + { + "epoch": 0.18377845113099853, + "grad_norm": 1296.0115966796875, + "learning_rate": 9.38047921331981e-05, + "loss": 2.4758, + "step": 652 + }, + { + "epoch": 0.18406031992107674, + "grad_norm": 1272.0045166015625, + "learning_rate": 9.378280298254911e-05, + "loss": 2.767, + "step": 653 + }, + { + "epoch": 0.18434218871115496, + "grad_norm": 840.005126953125, + "learning_rate": 9.376077746379192e-05, + "loss": 3.0969, + "step": 654 + }, + { + "epoch": 0.18462405750123317, + "grad_norm": 940.0057983398438, + "learning_rate": 9.373871559522202e-05, + "loss": 2.6071, + "step": 655 + }, + { + "epoch": 0.1849059262913114, + "grad_norm": 1520.0059814453125, + "learning_rate": 9.371661739516506e-05, + "loss": 3.6071, + "step": 656 + }, + { + "epoch": 0.1851877950813896, + "grad_norm": 1160.004638671875, + "learning_rate": 9.369448288197685e-05, + "loss": 2.8906, + "step": 657 + }, + { + "epoch": 0.18546966387146782, + "grad_norm": 856.0042114257812, + "learning_rate": 9.36723120740434e-05, + "loss": 2.9261, + "step": 658 + }, + { + "epoch": 0.18575153266154604, + "grad_norm": 752.0050048828125, + "learning_rate": 9.365010498978084e-05, + "loss": 3.2581, + "step": 659 + }, + { + "epoch": 0.18603340145162428, + "grad_norm": 1520.0068359375, + "learning_rate": 9.362786164763546e-05, + "loss": 3.1474, + "step": 660 + }, + { + "epoch": 0.1863152702417025, + "grad_norm": 2256.003662109375, + "learning_rate": 9.360558206608363e-05, + "loss": 2.6255, + "step": 661 + }, + { + "epoch": 0.1865971390317807, + "grad_norm": 796.0069580078125, + "learning_rate": 9.358326626363187e-05, + "loss": 3.3571, + "step": 662 + }, + { + "epoch": 0.18687900782185893, + "grad_norm": 848.0032958984375, + "learning_rate": 9.356091425881677e-05, + "loss": 2.719, + "step": 663 + }, + { + "epoch": 0.18716087661193714, + "grad_norm": 1096.0030517578125, + "learning_rate": 9.353852607020495e-05, + "loss": 2.7502, + "step": 664 + }, + { + "epoch": 0.18744274540201536, + "grad_norm": 1504.006591796875, + "learning_rate": 9.351610171639314e-05, + "loss": 3.3233, + "step": 665 + }, + { + "epoch": 0.18772461419209358, + "grad_norm": 1080.0047607421875, + "learning_rate": 9.349364121600807e-05, + "loss": 2.8349, + "step": 666 + }, + { + "epoch": 0.1880064829821718, + "grad_norm": 588.0042114257812, + "learning_rate": 9.347114458770657e-05, + "loss": 3.1967, + "step": 667 + }, + { + "epoch": 0.18828835177225, + "grad_norm": 1416.0081787109375, + "learning_rate": 9.344861185017535e-05, + "loss": 3.2181, + "step": 668 + }, + { + "epoch": 0.18857022056232822, + "grad_norm": 1496.0128173828125, + "learning_rate": 9.342604302213125e-05, + "loss": 3.469, + "step": 669 + }, + { + "epoch": 0.18885208935240647, + "grad_norm": 1360.00927734375, + "learning_rate": 9.340343812232097e-05, + "loss": 2.8481, + "step": 670 + }, + { + "epoch": 0.18913395814248468, + "grad_norm": 560.0032958984375, + "learning_rate": 9.338079716952129e-05, + "loss": 2.8691, + "step": 671 + }, + { + "epoch": 0.1894158269325629, + "grad_norm": 1632.0101318359375, + "learning_rate": 9.335812018253884e-05, + "loss": 3.021, + "step": 672 + }, + { + "epoch": 0.1896976957226411, + "grad_norm": 944.0042724609375, + "learning_rate": 9.333540718021023e-05, + "loss": 2.8257, + "step": 673 + }, + { + "epoch": 0.18997956451271933, + "grad_norm": 1520.0078125, + "learning_rate": 9.331265818140196e-05, + "loss": 2.6864, + "step": 674 + }, + { + "epoch": 0.19026143330279754, + "grad_norm": 1056.0057373046875, + "learning_rate": 9.328987320501048e-05, + "loss": 2.8692, + "step": 675 + }, + { + "epoch": 0.19054330209287576, + "grad_norm": 1048.00830078125, + "learning_rate": 9.326705226996207e-05, + "loss": 3.379, + "step": 676 + }, + { + "epoch": 0.19082517088295398, + "grad_norm": 1216.00830078125, + "learning_rate": 9.32441953952129e-05, + "loss": 3.6251, + "step": 677 + }, + { + "epoch": 0.1911070396730322, + "grad_norm": 548.0093383789062, + "learning_rate": 9.322130259974898e-05, + "loss": 2.795, + "step": 678 + }, + { + "epoch": 0.19138890846311044, + "grad_norm": 1856.0067138671875, + "learning_rate": 9.319837390258619e-05, + "loss": 3.1198, + "step": 679 + }, + { + "epoch": 0.19167077725318865, + "grad_norm": 1512.0115966796875, + "learning_rate": 9.31754093227702e-05, + "loss": 2.7046, + "step": 680 + }, + { + "epoch": 0.19195264604326687, + "grad_norm": 660.0037231445312, + "learning_rate": 9.31524088793765e-05, + "loss": 2.8184, + "step": 681 + }, + { + "epoch": 0.19223451483334508, + "grad_norm": 1680.0108642578125, + "learning_rate": 9.312937259151039e-05, + "loss": 3.6185, + "step": 682 + }, + { + "epoch": 0.1925163836234233, + "grad_norm": 1424.0079345703125, + "learning_rate": 9.310630047830688e-05, + "loss": 3.3548, + "step": 683 + }, + { + "epoch": 0.19279825241350151, + "grad_norm": 632.0027465820312, + "learning_rate": 9.308319255893083e-05, + "loss": 2.7813, + "step": 684 + }, + { + "epoch": 0.19308012120357973, + "grad_norm": 676.0064086914062, + "learning_rate": 9.306004885257675e-05, + "loss": 2.3928, + "step": 685 + }, + { + "epoch": 0.19336198999365795, + "grad_norm": 1880.0101318359375, + "learning_rate": 9.303686937846892e-05, + "loss": 3.2227, + "step": 686 + }, + { + "epoch": 0.19364385878373616, + "grad_norm": 1600.006103515625, + "learning_rate": 9.301365415586137e-05, + "loss": 3.6924, + "step": 687 + }, + { + "epoch": 0.19392572757381438, + "grad_norm": 960.008056640625, + "learning_rate": 9.299040320403773e-05, + "loss": 3.6146, + "step": 688 + }, + { + "epoch": 0.19420759636389262, + "grad_norm": 219.00584411621094, + "learning_rate": 9.296711654231141e-05, + "loss": 3.1758, + "step": 689 + }, + { + "epoch": 0.19448946515397084, + "grad_norm": 1760.007568359375, + "learning_rate": 9.29437941900254e-05, + "loss": 2.9896, + "step": 690 + }, + { + "epoch": 0.19477133394404905, + "grad_norm": 346.0037841796875, + "learning_rate": 9.292043616655239e-05, + "loss": 2.8585, + "step": 691 + }, + { + "epoch": 0.19505320273412727, + "grad_norm": 832.0064697265625, + "learning_rate": 9.289704249129468e-05, + "loss": 2.6713, + "step": 692 + }, + { + "epoch": 0.19533507152420548, + "grad_norm": 1808.00439453125, + "learning_rate": 9.287361318368417e-05, + "loss": 3.3008, + "step": 693 + }, + { + "epoch": 0.1956169403142837, + "grad_norm": 1968.005859375, + "learning_rate": 9.28501482631824e-05, + "loss": 3.7115, + "step": 694 + }, + { + "epoch": 0.19589880910436192, + "grad_norm": 1952.008544921875, + "learning_rate": 9.282664774928045e-05, + "loss": 2.9188, + "step": 695 + }, + { + "epoch": 0.19618067789444013, + "grad_norm": 2272.010009765625, + "learning_rate": 9.280311166149896e-05, + "loss": 3.7787, + "step": 696 + }, + { + "epoch": 0.19646254668451835, + "grad_norm": 1616.0120849609375, + "learning_rate": 9.277954001938818e-05, + "loss": 3.7468, + "step": 697 + }, + { + "epoch": 0.19674441547459656, + "grad_norm": 1104.003173828125, + "learning_rate": 9.275593284252782e-05, + "loss": 2.7143, + "step": 698 + }, + { + "epoch": 0.1970262842646748, + "grad_norm": 608.0025024414062, + "learning_rate": 9.273229015052716e-05, + "loss": 2.9239, + "step": 699 + }, + { + "epoch": 0.19730815305475302, + "grad_norm": 2240.024169921875, + "learning_rate": 9.270861196302494e-05, + "loss": 3.9011, + "step": 700 + }, + { + "epoch": 0.19759002184483124, + "grad_norm": 1288.0078125, + "learning_rate": 9.268489829968943e-05, + "loss": 2.8961, + "step": 701 + }, + { + "epoch": 0.19787189063490945, + "grad_norm": 896.0060424804688, + "learning_rate": 9.266114918021832e-05, + "loss": 3.042, + "step": 702 + }, + { + "epoch": 0.19815375942498767, + "grad_norm": 1632.016357421875, + "learning_rate": 9.263736462433878e-05, + "loss": 3.0269, + "step": 703 + }, + { + "epoch": 0.19843562821506588, + "grad_norm": 1544.0137939453125, + "learning_rate": 9.261354465180742e-05, + "loss": 3.4325, + "step": 704 + }, + { + "epoch": 0.1987174970051441, + "grad_norm": 808.0053100585938, + "learning_rate": 9.258968928241027e-05, + "loss": 3.2963, + "step": 705 + }, + { + "epoch": 0.19899936579522232, + "grad_norm": 880.0064697265625, + "learning_rate": 9.256579853596273e-05, + "loss": 3.4064, + "step": 706 + }, + { + "epoch": 0.19928123458530053, + "grad_norm": 2576.012451171875, + "learning_rate": 9.254187243230959e-05, + "loss": 3.0042, + "step": 707 + }, + { + "epoch": 0.19956310337537875, + "grad_norm": 1360.0084228515625, + "learning_rate": 9.251791099132506e-05, + "loss": 3.3237, + "step": 708 + }, + { + "epoch": 0.199844972165457, + "grad_norm": 628.0073852539062, + "learning_rate": 9.249391423291262e-05, + "loss": 3.2183, + "step": 709 + }, + { + "epoch": 0.2001268409555352, + "grad_norm": 1128.0069580078125, + "learning_rate": 9.24698821770052e-05, + "loss": 2.7791, + "step": 710 + }, + { + "epoch": 0.20040870974561342, + "grad_norm": 1160.0057373046875, + "learning_rate": 9.244581484356495e-05, + "loss": 2.5384, + "step": 711 + }, + { + "epoch": 0.20069057853569164, + "grad_norm": 2400.017333984375, + "learning_rate": 9.242171225258336e-05, + "loss": 3.5364, + "step": 712 + }, + { + "epoch": 0.20097244732576985, + "grad_norm": 980.0098876953125, + "learning_rate": 9.239757442408121e-05, + "loss": 3.5929, + "step": 713 + }, + { + "epoch": 0.20125431611584807, + "grad_norm": 1104.0028076171875, + "learning_rate": 9.237340137810854e-05, + "loss": 2.9662, + "step": 714 + }, + { + "epoch": 0.20153618490592629, + "grad_norm": 536.0048217773438, + "learning_rate": 9.234919313474464e-05, + "loss": 2.9655, + "step": 715 + }, + { + "epoch": 0.2018180536960045, + "grad_norm": 1880.0078125, + "learning_rate": 9.232494971409806e-05, + "loss": 3.3609, + "step": 716 + }, + { + "epoch": 0.20209992248608272, + "grad_norm": 1480.006103515625, + "learning_rate": 9.230067113630656e-05, + "loss": 2.7915, + "step": 717 + }, + { + "epoch": 0.20238179127616096, + "grad_norm": 1360.009521484375, + "learning_rate": 9.227635742153706e-05, + "loss": 3.1923, + "step": 718 + }, + { + "epoch": 0.20266366006623918, + "grad_norm": 1272.0030517578125, + "learning_rate": 9.225200858998576e-05, + "loss": 2.7092, + "step": 719 + }, + { + "epoch": 0.2029455288563174, + "grad_norm": 1056.0057373046875, + "learning_rate": 9.222762466187792e-05, + "loss": 2.7384, + "step": 720 + }, + { + "epoch": 0.2032273976463956, + "grad_norm": 1552.0107421875, + "learning_rate": 9.220320565746805e-05, + "loss": 2.8552, + "step": 721 + }, + { + "epoch": 0.20350926643647382, + "grad_norm": 668.0068969726562, + "learning_rate": 9.217875159703973e-05, + "loss": 2.6145, + "step": 722 + }, + { + "epoch": 0.20379113522655204, + "grad_norm": 996.005126953125, + "learning_rate": 9.215426250090568e-05, + "loss": 2.7635, + "step": 723 + }, + { + "epoch": 0.20407300401663026, + "grad_norm": 1688.0098876953125, + "learning_rate": 9.212973838940775e-05, + "loss": 2.9223, + "step": 724 + }, + { + "epoch": 0.20435487280670847, + "grad_norm": 450.0055236816406, + "learning_rate": 9.210517928291681e-05, + "loss": 3.1407, + "step": 725 + }, + { + "epoch": 0.2046367415967867, + "grad_norm": 1320.0064697265625, + "learning_rate": 9.208058520183289e-05, + "loss": 3.2374, + "step": 726 + }, + { + "epoch": 0.2049186103868649, + "grad_norm": 1024.00830078125, + "learning_rate": 9.205595616658495e-05, + "loss": 2.8536, + "step": 727 + }, + { + "epoch": 0.20520047917694315, + "grad_norm": 556.0033569335938, + "learning_rate": 9.203129219763114e-05, + "loss": 3.0991, + "step": 728 + }, + { + "epoch": 0.20548234796702136, + "grad_norm": 648.0046997070312, + "learning_rate": 9.200659331545846e-05, + "loss": 2.7489, + "step": 729 + }, + { + "epoch": 0.20576421675709958, + "grad_norm": 1224.0040283203125, + "learning_rate": 9.198185954058305e-05, + "loss": 3.1485, + "step": 730 + }, + { + "epoch": 0.2060460855471778, + "grad_norm": 1208.0076904296875, + "learning_rate": 9.195709089354994e-05, + "loss": 3.1441, + "step": 731 + }, + { + "epoch": 0.206327954337256, + "grad_norm": 1488.006591796875, + "learning_rate": 9.193228739493321e-05, + "loss": 3.3467, + "step": 732 + }, + { + "epoch": 0.20660982312733422, + "grad_norm": 2040.0076904296875, + "learning_rate": 9.190744906533578e-05, + "loss": 4.4298, + "step": 733 + }, + { + "epoch": 0.20689169191741244, + "grad_norm": 660.00634765625, + "learning_rate": 9.188257592538961e-05, + "loss": 3.0762, + "step": 734 + }, + { + "epoch": 0.20717356070749066, + "grad_norm": 924.0027465820312, + "learning_rate": 9.185766799575552e-05, + "loss": 2.7578, + "step": 735 + }, + { + "epoch": 0.20745542949756887, + "grad_norm": 1728.0101318359375, + "learning_rate": 9.183272529712323e-05, + "loss": 2.8272, + "step": 736 + }, + { + "epoch": 0.2077372982876471, + "grad_norm": 1320.0081787109375, + "learning_rate": 9.180774785021136e-05, + "loss": 3.8249, + "step": 737 + }, + { + "epoch": 0.20801916707772533, + "grad_norm": 644.004150390625, + "learning_rate": 9.178273567576739e-05, + "loss": 3.3379, + "step": 738 + }, + { + "epoch": 0.20830103586780355, + "grad_norm": 1976.0140380859375, + "learning_rate": 9.175768879456759e-05, + "loss": 2.3982, + "step": 739 + }, + { + "epoch": 0.20858290465788176, + "grad_norm": 1176.0042724609375, + "learning_rate": 9.173260722741716e-05, + "loss": 3.3454, + "step": 740 + }, + { + "epoch": 0.20886477344795998, + "grad_norm": 1440.010498046875, + "learning_rate": 9.170749099515005e-05, + "loss": 2.9434, + "step": 741 + }, + { + "epoch": 0.2091466422380382, + "grad_norm": 1020.0086669921875, + "learning_rate": 9.1682340118629e-05, + "loss": 2.99, + "step": 742 + }, + { + "epoch": 0.2094285110281164, + "grad_norm": 1984.013916015625, + "learning_rate": 9.165715461874553e-05, + "loss": 3.3711, + "step": 743 + }, + { + "epoch": 0.20971037981819463, + "grad_norm": 1232.002197265625, + "learning_rate": 9.163193451641997e-05, + "loss": 3.0951, + "step": 744 + }, + { + "epoch": 0.20999224860827284, + "grad_norm": 1040.0084228515625, + "learning_rate": 9.160667983260132e-05, + "loss": 3.3396, + "step": 745 + }, + { + "epoch": 0.21027411739835106, + "grad_norm": 1128.0032958984375, + "learning_rate": 9.158139058826737e-05, + "loss": 3.1974, + "step": 746 + }, + { + "epoch": 0.21055598618842927, + "grad_norm": 1112.006591796875, + "learning_rate": 9.155606680442454e-05, + "loss": 3.7439, + "step": 747 + }, + { + "epoch": 0.21083785497850752, + "grad_norm": 1344.004150390625, + "learning_rate": 9.153070850210803e-05, + "loss": 3.42, + "step": 748 + }, + { + "epoch": 0.21111972376858573, + "grad_norm": 816.0069580078125, + "learning_rate": 9.150531570238166e-05, + "loss": 3.0447, + "step": 749 + }, + { + "epoch": 0.21140159255866395, + "grad_norm": 1120.0052490234375, + "learning_rate": 9.14798884263379e-05, + "loss": 3.433, + "step": 750 + }, + { + "epoch": 0.21168346134874216, + "grad_norm": 800.0077514648438, + "learning_rate": 9.145442669509787e-05, + "loss": 3.021, + "step": 751 + }, + { + "epoch": 0.21196533013882038, + "grad_norm": 194.0108642578125, + "learning_rate": 9.142893052981132e-05, + "loss": 2.6789, + "step": 752 + }, + { + "epoch": 0.2122471989288986, + "grad_norm": 1208.0078125, + "learning_rate": 9.140339995165661e-05, + "loss": 2.7801, + "step": 753 + }, + { + "epoch": 0.2125290677189768, + "grad_norm": 492.0043029785156, + "learning_rate": 9.137783498184064e-05, + "loss": 3.0983, + "step": 754 + }, + { + "epoch": 0.21281093650905503, + "grad_norm": 1336.006591796875, + "learning_rate": 9.135223564159894e-05, + "loss": 3.1174, + "step": 755 + }, + { + "epoch": 0.21309280529913324, + "grad_norm": 450.0135803222656, + "learning_rate": 9.132660195219554e-05, + "loss": 2.664, + "step": 756 + }, + { + "epoch": 0.21337467408921149, + "grad_norm": 1944.010498046875, + "learning_rate": 9.1300933934923e-05, + "loss": 3.0657, + "step": 757 + }, + { + "epoch": 0.2136565428792897, + "grad_norm": 788.007568359375, + "learning_rate": 9.127523161110244e-05, + "loss": 3.0607, + "step": 758 + }, + { + "epoch": 0.21393841166936792, + "grad_norm": 1624.0069580078125, + "learning_rate": 9.124949500208344e-05, + "loss": 3.0239, + "step": 759 + }, + { + "epoch": 0.21422028045944613, + "grad_norm": 1176.0074462890625, + "learning_rate": 9.122372412924408e-05, + "loss": 2.8334, + "step": 760 + }, + { + "epoch": 0.21450214924952435, + "grad_norm": 1736.0118408203125, + "learning_rate": 9.119791901399089e-05, + "loss": 2.9799, + "step": 761 + }, + { + "epoch": 0.21478401803960256, + "grad_norm": 1864.010498046875, + "learning_rate": 9.11720796777588e-05, + "loss": 3.3208, + "step": 762 + }, + { + "epoch": 0.21506588682968078, + "grad_norm": 1480.0078125, + "learning_rate": 9.114620614201128e-05, + "loss": 2.965, + "step": 763 + }, + { + "epoch": 0.215347755619759, + "grad_norm": 656.0064086914062, + "learning_rate": 9.112029842824008e-05, + "loss": 2.7009, + "step": 764 + }, + { + "epoch": 0.2156296244098372, + "grad_norm": 1240.0052490234375, + "learning_rate": 9.109435655796542e-05, + "loss": 2.4737, + "step": 765 + }, + { + "epoch": 0.21591149319991543, + "grad_norm": 540.0079345703125, + "learning_rate": 9.106838055273588e-05, + "loss": 3.4155, + "step": 766 + }, + { + "epoch": 0.21619336198999367, + "grad_norm": 1032.0057373046875, + "learning_rate": 9.104237043412836e-05, + "loss": 3.1485, + "step": 767 + }, + { + "epoch": 0.2164752307800719, + "grad_norm": 548.0059814453125, + "learning_rate": 9.101632622374811e-05, + "loss": 3.1273, + "step": 768 + }, + { + "epoch": 0.2167570995701501, + "grad_norm": 1144.0035400390625, + "learning_rate": 9.099024794322874e-05, + "loss": 3.5519, + "step": 769 + }, + { + "epoch": 0.21703896836022832, + "grad_norm": 1440.00732421875, + "learning_rate": 9.096413561423213e-05, + "loss": 2.9985, + "step": 770 + }, + { + "epoch": 0.21732083715030653, + "grad_norm": 1080.00341796875, + "learning_rate": 9.09379892584484e-05, + "loss": 3.948, + "step": 771 + }, + { + "epoch": 0.21760270594038475, + "grad_norm": 2576.01953125, + "learning_rate": 9.091180889759602e-05, + "loss": 3.3458, + "step": 772 + }, + { + "epoch": 0.21788457473046297, + "grad_norm": 2336.010009765625, + "learning_rate": 9.088559455342163e-05, + "loss": 3.4104, + "step": 773 + }, + { + "epoch": 0.21816644352054118, + "grad_norm": 1984.0062255859375, + "learning_rate": 9.085934624770014e-05, + "loss": 3.0951, + "step": 774 + }, + { + "epoch": 0.2184483123106194, + "grad_norm": 980.0105590820312, + "learning_rate": 9.083306400223465e-05, + "loss": 3.2266, + "step": 775 + }, + { + "epoch": 0.2187301811006976, + "grad_norm": 398.0069885253906, + "learning_rate": 9.080674783885647e-05, + "loss": 2.9005, + "step": 776 + }, + { + "epoch": 0.21901204989077586, + "grad_norm": 760.005615234375, + "learning_rate": 9.078039777942506e-05, + "loss": 3.1832, + "step": 777 + }, + { + "epoch": 0.21929391868085407, + "grad_norm": 840.0048217773438, + "learning_rate": 9.07540138458281e-05, + "loss": 2.7814, + "step": 778 + }, + { + "epoch": 0.2195757874709323, + "grad_norm": 1136.008544921875, + "learning_rate": 9.072759605998128e-05, + "loss": 3.1374, + "step": 779 + }, + { + "epoch": 0.2198576562610105, + "grad_norm": 780.0025024414062, + "learning_rate": 9.070114444382855e-05, + "loss": 2.5102, + "step": 780 + }, + { + "epoch": 0.22013952505108872, + "grad_norm": 532.0042724609375, + "learning_rate": 9.067465901934187e-05, + "loss": 3.0687, + "step": 781 + }, + { + "epoch": 0.22042139384116693, + "grad_norm": 1176.0052490234375, + "learning_rate": 9.064813980852132e-05, + "loss": 3.432, + "step": 782 + }, + { + "epoch": 0.22070326263124515, + "grad_norm": 1248.0064697265625, + "learning_rate": 9.062158683339503e-05, + "loss": 3.0209, + "step": 783 + }, + { + "epoch": 0.22098513142132337, + "grad_norm": 1424.0052490234375, + "learning_rate": 9.059500011601918e-05, + "loss": 2.7986, + "step": 784 + }, + { + "epoch": 0.22126700021140158, + "grad_norm": 506.0037536621094, + "learning_rate": 9.056837967847799e-05, + "loss": 3.0261, + "step": 785 + }, + { + "epoch": 0.2215488690014798, + "grad_norm": 1632.0103759765625, + "learning_rate": 9.054172554288368e-05, + "loss": 2.8776, + "step": 786 + }, + { + "epoch": 0.22183073779155804, + "grad_norm": 916.0033569335938, + "learning_rate": 9.051503773137646e-05, + "loss": 3.0251, + "step": 787 + }, + { + "epoch": 0.22211260658163626, + "grad_norm": 1544.0111083984375, + "learning_rate": 9.04883162661245e-05, + "loss": 3.639, + "step": 788 + }, + { + "epoch": 0.22239447537171447, + "grad_norm": 1224.0062255859375, + "learning_rate": 9.046156116932395e-05, + "loss": 3.0443, + "step": 789 + }, + { + "epoch": 0.2226763441617927, + "grad_norm": 1576.009033203125, + "learning_rate": 9.043477246319888e-05, + "loss": 3.4715, + "step": 790 + }, + { + "epoch": 0.2229582129518709, + "grad_norm": 352.0040283203125, + "learning_rate": 9.040795017000128e-05, + "loss": 2.975, + "step": 791 + }, + { + "epoch": 0.22324008174194912, + "grad_norm": 1448.0052490234375, + "learning_rate": 9.038109431201106e-05, + "loss": 3.0453, + "step": 792 + }, + { + "epoch": 0.22352195053202734, + "grad_norm": 1800.0079345703125, + "learning_rate": 9.035420491153595e-05, + "loss": 3.2231, + "step": 793 + }, + { + "epoch": 0.22380381932210555, + "grad_norm": 1760.0140380859375, + "learning_rate": 9.032728199091162e-05, + "loss": 2.8618, + "step": 794 + }, + { + "epoch": 0.22408568811218377, + "grad_norm": 2112.012939453125, + "learning_rate": 9.030032557250155e-05, + "loss": 4.1192, + "step": 795 + }, + { + "epoch": 0.224367556902262, + "grad_norm": 1200.0057373046875, + "learning_rate": 9.0273335678697e-05, + "loss": 3.2663, + "step": 796 + }, + { + "epoch": 0.22464942569234023, + "grad_norm": 680.006103515625, + "learning_rate": 9.024631233191711e-05, + "loss": 2.9193, + "step": 797 + }, + { + "epoch": 0.22493129448241844, + "grad_norm": 1568.0059814453125, + "learning_rate": 9.021925555460878e-05, + "loss": 3.0274, + "step": 798 + }, + { + "epoch": 0.22521316327249666, + "grad_norm": 1560.0115966796875, + "learning_rate": 9.019216536924666e-05, + "loss": 2.6729, + "step": 799 + }, + { + "epoch": 0.22549503206257487, + "grad_norm": 2272.0068359375, + "learning_rate": 9.016504179833316e-05, + "loss": 3.3881, + "step": 800 + }, + { + "epoch": 0.2257769008526531, + "grad_norm": 836.00732421875, + "learning_rate": 9.013788486439844e-05, + "loss": 2.8717, + "step": 801 + }, + { + "epoch": 0.2260587696427313, + "grad_norm": 708.007080078125, + "learning_rate": 9.011069459000035e-05, + "loss": 2.6966, + "step": 802 + }, + { + "epoch": 0.22634063843280952, + "grad_norm": 952.0055541992188, + "learning_rate": 9.008347099772445e-05, + "loss": 3.2982, + "step": 803 + }, + { + "epoch": 0.22662250722288774, + "grad_norm": 482.0067138671875, + "learning_rate": 9.005621411018396e-05, + "loss": 2.9066, + "step": 804 + }, + { + "epoch": 0.22690437601296595, + "grad_norm": 1048.004638671875, + "learning_rate": 9.002892395001978e-05, + "loss": 3.3988, + "step": 805 + }, + { + "epoch": 0.2271862448030442, + "grad_norm": 1408.00634765625, + "learning_rate": 9.000160053990044e-05, + "loss": 2.7744, + "step": 806 + }, + { + "epoch": 0.2274681135931224, + "grad_norm": 532.0096435546875, + "learning_rate": 8.997424390252204e-05, + "loss": 3.0556, + "step": 807 + }, + { + "epoch": 0.22774998238320063, + "grad_norm": 984.0089111328125, + "learning_rate": 8.994685406060836e-05, + "loss": 3.2606, + "step": 808 + }, + { + "epoch": 0.22803185117327884, + "grad_norm": 1184.008544921875, + "learning_rate": 8.991943103691073e-05, + "loss": 3.1981, + "step": 809 + }, + { + "epoch": 0.22831371996335706, + "grad_norm": 1376.0084228515625, + "learning_rate": 8.989197485420802e-05, + "loss": 2.8588, + "step": 810 + }, + { + "epoch": 0.22859558875343527, + "grad_norm": 872.0040283203125, + "learning_rate": 8.986448553530664e-05, + "loss": 2.7978, + "step": 811 + }, + { + "epoch": 0.2288774575435135, + "grad_norm": 924.0028686523438, + "learning_rate": 8.983696310304057e-05, + "loss": 2.6075, + "step": 812 + }, + { + "epoch": 0.2291593263335917, + "grad_norm": 924.006103515625, + "learning_rate": 8.980940758027127e-05, + "loss": 2.8425, + "step": 813 + }, + { + "epoch": 0.22944119512366992, + "grad_norm": 1344.005126953125, + "learning_rate": 8.97818189898877e-05, + "loss": 2.4354, + "step": 814 + }, + { + "epoch": 0.22972306391374814, + "grad_norm": 860.0071411132812, + "learning_rate": 8.975419735480623e-05, + "loss": 3.211, + "step": 815 + }, + { + "epoch": 0.23000493270382638, + "grad_norm": 2064.006591796875, + "learning_rate": 8.972654269797075e-05, + "loss": 3.3057, + "step": 816 + }, + { + "epoch": 0.2302868014939046, + "grad_norm": 800.007080078125, + "learning_rate": 8.969885504235256e-05, + "loss": 3.4499, + "step": 817 + }, + { + "epoch": 0.2305686702839828, + "grad_norm": 684.0055541992188, + "learning_rate": 8.967113441095034e-05, + "loss": 2.8643, + "step": 818 + }, + { + "epoch": 0.23085053907406103, + "grad_norm": 1392.00439453125, + "learning_rate": 8.96433808267902e-05, + "loss": 2.628, + "step": 819 + }, + { + "epoch": 0.23113240786413924, + "grad_norm": 1184.0045166015625, + "learning_rate": 8.961559431292562e-05, + "loss": 3.4017, + "step": 820 + }, + { + "epoch": 0.23141427665421746, + "grad_norm": 288.00701904296875, + "learning_rate": 8.958777489243739e-05, + "loss": 2.8262, + "step": 821 + }, + { + "epoch": 0.23169614544429568, + "grad_norm": 876.00341796875, + "learning_rate": 8.955992258843367e-05, + "loss": 2.8715, + "step": 822 + }, + { + "epoch": 0.2319780142343739, + "grad_norm": 664.0036010742188, + "learning_rate": 8.953203742404993e-05, + "loss": 2.8715, + "step": 823 + }, + { + "epoch": 0.2322598830244521, + "grad_norm": 836.0067749023438, + "learning_rate": 8.950411942244892e-05, + "loss": 3.1498, + "step": 824 + }, + { + "epoch": 0.23254175181453032, + "grad_norm": 516.0051879882812, + "learning_rate": 8.94761686068207e-05, + "loss": 2.5437, + "step": 825 + }, + { + "epoch": 0.23282362060460857, + "grad_norm": 1384.0032958984375, + "learning_rate": 8.944818500038257e-05, + "loss": 2.9525, + "step": 826 + }, + { + "epoch": 0.23310548939468678, + "grad_norm": 1096.0045166015625, + "learning_rate": 8.942016862637905e-05, + "loss": 3.3441, + "step": 827 + }, + { + "epoch": 0.233387358184765, + "grad_norm": 1304.0107421875, + "learning_rate": 8.939211950808188e-05, + "loss": 2.6846, + "step": 828 + }, + { + "epoch": 0.2336692269748432, + "grad_norm": 940.0071411132812, + "learning_rate": 8.936403766879004e-05, + "loss": 2.751, + "step": 829 + }, + { + "epoch": 0.23395109576492143, + "grad_norm": 1168.0040283203125, + "learning_rate": 8.933592313182963e-05, + "loss": 2.7139, + "step": 830 + }, + { + "epoch": 0.23423296455499965, + "grad_norm": 784.0033569335938, + "learning_rate": 8.930777592055395e-05, + "loss": 2.6862, + "step": 831 + }, + { + "epoch": 0.23451483334507786, + "grad_norm": 1248.0048828125, + "learning_rate": 8.927959605834347e-05, + "loss": 2.8783, + "step": 832 + }, + { + "epoch": 0.23479670213515608, + "grad_norm": 600.0045776367188, + "learning_rate": 8.925138356860567e-05, + "loss": 2.5674, + "step": 833 + }, + { + "epoch": 0.2350785709252343, + "grad_norm": 796.0030517578125, + "learning_rate": 8.922313847477526e-05, + "loss": 2.7803, + "step": 834 + }, + { + "epoch": 0.2353604397153125, + "grad_norm": 1976.0089111328125, + "learning_rate": 8.919486080031396e-05, + "loss": 2.986, + "step": 835 + }, + { + "epoch": 0.23564230850539075, + "grad_norm": 1456.007080078125, + "learning_rate": 8.916655056871057e-05, + "loss": 3.2702, + "step": 836 + }, + { + "epoch": 0.23592417729546897, + "grad_norm": 1760.00732421875, + "learning_rate": 8.913820780348094e-05, + "loss": 3.1712, + "step": 837 + }, + { + "epoch": 0.23620604608554718, + "grad_norm": 504.0041809082031, + "learning_rate": 8.910983252816794e-05, + "loss": 2.7933, + "step": 838 + }, + { + "epoch": 0.2364879148756254, + "grad_norm": 1536.005615234375, + "learning_rate": 8.908142476634141e-05, + "loss": 3.0026, + "step": 839 + }, + { + "epoch": 0.23676978366570361, + "grad_norm": 1960.014404296875, + "learning_rate": 8.905298454159825e-05, + "loss": 2.8451, + "step": 840 + }, + { + "epoch": 0.23705165245578183, + "grad_norm": 1400.0059814453125, + "learning_rate": 8.902451187756226e-05, + "loss": 2.6875, + "step": 841 + }, + { + "epoch": 0.23733352124586005, + "grad_norm": 1240.0093994140625, + "learning_rate": 8.899600679788424e-05, + "loss": 2.7256, + "step": 842 + }, + { + "epoch": 0.23761539003593826, + "grad_norm": 1776.0098876953125, + "learning_rate": 8.896746932624184e-05, + "loss": 3.1349, + "step": 843 + }, + { + "epoch": 0.23789725882601648, + "grad_norm": 1128.0052490234375, + "learning_rate": 8.893889948633968e-05, + "loss": 2.8487, + "step": 844 + }, + { + "epoch": 0.23817912761609472, + "grad_norm": 1184.0062255859375, + "learning_rate": 8.891029730190925e-05, + "loss": 3.1315, + "step": 845 + }, + { + "epoch": 0.23846099640617294, + "grad_norm": 872.004638671875, + "learning_rate": 8.888166279670889e-05, + "loss": 3.2813, + "step": 846 + }, + { + "epoch": 0.23874286519625115, + "grad_norm": 744.0040893554688, + "learning_rate": 8.885299599452382e-05, + "loss": 2.3784, + "step": 847 + }, + { + "epoch": 0.23902473398632937, + "grad_norm": 1240.006103515625, + "learning_rate": 8.882429691916605e-05, + "loss": 3.405, + "step": 848 + }, + { + "epoch": 0.23930660277640758, + "grad_norm": 1256.0042724609375, + "learning_rate": 8.879556559447443e-05, + "loss": 2.9731, + "step": 849 + }, + { + "epoch": 0.2395884715664858, + "grad_norm": 1536.007568359375, + "learning_rate": 8.87668020443146e-05, + "loss": 2.9089, + "step": 850 + }, + { + "epoch": 0.23987034035656402, + "grad_norm": 1264.006103515625, + "learning_rate": 8.873800629257893e-05, + "loss": 3.1617, + "step": 851 + }, + { + "epoch": 0.24015220914664223, + "grad_norm": 720.0042114257812, + "learning_rate": 8.870917836318655e-05, + "loss": 2.9652, + "step": 852 + }, + { + "epoch": 0.24043407793672045, + "grad_norm": 1232.003662109375, + "learning_rate": 8.868031828008334e-05, + "loss": 3.1416, + "step": 853 + }, + { + "epoch": 0.24071594672679866, + "grad_norm": 672.003662109375, + "learning_rate": 8.865142606724189e-05, + "loss": 2.8021, + "step": 854 + }, + { + "epoch": 0.2409978155168769, + "grad_norm": 1928.0068359375, + "learning_rate": 8.862250174866146e-05, + "loss": 3.6511, + "step": 855 + }, + { + "epoch": 0.24127968430695512, + "grad_norm": 1376.00439453125, + "learning_rate": 8.859354534836797e-05, + "loss": 3.153, + "step": 856 + }, + { + "epoch": 0.24156155309703334, + "grad_norm": 1952.0067138671875, + "learning_rate": 8.856455689041402e-05, + "loss": 2.9297, + "step": 857 + }, + { + "epoch": 0.24184342188711155, + "grad_norm": 354.00396728515625, + "learning_rate": 8.853553639887881e-05, + "loss": 2.9034, + "step": 858 + }, + { + "epoch": 0.24212529067718977, + "grad_norm": 2096.0126953125, + "learning_rate": 8.850648389786817e-05, + "loss": 3.6973, + "step": 859 + }, + { + "epoch": 0.24240715946726799, + "grad_norm": 1680.0078125, + "learning_rate": 8.84773994115145e-05, + "loss": 4.0951, + "step": 860 + }, + { + "epoch": 0.2426890282573462, + "grad_norm": 1168.00390625, + "learning_rate": 8.844828296397677e-05, + "loss": 3.0349, + "step": 861 + }, + { + "epoch": 0.24297089704742442, + "grad_norm": 1384.0045166015625, + "learning_rate": 8.841913457944054e-05, + "loss": 3.126, + "step": 862 + }, + { + "epoch": 0.24325276583750263, + "grad_norm": 1688.0030517578125, + "learning_rate": 8.838995428211781e-05, + "loss": 3.7057, + "step": 863 + }, + { + "epoch": 0.24353463462758085, + "grad_norm": 1048.0032958984375, + "learning_rate": 8.836074209624719e-05, + "loss": 3.5163, + "step": 864 + }, + { + "epoch": 0.2438165034176591, + "grad_norm": 1552.0064697265625, + "learning_rate": 8.833149804609372e-05, + "loss": 2.961, + "step": 865 + }, + { + "epoch": 0.2440983722077373, + "grad_norm": 1344.002685546875, + "learning_rate": 8.83022221559489e-05, + "loss": 3.4994, + "step": 866 + }, + { + "epoch": 0.24438024099781552, + "grad_norm": 720.0031127929688, + "learning_rate": 8.827291445013073e-05, + "loss": 2.7813, + "step": 867 + }, + { + "epoch": 0.24466210978789374, + "grad_norm": 1408.004150390625, + "learning_rate": 8.824357495298357e-05, + "loss": 2.9987, + "step": 868 + }, + { + "epoch": 0.24494397857797195, + "grad_norm": 1048.003173828125, + "learning_rate": 8.821420368887823e-05, + "loss": 3.0196, + "step": 869 + }, + { + "epoch": 0.24522584736805017, + "grad_norm": 1560.0064697265625, + "learning_rate": 8.81848006822119e-05, + "loss": 2.779, + "step": 870 + }, + { + "epoch": 0.2455077161581284, + "grad_norm": 620.0042114257812, + "learning_rate": 8.815536595740816e-05, + "loss": 2.7793, + "step": 871 + }, + { + "epoch": 0.2457895849482066, + "grad_norm": 1368.003662109375, + "learning_rate": 8.812589953891688e-05, + "loss": 2.7396, + "step": 872 + }, + { + "epoch": 0.24607145373828482, + "grad_norm": 720.0037231445312, + "learning_rate": 8.80964014512143e-05, + "loss": 2.7793, + "step": 873 + }, + { + "epoch": 0.24635332252836303, + "grad_norm": 900.0035400390625, + "learning_rate": 8.806687171880297e-05, + "loss": 2.8855, + "step": 874 + }, + { + "epoch": 0.24663519131844128, + "grad_norm": 720.004150390625, + "learning_rate": 8.803731036621168e-05, + "loss": 3.2988, + "step": 875 + }, + { + "epoch": 0.2469170601085195, + "grad_norm": 1440.0079345703125, + "learning_rate": 8.800771741799553e-05, + "loss": 2.8898, + "step": 876 + }, + { + "epoch": 0.2471989288985977, + "grad_norm": 1032.00439453125, + "learning_rate": 8.797809289873587e-05, + "loss": 2.9688, + "step": 877 + }, + { + "epoch": 0.24748079768867592, + "grad_norm": 712.0034790039062, + "learning_rate": 8.794843683304022e-05, + "loss": 2.4164, + "step": 878 + }, + { + "epoch": 0.24776266647875414, + "grad_norm": 1328.003662109375, + "learning_rate": 8.791874924554235e-05, + "loss": 2.9528, + "step": 879 + }, + { + "epoch": 0.24804453526883236, + "grad_norm": 1208.0030517578125, + "learning_rate": 8.788903016090222e-05, + "loss": 3.349, + "step": 880 + }, + { + "epoch": 0.24832640405891057, + "grad_norm": 478.0052795410156, + "learning_rate": 8.785927960380592e-05, + "loss": 2.9092, + "step": 881 + }, + { + "epoch": 0.2486082728489888, + "grad_norm": 1072.0047607421875, + "learning_rate": 8.782949759896568e-05, + "loss": 2.78, + "step": 882 + }, + { + "epoch": 0.248890141639067, + "grad_norm": 1304.0067138671875, + "learning_rate": 8.779968417111991e-05, + "loss": 2.7396, + "step": 883 + }, + { + "epoch": 0.24917201042914525, + "grad_norm": 1752.0078125, + "learning_rate": 8.776983934503307e-05, + "loss": 2.9149, + "step": 884 + }, + { + "epoch": 0.24945387921922346, + "grad_norm": 1936.005859375, + "learning_rate": 8.773996314549569e-05, + "loss": 3.5527, + "step": 885 + }, + { + "epoch": 0.24973574800930168, + "grad_norm": 1272.0047607421875, + "learning_rate": 8.771005559732439e-05, + "loss": 3.3023, + "step": 886 + }, + { + "epoch": 0.2500176167993799, + "grad_norm": 1568.0050048828125, + "learning_rate": 8.768011672536184e-05, + "loss": 3.6152, + "step": 887 + }, + { + "epoch": 0.2502994855894581, + "grad_norm": 1272.00732421875, + "learning_rate": 8.76501465544767e-05, + "loss": 3.0775, + "step": 888 + }, + { + "epoch": 0.2505813543795363, + "grad_norm": 1712.0113525390625, + "learning_rate": 8.762014510956364e-05, + "loss": 2.8885, + "step": 889 + }, + { + "epoch": 0.25086322316961457, + "grad_norm": 588.0027465820312, + "learning_rate": 8.759011241554328e-05, + "loss": 2.9004, + "step": 890 + }, + { + "epoch": 0.25114509195969276, + "grad_norm": 732.0057373046875, + "learning_rate": 8.756004849736229e-05, + "loss": 3.1267, + "step": 891 + }, + { + "epoch": 0.251426960749771, + "grad_norm": 1296.00390625, + "learning_rate": 8.752995337999315e-05, + "loss": 2.9391, + "step": 892 + }, + { + "epoch": 0.2517088295398492, + "grad_norm": 1152.004150390625, + "learning_rate": 8.749982708843435e-05, + "loss": 2.9652, + "step": 893 + }, + { + "epoch": 0.25199069832992743, + "grad_norm": 860.0057373046875, + "learning_rate": 8.746966964771022e-05, + "loss": 3.0027, + "step": 894 + }, + { + "epoch": 0.2522725671200056, + "grad_norm": 852.0065307617188, + "learning_rate": 8.7439481082871e-05, + "loss": 3.0576, + "step": 895 + }, + { + "epoch": 0.25255443591008386, + "grad_norm": 1200.004638671875, + "learning_rate": 8.740926141899277e-05, + "loss": 3.0523, + "step": 896 + }, + { + "epoch": 0.25283630470016205, + "grad_norm": 2040.012939453125, + "learning_rate": 8.737901068117741e-05, + "loss": 3.2179, + "step": 897 + }, + { + "epoch": 0.2531181734902403, + "grad_norm": 1184.0062255859375, + "learning_rate": 8.734872889455268e-05, + "loss": 2.9946, + "step": 898 + }, + { + "epoch": 0.25340004228031854, + "grad_norm": 780.0064697265625, + "learning_rate": 8.731841608427206e-05, + "loss": 3.4779, + "step": 899 + }, + { + "epoch": 0.2536819110703967, + "grad_norm": 1712.0113525390625, + "learning_rate": 8.728807227551487e-05, + "loss": 3.3447, + "step": 900 + }, + { + "epoch": 0.25396377986047497, + "grad_norm": 728.0054321289062, + "learning_rate": 8.725769749348613e-05, + "loss": 2.7574, + "step": 901 + }, + { + "epoch": 0.25424564865055316, + "grad_norm": 972.0068969726562, + "learning_rate": 8.722729176341658e-05, + "loss": 3.376, + "step": 902 + }, + { + "epoch": 0.2545275174406314, + "grad_norm": 1032.0064697265625, + "learning_rate": 8.719685511056268e-05, + "loss": 3.0674, + "step": 903 + }, + { + "epoch": 0.2548093862307096, + "grad_norm": 1024.0064697265625, + "learning_rate": 8.716638756020661e-05, + "loss": 2.7813, + "step": 904 + }, + { + "epoch": 0.25509125502078783, + "grad_norm": 1288.00244140625, + "learning_rate": 8.713588913765617e-05, + "loss": 2.71, + "step": 905 + }, + { + "epoch": 0.255373123810866, + "grad_norm": 816.0039672851562, + "learning_rate": 8.710535986824485e-05, + "loss": 2.7813, + "step": 906 + }, + { + "epoch": 0.25565499260094426, + "grad_norm": 1272.006591796875, + "learning_rate": 8.70747997773317e-05, + "loss": 2.9004, + "step": 907 + }, + { + "epoch": 0.25593686139102245, + "grad_norm": 1112.006103515625, + "learning_rate": 8.704420889030141e-05, + "loss": 2.9571, + "step": 908 + }, + { + "epoch": 0.2562187301811007, + "grad_norm": 1216.0054931640625, + "learning_rate": 8.701358723256425e-05, + "loss": 2.5529, + "step": 909 + }, + { + "epoch": 0.25650059897117894, + "grad_norm": 684.0061645507812, + "learning_rate": 8.698293482955605e-05, + "loss": 2.5837, + "step": 910 + }, + { + "epoch": 0.2567824677612571, + "grad_norm": 1256.0047607421875, + "learning_rate": 8.695225170673819e-05, + "loss": 3.1836, + "step": 911 + }, + { + "epoch": 0.25706433655133537, + "grad_norm": 1816.0052490234375, + "learning_rate": 8.692153788959752e-05, + "loss": 3.5417, + "step": 912 + }, + { + "epoch": 0.25734620534141356, + "grad_norm": 1808.0068359375, + "learning_rate": 8.689079340364643e-05, + "loss": 3.1726, + "step": 913 + }, + { + "epoch": 0.2576280741314918, + "grad_norm": 1488.008544921875, + "learning_rate": 8.686001827442278e-05, + "loss": 3.2911, + "step": 914 + }, + { + "epoch": 0.25790994292157, + "grad_norm": 1736.0072021484375, + "learning_rate": 8.682921252748987e-05, + "loss": 3.1797, + "step": 915 + }, + { + "epoch": 0.25819181171164823, + "grad_norm": 450.0032958984375, + "learning_rate": 8.679837618843646e-05, + "loss": 2.8363, + "step": 916 + }, + { + "epoch": 0.2584736805017264, + "grad_norm": 2544.0078125, + "learning_rate": 8.676750928287667e-05, + "loss": 3.2663, + "step": 917 + }, + { + "epoch": 0.25875554929180467, + "grad_norm": 1536.0093994140625, + "learning_rate": 8.673661183645005e-05, + "loss": 3.0852, + "step": 918 + }, + { + "epoch": 0.2590374180818829, + "grad_norm": 2032.00634765625, + "learning_rate": 8.670568387482154e-05, + "loss": 3.1387, + "step": 919 + }, + { + "epoch": 0.2593192868719611, + "grad_norm": 824.0078735351562, + "learning_rate": 8.667472542368133e-05, + "loss": 2.6514, + "step": 920 + }, + { + "epoch": 0.25960115566203934, + "grad_norm": 312.0046081542969, + "learning_rate": 8.664373650874508e-05, + "loss": 2.6502, + "step": 921 + }, + { + "epoch": 0.25988302445211753, + "grad_norm": 1728.0032958984375, + "learning_rate": 8.661271715575364e-05, + "loss": 2.6787, + "step": 922 + }, + { + "epoch": 0.26016489324219577, + "grad_norm": 664.0023803710938, + "learning_rate": 8.658166739047316e-05, + "loss": 2.7525, + "step": 923 + }, + { + "epoch": 0.26044676203227396, + "grad_norm": 1352.0045166015625, + "learning_rate": 8.655058723869509e-05, + "loss": 3.0381, + "step": 924 + }, + { + "epoch": 0.2607286308223522, + "grad_norm": 1624.0042724609375, + "learning_rate": 8.651947672623613e-05, + "loss": 2.9175, + "step": 925 + }, + { + "epoch": 0.2610104996124304, + "grad_norm": 1112.0029296875, + "learning_rate": 8.648833587893814e-05, + "loss": 2.988, + "step": 926 + }, + { + "epoch": 0.26129236840250863, + "grad_norm": 880.0068969726562, + "learning_rate": 8.645716472266822e-05, + "loss": 2.8722, + "step": 927 + }, + { + "epoch": 0.2615742371925869, + "grad_norm": 1376.0098876953125, + "learning_rate": 8.642596328331864e-05, + "loss": 3.0889, + "step": 928 + }, + { + "epoch": 0.26185610598266507, + "grad_norm": 1344.005615234375, + "learning_rate": 8.639473158680683e-05, + "loss": 2.547, + "step": 929 + }, + { + "epoch": 0.2621379747727433, + "grad_norm": 1552.0107421875, + "learning_rate": 8.636346965907532e-05, + "loss": 3.4709, + "step": 930 + }, + { + "epoch": 0.2624198435628215, + "grad_norm": 1248.0037841796875, + "learning_rate": 8.633217752609177e-05, + "loss": 3.2842, + "step": 931 + }, + { + "epoch": 0.26270171235289974, + "grad_norm": 648.0032348632812, + "learning_rate": 8.630085521384898e-05, + "loss": 2.6, + "step": 932 + }, + { + "epoch": 0.26298358114297793, + "grad_norm": 1568.0032958984375, + "learning_rate": 8.626950274836472e-05, + "loss": 2.8255, + "step": 933 + }, + { + "epoch": 0.2632654499330562, + "grad_norm": 608.001953125, + "learning_rate": 8.62381201556819e-05, + "loss": 3.3067, + "step": 934 + }, + { + "epoch": 0.26354731872313436, + "grad_norm": 1992.0029296875, + "learning_rate": 8.620670746186839e-05, + "loss": 2.4363, + "step": 935 + }, + { + "epoch": 0.2638291875132126, + "grad_norm": 1352.0089111328125, + "learning_rate": 8.61752646930171e-05, + "loss": 2.8148, + "step": 936 + }, + { + "epoch": 0.2641110563032908, + "grad_norm": 1008.0037841796875, + "learning_rate": 8.614379187524592e-05, + "loss": 3.1457, + "step": 937 + }, + { + "epoch": 0.26439292509336904, + "grad_norm": 1984.0074462890625, + "learning_rate": 8.611228903469768e-05, + "loss": 3.4678, + "step": 938 + }, + { + "epoch": 0.2646747938834473, + "grad_norm": 1648.0064697265625, + "learning_rate": 8.608075619754016e-05, + "loss": 3.8496, + "step": 939 + }, + { + "epoch": 0.26495666267352547, + "grad_norm": 996.0039672851562, + "learning_rate": 8.604919338996604e-05, + "loss": 3.679, + "step": 940 + }, + { + "epoch": 0.2652385314636037, + "grad_norm": 1768.0047607421875, + "learning_rate": 8.601760063819294e-05, + "loss": 2.9955, + "step": 941 + }, + { + "epoch": 0.2655204002536819, + "grad_norm": 1608.0054931640625, + "learning_rate": 8.59859779684633e-05, + "loss": 3.2943, + "step": 942 + }, + { + "epoch": 0.26580226904376014, + "grad_norm": 1896.005126953125, + "learning_rate": 8.595432540704446e-05, + "loss": 3.3333, + "step": 943 + }, + { + "epoch": 0.26608413783383833, + "grad_norm": 1464.005126953125, + "learning_rate": 8.592264298022854e-05, + "loss": 3.2031, + "step": 944 + }, + { + "epoch": 0.2663660066239166, + "grad_norm": 524.0040283203125, + "learning_rate": 8.58909307143325e-05, + "loss": 3.367, + "step": 945 + }, + { + "epoch": 0.26664787541399476, + "grad_norm": 928.00537109375, + "learning_rate": 8.585918863569806e-05, + "loss": 2.4956, + "step": 946 + }, + { + "epoch": 0.266929744204073, + "grad_norm": 1992.007568359375, + "learning_rate": 8.582741677069177e-05, + "loss": 3.5124, + "step": 947 + }, + { + "epoch": 0.26721161299415125, + "grad_norm": 1152.006591796875, + "learning_rate": 8.579561514570482e-05, + "loss": 3.3242, + "step": 948 + }, + { + "epoch": 0.26749348178422944, + "grad_norm": 1456.004638671875, + "learning_rate": 8.576378378715322e-05, + "loss": 2.9158, + "step": 949 + }, + { + "epoch": 0.2677753505743077, + "grad_norm": 596.0050659179688, + "learning_rate": 8.57319227214776e-05, + "loss": 2.3425, + "step": 950 + }, + { + "epoch": 0.26805721936438587, + "grad_norm": 1832.0068359375, + "learning_rate": 8.570003197514329e-05, + "loss": 2.7159, + "step": 951 + }, + { + "epoch": 0.2683390881544641, + "grad_norm": 1144.0076904296875, + "learning_rate": 8.566811157464032e-05, + "loss": 2.6792, + "step": 952 + }, + { + "epoch": 0.2686209569445423, + "grad_norm": 1352.0076904296875, + "learning_rate": 8.563616154648328e-05, + "loss": 3.0115, + "step": 953 + }, + { + "epoch": 0.26890282573462054, + "grad_norm": 1512.007080078125, + "learning_rate": 8.560418191721144e-05, + "loss": 3.8939, + "step": 954 + }, + { + "epoch": 0.26918469452469873, + "grad_norm": 1328.0089111328125, + "learning_rate": 8.55721727133886e-05, + "loss": 2.9965, + "step": 955 + }, + { + "epoch": 0.269466563314777, + "grad_norm": 1008.003662109375, + "learning_rate": 8.554013396160315e-05, + "loss": 3.2829, + "step": 956 + }, + { + "epoch": 0.26974843210485516, + "grad_norm": 740.0056762695312, + "learning_rate": 8.550806568846799e-05, + "loss": 3.2725, + "step": 957 + }, + { + "epoch": 0.2700303008949334, + "grad_norm": 864.0023803710938, + "learning_rate": 8.547596792062064e-05, + "loss": 2.615, + "step": 958 + }, + { + "epoch": 0.27031216968501165, + "grad_norm": 1280.0032958984375, + "learning_rate": 8.544384068472301e-05, + "loss": 2.7501, + "step": 959 + }, + { + "epoch": 0.27059403847508984, + "grad_norm": 1896.012939453125, + "learning_rate": 8.541168400746155e-05, + "loss": 3.1172, + "step": 960 + }, + { + "epoch": 0.2708759072651681, + "grad_norm": 724.0042114257812, + "learning_rate": 8.537949791554714e-05, + "loss": 2.9499, + "step": 961 + }, + { + "epoch": 0.27115777605524627, + "grad_norm": 1280.0067138671875, + "learning_rate": 8.534728243571511e-05, + "loss": 2.7501, + "step": 962 + }, + { + "epoch": 0.2714396448453245, + "grad_norm": 680.0038452148438, + "learning_rate": 8.531503759472516e-05, + "loss": 2.907, + "step": 963 + }, + { + "epoch": 0.2717215136354027, + "grad_norm": 1352.0052490234375, + "learning_rate": 8.528276341936146e-05, + "loss": 2.5733, + "step": 964 + }, + { + "epoch": 0.27200338242548094, + "grad_norm": 844.0048828125, + "learning_rate": 8.525045993643244e-05, + "loss": 2.7816, + "step": 965 + }, + { + "epoch": 0.27228525121555913, + "grad_norm": 2336.01220703125, + "learning_rate": 8.5218127172771e-05, + "loss": 3.8559, + "step": 966 + }, + { + "epoch": 0.2725671200056374, + "grad_norm": 1568.0040283203125, + "learning_rate": 8.518576515523424e-05, + "loss": 2.8407, + "step": 967 + }, + { + "epoch": 0.2728489887957156, + "grad_norm": 382.0064392089844, + "learning_rate": 8.515337391070362e-05, + "loss": 2.7422, + "step": 968 + }, + { + "epoch": 0.2731308575857938, + "grad_norm": 976.0037841796875, + "learning_rate": 8.512095346608488e-05, + "loss": 3.2677, + "step": 969 + }, + { + "epoch": 0.27341272637587205, + "grad_norm": 1360.0068359375, + "learning_rate": 8.5088503848308e-05, + "loss": 2.4232, + "step": 970 + }, + { + "epoch": 0.27369459516595024, + "grad_norm": 1592.0128173828125, + "learning_rate": 8.50560250843272e-05, + "loss": 3.46, + "step": 971 + }, + { + "epoch": 0.2739764639560285, + "grad_norm": 1720.0059814453125, + "learning_rate": 8.502351720112092e-05, + "loss": 3.4812, + "step": 972 + }, + { + "epoch": 0.27425833274610667, + "grad_norm": 1464.0067138671875, + "learning_rate": 8.499098022569176e-05, + "loss": 3.1189, + "step": 973 + }, + { + "epoch": 0.2745402015361849, + "grad_norm": 1160.00439453125, + "learning_rate": 8.495841418506652e-05, + "loss": 2.5967, + "step": 974 + }, + { + "epoch": 0.2748220703262631, + "grad_norm": 1184.007080078125, + "learning_rate": 8.492581910629609e-05, + "loss": 2.8256, + "step": 975 + }, + { + "epoch": 0.27510393911634135, + "grad_norm": 240.0046844482422, + "learning_rate": 8.489319501645554e-05, + "loss": 2.4737, + "step": 976 + }, + { + "epoch": 0.2753858079064196, + "grad_norm": 1944.0062255859375, + "learning_rate": 8.486054194264401e-05, + "loss": 2.9621, + "step": 977 + }, + { + "epoch": 0.2756676766964978, + "grad_norm": 792.0030517578125, + "learning_rate": 8.482785991198474e-05, + "loss": 2.7891, + "step": 978 + }, + { + "epoch": 0.275949545486576, + "grad_norm": 1072.0035400390625, + "learning_rate": 8.479514895162495e-05, + "loss": 2.9646, + "step": 979 + }, + { + "epoch": 0.2762314142766542, + "grad_norm": 1640.0047607421875, + "learning_rate": 8.476240908873598e-05, + "loss": 2.5789, + "step": 980 + }, + { + "epoch": 0.27651328306673245, + "grad_norm": 804.00390625, + "learning_rate": 8.472964035051312e-05, + "loss": 2.9402, + "step": 981 + }, + { + "epoch": 0.27679515185681064, + "grad_norm": 1440.00634765625, + "learning_rate": 8.469684276417568e-05, + "loss": 2.9004, + "step": 982 + }, + { + "epoch": 0.2770770206468889, + "grad_norm": 1576.005615234375, + "learning_rate": 8.466401635696692e-05, + "loss": 3.4375, + "step": 983 + }, + { + "epoch": 0.27735888943696707, + "grad_norm": 664.0051879882812, + "learning_rate": 8.463116115615401e-05, + "loss": 2.75, + "step": 984 + }, + { + "epoch": 0.2776407582270453, + "grad_norm": 720.0046997070312, + "learning_rate": 8.459827718902808e-05, + "loss": 2.7188, + "step": 985 + }, + { + "epoch": 0.2779226270171235, + "grad_norm": 964.0037841796875, + "learning_rate": 8.456536448290417e-05, + "loss": 2.7396, + "step": 986 + }, + { + "epoch": 0.27820449580720175, + "grad_norm": 1040.0035400390625, + "learning_rate": 8.453242306512113e-05, + "loss": 2.8975, + "step": 987 + }, + { + "epoch": 0.27848636459728, + "grad_norm": 1320.004150390625, + "learning_rate": 8.449945296304167e-05, + "loss": 2.75, + "step": 988 + }, + { + "epoch": 0.2787682333873582, + "grad_norm": 788.0028686523438, + "learning_rate": 8.44664542040524e-05, + "loss": 2.8356, + "step": 989 + }, + { + "epoch": 0.2790501021774364, + "grad_norm": 720.0034790039062, + "learning_rate": 8.443342681556361e-05, + "loss": 2.7813, + "step": 990 + }, + { + "epoch": 0.2793319709675146, + "grad_norm": 1336.0052490234375, + "learning_rate": 8.440037082500953e-05, + "loss": 2.9004, + "step": 991 + }, + { + "epoch": 0.27961383975759285, + "grad_norm": 924.0036010742188, + "learning_rate": 8.436728625984799e-05, + "loss": 2.8298, + "step": 992 + }, + { + "epoch": 0.27989570854767104, + "grad_norm": 1632.009521484375, + "learning_rate": 8.433417314756067e-05, + "loss": 3.2181, + "step": 993 + }, + { + "epoch": 0.2801775773377493, + "grad_norm": 1664.0089111328125, + "learning_rate": 8.430103151565287e-05, + "loss": 2.961, + "step": 994 + }, + { + "epoch": 0.28045944612782747, + "grad_norm": 1376.00341796875, + "learning_rate": 8.426786139165368e-05, + "loss": 2.7396, + "step": 995 + }, + { + "epoch": 0.2807413149179057, + "grad_norm": 1288.0050048828125, + "learning_rate": 8.423466280311578e-05, + "loss": 3.6146, + "step": 996 + }, + { + "epoch": 0.28102318370798396, + "grad_norm": 1376.0047607421875, + "learning_rate": 8.42014357776155e-05, + "loss": 2.7813, + "step": 997 + }, + { + "epoch": 0.28130505249806215, + "grad_norm": 1432.006591796875, + "learning_rate": 8.416818034275287e-05, + "loss": 3.0208, + "step": 998 + }, + { + "epoch": 0.2815869212881404, + "grad_norm": 1312.0059814453125, + "learning_rate": 8.41348965261514e-05, + "loss": 3.0694, + "step": 999 + }, + { + "epoch": 0.2818687900782186, + "grad_norm": 636.0032958984375, + "learning_rate": 8.410158435545825e-05, + "loss": 2.9089, + "step": 1000 + }, + { + "epoch": 0.2821506588682968, + "grad_norm": 568.0073852539062, + "learning_rate": 8.406824385834412e-05, + "loss": 2.796, + "step": 1001 + }, + { + "epoch": 0.282432527658375, + "grad_norm": 1672.0135498046875, + "learning_rate": 8.403487506250325e-05, + "loss": 3.8741, + "step": 1002 + }, + { + "epoch": 0.28271439644845325, + "grad_norm": 820.00341796875, + "learning_rate": 8.400147799565334e-05, + "loss": 2.7127, + "step": 1003 + }, + { + "epoch": 0.28299626523853144, + "grad_norm": 928.0052490234375, + "learning_rate": 8.396805268553563e-05, + "loss": 3.6472, + "step": 1004 + }, + { + "epoch": 0.2832781340286097, + "grad_norm": 1744.004150390625, + "learning_rate": 8.393459915991478e-05, + "loss": 3.0066, + "step": 1005 + }, + { + "epoch": 0.28356000281868793, + "grad_norm": 1760.005615234375, + "learning_rate": 8.390111744657892e-05, + "loss": 2.8819, + "step": 1006 + }, + { + "epoch": 0.2838418716087661, + "grad_norm": 1952.0074462890625, + "learning_rate": 8.386760757333954e-05, + "loss": 3.9916, + "step": 1007 + }, + { + "epoch": 0.28412374039884436, + "grad_norm": 2272.011962890625, + "learning_rate": 8.38340695680316e-05, + "loss": 4.1847, + "step": 1008 + }, + { + "epoch": 0.28440560918892255, + "grad_norm": 872.00439453125, + "learning_rate": 8.380050345851337e-05, + "loss": 2.451, + "step": 1009 + }, + { + "epoch": 0.2846874779790008, + "grad_norm": 1960.0054931640625, + "learning_rate": 8.376690927266646e-05, + "loss": 3.1036, + "step": 1010 + }, + { + "epoch": 0.284969346769079, + "grad_norm": 1224.0047607421875, + "learning_rate": 8.373328703839585e-05, + "loss": 2.8684, + "step": 1011 + }, + { + "epoch": 0.2852512155591572, + "grad_norm": 1096.0054931640625, + "learning_rate": 8.369963678362977e-05, + "loss": 3.2648, + "step": 1012 + }, + { + "epoch": 0.2855330843492354, + "grad_norm": 284.0071716308594, + "learning_rate": 8.366595853631977e-05, + "loss": 3.2448, + "step": 1013 + }, + { + "epoch": 0.28581495313931365, + "grad_norm": 1200.00634765625, + "learning_rate": 8.36322523244406e-05, + "loss": 3.0131, + "step": 1014 + }, + { + "epoch": 0.28609682192939184, + "grad_norm": 972.0029296875, + "learning_rate": 8.359851817599027e-05, + "loss": 3.1797, + "step": 1015 + }, + { + "epoch": 0.2863786907194701, + "grad_norm": 1592.0028076171875, + "learning_rate": 8.356475611899e-05, + "loss": 3.1407, + "step": 1016 + }, + { + "epoch": 0.28666055950954833, + "grad_norm": 1464.0079345703125, + "learning_rate": 8.353096618148417e-05, + "loss": 2.7813, + "step": 1017 + }, + { + "epoch": 0.2869424282996265, + "grad_norm": 648.0040893554688, + "learning_rate": 8.349714839154035e-05, + "loss": 2.6074, + "step": 1018 + }, + { + "epoch": 0.28722429708970476, + "grad_norm": 624.0028686523438, + "learning_rate": 8.34633027772492e-05, + "loss": 2.8275, + "step": 1019 + }, + { + "epoch": 0.28750616587978295, + "grad_norm": 924.0065307617188, + "learning_rate": 8.342942936672458e-05, + "loss": 2.974, + "step": 1020 + }, + { + "epoch": 0.2877880346698612, + "grad_norm": 988.0037231445312, + "learning_rate": 8.33955281881033e-05, + "loss": 2.71, + "step": 1021 + }, + { + "epoch": 0.2880699034599394, + "grad_norm": 1424.009765625, + "learning_rate": 8.336159926954537e-05, + "loss": 3.5892, + "step": 1022 + }, + { + "epoch": 0.2883517722500176, + "grad_norm": 988.0037841796875, + "learning_rate": 8.332764263923376e-05, + "loss": 2.695, + "step": 1023 + }, + { + "epoch": 0.2886336410400958, + "grad_norm": 1528.005859375, + "learning_rate": 8.329365832537448e-05, + "loss": 2.7162, + "step": 1024 + }, + { + "epoch": 0.28891550983017406, + "grad_norm": 1280.0086669921875, + "learning_rate": 8.325964635619658e-05, + "loss": 3.0801, + "step": 1025 + }, + { + "epoch": 0.2891973786202523, + "grad_norm": 1056.0064697265625, + "learning_rate": 8.322560675995199e-05, + "loss": 3.2598, + "step": 1026 + }, + { + "epoch": 0.2894792474103305, + "grad_norm": 796.0036010742188, + "learning_rate": 8.319153956491568e-05, + "loss": 3.0209, + "step": 1027 + }, + { + "epoch": 0.28976111620040873, + "grad_norm": 636.004150390625, + "learning_rate": 8.315744479938549e-05, + "loss": 3.1784, + "step": 1028 + }, + { + "epoch": 0.2900429849904869, + "grad_norm": 1968.0052490234375, + "learning_rate": 8.312332249168219e-05, + "loss": 3.355, + "step": 1029 + }, + { + "epoch": 0.29032485378056516, + "grad_norm": 1496.0048828125, + "learning_rate": 8.30891726701494e-05, + "loss": 3.336, + "step": 1030 + }, + { + "epoch": 0.29060672257064335, + "grad_norm": 956.0034790039062, + "learning_rate": 8.30549953631536e-05, + "loss": 3.4613, + "step": 1031 + }, + { + "epoch": 0.2908885913607216, + "grad_norm": 824.0057373046875, + "learning_rate": 8.302079059908413e-05, + "loss": 2.7396, + "step": 1032 + }, + { + "epoch": 0.2911704601507998, + "grad_norm": 704.0037231445312, + "learning_rate": 8.298655840635311e-05, + "loss": 2.7188, + "step": 1033 + }, + { + "epoch": 0.291452328940878, + "grad_norm": 628.0046997070312, + "learning_rate": 8.295229881339546e-05, + "loss": 2.7083, + "step": 1034 + }, + { + "epoch": 0.2917341977309562, + "grad_norm": 1184.0074462890625, + "learning_rate": 8.291801184866884e-05, + "loss": 3.6712, + "step": 1035 + }, + { + "epoch": 0.29201606652103446, + "grad_norm": 1160.00537109375, + "learning_rate": 8.288369754065363e-05, + "loss": 2.7752, + "step": 1036 + }, + { + "epoch": 0.2922979353111127, + "grad_norm": 572.0031127929688, + "learning_rate": 8.284935591785299e-05, + "loss": 2.9424, + "step": 1037 + }, + { + "epoch": 0.2925798041011909, + "grad_norm": 1560.006103515625, + "learning_rate": 8.281498700879273e-05, + "loss": 2.8064, + "step": 1038 + }, + { + "epoch": 0.29286167289126913, + "grad_norm": 764.002685546875, + "learning_rate": 8.27805908420213e-05, + "loss": 3.0358, + "step": 1039 + }, + { + "epoch": 0.2931435416813473, + "grad_norm": 1304.0042724609375, + "learning_rate": 8.274616744610983e-05, + "loss": 2.9463, + "step": 1040 + }, + { + "epoch": 0.29342541047142556, + "grad_norm": 1400.0106201171875, + "learning_rate": 8.271171684965204e-05, + "loss": 3.6921, + "step": 1041 + }, + { + "epoch": 0.29370727926150375, + "grad_norm": 1208.0064697265625, + "learning_rate": 8.267723908126428e-05, + "loss": 3.25, + "step": 1042 + }, + { + "epoch": 0.293989148051582, + "grad_norm": 1176.0029296875, + "learning_rate": 8.264273416958541e-05, + "loss": 3.1042, + "step": 1043 + }, + { + "epoch": 0.2942710168416602, + "grad_norm": 1240.003173828125, + "learning_rate": 8.260820214327691e-05, + "loss": 3.1966, + "step": 1044 + }, + { + "epoch": 0.2945528856317384, + "grad_norm": 1360.0079345703125, + "learning_rate": 8.257364303102275e-05, + "loss": 2.8037, + "step": 1045 + }, + { + "epoch": 0.29483475442181667, + "grad_norm": 840.0037841796875, + "learning_rate": 8.253905686152936e-05, + "loss": 2.7236, + "step": 1046 + }, + { + "epoch": 0.29511662321189486, + "grad_norm": 1020.0023193359375, + "learning_rate": 8.25044436635257e-05, + "loss": 2.6862, + "step": 1047 + }, + { + "epoch": 0.2953984920019731, + "grad_norm": 888.0028076171875, + "learning_rate": 8.246980346576317e-05, + "loss": 2.75, + "step": 1048 + }, + { + "epoch": 0.2956803607920513, + "grad_norm": 896.0052490234375, + "learning_rate": 8.243513629701558e-05, + "loss": 2.961, + "step": 1049 + }, + { + "epoch": 0.29596222958212953, + "grad_norm": 708.00390625, + "learning_rate": 8.240044218607916e-05, + "loss": 2.7813, + "step": 1050 + }, + { + "epoch": 0.2962440983722077, + "grad_norm": 408.0076904296875, + "learning_rate": 8.236572116177249e-05, + "loss": 3.1866, + "step": 1051 + }, + { + "epoch": 0.29652596716228596, + "grad_norm": 1064.0042724609375, + "learning_rate": 8.233097325293655e-05, + "loss": 2.5966, + "step": 1052 + }, + { + "epoch": 0.29680783595236415, + "grad_norm": 1080.0054931640625, + "learning_rate": 8.229619848843463e-05, + "loss": 3.0593, + "step": 1053 + }, + { + "epoch": 0.2970897047424424, + "grad_norm": 844.0059814453125, + "learning_rate": 8.226139689715231e-05, + "loss": 2.9571, + "step": 1054 + }, + { + "epoch": 0.29737157353252064, + "grad_norm": 928.00390625, + "learning_rate": 8.222656850799751e-05, + "loss": 2.6065, + "step": 1055 + }, + { + "epoch": 0.2976534423225988, + "grad_norm": 1056.004150390625, + "learning_rate": 8.21917133499003e-05, + "loss": 2.6503, + "step": 1056 + }, + { + "epoch": 0.29793531111267707, + "grad_norm": 1472.008056640625, + "learning_rate": 8.215683145181312e-05, + "loss": 3.377, + "step": 1057 + }, + { + "epoch": 0.29821717990275526, + "grad_norm": 1376.00390625, + "learning_rate": 8.212192284271052e-05, + "loss": 2.8368, + "step": 1058 + }, + { + "epoch": 0.2984990486928335, + "grad_norm": 1216.006103515625, + "learning_rate": 8.20869875515893e-05, + "loss": 3.3575, + "step": 1059 + }, + { + "epoch": 0.2987809174829117, + "grad_norm": 1008.004638671875, + "learning_rate": 8.205202560746838e-05, + "loss": 2.5826, + "step": 1060 + }, + { + "epoch": 0.29906278627298993, + "grad_norm": 620.0034790039062, + "learning_rate": 8.201703703938886e-05, + "loss": 2.4812, + "step": 1061 + }, + { + "epoch": 0.2993446550630681, + "grad_norm": 1992.010986328125, + "learning_rate": 8.19820218764139e-05, + "loss": 3.4486, + "step": 1062 + }, + { + "epoch": 0.29962652385314636, + "grad_norm": 1744.0115966796875, + "learning_rate": 8.194698014762881e-05, + "loss": 3.266, + "step": 1063 + }, + { + "epoch": 0.29990839264322455, + "grad_norm": 2352.007568359375, + "learning_rate": 8.191191188214092e-05, + "loss": 3.1356, + "step": 1064 + }, + { + "epoch": 0.3001902614333028, + "grad_norm": 700.002685546875, + "learning_rate": 8.187681710907964e-05, + "loss": 2.8494, + "step": 1065 + }, + { + "epoch": 0.30047213022338104, + "grad_norm": 1488.00927734375, + "learning_rate": 8.184169585759636e-05, + "loss": 3.0153, + "step": 1066 + }, + { + "epoch": 0.30075399901345923, + "grad_norm": 1528.0074462890625, + "learning_rate": 8.180654815686451e-05, + "loss": 2.9402, + "step": 1067 + }, + { + "epoch": 0.30103586780353747, + "grad_norm": 1440.003662109375, + "learning_rate": 8.177137403607947e-05, + "loss": 2.7188, + "step": 1068 + }, + { + "epoch": 0.30131773659361566, + "grad_norm": 1176.00244140625, + "learning_rate": 8.173617352445852e-05, + "loss": 3.0195, + "step": 1069 + }, + { + "epoch": 0.3015996053836939, + "grad_norm": 908.0028686523438, + "learning_rate": 8.170094665124095e-05, + "loss": 3.0001, + "step": 1070 + }, + { + "epoch": 0.3018814741737721, + "grad_norm": 936.0040893554688, + "learning_rate": 8.166569344568789e-05, + "loss": 3.7696, + "step": 1071 + }, + { + "epoch": 0.30216334296385033, + "grad_norm": 1360.008056640625, + "learning_rate": 8.16304139370823e-05, + "loss": 3.3259, + "step": 1072 + }, + { + "epoch": 0.3024452117539285, + "grad_norm": 692.004150390625, + "learning_rate": 8.159510815472913e-05, + "loss": 2.9744, + "step": 1073 + }, + { + "epoch": 0.30272708054400677, + "grad_norm": 1720.0047607421875, + "learning_rate": 8.1559776127955e-05, + "loss": 3.3539, + "step": 1074 + }, + { + "epoch": 0.303008949334085, + "grad_norm": 1128.0030517578125, + "learning_rate": 8.152441788610842e-05, + "loss": 2.8826, + "step": 1075 + }, + { + "epoch": 0.3032908181241632, + "grad_norm": 2064.00439453125, + "learning_rate": 8.148903345855965e-05, + "loss": 3.0404, + "step": 1076 + }, + { + "epoch": 0.30357268691424144, + "grad_norm": 528.0030517578125, + "learning_rate": 8.145362287470069e-05, + "loss": 2.9479, + "step": 1077 + }, + { + "epoch": 0.30385455570431963, + "grad_norm": 1048.0059814453125, + "learning_rate": 8.141818616394531e-05, + "loss": 2.6771, + "step": 1078 + }, + { + "epoch": 0.3041364244943979, + "grad_norm": 1224.0023193359375, + "learning_rate": 8.138272335572891e-05, + "loss": 3.2917, + "step": 1079 + }, + { + "epoch": 0.30441829328447606, + "grad_norm": 1176.0047607421875, + "learning_rate": 8.134723447950865e-05, + "loss": 3.2188, + "step": 1080 + }, + { + "epoch": 0.3047001620745543, + "grad_norm": 408.0036315917969, + "learning_rate": 8.131171956476328e-05, + "loss": 2.7012, + "step": 1081 + }, + { + "epoch": 0.3049820308646325, + "grad_norm": 812.0023803710938, + "learning_rate": 8.127617864099319e-05, + "loss": 2.6146, + "step": 1082 + }, + { + "epoch": 0.30526389965471074, + "grad_norm": 704.0037841796875, + "learning_rate": 8.12406117377204e-05, + "loss": 2.7813, + "step": 1083 + }, + { + "epoch": 0.3055457684447889, + "grad_norm": 936.005126953125, + "learning_rate": 8.120501888448852e-05, + "loss": 3.3197, + "step": 1084 + }, + { + "epoch": 0.30582763723486717, + "grad_norm": 1128.002685546875, + "learning_rate": 8.116940011086265e-05, + "loss": 3.0, + "step": 1085 + }, + { + "epoch": 0.3061095060249454, + "grad_norm": 1112.002197265625, + "learning_rate": 8.113375544642948e-05, + "loss": 3.0, + "step": 1086 + }, + { + "epoch": 0.3063913748150236, + "grad_norm": 1296.0068359375, + "learning_rate": 8.109808492079718e-05, + "loss": 2.71, + "step": 1087 + }, + { + "epoch": 0.30667324360510184, + "grad_norm": 844.002685546875, + "learning_rate": 8.106238856359542e-05, + "loss": 3.0879, + "step": 1088 + }, + { + "epoch": 0.30695511239518003, + "grad_norm": 1560.00439453125, + "learning_rate": 8.102666640447531e-05, + "loss": 2.8379, + "step": 1089 + }, + { + "epoch": 0.3072369811852583, + "grad_norm": 1832.005859375, + "learning_rate": 8.09909184731094e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.30751884997533646, + "grad_norm": 888.0025634765625, + "learning_rate": 8.095514479919164e-05, + "loss": 2.8054, + "step": 1091 + }, + { + "epoch": 0.3078007187654147, + "grad_norm": 1536.00537109375, + "learning_rate": 8.091934541243737e-05, + "loss": 3.099, + "step": 1092 + }, + { + "epoch": 0.3080825875554929, + "grad_norm": 458.0047302246094, + "learning_rate": 8.08835203425833e-05, + "loss": 3.0693, + "step": 1093 + }, + { + "epoch": 0.30836445634557114, + "grad_norm": 856.0056762695312, + "learning_rate": 8.084766961938747e-05, + "loss": 3.1576, + "step": 1094 + }, + { + "epoch": 0.3086463251356494, + "grad_norm": 1176.0076904296875, + "learning_rate": 8.08117932726292e-05, + "loss": 2.9089, + "step": 1095 + }, + { + "epoch": 0.30892819392572757, + "grad_norm": 1944.0086669921875, + "learning_rate": 8.07758913321091e-05, + "loss": 3.1784, + "step": 1096 + }, + { + "epoch": 0.3092100627158058, + "grad_norm": 1000.0043334960938, + "learning_rate": 8.073996382764908e-05, + "loss": 2.9183, + "step": 1097 + }, + { + "epoch": 0.309491931505884, + "grad_norm": 796.0066528320312, + "learning_rate": 8.070401078909225e-05, + "loss": 3.0446, + "step": 1098 + }, + { + "epoch": 0.30977380029596224, + "grad_norm": 800.0068359375, + "learning_rate": 8.066803224630295e-05, + "loss": 2.8266, + "step": 1099 + }, + { + "epoch": 0.31005566908604043, + "grad_norm": 1648.006103515625, + "learning_rate": 8.063202822916667e-05, + "loss": 3.6323, + "step": 1100 + }, + { + "epoch": 0.3103375378761187, + "grad_norm": 1624.0045166015625, + "learning_rate": 8.05959987675901e-05, + "loss": 3.2266, + "step": 1101 + }, + { + "epoch": 0.31061940666619686, + "grad_norm": 1280.0076904296875, + "learning_rate": 8.055994389150104e-05, + "loss": 2.8688, + "step": 1102 + }, + { + "epoch": 0.3109012754562751, + "grad_norm": 716.005126953125, + "learning_rate": 8.052386363084841e-05, + "loss": 2.7813, + "step": 1103 + }, + { + "epoch": 0.31118314424635335, + "grad_norm": 860.0037841796875, + "learning_rate": 8.048775801560222e-05, + "loss": 3.0209, + "step": 1104 + }, + { + "epoch": 0.31146501303643154, + "grad_norm": 1248.00244140625, + "learning_rate": 8.045162707575354e-05, + "loss": 2.8688, + "step": 1105 + }, + { + "epoch": 0.3117468818265098, + "grad_norm": 498.003662109375, + "learning_rate": 8.041547084131445e-05, + "loss": 2.71, + "step": 1106 + }, + { + "epoch": 0.31202875061658797, + "grad_norm": 1504.005126953125, + "learning_rate": 8.037928934231809e-05, + "loss": 2.809, + "step": 1107 + }, + { + "epoch": 0.3123106194066662, + "grad_norm": 1296.0029296875, + "learning_rate": 8.034308260881853e-05, + "loss": 2.5789, + "step": 1108 + }, + { + "epoch": 0.3125924881967444, + "grad_norm": 1416.0074462890625, + "learning_rate": 8.030685067089088e-05, + "loss": 3.0983, + "step": 1109 + }, + { + "epoch": 0.31287435698682264, + "grad_norm": 872.0033569335938, + "learning_rate": 8.027059355863107e-05, + "loss": 2.7578, + "step": 1110 + }, + { + "epoch": 0.31315622577690083, + "grad_norm": 848.00244140625, + "learning_rate": 8.023431130215605e-05, + "loss": 2.8689, + "step": 1111 + }, + { + "epoch": 0.3134380945669791, + "grad_norm": 1576.002197265625, + "learning_rate": 8.01980039316036e-05, + "loss": 3.0104, + "step": 1112 + }, + { + "epoch": 0.31371996335705726, + "grad_norm": 1360.0040283203125, + "learning_rate": 8.016167147713235e-05, + "loss": 2.544, + "step": 1113 + }, + { + "epoch": 0.3140018321471355, + "grad_norm": 1072.0018310546875, + "learning_rate": 8.012531396892185e-05, + "loss": 2.6774, + "step": 1114 + }, + { + "epoch": 0.31428370093721375, + "grad_norm": 1328.0084228515625, + "learning_rate": 8.008893143717234e-05, + "loss": 2.8037, + "step": 1115 + }, + { + "epoch": 0.31456556972729194, + "grad_norm": 556.00244140625, + "learning_rate": 8.005252391210494e-05, + "loss": 3.0017, + "step": 1116 + }, + { + "epoch": 0.3148474385173702, + "grad_norm": 1520.0052490234375, + "learning_rate": 8.001609142396149e-05, + "loss": 2.5466, + "step": 1117 + }, + { + "epoch": 0.31512930730744837, + "grad_norm": 2368.0078125, + "learning_rate": 7.997963400300454e-05, + "loss": 4.1515, + "step": 1118 + }, + { + "epoch": 0.3154111760975266, + "grad_norm": 1360.0040283203125, + "learning_rate": 7.994315167951743e-05, + "loss": 3.3207, + "step": 1119 + }, + { + "epoch": 0.3156930448876048, + "grad_norm": 972.00244140625, + "learning_rate": 7.990664448380411e-05, + "loss": 3.2315, + "step": 1120 + }, + { + "epoch": 0.31597491367768304, + "grad_norm": 2048.0068359375, + "learning_rate": 7.987011244618925e-05, + "loss": 4.0241, + "step": 1121 + }, + { + "epoch": 0.31625678246776123, + "grad_norm": 2112.0087890625, + "learning_rate": 7.983355559701808e-05, + "loss": 2.7842, + "step": 1122 + }, + { + "epoch": 0.3165386512578395, + "grad_norm": 648.0048217773438, + "learning_rate": 7.979697396665649e-05, + "loss": 3.4209, + "step": 1123 + }, + { + "epoch": 0.3168205200479177, + "grad_norm": 1680.002197265625, + "learning_rate": 7.976036758549097e-05, + "loss": 3.1075, + "step": 1124 + }, + { + "epoch": 0.3171023888379959, + "grad_norm": 720.0029296875, + "learning_rate": 7.972373648392853e-05, + "loss": 3.3328, + "step": 1125 + }, + { + "epoch": 0.31738425762807415, + "grad_norm": 1352.0023193359375, + "learning_rate": 7.968708069239672e-05, + "loss": 2.551, + "step": 1126 + }, + { + "epoch": 0.31766612641815234, + "grad_norm": 532.0037841796875, + "learning_rate": 7.965040024134365e-05, + "loss": 3.1195, + "step": 1127 + }, + { + "epoch": 0.3179479952082306, + "grad_norm": 170.00411987304688, + "learning_rate": 7.961369516123782e-05, + "loss": 2.4372, + "step": 1128 + }, + { + "epoch": 0.31822986399830877, + "grad_norm": 916.0048828125, + "learning_rate": 7.957696548256828e-05, + "loss": 2.5913, + "step": 1129 + }, + { + "epoch": 0.318511732788387, + "grad_norm": 2176.006103515625, + "learning_rate": 7.954021123584445e-05, + "loss": 4.2969, + "step": 1130 + }, + { + "epoch": 0.3187936015784652, + "grad_norm": 2416.008056640625, + "learning_rate": 7.950343245159618e-05, + "loss": 3.2162, + "step": 1131 + }, + { + "epoch": 0.31907547036854345, + "grad_norm": 2192.004150390625, + "learning_rate": 7.946662916037373e-05, + "loss": 3.5134, + "step": 1132 + }, + { + "epoch": 0.3193573391586217, + "grad_norm": 1424.00439453125, + "learning_rate": 7.942980139274766e-05, + "loss": 3.655, + "step": 1133 + }, + { + "epoch": 0.3196392079486999, + "grad_norm": 1080.0028076171875, + "learning_rate": 7.939294917930888e-05, + "loss": 2.7084, + "step": 1134 + }, + { + "epoch": 0.3199210767387781, + "grad_norm": 600.0038452148438, + "learning_rate": 7.935607255066866e-05, + "loss": 2.6875, + "step": 1135 + }, + { + "epoch": 0.3202029455288563, + "grad_norm": 2272.009033203125, + "learning_rate": 7.931917153745846e-05, + "loss": 3.2696, + "step": 1136 + }, + { + "epoch": 0.32048481431893455, + "grad_norm": 1408.0067138671875, + "learning_rate": 7.928224617033008e-05, + "loss": 3.2126, + "step": 1137 + }, + { + "epoch": 0.32076668310901274, + "grad_norm": 1912.005126953125, + "learning_rate": 7.924529647995549e-05, + "loss": 3.4089, + "step": 1138 + }, + { + "epoch": 0.321048551899091, + "grad_norm": 2368.01171875, + "learning_rate": 7.920832249702689e-05, + "loss": 3.1349, + "step": 1139 + }, + { + "epoch": 0.32133042068916917, + "grad_norm": 1424.0025634765625, + "learning_rate": 7.917132425225666e-05, + "loss": 2.9528, + "step": 1140 + }, + { + "epoch": 0.3216122894792474, + "grad_norm": 1600.0048828125, + "learning_rate": 7.91343017763773e-05, + "loss": 3.1826, + "step": 1141 + }, + { + "epoch": 0.3218941582693256, + "grad_norm": 1928.0045166015625, + "learning_rate": 7.909725510014151e-05, + "loss": 3.2787, + "step": 1142 + }, + { + "epoch": 0.32217602705940385, + "grad_norm": 832.003173828125, + "learning_rate": 7.9060184254322e-05, + "loss": 3.0563, + "step": 1143 + }, + { + "epoch": 0.3224578958494821, + "grad_norm": 864.0023803710938, + "learning_rate": 7.902308926971165e-05, + "loss": 2.8897, + "step": 1144 + }, + { + "epoch": 0.3227397646395603, + "grad_norm": 1824.0069580078125, + "learning_rate": 7.898597017712331e-05, + "loss": 2.8858, + "step": 1145 + }, + { + "epoch": 0.3230216334296385, + "grad_norm": 1104.00390625, + "learning_rate": 7.894882700738987e-05, + "loss": 2.7357, + "step": 1146 + }, + { + "epoch": 0.3233035022197167, + "grad_norm": 1352.00439453125, + "learning_rate": 7.891165979136429e-05, + "loss": 2.9763, + "step": 1147 + }, + { + "epoch": 0.32358537100979495, + "grad_norm": 2336.009521484375, + "learning_rate": 7.887446855991942e-05, + "loss": 3.4815, + "step": 1148 + }, + { + "epoch": 0.32386723979987314, + "grad_norm": 1856.004638671875, + "learning_rate": 7.88372533439481e-05, + "loss": 2.5076, + "step": 1149 + }, + { + "epoch": 0.3241491085899514, + "grad_norm": 904.0029296875, + "learning_rate": 7.880001417436309e-05, + "loss": 2.7653, + "step": 1150 + }, + { + "epoch": 0.3244309773800296, + "grad_norm": 644.0042114257812, + "learning_rate": 7.876275108209702e-05, + "loss": 2.485, + "step": 1151 + }, + { + "epoch": 0.3247128461701078, + "grad_norm": 1400.0040283203125, + "learning_rate": 7.872546409810243e-05, + "loss": 3.3467, + "step": 1152 + }, + { + "epoch": 0.32499471496018606, + "grad_norm": 1712.00390625, + "learning_rate": 7.868815325335168e-05, + "loss": 3.0845, + "step": 1153 + }, + { + "epoch": 0.32527658375026425, + "grad_norm": 1464.002685546875, + "learning_rate": 7.865081857883696e-05, + "loss": 3.0277, + "step": 1154 + }, + { + "epoch": 0.3255584525403425, + "grad_norm": 1112.00537109375, + "learning_rate": 7.861346010557026e-05, + "loss": 3.4148, + "step": 1155 + }, + { + "epoch": 0.3258403213304207, + "grad_norm": 1880.0086669921875, + "learning_rate": 7.857607786458333e-05, + "loss": 3.6707, + "step": 1156 + }, + { + "epoch": 0.3261221901204989, + "grad_norm": 1824.0059814453125, + "learning_rate": 7.853867188692763e-05, + "loss": 3.4806, + "step": 1157 + }, + { + "epoch": 0.3264040589105771, + "grad_norm": 996.0032958984375, + "learning_rate": 7.85012422036744e-05, + "loss": 3.1837, + "step": 1158 + }, + { + "epoch": 0.32668592770065535, + "grad_norm": 1552.0037841796875, + "learning_rate": 7.846378884591453e-05, + "loss": 3.1734, + "step": 1159 + }, + { + "epoch": 0.32696779649073354, + "grad_norm": 1744.0042724609375, + "learning_rate": 7.84263118447586e-05, + "loss": 3.2967, + "step": 1160 + }, + { + "epoch": 0.3272496652808118, + "grad_norm": 2112.0078125, + "learning_rate": 7.838881123133681e-05, + "loss": 3.8204, + "step": 1161 + }, + { + "epoch": 0.32753153407089, + "grad_norm": 2144.00341796875, + "learning_rate": 7.835128703679896e-05, + "loss": 2.615, + "step": 1162 + }, + { + "epoch": 0.3278134028609682, + "grad_norm": 1592.0050048828125, + "learning_rate": 7.831373929231447e-05, + "loss": 2.961, + "step": 1163 + }, + { + "epoch": 0.32809527165104646, + "grad_norm": 536.0031127929688, + "learning_rate": 7.82761680290723e-05, + "loss": 2.7188, + "step": 1164 + }, + { + "epoch": 0.32837714044112465, + "grad_norm": 864.0040283203125, + "learning_rate": 7.823857327828099e-05, + "loss": 2.7813, + "step": 1165 + }, + { + "epoch": 0.3286590092312029, + "grad_norm": 1288.0057373046875, + "learning_rate": 7.82009550711685e-05, + "loss": 3.0096, + "step": 1166 + }, + { + "epoch": 0.3289408780212811, + "grad_norm": 1288.0048828125, + "learning_rate": 7.816331343898236e-05, + "loss": 2.6263, + "step": 1167 + }, + { + "epoch": 0.3292227468113593, + "grad_norm": 1360.00390625, + "learning_rate": 7.812564841298952e-05, + "loss": 2.6784, + "step": 1168 + }, + { + "epoch": 0.3295046156014375, + "grad_norm": 1904.00537109375, + "learning_rate": 7.808796002447634e-05, + "loss": 3.7377, + "step": 1169 + }, + { + "epoch": 0.32978648439151576, + "grad_norm": 332.0041198730469, + "learning_rate": 7.805024830474867e-05, + "loss": 2.9376, + "step": 1170 + }, + { + "epoch": 0.33006835318159394, + "grad_norm": 1520.00341796875, + "learning_rate": 7.801251328513164e-05, + "loss": 3.0157, + "step": 1171 + }, + { + "epoch": 0.3303502219716722, + "grad_norm": 1272.006591796875, + "learning_rate": 7.797475499696978e-05, + "loss": 3.073, + "step": 1172 + }, + { + "epoch": 0.33063209076175043, + "grad_norm": 456.0027160644531, + "learning_rate": 7.793697347162698e-05, + "loss": 3.4054, + "step": 1173 + }, + { + "epoch": 0.3309139595518286, + "grad_norm": 2240.004638671875, + "learning_rate": 7.789916874048634e-05, + "loss": 2.6863, + "step": 1174 + }, + { + "epoch": 0.33119582834190686, + "grad_norm": 1928.006103515625, + "learning_rate": 7.786134083495033e-05, + "loss": 3.2116, + "step": 1175 + }, + { + "epoch": 0.33147769713198505, + "grad_norm": 960.0032958984375, + "learning_rate": 7.782348978644066e-05, + "loss": 2.5551, + "step": 1176 + }, + { + "epoch": 0.3317595659220633, + "grad_norm": 660.0016479492188, + "learning_rate": 7.778561562639818e-05, + "loss": 2.7693, + "step": 1177 + }, + { + "epoch": 0.3320414347121415, + "grad_norm": 1448.006591796875, + "learning_rate": 7.774771838628304e-05, + "loss": 3.6846, + "step": 1178 + }, + { + "epoch": 0.3323233035022197, + "grad_norm": 1576.0064697265625, + "learning_rate": 7.770979809757446e-05, + "loss": 2.4724, + "step": 1179 + }, + { + "epoch": 0.3326051722922979, + "grad_norm": 1472.004638671875, + "learning_rate": 7.767185479177093e-05, + "loss": 3.5166, + "step": 1180 + }, + { + "epoch": 0.33288704108237616, + "grad_norm": 1088.003173828125, + "learning_rate": 7.763388850038994e-05, + "loss": 2.5476, + "step": 1181 + }, + { + "epoch": 0.3331689098724544, + "grad_norm": 908.0050048828125, + "learning_rate": 7.759589925496816e-05, + "loss": 2.8429, + "step": 1182 + }, + { + "epoch": 0.3334507786625326, + "grad_norm": 1480.005126953125, + "learning_rate": 7.755788708706124e-05, + "loss": 3.5238, + "step": 1183 + }, + { + "epoch": 0.33373264745261083, + "grad_norm": 1008.00146484375, + "learning_rate": 7.751985202824397e-05, + "loss": 2.7084, + "step": 1184 + }, + { + "epoch": 0.334014516242689, + "grad_norm": 720.0026245117188, + "learning_rate": 7.748179411011008e-05, + "loss": 2.7813, + "step": 1185 + }, + { + "epoch": 0.33429638503276726, + "grad_norm": 1768.00390625, + "learning_rate": 7.744371336427231e-05, + "loss": 3.2975, + "step": 1186 + }, + { + "epoch": 0.33457825382284545, + "grad_norm": 1168.0042724609375, + "learning_rate": 7.740560982236238e-05, + "loss": 2.7534, + "step": 1187 + }, + { + "epoch": 0.3348601226129237, + "grad_norm": 732.003173828125, + "learning_rate": 7.736748351603092e-05, + "loss": 2.8447, + "step": 1188 + }, + { + "epoch": 0.3351419914030019, + "grad_norm": 1304.0052490234375, + "learning_rate": 7.732933447694747e-05, + "loss": 2.376, + "step": 1189 + }, + { + "epoch": 0.3354238601930801, + "grad_norm": 1008.0025024414062, + "learning_rate": 7.729116273680049e-05, + "loss": 3.0054, + "step": 1190 + }, + { + "epoch": 0.3357057289831583, + "grad_norm": 1408.0023193359375, + "learning_rate": 7.725296832729725e-05, + "loss": 2.6502, + "step": 1191 + }, + { + "epoch": 0.33598759777323656, + "grad_norm": 984.006103515625, + "learning_rate": 7.721475128016386e-05, + "loss": 2.735, + "step": 1192 + }, + { + "epoch": 0.3362694665633148, + "grad_norm": 1808.00634765625, + "learning_rate": 7.717651162714527e-05, + "loss": 3.6563, + "step": 1193 + }, + { + "epoch": 0.336551335353393, + "grad_norm": 2080.00830078125, + "learning_rate": 7.713824940000513e-05, + "loss": 3.2988, + "step": 1194 + }, + { + "epoch": 0.33683320414347123, + "grad_norm": 1712.0052490234375, + "learning_rate": 7.709996463052596e-05, + "loss": 3.1572, + "step": 1195 + }, + { + "epoch": 0.3371150729335494, + "grad_norm": 1456.0042724609375, + "learning_rate": 7.706165735050889e-05, + "loss": 2.6787, + "step": 1196 + }, + { + "epoch": 0.33739694172362766, + "grad_norm": 1512.0098876953125, + "learning_rate": 7.702332759177381e-05, + "loss": 3.3643, + "step": 1197 + }, + { + "epoch": 0.33767881051370585, + "grad_norm": 1360.00634765625, + "learning_rate": 7.698497538615927e-05, + "loss": 3.3952, + "step": 1198 + }, + { + "epoch": 0.3379606793037841, + "grad_norm": 1024.0037841796875, + "learning_rate": 7.694660076552244e-05, + "loss": 3.0052, + "step": 1199 + }, + { + "epoch": 0.3382425480938623, + "grad_norm": 824.0040893554688, + "learning_rate": 7.690820376173916e-05, + "loss": 2.8587, + "step": 1200 + }, + { + "epoch": 0.3385244168839405, + "grad_norm": 1296.0052490234375, + "learning_rate": 7.68697844067038e-05, + "loss": 3.1238, + "step": 1201 + }, + { + "epoch": 0.33880628567401877, + "grad_norm": 676.0048217773438, + "learning_rate": 7.683134273232938e-05, + "loss": 3.0519, + "step": 1202 + }, + { + "epoch": 0.33908815446409696, + "grad_norm": 478.0047302246094, + "learning_rate": 7.679287877054734e-05, + "loss": 3.3223, + "step": 1203 + }, + { + "epoch": 0.3393700232541752, + "grad_norm": 1728.0062255859375, + "learning_rate": 7.675439255330778e-05, + "loss": 3.4431, + "step": 1204 + }, + { + "epoch": 0.3396518920442534, + "grad_norm": 868.0060424804688, + "learning_rate": 7.671588411257915e-05, + "loss": 2.9724, + "step": 1205 + }, + { + "epoch": 0.33993376083433163, + "grad_norm": 1712.0029296875, + "learning_rate": 7.667735348034844e-05, + "loss": 2.5927, + "step": 1206 + }, + { + "epoch": 0.3402156296244098, + "grad_norm": 1552.007568359375, + "learning_rate": 7.663880068862106e-05, + "loss": 3.3776, + "step": 1207 + }, + { + "epoch": 0.34049749841448806, + "grad_norm": 1120.0050048828125, + "learning_rate": 7.660022576942078e-05, + "loss": 2.6502, + "step": 1208 + }, + { + "epoch": 0.34077936720456625, + "grad_norm": 1136.0030517578125, + "learning_rate": 7.656162875478985e-05, + "loss": 2.7813, + "step": 1209 + }, + { + "epoch": 0.3410612359946445, + "grad_norm": 414.0047302246094, + "learning_rate": 7.652300967678873e-05, + "loss": 2.7813, + "step": 1210 + }, + { + "epoch": 0.3413431047847227, + "grad_norm": 1304.0064697265625, + "learning_rate": 7.64843685674964e-05, + "loss": 2.6387, + "step": 1211 + }, + { + "epoch": 0.3416249735748009, + "grad_norm": 1248.00341796875, + "learning_rate": 7.644570545900992e-05, + "loss": 2.7813, + "step": 1212 + }, + { + "epoch": 0.34190684236487917, + "grad_norm": 632.0027465820312, + "learning_rate": 7.64070203834448e-05, + "loss": 2.7813, + "step": 1213 + }, + { + "epoch": 0.34218871115495736, + "grad_norm": 1384.00732421875, + "learning_rate": 7.636831337293474e-05, + "loss": 2.7813, + "step": 1214 + }, + { + "epoch": 0.3424705799450356, + "grad_norm": 900.0025634765625, + "learning_rate": 7.632958445963156e-05, + "loss": 2.9956, + "step": 1215 + }, + { + "epoch": 0.3427524487351138, + "grad_norm": 952.0039672851562, + "learning_rate": 7.629083367570546e-05, + "loss": 3.1387, + "step": 1216 + }, + { + "epoch": 0.34303431752519203, + "grad_norm": 720.0029907226562, + "learning_rate": 7.625206105334466e-05, + "loss": 3.2559, + "step": 1217 + }, + { + "epoch": 0.3433161863152702, + "grad_norm": 1448.0079345703125, + "learning_rate": 7.62132666247556e-05, + "loss": 3.4943, + "step": 1218 + }, + { + "epoch": 0.34359805510534847, + "grad_norm": 1880.00732421875, + "learning_rate": 7.617445042216278e-05, + "loss": 3.4229, + "step": 1219 + }, + { + "epoch": 0.34387992389542665, + "grad_norm": 532.0028686523438, + "learning_rate": 7.613561247780882e-05, + "loss": 2.6545, + "step": 1220 + }, + { + "epoch": 0.3441617926855049, + "grad_norm": 2464.011962890625, + "learning_rate": 7.609675282395439e-05, + "loss": 3.6892, + "step": 1221 + }, + { + "epoch": 0.34444366147558314, + "grad_norm": 1888.0076904296875, + "learning_rate": 7.605787149287818e-05, + "loss": 3.5059, + "step": 1222 + }, + { + "epoch": 0.34472553026566133, + "grad_norm": 700.0029907226562, + "learning_rate": 7.601896851687693e-05, + "loss": 2.9383, + "step": 1223 + }, + { + "epoch": 0.34500739905573957, + "grad_norm": 868.0025024414062, + "learning_rate": 7.59800439282653e-05, + "loss": 2.8594, + "step": 1224 + }, + { + "epoch": 0.34528926784581776, + "grad_norm": 1336.0067138671875, + "learning_rate": 7.594109775937595e-05, + "loss": 2.7741, + "step": 1225 + }, + { + "epoch": 0.345571136635896, + "grad_norm": 1080.0035400390625, + "learning_rate": 7.590213004255942e-05, + "loss": 2.9004, + "step": 1226 + }, + { + "epoch": 0.3458530054259742, + "grad_norm": 696.0040283203125, + "learning_rate": 7.586314081018421e-05, + "loss": 2.8885, + "step": 1227 + }, + { + "epoch": 0.34613487421605243, + "grad_norm": 2128.00732421875, + "learning_rate": 7.582413009463664e-05, + "loss": 3.2445, + "step": 1228 + }, + { + "epoch": 0.3464167430061306, + "grad_norm": 1608.0045166015625, + "learning_rate": 7.578509792832088e-05, + "loss": 3.4404, + "step": 1229 + }, + { + "epoch": 0.34669861179620887, + "grad_norm": 156.0040283203125, + "learning_rate": 7.574604434365894e-05, + "loss": 2.9841, + "step": 1230 + }, + { + "epoch": 0.3469804805862871, + "grad_norm": 1008.0035400390625, + "learning_rate": 7.570696937309062e-05, + "loss": 2.6779, + "step": 1231 + }, + { + "epoch": 0.3472623493763653, + "grad_norm": 1032.003173828125, + "learning_rate": 7.566787304907348e-05, + "loss": 3.0195, + "step": 1232 + }, + { + "epoch": 0.34754421816644354, + "grad_norm": 660.0020141601562, + "learning_rate": 7.562875540408278e-05, + "loss": 2.9401, + "step": 1233 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 362.0030517578125, + "learning_rate": 7.558961647061156e-05, + "loss": 2.6065, + "step": 1234 + }, + { + "epoch": 0.3481079557466, + "grad_norm": 1312.0020751953125, + "learning_rate": 7.555045628117049e-05, + "loss": 2.71, + "step": 1235 + }, + { + "epoch": 0.34838982453667816, + "grad_norm": 988.0043334960938, + "learning_rate": 7.551127486828791e-05, + "loss": 2.7813, + "step": 1236 + }, + { + "epoch": 0.3486716933267564, + "grad_norm": 1600.0045166015625, + "learning_rate": 7.547207226450979e-05, + "loss": 3.3125, + "step": 1237 + }, + { + "epoch": 0.3489535621168346, + "grad_norm": 1144.004638671875, + "learning_rate": 7.543284850239974e-05, + "loss": 3.0196, + "step": 1238 + }, + { + "epoch": 0.34923543090691284, + "grad_norm": 1176.0052490234375, + "learning_rate": 7.539360361453883e-05, + "loss": 2.9004, + "step": 1239 + }, + { + "epoch": 0.349517299696991, + "grad_norm": 1232.0069580078125, + "learning_rate": 7.535433763352582e-05, + "loss": 3.4453, + "step": 1240 + }, + { + "epoch": 0.34979916848706927, + "grad_norm": 892.0045166015625, + "learning_rate": 7.531505059197692e-05, + "loss": 2.9334, + "step": 1241 + }, + { + "epoch": 0.3500810372771475, + "grad_norm": 1176.0054931640625, + "learning_rate": 7.527574252252583e-05, + "loss": 3.4707, + "step": 1242 + }, + { + "epoch": 0.3503629060672257, + "grad_norm": 1208.008544921875, + "learning_rate": 7.523641345782375e-05, + "loss": 3.184, + "step": 1243 + }, + { + "epoch": 0.35064477485730394, + "grad_norm": 1056.0062255859375, + "learning_rate": 7.519706343053926e-05, + "loss": 2.836, + "step": 1244 + }, + { + "epoch": 0.35092664364738213, + "grad_norm": 696.0032348632812, + "learning_rate": 7.515769247335843e-05, + "loss": 3.0906, + "step": 1245 + }, + { + "epoch": 0.3512085124374604, + "grad_norm": 1456.0029296875, + "learning_rate": 7.511830061898463e-05, + "loss": 3.0499, + "step": 1246 + }, + { + "epoch": 0.35149038122753856, + "grad_norm": 844.0020141601562, + "learning_rate": 7.507888790013868e-05, + "loss": 3.043, + "step": 1247 + }, + { + "epoch": 0.3517722500176168, + "grad_norm": 868.0021362304688, + "learning_rate": 7.503945434955867e-05, + "loss": 3.0076, + "step": 1248 + }, + { + "epoch": 0.352054118807695, + "grad_norm": 1224.0040283203125, + "learning_rate": 7.500000000000001e-05, + "loss": 2.5877, + "step": 1249 + }, + { + "epoch": 0.35233598759777324, + "grad_norm": 1632.0037841796875, + "learning_rate": 7.496052488423537e-05, + "loss": 2.8865, + "step": 1250 + }, + { + "epoch": 0.3526178563878515, + "grad_norm": 684.0023803710938, + "learning_rate": 7.492102903505471e-05, + "loss": 2.7025, + "step": 1251 + }, + { + "epoch": 0.35289972517792967, + "grad_norm": 1320.0062255859375, + "learning_rate": 7.488151248526518e-05, + "loss": 2.8575, + "step": 1252 + }, + { + "epoch": 0.3531815939680079, + "grad_norm": 616.0033569335938, + "learning_rate": 7.484197526769112e-05, + "loss": 3.3203, + "step": 1253 + }, + { + "epoch": 0.3534634627580861, + "grad_norm": 1456.005126953125, + "learning_rate": 7.480241741517406e-05, + "loss": 3.0881, + "step": 1254 + }, + { + "epoch": 0.35374533154816434, + "grad_norm": 676.0027465820312, + "learning_rate": 7.476283896057267e-05, + "loss": 2.9448, + "step": 1255 + }, + { + "epoch": 0.35402720033824253, + "grad_norm": 1120.0030517578125, + "learning_rate": 7.472323993676269e-05, + "loss": 2.9645, + "step": 1256 + }, + { + "epoch": 0.3543090691283208, + "grad_norm": 1168.0028076171875, + "learning_rate": 7.468362037663703e-05, + "loss": 2.946, + "step": 1257 + }, + { + "epoch": 0.35459093791839896, + "grad_norm": 1936.0048828125, + "learning_rate": 7.464398031310556e-05, + "loss": 3.3574, + "step": 1258 + }, + { + "epoch": 0.3548728067084772, + "grad_norm": 1616.002685546875, + "learning_rate": 7.460431977909527e-05, + "loss": 2.9004, + "step": 1259 + }, + { + "epoch": 0.35515467549855545, + "grad_norm": 680.0018310546875, + "learning_rate": 7.456463880755007e-05, + "loss": 2.5061, + "step": 1260 + }, + { + "epoch": 0.35543654428863364, + "grad_norm": 1168.0037841796875, + "learning_rate": 7.452493743143092e-05, + "loss": 2.6387, + "step": 1261 + }, + { + "epoch": 0.3557184130787119, + "grad_norm": 1600.0069580078125, + "learning_rate": 7.448521568371571e-05, + "loss": 2.9004, + "step": 1262 + }, + { + "epoch": 0.35600028186879007, + "grad_norm": 1200.0052490234375, + "learning_rate": 7.444547359739918e-05, + "loss": 2.6987, + "step": 1263 + }, + { + "epoch": 0.3562821506588683, + "grad_norm": 1824.00244140625, + "learning_rate": 7.440571120549309e-05, + "loss": 3.0479, + "step": 1264 + }, + { + "epoch": 0.3565640194489465, + "grad_norm": 1544.0030517578125, + "learning_rate": 7.436592854102598e-05, + "loss": 3.2578, + "step": 1265 + }, + { + "epoch": 0.35684588823902474, + "grad_norm": 864.0022583007812, + "learning_rate": 7.432612563704322e-05, + "loss": 2.8588, + "step": 1266 + }, + { + "epoch": 0.35712775702910293, + "grad_norm": 460.0022888183594, + "learning_rate": 7.428630252660704e-05, + "loss": 2.6944, + "step": 1267 + }, + { + "epoch": 0.3574096258191812, + "grad_norm": 1232.0042724609375, + "learning_rate": 7.424645924279647e-05, + "loss": 2.7813, + "step": 1268 + }, + { + "epoch": 0.35769149460925936, + "grad_norm": 992.0018920898438, + "learning_rate": 7.420659581870724e-05, + "loss": 3.1589, + "step": 1269 + }, + { + "epoch": 0.3579733633993376, + "grad_norm": 1584.0047607421875, + "learning_rate": 7.416671228745181e-05, + "loss": 3.2188, + "step": 1270 + }, + { + "epoch": 0.35825523218941585, + "grad_norm": 1576.006591796875, + "learning_rate": 7.412680868215939e-05, + "loss": 3.3496, + "step": 1271 + }, + { + "epoch": 0.35853710097949404, + "grad_norm": 1728.0068359375, + "learning_rate": 7.408688503597583e-05, + "loss": 2.917, + "step": 1272 + }, + { + "epoch": 0.3588189697695723, + "grad_norm": 1952.008056640625, + "learning_rate": 7.404694138206365e-05, + "loss": 3.1172, + "step": 1273 + }, + { + "epoch": 0.35910083855965047, + "grad_norm": 2272.005615234375, + "learning_rate": 7.400697775360194e-05, + "loss": 3.0648, + "step": 1274 + }, + { + "epoch": 0.3593827073497287, + "grad_norm": 1488.0028076171875, + "learning_rate": 7.396699418378649e-05, + "loss": 2.9271, + "step": 1275 + }, + { + "epoch": 0.3596645761398069, + "grad_norm": 912.0045166015625, + "learning_rate": 7.39269907058295e-05, + "loss": 3.1472, + "step": 1276 + }, + { + "epoch": 0.35994644492988515, + "grad_norm": 2880.007568359375, + "learning_rate": 7.388696735295982e-05, + "loss": 3.8278, + "step": 1277 + }, + { + "epoch": 0.36022831371996333, + "grad_norm": 572.0023803710938, + "learning_rate": 7.38469241584228e-05, + "loss": 3.0612, + "step": 1278 + }, + { + "epoch": 0.3605101825100416, + "grad_norm": 936.002685546875, + "learning_rate": 7.380686115548024e-05, + "loss": 3.2494, + "step": 1279 + }, + { + "epoch": 0.3607920513001198, + "grad_norm": 2160.005126953125, + "learning_rate": 7.376677837741038e-05, + "loss": 3.3812, + "step": 1280 + }, + { + "epoch": 0.361073920090198, + "grad_norm": 2192.00537109375, + "learning_rate": 7.37266758575079e-05, + "loss": 3.8555, + "step": 1281 + }, + { + "epoch": 0.36135578888027625, + "grad_norm": 1312.002685546875, + "learning_rate": 7.368655362908393e-05, + "loss": 3.503, + "step": 1282 + }, + { + "epoch": 0.36163765767035444, + "grad_norm": 1176.0037841796875, + "learning_rate": 7.364641172546591e-05, + "loss": 2.7693, + "step": 1283 + }, + { + "epoch": 0.3619195264604327, + "grad_norm": 426.00311279296875, + "learning_rate": 7.360625017999764e-05, + "loss": 3.0095, + "step": 1284 + }, + { + "epoch": 0.36220139525051087, + "grad_norm": 696.0028686523438, + "learning_rate": 7.356606902603925e-05, + "loss": 2.9043, + "step": 1285 + }, + { + "epoch": 0.3624832640405891, + "grad_norm": 648.003662109375, + "learning_rate": 7.352586829696711e-05, + "loss": 3.0095, + "step": 1286 + }, + { + "epoch": 0.3627651328306673, + "grad_norm": 1688.0054931640625, + "learning_rate": 7.34856480261739e-05, + "loss": 3.4492, + "step": 1287 + }, + { + "epoch": 0.36304700162074555, + "grad_norm": 520.0020751953125, + "learning_rate": 7.344540824706854e-05, + "loss": 3.0488, + "step": 1288 + }, + { + "epoch": 0.36332887041082373, + "grad_norm": 1072.0030517578125, + "learning_rate": 7.340514899307612e-05, + "loss": 3.0278, + "step": 1289 + }, + { + "epoch": 0.363610739200902, + "grad_norm": 1712.003173828125, + "learning_rate": 7.33648702976379e-05, + "loss": 3.4854, + "step": 1290 + }, + { + "epoch": 0.3638926079909802, + "grad_norm": 1072.0028076171875, + "learning_rate": 7.332457219421132e-05, + "loss": 2.9152, + "step": 1291 + }, + { + "epoch": 0.3641744767810584, + "grad_norm": 1020.0029296875, + "learning_rate": 7.328425471626993e-05, + "loss": 2.8305, + "step": 1292 + }, + { + "epoch": 0.36445634557113665, + "grad_norm": 1304.0059814453125, + "learning_rate": 7.324391789730339e-05, + "loss": 2.9948, + "step": 1293 + }, + { + "epoch": 0.36473821436121484, + "grad_norm": 1328.003662109375, + "learning_rate": 7.320356177081736e-05, + "loss": 2.9297, + "step": 1294 + }, + { + "epoch": 0.3650200831512931, + "grad_norm": 1072.0030517578125, + "learning_rate": 7.316318637033364e-05, + "loss": 2.9987, + "step": 1295 + }, + { + "epoch": 0.3653019519413713, + "grad_norm": 864.0023803710938, + "learning_rate": 7.312279172938994e-05, + "loss": 3.0489, + "step": 1296 + }, + { + "epoch": 0.3655838207314495, + "grad_norm": 1832.0054931640625, + "learning_rate": 7.308237788154003e-05, + "loss": 3.0182, + "step": 1297 + }, + { + "epoch": 0.3658656895215277, + "grad_norm": 868.0023193359375, + "learning_rate": 7.304194486035357e-05, + "loss": 2.7569, + "step": 1298 + }, + { + "epoch": 0.36614755831160595, + "grad_norm": 768.0031127929688, + "learning_rate": 7.300149269941623e-05, + "loss": 3.0359, + "step": 1299 + }, + { + "epoch": 0.3664294271016842, + "grad_norm": 1232.0037841796875, + "learning_rate": 7.296102143232948e-05, + "loss": 2.7129, + "step": 1300 + }, + { + "epoch": 0.3667112958917624, + "grad_norm": 1272.002197265625, + "learning_rate": 7.29205310927107e-05, + "loss": 2.835, + "step": 1301 + }, + { + "epoch": 0.3669931646818406, + "grad_norm": 1592.0054931640625, + "learning_rate": 7.288002171419315e-05, + "loss": 3.005, + "step": 1302 + }, + { + "epoch": 0.3672750334719188, + "grad_norm": 1664.0030517578125, + "learning_rate": 7.283949333042585e-05, + "loss": 2.71, + "step": 1303 + }, + { + "epoch": 0.36755690226199705, + "grad_norm": 704.0027465820312, + "learning_rate": 7.279894597507367e-05, + "loss": 2.9776, + "step": 1304 + }, + { + "epoch": 0.36783877105207524, + "grad_norm": 692.004150390625, + "learning_rate": 7.275837968181717e-05, + "loss": 2.8233, + "step": 1305 + }, + { + "epoch": 0.3681206398421535, + "grad_norm": 632.0031127929688, + "learning_rate": 7.271779448435266e-05, + "loss": 2.7813, + "step": 1306 + }, + { + "epoch": 0.3684025086322317, + "grad_norm": 952.0020751953125, + "learning_rate": 7.267719041639218e-05, + "loss": 2.6862, + "step": 1307 + }, + { + "epoch": 0.3686843774223099, + "grad_norm": 1592.00537109375, + "learning_rate": 7.263656751166341e-05, + "loss": 2.7188, + "step": 1308 + }, + { + "epoch": 0.36896624621238816, + "grad_norm": 904.0025024414062, + "learning_rate": 7.259592580390972e-05, + "loss": 2.75, + "step": 1309 + }, + { + "epoch": 0.36924811500246635, + "grad_norm": 644.0028686523438, + "learning_rate": 7.255526532689007e-05, + "loss": 2.9956, + "step": 1310 + }, + { + "epoch": 0.3695299837925446, + "grad_norm": 652.00390625, + "learning_rate": 7.2514586114379e-05, + "loss": 2.7599, + "step": 1311 + }, + { + "epoch": 0.3698118525826228, + "grad_norm": 1216.0048828125, + "learning_rate": 7.247388820016662e-05, + "loss": 2.9598, + "step": 1312 + }, + { + "epoch": 0.370093721372701, + "grad_norm": 2192.00537109375, + "learning_rate": 7.243317161805858e-05, + "loss": 4.1537, + "step": 1313 + }, + { + "epoch": 0.3703755901627792, + "grad_norm": 1048.003662109375, + "learning_rate": 7.239243640187607e-05, + "loss": 2.9154, + "step": 1314 + }, + { + "epoch": 0.37065745895285745, + "grad_norm": 3104.00927734375, + "learning_rate": 7.23516825854557e-05, + "loss": 4.4968, + "step": 1315 + }, + { + "epoch": 0.37093932774293564, + "grad_norm": 1020.0042724609375, + "learning_rate": 7.231091020264956e-05, + "loss": 3.2478, + "step": 1316 + }, + { + "epoch": 0.3712211965330139, + "grad_norm": 1744.00390625, + "learning_rate": 7.227011928732515e-05, + "loss": 3.1452, + "step": 1317 + }, + { + "epoch": 0.3715030653230921, + "grad_norm": 326.0036926269531, + "learning_rate": 7.222930987336537e-05, + "loss": 3.0052, + "step": 1318 + }, + { + "epoch": 0.3717849341131703, + "grad_norm": 600.00244140625, + "learning_rate": 7.218848199466851e-05, + "loss": 2.8568, + "step": 1319 + }, + { + "epoch": 0.37206680290324856, + "grad_norm": 1136.0074462890625, + "learning_rate": 7.214763568514817e-05, + "loss": 2.9584, + "step": 1320 + }, + { + "epoch": 0.37234867169332675, + "grad_norm": 1088.0047607421875, + "learning_rate": 7.210677097873324e-05, + "loss": 3.1068, + "step": 1321 + }, + { + "epoch": 0.372630540483405, + "grad_norm": 1824.0040283203125, + "learning_rate": 7.206588790936793e-05, + "loss": 3.0195, + "step": 1322 + }, + { + "epoch": 0.3729124092734832, + "grad_norm": 1960.0037841796875, + "learning_rate": 7.202498651101172e-05, + "loss": 2.4878, + "step": 1323 + }, + { + "epoch": 0.3731942780635614, + "grad_norm": 2080.010986328125, + "learning_rate": 7.198406681763925e-05, + "loss": 2.7813, + "step": 1324 + }, + { + "epoch": 0.3734761468536396, + "grad_norm": 1240.0047607421875, + "learning_rate": 7.194312886324039e-05, + "loss": 2.7813, + "step": 1325 + }, + { + "epoch": 0.37375801564371786, + "grad_norm": 688.00439453125, + "learning_rate": 7.19021726818202e-05, + "loss": 2.7813, + "step": 1326 + }, + { + "epoch": 0.37403988443379604, + "grad_norm": 984.003173828125, + "learning_rate": 7.186119830739883e-05, + "loss": 2.6387, + "step": 1327 + }, + { + "epoch": 0.3743217532238743, + "grad_norm": 1720.00439453125, + "learning_rate": 7.18202057740116e-05, + "loss": 2.8897, + "step": 1328 + }, + { + "epoch": 0.37460362201395253, + "grad_norm": 1168.0023193359375, + "learning_rate": 7.177919511570887e-05, + "loss": 2.7861, + "step": 1329 + }, + { + "epoch": 0.3748854908040307, + "grad_norm": 1360.0067138671875, + "learning_rate": 7.173816636655611e-05, + "loss": 3.0782, + "step": 1330 + }, + { + "epoch": 0.37516735959410896, + "grad_norm": 1256.004638671875, + "learning_rate": 7.169711956063372e-05, + "loss": 2.75, + "step": 1331 + }, + { + "epoch": 0.37544922838418715, + "grad_norm": 876.0042114257812, + "learning_rate": 7.165605473203719e-05, + "loss": 2.7813, + "step": 1332 + }, + { + "epoch": 0.3757310971742654, + "grad_norm": 1032.00537109375, + "learning_rate": 7.161497191487693e-05, + "loss": 3.0196, + "step": 1333 + }, + { + "epoch": 0.3760129659643436, + "grad_norm": 1152.003173828125, + "learning_rate": 7.157387114327832e-05, + "loss": 2.7813, + "step": 1334 + }, + { + "epoch": 0.3762948347544218, + "grad_norm": 944.005126953125, + "learning_rate": 7.153275245138167e-05, + "loss": 2.75, + "step": 1335 + }, + { + "epoch": 0.3765767035445, + "grad_norm": 1848.007080078125, + "learning_rate": 7.14916158733421e-05, + "loss": 3.0, + "step": 1336 + }, + { + "epoch": 0.37685857233457826, + "grad_norm": 1192.003173828125, + "learning_rate": 7.145046144332965e-05, + "loss": 2.7813, + "step": 1337 + }, + { + "epoch": 0.37714044112465644, + "grad_norm": 872.0038452148438, + "learning_rate": 7.140928919552921e-05, + "loss": 2.75, + "step": 1338 + }, + { + "epoch": 0.3774223099147347, + "grad_norm": 960.0040893554688, + "learning_rate": 7.136809916414038e-05, + "loss": 2.7813, + "step": 1339 + }, + { + "epoch": 0.37770417870481293, + "grad_norm": 760.0031127929688, + "learning_rate": 7.132689138337765e-05, + "loss": 2.9968, + "step": 1340 + }, + { + "epoch": 0.3779860474948911, + "grad_norm": 944.0028686523438, + "learning_rate": 7.128566588747012e-05, + "loss": 3.0195, + "step": 1341 + }, + { + "epoch": 0.37826791628496936, + "grad_norm": 380.00335693359375, + "learning_rate": 7.124442271066175e-05, + "loss": 2.8519, + "step": 1342 + }, + { + "epoch": 0.37854978507504755, + "grad_norm": 1536.0048828125, + "learning_rate": 7.120316188721105e-05, + "loss": 3.2103, + "step": 1343 + }, + { + "epoch": 0.3788316538651258, + "grad_norm": 1280.0032958984375, + "learning_rate": 7.116188345139126e-05, + "loss": 3.6182, + "step": 1344 + }, + { + "epoch": 0.379113522655204, + "grad_norm": 832.0030517578125, + "learning_rate": 7.112058743749028e-05, + "loss": 2.9942, + "step": 1345 + }, + { + "epoch": 0.3793953914452822, + "grad_norm": 1020.0040283203125, + "learning_rate": 7.107927387981054e-05, + "loss": 3.1367, + "step": 1346 + }, + { + "epoch": 0.3796772602353604, + "grad_norm": 3280.01806640625, + "learning_rate": 7.103794281266907e-05, + "loss": 4.2611, + "step": 1347 + }, + { + "epoch": 0.37995912902543866, + "grad_norm": 314.0032958984375, + "learning_rate": 7.099659427039748e-05, + "loss": 3.1234, + "step": 1348 + }, + { + "epoch": 0.3802409978155169, + "grad_norm": 828.0038452148438, + "learning_rate": 7.095522828734183e-05, + "loss": 3.2188, + "step": 1349 + }, + { + "epoch": 0.3805228666055951, + "grad_norm": 928.0040893554688, + "learning_rate": 7.091384489786271e-05, + "loss": 2.7813, + "step": 1350 + }, + { + "epoch": 0.38080473539567333, + "grad_norm": 2032.0062255859375, + "learning_rate": 7.087244413633515e-05, + "loss": 3.5576, + "step": 1351 + }, + { + "epoch": 0.3810866041857515, + "grad_norm": 1232.00634765625, + "learning_rate": 7.083102603714863e-05, + "loss": 3.2783, + "step": 1352 + }, + { + "epoch": 0.38136847297582976, + "grad_norm": 1104.0069580078125, + "learning_rate": 7.078959063470705e-05, + "loss": 3.4138, + "step": 1353 + }, + { + "epoch": 0.38165034176590795, + "grad_norm": 604.0028686523438, + "learning_rate": 7.074813796342861e-05, + "loss": 3.1341, + "step": 1354 + }, + { + "epoch": 0.3819322105559862, + "grad_norm": 1184.0028076171875, + "learning_rate": 7.070666805774593e-05, + "loss": 2.8281, + "step": 1355 + }, + { + "epoch": 0.3822140793460644, + "grad_norm": 1192.0045166015625, + "learning_rate": 7.066518095210588e-05, + "loss": 3.2217, + "step": 1356 + }, + { + "epoch": 0.3824959481361426, + "grad_norm": 1840.0047607421875, + "learning_rate": 7.062367668096967e-05, + "loss": 3.2857, + "step": 1357 + }, + { + "epoch": 0.38277781692622087, + "grad_norm": 1640.0081787109375, + "learning_rate": 7.058215527881277e-05, + "loss": 2.9916, + "step": 1358 + }, + { + "epoch": 0.38305968571629906, + "grad_norm": 952.0034790039062, + "learning_rate": 7.054061678012483e-05, + "loss": 3.4834, + "step": 1359 + }, + { + "epoch": 0.3833415545063773, + "grad_norm": 1000.0059204101562, + "learning_rate": 7.049906121940974e-05, + "loss": 3.0209, + "step": 1360 + }, + { + "epoch": 0.3836234232964555, + "grad_norm": 1720.00732421875, + "learning_rate": 7.045748863118556e-05, + "loss": 2.9363, + "step": 1361 + }, + { + "epoch": 0.38390529208653373, + "grad_norm": 1072.002685546875, + "learning_rate": 7.041589904998447e-05, + "loss": 3.1921, + "step": 1362 + }, + { + "epoch": 0.3841871608766119, + "grad_norm": 346.00311279296875, + "learning_rate": 7.037429251035279e-05, + "loss": 2.961, + "step": 1363 + }, + { + "epoch": 0.38446902966669017, + "grad_norm": 704.0033569335938, + "learning_rate": 7.033266904685094e-05, + "loss": 2.6055, + "step": 1364 + }, + { + "epoch": 0.38475089845676835, + "grad_norm": 888.0039672851562, + "learning_rate": 7.029102869405334e-05, + "loss": 2.6502, + "step": 1365 + }, + { + "epoch": 0.3850327672468466, + "grad_norm": 1768.00732421875, + "learning_rate": 7.024937148654851e-05, + "loss": 2.8994, + "step": 1366 + }, + { + "epoch": 0.3853146360369248, + "grad_norm": 868.004150390625, + "learning_rate": 7.020769745893891e-05, + "loss": 2.7396, + "step": 1367 + }, + { + "epoch": 0.38559650482700303, + "grad_norm": 1408.00634765625, + "learning_rate": 7.016600664584101e-05, + "loss": 3.1934, + "step": 1368 + }, + { + "epoch": 0.38587837361708127, + "grad_norm": 676.0023193359375, + "learning_rate": 7.012429908188523e-05, + "loss": 2.6355, + "step": 1369 + }, + { + "epoch": 0.38616024240715946, + "grad_norm": 506.0038146972656, + "learning_rate": 7.008257480171583e-05, + "loss": 2.8682, + "step": 1370 + }, + { + "epoch": 0.3864421111972377, + "grad_norm": 1784.0091552734375, + "learning_rate": 7.004083383999107e-05, + "loss": 3.6009, + "step": 1371 + }, + { + "epoch": 0.3867239799873159, + "grad_norm": 1784.0025634765625, + "learning_rate": 6.999907623138296e-05, + "loss": 2.835, + "step": 1372 + }, + { + "epoch": 0.38700584877739413, + "grad_norm": 1032.0054931640625, + "learning_rate": 6.99573020105774e-05, + "loss": 2.75, + "step": 1373 + }, + { + "epoch": 0.3872877175674723, + "grad_norm": 1120.005615234375, + "learning_rate": 6.991551121227412e-05, + "loss": 3.0938, + "step": 1374 + }, + { + "epoch": 0.38756958635755057, + "grad_norm": 752.0037841796875, + "learning_rate": 6.987370387118647e-05, + "loss": 2.6025, + "step": 1375 + }, + { + "epoch": 0.38785145514762875, + "grad_norm": 2784.00634765625, + "learning_rate": 6.983188002204175e-05, + "loss": 3.1039, + "step": 1376 + }, + { + "epoch": 0.388133323937707, + "grad_norm": 1040.002197265625, + "learning_rate": 6.97900396995808e-05, + "loss": 2.7543, + "step": 1377 + }, + { + "epoch": 0.38841519272778524, + "grad_norm": 1616.0087890625, + "learning_rate": 6.974818293855822e-05, + "loss": 3.1055, + "step": 1378 + }, + { + "epoch": 0.38869706151786343, + "grad_norm": 1368.0050048828125, + "learning_rate": 6.970630977374229e-05, + "loss": 2.9365, + "step": 1379 + }, + { + "epoch": 0.3889789303079417, + "grad_norm": 564.0040283203125, + "learning_rate": 6.966442023991484e-05, + "loss": 2.8936, + "step": 1380 + }, + { + "epoch": 0.38926079909801986, + "grad_norm": 458.0035095214844, + "learning_rate": 6.962251437187135e-05, + "loss": 2.6843, + "step": 1381 + }, + { + "epoch": 0.3895426678880981, + "grad_norm": 1040.005859375, + "learning_rate": 6.958059220442089e-05, + "loss": 2.5357, + "step": 1382 + }, + { + "epoch": 0.3898245366781763, + "grad_norm": 1712.004638671875, + "learning_rate": 6.953865377238597e-05, + "loss": 2.9704, + "step": 1383 + }, + { + "epoch": 0.39010640546825454, + "grad_norm": 1280.002685546875, + "learning_rate": 6.94966991106027e-05, + "loss": 2.8665, + "step": 1384 + }, + { + "epoch": 0.3903882742583327, + "grad_norm": 1440.0048828125, + "learning_rate": 6.945472825392069e-05, + "loss": 3.0134, + "step": 1385 + }, + { + "epoch": 0.39067014304841097, + "grad_norm": 604.0037841796875, + "learning_rate": 6.941274123720289e-05, + "loss": 2.7813, + "step": 1386 + }, + { + "epoch": 0.3909520118384892, + "grad_norm": 1288.0035400390625, + "learning_rate": 6.93707380953258e-05, + "loss": 2.7813, + "step": 1387 + }, + { + "epoch": 0.3912338806285674, + "grad_norm": 2016.0069580078125, + "learning_rate": 6.932871886317923e-05, + "loss": 3.2591, + "step": 1388 + }, + { + "epoch": 0.39151574941864564, + "grad_norm": 1416.0072021484375, + "learning_rate": 6.928668357566635e-05, + "loss": 3.0583, + "step": 1389 + }, + { + "epoch": 0.39179761820872383, + "grad_norm": 960.0035400390625, + "learning_rate": 6.924463226770376e-05, + "loss": 2.698, + "step": 1390 + }, + { + "epoch": 0.3920794869988021, + "grad_norm": 1376.00537109375, + "learning_rate": 6.920256497422126e-05, + "loss": 3.0801, + "step": 1391 + }, + { + "epoch": 0.39236135578888026, + "grad_norm": 1688.00341796875, + "learning_rate": 6.916048173016198e-05, + "loss": 3.0724, + "step": 1392 + }, + { + "epoch": 0.3926432245789585, + "grad_norm": 1136.003662109375, + "learning_rate": 6.91183825704823e-05, + "loss": 2.3936, + "step": 1393 + }, + { + "epoch": 0.3929250933690367, + "grad_norm": 808.0050048828125, + "learning_rate": 6.907626753015183e-05, + "loss": 2.4119, + "step": 1394 + }, + { + "epoch": 0.39320696215911494, + "grad_norm": 1696.0093994140625, + "learning_rate": 6.903413664415333e-05, + "loss": 3.392, + "step": 1395 + }, + { + "epoch": 0.3934888309491931, + "grad_norm": 836.00634765625, + "learning_rate": 6.899198994748274e-05, + "loss": 3.099, + "step": 1396 + }, + { + "epoch": 0.39377069973927137, + "grad_norm": 1768.0069580078125, + "learning_rate": 6.894982747514916e-05, + "loss": 3.1387, + "step": 1397 + }, + { + "epoch": 0.3940525685293496, + "grad_norm": 724.0057373046875, + "learning_rate": 6.890764926217477e-05, + "loss": 2.975, + "step": 1398 + }, + { + "epoch": 0.3943344373194278, + "grad_norm": 1120.00634765625, + "learning_rate": 6.88654553435948e-05, + "loss": 2.5455, + "step": 1399 + }, + { + "epoch": 0.39461630610950604, + "grad_norm": 346.0093688964844, + "learning_rate": 6.88232457544576e-05, + "loss": 2.7726, + "step": 1400 + }, + { + "epoch": 0.39489817489958423, + "grad_norm": 412.00531005859375, + "learning_rate": 6.878102052982447e-05, + "loss": 2.6981, + "step": 1401 + }, + { + "epoch": 0.3951800436896625, + "grad_norm": 1304.0050048828125, + "learning_rate": 6.873877970476971e-05, + "loss": 3.0088, + "step": 1402 + }, + { + "epoch": 0.39546191247974066, + "grad_norm": 1056.004150390625, + "learning_rate": 6.869652331438058e-05, + "loss": 2.7693, + "step": 1403 + }, + { + "epoch": 0.3957437812698189, + "grad_norm": 1280.0037841796875, + "learning_rate": 6.86542513937573e-05, + "loss": 2.7693, + "step": 1404 + }, + { + "epoch": 0.3960256500598971, + "grad_norm": 924.005615234375, + "learning_rate": 6.861196397801297e-05, + "loss": 2.7813, + "step": 1405 + }, + { + "epoch": 0.39630751884997534, + "grad_norm": 1312.010498046875, + "learning_rate": 6.856966110227352e-05, + "loss": 2.9401, + "step": 1406 + }, + { + "epoch": 0.3965893876400536, + "grad_norm": 1424.0035400390625, + "learning_rate": 6.852734280167781e-05, + "loss": 3.0196, + "step": 1407 + }, + { + "epoch": 0.39687125643013177, + "grad_norm": 1336.00439453125, + "learning_rate": 6.84850091113774e-05, + "loss": 2.9402, + "step": 1408 + }, + { + "epoch": 0.39715312522021, + "grad_norm": 2144.01123046875, + "learning_rate": 6.844266006653675e-05, + "loss": 2.961, + "step": 1409 + }, + { + "epoch": 0.3974349940102882, + "grad_norm": 536.005615234375, + "learning_rate": 6.8400295702333e-05, + "loss": 2.7813, + "step": 1410 + }, + { + "epoch": 0.39771686280036644, + "grad_norm": 158.0082244873047, + "learning_rate": 6.835791605395605e-05, + "loss": 3.0479, + "step": 1411 + }, + { + "epoch": 0.39799873159044463, + "grad_norm": 1208.00439453125, + "learning_rate": 6.831552115660845e-05, + "loss": 2.7813, + "step": 1412 + }, + { + "epoch": 0.3982806003805229, + "grad_norm": 696.0050659179688, + "learning_rate": 6.827311104550546e-05, + "loss": 2.961, + "step": 1413 + }, + { + "epoch": 0.39856246917060106, + "grad_norm": 916.0042724609375, + "learning_rate": 6.823068575587495e-05, + "loss": 3.0088, + "step": 1414 + }, + { + "epoch": 0.3988443379606793, + "grad_norm": 2000.0068359375, + "learning_rate": 6.818824532295744e-05, + "loss": 3.7107, + "step": 1415 + }, + { + "epoch": 0.3991262067507575, + "grad_norm": 1312.0096435546875, + "learning_rate": 6.8145789782006e-05, + "loss": 3.4665, + "step": 1416 + }, + { + "epoch": 0.39940807554083574, + "grad_norm": 1856.0133056640625, + "learning_rate": 6.810331916828622e-05, + "loss": 3.5717, + "step": 1417 + }, + { + "epoch": 0.399689944330914, + "grad_norm": 1544.005859375, + "learning_rate": 6.806083351707628e-05, + "loss": 2.5401, + "step": 1418 + }, + { + "epoch": 0.39997181312099217, + "grad_norm": 1264.0087890625, + "learning_rate": 6.801833286366676e-05, + "loss": 3.1833, + "step": 1419 + }, + { + "epoch": 0.4002536819110704, + "grad_norm": 1296.0089111328125, + "learning_rate": 6.79758172433608e-05, + "loss": 3.085, + "step": 1420 + }, + { + "epoch": 0.4005355507011486, + "grad_norm": 2080.007080078125, + "learning_rate": 6.793328669147393e-05, + "loss": 3.4271, + "step": 1421 + }, + { + "epoch": 0.40081741949122685, + "grad_norm": 604.0047607421875, + "learning_rate": 6.789074124333404e-05, + "loss": 3.3874, + "step": 1422 + }, + { + "epoch": 0.40109928828130503, + "grad_norm": 804.0051879882812, + "learning_rate": 6.784818093428144e-05, + "loss": 3.0469, + "step": 1423 + }, + { + "epoch": 0.4013811570713833, + "grad_norm": 2240.00439453125, + "learning_rate": 6.780560579966878e-05, + "loss": 3.3708, + "step": 1424 + }, + { + "epoch": 0.40166302586146146, + "grad_norm": 1416.00830078125, + "learning_rate": 6.776301587486102e-05, + "loss": 3.2853, + "step": 1425 + }, + { + "epoch": 0.4019448946515397, + "grad_norm": 548.0037841796875, + "learning_rate": 6.772041119523545e-05, + "loss": 2.8708, + "step": 1426 + }, + { + "epoch": 0.40222676344161795, + "grad_norm": 1408.0062255859375, + "learning_rate": 6.767779179618149e-05, + "loss": 2.9514, + "step": 1427 + }, + { + "epoch": 0.40250863223169614, + "grad_norm": 1496.004150390625, + "learning_rate": 6.763515771310093e-05, + "loss": 3.6697, + "step": 1428 + }, + { + "epoch": 0.4027905010217744, + "grad_norm": 996.0072021484375, + "learning_rate": 6.759250898140768e-05, + "loss": 3.3542, + "step": 1429 + }, + { + "epoch": 0.40307236981185257, + "grad_norm": 1720.005615234375, + "learning_rate": 6.75498456365278e-05, + "loss": 2.918, + "step": 1430 + }, + { + "epoch": 0.4033542386019308, + "grad_norm": 2768.00830078125, + "learning_rate": 6.75071677138996e-05, + "loss": 3.4785, + "step": 1431 + }, + { + "epoch": 0.403636107392009, + "grad_norm": 1376.0050048828125, + "learning_rate": 6.746447524897334e-05, + "loss": 2.9216, + "step": 1432 + }, + { + "epoch": 0.40391797618208725, + "grad_norm": 688.0044555664062, + "learning_rate": 6.742176827721149e-05, + "loss": 2.9034, + "step": 1433 + }, + { + "epoch": 0.40419984497216543, + "grad_norm": 1040.005126953125, + "learning_rate": 6.73790468340885e-05, + "loss": 2.8884, + "step": 1434 + }, + { + "epoch": 0.4044817137622437, + "grad_norm": 1144.0040283203125, + "learning_rate": 6.733631095509088e-05, + "loss": 2.7813, + "step": 1435 + }, + { + "epoch": 0.4047635825523219, + "grad_norm": 928.003173828125, + "learning_rate": 6.72935606757171e-05, + "loss": 2.7803, + "step": 1436 + }, + { + "epoch": 0.4050454513424001, + "grad_norm": 1096.0068359375, + "learning_rate": 6.725079603147759e-05, + "loss": 2.7774, + "step": 1437 + }, + { + "epoch": 0.40532732013247835, + "grad_norm": 860.0070190429688, + "learning_rate": 6.720801705789474e-05, + "loss": 2.7803, + "step": 1438 + }, + { + "epoch": 0.40560918892255654, + "grad_norm": 796.005615234375, + "learning_rate": 6.716522379050282e-05, + "loss": 2.7803, + "step": 1439 + }, + { + "epoch": 0.4058910577126348, + "grad_norm": 880.0055541992188, + "learning_rate": 6.712241626484803e-05, + "loss": 2.9379, + "step": 1440 + }, + { + "epoch": 0.40617292650271297, + "grad_norm": 1400.004638671875, + "learning_rate": 6.70795945164883e-05, + "loss": 2.6469, + "step": 1441 + }, + { + "epoch": 0.4064547952927912, + "grad_norm": 776.0054321289062, + "learning_rate": 6.703675858099347e-05, + "loss": 2.9173, + "step": 1442 + }, + { + "epoch": 0.4067366640828694, + "grad_norm": 748.00439453125, + "learning_rate": 6.699390849394515e-05, + "loss": 3.0762, + "step": 1443 + }, + { + "epoch": 0.40701853287294765, + "grad_norm": 744.0053100585938, + "learning_rate": 6.695104429093664e-05, + "loss": 2.7439, + "step": 1444 + }, + { + "epoch": 0.40730040166302584, + "grad_norm": 936.00537109375, + "learning_rate": 6.690816600757308e-05, + "loss": 2.8946, + "step": 1445 + }, + { + "epoch": 0.4075822704531041, + "grad_norm": 788.0029907226562, + "learning_rate": 6.68652736794712e-05, + "loss": 2.9356, + "step": 1446 + }, + { + "epoch": 0.4078641392431823, + "grad_norm": 1280.0047607421875, + "learning_rate": 6.682236734225944e-05, + "loss": 2.7461, + "step": 1447 + }, + { + "epoch": 0.4081460080332605, + "grad_norm": 1136.00341796875, + "learning_rate": 6.677944703157786e-05, + "loss": 2.8565, + "step": 1448 + }, + { + "epoch": 0.40842787682333875, + "grad_norm": 1248.0028076171875, + "learning_rate": 6.673651278307817e-05, + "loss": 2.6166, + "step": 1449 + }, + { + "epoch": 0.40870974561341694, + "grad_norm": 716.0054931640625, + "learning_rate": 6.669356463242363e-05, + "loss": 3.0261, + "step": 1450 + }, + { + "epoch": 0.4089916144034952, + "grad_norm": 712.005859375, + "learning_rate": 6.6650602615289e-05, + "loss": 3.1688, + "step": 1451 + }, + { + "epoch": 0.4092734831935734, + "grad_norm": 1128.0045166015625, + "learning_rate": 6.660762676736066e-05, + "loss": 2.873, + "step": 1452 + }, + { + "epoch": 0.4095553519836516, + "grad_norm": 1536.00927734375, + "learning_rate": 6.65646371243364e-05, + "loss": 3.4678, + "step": 1453 + }, + { + "epoch": 0.4098372207737298, + "grad_norm": 1112.0052490234375, + "learning_rate": 6.652163372192546e-05, + "loss": 3.3927, + "step": 1454 + }, + { + "epoch": 0.41011908956380805, + "grad_norm": 836.0092163085938, + "learning_rate": 6.64786165958486e-05, + "loss": 2.2756, + "step": 1455 + }, + { + "epoch": 0.4104009583538863, + "grad_norm": 892.0061645507812, + "learning_rate": 6.643558578183787e-05, + "loss": 2.905, + "step": 1456 + }, + { + "epoch": 0.4106828271439645, + "grad_norm": 482.0074462890625, + "learning_rate": 6.639254131563678e-05, + "loss": 3.1798, + "step": 1457 + }, + { + "epoch": 0.4109646959340427, + "grad_norm": 1128.005615234375, + "learning_rate": 6.63494832330001e-05, + "loss": 3.1391, + "step": 1458 + }, + { + "epoch": 0.4112465647241209, + "grad_norm": 944.0072021484375, + "learning_rate": 6.630641156969397e-05, + "loss": 3.1094, + "step": 1459 + }, + { + "epoch": 0.41152843351419915, + "grad_norm": 1616.00537109375, + "learning_rate": 6.62633263614958e-05, + "loss": 3.0906, + "step": 1460 + }, + { + "epoch": 0.41181030230427734, + "grad_norm": 480.0115661621094, + "learning_rate": 6.622022764419423e-05, + "loss": 2.9679, + "step": 1461 + }, + { + "epoch": 0.4120921710943556, + "grad_norm": 820.0066528320312, + "learning_rate": 6.617711545358914e-05, + "loss": 3.3624, + "step": 1462 + }, + { + "epoch": 0.4123740398844338, + "grad_norm": 370.0113525390625, + "learning_rate": 6.613398982549159e-05, + "loss": 2.8922, + "step": 1463 + }, + { + "epoch": 0.412655908674512, + "grad_norm": 1184.005126953125, + "learning_rate": 6.60908507957238e-05, + "loss": 2.7491, + "step": 1464 + }, + { + "epoch": 0.4129377774645902, + "grad_norm": 1544.0125732421875, + "learning_rate": 6.604769840011914e-05, + "loss": 2.647, + "step": 1465 + }, + { + "epoch": 0.41321964625466845, + "grad_norm": 816.0082397460938, + "learning_rate": 6.600453267452207e-05, + "loss": 3.1967, + "step": 1466 + }, + { + "epoch": 0.4135015150447467, + "grad_norm": 1160.0096435546875, + "learning_rate": 6.596135365478814e-05, + "loss": 2.7227, + "step": 1467 + }, + { + "epoch": 0.4137833838348249, + "grad_norm": 900.00634765625, + "learning_rate": 6.591816137678389e-05, + "loss": 2.7524, + "step": 1468 + }, + { + "epoch": 0.4140652526249031, + "grad_norm": 1976.0159912109375, + "learning_rate": 6.587495587638694e-05, + "loss": 3.2222, + "step": 1469 + }, + { + "epoch": 0.4143471214149813, + "grad_norm": 1656.0142822265625, + "learning_rate": 6.583173718948583e-05, + "loss": 3.2531, + "step": 1470 + }, + { + "epoch": 0.41462899020505956, + "grad_norm": 1192.007568359375, + "learning_rate": 6.578850535198014e-05, + "loss": 3.0915, + "step": 1471 + }, + { + "epoch": 0.41491085899513774, + "grad_norm": 1232.0120849609375, + "learning_rate": 6.574526039978027e-05, + "loss": 2.7775, + "step": 1472 + }, + { + "epoch": 0.415192727785216, + "grad_norm": 1600.0135498046875, + "learning_rate": 6.57020023688076e-05, + "loss": 2.896, + "step": 1473 + }, + { + "epoch": 0.4154745965752942, + "grad_norm": 2352.006103515625, + "learning_rate": 6.565873129499431e-05, + "loss": 2.986, + "step": 1474 + }, + { + "epoch": 0.4157564653653724, + "grad_norm": 832.0093383789062, + "learning_rate": 6.561544721428343e-05, + "loss": 3.0162, + "step": 1475 + }, + { + "epoch": 0.41603833415545066, + "grad_norm": 274.0166015625, + "learning_rate": 6.557215016262885e-05, + "loss": 2.9571, + "step": 1476 + }, + { + "epoch": 0.41632020294552885, + "grad_norm": 696.0079345703125, + "learning_rate": 6.552884017599517e-05, + "loss": 3.4304, + "step": 1477 + }, + { + "epoch": 0.4166020717356071, + "grad_norm": 426.01422119140625, + "learning_rate": 6.548551729035774e-05, + "loss": 3.4187, + "step": 1478 + }, + { + "epoch": 0.4168839405256853, + "grad_norm": 1224.0052490234375, + "learning_rate": 6.544218154170262e-05, + "loss": 2.5284, + "step": 1479 + }, + { + "epoch": 0.4171658093157635, + "grad_norm": 372.00958251953125, + "learning_rate": 6.539883296602664e-05, + "loss": 2.7373, + "step": 1480 + }, + { + "epoch": 0.4174476781058417, + "grad_norm": 1808.0106201171875, + "learning_rate": 6.535547159933716e-05, + "loss": 2.9278, + "step": 1481 + }, + { + "epoch": 0.41772954689591996, + "grad_norm": 2304.016845703125, + "learning_rate": 6.531209747765225e-05, + "loss": 3.1986, + "step": 1482 + }, + { + "epoch": 0.41801141568599814, + "grad_norm": 378.0107116699219, + "learning_rate": 6.526871063700055e-05, + "loss": 3.2771, + "step": 1483 + }, + { + "epoch": 0.4182932844760764, + "grad_norm": 920.00439453125, + "learning_rate": 6.522531111342124e-05, + "loss": 3.1293, + "step": 1484 + }, + { + "epoch": 0.41857515326615463, + "grad_norm": 1928.01318359375, + "learning_rate": 6.518189894296406e-05, + "loss": 2.8581, + "step": 1485 + }, + { + "epoch": 0.4188570220562328, + "grad_norm": 732.0030517578125, + "learning_rate": 6.51384741616893e-05, + "loss": 2.7699, + "step": 1486 + }, + { + "epoch": 0.41913889084631106, + "grad_norm": 860.0064086914062, + "learning_rate": 6.50950368056676e-05, + "loss": 2.9222, + "step": 1487 + }, + { + "epoch": 0.41942075963638925, + "grad_norm": 510.0033874511719, + "learning_rate": 6.50515869109802e-05, + "loss": 2.627, + "step": 1488 + }, + { + "epoch": 0.4197026284264675, + "grad_norm": 572.0032958984375, + "learning_rate": 6.500812451371862e-05, + "loss": 2.7692, + "step": 1489 + }, + { + "epoch": 0.4199844972165457, + "grad_norm": 720.0057983398438, + "learning_rate": 6.496464964998485e-05, + "loss": 2.7725, + "step": 1490 + }, + { + "epoch": 0.4202663660066239, + "grad_norm": 1400.0074462890625, + "learning_rate": 6.49211623558912e-05, + "loss": 2.7702, + "step": 1491 + }, + { + "epoch": 0.4205482347967021, + "grad_norm": 1488.009765625, + "learning_rate": 6.487766266756033e-05, + "loss": 2.7702, + "step": 1492 + }, + { + "epoch": 0.42083010358678036, + "grad_norm": 1024.00732421875, + "learning_rate": 6.483415062112517e-05, + "loss": 2.7709, + "step": 1493 + }, + { + "epoch": 0.42111197237685855, + "grad_norm": 1280.003173828125, + "learning_rate": 6.479062625272892e-05, + "loss": 2.6996, + "step": 1494 + }, + { + "epoch": 0.4213938411669368, + "grad_norm": 756.0031127929688, + "learning_rate": 6.474708959852503e-05, + "loss": 2.7601, + "step": 1495 + }, + { + "epoch": 0.42167570995701503, + "grad_norm": 1648.009033203125, + "learning_rate": 6.470354069467714e-05, + "loss": 2.7129, + "step": 1496 + }, + { + "epoch": 0.4219575787470932, + "grad_norm": 928.0037841796875, + "learning_rate": 6.465997957735908e-05, + "loss": 2.8184, + "step": 1497 + }, + { + "epoch": 0.42223944753717146, + "grad_norm": 1440.008544921875, + "learning_rate": 6.461640628275479e-05, + "loss": 3.3565, + "step": 1498 + }, + { + "epoch": 0.42252131632724965, + "grad_norm": 1416.00927734375, + "learning_rate": 6.457282084705837e-05, + "loss": 3.2989, + "step": 1499 + }, + { + "epoch": 0.4228031851173279, + "grad_norm": 1856.0035400390625, + "learning_rate": 6.452922330647397e-05, + "loss": 2.5904, + "step": 1500 + }, + { + "epoch": 0.4230850539074061, + "grad_norm": 404.1029357910156, + "learning_rate": 6.448561369721582e-05, + "loss": 2.7519, + "step": 1501 + }, + { + "epoch": 0.4233669226974843, + "grad_norm": 744.0036010742188, + "learning_rate": 6.444199205550819e-05, + "loss": 3.292, + "step": 1502 + }, + { + "epoch": 0.4236487914875625, + "grad_norm": 446.0061340332031, + "learning_rate": 6.439835841758529e-05, + "loss": 2.9219, + "step": 1503 + }, + { + "epoch": 0.42393066027764076, + "grad_norm": 1792.0050048828125, + "learning_rate": 6.435471281969132e-05, + "loss": 3.4265, + "step": 1504 + }, + { + "epoch": 0.424212529067719, + "grad_norm": 1368.0086669921875, + "learning_rate": 6.431105529808043e-05, + "loss": 3.1615, + "step": 1505 + }, + { + "epoch": 0.4244943978577972, + "grad_norm": 608.0035400390625, + "learning_rate": 6.426738588901668e-05, + "loss": 2.9793, + "step": 1506 + }, + { + "epoch": 0.42477626664787543, + "grad_norm": 1168.00341796875, + "learning_rate": 6.422370462877396e-05, + "loss": 2.9131, + "step": 1507 + }, + { + "epoch": 0.4250581354379536, + "grad_norm": 414.0047607421875, + "learning_rate": 6.418001155363604e-05, + "loss": 3.0332, + "step": 1508 + }, + { + "epoch": 0.42534000422803186, + "grad_norm": 264.0033264160156, + "learning_rate": 6.413630669989652e-05, + "loss": 3.4102, + "step": 1509 + }, + { + "epoch": 0.42562187301811005, + "grad_norm": 888.0072631835938, + "learning_rate": 6.40925901038587e-05, + "loss": 3.1091, + "step": 1510 + }, + { + "epoch": 0.4259037418081883, + "grad_norm": 1136.005615234375, + "learning_rate": 6.404886180183575e-05, + "loss": 2.7923, + "step": 1511 + }, + { + "epoch": 0.4261856105982665, + "grad_norm": 784.0022583007812, + "learning_rate": 6.400512183015051e-05, + "loss": 2.7809, + "step": 1512 + }, + { + "epoch": 0.42646747938834473, + "grad_norm": 768.0045776367188, + "learning_rate": 6.396137022513545e-05, + "loss": 2.7783, + "step": 1513 + }, + { + "epoch": 0.42674934817842297, + "grad_norm": 1304.0050048828125, + "learning_rate": 6.391760702313283e-05, + "loss": 2.693, + "step": 1514 + }, + { + "epoch": 0.42703121696850116, + "grad_norm": 1456.0091552734375, + "learning_rate": 6.387383226049445e-05, + "loss": 3.1953, + "step": 1515 + }, + { + "epoch": 0.4273130857585794, + "grad_norm": 1224.0203857421875, + "learning_rate": 6.383004597358173e-05, + "loss": 3.0326, + "step": 1516 + }, + { + "epoch": 0.4275949545486576, + "grad_norm": 1040.006103515625, + "learning_rate": 6.378624819876569e-05, + "loss": 2.7664, + "step": 1517 + }, + { + "epoch": 0.42787682333873583, + "grad_norm": 215.00405883789062, + "learning_rate": 6.374243897242685e-05, + "loss": 3.0756, + "step": 1518 + }, + { + "epoch": 0.428158692128814, + "grad_norm": 720.0054321289062, + "learning_rate": 6.369861833095531e-05, + "loss": 2.9427, + "step": 1519 + }, + { + "epoch": 0.42844056091889227, + "grad_norm": 920.0040283203125, + "learning_rate": 6.365478631075056e-05, + "loss": 2.7197, + "step": 1520 + }, + { + "epoch": 0.42872242970897045, + "grad_norm": 1616.005859375, + "learning_rate": 6.36109429482216e-05, + "loss": 2.9922, + "step": 1521 + }, + { + "epoch": 0.4290042984990487, + "grad_norm": 912.0032348632812, + "learning_rate": 6.356708827978688e-05, + "loss": 2.9766, + "step": 1522 + }, + { + "epoch": 0.4292861672891269, + "grad_norm": 1480.004150390625, + "learning_rate": 6.352322234187417e-05, + "loss": 2.9873, + "step": 1523 + }, + { + "epoch": 0.42956803607920513, + "grad_norm": 1816.00634765625, + "learning_rate": 6.347934517092065e-05, + "loss": 2.8906, + "step": 1524 + }, + { + "epoch": 0.4298499048692834, + "grad_norm": 1344.0030517578125, + "learning_rate": 6.343545680337279e-05, + "loss": 3.3587, + "step": 1525 + }, + { + "epoch": 0.43013177365936156, + "grad_norm": 1904.0032958984375, + "learning_rate": 6.33915572756864e-05, + "loss": 2.7676, + "step": 1526 + }, + { + "epoch": 0.4304136424494398, + "grad_norm": 944.0039672851562, + "learning_rate": 6.334764662432654e-05, + "loss": 2.7461, + "step": 1527 + }, + { + "epoch": 0.430695511239518, + "grad_norm": 776.0023193359375, + "learning_rate": 6.330372488576754e-05, + "loss": 3.0521, + "step": 1528 + }, + { + "epoch": 0.43097738002959624, + "grad_norm": 1072.010986328125, + "learning_rate": 6.325979209649289e-05, + "loss": 3.6527, + "step": 1529 + }, + { + "epoch": 0.4312592488196744, + "grad_norm": 2160.01220703125, + "learning_rate": 6.321584829299528e-05, + "loss": 3.6973, + "step": 1530 + }, + { + "epoch": 0.43154111760975267, + "grad_norm": 2080.008056640625, + "learning_rate": 6.317189351177657e-05, + "loss": 2.777, + "step": 1531 + }, + { + "epoch": 0.43182298639983085, + "grad_norm": 1864.00537109375, + "learning_rate": 6.31279277893477e-05, + "loss": 3.1908, + "step": 1532 + }, + { + "epoch": 0.4321048551899091, + "grad_norm": 1784.0084228515625, + "learning_rate": 6.308395116222876e-05, + "loss": 3.0079, + "step": 1533 + }, + { + "epoch": 0.43238672397998734, + "grad_norm": 2208.00634765625, + "learning_rate": 6.303996366694881e-05, + "loss": 2.5752, + "step": 1534 + }, + { + "epoch": 0.43266859277006553, + "grad_norm": 980.0032958984375, + "learning_rate": 6.299596534004605e-05, + "loss": 2.9401, + "step": 1535 + }, + { + "epoch": 0.4329504615601438, + "grad_norm": 1400.0068359375, + "learning_rate": 6.295195621806754e-05, + "loss": 2.9401, + "step": 1536 + }, + { + "epoch": 0.43323233035022196, + "grad_norm": 2352.0087890625, + "learning_rate": 6.29079363375694e-05, + "loss": 2.7813, + "step": 1537 + }, + { + "epoch": 0.4335141991403002, + "grad_norm": 796.0025024414062, + "learning_rate": 6.286390573511669e-05, + "loss": 2.7813, + "step": 1538 + }, + { + "epoch": 0.4337960679303784, + "grad_norm": 498.0050964355469, + "learning_rate": 6.281986444728334e-05, + "loss": 3.072, + "step": 1539 + }, + { + "epoch": 0.43407793672045664, + "grad_norm": 564.0042724609375, + "learning_rate": 6.277581251065216e-05, + "loss": 2.7396, + "step": 1540 + }, + { + "epoch": 0.4343598055105348, + "grad_norm": 780.0032348632812, + "learning_rate": 6.273174996181482e-05, + "loss": 2.75, + "step": 1541 + }, + { + "epoch": 0.43464167430061307, + "grad_norm": 1048.0040283203125, + "learning_rate": 6.26876768373718e-05, + "loss": 2.7803, + "step": 1542 + }, + { + "epoch": 0.43492354309069126, + "grad_norm": 1400.004638671875, + "learning_rate": 6.264359317393238e-05, + "loss": 3.0154, + "step": 1543 + }, + { + "epoch": 0.4352054118807695, + "grad_norm": 960.0043334960938, + "learning_rate": 6.259949900811453e-05, + "loss": 3.0, + "step": 1544 + }, + { + "epoch": 0.43548728067084774, + "grad_norm": 816.002685546875, + "learning_rate": 6.255539437654505e-05, + "loss": 2.6667, + "step": 1545 + }, + { + "epoch": 0.43576914946092593, + "grad_norm": 556.0043334960938, + "learning_rate": 6.251127931585933e-05, + "loss": 2.9584, + "step": 1546 + }, + { + "epoch": 0.4360510182510042, + "grad_norm": 1920.006591796875, + "learning_rate": 6.246715386270148e-05, + "loss": 2.7396, + "step": 1547 + }, + { + "epoch": 0.43633288704108236, + "grad_norm": 988.0033569335938, + "learning_rate": 6.242301805372424e-05, + "loss": 2.4835, + "step": 1548 + }, + { + "epoch": 0.4366147558311606, + "grad_norm": 1528.0068359375, + "learning_rate": 6.237887192558893e-05, + "loss": 3.0, + "step": 1549 + }, + { + "epoch": 0.4368966246212388, + "grad_norm": 1768.0098876953125, + "learning_rate": 6.233471551496546e-05, + "loss": 2.8689, + "step": 1550 + }, + { + "epoch": 0.43717849341131704, + "grad_norm": 2304.010986328125, + "learning_rate": 6.229054885853227e-05, + "loss": 3.8356, + "step": 1551 + }, + { + "epoch": 0.4374603622013952, + "grad_norm": 1544.0079345703125, + "learning_rate": 6.224637199297633e-05, + "loss": 3.8399, + "step": 1552 + }, + { + "epoch": 0.43774223099147347, + "grad_norm": 2560.014404296875, + "learning_rate": 6.220218495499306e-05, + "loss": 3.5593, + "step": 1553 + }, + { + "epoch": 0.4380240997815517, + "grad_norm": 1376.00830078125, + "learning_rate": 6.215798778128634e-05, + "loss": 2.9727, + "step": 1554 + }, + { + "epoch": 0.4383059685716299, + "grad_norm": 1216.0078125, + "learning_rate": 6.211378050856851e-05, + "loss": 2.8904, + "step": 1555 + }, + { + "epoch": 0.43858783736170814, + "grad_norm": 568.0037231445312, + "learning_rate": 6.20695631735602e-05, + "loss": 2.6751, + "step": 1556 + }, + { + "epoch": 0.43886970615178633, + "grad_norm": 1528.004150390625, + "learning_rate": 6.202533581299051e-05, + "loss": 3.0807, + "step": 1557 + }, + { + "epoch": 0.4391515749418646, + "grad_norm": 780.0027465820312, + "learning_rate": 6.198109846359682e-05, + "loss": 3.0827, + "step": 1558 + }, + { + "epoch": 0.43943344373194276, + "grad_norm": 1600.009033203125, + "learning_rate": 6.193685116212479e-05, + "loss": 3.43, + "step": 1559 + }, + { + "epoch": 0.439715312522021, + "grad_norm": 836.0057373046875, + "learning_rate": 6.189259394532835e-05, + "loss": 3.1182, + "step": 1560 + }, + { + "epoch": 0.4399971813120992, + "grad_norm": 2608.016357421875, + "learning_rate": 6.184832684996972e-05, + "loss": 4.1436, + "step": 1561 + }, + { + "epoch": 0.44027905010217744, + "grad_norm": 928.0044555664062, + "learning_rate": 6.180404991281926e-05, + "loss": 3.3871, + "step": 1562 + }, + { + "epoch": 0.4405609188922557, + "grad_norm": 1640.0057373046875, + "learning_rate": 6.175976317065551e-05, + "loss": 2.1797, + "step": 1563 + }, + { + "epoch": 0.44084278768233387, + "grad_norm": 824.00537109375, + "learning_rate": 6.171546666026521e-05, + "loss": 3.2663, + "step": 1564 + }, + { + "epoch": 0.4411246564724121, + "grad_norm": 2816.00927734375, + "learning_rate": 6.167116041844316e-05, + "loss": 3.7835, + "step": 1565 + }, + { + "epoch": 0.4414065252624903, + "grad_norm": 1936.006591796875, + "learning_rate": 6.162684448199228e-05, + "loss": 3.654, + "step": 1566 + }, + { + "epoch": 0.44168839405256854, + "grad_norm": 844.0027465820312, + "learning_rate": 6.15825188877235e-05, + "loss": 3.044, + "step": 1567 + }, + { + "epoch": 0.44197026284264673, + "grad_norm": 608.003662109375, + "learning_rate": 6.153818367245579e-05, + "loss": 2.8841, + "step": 1568 + }, + { + "epoch": 0.442252131632725, + "grad_norm": 1584.0386962890625, + "learning_rate": 6.149383887301617e-05, + "loss": 3.264, + "step": 1569 + }, + { + "epoch": 0.44253400042280316, + "grad_norm": 1472.0042724609375, + "learning_rate": 6.14494845262395e-05, + "loss": 2.7197, + "step": 1570 + }, + { + "epoch": 0.4428158692128814, + "grad_norm": 632.0027465820312, + "learning_rate": 6.14051206689687e-05, + "loss": 2.9401, + "step": 1571 + }, + { + "epoch": 0.4430977380029596, + "grad_norm": 1056.003173828125, + "learning_rate": 6.136074733805449e-05, + "loss": 2.6387, + "step": 1572 + }, + { + "epoch": 0.44337960679303784, + "grad_norm": 716.0030517578125, + "learning_rate": 6.13163645703555e-05, + "loss": 2.7813, + "step": 1573 + }, + { + "epoch": 0.4436614755831161, + "grad_norm": 840.00537109375, + "learning_rate": 6.127197240273824e-05, + "loss": 2.7396, + "step": 1574 + }, + { + "epoch": 0.44394334437319427, + "grad_norm": 1080.0059814453125, + "learning_rate": 6.122757087207695e-05, + "loss": 3.1797, + "step": 1575 + }, + { + "epoch": 0.4442252131632725, + "grad_norm": 1520.0068359375, + "learning_rate": 6.118316001525368e-05, + "loss": 2.7813, + "step": 1576 + }, + { + "epoch": 0.4445070819533507, + "grad_norm": 1376.0047607421875, + "learning_rate": 6.113873986915823e-05, + "loss": 2.6563, + "step": 1577 + }, + { + "epoch": 0.44478895074342895, + "grad_norm": 988.0045166015625, + "learning_rate": 6.109431047068815e-05, + "loss": 2.7188, + "step": 1578 + }, + { + "epoch": 0.44507081953350713, + "grad_norm": 880.0032958984375, + "learning_rate": 6.104987185674863e-05, + "loss": 2.8379, + "step": 1579 + }, + { + "epoch": 0.4453526883235854, + "grad_norm": 744.0051879882812, + "learning_rate": 6.100542406425249e-05, + "loss": 3.1191, + "step": 1580 + }, + { + "epoch": 0.44563455711366357, + "grad_norm": 1520.005126953125, + "learning_rate": 6.096096713012025e-05, + "loss": 3.1475, + "step": 1581 + }, + { + "epoch": 0.4459164259037418, + "grad_norm": 1896.0076904296875, + "learning_rate": 6.0916501091279945e-05, + "loss": 3.5417, + "step": 1582 + }, + { + "epoch": 0.44619829469382005, + "grad_norm": 924.0316772460938, + "learning_rate": 6.087202598466726e-05, + "loss": 2.8985, + "step": 1583 + }, + { + "epoch": 0.44648016348389824, + "grad_norm": 640.0028686523438, + "learning_rate": 6.082754184722533e-05, + "loss": 2.7276, + "step": 1584 + }, + { + "epoch": 0.4467620322739765, + "grad_norm": 1568.004150390625, + "learning_rate": 6.078304871590484e-05, + "loss": 2.6668, + "step": 1585 + }, + { + "epoch": 0.44704390106405467, + "grad_norm": 1320.005126953125, + "learning_rate": 6.073854662766394e-05, + "loss": 2.7742, + "step": 1586 + }, + { + "epoch": 0.4473257698541329, + "grad_norm": 1120.0072021484375, + "learning_rate": 6.069403561946817e-05, + "loss": 3.0538, + "step": 1587 + }, + { + "epoch": 0.4476076386442111, + "grad_norm": 1056.0040283203125, + "learning_rate": 6.064951572829056e-05, + "loss": 3.1966, + "step": 1588 + }, + { + "epoch": 0.44788950743428935, + "grad_norm": 1232.00634765625, + "learning_rate": 6.060498699111148e-05, + "loss": 2.4518, + "step": 1589 + }, + { + "epoch": 0.44817137622436753, + "grad_norm": 780.004638671875, + "learning_rate": 6.056044944491862e-05, + "loss": 2.6787, + "step": 1590 + }, + { + "epoch": 0.4484532450144458, + "grad_norm": 1800.0130615234375, + "learning_rate": 6.051590312670703e-05, + "loss": 2.7839, + "step": 1591 + }, + { + "epoch": 0.448735113804524, + "grad_norm": 1088.0084228515625, + "learning_rate": 6.047134807347904e-05, + "loss": 2.765, + "step": 1592 + }, + { + "epoch": 0.4490169825946022, + "grad_norm": 796.0092163085938, + "learning_rate": 6.042678432224421e-05, + "loss": 2.9236, + "step": 1593 + }, + { + "epoch": 0.44929885138468045, + "grad_norm": 1544.0118408203125, + "learning_rate": 6.038221191001935e-05, + "loss": 3.5303, + "step": 1594 + }, + { + "epoch": 0.44958072017475864, + "grad_norm": 1200.0098876953125, + "learning_rate": 6.0337630873828454e-05, + "loss": 2.9622, + "step": 1595 + }, + { + "epoch": 0.4498625889648369, + "grad_norm": 1464.007568359375, + "learning_rate": 6.0293041250702676e-05, + "loss": 3.0879, + "step": 1596 + }, + { + "epoch": 0.4501444577549151, + "grad_norm": 1552.0084228515625, + "learning_rate": 6.0248443077680316e-05, + "loss": 3.125, + "step": 1597 + }, + { + "epoch": 0.4504263265449933, + "grad_norm": 462.0064392089844, + "learning_rate": 6.0203836391806765e-05, + "loss": 2.4815, + "step": 1598 + }, + { + "epoch": 0.4507081953350715, + "grad_norm": 1176.00732421875, + "learning_rate": 6.015922123013449e-05, + "loss": 3.2865, + "step": 1599 + }, + { + "epoch": 0.45099006412514975, + "grad_norm": 1680.0050048828125, + "learning_rate": 6.011459762972299e-05, + "loss": 2.9375, + "step": 1600 + }, + { + "epoch": 0.45127193291522794, + "grad_norm": 1752.0107421875, + "learning_rate": 6.006996562763878e-05, + "loss": 3.5641, + "step": 1601 + }, + { + "epoch": 0.4515538017053062, + "grad_norm": 1088.0068359375, + "learning_rate": 6.002532526095536e-05, + "loss": 2.9287, + "step": 1602 + }, + { + "epoch": 0.4518356704953844, + "grad_norm": 1608.00634765625, + "learning_rate": 5.998067656675318e-05, + "loss": 3.1813, + "step": 1603 + }, + { + "epoch": 0.4521175392854626, + "grad_norm": 1136.00390625, + "learning_rate": 5.993601958211957e-05, + "loss": 2.9167, + "step": 1604 + }, + { + "epoch": 0.45239940807554085, + "grad_norm": 1760.007080078125, + "learning_rate": 5.989135434414882e-05, + "loss": 2.5508, + "step": 1605 + }, + { + "epoch": 0.45268127686561904, + "grad_norm": 1048.0234375, + "learning_rate": 5.9846680889941986e-05, + "loss": 2.9528, + "step": 1606 + }, + { + "epoch": 0.4529631456556973, + "grad_norm": 1544.006103515625, + "learning_rate": 5.9801999256607024e-05, + "loss": 2.7813, + "step": 1607 + }, + { + "epoch": 0.4532450144457755, + "grad_norm": 1120.0059814453125, + "learning_rate": 5.975730948125864e-05, + "loss": 3.2236, + "step": 1608 + }, + { + "epoch": 0.4535268832358537, + "grad_norm": 952.0042114257812, + "learning_rate": 5.971261160101832e-05, + "loss": 3.0915, + "step": 1609 + }, + { + "epoch": 0.4538087520259319, + "grad_norm": 1176.0048828125, + "learning_rate": 5.966790565301429e-05, + "loss": 2.9363, + "step": 1610 + }, + { + "epoch": 0.45409062081601015, + "grad_norm": 424.0035705566406, + "learning_rate": 5.962319167438145e-05, + "loss": 2.7178, + "step": 1611 + }, + { + "epoch": 0.4543724896060884, + "grad_norm": 548.0037231445312, + "learning_rate": 5.95784697022614e-05, + "loss": 2.4112, + "step": 1612 + }, + { + "epoch": 0.4546543583961666, + "grad_norm": 502.00341796875, + "learning_rate": 5.9533739773802343e-05, + "loss": 3.0241, + "step": 1613 + }, + { + "epoch": 0.4549362271862448, + "grad_norm": 1576.0047607421875, + "learning_rate": 5.948900192615916e-05, + "loss": 2.8563, + "step": 1614 + }, + { + "epoch": 0.455218095976323, + "grad_norm": 1120.0037841796875, + "learning_rate": 5.944425619649323e-05, + "loss": 3.8008, + "step": 1615 + }, + { + "epoch": 0.45549996476640126, + "grad_norm": 1840.005126953125, + "learning_rate": 5.939950262197254e-05, + "loss": 3.1886, + "step": 1616 + }, + { + "epoch": 0.45578183355647944, + "grad_norm": 2192.005859375, + "learning_rate": 5.935474123977153e-05, + "loss": 4.0329, + "step": 1617 + }, + { + "epoch": 0.4560637023465577, + "grad_norm": 748.0054321289062, + "learning_rate": 5.9309972087071195e-05, + "loss": 2.6205, + "step": 1618 + }, + { + "epoch": 0.4563455711366359, + "grad_norm": 1800.0050048828125, + "learning_rate": 5.926519520105895e-05, + "loss": 3.6198, + "step": 1619 + }, + { + "epoch": 0.4566274399267141, + "grad_norm": 1528.009033203125, + "learning_rate": 5.922041061892862e-05, + "loss": 3.1091, + "step": 1620 + }, + { + "epoch": 0.4569093087167923, + "grad_norm": 796.00390625, + "learning_rate": 5.917561837788046e-05, + "loss": 2.7718, + "step": 1621 + }, + { + "epoch": 0.45719117750687055, + "grad_norm": 760.0050048828125, + "learning_rate": 5.913081851512104e-05, + "loss": 2.6257, + "step": 1622 + }, + { + "epoch": 0.4574730462969488, + "grad_norm": 532.0021362304688, + "learning_rate": 5.9086011067863324e-05, + "loss": 2.7875, + "step": 1623 + }, + { + "epoch": 0.457754915087027, + "grad_norm": 1640.0064697265625, + "learning_rate": 5.904119607332651e-05, + "loss": 3.1546, + "step": 1624 + }, + { + "epoch": 0.4580367838771052, + "grad_norm": 1568.0032958984375, + "learning_rate": 5.8996373568736094e-05, + "loss": 3.0505, + "step": 1625 + }, + { + "epoch": 0.4583186526671834, + "grad_norm": 2288.003662109375, + "learning_rate": 5.8951543591323835e-05, + "loss": 3.7988, + "step": 1626 + }, + { + "epoch": 0.45860052145726166, + "grad_norm": 1088.0029296875, + "learning_rate": 5.890670617832764e-05, + "loss": 2.5817, + "step": 1627 + }, + { + "epoch": 0.45888239024733984, + "grad_norm": 396.0050354003906, + "learning_rate": 5.8861861366991654e-05, + "loss": 3.1458, + "step": 1628 + }, + { + "epoch": 0.4591642590374181, + "grad_norm": 1004.007080078125, + "learning_rate": 5.881700919456614e-05, + "loss": 3.3135, + "step": 1629 + }, + { + "epoch": 0.4594461278274963, + "grad_norm": 1944.0096435546875, + "learning_rate": 5.877214969830745e-05, + "loss": 3.1254, + "step": 1630 + }, + { + "epoch": 0.4597279966175745, + "grad_norm": 1224.0062255859375, + "learning_rate": 5.872728291547809e-05, + "loss": 2.7117, + "step": 1631 + }, + { + "epoch": 0.46000986540765276, + "grad_norm": 696.0029907226562, + "learning_rate": 5.868240888334653e-05, + "loss": 2.6104, + "step": 1632 + }, + { + "epoch": 0.46029173419773095, + "grad_norm": 1192.00390625, + "learning_rate": 5.8637527639187314e-05, + "loss": 3.1902, + "step": 1633 + }, + { + "epoch": 0.4605736029878092, + "grad_norm": 1432.0064697265625, + "learning_rate": 5.8592639220281e-05, + "loss": 2.5981, + "step": 1634 + }, + { + "epoch": 0.4608554717778874, + "grad_norm": 1064.003662109375, + "learning_rate": 5.854774366391403e-05, + "loss": 2.302, + "step": 1635 + }, + { + "epoch": 0.4611373405679656, + "grad_norm": 1368.0045166015625, + "learning_rate": 5.8502841007378874e-05, + "loss": 3.3594, + "step": 1636 + }, + { + "epoch": 0.4614192093580438, + "grad_norm": 312.0057373046875, + "learning_rate": 5.845793128797379e-05, + "loss": 2.7857, + "step": 1637 + }, + { + "epoch": 0.46170107814812206, + "grad_norm": 1400.007568359375, + "learning_rate": 5.841301454300296e-05, + "loss": 3.2181, + "step": 1638 + }, + { + "epoch": 0.46198294693820025, + "grad_norm": 544.0084838867188, + "learning_rate": 5.836809080977644e-05, + "loss": 2.8969, + "step": 1639 + }, + { + "epoch": 0.4622648157282785, + "grad_norm": 1808.012939453125, + "learning_rate": 5.8323160125610034e-05, + "loss": 2.8825, + "step": 1640 + }, + { + "epoch": 0.46254668451835673, + "grad_norm": 924.0081787109375, + "learning_rate": 5.827822252782533e-05, + "loss": 3.3457, + "step": 1641 + }, + { + "epoch": 0.4628285533084349, + "grad_norm": 1048.0111083984375, + "learning_rate": 5.823327805374965e-05, + "loss": 3.0638, + "step": 1642 + }, + { + "epoch": 0.46311042209851316, + "grad_norm": 1384.0089111328125, + "learning_rate": 5.818832674071606e-05, + "loss": 3.3067, + "step": 1643 + }, + { + "epoch": 0.46339229088859135, + "grad_norm": 1072.0107421875, + "learning_rate": 5.814336862606329e-05, + "loss": 2.767, + "step": 1644 + }, + { + "epoch": 0.4636741596786696, + "grad_norm": 512.013916015625, + "learning_rate": 5.809840374713571e-05, + "loss": 2.9551, + "step": 1645 + }, + { + "epoch": 0.4639560284687478, + "grad_norm": 1128.01025390625, + "learning_rate": 5.805343214128332e-05, + "loss": 3.3887, + "step": 1646 + }, + { + "epoch": 0.464237897258826, + "grad_norm": 564.0091552734375, + "learning_rate": 5.800845384586172e-05, + "loss": 2.3636, + "step": 1647 + }, + { + "epoch": 0.4645197660489042, + "grad_norm": 1704.0150146484375, + "learning_rate": 5.7963468898232024e-05, + "loss": 2.8994, + "step": 1648 + }, + { + "epoch": 0.46480163483898246, + "grad_norm": 580.0107421875, + "learning_rate": 5.7918477335760914e-05, + "loss": 2.7368, + "step": 1649 + }, + { + "epoch": 0.46508350362906065, + "grad_norm": 1384.0081787109375, + "learning_rate": 5.787347919582057e-05, + "loss": 2.813, + "step": 1650 + }, + { + "epoch": 0.4653653724191389, + "grad_norm": 1272.007080078125, + "learning_rate": 5.7828474515788576e-05, + "loss": 2.9212, + "step": 1651 + }, + { + "epoch": 0.46564724120921713, + "grad_norm": 928.0079956054688, + "learning_rate": 5.778346333304804e-05, + "loss": 3.2315, + "step": 1652 + }, + { + "epoch": 0.4659291099992953, + "grad_norm": 1352.0098876953125, + "learning_rate": 5.773844568498737e-05, + "loss": 2.7461, + "step": 1653 + }, + { + "epoch": 0.46621097878937356, + "grad_norm": 836.0028686523438, + "learning_rate": 5.769342160900043e-05, + "loss": 2.7468, + "step": 1654 + }, + { + "epoch": 0.46649284757945175, + "grad_norm": 372.0050354003906, + "learning_rate": 5.764839114248639e-05, + "loss": 2.6338, + "step": 1655 + }, + { + "epoch": 0.46677471636953, + "grad_norm": 784.0036010742188, + "learning_rate": 5.760335432284971e-05, + "loss": 2.8858, + "step": 1656 + }, + { + "epoch": 0.4670565851596082, + "grad_norm": 976.0025024414062, + "learning_rate": 5.755831118750016e-05, + "loss": 2.7051, + "step": 1657 + }, + { + "epoch": 0.4673384539496864, + "grad_norm": 1544.010498046875, + "learning_rate": 5.751326177385272e-05, + "loss": 3.625, + "step": 1658 + }, + { + "epoch": 0.4676203227397646, + "grad_norm": 1776.0078125, + "learning_rate": 5.746820611932764e-05, + "loss": 3.8184, + "step": 1659 + }, + { + "epoch": 0.46790219152984286, + "grad_norm": 1400.0093994140625, + "learning_rate": 5.742314426135029e-05, + "loss": 3.2595, + "step": 1660 + }, + { + "epoch": 0.4681840603199211, + "grad_norm": 1248.009033203125, + "learning_rate": 5.737807623735124e-05, + "loss": 2.8989, + "step": 1661 + }, + { + "epoch": 0.4684659291099993, + "grad_norm": 1568.0072021484375, + "learning_rate": 5.7333002084766165e-05, + "loss": 2.5501, + "step": 1662 + }, + { + "epoch": 0.46874779790007753, + "grad_norm": 704.0028686523438, + "learning_rate": 5.72879218410358e-05, + "loss": 3.0715, + "step": 1663 + }, + { + "epoch": 0.4690296666901557, + "grad_norm": 1528.00634765625, + "learning_rate": 5.724283554360601e-05, + "loss": 2.7953, + "step": 1664 + }, + { + "epoch": 0.46931153548023397, + "grad_norm": 1040.002685546875, + "learning_rate": 5.719774322992765e-05, + "loss": 2.833, + "step": 1665 + }, + { + "epoch": 0.46959340427031215, + "grad_norm": 462.03472900390625, + "learning_rate": 5.7152644937456526e-05, + "loss": 3.3304, + "step": 1666 + }, + { + "epoch": 0.4698752730603904, + "grad_norm": 660.0042114257812, + "learning_rate": 5.7107540703653496e-05, + "loss": 3.0515, + "step": 1667 + }, + { + "epoch": 0.4701571418504686, + "grad_norm": 1056.006103515625, + "learning_rate": 5.7062430565984284e-05, + "loss": 3.0904, + "step": 1668 + }, + { + "epoch": 0.47043901064054683, + "grad_norm": 2000.0120849609375, + "learning_rate": 5.7017314561919576e-05, + "loss": 3.0606, + "step": 1669 + }, + { + "epoch": 0.470720879430625, + "grad_norm": 612.006591796875, + "learning_rate": 5.697219272893488e-05, + "loss": 2.766, + "step": 1670 + }, + { + "epoch": 0.47100274822070326, + "grad_norm": 1064.0814208984375, + "learning_rate": 5.692706510451059e-05, + "loss": 2.8451, + "step": 1671 + }, + { + "epoch": 0.4712846170107815, + "grad_norm": 608.0045166015625, + "learning_rate": 5.6881931726131855e-05, + "loss": 2.8724, + "step": 1672 + }, + { + "epoch": 0.4715664858008597, + "grad_norm": 1384.003173828125, + "learning_rate": 5.683679263128867e-05, + "loss": 2.9792, + "step": 1673 + }, + { + "epoch": 0.47184835459093794, + "grad_norm": 1088.0040283203125, + "learning_rate": 5.6791647857475714e-05, + "loss": 3.1364, + "step": 1674 + }, + { + "epoch": 0.4721302233810161, + "grad_norm": 1272.0086669921875, + "learning_rate": 5.674649744219243e-05, + "loss": 3.2543, + "step": 1675 + }, + { + "epoch": 0.47241209217109437, + "grad_norm": 1544.0084228515625, + "learning_rate": 5.6701341422942935e-05, + "loss": 3.3747, + "step": 1676 + }, + { + "epoch": 0.47269396096117255, + "grad_norm": 732.0042114257812, + "learning_rate": 5.665617983723598e-05, + "loss": 2.7178, + "step": 1677 + }, + { + "epoch": 0.4729758297512508, + "grad_norm": 908.0036010742188, + "learning_rate": 5.661101272258498e-05, + "loss": 3.1455, + "step": 1678 + }, + { + "epoch": 0.473257698541329, + "grad_norm": 1512.0052490234375, + "learning_rate": 5.6565840116507894e-05, + "loss": 2.9968, + "step": 1679 + }, + { + "epoch": 0.47353956733140723, + "grad_norm": 1040.006103515625, + "learning_rate": 5.652066205652727e-05, + "loss": 2.8965, + "step": 1680 + }, + { + "epoch": 0.4738214361214855, + "grad_norm": 1352.0028076171875, + "learning_rate": 5.6475478580170214e-05, + "loss": 2.9476, + "step": 1681 + }, + { + "epoch": 0.47410330491156366, + "grad_norm": 1832.009521484375, + "learning_rate": 5.643028972496828e-05, + "loss": 2.737, + "step": 1682 + }, + { + "epoch": 0.4743851737016419, + "grad_norm": 1296.0096435546875, + "learning_rate": 5.638509552845751e-05, + "loss": 2.9779, + "step": 1683 + }, + { + "epoch": 0.4746670424917201, + "grad_norm": 892.0103149414062, + "learning_rate": 5.6339896028178375e-05, + "loss": 2.7016, + "step": 1684 + }, + { + "epoch": 0.47494891128179834, + "grad_norm": 1080.006103515625, + "learning_rate": 5.629469126167578e-05, + "loss": 2.8317, + "step": 1685 + }, + { + "epoch": 0.4752307800718765, + "grad_norm": 360.0567626953125, + "learning_rate": 5.624948126649898e-05, + "loss": 2.695, + "step": 1686 + }, + { + "epoch": 0.47551264886195477, + "grad_norm": 1408.035400390625, + "learning_rate": 5.620426608020156e-05, + "loss": 3.0313, + "step": 1687 + }, + { + "epoch": 0.47579451765203296, + "grad_norm": 468.01019287109375, + "learning_rate": 5.615904574034146e-05, + "loss": 2.722, + "step": 1688 + }, + { + "epoch": 0.4760763864421112, + "grad_norm": 1672.009521484375, + "learning_rate": 5.611382028448085e-05, + "loss": 2.7544, + "step": 1689 + }, + { + "epoch": 0.47635825523218944, + "grad_norm": 1040.007568359375, + "learning_rate": 5.6068589750186206e-05, + "loss": 2.6708, + "step": 1690 + }, + { + "epoch": 0.47664012402226763, + "grad_norm": 992.0062866210938, + "learning_rate": 5.6023354175028175e-05, + "loss": 3.049, + "step": 1691 + }, + { + "epoch": 0.4769219928123459, + "grad_norm": 1256.0113525390625, + "learning_rate": 5.5978113596581596e-05, + "loss": 3.0851, + "step": 1692 + }, + { + "epoch": 0.47720386160242406, + "grad_norm": 900.0137939453125, + "learning_rate": 5.593286805242549e-05, + "loss": 3.0516, + "step": 1693 + }, + { + "epoch": 0.4774857303925023, + "grad_norm": 1136.0089111328125, + "learning_rate": 5.588761758014298e-05, + "loss": 2.861, + "step": 1694 + }, + { + "epoch": 0.4777675991825805, + "grad_norm": 844.00634765625, + "learning_rate": 5.584236221732131e-05, + "loss": 3.0427, + "step": 1695 + }, + { + "epoch": 0.47804946797265874, + "grad_norm": 660.0076904296875, + "learning_rate": 5.579710200155175e-05, + "loss": 2.8985, + "step": 1696 + }, + { + "epoch": 0.4783313367627369, + "grad_norm": 1004.0077514648438, + "learning_rate": 5.5751836970429624e-05, + "loss": 2.8985, + "step": 1697 + }, + { + "epoch": 0.47861320555281517, + "grad_norm": 660.0089721679688, + "learning_rate": 5.570656716155426e-05, + "loss": 2.8093, + "step": 1698 + }, + { + "epoch": 0.47889507434289336, + "grad_norm": 1752.0086669921875, + "learning_rate": 5.56612926125289e-05, + "loss": 3.2931, + "step": 1699 + }, + { + "epoch": 0.4791769431329716, + "grad_norm": 1336.0079345703125, + "learning_rate": 5.561601336096084e-05, + "loss": 3.3252, + "step": 1700 + }, + { + "epoch": 0.47945881192304984, + "grad_norm": 1304.0040283203125, + "learning_rate": 5.557072944446116e-05, + "loss": 3.072, + "step": 1701 + }, + { + "epoch": 0.47974068071312803, + "grad_norm": 1020.0036010742188, + "learning_rate": 5.552544090064488e-05, + "loss": 2.986, + "step": 1702 + }, + { + "epoch": 0.4800225495032063, + "grad_norm": 462.0047302246094, + "learning_rate": 5.548014776713084e-05, + "loss": 2.9053, + "step": 1703 + }, + { + "epoch": 0.48030441829328446, + "grad_norm": 1536.0040283203125, + "learning_rate": 5.543485008154171e-05, + "loss": 2.6445, + "step": 1704 + }, + { + "epoch": 0.4805862870833627, + "grad_norm": 720.0055541992188, + "learning_rate": 5.5389547881503947e-05, + "loss": 2.6992, + "step": 1705 + }, + { + "epoch": 0.4808681558734409, + "grad_norm": 1304.0062255859375, + "learning_rate": 5.534424120464772e-05, + "loss": 2.6647, + "step": 1706 + }, + { + "epoch": 0.48115002466351914, + "grad_norm": 660.00341796875, + "learning_rate": 5.5298930088606946e-05, + "loss": 2.9697, + "step": 1707 + }, + { + "epoch": 0.4814318934535973, + "grad_norm": 980.0034790039062, + "learning_rate": 5.525361457101923e-05, + "loss": 3.1403, + "step": 1708 + }, + { + "epoch": 0.48171376224367557, + "grad_norm": 1296.008056640625, + "learning_rate": 5.520829468952582e-05, + "loss": 3.1592, + "step": 1709 + }, + { + "epoch": 0.4819956310337538, + "grad_norm": 380.00567626953125, + "learning_rate": 5.516297048177162e-05, + "loss": 3.0632, + "step": 1710 + }, + { + "epoch": 0.482277499823832, + "grad_norm": 1320.0047607421875, + "learning_rate": 5.511764198540506e-05, + "loss": 2.736, + "step": 1711 + }, + { + "epoch": 0.48255936861391024, + "grad_norm": 1792.0072021484375, + "learning_rate": 5.507230923807821e-05, + "loss": 3.2617, + "step": 1712 + }, + { + "epoch": 0.48284123740398843, + "grad_norm": 1384.0072021484375, + "learning_rate": 5.502697227744662e-05, + "loss": 3.1823, + "step": 1713 + }, + { + "epoch": 0.4831231061940667, + "grad_norm": 1784.0130615234375, + "learning_rate": 5.498163114116936e-05, + "loss": 3.0716, + "step": 1714 + }, + { + "epoch": 0.48340497498414486, + "grad_norm": 238.00942993164062, + "learning_rate": 5.493628586690898e-05, + "loss": 2.8639, + "step": 1715 + }, + { + "epoch": 0.4836868437742231, + "grad_norm": 1552.0059814453125, + "learning_rate": 5.4890936492331414e-05, + "loss": 2.8008, + "step": 1716 + }, + { + "epoch": 0.4839687125643013, + "grad_norm": 424.0069580078125, + "learning_rate": 5.484558305510609e-05, + "loss": 3.0589, + "step": 1717 + }, + { + "epoch": 0.48425058135437954, + "grad_norm": 1384.00439453125, + "learning_rate": 5.480022559290573e-05, + "loss": 2.5114, + "step": 1718 + }, + { + "epoch": 0.4845324501444578, + "grad_norm": 416.0391845703125, + "learning_rate": 5.475486414340643e-05, + "loss": 3.1257, + "step": 1719 + }, + { + "epoch": 0.48481431893453597, + "grad_norm": 708.0057373046875, + "learning_rate": 5.47094987442876e-05, + "loss": 3.0475, + "step": 1720 + }, + { + "epoch": 0.4850961877246142, + "grad_norm": 408.00555419921875, + "learning_rate": 5.4664129433231945e-05, + "loss": 2.313, + "step": 1721 + }, + { + "epoch": 0.4853780565146924, + "grad_norm": 1472.0062255859375, + "learning_rate": 5.46187562479254e-05, + "loss": 2.7178, + "step": 1722 + }, + { + "epoch": 0.48565992530477065, + "grad_norm": 1456.0072021484375, + "learning_rate": 5.4573379226057086e-05, + "loss": 3.5183, + "step": 1723 + }, + { + "epoch": 0.48594179409484883, + "grad_norm": 2096.008544921875, + "learning_rate": 5.452799840531939e-05, + "loss": 2.502, + "step": 1724 + }, + { + "epoch": 0.4862236628849271, + "grad_norm": 532.0068359375, + "learning_rate": 5.448261382340778e-05, + "loss": 3.3473, + "step": 1725 + }, + { + "epoch": 0.48650553167500526, + "grad_norm": 1080.0032958984375, + "learning_rate": 5.4437225518020905e-05, + "loss": 3.0801, + "step": 1726 + }, + { + "epoch": 0.4867874004650835, + "grad_norm": 2192.0126953125, + "learning_rate": 5.439183352686047e-05, + "loss": 3.8024, + "step": 1727 + }, + { + "epoch": 0.4870692692551617, + "grad_norm": 1168.004150390625, + "learning_rate": 5.434643788763125e-05, + "loss": 2.9538, + "step": 1728 + }, + { + "epoch": 0.48735113804523994, + "grad_norm": 1936.0101318359375, + "learning_rate": 5.430103863804107e-05, + "loss": 3.5277, + "step": 1729 + }, + { + "epoch": 0.4876330068353182, + "grad_norm": 1152.00439453125, + "learning_rate": 5.4255635815800686e-05, + "loss": 3.4652, + "step": 1730 + }, + { + "epoch": 0.48791487562539637, + "grad_norm": 1560.00390625, + "learning_rate": 5.421022945862393e-05, + "loss": 2.4496, + "step": 1731 + }, + { + "epoch": 0.4881967444154746, + "grad_norm": 1416.0069580078125, + "learning_rate": 5.416481960422748e-05, + "loss": 3.337, + "step": 1732 + }, + { + "epoch": 0.4884786132055528, + "grad_norm": 1120.0050048828125, + "learning_rate": 5.411940629033098e-05, + "loss": 3.0852, + "step": 1733 + }, + { + "epoch": 0.48876048199563105, + "grad_norm": 1032.007080078125, + "learning_rate": 5.407398955465688e-05, + "loss": 3.3413, + "step": 1734 + }, + { + "epoch": 0.48904235078570923, + "grad_norm": 1256.1995849609375, + "learning_rate": 5.402856943493053e-05, + "loss": 2.9525, + "step": 1735 + }, + { + "epoch": 0.4893242195757875, + "grad_norm": 1200.004150390625, + "learning_rate": 5.39831459688801e-05, + "loss": 2.7595, + "step": 1736 + }, + { + "epoch": 0.48960608836586567, + "grad_norm": 708.00732421875, + "learning_rate": 5.393771919423647e-05, + "loss": 2.8257, + "step": 1737 + }, + { + "epoch": 0.4898879571559439, + "grad_norm": 800.0050048828125, + "learning_rate": 5.389228914873333e-05, + "loss": 2.865, + "step": 1738 + }, + { + "epoch": 0.49016982594602215, + "grad_norm": 438.0086975097656, + "learning_rate": 5.384685587010706e-05, + "loss": 3.1065, + "step": 1739 + }, + { + "epoch": 0.49045169473610034, + "grad_norm": 1304.005615234375, + "learning_rate": 5.380141939609673e-05, + "loss": 3.056, + "step": 1740 + }, + { + "epoch": 0.4907335635261786, + "grad_norm": 1020.0086059570312, + "learning_rate": 5.37559797644441e-05, + "loss": 2.8569, + "step": 1741 + }, + { + "epoch": 0.4910154323162568, + "grad_norm": 1496.005126953125, + "learning_rate": 5.371053701289347e-05, + "loss": 2.7784, + "step": 1742 + }, + { + "epoch": 0.491297301106335, + "grad_norm": 536.0082397460938, + "learning_rate": 5.3665091179191817e-05, + "loss": 2.7803, + "step": 1743 + }, + { + "epoch": 0.4915791698964132, + "grad_norm": 776.0074462890625, + "learning_rate": 5.361964230108862e-05, + "loss": 2.8996, + "step": 1744 + }, + { + "epoch": 0.49186103868649145, + "grad_norm": 2208.008056640625, + "learning_rate": 5.357419041633592e-05, + "loss": 2.7938, + "step": 1745 + }, + { + "epoch": 0.49214290747656964, + "grad_norm": 732.0087280273438, + "learning_rate": 5.352873556268827e-05, + "loss": 3.0, + "step": 1746 + }, + { + "epoch": 0.4924247762666479, + "grad_norm": 976.0106201171875, + "learning_rate": 5.348327777790262e-05, + "loss": 2.8281, + "step": 1747 + }, + { + "epoch": 0.49270664505672607, + "grad_norm": 824.0051879882812, + "learning_rate": 5.343781709973843e-05, + "loss": 3.1319, + "step": 1748 + }, + { + "epoch": 0.4929885138468043, + "grad_norm": 1536.0123291015625, + "learning_rate": 5.339235356595751e-05, + "loss": 3.1605, + "step": 1749 + }, + { + "epoch": 0.49327038263688255, + "grad_norm": 1128.0107421875, + "learning_rate": 5.334688721432408e-05, + "loss": 3.532, + "step": 1750 + }, + { + "epoch": 0.49355225142696074, + "grad_norm": 1232.002685546875, + "learning_rate": 5.330141808260468e-05, + "loss": 2.7396, + "step": 1751 + }, + { + "epoch": 0.493834120217039, + "grad_norm": 1296.00634765625, + "learning_rate": 5.3255946208568174e-05, + "loss": 3.0593, + "step": 1752 + }, + { + "epoch": 0.4941159890071172, + "grad_norm": 808.0045166015625, + "learning_rate": 5.321047162998568e-05, + "loss": 2.8766, + "step": 1753 + }, + { + "epoch": 0.4943978577971954, + "grad_norm": 2288.0078125, + "learning_rate": 5.3164994384630574e-05, + "loss": 3.306, + "step": 1754 + }, + { + "epoch": 0.4946797265872736, + "grad_norm": 2624.008056640625, + "learning_rate": 5.3119514510278455e-05, + "loss": 3.3607, + "step": 1755 + }, + { + "epoch": 0.49496159537735185, + "grad_norm": 880.0183715820312, + "learning_rate": 5.307403204470711e-05, + "loss": 3.1992, + "step": 1756 + }, + { + "epoch": 0.49524346416743004, + "grad_norm": 1408.0072021484375, + "learning_rate": 5.302854702569646e-05, + "loss": 3.0375, + "step": 1757 + }, + { + "epoch": 0.4955253329575083, + "grad_norm": 728.0042724609375, + "learning_rate": 5.2983059491028556e-05, + "loss": 3.325, + "step": 1758 + }, + { + "epoch": 0.4958072017475865, + "grad_norm": 1048.0048828125, + "learning_rate": 5.2937569478487544e-05, + "loss": 2.6899, + "step": 1759 + }, + { + "epoch": 0.4960890705376647, + "grad_norm": 1760.00830078125, + "learning_rate": 5.289207702585962e-05, + "loss": 3.1989, + "step": 1760 + }, + { + "epoch": 0.49637093932774295, + "grad_norm": 932.0048828125, + "learning_rate": 5.2846582170933004e-05, + "loss": 2.992, + "step": 1761 + }, + { + "epoch": 0.49665280811782114, + "grad_norm": 1328.0057373046875, + "learning_rate": 5.280108495149793e-05, + "loss": 2.4152, + "step": 1762 + }, + { + "epoch": 0.4969346769078994, + "grad_norm": 2432.01171875, + "learning_rate": 5.275558540534655e-05, + "loss": 3.1022, + "step": 1763 + }, + { + "epoch": 0.4972165456979776, + "grad_norm": 292.002685546875, + "learning_rate": 5.271008357027304e-05, + "loss": 2.69, + "step": 1764 + }, + { + "epoch": 0.4974984144880558, + "grad_norm": 1864.005615234375, + "learning_rate": 5.266457948407336e-05, + "loss": 3.4128, + "step": 1765 + }, + { + "epoch": 0.497780283278134, + "grad_norm": 1840.0087890625, + "learning_rate": 5.261907318454543e-05, + "loss": 2.7159, + "step": 1766 + }, + { + "epoch": 0.49806215206821225, + "grad_norm": 1592.0042724609375, + "learning_rate": 5.257356470948899e-05, + "loss": 2.8816, + "step": 1767 + }, + { + "epoch": 0.4983440208582905, + "grad_norm": 700.0032348632812, + "learning_rate": 5.252805409670554e-05, + "loss": 3.005, + "step": 1768 + }, + { + "epoch": 0.4986258896483687, + "grad_norm": 1464.00439453125, + "learning_rate": 5.2482541383998405e-05, + "loss": 2.7285, + "step": 1769 + }, + { + "epoch": 0.4989077584384469, + "grad_norm": 724.0037841796875, + "learning_rate": 5.243702660917265e-05, + "loss": 2.6176, + "step": 1770 + }, + { + "epoch": 0.4991896272285251, + "grad_norm": 1360.0032958984375, + "learning_rate": 5.239150981003502e-05, + "loss": 3.3672, + "step": 1771 + }, + { + "epoch": 0.49947149601860336, + "grad_norm": 992.0038452148438, + "learning_rate": 5.2345991024394005e-05, + "loss": 2.9359, + "step": 1772 + }, + { + "epoch": 0.49975336480868154, + "grad_norm": 1472.00830078125, + "learning_rate": 5.230047029005965e-05, + "loss": 3.0352, + "step": 1773 + }, + { + "epoch": 0.5000352335987598, + "grad_norm": 1352.004150390625, + "learning_rate": 5.225494764484373e-05, + "loss": 3.2904, + "step": 1774 + }, + { + "epoch": 0.500317102388838, + "grad_norm": 648.005126953125, + "learning_rate": 5.22094231265595e-05, + "loss": 2.9864, + "step": 1775 + }, + { + "epoch": 0.5005989711789162, + "grad_norm": 560.00439453125, + "learning_rate": 5.216389677302185e-05, + "loss": 3.1452, + "step": 1776 + }, + { + "epoch": 0.5008808399689945, + "grad_norm": 860.0043334960938, + "learning_rate": 5.211836862204715e-05, + "loss": 3.155, + "step": 1777 + }, + { + "epoch": 0.5011627087590727, + "grad_norm": 1624.0042724609375, + "learning_rate": 5.207283871145329e-05, + "loss": 2.6771, + "step": 1778 + }, + { + "epoch": 0.5014445775491508, + "grad_norm": 1112.0032958984375, + "learning_rate": 5.2027307079059604e-05, + "loss": 2.9346, + "step": 1779 + }, + { + "epoch": 0.5017264463392291, + "grad_norm": 1368.005615234375, + "learning_rate": 5.198177376268686e-05, + "loss": 3.1517, + "step": 1780 + }, + { + "epoch": 0.5020083151293073, + "grad_norm": 508.0031433105469, + "learning_rate": 5.193623880015723e-05, + "loss": 2.6999, + "step": 1781 + }, + { + "epoch": 0.5022901839193855, + "grad_norm": 1248.007568359375, + "learning_rate": 5.189070222929425e-05, + "loss": 2.8907, + "step": 1782 + }, + { + "epoch": 0.5025720527094637, + "grad_norm": 173.00991821289062, + "learning_rate": 5.1845164087922804e-05, + "loss": 2.5388, + "step": 1783 + }, + { + "epoch": 0.502853921499542, + "grad_norm": 932.0052490234375, + "learning_rate": 5.179962441386906e-05, + "loss": 2.8926, + "step": 1784 + }, + { + "epoch": 0.5031357902896202, + "grad_norm": 1296.004150390625, + "learning_rate": 5.175408324496046e-05, + "loss": 2.5721, + "step": 1785 + }, + { + "epoch": 0.5034176590796984, + "grad_norm": 776.006103515625, + "learning_rate": 5.170854061902569e-05, + "loss": 3.2679, + "step": 1786 + }, + { + "epoch": 0.5036995278697766, + "grad_norm": 1296.006103515625, + "learning_rate": 5.166299657389467e-05, + "loss": 2.7787, + "step": 1787 + }, + { + "epoch": 0.5039813966598549, + "grad_norm": 1352.004150390625, + "learning_rate": 5.161745114739849e-05, + "loss": 3.1504, + "step": 1788 + }, + { + "epoch": 0.504263265449933, + "grad_norm": 1012.0055541992188, + "learning_rate": 5.157190437736935e-05, + "loss": 3.0216, + "step": 1789 + }, + { + "epoch": 0.5045451342400112, + "grad_norm": 1184.010986328125, + "learning_rate": 5.1526356301640625e-05, + "loss": 3.2531, + "step": 1790 + }, + { + "epoch": 0.5048270030300895, + "grad_norm": 852.0035400390625, + "learning_rate": 5.14808069580467e-05, + "loss": 2.8828, + "step": 1791 + }, + { + "epoch": 0.5051088718201677, + "grad_norm": 1392.00634765625, + "learning_rate": 5.143525638442309e-05, + "loss": 2.974, + "step": 1792 + }, + { + "epoch": 0.5053907406102459, + "grad_norm": 680.008056640625, + "learning_rate": 5.1389704618606306e-05, + "loss": 2.9304, + "step": 1793 + }, + { + "epoch": 0.5056726094003241, + "grad_norm": 616.0057373046875, + "learning_rate": 5.134415169843382e-05, + "loss": 2.7951, + "step": 1794 + }, + { + "epoch": 0.5059544781904024, + "grad_norm": 1288.009521484375, + "learning_rate": 5.12985976617441e-05, + "loss": 3.3005, + "step": 1795 + }, + { + "epoch": 0.5062363469804806, + "grad_norm": 860.00830078125, + "learning_rate": 5.125304254637651e-05, + "loss": 2.7996, + "step": 1796 + }, + { + "epoch": 0.5065182157705588, + "grad_norm": 1584.011474609375, + "learning_rate": 5.120748639017133e-05, + "loss": 3.3077, + "step": 1797 + }, + { + "epoch": 0.5068000845606371, + "grad_norm": 636.0078125, + "learning_rate": 5.116192923096973e-05, + "loss": 2.7029, + "step": 1798 + }, + { + "epoch": 0.5070819533507153, + "grad_norm": 524.0087890625, + "learning_rate": 5.1116371106613636e-05, + "loss": 3.1192, + "step": 1799 + }, + { + "epoch": 0.5073638221407935, + "grad_norm": 920.00927734375, + "learning_rate": 5.1070812054945874e-05, + "loss": 2.9861, + "step": 1800 + }, + { + "epoch": 0.5076456909308716, + "grad_norm": 1024.0045166015625, + "learning_rate": 5.102525211380994e-05, + "loss": 3.1172, + "step": 1801 + }, + { + "epoch": 0.5079275597209499, + "grad_norm": 352.004150390625, + "learning_rate": 5.097969132105015e-05, + "loss": 2.7562, + "step": 1802 + }, + { + "epoch": 0.5082094285110281, + "grad_norm": 948.0059204101562, + "learning_rate": 5.09341297145115e-05, + "loss": 2.8653, + "step": 1803 + }, + { + "epoch": 0.5084912973011063, + "grad_norm": 932.00390625, + "learning_rate": 5.0888567332039635e-05, + "loss": 2.6189, + "step": 1804 + }, + { + "epoch": 0.5087731660911845, + "grad_norm": 608.0056762695312, + "learning_rate": 5.0843004211480896e-05, + "loss": 2.8691, + "step": 1805 + }, + { + "epoch": 0.5090550348812628, + "grad_norm": 1024.004150390625, + "learning_rate": 5.079744039068217e-05, + "loss": 2.7394, + "step": 1806 + }, + { + "epoch": 0.509336903671341, + "grad_norm": 1088.0032958984375, + "learning_rate": 5.075187590749101e-05, + "loss": 3.0567, + "step": 1807 + }, + { + "epoch": 0.5096187724614192, + "grad_norm": 920.003662109375, + "learning_rate": 5.070631079975545e-05, + "loss": 2.6462, + "step": 1808 + }, + { + "epoch": 0.5099006412514975, + "grad_norm": 2112.006103515625, + "learning_rate": 5.066074510532406e-05, + "loss": 3.1667, + "step": 1809 + }, + { + "epoch": 0.5101825100415757, + "grad_norm": 572.005126953125, + "learning_rate": 5.061517886204592e-05, + "loss": 2.8984, + "step": 1810 + }, + { + "epoch": 0.5104643788316539, + "grad_norm": 888.0045166015625, + "learning_rate": 5.056961210777051e-05, + "loss": 2.5635, + "step": 1811 + }, + { + "epoch": 0.510746247621732, + "grad_norm": 1184.005615234375, + "learning_rate": 5.052404488034785e-05, + "loss": 2.8841, + "step": 1812 + }, + { + "epoch": 0.5110281164118103, + "grad_norm": 568.005615234375, + "learning_rate": 5.047847721762821e-05, + "loss": 2.7083, + "step": 1813 + }, + { + "epoch": 0.5113099852018885, + "grad_norm": 1288.00634765625, + "learning_rate": 5.043290915746233e-05, + "loss": 2.9863, + "step": 1814 + }, + { + "epoch": 0.5115918539919667, + "grad_norm": 1488.0064697265625, + "learning_rate": 5.0387340737701194e-05, + "loss": 2.7966, + "step": 1815 + }, + { + "epoch": 0.5118737227820449, + "grad_norm": 1504.0091552734375, + "learning_rate": 5.0341771996196175e-05, + "loss": 3.2246, + "step": 1816 + }, + { + "epoch": 0.5121555915721232, + "grad_norm": 720.0029296875, + "learning_rate": 5.029620297079885e-05, + "loss": 2.9102, + "step": 1817 + }, + { + "epoch": 0.5124374603622014, + "grad_norm": 676.0046997070312, + "learning_rate": 5.025063369936104e-05, + "loss": 2.6061, + "step": 1818 + }, + { + "epoch": 0.5127193291522796, + "grad_norm": 1728.01171875, + "learning_rate": 5.02050642197348e-05, + "loss": 3.3242, + "step": 1819 + }, + { + "epoch": 0.5130011979423579, + "grad_norm": 672.004638671875, + "learning_rate": 5.0159494569772314e-05, + "loss": 2.5418, + "step": 1820 + }, + { + "epoch": 0.5132830667324361, + "grad_norm": 1016.0036010742188, + "learning_rate": 5.011392478732595e-05, + "loss": 2.7136, + "step": 1821 + }, + { + "epoch": 0.5135649355225143, + "grad_norm": 2000.0079345703125, + "learning_rate": 5.006835491024816e-05, + "loss": 3.5027, + "step": 1822 + }, + { + "epoch": 0.5138468043125924, + "grad_norm": 1336.00537109375, + "learning_rate": 5.002278497639149e-05, + "loss": 2.8259, + "step": 1823 + }, + { + "epoch": 0.5141286731026707, + "grad_norm": 656.0044555664062, + "learning_rate": 4.9977215023608524e-05, + "loss": 2.999, + "step": 1824 + }, + { + "epoch": 0.5144105418927489, + "grad_norm": 876.0064697265625, + "learning_rate": 4.9931645089751846e-05, + "loss": 3.0027, + "step": 1825 + }, + { + "epoch": 0.5146924106828271, + "grad_norm": 800.0033569335938, + "learning_rate": 4.9886075212674064e-05, + "loss": 2.5147, + "step": 1826 + }, + { + "epoch": 0.5149742794729054, + "grad_norm": 1136.0045166015625, + "learning_rate": 4.98405054302277e-05, + "loss": 2.8731, + "step": 1827 + }, + { + "epoch": 0.5152561482629836, + "grad_norm": 840.0050659179688, + "learning_rate": 4.9794935780265225e-05, + "loss": 2.7404, + "step": 1828 + }, + { + "epoch": 0.5155380170530618, + "grad_norm": 780.0062255859375, + "learning_rate": 4.974936630063896e-05, + "loss": 2.7302, + "step": 1829 + }, + { + "epoch": 0.51581988584314, + "grad_norm": 1432.0059814453125, + "learning_rate": 4.970379702920116e-05, + "loss": 3.6857, + "step": 1830 + }, + { + "epoch": 0.5161017546332183, + "grad_norm": 584.0057373046875, + "learning_rate": 4.965822800380383e-05, + "loss": 3.2793, + "step": 1831 + }, + { + "epoch": 0.5163836234232965, + "grad_norm": 1056.005126953125, + "learning_rate": 4.961265926229881e-05, + "loss": 2.6909, + "step": 1832 + }, + { + "epoch": 0.5166654922133747, + "grad_norm": 1928.013916015625, + "learning_rate": 4.956709084253769e-05, + "loss": 3.8203, + "step": 1833 + }, + { + "epoch": 0.5169473610034528, + "grad_norm": 896.0082397460938, + "learning_rate": 4.95215227823718e-05, + "loss": 2.8825, + "step": 1834 + }, + { + "epoch": 0.5172292297935311, + "grad_norm": 424.0269470214844, + "learning_rate": 4.947595511965216e-05, + "loss": 2.9883, + "step": 1835 + }, + { + "epoch": 0.5175110985836093, + "grad_norm": 1232.011962890625, + "learning_rate": 4.943038789222949e-05, + "loss": 2.8278, + "step": 1836 + }, + { + "epoch": 0.5177929673736875, + "grad_norm": 1264.0135498046875, + "learning_rate": 4.93848211379541e-05, + "loss": 2.8558, + "step": 1837 + }, + { + "epoch": 0.5180748361637658, + "grad_norm": 1064.0072021484375, + "learning_rate": 4.933925489467596e-05, + "loss": 2.8129, + "step": 1838 + }, + { + "epoch": 0.518356704953844, + "grad_norm": 704.0100708007812, + "learning_rate": 4.929368920024456e-05, + "loss": 2.8854, + "step": 1839 + }, + { + "epoch": 0.5186385737439222, + "grad_norm": 1736.005126953125, + "learning_rate": 4.924812409250899e-05, + "loss": 2.9076, + "step": 1840 + }, + { + "epoch": 0.5189204425340004, + "grad_norm": 844.00927734375, + "learning_rate": 4.9202559609317836e-05, + "loss": 2.5775, + "step": 1841 + }, + { + "epoch": 0.5192023113240787, + "grad_norm": 956.0079345703125, + "learning_rate": 4.9156995788519115e-05, + "loss": 2.5281, + "step": 1842 + }, + { + "epoch": 0.5194841801141569, + "grad_norm": 776.00634765625, + "learning_rate": 4.9111432667960377e-05, + "loss": 2.994, + "step": 1843 + }, + { + "epoch": 0.5197660489042351, + "grad_norm": 1288.009033203125, + "learning_rate": 4.9065870285488516e-05, + "loss": 3.4434, + "step": 1844 + }, + { + "epoch": 0.5200479176943132, + "grad_norm": 832.0364990234375, + "learning_rate": 4.902030867894986e-05, + "loss": 2.7345, + "step": 1845 + }, + { + "epoch": 0.5203297864843915, + "grad_norm": 1112.0250244140625, + "learning_rate": 4.8974747886190067e-05, + "loss": 2.5618, + "step": 1846 + }, + { + "epoch": 0.5206116552744697, + "grad_norm": 2320.009521484375, + "learning_rate": 4.892918794505416e-05, + "loss": 3.0157, + "step": 1847 + }, + { + "epoch": 0.5208935240645479, + "grad_norm": 1048.00830078125, + "learning_rate": 4.888362889338638e-05, + "loss": 2.9936, + "step": 1848 + }, + { + "epoch": 0.5211753928546262, + "grad_norm": 1288.0042724609375, + "learning_rate": 4.883807076903029e-05, + "loss": 2.868, + "step": 1849 + }, + { + "epoch": 0.5214572616447044, + "grad_norm": 744.0068969726562, + "learning_rate": 4.879251360982867e-05, + "loss": 2.8384, + "step": 1850 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 1720.0081787109375, + "learning_rate": 4.87469574536235e-05, + "loss": 3.4323, + "step": 1851 + }, + { + "epoch": 0.5220209992248608, + "grad_norm": 1248.0030517578125, + "learning_rate": 4.870140233825591e-05, + "loss": 2.9187, + "step": 1852 + }, + { + "epoch": 0.5223028680149391, + "grad_norm": 844.0032348632812, + "learning_rate": 4.865584830156619e-05, + "loss": 2.9858, + "step": 1853 + }, + { + "epoch": 0.5225847368050173, + "grad_norm": 1128.0045166015625, + "learning_rate": 4.8610295381393705e-05, + "loss": 2.6976, + "step": 1854 + }, + { + "epoch": 0.5228666055950955, + "grad_norm": 1832.0096435546875, + "learning_rate": 4.8564743615576916e-05, + "loss": 3.073, + "step": 1855 + }, + { + "epoch": 0.5231484743851738, + "grad_norm": 1040.0059814453125, + "learning_rate": 4.851919304195331e-05, + "loss": 3.254, + "step": 1856 + }, + { + "epoch": 0.5234303431752519, + "grad_norm": 1896.0076904296875, + "learning_rate": 4.847364369835941e-05, + "loss": 3.5316, + "step": 1857 + }, + { + "epoch": 0.5237122119653301, + "grad_norm": 1376.0062255859375, + "learning_rate": 4.842809562263066e-05, + "loss": 3.0323, + "step": 1858 + }, + { + "epoch": 0.5239940807554083, + "grad_norm": 852.0032348632812, + "learning_rate": 4.8382548852601536e-05, + "loss": 3.3425, + "step": 1859 + }, + { + "epoch": 0.5242759495454866, + "grad_norm": 1536.00341796875, + "learning_rate": 4.833700342610533e-05, + "loss": 3.2325, + "step": 1860 + }, + { + "epoch": 0.5245578183355648, + "grad_norm": 1296.005615234375, + "learning_rate": 4.829145938097431e-05, + "loss": 3.7195, + "step": 1861 + }, + { + "epoch": 0.524839687125643, + "grad_norm": 1320.005859375, + "learning_rate": 4.8245916755039554e-05, + "loss": 3.4014, + "step": 1862 + }, + { + "epoch": 0.5251215559157212, + "grad_norm": 250.00575256347656, + "learning_rate": 4.820037558613095e-05, + "loss": 2.7491, + "step": 1863 + }, + { + "epoch": 0.5254034247057995, + "grad_norm": 2576.007568359375, + "learning_rate": 4.815483591207721e-05, + "loss": 3.0629, + "step": 1864 + }, + { + "epoch": 0.5256852934958777, + "grad_norm": 1408.00390625, + "learning_rate": 4.810929777070576e-05, + "loss": 3.1963, + "step": 1865 + }, + { + "epoch": 0.5259671622859559, + "grad_norm": 1064.004638671875, + "learning_rate": 4.8063761199842786e-05, + "loss": 2.7155, + "step": 1866 + }, + { + "epoch": 0.5262490310760342, + "grad_norm": 984.00244140625, + "learning_rate": 4.8018226237313165e-05, + "loss": 2.7029, + "step": 1867 + }, + { + "epoch": 0.5265308998661123, + "grad_norm": 876.0054321289062, + "learning_rate": 4.7972692920940414e-05, + "loss": 2.9207, + "step": 1868 + }, + { + "epoch": 0.5268127686561905, + "grad_norm": 2336.00634765625, + "learning_rate": 4.792716128854674e-05, + "loss": 2.8659, + "step": 1869 + }, + { + "epoch": 0.5270946374462687, + "grad_norm": 1024.003662109375, + "learning_rate": 4.7881631377952854e-05, + "loss": 2.7739, + "step": 1870 + }, + { + "epoch": 0.527376506236347, + "grad_norm": 1224.0037841796875, + "learning_rate": 4.7836103226978154e-05, + "loss": 3.0427, + "step": 1871 + }, + { + "epoch": 0.5276583750264252, + "grad_norm": 656.0303344726562, + "learning_rate": 4.779057687344051e-05, + "loss": 2.6157, + "step": 1872 + }, + { + "epoch": 0.5279402438165034, + "grad_norm": 1144.0057373046875, + "learning_rate": 4.774505235515628e-05, + "loss": 3.0228, + "step": 1873 + }, + { + "epoch": 0.5282221126065816, + "grad_norm": 1760.0072021484375, + "learning_rate": 4.769952970994035e-05, + "loss": 3.3318, + "step": 1874 + }, + { + "epoch": 0.5285039813966599, + "grad_norm": 1032.004150390625, + "learning_rate": 4.7654008975606006e-05, + "loss": 2.6797, + "step": 1875 + }, + { + "epoch": 0.5287858501867381, + "grad_norm": 1640.0057373046875, + "learning_rate": 4.760849018996498e-05, + "loss": 3.0798, + "step": 1876 + }, + { + "epoch": 0.5290677189768163, + "grad_norm": 488.0033264160156, + "learning_rate": 4.7562973390827356e-05, + "loss": 2.9362, + "step": 1877 + }, + { + "epoch": 0.5293495877668946, + "grad_norm": 2400.00732421875, + "learning_rate": 4.7517458616001606e-05, + "loss": 2.7826, + "step": 1878 + }, + { + "epoch": 0.5296314565569727, + "grad_norm": 2224.007568359375, + "learning_rate": 4.7471945903294493e-05, + "loss": 3.2374, + "step": 1879 + }, + { + "epoch": 0.5299133253470509, + "grad_norm": 712.003662109375, + "learning_rate": 4.7426435290511016e-05, + "loss": 3.3331, + "step": 1880 + }, + { + "epoch": 0.5301951941371291, + "grad_norm": 884.003662109375, + "learning_rate": 4.7380926815454566e-05, + "loss": 2.9236, + "step": 1881 + }, + { + "epoch": 0.5304770629272074, + "grad_norm": 828.0372924804688, + "learning_rate": 4.7335420515926646e-05, + "loss": 2.9389, + "step": 1882 + }, + { + "epoch": 0.5307589317172856, + "grad_norm": 920.0059204101562, + "learning_rate": 4.728991642972698e-05, + "loss": 2.7247, + "step": 1883 + }, + { + "epoch": 0.5310408005073638, + "grad_norm": 896.0101928710938, + "learning_rate": 4.724441459465346e-05, + "loss": 2.5365, + "step": 1884 + }, + { + "epoch": 0.5313226692974421, + "grad_norm": 1152.00537109375, + "learning_rate": 4.719891504850209e-05, + "loss": 3.0691, + "step": 1885 + }, + { + "epoch": 0.5316045380875203, + "grad_norm": 1312.0048828125, + "learning_rate": 4.7153417829067014e-05, + "loss": 2.8643, + "step": 1886 + }, + { + "epoch": 0.5318864068775985, + "grad_norm": 968.0068359375, + "learning_rate": 4.7107922974140395e-05, + "loss": 3.0352, + "step": 1887 + }, + { + "epoch": 0.5321682756676767, + "grad_norm": 1480.0045166015625, + "learning_rate": 4.7062430521512474e-05, + "loss": 2.4865, + "step": 1888 + }, + { + "epoch": 0.532450144457755, + "grad_norm": 1208.005126953125, + "learning_rate": 4.701694050897145e-05, + "loss": 2.922, + "step": 1889 + }, + { + "epoch": 0.5327320132478331, + "grad_norm": 796.011474609375, + "learning_rate": 4.6971452974303535e-05, + "loss": 2.7723, + "step": 1890 + }, + { + "epoch": 0.5330138820379113, + "grad_norm": 1840.0242919921875, + "learning_rate": 4.692596795529289e-05, + "loss": 2.5982, + "step": 1891 + }, + { + "epoch": 0.5332957508279895, + "grad_norm": 1192.008544921875, + "learning_rate": 4.688048548972154e-05, + "loss": 2.8253, + "step": 1892 + }, + { + "epoch": 0.5335776196180678, + "grad_norm": 1848.0113525390625, + "learning_rate": 4.683500561536944e-05, + "loss": 2.9213, + "step": 1893 + }, + { + "epoch": 0.533859488408146, + "grad_norm": 2640.02197265625, + "learning_rate": 4.678952837001433e-05, + "loss": 3.414, + "step": 1894 + }, + { + "epoch": 0.5341413571982242, + "grad_norm": 520.013671875, + "learning_rate": 4.674405379143184e-05, + "loss": 3.0005, + "step": 1895 + }, + { + "epoch": 0.5344232259883025, + "grad_norm": 1000.0093383789062, + "learning_rate": 4.669858191739533e-05, + "loss": 2.608, + "step": 1896 + }, + { + "epoch": 0.5347050947783807, + "grad_norm": 1128.012451171875, + "learning_rate": 4.665311278567593e-05, + "loss": 3.0952, + "step": 1897 + }, + { + "epoch": 0.5349869635684589, + "grad_norm": 1008.0097045898438, + "learning_rate": 4.660764643404251e-05, + "loss": 3.1362, + "step": 1898 + }, + { + "epoch": 0.5352688323585371, + "grad_norm": 1320.003173828125, + "learning_rate": 4.656218290026159e-05, + "loss": 2.4138, + "step": 1899 + }, + { + "epoch": 0.5355507011486154, + "grad_norm": 1024.003662109375, + "learning_rate": 4.651672222209738e-05, + "loss": 3.0027, + "step": 1900 + }, + { + "epoch": 0.5358325699386935, + "grad_norm": 1224.00537109375, + "learning_rate": 4.647126443731174e-05, + "loss": 2.9214, + "step": 1901 + }, + { + "epoch": 0.5361144387287717, + "grad_norm": 1704.0067138671875, + "learning_rate": 4.642580958366407e-05, + "loss": 2.5704, + "step": 1902 + }, + { + "epoch": 0.5363963075188499, + "grad_norm": 800.0045166015625, + "learning_rate": 4.6380357698911384e-05, + "loss": 2.5707, + "step": 1903 + }, + { + "epoch": 0.5366781763089282, + "grad_norm": 1216.0035400390625, + "learning_rate": 4.6334908820808195e-05, + "loss": 3.198, + "step": 1904 + }, + { + "epoch": 0.5369600450990064, + "grad_norm": 1088.0037841796875, + "learning_rate": 4.628946298710655e-05, + "loss": 3.084, + "step": 1905 + }, + { + "epoch": 0.5372419138890846, + "grad_norm": 972.003662109375, + "learning_rate": 4.624402023555592e-05, + "loss": 2.8913, + "step": 1906 + }, + { + "epoch": 0.5375237826791629, + "grad_norm": 1992.0064697265625, + "learning_rate": 4.619858060390328e-05, + "loss": 3.1234, + "step": 1907 + }, + { + "epoch": 0.5378056514692411, + "grad_norm": 1880.0054931640625, + "learning_rate": 4.6153144129892954e-05, + "loss": 3.835, + "step": 1908 + }, + { + "epoch": 0.5380875202593193, + "grad_norm": 2112.010498046875, + "learning_rate": 4.6107710851266695e-05, + "loss": 3.4044, + "step": 1909 + }, + { + "epoch": 0.5383693890493975, + "grad_norm": 1248.005859375, + "learning_rate": 4.606228080576356e-05, + "loss": 3.1117, + "step": 1910 + }, + { + "epoch": 0.5386512578394758, + "grad_norm": 2064.0078125, + "learning_rate": 4.6016854031119906e-05, + "loss": 3.9424, + "step": 1911 + }, + { + "epoch": 0.538933126629554, + "grad_norm": 1448.0323486328125, + "learning_rate": 4.597143056506946e-05, + "loss": 3.3669, + "step": 1912 + }, + { + "epoch": 0.5392149954196321, + "grad_norm": 1472.005126953125, + "learning_rate": 4.592601044534313e-05, + "loss": 3.3291, + "step": 1913 + }, + { + "epoch": 0.5394968642097103, + "grad_norm": 442.0043640136719, + "learning_rate": 4.5880593709669035e-05, + "loss": 3.2058, + "step": 1914 + }, + { + "epoch": 0.5397787329997886, + "grad_norm": 756.0042724609375, + "learning_rate": 4.583518039577252e-05, + "loss": 2.9076, + "step": 1915 + }, + { + "epoch": 0.5400606017898668, + "grad_norm": 1448.0030517578125, + "learning_rate": 4.578977054137608e-05, + "loss": 2.9668, + "step": 1916 + }, + { + "epoch": 0.540342470579945, + "grad_norm": 1712.00390625, + "learning_rate": 4.5744364184199325e-05, + "loss": 3.3539, + "step": 1917 + }, + { + "epoch": 0.5406243393700233, + "grad_norm": 1528.004638671875, + "learning_rate": 4.5698961361958955e-05, + "loss": 2.9541, + "step": 1918 + }, + { + "epoch": 0.5409062081601015, + "grad_norm": 836.0035400390625, + "learning_rate": 4.565356211236876e-05, + "loss": 2.9769, + "step": 1919 + }, + { + "epoch": 0.5411880769501797, + "grad_norm": 502.0028381347656, + "learning_rate": 4.560816647313954e-05, + "loss": 2.7627, + "step": 1920 + }, + { + "epoch": 0.5414699457402579, + "grad_norm": 1320.0042724609375, + "learning_rate": 4.5562774481979087e-05, + "loss": 2.9304, + "step": 1921 + }, + { + "epoch": 0.5417518145303362, + "grad_norm": 1528.0035400390625, + "learning_rate": 4.551738617659222e-05, + "loss": 2.752, + "step": 1922 + }, + { + "epoch": 0.5420336833204144, + "grad_norm": 812.0028686523438, + "learning_rate": 4.547200159468061e-05, + "loss": 2.9229, + "step": 1923 + }, + { + "epoch": 0.5423155521104925, + "grad_norm": 736.0027465820312, + "learning_rate": 4.5426620773942926e-05, + "loss": 2.5091, + "step": 1924 + }, + { + "epoch": 0.5425974209005708, + "grad_norm": 804.00341796875, + "learning_rate": 4.538124375207462e-05, + "loss": 2.5355, + "step": 1925 + }, + { + "epoch": 0.542879289690649, + "grad_norm": 2112.009521484375, + "learning_rate": 4.533587056676807e-05, + "loss": 3.4219, + "step": 1926 + }, + { + "epoch": 0.5431611584807272, + "grad_norm": 1536.006591796875, + "learning_rate": 4.5290501255712415e-05, + "loss": 3.0446, + "step": 1927 + }, + { + "epoch": 0.5434430272708054, + "grad_norm": 2288.010009765625, + "learning_rate": 4.524513585659359e-05, + "loss": 3.5742, + "step": 1928 + }, + { + "epoch": 0.5437248960608837, + "grad_norm": 784.0047607421875, + "learning_rate": 4.5199774407094296e-05, + "loss": 2.7706, + "step": 1929 + }, + { + "epoch": 0.5440067648509619, + "grad_norm": 916.0039672851562, + "learning_rate": 4.515441694489393e-05, + "loss": 2.5449, + "step": 1930 + }, + { + "epoch": 0.5442886336410401, + "grad_norm": 1648.00439453125, + "learning_rate": 4.5109063507668584e-05, + "loss": 3.4245, + "step": 1931 + }, + { + "epoch": 0.5445705024311183, + "grad_norm": 272.00567626953125, + "learning_rate": 4.506371413309104e-05, + "loss": 2.8744, + "step": 1932 + }, + { + "epoch": 0.5448523712211966, + "grad_norm": 1256.005615234375, + "learning_rate": 4.501836885883065e-05, + "loss": 3.3559, + "step": 1933 + }, + { + "epoch": 0.5451342400112748, + "grad_norm": 240.0050506591797, + "learning_rate": 4.49730277225534e-05, + "loss": 2.7735, + "step": 1934 + }, + { + "epoch": 0.5454161088013529, + "grad_norm": 708.0072631835938, + "learning_rate": 4.49276907619218e-05, + "loss": 2.8982, + "step": 1935 + }, + { + "epoch": 0.5456979775914312, + "grad_norm": 888.0042724609375, + "learning_rate": 4.4882358014594955e-05, + "loss": 2.6481, + "step": 1936 + }, + { + "epoch": 0.5459798463815094, + "grad_norm": 510.003173828125, + "learning_rate": 4.48370295182284e-05, + "loss": 2.7891, + "step": 1937 + }, + { + "epoch": 0.5462617151715876, + "grad_norm": 1112.0045166015625, + "learning_rate": 4.4791705310474195e-05, + "loss": 2.6804, + "step": 1938 + }, + { + "epoch": 0.5465435839616658, + "grad_norm": 932.0036010742188, + "learning_rate": 4.474638542898078e-05, + "loss": 2.6729, + "step": 1939 + }, + { + "epoch": 0.5468254527517441, + "grad_norm": 2208.0078125, + "learning_rate": 4.470106991139307e-05, + "loss": 2.6813, + "step": 1940 + }, + { + "epoch": 0.5471073215418223, + "grad_norm": 1072.00341796875, + "learning_rate": 4.465575879535229e-05, + "loss": 2.8965, + "step": 1941 + }, + { + "epoch": 0.5473891903319005, + "grad_norm": 720.0053100585938, + "learning_rate": 4.461045211849605e-05, + "loss": 3.2425, + "step": 1942 + }, + { + "epoch": 0.5476710591219787, + "grad_norm": 1288.0068359375, + "learning_rate": 4.4565149918458294e-05, + "loss": 3.1169, + "step": 1943 + }, + { + "epoch": 0.547952927912057, + "grad_norm": 1664.00439453125, + "learning_rate": 4.4519852232869173e-05, + "loss": 3.0415, + "step": 1944 + }, + { + "epoch": 0.5482347967021352, + "grad_norm": 1408.0045166015625, + "learning_rate": 4.447455909935513e-05, + "loss": 2.9434, + "step": 1945 + }, + { + "epoch": 0.5485166654922133, + "grad_norm": 888.0042724609375, + "learning_rate": 4.442927055553886e-05, + "loss": 2.7241, + "step": 1946 + }, + { + "epoch": 0.5487985342822916, + "grad_norm": 1408.00732421875, + "learning_rate": 4.438398663903918e-05, + "loss": 3.7715, + "step": 1947 + }, + { + "epoch": 0.5490804030723698, + "grad_norm": 488.00323486328125, + "learning_rate": 4.4338707387471104e-05, + "loss": 2.4977, + "step": 1948 + }, + { + "epoch": 0.549362271862448, + "grad_norm": 370.0054626464844, + "learning_rate": 4.429343283844577e-05, + "loss": 3.1589, + "step": 1949 + }, + { + "epoch": 0.5496441406525262, + "grad_norm": 1720.006591796875, + "learning_rate": 4.42481630295704e-05, + "loss": 2.9421, + "step": 1950 + }, + { + "epoch": 0.5499260094426045, + "grad_norm": 2128.008544921875, + "learning_rate": 4.4202897998448254e-05, + "loss": 3.207, + "step": 1951 + }, + { + "epoch": 0.5502078782326827, + "grad_norm": 724.0023193359375, + "learning_rate": 4.415763778267869e-05, + "loss": 2.5811, + "step": 1952 + }, + { + "epoch": 0.5504897470227609, + "grad_norm": 1040.0035400390625, + "learning_rate": 4.4112382419857026e-05, + "loss": 3.2959, + "step": 1953 + }, + { + "epoch": 0.5507716158128392, + "grad_norm": 712.00341796875, + "learning_rate": 4.406713194757451e-05, + "loss": 2.7074, + "step": 1954 + }, + { + "epoch": 0.5510534846029174, + "grad_norm": 792.003173828125, + "learning_rate": 4.4021886403418416e-05, + "loss": 2.7777, + "step": 1955 + }, + { + "epoch": 0.5513353533929956, + "grad_norm": 1304.005615234375, + "learning_rate": 4.3976645824971844e-05, + "loss": 2.415, + "step": 1956 + }, + { + "epoch": 0.5516172221830737, + "grad_norm": 1632.009033203125, + "learning_rate": 4.3931410249813806e-05, + "loss": 3.1279, + "step": 1957 + }, + { + "epoch": 0.551899090973152, + "grad_norm": 588.0032348632812, + "learning_rate": 4.388617971551915e-05, + "loss": 2.7783, + "step": 1958 + }, + { + "epoch": 0.5521809597632302, + "grad_norm": 688.0054321289062, + "learning_rate": 4.3840954259658556e-05, + "loss": 2.6846, + "step": 1959 + }, + { + "epoch": 0.5524628285533084, + "grad_norm": 844.032958984375, + "learning_rate": 4.379573391979846e-05, + "loss": 3.2276, + "step": 1960 + }, + { + "epoch": 0.5527446973433866, + "grad_norm": 876.0027465820312, + "learning_rate": 4.3750518733501045e-05, + "loss": 3.1162, + "step": 1961 + }, + { + "epoch": 0.5530265661334649, + "grad_norm": 284.0064697265625, + "learning_rate": 4.370530873832422e-05, + "loss": 3.0651, + "step": 1962 + }, + { + "epoch": 0.5533084349235431, + "grad_norm": 1144.0042724609375, + "learning_rate": 4.366010397182163e-05, + "loss": 3.375, + "step": 1963 + }, + { + "epoch": 0.5535903037136213, + "grad_norm": 616.00439453125, + "learning_rate": 4.36149044715425e-05, + "loss": 3.0195, + "step": 1964 + }, + { + "epoch": 0.5538721725036996, + "grad_norm": 1800.0045166015625, + "learning_rate": 4.356971027503173e-05, + "loss": 2.7778, + "step": 1965 + }, + { + "epoch": 0.5541540412937778, + "grad_norm": 1200.004150390625, + "learning_rate": 4.352452141982979e-05, + "loss": 3.0459, + "step": 1966 + }, + { + "epoch": 0.554435910083856, + "grad_norm": 1272.007080078125, + "learning_rate": 4.3479337943472734e-05, + "loss": 3.1338, + "step": 1967 + }, + { + "epoch": 0.5547177788739341, + "grad_norm": 956.0076293945312, + "learning_rate": 4.343415988349212e-05, + "loss": 3.0597, + "step": 1968 + }, + { + "epoch": 0.5549996476640124, + "grad_norm": 724.0037841796875, + "learning_rate": 4.3388987277415046e-05, + "loss": 2.8776, + "step": 1969 + }, + { + "epoch": 0.5552815164540906, + "grad_norm": 836.0054931640625, + "learning_rate": 4.334382016276403e-05, + "loss": 2.945, + "step": 1970 + }, + { + "epoch": 0.5555633852441688, + "grad_norm": 1032.003662109375, + "learning_rate": 4.329865857705709e-05, + "loss": 2.8545, + "step": 1971 + }, + { + "epoch": 0.555845254034247, + "grad_norm": 3120.007080078125, + "learning_rate": 4.325350255780757e-05, + "loss": 2.6771, + "step": 1972 + }, + { + "epoch": 0.5561271228243253, + "grad_norm": 1072.0048828125, + "learning_rate": 4.320835214252429e-05, + "loss": 2.9229, + "step": 1973 + }, + { + "epoch": 0.5564089916144035, + "grad_norm": 1368.00537109375, + "learning_rate": 4.316320736871134e-05, + "loss": 2.9535, + "step": 1974 + }, + { + "epoch": 0.5566908604044817, + "grad_norm": 1712.00439453125, + "learning_rate": 4.311806827386815e-05, + "loss": 3.2051, + "step": 1975 + }, + { + "epoch": 0.55697272919456, + "grad_norm": 768.0087280273438, + "learning_rate": 4.307293489548942e-05, + "loss": 2.6865, + "step": 1976 + }, + { + "epoch": 0.5572545979846382, + "grad_norm": 988.0066528320312, + "learning_rate": 4.3027807271065126e-05, + "loss": 2.8369, + "step": 1977 + }, + { + "epoch": 0.5575364667747164, + "grad_norm": 1856.0081787109375, + "learning_rate": 4.298268543808043e-05, + "loss": 3.1934, + "step": 1978 + }, + { + "epoch": 0.5578183355647945, + "grad_norm": 288.0090637207031, + "learning_rate": 4.293756943401573e-05, + "loss": 3.4518, + "step": 1979 + }, + { + "epoch": 0.5581002043548728, + "grad_norm": 1816.0050048828125, + "learning_rate": 4.289245929634652e-05, + "loss": 2.4782, + "step": 1980 + }, + { + "epoch": 0.558382073144951, + "grad_norm": 564.0054931640625, + "learning_rate": 4.28473550625435e-05, + "loss": 3.2321, + "step": 1981 + }, + { + "epoch": 0.5586639419350292, + "grad_norm": 1048.00732421875, + "learning_rate": 4.280225677007237e-05, + "loss": 2.602, + "step": 1982 + }, + { + "epoch": 0.5589458107251075, + "grad_norm": 1064.0067138671875, + "learning_rate": 4.275716445639398e-05, + "loss": 3.0078, + "step": 1983 + }, + { + "epoch": 0.5592276795151857, + "grad_norm": 500.0083312988281, + "learning_rate": 4.2712078158964205e-05, + "loss": 2.6524, + "step": 1984 + }, + { + "epoch": 0.5595095483052639, + "grad_norm": 800.0086669921875, + "learning_rate": 4.2666997915233846e-05, + "loss": 3.1706, + "step": 1985 + }, + { + "epoch": 0.5597914170953421, + "grad_norm": 1528.0093994140625, + "learning_rate": 4.2621923762648776e-05, + "loss": 3.6065, + "step": 1986 + }, + { + "epoch": 0.5600732858854204, + "grad_norm": 940.0082397460938, + "learning_rate": 4.2576855738649714e-05, + "loss": 2.7132, + "step": 1987 + }, + { + "epoch": 0.5603551546754986, + "grad_norm": 1880.0068359375, + "learning_rate": 4.253179388067238e-05, + "loss": 2.8223, + "step": 1988 + }, + { + "epoch": 0.5606370234655768, + "grad_norm": 752.0055541992188, + "learning_rate": 4.2486738226147285e-05, + "loss": 2.6725, + "step": 1989 + }, + { + "epoch": 0.5609188922556549, + "grad_norm": 2000.008544921875, + "learning_rate": 4.244168881249986e-05, + "loss": 2.9024, + "step": 1990 + }, + { + "epoch": 0.5612007610457332, + "grad_norm": 1464.0074462890625, + "learning_rate": 4.2396645677150315e-05, + "loss": 3.419, + "step": 1991 + }, + { + "epoch": 0.5614826298358114, + "grad_norm": 1296.0091552734375, + "learning_rate": 4.235160885751362e-05, + "loss": 3.3499, + "step": 1992 + }, + { + "epoch": 0.5617644986258896, + "grad_norm": 1152.00732421875, + "learning_rate": 4.2306578390999576e-05, + "loss": 2.5999, + "step": 1993 + }, + { + "epoch": 0.5620463674159679, + "grad_norm": 3168.017578125, + "learning_rate": 4.226155431501264e-05, + "loss": 3.8125, + "step": 1994 + }, + { + "epoch": 0.5623282362060461, + "grad_norm": 1104.0040283203125, + "learning_rate": 4.221653666695198e-05, + "loss": 3.0739, + "step": 1995 + }, + { + "epoch": 0.5626101049961243, + "grad_norm": 652.0103149414062, + "learning_rate": 4.2171525484211435e-05, + "loss": 3.1572, + "step": 1996 + }, + { + "epoch": 0.5628919737862025, + "grad_norm": 1216.0081787109375, + "learning_rate": 4.212652080417945e-05, + "loss": 2.6201, + "step": 1997 + }, + { + "epoch": 0.5631738425762808, + "grad_norm": 1904.0069580078125, + "learning_rate": 4.208152266423909e-05, + "loss": 2.7175, + "step": 1998 + }, + { + "epoch": 0.563455711366359, + "grad_norm": 1936.0108642578125, + "learning_rate": 4.203653110176798e-05, + "loss": 3.2559, + "step": 1999 + }, + { + "epoch": 0.5637375801564372, + "grad_norm": 644.0078125, + "learning_rate": 4.19915461541383e-05, + "loss": 2.7923, + "step": 2000 + }, + { + "epoch": 0.5640194489465153, + "grad_norm": 1416.0062255859375, + "learning_rate": 4.194656785871669e-05, + "loss": 2.8504, + "step": 2001 + }, + { + "epoch": 0.5643013177365936, + "grad_norm": 952.0056762695312, + "learning_rate": 4.1901596252864286e-05, + "loss": 2.9496, + "step": 2002 + }, + { + "epoch": 0.5645831865266718, + "grad_norm": 1336.005615234375, + "learning_rate": 4.1856631373936714e-05, + "loss": 2.777, + "step": 2003 + }, + { + "epoch": 0.56486505531675, + "grad_norm": 1168.0166015625, + "learning_rate": 4.181167325928393e-05, + "loss": 3.5143, + "step": 2004 + }, + { + "epoch": 0.5651469241068283, + "grad_norm": 900.004150390625, + "learning_rate": 4.176672194625035e-05, + "loss": 2.6277, + "step": 2005 + }, + { + "epoch": 0.5654287928969065, + "grad_norm": 648.0143432617188, + "learning_rate": 4.1721777472174676e-05, + "loss": 3.1035, + "step": 2006 + }, + { + "epoch": 0.5657106616869847, + "grad_norm": 1224.0059814453125, + "learning_rate": 4.167683987438997e-05, + "loss": 2.7477, + "step": 2007 + }, + { + "epoch": 0.5659925304770629, + "grad_norm": 948.0054931640625, + "learning_rate": 4.1631909190223564e-05, + "loss": 2.5576, + "step": 2008 + }, + { + "epoch": 0.5662743992671412, + "grad_norm": 1752.0091552734375, + "learning_rate": 4.158698545699704e-05, + "loss": 3.2429, + "step": 2009 + }, + { + "epoch": 0.5665562680572194, + "grad_norm": 1288.006103515625, + "learning_rate": 4.154206871202624e-05, + "loss": 3.1182, + "step": 2010 + }, + { + "epoch": 0.5668381368472976, + "grad_norm": 1224.0057373046875, + "learning_rate": 4.149715899262115e-05, + "loss": 2.7341, + "step": 2011 + }, + { + "epoch": 0.5671200056373759, + "grad_norm": 776.0044555664062, + "learning_rate": 4.145225633608598e-05, + "loss": 2.6745, + "step": 2012 + }, + { + "epoch": 0.567401874427454, + "grad_norm": 780.007080078125, + "learning_rate": 4.140736077971901e-05, + "loss": 2.639, + "step": 2013 + }, + { + "epoch": 0.5676837432175322, + "grad_norm": 1304.00537109375, + "learning_rate": 4.1362472360812684e-05, + "loss": 3.1022, + "step": 2014 + }, + { + "epoch": 0.5679656120076104, + "grad_norm": 1360.005859375, + "learning_rate": 4.131759111665349e-05, + "loss": 2.933, + "step": 2015 + }, + { + "epoch": 0.5682474807976887, + "grad_norm": 656.0061645507812, + "learning_rate": 4.127271708452193e-05, + "loss": 2.5889, + "step": 2016 + }, + { + "epoch": 0.5685293495877669, + "grad_norm": 1240.00390625, + "learning_rate": 4.122785030169256e-05, + "loss": 2.9834, + "step": 2017 + }, + { + "epoch": 0.5688112183778451, + "grad_norm": 808.0081176757812, + "learning_rate": 4.118299080543387e-05, + "loss": 3.2806, + "step": 2018 + }, + { + "epoch": 0.5690930871679233, + "grad_norm": 948.0051879882812, + "learning_rate": 4.113813863300836e-05, + "loss": 3.2842, + "step": 2019 + }, + { + "epoch": 0.5693749559580016, + "grad_norm": 1368.00341796875, + "learning_rate": 4.109329382167237e-05, + "loss": 3.1351, + "step": 2020 + }, + { + "epoch": 0.5696568247480798, + "grad_norm": 772.0045166015625, + "learning_rate": 4.104845640867619e-05, + "loss": 2.7751, + "step": 2021 + }, + { + "epoch": 0.569938693538158, + "grad_norm": 2672.011962890625, + "learning_rate": 4.1003626431263924e-05, + "loss": 2.5973, + "step": 2022 + }, + { + "epoch": 0.5702205623282363, + "grad_norm": 624.0050048828125, + "learning_rate": 4.095880392667349e-05, + "loss": 3.0121, + "step": 2023 + }, + { + "epoch": 0.5705024311183144, + "grad_norm": 624.00439453125, + "learning_rate": 4.091398893213668e-05, + "loss": 2.8757, + "step": 2024 + }, + { + "epoch": 0.5707842999083926, + "grad_norm": 744.0038452148438, + "learning_rate": 4.086918148487896e-05, + "loss": 2.6999, + "step": 2025 + }, + { + "epoch": 0.5710661686984708, + "grad_norm": 1192.0078125, + "learning_rate": 4.0824381622119546e-05, + "loss": 3.0677, + "step": 2026 + }, + { + "epoch": 0.5713480374885491, + "grad_norm": 2480.015869140625, + "learning_rate": 4.077958938107139e-05, + "loss": 3.8145, + "step": 2027 + }, + { + "epoch": 0.5716299062786273, + "grad_norm": 700.005615234375, + "learning_rate": 4.0734804798941065e-05, + "loss": 2.6543, + "step": 2028 + }, + { + "epoch": 0.5719117750687055, + "grad_norm": 720.0064086914062, + "learning_rate": 4.0690027912928816e-05, + "loss": 3.3217, + "step": 2029 + }, + { + "epoch": 0.5721936438587837, + "grad_norm": 660.004638671875, + "learning_rate": 4.0645258760228476e-05, + "loss": 3.1358, + "step": 2030 + }, + { + "epoch": 0.572475512648862, + "grad_norm": 1176.0062255859375, + "learning_rate": 4.060049737802749e-05, + "loss": 3.2321, + "step": 2031 + }, + { + "epoch": 0.5727573814389402, + "grad_norm": 1816.005859375, + "learning_rate": 4.0555743803506777e-05, + "loss": 3.1374, + "step": 2032 + }, + { + "epoch": 0.5730392502290184, + "grad_norm": 908.0046997070312, + "learning_rate": 4.0510998073840836e-05, + "loss": 2.8177, + "step": 2033 + }, + { + "epoch": 0.5733211190190967, + "grad_norm": 1752.0081787109375, + "learning_rate": 4.046626022619765e-05, + "loss": 2.6948, + "step": 2034 + }, + { + "epoch": 0.5736029878091748, + "grad_norm": 1072.0106201171875, + "learning_rate": 4.042153029773861e-05, + "loss": 2.8503, + "step": 2035 + }, + { + "epoch": 0.573884856599253, + "grad_norm": 1560.0069580078125, + "learning_rate": 4.037680832561856e-05, + "loss": 2.96, + "step": 2036 + }, + { + "epoch": 0.5741667253893312, + "grad_norm": 968.00634765625, + "learning_rate": 4.0332094346985715e-05, + "loss": 2.8639, + "step": 2037 + }, + { + "epoch": 0.5744485941794095, + "grad_norm": 932.010009765625, + "learning_rate": 4.028738839898169e-05, + "loss": 3.0401, + "step": 2038 + }, + { + "epoch": 0.5747304629694877, + "grad_norm": 1048.0064697265625, + "learning_rate": 4.0242690518741374e-05, + "loss": 2.6155, + "step": 2039 + }, + { + "epoch": 0.5750123317595659, + "grad_norm": 1912.0106201171875, + "learning_rate": 4.0198000743392995e-05, + "loss": 2.66, + "step": 2040 + }, + { + "epoch": 0.5752942005496441, + "grad_norm": 624.0114135742188, + "learning_rate": 4.015331911005803e-05, + "loss": 2.8805, + "step": 2041 + }, + { + "epoch": 0.5755760693397224, + "grad_norm": 1144.0048828125, + "learning_rate": 4.01086456558512e-05, + "loss": 2.7393, + "step": 2042 + }, + { + "epoch": 0.5758579381298006, + "grad_norm": 1232.009033203125, + "learning_rate": 4.006398041788042e-05, + "loss": 2.918, + "step": 2043 + }, + { + "epoch": 0.5761398069198788, + "grad_norm": 740.0109252929688, + "learning_rate": 4.001932343324683e-05, + "loss": 3.5866, + "step": 2044 + }, + { + "epoch": 0.5764216757099571, + "grad_norm": 556.00830078125, + "learning_rate": 3.997467473904464e-05, + "loss": 3.0388, + "step": 2045 + }, + { + "epoch": 0.5767035445000352, + "grad_norm": 1560.005615234375, + "learning_rate": 3.993003437236123e-05, + "loss": 3.3718, + "step": 2046 + }, + { + "epoch": 0.5769854132901134, + "grad_norm": 1280.0048828125, + "learning_rate": 3.988540237027702e-05, + "loss": 2.6436, + "step": 2047 + }, + { + "epoch": 0.5772672820801916, + "grad_norm": 1320.007568359375, + "learning_rate": 3.984077876986553e-05, + "loss": 2.7279, + "step": 2048 + }, + { + "epoch": 0.5775491508702699, + "grad_norm": 438.010009765625, + "learning_rate": 3.979616360819325e-05, + "loss": 2.6338, + "step": 2049 + }, + { + "epoch": 0.5778310196603481, + "grad_norm": 1336.005615234375, + "learning_rate": 3.97515569223197e-05, + "loss": 3.2194, + "step": 2050 + }, + { + "epoch": 0.5781128884504263, + "grad_norm": 804.0045776367188, + "learning_rate": 3.9706958749297335e-05, + "loss": 2.9059, + "step": 2051 + }, + { + "epoch": 0.5783947572405046, + "grad_norm": 1792.0047607421875, + "learning_rate": 3.9662369126171565e-05, + "loss": 2.7425, + "step": 2052 + }, + { + "epoch": 0.5786766260305828, + "grad_norm": 1136.0028076171875, + "learning_rate": 3.9617788089980655e-05, + "loss": 2.8298, + "step": 2053 + }, + { + "epoch": 0.578958494820661, + "grad_norm": 1312.0074462890625, + "learning_rate": 3.957321567775579e-05, + "loss": 3.2311, + "step": 2054 + }, + { + "epoch": 0.5792403636107392, + "grad_norm": 1176.00830078125, + "learning_rate": 3.9528651926520964e-05, + "loss": 3.1224, + "step": 2055 + }, + { + "epoch": 0.5795222324008175, + "grad_norm": 848.0045776367188, + "learning_rate": 3.9484096873292974e-05, + "loss": 3.013, + "step": 2056 + }, + { + "epoch": 0.5798041011908956, + "grad_norm": 596.0038452148438, + "learning_rate": 3.943955055508138e-05, + "loss": 3.0854, + "step": 2057 + }, + { + "epoch": 0.5800859699809738, + "grad_norm": 956.0054931640625, + "learning_rate": 3.939501300888854e-05, + "loss": 3.0863, + "step": 2058 + }, + { + "epoch": 0.580367838771052, + "grad_norm": 1120.0037841796875, + "learning_rate": 3.9350484271709445e-05, + "loss": 3.0602, + "step": 2059 + }, + { + "epoch": 0.5806497075611303, + "grad_norm": 1040.0045166015625, + "learning_rate": 3.930596438053184e-05, + "loss": 2.9988, + "step": 2060 + }, + { + "epoch": 0.5809315763512085, + "grad_norm": 1160.05859375, + "learning_rate": 3.926145337233608e-05, + "loss": 2.7175, + "step": 2061 + }, + { + "epoch": 0.5812134451412867, + "grad_norm": 1512.005126953125, + "learning_rate": 3.921695128409517e-05, + "loss": 2.7982, + "step": 2062 + }, + { + "epoch": 0.581495313931365, + "grad_norm": 860.0050048828125, + "learning_rate": 3.917245815277468e-05, + "loss": 2.7477, + "step": 2063 + }, + { + "epoch": 0.5817771827214432, + "grad_norm": 1496.005126953125, + "learning_rate": 3.912797401533274e-05, + "loss": 2.9538, + "step": 2064 + }, + { + "epoch": 0.5820590515115214, + "grad_norm": 788.00390625, + "learning_rate": 3.9083498908720054e-05, + "loss": 3.1351, + "step": 2065 + }, + { + "epoch": 0.5823409203015996, + "grad_norm": 502.00482177734375, + "learning_rate": 3.903903286987976e-05, + "loss": 2.7464, + "step": 2066 + }, + { + "epoch": 0.5826227890916779, + "grad_norm": 960.0033569335938, + "learning_rate": 3.8994575935747525e-05, + "loss": 2.5693, + "step": 2067 + }, + { + "epoch": 0.582904657881756, + "grad_norm": 1320.005859375, + "learning_rate": 3.8950128143251386e-05, + "loss": 2.7813, + "step": 2068 + }, + { + "epoch": 0.5831865266718342, + "grad_norm": 1232.009765625, + "learning_rate": 3.890568952931185e-05, + "loss": 2.7005, + "step": 2069 + }, + { + "epoch": 0.5834683954619124, + "grad_norm": 356.0039978027344, + "learning_rate": 3.886126013084177e-05, + "loss": 2.7188, + "step": 2070 + }, + { + "epoch": 0.5837502642519907, + "grad_norm": 1544.0059814453125, + "learning_rate": 3.881683998474633e-05, + "loss": 3.0684, + "step": 2071 + }, + { + "epoch": 0.5840321330420689, + "grad_norm": 1160.0059814453125, + "learning_rate": 3.8772429127923084e-05, + "loss": 2.7702, + "step": 2072 + }, + { + "epoch": 0.5843140018321471, + "grad_norm": 1440.0052490234375, + "learning_rate": 3.872802759726178e-05, + "loss": 3.848, + "step": 2073 + }, + { + "epoch": 0.5845958706222254, + "grad_norm": 1440.0030517578125, + "learning_rate": 3.868363542964449e-05, + "loss": 2.8932, + "step": 2074 + }, + { + "epoch": 0.5848777394123036, + "grad_norm": 2080.00927734375, + "learning_rate": 3.863925266194553e-05, + "loss": 3.2412, + "step": 2075 + }, + { + "epoch": 0.5851596082023818, + "grad_norm": 354.0072326660156, + "learning_rate": 3.859487933103132e-05, + "loss": 2.9753, + "step": 2076 + }, + { + "epoch": 0.58544147699246, + "grad_norm": 254.03451538085938, + "learning_rate": 3.8550515473760514e-05, + "loss": 3.0898, + "step": 2077 + }, + { + "epoch": 0.5857233457825383, + "grad_norm": 350.006591796875, + "learning_rate": 3.850616112698385e-05, + "loss": 3.0794, + "step": 2078 + }, + { + "epoch": 0.5860052145726165, + "grad_norm": 1320.0050048828125, + "learning_rate": 3.846181632754422e-05, + "loss": 3.1517, + "step": 2079 + }, + { + "epoch": 0.5862870833626946, + "grad_norm": 364.0078125, + "learning_rate": 3.841748111227652e-05, + "loss": 3.1686, + "step": 2080 + }, + { + "epoch": 0.5865689521527729, + "grad_norm": 724.006103515625, + "learning_rate": 3.837315551800774e-05, + "loss": 2.7631, + "step": 2081 + }, + { + "epoch": 0.5868508209428511, + "grad_norm": 1392.0404052734375, + "learning_rate": 3.832883958155684e-05, + "loss": 3.0404, + "step": 2082 + }, + { + "epoch": 0.5871326897329293, + "grad_norm": 322.0069274902344, + "learning_rate": 3.8284533339734804e-05, + "loss": 2.7894, + "step": 2083 + }, + { + "epoch": 0.5874145585230075, + "grad_norm": 1304.008056640625, + "learning_rate": 3.8240236829344486e-05, + "loss": 2.6524, + "step": 2084 + }, + { + "epoch": 0.5876964273130858, + "grad_norm": 948.0071411132812, + "learning_rate": 3.8195950087180746e-05, + "loss": 2.49, + "step": 2085 + }, + { + "epoch": 0.587978296103164, + "grad_norm": 1280.0091552734375, + "learning_rate": 3.815167315003028e-05, + "loss": 2.4369, + "step": 2086 + }, + { + "epoch": 0.5882601648932422, + "grad_norm": 1504.0035400390625, + "learning_rate": 3.8107406054671643e-05, + "loss": 2.4249, + "step": 2087 + }, + { + "epoch": 0.5885420336833204, + "grad_norm": 1584.0096435546875, + "learning_rate": 3.8063148837875216e-05, + "loss": 3.0975, + "step": 2088 + }, + { + "epoch": 0.5888239024733987, + "grad_norm": 1576.007568359375, + "learning_rate": 3.8018901536403196e-05, + "loss": 2.9268, + "step": 2089 + }, + { + "epoch": 0.5891057712634769, + "grad_norm": 2192.0087890625, + "learning_rate": 3.797466418700949e-05, + "loss": 2.9437, + "step": 2090 + }, + { + "epoch": 0.589387640053555, + "grad_norm": 472.0064392089844, + "learning_rate": 3.793043682643981e-05, + "loss": 2.4177, + "step": 2091 + }, + { + "epoch": 0.5896695088436333, + "grad_norm": 1480.0130615234375, + "learning_rate": 3.7886219491431516e-05, + "loss": 2.8613, + "step": 2092 + }, + { + "epoch": 0.5899513776337115, + "grad_norm": 1240.00634765625, + "learning_rate": 3.784201221871367e-05, + "loss": 2.793, + "step": 2093 + }, + { + "epoch": 0.5902332464237897, + "grad_norm": 868.0081787109375, + "learning_rate": 3.779781504500695e-05, + "loss": 2.7347, + "step": 2094 + }, + { + "epoch": 0.5905151152138679, + "grad_norm": 908.0072021484375, + "learning_rate": 3.7753628007023665e-05, + "loss": 3.0039, + "step": 2095 + }, + { + "epoch": 0.5907969840039462, + "grad_norm": 478.0140686035156, + "learning_rate": 3.770945114146773e-05, + "loss": 3.222, + "step": 2096 + }, + { + "epoch": 0.5910788527940244, + "grad_norm": 1352.01123046875, + "learning_rate": 3.766528448503454e-05, + "loss": 3.1402, + "step": 2097 + }, + { + "epoch": 0.5913607215841026, + "grad_norm": 968.0086669921875, + "learning_rate": 3.762112807441108e-05, + "loss": 2.8855, + "step": 2098 + }, + { + "epoch": 0.5916425903741808, + "grad_norm": 478.0137023925781, + "learning_rate": 3.757698194627577e-05, + "loss": 2.6019, + "step": 2099 + }, + { + "epoch": 0.5919244591642591, + "grad_norm": 1528.0169677734375, + "learning_rate": 3.7532846137298525e-05, + "loss": 3.2087, + "step": 2100 + }, + { + "epoch": 0.5922063279543373, + "grad_norm": 1696.0042724609375, + "learning_rate": 3.7488720684140685e-05, + "loss": 3.098, + "step": 2101 + }, + { + "epoch": 0.5924881967444154, + "grad_norm": 1208.0064697265625, + "learning_rate": 3.744460562345497e-05, + "loss": 2.363, + "step": 2102 + }, + { + "epoch": 0.5927700655344937, + "grad_norm": 1216.00634765625, + "learning_rate": 3.7400500991885484e-05, + "loss": 2.7627, + "step": 2103 + }, + { + "epoch": 0.5930519343245719, + "grad_norm": 804.0088500976562, + "learning_rate": 3.735640682606764e-05, + "loss": 3.0921, + "step": 2104 + }, + { + "epoch": 0.5933338031146501, + "grad_norm": 984.0064086914062, + "learning_rate": 3.731232316262819e-05, + "loss": 2.9287, + "step": 2105 + }, + { + "epoch": 0.5936156719047283, + "grad_norm": 1208.0068359375, + "learning_rate": 3.726825003818518e-05, + "loss": 2.3774, + "step": 2106 + }, + { + "epoch": 0.5938975406948066, + "grad_norm": 1012.0074462890625, + "learning_rate": 3.722418748934785e-05, + "loss": 2.9697, + "step": 2107 + }, + { + "epoch": 0.5941794094848848, + "grad_norm": 1080.0064697265625, + "learning_rate": 3.7180135552716675e-05, + "loss": 2.6261, + "step": 2108 + }, + { + "epoch": 0.594461278274963, + "grad_norm": 2080.013427734375, + "learning_rate": 3.713609426488331e-05, + "loss": 3.4629, + "step": 2109 + }, + { + "epoch": 0.5947431470650413, + "grad_norm": 1200.0078125, + "learning_rate": 3.709206366243061e-05, + "loss": 2.8561, + "step": 2110 + }, + { + "epoch": 0.5950250158551195, + "grad_norm": 1192.008056640625, + "learning_rate": 3.704804378193248e-05, + "loss": 2.9942, + "step": 2111 + }, + { + "epoch": 0.5953068846451977, + "grad_norm": 568.0140991210938, + "learning_rate": 3.700403465995398e-05, + "loss": 3.0069, + "step": 2112 + }, + { + "epoch": 0.5955887534352758, + "grad_norm": 892.0103759765625, + "learning_rate": 3.6960036333051183e-05, + "loss": 2.8041, + "step": 2113 + }, + { + "epoch": 0.5958706222253541, + "grad_norm": 1424.004150390625, + "learning_rate": 3.6916048837771253e-05, + "loss": 2.6019, + "step": 2114 + }, + { + "epoch": 0.5961524910154323, + "grad_norm": 1272.0111083984375, + "learning_rate": 3.687207221065229e-05, + "loss": 2.5645, + "step": 2115 + }, + { + "epoch": 0.5964343598055105, + "grad_norm": 1496.008056640625, + "learning_rate": 3.682810648822343e-05, + "loss": 2.7253, + "step": 2116 + }, + { + "epoch": 0.5967162285955887, + "grad_norm": 1104.0074462890625, + "learning_rate": 3.6784151707004725e-05, + "loss": 2.9515, + "step": 2117 + }, + { + "epoch": 0.596998097385667, + "grad_norm": 860.009765625, + "learning_rate": 3.674020790350713e-05, + "loss": 2.7256, + "step": 2118 + }, + { + "epoch": 0.5972799661757452, + "grad_norm": 756.0098266601562, + "learning_rate": 3.669627511423247e-05, + "loss": 3.1517, + "step": 2119 + }, + { + "epoch": 0.5975618349658234, + "grad_norm": 832.0039672851562, + "learning_rate": 3.665235337567348e-05, + "loss": 2.35, + "step": 2120 + }, + { + "epoch": 0.5978437037559017, + "grad_norm": 936.010009765625, + "learning_rate": 3.6608442724313616e-05, + "loss": 2.9974, + "step": 2121 + }, + { + "epoch": 0.5981255725459799, + "grad_norm": 1032.007080078125, + "learning_rate": 3.6564543196627236e-05, + "loss": 2.5326, + "step": 2122 + }, + { + "epoch": 0.598407441336058, + "grad_norm": 884.0087890625, + "learning_rate": 3.6520654829079384e-05, + "loss": 2.9212, + "step": 2123 + }, + { + "epoch": 0.5986893101261362, + "grad_norm": 1024.070068359375, + "learning_rate": 3.647677765812585e-05, + "loss": 3.292, + "step": 2124 + }, + { + "epoch": 0.5989711789162145, + "grad_norm": 1576.0106201171875, + "learning_rate": 3.6432911720213126e-05, + "loss": 2.8662, + "step": 2125 + }, + { + "epoch": 0.5992530477062927, + "grad_norm": 820.0065307617188, + "learning_rate": 3.638905705177839e-05, + "loss": 2.7761, + "step": 2126 + }, + { + "epoch": 0.5995349164963709, + "grad_norm": 1392.011474609375, + "learning_rate": 3.634521368924946e-05, + "loss": 3.0153, + "step": 2127 + }, + { + "epoch": 0.5998167852864491, + "grad_norm": 2256.010986328125, + "learning_rate": 3.630138166904471e-05, + "loss": 3.2048, + "step": 2128 + }, + { + "epoch": 0.6000986540765274, + "grad_norm": 880.0108032226562, + "learning_rate": 3.6257561027573155e-05, + "loss": 2.5928, + "step": 2129 + }, + { + "epoch": 0.6003805228666056, + "grad_norm": 2128.012451171875, + "learning_rate": 3.6213751801234324e-05, + "loss": 3.4342, + "step": 2130 + }, + { + "epoch": 0.6006623916566838, + "grad_norm": 1184.0118408203125, + "learning_rate": 3.616995402641828e-05, + "loss": 3.1276, + "step": 2131 + }, + { + "epoch": 0.6009442604467621, + "grad_norm": 1120.0125732421875, + "learning_rate": 3.612616773950557e-05, + "loss": 2.4675, + "step": 2132 + }, + { + "epoch": 0.6012261292368403, + "grad_norm": 688.0154418945312, + "learning_rate": 3.6082392976867184e-05, + "loss": 2.599, + "step": 2133 + }, + { + "epoch": 0.6015079980269185, + "grad_norm": 1232.08837890625, + "learning_rate": 3.6038629774864565e-05, + "loss": 2.9425, + "step": 2134 + }, + { + "epoch": 0.6017898668169966, + "grad_norm": 1816.0093994140625, + "learning_rate": 3.599487816984951e-05, + "loss": 2.8147, + "step": 2135 + }, + { + "epoch": 0.6020717356070749, + "grad_norm": 1864.029052734375, + "learning_rate": 3.5951138198164245e-05, + "loss": 2.7156, + "step": 2136 + }, + { + "epoch": 0.6023536043971531, + "grad_norm": 1032.0191650390625, + "learning_rate": 3.590740989614131e-05, + "loss": 2.8191, + "step": 2137 + }, + { + "epoch": 0.6026354731872313, + "grad_norm": 752.0215454101562, + "learning_rate": 3.586369330010351e-05, + "loss": 2.8916, + "step": 2138 + }, + { + "epoch": 0.6029173419773096, + "grad_norm": 776.0130615234375, + "learning_rate": 3.5819988446363974e-05, + "loss": 2.6112, + "step": 2139 + }, + { + "epoch": 0.6031992107673878, + "grad_norm": 1688.008056640625, + "learning_rate": 3.577629537122605e-05, + "loss": 2.2537, + "step": 2140 + }, + { + "epoch": 0.603481079557466, + "grad_norm": 872.0235595703125, + "learning_rate": 3.5732614110983335e-05, + "loss": 3.3888, + "step": 2141 + }, + { + "epoch": 0.6037629483475442, + "grad_norm": 660.0189208984375, + "learning_rate": 3.568894470191957e-05, + "loss": 3.155, + "step": 2142 + }, + { + "epoch": 0.6040448171376225, + "grad_norm": 1176.010498046875, + "learning_rate": 3.5645287180308696e-05, + "loss": 3.1503, + "step": 2143 + }, + { + "epoch": 0.6043266859277007, + "grad_norm": 988.0114135742188, + "learning_rate": 3.5601641582414725e-05, + "loss": 2.8781, + "step": 2144 + }, + { + "epoch": 0.6046085547177789, + "grad_norm": 1400.00927734375, + "learning_rate": 3.5558007944491806e-05, + "loss": 3.6556, + "step": 2145 + }, + { + "epoch": 0.604890423507857, + "grad_norm": 920.010498046875, + "learning_rate": 3.551438630278417e-05, + "loss": 2.4749, + "step": 2146 + }, + { + "epoch": 0.6051722922979353, + "grad_norm": 2080.013671875, + "learning_rate": 3.5470776693526034e-05, + "loss": 3.2989, + "step": 2147 + }, + { + "epoch": 0.6054541610880135, + "grad_norm": 748.009521484375, + "learning_rate": 3.5427179152941646e-05, + "loss": 2.2491, + "step": 2148 + }, + { + "epoch": 0.6057360298780917, + "grad_norm": 1144.0076904296875, + "learning_rate": 3.5383593717245225e-05, + "loss": 2.9024, + "step": 2149 + }, + { + "epoch": 0.60601789866817, + "grad_norm": 2368.012451171875, + "learning_rate": 3.534002042264094e-05, + "loss": 3.6938, + "step": 2150 + }, + { + "epoch": 0.6062997674582482, + "grad_norm": 912.0098266601562, + "learning_rate": 3.529645930532287e-05, + "loss": 2.8011, + "step": 2151 + }, + { + "epoch": 0.6065816362483264, + "grad_norm": 500.0059509277344, + "learning_rate": 3.525291040147498e-05, + "loss": 2.9528, + "step": 2152 + }, + { + "epoch": 0.6068635050384046, + "grad_norm": 1520.005126953125, + "learning_rate": 3.520937374727109e-05, + "loss": 3.1751, + "step": 2153 + }, + { + "epoch": 0.6071453738284829, + "grad_norm": 512.0035400390625, + "learning_rate": 3.5165849378874844e-05, + "loss": 2.9746, + "step": 2154 + }, + { + "epoch": 0.6074272426185611, + "grad_norm": 796.0040283203125, + "learning_rate": 3.512233733243967e-05, + "loss": 3.0843, + "step": 2155 + }, + { + "epoch": 0.6077091114086393, + "grad_norm": 2144.007080078125, + "learning_rate": 3.5078837644108805e-05, + "loss": 3.491, + "step": 2156 + }, + { + "epoch": 0.6079909801987174, + "grad_norm": 1576.0042724609375, + "learning_rate": 3.503535035001516e-05, + "loss": 3.1332, + "step": 2157 + }, + { + "epoch": 0.6082728489887957, + "grad_norm": 1400.00390625, + "learning_rate": 3.49918754862814e-05, + "loss": 2.9727, + "step": 2158 + }, + { + "epoch": 0.6085547177788739, + "grad_norm": 2080.00927734375, + "learning_rate": 3.4948413089019817e-05, + "loss": 2.7998, + "step": 2159 + }, + { + "epoch": 0.6088365865689521, + "grad_norm": 1400.0030517578125, + "learning_rate": 3.490496319433241e-05, + "loss": 2.8369, + "step": 2160 + }, + { + "epoch": 0.6091184553590304, + "grad_norm": 1912.00439453125, + "learning_rate": 3.4861525838310724e-05, + "loss": 2.8064, + "step": 2161 + }, + { + "epoch": 0.6094003241491086, + "grad_norm": 1640.005126953125, + "learning_rate": 3.481810105703595e-05, + "loss": 2.9308, + "step": 2162 + }, + { + "epoch": 0.6096821929391868, + "grad_norm": 1152.006103515625, + "learning_rate": 3.477468888657877e-05, + "loss": 2.8926, + "step": 2163 + }, + { + "epoch": 0.609964061729265, + "grad_norm": 1088.0045166015625, + "learning_rate": 3.473128936299947e-05, + "loss": 2.8812, + "step": 2164 + }, + { + "epoch": 0.6102459305193433, + "grad_norm": 1004.0040283203125, + "learning_rate": 3.4687902522347765e-05, + "loss": 2.5765, + "step": 2165 + }, + { + "epoch": 0.6105277993094215, + "grad_norm": 1240.0057373046875, + "learning_rate": 3.464452840066284e-05, + "loss": 3.0796, + "step": 2166 + }, + { + "epoch": 0.6108096680994997, + "grad_norm": 1416.0037841796875, + "learning_rate": 3.460116703397336e-05, + "loss": 2.7996, + "step": 2167 + }, + { + "epoch": 0.6110915368895778, + "grad_norm": 1344.005126953125, + "learning_rate": 3.455781845829737e-05, + "loss": 2.681, + "step": 2168 + }, + { + "epoch": 0.6113734056796561, + "grad_norm": 1496.0028076171875, + "learning_rate": 3.451448270964228e-05, + "loss": 2.6315, + "step": 2169 + }, + { + "epoch": 0.6116552744697343, + "grad_norm": 1792.008056640625, + "learning_rate": 3.447115982400485e-05, + "loss": 2.865, + "step": 2170 + }, + { + "epoch": 0.6119371432598125, + "grad_norm": 246.00914001464844, + "learning_rate": 3.442784983737116e-05, + "loss": 3.416, + "step": 2171 + }, + { + "epoch": 0.6122190120498908, + "grad_norm": 1064.003173828125, + "learning_rate": 3.4384552785716576e-05, + "loss": 2.9225, + "step": 2172 + }, + { + "epoch": 0.612500880839969, + "grad_norm": 512.0037841796875, + "learning_rate": 3.434126870500571e-05, + "loss": 2.8194, + "step": 2173 + }, + { + "epoch": 0.6127827496300472, + "grad_norm": 952.0037841796875, + "learning_rate": 3.4297997631192426e-05, + "loss": 2.8907, + "step": 2174 + }, + { + "epoch": 0.6130646184201254, + "grad_norm": 1600.0068359375, + "learning_rate": 3.425473960021974e-05, + "loss": 3.0999, + "step": 2175 + }, + { + "epoch": 0.6133464872102037, + "grad_norm": 1512.0054931640625, + "learning_rate": 3.421149464801986e-05, + "loss": 3.0127, + "step": 2176 + }, + { + "epoch": 0.6136283560002819, + "grad_norm": 1344.0054931640625, + "learning_rate": 3.4168262810514164e-05, + "loss": 3.4206, + "step": 2177 + }, + { + "epoch": 0.6139102247903601, + "grad_norm": 664.003662109375, + "learning_rate": 3.412504412361307e-05, + "loss": 3.033, + "step": 2178 + }, + { + "epoch": 0.6141920935804384, + "grad_norm": 548.0051879882812, + "learning_rate": 3.408183862321612e-05, + "loss": 2.7285, + "step": 2179 + }, + { + "epoch": 0.6144739623705165, + "grad_norm": 1496.005615234375, + "learning_rate": 3.403864634521188e-05, + "loss": 3.1283, + "step": 2180 + }, + { + "epoch": 0.6147558311605947, + "grad_norm": 444.0041809082031, + "learning_rate": 3.399546732547794e-05, + "loss": 3.0421, + "step": 2181 + }, + { + "epoch": 0.6150376999506729, + "grad_norm": 756.0048828125, + "learning_rate": 3.3952301599880876e-05, + "loss": 2.5978, + "step": 2182 + }, + { + "epoch": 0.6153195687407512, + "grad_norm": 952.00830078125, + "learning_rate": 3.3909149204276216e-05, + "loss": 2.6306, + "step": 2183 + }, + { + "epoch": 0.6156014375308294, + "grad_norm": 1152.00439453125, + "learning_rate": 3.386601017450844e-05, + "loss": 2.4718, + "step": 2184 + }, + { + "epoch": 0.6158833063209076, + "grad_norm": 900.0059814453125, + "learning_rate": 3.3822884546410885e-05, + "loss": 2.7696, + "step": 2185 + }, + { + "epoch": 0.6161651751109858, + "grad_norm": 150.02078247070312, + "learning_rate": 3.377977235580577e-05, + "loss": 2.8985, + "step": 2186 + }, + { + "epoch": 0.6164470439010641, + "grad_norm": 1296.0050048828125, + "learning_rate": 3.3736673638504215e-05, + "loss": 3.3751, + "step": 2187 + }, + { + "epoch": 0.6167289126911423, + "grad_norm": 772.0090942382812, + "learning_rate": 3.369358843030603e-05, + "loss": 2.5726, + "step": 2188 + }, + { + "epoch": 0.6170107814812205, + "grad_norm": 1232.0089111328125, + "learning_rate": 3.365051676699991e-05, + "loss": 2.6001, + "step": 2189 + }, + { + "epoch": 0.6172926502712988, + "grad_norm": 984.0090942382812, + "learning_rate": 3.3607458684363236e-05, + "loss": 2.7107, + "step": 2190 + }, + { + "epoch": 0.617574519061377, + "grad_norm": 1096.0072021484375, + "learning_rate": 3.3564414218162136e-05, + "loss": 2.791, + "step": 2191 + }, + { + "epoch": 0.6178563878514551, + "grad_norm": 536.0127563476562, + "learning_rate": 3.352138340415141e-05, + "loss": 2.6521, + "step": 2192 + }, + { + "epoch": 0.6181382566415333, + "grad_norm": 436.0067443847656, + "learning_rate": 3.347836627807455e-05, + "loss": 2.6436, + "step": 2193 + }, + { + "epoch": 0.6184201254316116, + "grad_norm": 992.0076293945312, + "learning_rate": 3.343536287566362e-05, + "loss": 2.8689, + "step": 2194 + }, + { + "epoch": 0.6187019942216898, + "grad_norm": 920.0076293945312, + "learning_rate": 3.3392373232639356e-05, + "loss": 2.9346, + "step": 2195 + }, + { + "epoch": 0.618983863011768, + "grad_norm": 636.0073852539062, + "learning_rate": 3.3349397384710995e-05, + "loss": 2.5716, + "step": 2196 + }, + { + "epoch": 0.6192657318018462, + "grad_norm": 1720.0086669921875, + "learning_rate": 3.330643536757638e-05, + "loss": 2.8826, + "step": 2197 + }, + { + "epoch": 0.6195476005919245, + "grad_norm": 932.0076293945312, + "learning_rate": 3.3263487216921826e-05, + "loss": 2.6341, + "step": 2198 + }, + { + "epoch": 0.6198294693820027, + "grad_norm": 1012.0070190429688, + "learning_rate": 3.322055296842215e-05, + "loss": 2.5262, + "step": 2199 + }, + { + "epoch": 0.6201113381720809, + "grad_norm": 1832.0228271484375, + "learning_rate": 3.317763265774058e-05, + "loss": 3.7995, + "step": 2200 + }, + { + "epoch": 0.6203932069621592, + "grad_norm": 1272.00341796875, + "learning_rate": 3.313472632052882e-05, + "loss": 2.641, + "step": 2201 + }, + { + "epoch": 0.6206750757522373, + "grad_norm": 1384.0057373046875, + "learning_rate": 3.309183399242693e-05, + "loss": 2.7773, + "step": 2202 + }, + { + "epoch": 0.6209569445423155, + "grad_norm": 278.02728271484375, + "learning_rate": 3.304895570906336e-05, + "loss": 3.1344, + "step": 2203 + }, + { + "epoch": 0.6212388133323937, + "grad_norm": 1264.004638671875, + "learning_rate": 3.300609150605487e-05, + "loss": 2.5921, + "step": 2204 + }, + { + "epoch": 0.621520682122472, + "grad_norm": 1896.0050048828125, + "learning_rate": 3.296324141900654e-05, + "loss": 2.2904, + "step": 2205 + }, + { + "epoch": 0.6218025509125502, + "grad_norm": 940.0048217773438, + "learning_rate": 3.29204054835117e-05, + "loss": 2.78, + "step": 2206 + }, + { + "epoch": 0.6220844197026284, + "grad_norm": 1064.0093994140625, + "learning_rate": 3.2877583735151976e-05, + "loss": 2.6469, + "step": 2207 + }, + { + "epoch": 0.6223662884927067, + "grad_norm": 1176.0057373046875, + "learning_rate": 3.283477620949717e-05, + "loss": 2.9987, + "step": 2208 + }, + { + "epoch": 0.6226481572827849, + "grad_norm": 1272.0057373046875, + "learning_rate": 3.2791982942105265e-05, + "loss": 3.7103, + "step": 2209 + }, + { + "epoch": 0.6229300260728631, + "grad_norm": 1160.003173828125, + "learning_rate": 3.2749203968522425e-05, + "loss": 2.7028, + "step": 2210 + }, + { + "epoch": 0.6232118948629413, + "grad_norm": 592.0036010742188, + "learning_rate": 3.270643932428292e-05, + "loss": 2.1086, + "step": 2211 + }, + { + "epoch": 0.6234937636530196, + "grad_norm": 1936.007080078125, + "learning_rate": 3.266368904490914e-05, + "loss": 3.0723, + "step": 2212 + }, + { + "epoch": 0.6237756324430977, + "grad_norm": 812.0068969726562, + "learning_rate": 3.262095316591152e-05, + "loss": 3.3236, + "step": 2213 + }, + { + "epoch": 0.6240575012331759, + "grad_norm": 568.0043334960938, + "learning_rate": 3.257823172278852e-05, + "loss": 2.8096, + "step": 2214 + }, + { + "epoch": 0.6243393700232541, + "grad_norm": 556.0076293945312, + "learning_rate": 3.253552475102668e-05, + "loss": 3.1458, + "step": 2215 + }, + { + "epoch": 0.6246212388133324, + "grad_norm": 1160.0069580078125, + "learning_rate": 3.249283228610043e-05, + "loss": 3.2256, + "step": 2216 + }, + { + "epoch": 0.6249031076034106, + "grad_norm": 816.00537109375, + "learning_rate": 3.245015436347219e-05, + "loss": 3.1397, + "step": 2217 + }, + { + "epoch": 0.6251849763934888, + "grad_norm": 964.0050048828125, + "learning_rate": 3.240749101859234e-05, + "loss": 3.2256, + "step": 2218 + }, + { + "epoch": 0.6254668451835671, + "grad_norm": 604.0169067382812, + "learning_rate": 3.236484228689908e-05, + "loss": 3.0091, + "step": 2219 + }, + { + "epoch": 0.6257487139736453, + "grad_norm": 1072.0048828125, + "learning_rate": 3.232220820381852e-05, + "loss": 2.8796, + "step": 2220 + }, + { + "epoch": 0.6260305827637235, + "grad_norm": 1072.0068359375, + "learning_rate": 3.227958880476457e-05, + "loss": 2.9294, + "step": 2221 + }, + { + "epoch": 0.6263124515538017, + "grad_norm": 1560.00732421875, + "learning_rate": 3.223698412513898e-05, + "loss": 3.0938, + "step": 2222 + }, + { + "epoch": 0.62659432034388, + "grad_norm": 1152.00537109375, + "learning_rate": 3.219439420033123e-05, + "loss": 2.5739, + "step": 2223 + }, + { + "epoch": 0.6268761891339582, + "grad_norm": 748.0654296875, + "learning_rate": 3.215181906571858e-05, + "loss": 2.9538, + "step": 2224 + }, + { + "epoch": 0.6271580579240363, + "grad_norm": 664.017578125, + "learning_rate": 3.210925875666598e-05, + "loss": 3.0661, + "step": 2225 + }, + { + "epoch": 0.6274399267141145, + "grad_norm": 764.003662109375, + "learning_rate": 3.20667133085261e-05, + "loss": 2.6764, + "step": 2226 + }, + { + "epoch": 0.6277217955041928, + "grad_norm": 760.005126953125, + "learning_rate": 3.2024182756639185e-05, + "loss": 2.9509, + "step": 2227 + }, + { + "epoch": 0.628003664294271, + "grad_norm": 984.0048828125, + "learning_rate": 3.198166713633323e-05, + "loss": 2.6888, + "step": 2228 + }, + { + "epoch": 0.6282855330843492, + "grad_norm": 1008.0072631835938, + "learning_rate": 3.1939166482923734e-05, + "loss": 2.9902, + "step": 2229 + }, + { + "epoch": 0.6285674018744275, + "grad_norm": 290.0063781738281, + "learning_rate": 3.189668083171379e-05, + "loss": 3.1797, + "step": 2230 + }, + { + "epoch": 0.6288492706645057, + "grad_norm": 1240.005615234375, + "learning_rate": 3.185421021799401e-05, + "loss": 2.9873, + "step": 2231 + }, + { + "epoch": 0.6291311394545839, + "grad_norm": 1944.0062255859375, + "learning_rate": 3.181175467704256e-05, + "loss": 3.0094, + "step": 2232 + }, + { + "epoch": 0.6294130082446621, + "grad_norm": 2656.0107421875, + "learning_rate": 3.176931424412505e-05, + "loss": 3.4203, + "step": 2233 + }, + { + "epoch": 0.6296948770347404, + "grad_norm": 1328.0076904296875, + "learning_rate": 3.1726888954494564e-05, + "loss": 3.2144, + "step": 2234 + }, + { + "epoch": 0.6299767458248186, + "grad_norm": 2880.014404296875, + "learning_rate": 3.168447884339157e-05, + "loss": 3.2666, + "step": 2235 + }, + { + "epoch": 0.6302586146148967, + "grad_norm": 888.0064697265625, + "learning_rate": 3.164208394604398e-05, + "loss": 2.6494, + "step": 2236 + }, + { + "epoch": 0.630540483404975, + "grad_norm": 584.0074462890625, + "learning_rate": 3.1599704297666994e-05, + "loss": 2.8735, + "step": 2237 + }, + { + "epoch": 0.6308223521950532, + "grad_norm": 1592.0133056640625, + "learning_rate": 3.155733993346324e-05, + "loss": 2.8047, + "step": 2238 + }, + { + "epoch": 0.6311042209851314, + "grad_norm": 356.0132141113281, + "learning_rate": 3.1514990888622595e-05, + "loss": 2.8876, + "step": 2239 + }, + { + "epoch": 0.6313860897752096, + "grad_norm": 1224.009033203125, + "learning_rate": 3.1472657198322204e-05, + "loss": 2.8226, + "step": 2240 + }, + { + "epoch": 0.6316679585652879, + "grad_norm": 1248.022216796875, + "learning_rate": 3.143033889772649e-05, + "loss": 2.5707, + "step": 2241 + }, + { + "epoch": 0.6319498273553661, + "grad_norm": 1944.012451171875, + "learning_rate": 3.138803602198704e-05, + "loss": 2.6834, + "step": 2242 + }, + { + "epoch": 0.6322316961454443, + "grad_norm": 1144.007080078125, + "learning_rate": 3.134574860624271e-05, + "loss": 3.0437, + "step": 2243 + }, + { + "epoch": 0.6325135649355225, + "grad_norm": 556.0115356445312, + "learning_rate": 3.130347668561944e-05, + "loss": 2.6123, + "step": 2244 + }, + { + "epoch": 0.6327954337256008, + "grad_norm": 1808.006591796875, + "learning_rate": 3.126122029523031e-05, + "loss": 3.5336, + "step": 2245 + }, + { + "epoch": 0.633077302515679, + "grad_norm": 1240.01025390625, + "learning_rate": 3.121897947017556e-05, + "loss": 3.5306, + "step": 2246 + }, + { + "epoch": 0.6333591713057571, + "grad_norm": 752.0082397460938, + "learning_rate": 3.117675424554241e-05, + "loss": 2.4168, + "step": 2247 + }, + { + "epoch": 0.6336410400958354, + "grad_norm": 1296.007080078125, + "learning_rate": 3.113454465640519e-05, + "loss": 3.1225, + "step": 2248 + }, + { + "epoch": 0.6339229088859136, + "grad_norm": 876.0083618164062, + "learning_rate": 3.109235073782525e-05, + "loss": 3.3187, + "step": 2249 + }, + { + "epoch": 0.6342047776759918, + "grad_norm": 1024.0062255859375, + "learning_rate": 3.1050172524850847e-05, + "loss": 2.4474, + "step": 2250 + }, + { + "epoch": 0.63448664646607, + "grad_norm": 872.0060424804688, + "learning_rate": 3.100801005251727e-05, + "loss": 2.8198, + "step": 2251 + }, + { + "epoch": 0.6347685152561483, + "grad_norm": 1848.0048828125, + "learning_rate": 3.096586335584668e-05, + "loss": 3.1482, + "step": 2252 + }, + { + "epoch": 0.6350503840462265, + "grad_norm": 1200.0067138671875, + "learning_rate": 3.092373246984819e-05, + "loss": 2.9614, + "step": 2253 + }, + { + "epoch": 0.6353322528363047, + "grad_norm": 2144.006103515625, + "learning_rate": 3.0881617429517694e-05, + "loss": 3.6143, + "step": 2254 + }, + { + "epoch": 0.6356141216263829, + "grad_norm": 776.0045776367188, + "learning_rate": 3.083951826983803e-05, + "loss": 3.0091, + "step": 2255 + }, + { + "epoch": 0.6358959904164612, + "grad_norm": 1624.007080078125, + "learning_rate": 3.0797435025778757e-05, + "loss": 3.265, + "step": 2256 + }, + { + "epoch": 0.6361778592065394, + "grad_norm": 1840.00390625, + "learning_rate": 3.075536773229624e-05, + "loss": 3.4577, + "step": 2257 + }, + { + "epoch": 0.6364597279966175, + "grad_norm": 1224.0072021484375, + "learning_rate": 3.071331642433365e-05, + "loss": 2.8535, + "step": 2258 + }, + { + "epoch": 0.6367415967866958, + "grad_norm": 1432.0074462890625, + "learning_rate": 3.0671281136820785e-05, + "loss": 3.1605, + "step": 2259 + }, + { + "epoch": 0.637023465576774, + "grad_norm": 1520.0057373046875, + "learning_rate": 3.0629261904674205e-05, + "loss": 2.4614, + "step": 2260 + }, + { + "epoch": 0.6373053343668522, + "grad_norm": 1528.005859375, + "learning_rate": 3.058725876279711e-05, + "loss": 3.3665, + "step": 2261 + }, + { + "epoch": 0.6375872031569304, + "grad_norm": 249.01097106933594, + "learning_rate": 3.0545271746079327e-05, + "loss": 2.6458, + "step": 2262 + }, + { + "epoch": 0.6378690719470087, + "grad_norm": 728.0046997070312, + "learning_rate": 3.05033008893973e-05, + "loss": 2.9216, + "step": 2263 + }, + { + "epoch": 0.6381509407370869, + "grad_norm": 624.0050048828125, + "learning_rate": 3.0461346227614045e-05, + "loss": 3.2676, + "step": 2264 + }, + { + "epoch": 0.6384328095271651, + "grad_norm": 1056.0040283203125, + "learning_rate": 3.0419407795579146e-05, + "loss": 2.7305, + "step": 2265 + }, + { + "epoch": 0.6387146783172434, + "grad_norm": 636.0043334960938, + "learning_rate": 3.0377485628128653e-05, + "loss": 2.7751, + "step": 2266 + }, + { + "epoch": 0.6389965471073216, + "grad_norm": 1160.0057373046875, + "learning_rate": 3.0335579760085182e-05, + "loss": 2.4941, + "step": 2267 + }, + { + "epoch": 0.6392784158973998, + "grad_norm": 1192.0068359375, + "learning_rate": 3.029369022625772e-05, + "loss": 2.9144, + "step": 2268 + }, + { + "epoch": 0.6395602846874779, + "grad_norm": 1368.0059814453125, + "learning_rate": 3.0251817061441778e-05, + "loss": 2.9678, + "step": 2269 + }, + { + "epoch": 0.6398421534775562, + "grad_norm": 1384.004638671875, + "learning_rate": 3.0209960300419216e-05, + "loss": 2.974, + "step": 2270 + }, + { + "epoch": 0.6401240222676344, + "grad_norm": 2208.00341796875, + "learning_rate": 3.0168119977958266e-05, + "loss": 2.5628, + "step": 2271 + }, + { + "epoch": 0.6404058910577126, + "grad_norm": 600.00537109375, + "learning_rate": 3.0126296128813537e-05, + "loss": 2.5232, + "step": 2272 + }, + { + "epoch": 0.6406877598477908, + "grad_norm": 470.0089416503906, + "learning_rate": 3.0084488787725906e-05, + "loss": 3.7744, + "step": 2273 + }, + { + "epoch": 0.6409696286378691, + "grad_norm": 1000.00537109375, + "learning_rate": 3.0042697989422602e-05, + "loss": 2.8048, + "step": 2274 + }, + { + "epoch": 0.6412514974279473, + "grad_norm": 1944.0074462890625, + "learning_rate": 3.000092376861705e-05, + "loss": 2.9681, + "step": 2275 + }, + { + "epoch": 0.6415333662180255, + "grad_norm": 700.00390625, + "learning_rate": 2.9959166160008955e-05, + "loss": 2.6244, + "step": 2276 + }, + { + "epoch": 0.6418152350081038, + "grad_norm": 1704.0079345703125, + "learning_rate": 2.9917425198284187e-05, + "loss": 3.3307, + "step": 2277 + }, + { + "epoch": 0.642097103798182, + "grad_norm": 1752.0042724609375, + "learning_rate": 2.987570091811479e-05, + "loss": 3.0166, + "step": 2278 + }, + { + "epoch": 0.6423789725882602, + "grad_norm": 1120.0068359375, + "learning_rate": 2.983399335415899e-05, + "loss": 2.8955, + "step": 2279 + }, + { + "epoch": 0.6426608413783383, + "grad_norm": 832.0050048828125, + "learning_rate": 2.97923025410611e-05, + "loss": 2.8831, + "step": 2280 + }, + { + "epoch": 0.6429427101684166, + "grad_norm": 500.0042419433594, + "learning_rate": 2.97506285134515e-05, + "loss": 2.7767, + "step": 2281 + }, + { + "epoch": 0.6432245789584948, + "grad_norm": 1448.0054931640625, + "learning_rate": 2.970897130594667e-05, + "loss": 2.905, + "step": 2282 + }, + { + "epoch": 0.643506447748573, + "grad_norm": 1320.0054931640625, + "learning_rate": 2.9667330953149074e-05, + "loss": 2.5313, + "step": 2283 + }, + { + "epoch": 0.6437883165386512, + "grad_norm": 868.0131225585938, + "learning_rate": 2.9625707489647225e-05, + "loss": 2.7754, + "step": 2284 + }, + { + "epoch": 0.6440701853287295, + "grad_norm": 700.0348510742188, + "learning_rate": 2.958410095001555e-05, + "loss": 3.3214, + "step": 2285 + }, + { + "epoch": 0.6443520541188077, + "grad_norm": 612.0143432617188, + "learning_rate": 2.9542511368814463e-05, + "loss": 2.322, + "step": 2286 + }, + { + "epoch": 0.6446339229088859, + "grad_norm": 516.7572631835938, + "learning_rate": 2.9500938780590275e-05, + "loss": 2.6891, + "step": 2287 + }, + { + "epoch": 0.6449157916989642, + "grad_norm": 664.0189208984375, + "learning_rate": 2.945938321987517e-05, + "loss": 3.0716, + "step": 2288 + }, + { + "epoch": 0.6451976604890424, + "grad_norm": 1448.0135498046875, + "learning_rate": 2.9417844721187238e-05, + "loss": 2.7314, + "step": 2289 + }, + { + "epoch": 0.6454795292791206, + "grad_norm": 700.0070190429688, + "learning_rate": 2.937632331903032e-05, + "loss": 2.5521, + "step": 2290 + }, + { + "epoch": 0.6457613980691987, + "grad_norm": 944.0120849609375, + "learning_rate": 2.9334819047894124e-05, + "loss": 3.0658, + "step": 2291 + }, + { + "epoch": 0.646043266859277, + "grad_norm": 2464.005859375, + "learning_rate": 2.9293331942254075e-05, + "loss": 2.2681, + "step": 2292 + }, + { + "epoch": 0.6463251356493552, + "grad_norm": 972.0101318359375, + "learning_rate": 2.92518620365714e-05, + "loss": 2.7061, + "step": 2293 + }, + { + "epoch": 0.6466070044394334, + "grad_norm": 1152.006103515625, + "learning_rate": 2.9210409365292962e-05, + "loss": 2.6319, + "step": 2294 + }, + { + "epoch": 0.6468888732295116, + "grad_norm": 592.0150146484375, + "learning_rate": 2.9168973962851375e-05, + "loss": 3.0186, + "step": 2295 + }, + { + "epoch": 0.6471707420195899, + "grad_norm": 832.0059204101562, + "learning_rate": 2.9127555863664858e-05, + "loss": 2.1589, + "step": 2296 + }, + { + "epoch": 0.6474526108096681, + "grad_norm": 1128.0079345703125, + "learning_rate": 2.9086155102137313e-05, + "loss": 3.1898, + "step": 2297 + }, + { + "epoch": 0.6477344795997463, + "grad_norm": 840.00830078125, + "learning_rate": 2.904477171265818e-05, + "loss": 3.1878, + "step": 2298 + }, + { + "epoch": 0.6480163483898246, + "grad_norm": 1512.0103759765625, + "learning_rate": 2.900340572960253e-05, + "loss": 2.5744, + "step": 2299 + }, + { + "epoch": 0.6482982171799028, + "grad_norm": 1352.00830078125, + "learning_rate": 2.896205718733094e-05, + "loss": 3.374, + "step": 2300 + }, + { + "epoch": 0.648580085969981, + "grad_norm": 1472.0107421875, + "learning_rate": 2.8920726120189456e-05, + "loss": 2.6642, + "step": 2301 + }, + { + "epoch": 0.6488619547600591, + "grad_norm": 458.00604248046875, + "learning_rate": 2.887941256250972e-05, + "loss": 2.9464, + "step": 2302 + }, + { + "epoch": 0.6491438235501374, + "grad_norm": 1208.0045166015625, + "learning_rate": 2.883811654860874e-05, + "loss": 2.7057, + "step": 2303 + }, + { + "epoch": 0.6494256923402156, + "grad_norm": 912.004150390625, + "learning_rate": 2.8796838112788975e-05, + "loss": 3.0114, + "step": 2304 + }, + { + "epoch": 0.6497075611302938, + "grad_norm": 900.0051879882812, + "learning_rate": 2.8755577289338266e-05, + "loss": 2.8242, + "step": 2305 + }, + { + "epoch": 0.6499894299203721, + "grad_norm": 924.0048828125, + "learning_rate": 2.8714334112529882e-05, + "loss": 3.1618, + "step": 2306 + }, + { + "epoch": 0.6502712987104503, + "grad_norm": 1792.005615234375, + "learning_rate": 2.867310861662238e-05, + "loss": 3.2832, + "step": 2307 + }, + { + "epoch": 0.6505531675005285, + "grad_norm": 1784.0059814453125, + "learning_rate": 2.8631900835859616e-05, + "loss": 3.3936, + "step": 2308 + }, + { + "epoch": 0.6508350362906067, + "grad_norm": 636.0060424804688, + "learning_rate": 2.8590710804470803e-05, + "loss": 2.744, + "step": 2309 + }, + { + "epoch": 0.651116905080685, + "grad_norm": 1320.008056640625, + "learning_rate": 2.8549538556670357e-05, + "loss": 2.778, + "step": 2310 + }, + { + "epoch": 0.6513987738707632, + "grad_norm": 1280.00732421875, + "learning_rate": 2.8508384126657906e-05, + "loss": 3.3975, + "step": 2311 + }, + { + "epoch": 0.6516806426608414, + "grad_norm": 1016.0042114257812, + "learning_rate": 2.8467247548618347e-05, + "loss": 2.8363, + "step": 2312 + }, + { + "epoch": 0.6519625114509195, + "grad_norm": 1264.007568359375, + "learning_rate": 2.8426128856721688e-05, + "loss": 3.0768, + "step": 2313 + }, + { + "epoch": 0.6522443802409978, + "grad_norm": 1104.00537109375, + "learning_rate": 2.8385028085123088e-05, + "loss": 2.6045, + "step": 2314 + }, + { + "epoch": 0.652526249031076, + "grad_norm": 498.010009765625, + "learning_rate": 2.8343945267962825e-05, + "loss": 3.4753, + "step": 2315 + }, + { + "epoch": 0.6528081178211542, + "grad_norm": 744.0064086914062, + "learning_rate": 2.8302880439366298e-05, + "loss": 2.8545, + "step": 2316 + }, + { + "epoch": 0.6530899866112325, + "grad_norm": 996.0037231445312, + "learning_rate": 2.826183363344391e-05, + "loss": 2.1605, + "step": 2317 + }, + { + "epoch": 0.6533718554013107, + "grad_norm": 1560.0089111328125, + "learning_rate": 2.8220804884291142e-05, + "loss": 3.4733, + "step": 2318 + }, + { + "epoch": 0.6536537241913889, + "grad_norm": 928.0053100585938, + "learning_rate": 2.817979422598841e-05, + "loss": 2.7474, + "step": 2319 + }, + { + "epoch": 0.6539355929814671, + "grad_norm": 1296.004150390625, + "learning_rate": 2.813880169260117e-05, + "loss": 2.9557, + "step": 2320 + }, + { + "epoch": 0.6542174617715454, + "grad_norm": 1704.004638671875, + "learning_rate": 2.8097827318179813e-05, + "loss": 2.7777, + "step": 2321 + }, + { + "epoch": 0.6544993305616236, + "grad_norm": 1184.0042724609375, + "learning_rate": 2.8056871136759622e-05, + "loss": 2.7074, + "step": 2322 + }, + { + "epoch": 0.6547811993517018, + "grad_norm": 884.00634765625, + "learning_rate": 2.8015933182360777e-05, + "loss": 2.6445, + "step": 2323 + }, + { + "epoch": 0.65506306814178, + "grad_norm": 398.035888671875, + "learning_rate": 2.79750134889883e-05, + "loss": 3.6735, + "step": 2324 + }, + { + "epoch": 0.6553449369318582, + "grad_norm": 406.00787353515625, + "learning_rate": 2.7934112090632068e-05, + "loss": 3.0176, + "step": 2325 + }, + { + "epoch": 0.6556268057219364, + "grad_norm": 740.0066528320312, + "learning_rate": 2.7893229021266777e-05, + "loss": 2.9405, + "step": 2326 + }, + { + "epoch": 0.6559086745120146, + "grad_norm": 1392.0081787109375, + "learning_rate": 2.785236431485186e-05, + "loss": 3.059, + "step": 2327 + }, + { + "epoch": 0.6561905433020929, + "grad_norm": 1296.0047607421875, + "learning_rate": 2.7811518005331516e-05, + "loss": 2.7917, + "step": 2328 + }, + { + "epoch": 0.6564724120921711, + "grad_norm": 1248.00830078125, + "learning_rate": 2.7770690126634642e-05, + "loss": 2.5928, + "step": 2329 + }, + { + "epoch": 0.6567542808822493, + "grad_norm": 924.0066528320312, + "learning_rate": 2.7729880712674862e-05, + "loss": 2.7035, + "step": 2330 + }, + { + "epoch": 0.6570361496723275, + "grad_norm": 1232.0059814453125, + "learning_rate": 2.7689089797350452e-05, + "loss": 3.5885, + "step": 2331 + }, + { + "epoch": 0.6573180184624058, + "grad_norm": 490.009033203125, + "learning_rate": 2.764831741454432e-05, + "loss": 2.3905, + "step": 2332 + }, + { + "epoch": 0.657599887252484, + "grad_norm": 804.0084228515625, + "learning_rate": 2.760756359812395e-05, + "loss": 2.9284, + "step": 2333 + }, + { + "epoch": 0.6578817560425622, + "grad_norm": 1464.0084228515625, + "learning_rate": 2.756682838194141e-05, + "loss": 3.3933, + "step": 2334 + }, + { + "epoch": 0.6581636248326405, + "grad_norm": 1664.0152587890625, + "learning_rate": 2.75261117998334e-05, + "loss": 3.5378, + "step": 2335 + }, + { + "epoch": 0.6584454936227186, + "grad_norm": 1528.0125732421875, + "learning_rate": 2.7485413885621026e-05, + "loss": 2.7627, + "step": 2336 + }, + { + "epoch": 0.6587273624127968, + "grad_norm": 1448.014892578125, + "learning_rate": 2.7444734673109952e-05, + "loss": 3.1123, + "step": 2337 + }, + { + "epoch": 0.659009231202875, + "grad_norm": 1080.01123046875, + "learning_rate": 2.740407419609028e-05, + "loss": 2.8692, + "step": 2338 + }, + { + "epoch": 0.6592910999929533, + "grad_norm": 572.0125122070312, + "learning_rate": 2.736343248833657e-05, + "loss": 2.5177, + "step": 2339 + }, + { + "epoch": 0.6595729687830315, + "grad_norm": 540.01025390625, + "learning_rate": 2.732280958360782e-05, + "loss": 2.8329, + "step": 2340 + }, + { + "epoch": 0.6598548375731097, + "grad_norm": 1624.0157470703125, + "learning_rate": 2.7282205515647348e-05, + "loss": 2.9718, + "step": 2341 + }, + { + "epoch": 0.6601367063631879, + "grad_norm": 1544.0113525390625, + "learning_rate": 2.724162031818285e-05, + "loss": 2.8809, + "step": 2342 + }, + { + "epoch": 0.6604185751532662, + "grad_norm": 856.007568359375, + "learning_rate": 2.7201054024926344e-05, + "loss": 2.5132, + "step": 2343 + }, + { + "epoch": 0.6607004439433444, + "grad_norm": 624.0098266601562, + "learning_rate": 2.7160506669574136e-05, + "loss": 2.6459, + "step": 2344 + }, + { + "epoch": 0.6609823127334226, + "grad_norm": 1168.0096435546875, + "learning_rate": 2.7119978285806858e-05, + "loss": 3.7497, + "step": 2345 + }, + { + "epoch": 0.6612641815235009, + "grad_norm": 1528.0052490234375, + "learning_rate": 2.707946890728932e-05, + "loss": 2.4419, + "step": 2346 + }, + { + "epoch": 0.661546050313579, + "grad_norm": 604.0079956054688, + "learning_rate": 2.7038978567670558e-05, + "loss": 2.3003, + "step": 2347 + }, + { + "epoch": 0.6618279191036572, + "grad_norm": 1216.0130615234375, + "learning_rate": 2.6998507300583785e-05, + "loss": 2.7754, + "step": 2348 + }, + { + "epoch": 0.6621097878937354, + "grad_norm": 1312.00732421875, + "learning_rate": 2.695805513964641e-05, + "loss": 2.032, + "step": 2349 + }, + { + "epoch": 0.6623916566838137, + "grad_norm": 968.0088500976562, + "learning_rate": 2.691762211845997e-05, + "loss": 2.6062, + "step": 2350 + }, + { + "epoch": 0.6626735254738919, + "grad_norm": 1824.0045166015625, + "learning_rate": 2.6877208270610065e-05, + "loss": 3.0918, + "step": 2351 + }, + { + "epoch": 0.6629553942639701, + "grad_norm": 1752.00732421875, + "learning_rate": 2.683681362966638e-05, + "loss": 2.6745, + "step": 2352 + }, + { + "epoch": 0.6632372630540483, + "grad_norm": 844.0040893554688, + "learning_rate": 2.679643822918264e-05, + "loss": 2.5898, + "step": 2353 + }, + { + "epoch": 0.6635191318441266, + "grad_norm": 1120.006103515625, + "learning_rate": 2.675608210269663e-05, + "loss": 2.5407, + "step": 2354 + }, + { + "epoch": 0.6638010006342048, + "grad_norm": 1440.0084228515625, + "learning_rate": 2.6715745283730076e-05, + "loss": 2.7458, + "step": 2355 + }, + { + "epoch": 0.664082869424283, + "grad_norm": 2624.0751953125, + "learning_rate": 2.66754278057887e-05, + "loss": 2.9727, + "step": 2356 + }, + { + "epoch": 0.6643647382143613, + "grad_norm": 948.0059204101562, + "learning_rate": 2.6635129702362127e-05, + "loss": 3.0052, + "step": 2357 + }, + { + "epoch": 0.6646466070044394, + "grad_norm": 1264.0059814453125, + "learning_rate": 2.65948510069239e-05, + "loss": 2.8102, + "step": 2358 + }, + { + "epoch": 0.6649284757945176, + "grad_norm": 1528.0120849609375, + "learning_rate": 2.6554591752931456e-05, + "loss": 2.6654, + "step": 2359 + }, + { + "epoch": 0.6652103445845958, + "grad_norm": 1136.0048828125, + "learning_rate": 2.6514351973826103e-05, + "loss": 2.7787, + "step": 2360 + }, + { + "epoch": 0.6654922133746741, + "grad_norm": 1120.0035400390625, + "learning_rate": 2.6474131703032907e-05, + "loss": 2.7464, + "step": 2361 + }, + { + "epoch": 0.6657740821647523, + "grad_norm": 1176.00634765625, + "learning_rate": 2.6433930973960773e-05, + "loss": 2.4779, + "step": 2362 + }, + { + "epoch": 0.6660559509548305, + "grad_norm": 716.0037231445312, + "learning_rate": 2.6393749820002356e-05, + "loss": 2.8665, + "step": 2363 + }, + { + "epoch": 0.6663378197449088, + "grad_norm": 1672.0050048828125, + "learning_rate": 2.635358827453409e-05, + "loss": 2.26, + "step": 2364 + }, + { + "epoch": 0.666619688534987, + "grad_norm": 1512.0072021484375, + "learning_rate": 2.6313446370916074e-05, + "loss": 2.4441, + "step": 2365 + }, + { + "epoch": 0.6669015573250652, + "grad_norm": 920.004150390625, + "learning_rate": 2.627332414249211e-05, + "loss": 2.6016, + "step": 2366 + }, + { + "epoch": 0.6671834261151434, + "grad_norm": 660.0069580078125, + "learning_rate": 2.623322162258964e-05, + "loss": 2.8369, + "step": 2367 + }, + { + "epoch": 0.6674652949052217, + "grad_norm": 960.0042724609375, + "learning_rate": 2.6193138844519782e-05, + "loss": 3.1172, + "step": 2368 + }, + { + "epoch": 0.6677471636952999, + "grad_norm": 1024.0072021484375, + "learning_rate": 2.6153075841577212e-05, + "loss": 3.2852, + "step": 2369 + }, + { + "epoch": 0.668029032485378, + "grad_norm": 1344.00390625, + "learning_rate": 2.611303264704017e-05, + "loss": 2.5091, + "step": 2370 + }, + { + "epoch": 0.6683109012754562, + "grad_norm": 430.007080078125, + "learning_rate": 2.6073009294170515e-05, + "loss": 2.7604, + "step": 2371 + }, + { + "epoch": 0.6685927700655345, + "grad_norm": 680.008544921875, + "learning_rate": 2.6033005816213518e-05, + "loss": 3.3533, + "step": 2372 + }, + { + "epoch": 0.6688746388556127, + "grad_norm": 524.0084228515625, + "learning_rate": 2.5993022246398046e-05, + "loss": 2.8672, + "step": 2373 + }, + { + "epoch": 0.6691565076456909, + "grad_norm": 1056.0069580078125, + "learning_rate": 2.5953058617936365e-05, + "loss": 2.9229, + "step": 2374 + }, + { + "epoch": 0.6694383764357692, + "grad_norm": 664.00634765625, + "learning_rate": 2.5913114964024182e-05, + "loss": 2.7223, + "step": 2375 + }, + { + "epoch": 0.6697202452258474, + "grad_norm": 1592.0084228515625, + "learning_rate": 2.5873191317840633e-05, + "loss": 2.9779, + "step": 2376 + }, + { + "epoch": 0.6700021140159256, + "grad_norm": 1104.0062255859375, + "learning_rate": 2.5833287712548198e-05, + "loss": 2.8692, + "step": 2377 + }, + { + "epoch": 0.6702839828060038, + "grad_norm": 1168.0072021484375, + "learning_rate": 2.579340418129278e-05, + "loss": 2.9401, + "step": 2378 + }, + { + "epoch": 0.6705658515960821, + "grad_norm": 1104.0054931640625, + "learning_rate": 2.5753540757203544e-05, + "loss": 2.7768, + "step": 2379 + }, + { + "epoch": 0.6708477203861603, + "grad_norm": 1120.00537109375, + "learning_rate": 2.571369747339295e-05, + "loss": 2.7793, + "step": 2380 + }, + { + "epoch": 0.6711295891762384, + "grad_norm": 620.008544921875, + "learning_rate": 2.5673874362956796e-05, + "loss": 3.1309, + "step": 2381 + }, + { + "epoch": 0.6714114579663166, + "grad_norm": 960.005615234375, + "learning_rate": 2.5634071458974028e-05, + "loss": 2.4436, + "step": 2382 + }, + { + "epoch": 0.6716933267563949, + "grad_norm": 772.0100708007812, + "learning_rate": 2.5594288794506917e-05, + "loss": 2.7768, + "step": 2383 + }, + { + "epoch": 0.6719751955464731, + "grad_norm": 1112.006103515625, + "learning_rate": 2.555452640260082e-05, + "loss": 2.3641, + "step": 2384 + }, + { + "epoch": 0.6722570643365513, + "grad_norm": 1768.007080078125, + "learning_rate": 2.5514784316284323e-05, + "loss": 3.4557, + "step": 2385 + }, + { + "epoch": 0.6725389331266296, + "grad_norm": 852.0120849609375, + "learning_rate": 2.5475062568569076e-05, + "loss": 3.1271, + "step": 2386 + }, + { + "epoch": 0.6728208019167078, + "grad_norm": 972.0106811523438, + "learning_rate": 2.5435361192449935e-05, + "loss": 2.9549, + "step": 2387 + }, + { + "epoch": 0.673102670706786, + "grad_norm": 1256.0074462890625, + "learning_rate": 2.539568022090475e-05, + "loss": 2.9997, + "step": 2388 + }, + { + "epoch": 0.6733845394968642, + "grad_norm": 1040.0084228515625, + "learning_rate": 2.5356019686894455e-05, + "loss": 2.4602, + "step": 2389 + }, + { + "epoch": 0.6736664082869425, + "grad_norm": 99.55215454101562, + "learning_rate": 2.531637962336298e-05, + "loss": 2.8796, + "step": 2390 + }, + { + "epoch": 0.6739482770770207, + "grad_norm": 1296.00537109375, + "learning_rate": 2.5276760063237314e-05, + "loss": 2.7465, + "step": 2391 + }, + { + "epoch": 0.6742301458670988, + "grad_norm": 2016.00927734375, + "learning_rate": 2.5237161039427337e-05, + "loss": 2.9066, + "step": 2392 + }, + { + "epoch": 0.6745120146571771, + "grad_norm": 1176.00732421875, + "learning_rate": 2.519758258482594e-05, + "loss": 2.6505, + "step": 2393 + }, + { + "epoch": 0.6747938834472553, + "grad_norm": 1504.010498046875, + "learning_rate": 2.5158024732308895e-05, + "loss": 3.0089, + "step": 2394 + }, + { + "epoch": 0.6750757522373335, + "grad_norm": 1264.010009765625, + "learning_rate": 2.5118487514734846e-05, + "loss": 3.0915, + "step": 2395 + }, + { + "epoch": 0.6753576210274117, + "grad_norm": 1688.0078125, + "learning_rate": 2.5078970964945293e-05, + "loss": 2.71, + "step": 2396 + }, + { + "epoch": 0.67563948981749, + "grad_norm": 1088.0123291015625, + "learning_rate": 2.503947511576463e-05, + "loss": 3.4603, + "step": 2397 + }, + { + "epoch": 0.6759213586075682, + "grad_norm": 1032.014404296875, + "learning_rate": 2.500000000000001e-05, + "loss": 3.4072, + "step": 2398 + }, + { + "epoch": 0.6762032273976464, + "grad_norm": 1088.0137939453125, + "learning_rate": 2.4960545650441353e-05, + "loss": 3.0013, + "step": 2399 + }, + { + "epoch": 0.6764850961877246, + "grad_norm": 1416.01953125, + "learning_rate": 2.4921112099861326e-05, + "loss": 3.0567, + "step": 2400 + }, + { + "epoch": 0.6767669649778029, + "grad_norm": 1032.0189208984375, + "learning_rate": 2.4881699381015362e-05, + "loss": 2.5155, + "step": 2401 + }, + { + "epoch": 0.677048833767881, + "grad_norm": 1312.0079345703125, + "learning_rate": 2.4842307526641583e-05, + "loss": 2.4761, + "step": 2402 + }, + { + "epoch": 0.6773307025579592, + "grad_norm": 1336.0037841796875, + "learning_rate": 2.480293656946075e-05, + "loss": 2.4031, + "step": 2403 + }, + { + "epoch": 0.6776125713480375, + "grad_norm": 1200.005859375, + "learning_rate": 2.4763586542176272e-05, + "loss": 2.7702, + "step": 2404 + }, + { + "epoch": 0.6778944401381157, + "grad_norm": 426.01165771484375, + "learning_rate": 2.4724257477474182e-05, + "loss": 2.8682, + "step": 2405 + }, + { + "epoch": 0.6781763089281939, + "grad_norm": 1528.006591796875, + "learning_rate": 2.4684949408023084e-05, + "loss": 3.0716, + "step": 2406 + }, + { + "epoch": 0.6784581777182721, + "grad_norm": 1392.0062255859375, + "learning_rate": 2.4645662366474188e-05, + "loss": 3.4391, + "step": 2407 + }, + { + "epoch": 0.6787400465083504, + "grad_norm": 1672.010986328125, + "learning_rate": 2.460639638546119e-05, + "loss": 3.8132, + "step": 2408 + }, + { + "epoch": 0.6790219152984286, + "grad_norm": 1200.00830078125, + "learning_rate": 2.4567151497600306e-05, + "loss": 2.7729, + "step": 2409 + }, + { + "epoch": 0.6793037840885068, + "grad_norm": 788.0079345703125, + "learning_rate": 2.4527927735490214e-05, + "loss": 2.6926, + "step": 2410 + }, + { + "epoch": 0.679585652878585, + "grad_norm": 1784.0081787109375, + "learning_rate": 2.448872513171209e-05, + "loss": 2.6426, + "step": 2411 + }, + { + "epoch": 0.6798675216686633, + "grad_norm": 1120.005859375, + "learning_rate": 2.444954371882951e-05, + "loss": 3.3522, + "step": 2412 + }, + { + "epoch": 0.6801493904587415, + "grad_norm": 1776.0101318359375, + "learning_rate": 2.4410383529388446e-05, + "loss": 3.086, + "step": 2413 + }, + { + "epoch": 0.6804312592488196, + "grad_norm": 1376.006591796875, + "learning_rate": 2.4371244595917225e-05, + "loss": 2.5435, + "step": 2414 + }, + { + "epoch": 0.6807131280388979, + "grad_norm": 856.005615234375, + "learning_rate": 2.4332126950926527e-05, + "loss": 2.6836, + "step": 2415 + }, + { + "epoch": 0.6809949968289761, + "grad_norm": 1624.009521484375, + "learning_rate": 2.429303062690938e-05, + "loss": 2.8975, + "step": 2416 + }, + { + "epoch": 0.6812768656190543, + "grad_norm": 988.0068969726562, + "learning_rate": 2.4253955656341066e-05, + "loss": 3.2546, + "step": 2417 + }, + { + "epoch": 0.6815587344091325, + "grad_norm": 1600.0074462890625, + "learning_rate": 2.421490207167914e-05, + "loss": 3.0732, + "step": 2418 + }, + { + "epoch": 0.6818406031992108, + "grad_norm": 864.0068969726562, + "learning_rate": 2.4175869905363392e-05, + "loss": 3.9414, + "step": 2419 + }, + { + "epoch": 0.682122471989289, + "grad_norm": 1816.0054931640625, + "learning_rate": 2.41368591898158e-05, + "loss": 3.2705, + "step": 2420 + }, + { + "epoch": 0.6824043407793672, + "grad_norm": 1408.007080078125, + "learning_rate": 2.4097869957440572e-05, + "loss": 3.2279, + "step": 2421 + }, + { + "epoch": 0.6826862095694454, + "grad_norm": 1864.0255126953125, + "learning_rate": 2.405890224062406e-05, + "loss": 2.7734, + "step": 2422 + }, + { + "epoch": 0.6829680783595237, + "grad_norm": 1128.0074462890625, + "learning_rate": 2.4019956071734705e-05, + "loss": 2.8496, + "step": 2423 + }, + { + "epoch": 0.6832499471496019, + "grad_norm": 1376.0096435546875, + "learning_rate": 2.3981031483123084e-05, + "loss": 3.5431, + "step": 2424 + }, + { + "epoch": 0.68353181593968, + "grad_norm": 872.0162963867188, + "learning_rate": 2.3942128507121813e-05, + "loss": 2.9613, + "step": 2425 + }, + { + "epoch": 0.6838136847297583, + "grad_norm": 1392.0093994140625, + "learning_rate": 2.3903247176045624e-05, + "loss": 2.4945, + "step": 2426 + }, + { + "epoch": 0.6840955535198365, + "grad_norm": 1320.0142822265625, + "learning_rate": 2.3864387522191197e-05, + "loss": 3.5629, + "step": 2427 + }, + { + "epoch": 0.6843774223099147, + "grad_norm": 1024.0054931640625, + "learning_rate": 2.3825549577837242e-05, + "loss": 2.7601, + "step": 2428 + }, + { + "epoch": 0.6846592910999929, + "grad_norm": 788.014892578125, + "learning_rate": 2.3786733375244412e-05, + "loss": 3.2328, + "step": 2429 + }, + { + "epoch": 0.6849411598900712, + "grad_norm": 512.0091552734375, + "learning_rate": 2.3747938946655352e-05, + "loss": 3.0202, + "step": 2430 + }, + { + "epoch": 0.6852230286801494, + "grad_norm": 1248.0067138671875, + "learning_rate": 2.3709166324294547e-05, + "loss": 2.8561, + "step": 2431 + }, + { + "epoch": 0.6855048974702276, + "grad_norm": 1504.0137939453125, + "learning_rate": 2.367041554036844e-05, + "loss": 3.2272, + "step": 2432 + }, + { + "epoch": 0.6857867662603059, + "grad_norm": 968.018310546875, + "learning_rate": 2.3631686627065295e-05, + "loss": 3.1631, + "step": 2433 + }, + { + "epoch": 0.6860686350503841, + "grad_norm": 700.0159301757812, + "learning_rate": 2.3592979616555193e-05, + "loss": 2.6622, + "step": 2434 + }, + { + "epoch": 0.6863505038404623, + "grad_norm": 836.0165405273438, + "learning_rate": 2.355429454099008e-05, + "loss": 2.1049, + "step": 2435 + }, + { + "epoch": 0.6866323726305404, + "grad_norm": 868.0164794921875, + "learning_rate": 2.3515631432503622e-05, + "loss": 2.8262, + "step": 2436 + }, + { + "epoch": 0.6869142414206187, + "grad_norm": 500.0486755371094, + "learning_rate": 2.3476990323211267e-05, + "loss": 3.1472, + "step": 2437 + }, + { + "epoch": 0.6871961102106969, + "grad_norm": 1280.012939453125, + "learning_rate": 2.3438371245210183e-05, + "loss": 2.9343, + "step": 2438 + }, + { + "epoch": 0.6874779790007751, + "grad_norm": 860.0194702148438, + "learning_rate": 2.3399774230579226e-05, + "loss": 3.3049, + "step": 2439 + }, + { + "epoch": 0.6877598477908533, + "grad_norm": 564.0167236328125, + "learning_rate": 2.3361199311378967e-05, + "loss": 3.0684, + "step": 2440 + }, + { + "epoch": 0.6880417165809316, + "grad_norm": 1264.014892578125, + "learning_rate": 2.3322646519651564e-05, + "loss": 3.1401, + "step": 2441 + }, + { + "epoch": 0.6883235853710098, + "grad_norm": 760.0184326171875, + "learning_rate": 2.328411588742086e-05, + "loss": 3.3548, + "step": 2442 + }, + { + "epoch": 0.688605454161088, + "grad_norm": 624.0196533203125, + "learning_rate": 2.3245607446692236e-05, + "loss": 2.999, + "step": 2443 + }, + { + "epoch": 0.6888873229511663, + "grad_norm": 960.0156860351562, + "learning_rate": 2.3207121229452644e-05, + "loss": 3.0625, + "step": 2444 + }, + { + "epoch": 0.6891691917412445, + "grad_norm": 2016.019287109375, + "learning_rate": 2.316865726767063e-05, + "loss": 2.8099, + "step": 2445 + }, + { + "epoch": 0.6894510605313227, + "grad_norm": 1080.0137939453125, + "learning_rate": 2.31302155932962e-05, + "loss": 3.3596, + "step": 2446 + }, + { + "epoch": 0.6897329293214008, + "grad_norm": 1544.0174560546875, + "learning_rate": 2.309179623826086e-05, + "loss": 3.3021, + "step": 2447 + }, + { + "epoch": 0.6900147981114791, + "grad_norm": 1864.01318359375, + "learning_rate": 2.3053399234477557e-05, + "loss": 3.1869, + "step": 2448 + }, + { + "epoch": 0.6902966669015573, + "grad_norm": 476.0133056640625, + "learning_rate": 2.301502461384074e-05, + "loss": 2.4802, + "step": 2449 + }, + { + "epoch": 0.6905785356916355, + "grad_norm": 912.01171875, + "learning_rate": 2.29766724082262e-05, + "loss": 2.5597, + "step": 2450 + }, + { + "epoch": 0.6908604044817137, + "grad_norm": 2176.0126953125, + "learning_rate": 2.2938342649491107e-05, + "loss": 3.1732, + "step": 2451 + }, + { + "epoch": 0.691142273271792, + "grad_norm": 1008.0070190429688, + "learning_rate": 2.2900035369474044e-05, + "loss": 2.8242, + "step": 2452 + }, + { + "epoch": 0.6914241420618702, + "grad_norm": 1012.0054321289062, + "learning_rate": 2.286175059999487e-05, + "loss": 2.8317, + "step": 2453 + }, + { + "epoch": 0.6917060108519484, + "grad_norm": 1280.0074462890625, + "learning_rate": 2.282348837285474e-05, + "loss": 3.0563, + "step": 2454 + }, + { + "epoch": 0.6919878796420267, + "grad_norm": 2272.005615234375, + "learning_rate": 2.2785248719836145e-05, + "loss": 2.9762, + "step": 2455 + }, + { + "epoch": 0.6922697484321049, + "grad_norm": 772.0111083984375, + "learning_rate": 2.2747031672702768e-05, + "loss": 2.8727, + "step": 2456 + }, + { + "epoch": 0.6925516172221831, + "grad_norm": 856.0093994140625, + "learning_rate": 2.270883726319953e-05, + "loss": 3.3542, + "step": 2457 + }, + { + "epoch": 0.6928334860122612, + "grad_norm": 560.0075073242188, + "learning_rate": 2.2670665523052532e-05, + "loss": 2.5628, + "step": 2458 + }, + { + "epoch": 0.6931153548023395, + "grad_norm": 1040.0081787109375, + "learning_rate": 2.2632516483969098e-05, + "loss": 2.9847, + "step": 2459 + }, + { + "epoch": 0.6933972235924177, + "grad_norm": 932.0071411132812, + "learning_rate": 2.2594390177637643e-05, + "loss": 3.0371, + "step": 2460 + }, + { + "epoch": 0.6936790923824959, + "grad_norm": 880.01123046875, + "learning_rate": 2.2556286635727698e-05, + "loss": 2.8453, + "step": 2461 + }, + { + "epoch": 0.6939609611725742, + "grad_norm": 298.02154541015625, + "learning_rate": 2.251820588988994e-05, + "loss": 2.8268, + "step": 2462 + }, + { + "epoch": 0.6942428299626524, + "grad_norm": 828.0111083984375, + "learning_rate": 2.248014797175604e-05, + "loss": 2.7246, + "step": 2463 + }, + { + "epoch": 0.6945246987527306, + "grad_norm": 1576.0072021484375, + "learning_rate": 2.2442112912938768e-05, + "loss": 2.5522, + "step": 2464 + }, + { + "epoch": 0.6948065675428088, + "grad_norm": 992.0084228515625, + "learning_rate": 2.2404100745031865e-05, + "loss": 2.7128, + "step": 2465 + }, + { + "epoch": 0.6950884363328871, + "grad_norm": 544.00927734375, + "learning_rate": 2.2366111499610078e-05, + "loss": 2.5609, + "step": 2466 + }, + { + "epoch": 0.6953703051229653, + "grad_norm": 1184.005859375, + "learning_rate": 2.2328145208229095e-05, + "loss": 2.9997, + "step": 2467 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 972.007568359375, + "learning_rate": 2.2290201902425546e-05, + "loss": 3.2295, + "step": 2468 + }, + { + "epoch": 0.6959340427031216, + "grad_norm": 1184.0137939453125, + "learning_rate": 2.2252281613716992e-05, + "loss": 3.2334, + "step": 2469 + }, + { + "epoch": 0.6962159114932, + "grad_norm": 612.0108032226562, + "learning_rate": 2.2214384373601843e-05, + "loss": 2.8308, + "step": 2470 + }, + { + "epoch": 0.6964977802832781, + "grad_norm": 812.0108032226562, + "learning_rate": 2.2176510213559372e-05, + "loss": 3.0638, + "step": 2471 + }, + { + "epoch": 0.6967796490733563, + "grad_norm": 1568.005126953125, + "learning_rate": 2.213865916504967e-05, + "loss": 2.9779, + "step": 2472 + }, + { + "epoch": 0.6970615178634346, + "grad_norm": 1912.008056640625, + "learning_rate": 2.210083125951366e-05, + "loss": 3.1159, + "step": 2473 + }, + { + "epoch": 0.6973433866535128, + "grad_norm": 932.007568359375, + "learning_rate": 2.2063026528373037e-05, + "loss": 2.7064, + "step": 2474 + }, + { + "epoch": 0.697625255443591, + "grad_norm": 792.009765625, + "learning_rate": 2.2025245003030227e-05, + "loss": 3.1136, + "step": 2475 + }, + { + "epoch": 0.6979071242336692, + "grad_norm": 760.0079956054688, + "learning_rate": 2.198748671486838e-05, + "loss": 2.7347, + "step": 2476 + }, + { + "epoch": 0.6981889930237475, + "grad_norm": 1680.0076904296875, + "learning_rate": 2.1949751695251335e-05, + "loss": 2.3885, + "step": 2477 + }, + { + "epoch": 0.6984708618138257, + "grad_norm": 434.0163879394531, + "learning_rate": 2.1912039975523664e-05, + "loss": 3.2507, + "step": 2478 + }, + { + "epoch": 0.6987527306039039, + "grad_norm": 1096.0059814453125, + "learning_rate": 2.1874351587010507e-05, + "loss": 3.0332, + "step": 2479 + }, + { + "epoch": 0.699034599393982, + "grad_norm": 213.01927185058594, + "learning_rate": 2.1836686561017665e-05, + "loss": 2.9629, + "step": 2480 + }, + { + "epoch": 0.6993164681840603, + "grad_norm": 784.013671875, + "learning_rate": 2.1799044928831508e-05, + "loss": 2.5112, + "step": 2481 + }, + { + "epoch": 0.6995983369741385, + "grad_norm": 1192.06103515625, + "learning_rate": 2.1761426721719013e-05, + "loss": 3.0258, + "step": 2482 + }, + { + "epoch": 0.6998802057642167, + "grad_norm": 240.0558624267578, + "learning_rate": 2.172383197092769e-05, + "loss": 3.1258, + "step": 2483 + }, + { + "epoch": 0.700162074554295, + "grad_norm": 724.0238037109375, + "learning_rate": 2.1686260707685534e-05, + "loss": 2.5872, + "step": 2484 + }, + { + "epoch": 0.7004439433443732, + "grad_norm": 1152.0250244140625, + "learning_rate": 2.1648712963201057e-05, + "loss": 3.9074, + "step": 2485 + }, + { + "epoch": 0.7007258121344514, + "grad_norm": 1128.1031494140625, + "learning_rate": 2.1611188768663214e-05, + "loss": 3.1492, + "step": 2486 + }, + { + "epoch": 0.7010076809245296, + "grad_norm": 1216.0162353515625, + "learning_rate": 2.15736881552414e-05, + "loss": 3.1082, + "step": 2487 + }, + { + "epoch": 0.7012895497146079, + "grad_norm": 900.0109252929688, + "learning_rate": 2.1536211154085473e-05, + "loss": 2.6355, + "step": 2488 + }, + { + "epoch": 0.7015714185046861, + "grad_norm": 1312.0101318359375, + "learning_rate": 2.149875779632562e-05, + "loss": 2.5132, + "step": 2489 + }, + { + "epoch": 0.7018532872947643, + "grad_norm": 1408.013671875, + "learning_rate": 2.1461328113072397e-05, + "loss": 3.1872, + "step": 2490 + }, + { + "epoch": 0.7021351560848426, + "grad_norm": 1328.0166015625, + "learning_rate": 2.1423922135416692e-05, + "loss": 2.7533, + "step": 2491 + }, + { + "epoch": 0.7024170248749207, + "grad_norm": 1176.0185546875, + "learning_rate": 2.1386539894429736e-05, + "loss": 2.8643, + "step": 2492 + }, + { + "epoch": 0.7026988936649989, + "grad_norm": 592.0130615234375, + "learning_rate": 2.1349181421163033e-05, + "loss": 2.5964, + "step": 2493 + }, + { + "epoch": 0.7029807624550771, + "grad_norm": 484.0201721191406, + "learning_rate": 2.1311846746648322e-05, + "loss": 2.9108, + "step": 2494 + }, + { + "epoch": 0.7032626312451554, + "grad_norm": 1112.010986328125, + "learning_rate": 2.127453590189758e-05, + "loss": 3.1078, + "step": 2495 + }, + { + "epoch": 0.7035445000352336, + "grad_norm": 796.00732421875, + "learning_rate": 2.123724891790298e-05, + "loss": 2.4114, + "step": 2496 + }, + { + "epoch": 0.7038263688253118, + "grad_norm": 1020.0087280273438, + "learning_rate": 2.119998582563692e-05, + "loss": 2.7546, + "step": 2497 + }, + { + "epoch": 0.70410823761539, + "grad_norm": 716.0105590820312, + "learning_rate": 2.1162746656051906e-05, + "loss": 3.3473, + "step": 2498 + }, + { + "epoch": 0.7043901064054683, + "grad_norm": 1448.011474609375, + "learning_rate": 2.1125531440080594e-05, + "loss": 3.4854, + "step": 2499 + }, + { + "epoch": 0.7046719751955465, + "grad_norm": 1016.011474609375, + "learning_rate": 2.1088340208635733e-05, + "loss": 3.2406, + "step": 2500 + }, + { + "epoch": 0.7049538439856247, + "grad_norm": 928.0076293945312, + "learning_rate": 2.1051172992610134e-05, + "loss": 2.6768, + "step": 2501 + }, + { + "epoch": 0.705235712775703, + "grad_norm": 596.0070190429688, + "learning_rate": 2.10140298228767e-05, + "loss": 2.903, + "step": 2502 + }, + { + "epoch": 0.7055175815657811, + "grad_norm": 1840.0091552734375, + "learning_rate": 2.0976910730288358e-05, + "loss": 3.2868, + "step": 2503 + }, + { + "epoch": 0.7057994503558593, + "grad_norm": 680.0056762695312, + "learning_rate": 2.0939815745677998e-05, + "loss": 2.9502, + "step": 2504 + }, + { + "epoch": 0.7060813191459375, + "grad_norm": 1248.0052490234375, + "learning_rate": 2.0902744899858505e-05, + "loss": 2.6191, + "step": 2505 + }, + { + "epoch": 0.7063631879360158, + "grad_norm": 664.0095825195312, + "learning_rate": 2.0865698223622692e-05, + "loss": 3.1621, + "step": 2506 + }, + { + "epoch": 0.706645056726094, + "grad_norm": 492.0075988769531, + "learning_rate": 2.0828675747743353e-05, + "loss": 2.8975, + "step": 2507 + }, + { + "epoch": 0.7069269255161722, + "grad_norm": 320.0095520019531, + "learning_rate": 2.0791677502973124e-05, + "loss": 3.0352, + "step": 2508 + }, + { + "epoch": 0.7072087943062504, + "grad_norm": 1304.00439453125, + "learning_rate": 2.0754703520044532e-05, + "loss": 3.1582, + "step": 2509 + }, + { + "epoch": 0.7074906630963287, + "grad_norm": 2144.0107421875, + "learning_rate": 2.071775382966993e-05, + "loss": 2.8893, + "step": 2510 + }, + { + "epoch": 0.7077725318864069, + "grad_norm": 1304.0079345703125, + "learning_rate": 2.0680828462541552e-05, + "loss": 3.248, + "step": 2511 + }, + { + "epoch": 0.7080544006764851, + "grad_norm": 1232.0123291015625, + "learning_rate": 2.0643927449331352e-05, + "loss": 3.4401, + "step": 2512 + }, + { + "epoch": 0.7083362694665634, + "grad_norm": 1120.005615234375, + "learning_rate": 2.0607050820691125e-05, + "loss": 2.7939, + "step": 2513 + }, + { + "epoch": 0.7086181382566415, + "grad_norm": 1016.0054321289062, + "learning_rate": 2.0570198607252366e-05, + "loss": 2.8292, + "step": 2514 + }, + { + "epoch": 0.7089000070467197, + "grad_norm": 772.0083618164062, + "learning_rate": 2.0533370839626298e-05, + "loss": 3.1382, + "step": 2515 + }, + { + "epoch": 0.7091818758367979, + "grad_norm": 448.0107727050781, + "learning_rate": 2.0496567548403824e-05, + "loss": 3.0439, + "step": 2516 + }, + { + "epoch": 0.7094637446268762, + "grad_norm": 1264.007080078125, + "learning_rate": 2.045978876415556e-05, + "loss": 2.4678, + "step": 2517 + }, + { + "epoch": 0.7097456134169544, + "grad_norm": 1408.00537109375, + "learning_rate": 2.042303451743174e-05, + "loss": 2.7588, + "step": 2518 + }, + { + "epoch": 0.7100274822070326, + "grad_norm": 1128.0072021484375, + "learning_rate": 2.0386304838762198e-05, + "loss": 2.8819, + "step": 2519 + }, + { + "epoch": 0.7103093509971109, + "grad_norm": 334.01214599609375, + "learning_rate": 2.034959975865637e-05, + "loss": 3.0824, + "step": 2520 + }, + { + "epoch": 0.7105912197871891, + "grad_norm": 1152.004150390625, + "learning_rate": 2.0312919307603283e-05, + "loss": 3.1527, + "step": 2521 + }, + { + "epoch": 0.7108730885772673, + "grad_norm": 684.0078735351562, + "learning_rate": 2.0276263516071492e-05, + "loss": 3.3512, + "step": 2522 + }, + { + "epoch": 0.7111549573673455, + "grad_norm": 460.0079345703125, + "learning_rate": 2.023963241450904e-05, + "loss": 2.8617, + "step": 2523 + }, + { + "epoch": 0.7114368261574238, + "grad_norm": 1736.00732421875, + "learning_rate": 2.0203026033343524e-05, + "loss": 2.7588, + "step": 2524 + }, + { + "epoch": 0.711718694947502, + "grad_norm": 1064.009033203125, + "learning_rate": 2.0166444402981926e-05, + "loss": 2.3751, + "step": 2525 + }, + { + "epoch": 0.7120005637375801, + "grad_norm": 1992.0072021484375, + "learning_rate": 2.012988755381076e-05, + "loss": 2.8538, + "step": 2526 + }, + { + "epoch": 0.7122824325276583, + "grad_norm": 1704.0084228515625, + "learning_rate": 2.0093355516195888e-05, + "loss": 2.8076, + "step": 2527 + }, + { + "epoch": 0.7125643013177366, + "grad_norm": 584.0073852539062, + "learning_rate": 2.0056848320482578e-05, + "loss": 2.9707, + "step": 2528 + }, + { + "epoch": 0.7128461701078148, + "grad_norm": 1792.0087890625, + "learning_rate": 2.0020365996995455e-05, + "loss": 3.0003, + "step": 2529 + }, + { + "epoch": 0.713128038897893, + "grad_norm": 1248.0078125, + "learning_rate": 1.998390857603853e-05, + "loss": 2.5609, + "step": 2530 + }, + { + "epoch": 0.7134099076879713, + "grad_norm": 1088.0076904296875, + "learning_rate": 1.9947476087895073e-05, + "loss": 2.8525, + "step": 2531 + }, + { + "epoch": 0.7136917764780495, + "grad_norm": 1384.008056640625, + "learning_rate": 1.9911068562827673e-05, + "loss": 2.8955, + "step": 2532 + }, + { + "epoch": 0.7139736452681277, + "grad_norm": 1184.01171875, + "learning_rate": 1.9874686031078154e-05, + "loss": 3.1921, + "step": 2533 + }, + { + "epoch": 0.7142555140582059, + "grad_norm": 1008.0089721679688, + "learning_rate": 1.9838328522867644e-05, + "loss": 2.915, + "step": 2534 + }, + { + "epoch": 0.7145373828482842, + "grad_norm": 668.0187377929688, + "learning_rate": 1.9801996068396406e-05, + "loss": 3.1999, + "step": 2535 + }, + { + "epoch": 0.7148192516383624, + "grad_norm": 1200.0108642578125, + "learning_rate": 1.976568869784396e-05, + "loss": 3.4359, + "step": 2536 + }, + { + "epoch": 0.7151011204284405, + "grad_norm": 1336.0120849609375, + "learning_rate": 1.972940644136894e-05, + "loss": 3.1001, + "step": 2537 + }, + { + "epoch": 0.7153829892185187, + "grad_norm": 434.0216064453125, + "learning_rate": 1.9693149329109146e-05, + "loss": 2.6934, + "step": 2538 + }, + { + "epoch": 0.715664858008597, + "grad_norm": 1832.0118408203125, + "learning_rate": 1.9656917391181463e-05, + "loss": 3.1733, + "step": 2539 + }, + { + "epoch": 0.7159467267986752, + "grad_norm": 1936.014404296875, + "learning_rate": 1.9620710657681924e-05, + "loss": 2.8307, + "step": 2540 + }, + { + "epoch": 0.7162285955887534, + "grad_norm": 1568.009521484375, + "learning_rate": 1.9584529158685566e-05, + "loss": 3.0296, + "step": 2541 + }, + { + "epoch": 0.7165104643788317, + "grad_norm": 2080.005615234375, + "learning_rate": 1.9548372924246493e-05, + "loss": 3.2549, + "step": 2542 + }, + { + "epoch": 0.7167923331689099, + "grad_norm": 2064.006103515625, + "learning_rate": 1.9512241984397793e-05, + "loss": 2.9816, + "step": 2543 + }, + { + "epoch": 0.7170742019589881, + "grad_norm": 1392.0147705078125, + "learning_rate": 1.947613636915159e-05, + "loss": 2.7358, + "step": 2544 + }, + { + "epoch": 0.7173560707490663, + "grad_norm": 664.0096435546875, + "learning_rate": 1.944005610849897e-05, + "loss": 2.8183, + "step": 2545 + }, + { + "epoch": 0.7176379395391446, + "grad_norm": 576.0164794921875, + "learning_rate": 1.940400123240992e-05, + "loss": 3.1908, + "step": 2546 + }, + { + "epoch": 0.7179198083292228, + "grad_norm": 1232.0062255859375, + "learning_rate": 1.9367971770833347e-05, + "loss": 2.8071, + "step": 2547 + }, + { + "epoch": 0.7182016771193009, + "grad_norm": 1104.0087890625, + "learning_rate": 1.933196775369708e-05, + "loss": 2.8294, + "step": 2548 + }, + { + "epoch": 0.7184835459093791, + "grad_norm": 1072.0079345703125, + "learning_rate": 1.9295989210907754e-05, + "loss": 2.557, + "step": 2549 + }, + { + "epoch": 0.7187654146994574, + "grad_norm": 2112.016357421875, + "learning_rate": 1.926003617235093e-05, + "loss": 3.4746, + "step": 2550 + }, + { + "epoch": 0.7190472834895356, + "grad_norm": 556.0062866210938, + "learning_rate": 1.9224108667890917e-05, + "loss": 2.3907, + "step": 2551 + }, + { + "epoch": 0.7193291522796138, + "grad_norm": 656.0068359375, + "learning_rate": 1.9188206727370834e-05, + "loss": 2.7185, + "step": 2552 + }, + { + "epoch": 0.7196110210696921, + "grad_norm": 1032.0159912109375, + "learning_rate": 1.915233038061254e-05, + "loss": 2.8652, + "step": 2553 + }, + { + "epoch": 0.7198928898597703, + "grad_norm": 1144.006591796875, + "learning_rate": 1.9116479657416685e-05, + "loss": 3.0225, + "step": 2554 + }, + { + "epoch": 0.7201747586498485, + "grad_norm": 1200.0064697265625, + "learning_rate": 1.9080654587562625e-05, + "loss": 3.322, + "step": 2555 + }, + { + "epoch": 0.7204566274399267, + "grad_norm": 528.00927734375, + "learning_rate": 1.904485520080837e-05, + "loss": 2.3713, + "step": 2556 + }, + { + "epoch": 0.720738496230005, + "grad_norm": 656.0103759765625, + "learning_rate": 1.900908152689062e-05, + "loss": 2.9144, + "step": 2557 + }, + { + "epoch": 0.7210203650200832, + "grad_norm": 1800.0081787109375, + "learning_rate": 1.8973333595524695e-05, + "loss": 2.8405, + "step": 2558 + }, + { + "epoch": 0.7213022338101613, + "grad_norm": 748.0060424804688, + "learning_rate": 1.893761143640459e-05, + "loss": 2.5606, + "step": 2559 + }, + { + "epoch": 0.7215841026002396, + "grad_norm": 596.0065307617188, + "learning_rate": 1.8901915079202836e-05, + "loss": 2.8613, + "step": 2560 + }, + { + "epoch": 0.7218659713903178, + "grad_norm": 506.00830078125, + "learning_rate": 1.8866244553570545e-05, + "loss": 2.6424, + "step": 2561 + }, + { + "epoch": 0.722147840180396, + "grad_norm": 592.0126953125, + "learning_rate": 1.883059988913738e-05, + "loss": 3.0209, + "step": 2562 + }, + { + "epoch": 0.7224297089704742, + "grad_norm": 1488.0098876953125, + "learning_rate": 1.879498111551148e-05, + "loss": 3.2572, + "step": 2563 + }, + { + "epoch": 0.7227115777605525, + "grad_norm": 940.0065307617188, + "learning_rate": 1.8759388262279587e-05, + "loss": 2.4236, + "step": 2564 + }, + { + "epoch": 0.7229934465506307, + "grad_norm": 1072.00634765625, + "learning_rate": 1.872382135900681e-05, + "loss": 2.5083, + "step": 2565 + }, + { + "epoch": 0.7232753153407089, + "grad_norm": 1208.005126953125, + "learning_rate": 1.8688280435236734e-05, + "loss": 3.1209, + "step": 2566 + }, + { + "epoch": 0.7235571841307871, + "grad_norm": 1552.0081787109375, + "learning_rate": 1.865276552049136e-05, + "loss": 2.7344, + "step": 2567 + }, + { + "epoch": 0.7238390529208654, + "grad_norm": 576.0087890625, + "learning_rate": 1.861727664427108e-05, + "loss": 3.043, + "step": 2568 + }, + { + "epoch": 0.7241209217109436, + "grad_norm": 1368.0064697265625, + "learning_rate": 1.8581813836054697e-05, + "loss": 2.8672, + "step": 2569 + }, + { + "epoch": 0.7244027905010217, + "grad_norm": 490.014404296875, + "learning_rate": 1.8546377125299313e-05, + "loss": 2.9844, + "step": 2570 + }, + { + "epoch": 0.7246846592911, + "grad_norm": 680.00732421875, + "learning_rate": 1.8510966541440366e-05, + "loss": 3.1006, + "step": 2571 + }, + { + "epoch": 0.7249665280811782, + "grad_norm": 1120.0052490234375, + "learning_rate": 1.8475582113891586e-05, + "loss": 2.9221, + "step": 2572 + }, + { + "epoch": 0.7252483968712564, + "grad_norm": 672.009765625, + "learning_rate": 1.8440223872045013e-05, + "loss": 3.2135, + "step": 2573 + }, + { + "epoch": 0.7255302656613346, + "grad_norm": 752.0086059570312, + "learning_rate": 1.840489184527087e-05, + "loss": 2.7751, + "step": 2574 + }, + { + "epoch": 0.7258121344514129, + "grad_norm": 1152.0048828125, + "learning_rate": 1.836958606291769e-05, + "loss": 2.4939, + "step": 2575 + }, + { + "epoch": 0.7260940032414911, + "grad_norm": 1104.01318359375, + "learning_rate": 1.8334306554312135e-05, + "loss": 2.6023, + "step": 2576 + }, + { + "epoch": 0.7263758720315693, + "grad_norm": 1144.00830078125, + "learning_rate": 1.829905334875905e-05, + "loss": 2.8848, + "step": 2577 + }, + { + "epoch": 0.7266577408216475, + "grad_norm": 2064.0068359375, + "learning_rate": 1.826382647554148e-05, + "loss": 3.2874, + "step": 2578 + }, + { + "epoch": 0.7269396096117258, + "grad_norm": 1504.0064697265625, + "learning_rate": 1.822862596392055e-05, + "loss": 2.922, + "step": 2579 + }, + { + "epoch": 0.727221478401804, + "grad_norm": 944.0138549804688, + "learning_rate": 1.8193451843135502e-05, + "loss": 3.3109, + "step": 2580 + }, + { + "epoch": 0.7275033471918821, + "grad_norm": 740.0076904296875, + "learning_rate": 1.8158304142403653e-05, + "loss": 3.1455, + "step": 2581 + }, + { + "epoch": 0.7277852159819604, + "grad_norm": 1376.0093994140625, + "learning_rate": 1.8123182890920377e-05, + "loss": 2.9977, + "step": 2582 + }, + { + "epoch": 0.7280670847720386, + "grad_norm": 1032.01171875, + "learning_rate": 1.80880881178591e-05, + "loss": 3.2562, + "step": 2583 + }, + { + "epoch": 0.7283489535621168, + "grad_norm": 1080.0096435546875, + "learning_rate": 1.8053019852371196e-05, + "loss": 2.6052, + "step": 2584 + }, + { + "epoch": 0.728630822352195, + "grad_norm": 1376.012939453125, + "learning_rate": 1.8017978123586104e-05, + "loss": 2.8763, + "step": 2585 + }, + { + "epoch": 0.7289126911422733, + "grad_norm": 1344.008544921875, + "learning_rate": 1.7982962960611155e-05, + "loss": 2.8885, + "step": 2586 + }, + { + "epoch": 0.7291945599323515, + "grad_norm": 1088.00927734375, + "learning_rate": 1.7947974392531612e-05, + "loss": 2.8494, + "step": 2587 + }, + { + "epoch": 0.7294764287224297, + "grad_norm": 2112.01025390625, + "learning_rate": 1.79130124484107e-05, + "loss": 3.6123, + "step": 2588 + }, + { + "epoch": 0.729758297512508, + "grad_norm": 1856.010498046875, + "learning_rate": 1.787807715728948e-05, + "loss": 3.5186, + "step": 2589 + }, + { + "epoch": 0.7300401663025862, + "grad_norm": 972.0067138671875, + "learning_rate": 1.7843168548186894e-05, + "loss": 2.938, + "step": 2590 + }, + { + "epoch": 0.7303220350926644, + "grad_norm": 1656.0048828125, + "learning_rate": 1.7808286650099697e-05, + "loss": 2.5544, + "step": 2591 + }, + { + "epoch": 0.7306039038827425, + "grad_norm": 1392.006103515625, + "learning_rate": 1.7773431492002513e-05, + "loss": 2.653, + "step": 2592 + }, + { + "epoch": 0.7308857726728208, + "grad_norm": 720.0118408203125, + "learning_rate": 1.7738603102847694e-05, + "loss": 3.2943, + "step": 2593 + }, + { + "epoch": 0.731167641462899, + "grad_norm": 852.00732421875, + "learning_rate": 1.770380151156537e-05, + "loss": 3.1439, + "step": 2594 + }, + { + "epoch": 0.7314495102529772, + "grad_norm": 1288.0076904296875, + "learning_rate": 1.7669026747063457e-05, + "loss": 2.8015, + "step": 2595 + }, + { + "epoch": 0.7317313790430554, + "grad_norm": 808.0086669921875, + "learning_rate": 1.7634278838227526e-05, + "loss": 3.3145, + "step": 2596 + }, + { + "epoch": 0.7320132478331337, + "grad_norm": 1624.013671875, + "learning_rate": 1.7599557813920853e-05, + "loss": 3.2546, + "step": 2597 + }, + { + "epoch": 0.7322951166232119, + "grad_norm": 728.00634765625, + "learning_rate": 1.756486370298443e-05, + "loss": 2.3804, + "step": 2598 + }, + { + "epoch": 0.7325769854132901, + "grad_norm": 1328.009033203125, + "learning_rate": 1.7530196534236842e-05, + "loss": 2.8031, + "step": 2599 + }, + { + "epoch": 0.7328588542033684, + "grad_norm": 1120.0089111328125, + "learning_rate": 1.7495556336474316e-05, + "loss": 2.6117, + "step": 2600 + }, + { + "epoch": 0.7331407229934466, + "grad_norm": 520.0069580078125, + "learning_rate": 1.7460943138470647e-05, + "loss": 2.4705, + "step": 2601 + }, + { + "epoch": 0.7334225917835248, + "grad_norm": 1184.0042724609375, + "learning_rate": 1.7426356968977265e-05, + "loss": 2.6304, + "step": 2602 + }, + { + "epoch": 0.733704460573603, + "grad_norm": 920.0050048828125, + "learning_rate": 1.7391797856723096e-05, + "loss": 2.7207, + "step": 2603 + }, + { + "epoch": 0.7339863293636812, + "grad_norm": 1480.0091552734375, + "learning_rate": 1.7357265830414586e-05, + "loss": 3.0156, + "step": 2604 + }, + { + "epoch": 0.7342681981537594, + "grad_norm": 1640.0045166015625, + "learning_rate": 1.7322760918735737e-05, + "loss": 3.3044, + "step": 2605 + }, + { + "epoch": 0.7345500669438376, + "grad_norm": 1792.0076904296875, + "learning_rate": 1.7288283150347957e-05, + "loss": 2.6694, + "step": 2606 + }, + { + "epoch": 0.7348319357339158, + "grad_norm": 640.0181274414062, + "learning_rate": 1.7253832553890177e-05, + "loss": 3.2969, + "step": 2607 + }, + { + "epoch": 0.7351138045239941, + "grad_norm": 1552.004150390625, + "learning_rate": 1.7219409157978707e-05, + "loss": 3.0137, + "step": 2608 + }, + { + "epoch": 0.7353956733140723, + "grad_norm": 1760.006591796875, + "learning_rate": 1.7185012991207284e-05, + "loss": 2.7952, + "step": 2609 + }, + { + "epoch": 0.7356775421041505, + "grad_norm": 1296.005859375, + "learning_rate": 1.7150644082147022e-05, + "loss": 2.6453, + "step": 2610 + }, + { + "epoch": 0.7359594108942288, + "grad_norm": 784.008544921875, + "learning_rate": 1.7116302459346377e-05, + "loss": 2.7142, + "step": 2611 + }, + { + "epoch": 0.736241279684307, + "grad_norm": 536.0079345703125, + "learning_rate": 1.708198815133119e-05, + "loss": 2.6823, + "step": 2612 + }, + { + "epoch": 0.7365231484743852, + "grad_norm": 1144.00732421875, + "learning_rate": 1.704770118660457e-05, + "loss": 2.7651, + "step": 2613 + }, + { + "epoch": 0.7368050172644633, + "grad_norm": 692.0065307617188, + "learning_rate": 1.7013441593646894e-05, + "loss": 2.7734, + "step": 2614 + }, + { + "epoch": 0.7370868860545416, + "grad_norm": 290.0092468261719, + "learning_rate": 1.697920940091588e-05, + "loss": 2.8623, + "step": 2615 + }, + { + "epoch": 0.7373687548446198, + "grad_norm": 968.0057373046875, + "learning_rate": 1.6945004636846403e-05, + "loss": 2.7917, + "step": 2616 + }, + { + "epoch": 0.737650623634698, + "grad_norm": 780.0082397460938, + "learning_rate": 1.6910827329850616e-05, + "loss": 2.6237, + "step": 2617 + }, + { + "epoch": 0.7379324924247763, + "grad_norm": 1232.0057373046875, + "learning_rate": 1.6876677508317822e-05, + "loss": 2.6305, + "step": 2618 + }, + { + "epoch": 0.7382143612148545, + "grad_norm": 804.0069580078125, + "learning_rate": 1.684255520061452e-05, + "loss": 3.1013, + "step": 2619 + }, + { + "epoch": 0.7384962300049327, + "grad_norm": 704.0105590820312, + "learning_rate": 1.6808460435084315e-05, + "loss": 2.7013, + "step": 2620 + }, + { + "epoch": 0.7387780987950109, + "grad_norm": 536.0083618164062, + "learning_rate": 1.6774393240048013e-05, + "loss": 2.7738, + "step": 2621 + }, + { + "epoch": 0.7390599675850892, + "grad_norm": 2144.005859375, + "learning_rate": 1.6740353643803442e-05, + "loss": 2.6091, + "step": 2622 + }, + { + "epoch": 0.7393418363751674, + "grad_norm": 1144.0057373046875, + "learning_rate": 1.6706341674625535e-05, + "loss": 2.7441, + "step": 2623 + }, + { + "epoch": 0.7396237051652456, + "grad_norm": 1472.0042724609375, + "learning_rate": 1.6672357360766256e-05, + "loss": 2.7425, + "step": 2624 + }, + { + "epoch": 0.7399055739553237, + "grad_norm": 1152.0047607421875, + "learning_rate": 1.6638400730454634e-05, + "loss": 2.8932, + "step": 2625 + }, + { + "epoch": 0.740187442745402, + "grad_norm": 1088.008056640625, + "learning_rate": 1.6604471811896703e-05, + "loss": 2.7891, + "step": 2626 + }, + { + "epoch": 0.7404693115354802, + "grad_norm": 716.0061645507812, + "learning_rate": 1.657057063327544e-05, + "loss": 2.8298, + "step": 2627 + }, + { + "epoch": 0.7407511803255584, + "grad_norm": 1048.0037841796875, + "learning_rate": 1.6536697222750795e-05, + "loss": 2.8796, + "step": 2628 + }, + { + "epoch": 0.7410330491156367, + "grad_norm": 760.0061645507812, + "learning_rate": 1.650285160845967e-05, + "loss": 2.7996, + "step": 2629 + }, + { + "epoch": 0.7413149179057149, + "grad_norm": 2032.0074462890625, + "learning_rate": 1.6469033818515833e-05, + "loss": 3.0524, + "step": 2630 + }, + { + "epoch": 0.7415967866957931, + "grad_norm": 1736.0103759765625, + "learning_rate": 1.6435243881010016e-05, + "loss": 3.0775, + "step": 2631 + }, + { + "epoch": 0.7418786554858713, + "grad_norm": 568.0170288085938, + "learning_rate": 1.640148182400975e-05, + "loss": 2.5628, + "step": 2632 + }, + { + "epoch": 0.7421605242759496, + "grad_norm": 1256.0137939453125, + "learning_rate": 1.6367747675559426e-05, + "loss": 2.8304, + "step": 2633 + }, + { + "epoch": 0.7424423930660278, + "grad_norm": 1020.005615234375, + "learning_rate": 1.6334041463680237e-05, + "loss": 2.8203, + "step": 2634 + }, + { + "epoch": 0.742724261856106, + "grad_norm": 552.0139770507812, + "learning_rate": 1.630036321637022e-05, + "loss": 2.5554, + "step": 2635 + }, + { + "epoch": 0.7430061306461841, + "grad_norm": 490.0102233886719, + "learning_rate": 1.6266712961604146e-05, + "loss": 2.4397, + "step": 2636 + }, + { + "epoch": 0.7432879994362624, + "grad_norm": 828.0137329101562, + "learning_rate": 1.623309072733354e-05, + "loss": 2.4782, + "step": 2637 + }, + { + "epoch": 0.7435698682263406, + "grad_norm": 968.013671875, + "learning_rate": 1.6199496541486647e-05, + "loss": 2.5506, + "step": 2638 + }, + { + "epoch": 0.7438517370164188, + "grad_norm": 1512.011962890625, + "learning_rate": 1.61659304319684e-05, + "loss": 3.1494, + "step": 2639 + }, + { + "epoch": 0.7441336058064971, + "grad_norm": 1528.0113525390625, + "learning_rate": 1.613239242666046e-05, + "loss": 2.8047, + "step": 2640 + }, + { + "epoch": 0.7444154745965753, + "grad_norm": 664.0132446289062, + "learning_rate": 1.6098882553421102e-05, + "loss": 2.6817, + "step": 2641 + }, + { + "epoch": 0.7446973433866535, + "grad_norm": 708.0201416015625, + "learning_rate": 1.6065400840085238e-05, + "loss": 2.7276, + "step": 2642 + }, + { + "epoch": 0.7449792121767317, + "grad_norm": 800.0154418945312, + "learning_rate": 1.6031947314464397e-05, + "loss": 2.9203, + "step": 2643 + }, + { + "epoch": 0.74526108096681, + "grad_norm": 1496.0135498046875, + "learning_rate": 1.599852200434667e-05, + "loss": 3.6954, + "step": 2644 + }, + { + "epoch": 0.7455429497568882, + "grad_norm": 752.015625, + "learning_rate": 1.5965124937496757e-05, + "loss": 2.957, + "step": 2645 + }, + { + "epoch": 0.7458248185469664, + "grad_norm": 1784.0069580078125, + "learning_rate": 1.5931756141655878e-05, + "loss": 2.4903, + "step": 2646 + }, + { + "epoch": 0.7461066873370447, + "grad_norm": 1040.0120849609375, + "learning_rate": 1.589841564454176e-05, + "loss": 2.8526, + "step": 2647 + }, + { + "epoch": 0.7463885561271228, + "grad_norm": 1832.010498046875, + "learning_rate": 1.5865103473848618e-05, + "loss": 3.111, + "step": 2648 + }, + { + "epoch": 0.746670424917201, + "grad_norm": 1336.0142822265625, + "learning_rate": 1.5831819657247137e-05, + "loss": 3.1025, + "step": 2649 + }, + { + "epoch": 0.7469522937072792, + "grad_norm": 640.0201416015625, + "learning_rate": 1.5798564222384492e-05, + "loss": 3.306, + "step": 2650 + }, + { + "epoch": 0.7472341624973575, + "grad_norm": 800.00537109375, + "learning_rate": 1.5765337196884234e-05, + "loss": 3.0703, + "step": 2651 + }, + { + "epoch": 0.7475160312874357, + "grad_norm": 720.0078735351562, + "learning_rate": 1.5732138608346336e-05, + "loss": 2.6123, + "step": 2652 + }, + { + "epoch": 0.7477979000775139, + "grad_norm": 768.0076904296875, + "learning_rate": 1.5698968484347133e-05, + "loss": 2.4412, + "step": 2653 + }, + { + "epoch": 0.7480797688675921, + "grad_norm": 400.01190185546875, + "learning_rate": 1.566582685243935e-05, + "loss": 3.252, + "step": 2654 + }, + { + "epoch": 0.7483616376576704, + "grad_norm": 1720.010009765625, + "learning_rate": 1.563271374015201e-05, + "loss": 3.0417, + "step": 2655 + }, + { + "epoch": 0.7486435064477486, + "grad_norm": 948.008544921875, + "learning_rate": 1.5599629174990483e-05, + "loss": 2.833, + "step": 2656 + }, + { + "epoch": 0.7489253752378268, + "grad_norm": 928.0087280273438, + "learning_rate": 1.556657318443639e-05, + "loss": 2.5913, + "step": 2657 + }, + { + "epoch": 0.7492072440279051, + "grad_norm": 1896.013671875, + "learning_rate": 1.553354579594763e-05, + "loss": 2.624, + "step": 2658 + }, + { + "epoch": 0.7494891128179832, + "grad_norm": 344.0197448730469, + "learning_rate": 1.5500547036958335e-05, + "loss": 2.6842, + "step": 2659 + }, + { + "epoch": 0.7497709816080614, + "grad_norm": 736.0197143554688, + "learning_rate": 1.5467576934878892e-05, + "loss": 2.9665, + "step": 2660 + }, + { + "epoch": 0.7500528503981396, + "grad_norm": 2032.00927734375, + "learning_rate": 1.5434635517095846e-05, + "loss": 3.4889, + "step": 2661 + }, + { + "epoch": 0.7503347191882179, + "grad_norm": 1008.008544921875, + "learning_rate": 1.5401722810971924e-05, + "loss": 3.2438, + "step": 2662 + }, + { + "epoch": 0.7506165879782961, + "grad_norm": 692.017333984375, + "learning_rate": 1.5368838843845994e-05, + "loss": 2.8229, + "step": 2663 + }, + { + "epoch": 0.7508984567683743, + "grad_norm": 1624.0078125, + "learning_rate": 1.5335983643033096e-05, + "loss": 3.1794, + "step": 2664 + }, + { + "epoch": 0.7511803255584525, + "grad_norm": 1088.010498046875, + "learning_rate": 1.5303157235824322e-05, + "loss": 3.6742, + "step": 2665 + }, + { + "epoch": 0.7514621943485308, + "grad_norm": 1336.0089111328125, + "learning_rate": 1.5270359649486886e-05, + "loss": 2.6387, + "step": 2666 + }, + { + "epoch": 0.751744063138609, + "grad_norm": 1368.0120849609375, + "learning_rate": 1.5237590911264037e-05, + "loss": 3.0567, + "step": 2667 + }, + { + "epoch": 0.7520259319286872, + "grad_norm": 476.0111389160156, + "learning_rate": 1.5204851048375051e-05, + "loss": 2.8308, + "step": 2668 + }, + { + "epoch": 0.7523078007187655, + "grad_norm": 2304.013671875, + "learning_rate": 1.5172140088015274e-05, + "loss": 3.4733, + "step": 2669 + }, + { + "epoch": 0.7525896695088437, + "grad_norm": 948.0082397460938, + "learning_rate": 1.5139458057355994e-05, + "loss": 2.527, + "step": 2670 + }, + { + "epoch": 0.7528715382989218, + "grad_norm": 1464.009765625, + "learning_rate": 1.510680498354447e-05, + "loss": 3.0217, + "step": 2671 + }, + { + "epoch": 0.753153407089, + "grad_norm": 852.0118408203125, + "learning_rate": 1.5074180893703915e-05, + "loss": 2.6357, + "step": 2672 + }, + { + "epoch": 0.7534352758790783, + "grad_norm": 1616.010009765625, + "learning_rate": 1.5041585814933496e-05, + "loss": 2.9411, + "step": 2673 + }, + { + "epoch": 0.7537171446691565, + "grad_norm": 1040.0108642578125, + "learning_rate": 1.5009019774308247e-05, + "loss": 2.7816, + "step": 2674 + }, + { + "epoch": 0.7539990134592347, + "grad_norm": 920.007568359375, + "learning_rate": 1.49764827988791e-05, + "loss": 2.7286, + "step": 2675 + }, + { + "epoch": 0.7542808822493129, + "grad_norm": 1056.0120849609375, + "learning_rate": 1.4943974915672804e-05, + "loss": 2.4162, + "step": 2676 + }, + { + "epoch": 0.7545627510393912, + "grad_norm": 1288.0081787109375, + "learning_rate": 1.4911496151692012e-05, + "loss": 2.9698, + "step": 2677 + }, + { + "epoch": 0.7548446198294694, + "grad_norm": 856.011474609375, + "learning_rate": 1.4879046533915125e-05, + "loss": 2.6738, + "step": 2678 + }, + { + "epoch": 0.7551264886195476, + "grad_norm": 968.0071411132812, + "learning_rate": 1.4846626089296394e-05, + "loss": 3.3005, + "step": 2679 + }, + { + "epoch": 0.7554083574096259, + "grad_norm": 1720.015869140625, + "learning_rate": 1.4814234844765785e-05, + "loss": 3.3877, + "step": 2680 + }, + { + "epoch": 0.755690226199704, + "grad_norm": 1600.0147705078125, + "learning_rate": 1.4781872827229026e-05, + "loss": 2.9251, + "step": 2681 + }, + { + "epoch": 0.7559720949897822, + "grad_norm": 440.0260009765625, + "learning_rate": 1.4749540063567552e-05, + "loss": 2.1293, + "step": 2682 + }, + { + "epoch": 0.7562539637798604, + "grad_norm": 816.0325317382812, + "learning_rate": 1.471723658063856e-05, + "loss": 2.9925, + "step": 2683 + }, + { + "epoch": 0.7565358325699387, + "grad_norm": 1360.0277099609375, + "learning_rate": 1.4684962405274849e-05, + "loss": 3.4343, + "step": 2684 + }, + { + "epoch": 0.7568177013600169, + "grad_norm": 1160.067138671875, + "learning_rate": 1.4652717564284917e-05, + "loss": 2.4857, + "step": 2685 + }, + { + "epoch": 0.7570995701500951, + "grad_norm": 1200.0185546875, + "learning_rate": 1.462050208445287e-05, + "loss": 3.21, + "step": 2686 + }, + { + "epoch": 0.7573814389401734, + "grad_norm": 844.0215454101562, + "learning_rate": 1.4588315992538449e-05, + "loss": 2.6084, + "step": 2687 + }, + { + "epoch": 0.7576633077302516, + "grad_norm": 1080.0255126953125, + "learning_rate": 1.455615931527699e-05, + "loss": 3.2394, + "step": 2688 + }, + { + "epoch": 0.7579451765203298, + "grad_norm": 1040.01513671875, + "learning_rate": 1.4524032079379368e-05, + "loss": 3.1097, + "step": 2689 + }, + { + "epoch": 0.758227045310408, + "grad_norm": 676.0167846679688, + "learning_rate": 1.4491934311532014e-05, + "loss": 2.4028, + "step": 2690 + }, + { + "epoch": 0.7585089141004863, + "grad_norm": 980.0108032226562, + "learning_rate": 1.4459866038396885e-05, + "loss": 2.8697, + "step": 2691 + }, + { + "epoch": 0.7587907828905645, + "grad_norm": 306.03265380859375, + "learning_rate": 1.4427827286611412e-05, + "loss": 2.8933, + "step": 2692 + }, + { + "epoch": 0.7590726516806426, + "grad_norm": 1048.0101318359375, + "learning_rate": 1.439581808278857e-05, + "loss": 2.0918, + "step": 2693 + }, + { + "epoch": 0.7593545204707208, + "grad_norm": 1592.0135498046875, + "learning_rate": 1.436383845351672e-05, + "loss": 2.8311, + "step": 2694 + }, + { + "epoch": 0.7596363892607991, + "grad_norm": 1416.0203857421875, + "learning_rate": 1.4331888425359697e-05, + "loss": 3.0599, + "step": 2695 + }, + { + "epoch": 0.7599182580508773, + "grad_norm": 268.0386657714844, + "learning_rate": 1.4299968024856708e-05, + "loss": 2.8034, + "step": 2696 + }, + { + "epoch": 0.7602001268409555, + "grad_norm": 1160.0157470703125, + "learning_rate": 1.4268077278522407e-05, + "loss": 2.7738, + "step": 2697 + }, + { + "epoch": 0.7604819956310338, + "grad_norm": 1656.0091552734375, + "learning_rate": 1.4236216212846787e-05, + "loss": 2.6266, + "step": 2698 + }, + { + "epoch": 0.760763864421112, + "grad_norm": 1584.0103759765625, + "learning_rate": 1.4204384854295183e-05, + "loss": 2.7939, + "step": 2699 + }, + { + "epoch": 0.7610457332111902, + "grad_norm": 1280.009765625, + "learning_rate": 1.4172583229308245e-05, + "loss": 2.3246, + "step": 2700 + }, + { + "epoch": 0.7613276020012684, + "grad_norm": 1504.01318359375, + "learning_rate": 1.4140811364301932e-05, + "loss": 2.9893, + "step": 2701 + }, + { + "epoch": 0.7616094707913467, + "grad_norm": 736.0086059570312, + "learning_rate": 1.4109069285667515e-05, + "loss": 3.0195, + "step": 2702 + }, + { + "epoch": 0.7618913395814249, + "grad_norm": 932.0107421875, + "learning_rate": 1.4077357019771476e-05, + "loss": 3.0398, + "step": 2703 + }, + { + "epoch": 0.762173208371503, + "grad_norm": 944.008056640625, + "learning_rate": 1.404567459295556e-05, + "loss": 2.7439, + "step": 2704 + }, + { + "epoch": 0.7624550771615812, + "grad_norm": 680.0059204101562, + "learning_rate": 1.4014022031536717e-05, + "loss": 2.4398, + "step": 2705 + }, + { + "epoch": 0.7627369459516595, + "grad_norm": 924.0143432617188, + "learning_rate": 1.3982399361807053e-05, + "loss": 2.8269, + "step": 2706 + }, + { + "epoch": 0.7630188147417377, + "grad_norm": 972.0082397460938, + "learning_rate": 1.3950806610033957e-05, + "loss": 2.4839, + "step": 2707 + }, + { + "epoch": 0.7633006835318159, + "grad_norm": 1488.0133056640625, + "learning_rate": 1.3919243802459847e-05, + "loss": 3.0055, + "step": 2708 + }, + { + "epoch": 0.7635825523218942, + "grad_norm": 1072.010986328125, + "learning_rate": 1.3887710965302331e-05, + "loss": 2.8311, + "step": 2709 + }, + { + "epoch": 0.7638644211119724, + "grad_norm": 1136.005126953125, + "learning_rate": 1.3856208124754089e-05, + "loss": 2.8985, + "step": 2710 + }, + { + "epoch": 0.7641462899020506, + "grad_norm": 1216.0084228515625, + "learning_rate": 1.3824735306982894e-05, + "loss": 3.1113, + "step": 2711 + }, + { + "epoch": 0.7644281586921288, + "grad_norm": 1504.0089111328125, + "learning_rate": 1.3793292538131614e-05, + "loss": 2.9697, + "step": 2712 + }, + { + "epoch": 0.7647100274822071, + "grad_norm": 1080.005615234375, + "learning_rate": 1.3761879844318116e-05, + "loss": 2.7646, + "step": 2713 + }, + { + "epoch": 0.7649918962722853, + "grad_norm": 544.0125732421875, + "learning_rate": 1.3730497251635293e-05, + "loss": 2.8232, + "step": 2714 + }, + { + "epoch": 0.7652737650623634, + "grad_norm": 1184.0155029296875, + "learning_rate": 1.3699144786151036e-05, + "loss": 2.6095, + "step": 2715 + }, + { + "epoch": 0.7655556338524417, + "grad_norm": 1368.0096435546875, + "learning_rate": 1.366782247390822e-05, + "loss": 2.8068, + "step": 2716 + }, + { + "epoch": 0.7658375026425199, + "grad_norm": 1336.0086669921875, + "learning_rate": 1.363653034092469e-05, + "loss": 2.7045, + "step": 2717 + }, + { + "epoch": 0.7661193714325981, + "grad_norm": 1152.03857421875, + "learning_rate": 1.3605268413193184e-05, + "loss": 2.6813, + "step": 2718 + }, + { + "epoch": 0.7664012402226763, + "grad_norm": 1336.0147705078125, + "learning_rate": 1.3574036716681366e-05, + "loss": 3.0599, + "step": 2719 + }, + { + "epoch": 0.7666831090127546, + "grad_norm": 812.0093383789062, + "learning_rate": 1.3542835277331778e-05, + "loss": 2.8239, + "step": 2720 + }, + { + "epoch": 0.7669649778028328, + "grad_norm": 1600.00927734375, + "learning_rate": 1.3511664121061862e-05, + "loss": 3.1706, + "step": 2721 + }, + { + "epoch": 0.767246846592911, + "grad_norm": 2080.010009765625, + "learning_rate": 1.3480523273763878e-05, + "loss": 2.6709, + "step": 2722 + }, + { + "epoch": 0.7675287153829892, + "grad_norm": 704.0118408203125, + "learning_rate": 1.3449412761304919e-05, + "loss": 2.9704, + "step": 2723 + }, + { + "epoch": 0.7678105841730675, + "grad_norm": 996.0108642578125, + "learning_rate": 1.3418332609526868e-05, + "loss": 3.04, + "step": 2724 + }, + { + "epoch": 0.7680924529631457, + "grad_norm": 676.0115966796875, + "learning_rate": 1.3387282844246386e-05, + "loss": 2.7692, + "step": 2725 + }, + { + "epoch": 0.7683743217532238, + "grad_norm": 640.0126953125, + "learning_rate": 1.3356263491254933e-05, + "loss": 2.6501, + "step": 2726 + }, + { + "epoch": 0.7686561905433021, + "grad_norm": 580.01025390625, + "learning_rate": 1.3325274576318657e-05, + "loss": 2.2736, + "step": 2727 + }, + { + "epoch": 0.7689380593333803, + "grad_norm": 984.0110473632812, + "learning_rate": 1.3294316125178474e-05, + "loss": 3.0625, + "step": 2728 + }, + { + "epoch": 0.7692199281234585, + "grad_norm": 956.0078735351562, + "learning_rate": 1.326338816354995e-05, + "loss": 2.9285, + "step": 2729 + }, + { + "epoch": 0.7695017969135367, + "grad_norm": 1448.01123046875, + "learning_rate": 1.323249071712333e-05, + "loss": 3.0144, + "step": 2730 + }, + { + "epoch": 0.769783665703615, + "grad_norm": 992.0071411132812, + "learning_rate": 1.3201623811563546e-05, + "loss": 2.7517, + "step": 2731 + }, + { + "epoch": 0.7700655344936932, + "grad_norm": 876.0098266601562, + "learning_rate": 1.3170787472510132e-05, + "loss": 2.8265, + "step": 2732 + }, + { + "epoch": 0.7703474032837714, + "grad_norm": 1208.0118408203125, + "learning_rate": 1.3139981725577233e-05, + "loss": 3.1657, + "step": 2733 + }, + { + "epoch": 0.7706292720738496, + "grad_norm": 596.0291748046875, + "learning_rate": 1.3109206596353573e-05, + "loss": 2.6354, + "step": 2734 + }, + { + "epoch": 0.7709111408639279, + "grad_norm": 576.0137329101562, + "learning_rate": 1.3078462110402496e-05, + "loss": 2.2291, + "step": 2735 + }, + { + "epoch": 0.7711930096540061, + "grad_norm": 268.0333557128906, + "learning_rate": 1.304774829326183e-05, + "loss": 2.4001, + "step": 2736 + }, + { + "epoch": 0.7714748784440842, + "grad_norm": 2032.012451171875, + "learning_rate": 1.3017065170443948e-05, + "loss": 3.013, + "step": 2737 + }, + { + "epoch": 0.7717567472341625, + "grad_norm": 672.0155029296875, + "learning_rate": 1.2986412767435758e-05, + "loss": 2.5224, + "step": 2738 + }, + { + "epoch": 0.7720386160242407, + "grad_norm": 1064.0155029296875, + "learning_rate": 1.2955791109698606e-05, + "loss": 3.0919, + "step": 2739 + }, + { + "epoch": 0.7723204848143189, + "grad_norm": 1000.0142211914062, + "learning_rate": 1.292520022266831e-05, + "loss": 2.8929, + "step": 2740 + }, + { + "epoch": 0.7726023536043971, + "grad_norm": 1024.01904296875, + "learning_rate": 1.289464013175516e-05, + "loss": 2.9902, + "step": 2741 + }, + { + "epoch": 0.7728842223944754, + "grad_norm": 1376.0196533203125, + "learning_rate": 1.2864110862343832e-05, + "loss": 3.1992, + "step": 2742 + }, + { + "epoch": 0.7731660911845536, + "grad_norm": 1272.01416015625, + "learning_rate": 1.2833612439793402e-05, + "loss": 2.6466, + "step": 2743 + }, + { + "epoch": 0.7734479599746318, + "grad_norm": 1056.0155029296875, + "learning_rate": 1.2803144889437324e-05, + "loss": 3.2985, + "step": 2744 + }, + { + "epoch": 0.7737298287647101, + "grad_norm": 672.0183715820312, + "learning_rate": 1.2772708236583442e-05, + "loss": 2.599, + "step": 2745 + }, + { + "epoch": 0.7740116975547883, + "grad_norm": 1392.0166015625, + "learning_rate": 1.2742302506513892e-05, + "loss": 3.1494, + "step": 2746 + }, + { + "epoch": 0.7742935663448665, + "grad_norm": 1312.0118408203125, + "learning_rate": 1.2711927724485128e-05, + "loss": 3.0345, + "step": 2747 + }, + { + "epoch": 0.7745754351349446, + "grad_norm": 1272.0120849609375, + "learning_rate": 1.268158391572794e-05, + "loss": 2.8367, + "step": 2748 + }, + { + "epoch": 0.7748573039250229, + "grad_norm": 1152.044189453125, + "learning_rate": 1.265127110544732e-05, + "loss": 3.3035, + "step": 2749 + }, + { + "epoch": 0.7751391727151011, + "grad_norm": 536.0335693359375, + "learning_rate": 1.2620989318822596e-05, + "loss": 3.5075, + "step": 2750 + }, + { + "epoch": 0.7754210415051793, + "grad_norm": 1264.0107421875, + "learning_rate": 1.259073858100725e-05, + "loss": 2.5378, + "step": 2751 + }, + { + "epoch": 0.7757029102952575, + "grad_norm": 808.0093383789062, + "learning_rate": 1.2560518917129017e-05, + "loss": 2.6019, + "step": 2752 + }, + { + "epoch": 0.7759847790853358, + "grad_norm": 2000.011474609375, + "learning_rate": 1.2530330352289793e-05, + "loss": 2.6784, + "step": 2753 + }, + { + "epoch": 0.776266647875414, + "grad_norm": 2096.012939453125, + "learning_rate": 1.2500172911565661e-05, + "loss": 2.7712, + "step": 2754 + }, + { + "epoch": 0.7765485166654922, + "grad_norm": 1024.0108642578125, + "learning_rate": 1.2470046620006859e-05, + "loss": 2.8542, + "step": 2755 + }, + { + "epoch": 0.7768303854555705, + "grad_norm": 1008.009033203125, + "learning_rate": 1.2439951502637725e-05, + "loss": 3.1149, + "step": 2756 + }, + { + "epoch": 0.7771122542456487, + "grad_norm": 532.0204467773438, + "learning_rate": 1.2409887584456709e-05, + "loss": 2.6178, + "step": 2757 + }, + { + "epoch": 0.7773941230357269, + "grad_norm": 936.0078125, + "learning_rate": 1.2379854890436376e-05, + "loss": 2.4094, + "step": 2758 + }, + { + "epoch": 0.777675991825805, + "grad_norm": 1012.008544921875, + "learning_rate": 1.2349853445523301e-05, + "loss": 3.0645, + "step": 2759 + }, + { + "epoch": 0.7779578606158833, + "grad_norm": 2000.0147705078125, + "learning_rate": 1.2319883274638167e-05, + "loss": 2.8223, + "step": 2760 + }, + { + "epoch": 0.7782397294059615, + "grad_norm": 1400.0086669921875, + "learning_rate": 1.2289944402675619e-05, + "loss": 3.0212, + "step": 2761 + }, + { + "epoch": 0.7785215981960397, + "grad_norm": 444.016357421875, + "learning_rate": 1.2260036854504331e-05, + "loss": 3.1292, + "step": 2762 + }, + { + "epoch": 0.7788034669861179, + "grad_norm": 1088.0089111328125, + "learning_rate": 1.2230160654966943e-05, + "loss": 2.7582, + "step": 2763 + }, + { + "epoch": 0.7790853357761962, + "grad_norm": 1400.0128173828125, + "learning_rate": 1.2200315828880093e-05, + "loss": 2.9242, + "step": 2764 + }, + { + "epoch": 0.7793672045662744, + "grad_norm": 1960.0108642578125, + "learning_rate": 1.2170502401034329e-05, + "loss": 3.4466, + "step": 2765 + }, + { + "epoch": 0.7796490733563526, + "grad_norm": 430.02099609375, + "learning_rate": 1.2140720396194112e-05, + "loss": 3.2738, + "step": 2766 + }, + { + "epoch": 0.7799309421464309, + "grad_norm": 996.0076904296875, + "learning_rate": 1.2110969839097797e-05, + "loss": 2.8229, + "step": 2767 + }, + { + "epoch": 0.7802128109365091, + "grad_norm": 1072.008544921875, + "learning_rate": 1.208125075445765e-05, + "loss": 2.3793, + "step": 2768 + }, + { + "epoch": 0.7804946797265873, + "grad_norm": 1824.0111083984375, + "learning_rate": 1.205156316695979e-05, + "loss": 3.4365, + "step": 2769 + }, + { + "epoch": 0.7807765485166654, + "grad_norm": 1288.010498046875, + "learning_rate": 1.2021907101264146e-05, + "loss": 2.668, + "step": 2770 + }, + { + "epoch": 0.7810584173067437, + "grad_norm": 1456.011962890625, + "learning_rate": 1.1992282582004477e-05, + "loss": 2.9577, + "step": 2771 + }, + { + "epoch": 0.7813402860968219, + "grad_norm": 896.0418701171875, + "learning_rate": 1.1962689633788338e-05, + "loss": 2.3862, + "step": 2772 + }, + { + "epoch": 0.7816221548869001, + "grad_norm": 1020.0076293945312, + "learning_rate": 1.1933128281197042e-05, + "loss": 2.9421, + "step": 2773 + }, + { + "epoch": 0.7819040236769784, + "grad_norm": 1672.0118408203125, + "learning_rate": 1.1903598548785704e-05, + "loss": 3.0154, + "step": 2774 + }, + { + "epoch": 0.7821858924670566, + "grad_norm": 668.0087890625, + "learning_rate": 1.1874100461083132e-05, + "loss": 3.2419, + "step": 2775 + }, + { + "epoch": 0.7824677612571348, + "grad_norm": 1248.010009765625, + "learning_rate": 1.1844634042591857e-05, + "loss": 2.7816, + "step": 2776 + }, + { + "epoch": 0.782749630047213, + "grad_norm": 1072.0062255859375, + "learning_rate": 1.18151993177881e-05, + "loss": 2.6908, + "step": 2777 + }, + { + "epoch": 0.7830314988372913, + "grad_norm": 1344.0096435546875, + "learning_rate": 1.1785796311121767e-05, + "loss": 2.7565, + "step": 2778 + }, + { + "epoch": 0.7833133676273695, + "grad_norm": 1240.01220703125, + "learning_rate": 1.1756425047016439e-05, + "loss": 3.0391, + "step": 2779 + }, + { + "epoch": 0.7835952364174477, + "grad_norm": 494.0181884765625, + "learning_rate": 1.1727085549869282e-05, + "loss": 2.8382, + "step": 2780 + }, + { + "epoch": 0.7838771052075258, + "grad_norm": 888.0439453125, + "learning_rate": 1.1697777844051105e-05, + "loss": 3.0596, + "step": 2781 + }, + { + "epoch": 0.7841589739976041, + "grad_norm": 1392.0130615234375, + "learning_rate": 1.166850195390628e-05, + "loss": 2.9095, + "step": 2782 + }, + { + "epoch": 0.7844408427876823, + "grad_norm": 536.0239868164062, + "learning_rate": 1.1639257903752809e-05, + "loss": 3.2895, + "step": 2783 + }, + { + "epoch": 0.7847227115777605, + "grad_norm": 1864.0225830078125, + "learning_rate": 1.1610045717882195e-05, + "loss": 3.0652, + "step": 2784 + }, + { + "epoch": 0.7850045803678388, + "grad_norm": 1432.012939453125, + "learning_rate": 1.1580865420559489e-05, + "loss": 2.8705, + "step": 2785 + }, + { + "epoch": 0.785286449157917, + "grad_norm": 640.0146484375, + "learning_rate": 1.1551717036023247e-05, + "loss": 2.5437, + "step": 2786 + }, + { + "epoch": 0.7855683179479952, + "grad_norm": 604.0247192382812, + "learning_rate": 1.1522600588485516e-05, + "loss": 2.7327, + "step": 2787 + }, + { + "epoch": 0.7858501867380734, + "grad_norm": 1280.01318359375, + "learning_rate": 1.1493516102131834e-05, + "loss": 3.4538, + "step": 2788 + }, + { + "epoch": 0.7861320555281517, + "grad_norm": 1184.0172119140625, + "learning_rate": 1.1464463601121194e-05, + "loss": 2.1985, + "step": 2789 + }, + { + "epoch": 0.7864139243182299, + "grad_norm": 1704.0093994140625, + "learning_rate": 1.1435443109585992e-05, + "loss": 3.1165, + "step": 2790 + }, + { + "epoch": 0.7866957931083081, + "grad_norm": 1432.0146484375, + "learning_rate": 1.1406454651632042e-05, + "loss": 4.0902, + "step": 2791 + }, + { + "epoch": 0.7869776618983862, + "grad_norm": 632.0132446289062, + "learning_rate": 1.1377498251338542e-05, + "loss": 2.5458, + "step": 2792 + }, + { + "epoch": 0.7872595306884645, + "grad_norm": 848.0181884765625, + "learning_rate": 1.134857393275811e-05, + "loss": 3.3285, + "step": 2793 + }, + { + "epoch": 0.7875413994785427, + "grad_norm": 1952.01806640625, + "learning_rate": 1.1319681719916664e-05, + "loss": 2.9727, + "step": 2794 + }, + { + "epoch": 0.7878232682686209, + "grad_norm": 1736.01416015625, + "learning_rate": 1.1290821636813464e-05, + "loss": 3.8906, + "step": 2795 + }, + { + "epoch": 0.7881051370586992, + "grad_norm": 1328.0072021484375, + "learning_rate": 1.1261993707421081e-05, + "loss": 2.534, + "step": 2796 + }, + { + "epoch": 0.7883870058487774, + "grad_norm": 924.0145263671875, + "learning_rate": 1.1233197955685409e-05, + "loss": 2.7389, + "step": 2797 + }, + { + "epoch": 0.7886688746388556, + "grad_norm": 872.0098266601562, + "learning_rate": 1.1204434405525555e-05, + "loss": 2.4356, + "step": 2798 + }, + { + "epoch": 0.7889507434289338, + "grad_norm": 358.0157775878906, + "learning_rate": 1.1175703080833943e-05, + "loss": 2.3816, + "step": 2799 + }, + { + "epoch": 0.7892326122190121, + "grad_norm": 608.0137329101562, + "learning_rate": 1.1147004005476191e-05, + "loss": 2.648, + "step": 2800 + }, + { + "epoch": 0.7895144810090903, + "grad_norm": 1720.0076904296875, + "learning_rate": 1.1118337203291124e-05, + "loss": 3.4072, + "step": 2801 + }, + { + "epoch": 0.7897963497991685, + "grad_norm": 700.0078735351562, + "learning_rate": 1.1089702698090759e-05, + "loss": 2.9744, + "step": 2802 + }, + { + "epoch": 0.7900782185892467, + "grad_norm": 780.0091552734375, + "learning_rate": 1.1061100513660334e-05, + "loss": 2.2512, + "step": 2803 + }, + { + "epoch": 0.790360087379325, + "grad_norm": 1640.0069580078125, + "learning_rate": 1.1032530673758174e-05, + "loss": 3.2402, + "step": 2804 + }, + { + "epoch": 0.7906419561694031, + "grad_norm": 1352.0126953125, + "learning_rate": 1.100399320211578e-05, + "loss": 2.851, + "step": 2805 + }, + { + "epoch": 0.7909238249594813, + "grad_norm": 688.0086669921875, + "learning_rate": 1.0975488122437732e-05, + "loss": 3.1253, + "step": 2806 + }, + { + "epoch": 0.7912056937495596, + "grad_norm": 1352.0111083984375, + "learning_rate": 1.0947015458401754e-05, + "loss": 3.1638, + "step": 2807 + }, + { + "epoch": 0.7914875625396378, + "grad_norm": 2176.0107421875, + "learning_rate": 1.0918575233658584e-05, + "loss": 3.0603, + "step": 2808 + }, + { + "epoch": 0.791769431329716, + "grad_norm": 502.0118103027344, + "learning_rate": 1.0890167471832081e-05, + "loss": 2.9121, + "step": 2809 + }, + { + "epoch": 0.7920513001197942, + "grad_norm": 348.0148620605469, + "learning_rate": 1.0861792196519072e-05, + "loss": 2.5088, + "step": 2810 + }, + { + "epoch": 0.7923331689098725, + "grad_norm": 1280.010009765625, + "learning_rate": 1.0833449431289433e-05, + "loss": 3.1202, + "step": 2811 + }, + { + "epoch": 0.7926150376999507, + "grad_norm": 342.0120849609375, + "learning_rate": 1.0805139199686048e-05, + "loss": 2.6156, + "step": 2812 + }, + { + "epoch": 0.7928969064900289, + "grad_norm": 1376.0089111328125, + "learning_rate": 1.0776861525224751e-05, + "loss": 2.5863, + "step": 2813 + }, + { + "epoch": 0.7931787752801072, + "grad_norm": 676.0072021484375, + "learning_rate": 1.0748616431394342e-05, + "loss": 2.6729, + "step": 2814 + }, + { + "epoch": 0.7934606440701854, + "grad_norm": 820.0234375, + "learning_rate": 1.0720403941656548e-05, + "loss": 2.5732, + "step": 2815 + }, + { + "epoch": 0.7937425128602635, + "grad_norm": 1448.0093994140625, + "learning_rate": 1.069222407944605e-05, + "loss": 3.2132, + "step": 2816 + }, + { + "epoch": 0.7940243816503417, + "grad_norm": 336.0115051269531, + "learning_rate": 1.0664076868170381e-05, + "loss": 2.4756, + "step": 2817 + }, + { + "epoch": 0.79430625044042, + "grad_norm": 1856.013916015625, + "learning_rate": 1.063596233120997e-05, + "loss": 2.575, + "step": 2818 + }, + { + "epoch": 0.7945881192304982, + "grad_norm": 1200.0093994140625, + "learning_rate": 1.060788049191812e-05, + "loss": 3.1979, + "step": 2819 + }, + { + "epoch": 0.7948699880205764, + "grad_norm": 1256.068115234375, + "learning_rate": 1.0579831373620963e-05, + "loss": 3.1328, + "step": 2820 + }, + { + "epoch": 0.7951518568106546, + "grad_norm": 1096.010009765625, + "learning_rate": 1.0551814999617431e-05, + "loss": 2.7998, + "step": 2821 + }, + { + "epoch": 0.7954337256007329, + "grad_norm": 1224.0059814453125, + "learning_rate": 1.0523831393179295e-05, + "loss": 2.6427, + "step": 2822 + }, + { + "epoch": 0.7957155943908111, + "grad_norm": 920.0068359375, + "learning_rate": 1.0495880577551086e-05, + "loss": 2.5039, + "step": 2823 + }, + { + "epoch": 0.7959974631808893, + "grad_norm": 1272.008056640625, + "learning_rate": 1.0467962575950096e-05, + "loss": 2.8717, + "step": 2824 + }, + { + "epoch": 0.7962793319709676, + "grad_norm": 2112.013916015625, + "learning_rate": 1.0440077411566345e-05, + "loss": 3.2555, + "step": 2825 + }, + { + "epoch": 0.7965612007610458, + "grad_norm": 1012.0064697265625, + "learning_rate": 1.0412225107562629e-05, + "loss": 3.0241, + "step": 2826 + }, + { + "epoch": 0.7968430695511239, + "grad_norm": 680.0143432617188, + "learning_rate": 1.03844056870744e-05, + "loss": 3.2458, + "step": 2827 + }, + { + "epoch": 0.7971249383412021, + "grad_norm": 972.0093383789062, + "learning_rate": 1.0356619173209808e-05, + "loss": 2.6097, + "step": 2828 + }, + { + "epoch": 0.7974068071312804, + "grad_norm": 992.0060424804688, + "learning_rate": 1.0328865589049664e-05, + "loss": 2.6204, + "step": 2829 + }, + { + "epoch": 0.7976886759213586, + "grad_norm": 1352.0098876953125, + "learning_rate": 1.0301144957647441e-05, + "loss": 2.9925, + "step": 2830 + }, + { + "epoch": 0.7979705447114368, + "grad_norm": 1104.0333251953125, + "learning_rate": 1.0273457302029255e-05, + "loss": 2.7917, + "step": 2831 + }, + { + "epoch": 0.798252413501515, + "grad_norm": 672.016357421875, + "learning_rate": 1.0245802645193781e-05, + "loss": 3.0537, + "step": 2832 + }, + { + "epoch": 0.7985342822915933, + "grad_norm": 1112.0477294921875, + "learning_rate": 1.0218181010112321e-05, + "loss": 2.339, + "step": 2833 + }, + { + "epoch": 0.7988161510816715, + "grad_norm": 1768.018798828125, + "learning_rate": 1.0190592419728739e-05, + "loss": 3.516, + "step": 2834 + }, + { + "epoch": 0.7990980198717497, + "grad_norm": 840.143310546875, + "learning_rate": 1.0163036896959433e-05, + "loss": 2.563, + "step": 2835 + }, + { + "epoch": 0.799379888661828, + "grad_norm": 1936.009033203125, + "learning_rate": 1.0135514464693369e-05, + "loss": 2.7074, + "step": 2836 + }, + { + "epoch": 0.7996617574519062, + "grad_norm": 628.0197143554688, + "learning_rate": 1.0108025145792004e-05, + "loss": 3.1152, + "step": 2837 + }, + { + "epoch": 0.7999436262419843, + "grad_norm": 1048.0125732421875, + "learning_rate": 1.0080568963089287e-05, + "loss": 3.1247, + "step": 2838 + }, + { + "epoch": 0.8002254950320625, + "grad_norm": 932.0121459960938, + "learning_rate": 1.005314593939164e-05, + "loss": 2.6244, + "step": 2839 + }, + { + "epoch": 0.8005073638221408, + "grad_norm": 1480.015625, + "learning_rate": 1.002575609747795e-05, + "loss": 3.5762, + "step": 2840 + }, + { + "epoch": 0.800789232612219, + "grad_norm": 1144.013916015625, + "learning_rate": 9.998399460099572e-06, + "loss": 3.2105, + "step": 2841 + }, + { + "epoch": 0.8010711014022972, + "grad_norm": 1208.013427734375, + "learning_rate": 9.971076049980222e-06, + "loss": 2.9464, + "step": 2842 + }, + { + "epoch": 0.8013529701923755, + "grad_norm": 1312.012939453125, + "learning_rate": 9.943785889816043e-06, + "loss": 3.1628, + "step": 2843 + }, + { + "epoch": 0.8016348389824537, + "grad_norm": 1152.0133056640625, + "learning_rate": 9.916529002275554e-06, + "loss": 2.5962, + "step": 2844 + }, + { + "epoch": 0.8019167077725319, + "grad_norm": 756.0203857421875, + "learning_rate": 9.889305409999655e-06, + "loss": 2.8753, + "step": 2845 + }, + { + "epoch": 0.8021985765626101, + "grad_norm": 1208.0137939453125, + "learning_rate": 9.862115135601573e-06, + "loss": 3.0467, + "step": 2846 + }, + { + "epoch": 0.8024804453526884, + "grad_norm": 1600.0167236328125, + "learning_rate": 9.83495820166686e-06, + "loss": 2.395, + "step": 2847 + }, + { + "epoch": 0.8027623141427666, + "grad_norm": 1472.0133056640625, + "learning_rate": 9.807834630753365e-06, + "loss": 2.2827, + "step": 2848 + }, + { + "epoch": 0.8030441829328447, + "grad_norm": 936.01318359375, + "learning_rate": 9.780744445391221e-06, + "loss": 2.5237, + "step": 2849 + }, + { + "epoch": 0.8033260517229229, + "grad_norm": 1984.015380859375, + "learning_rate": 9.753687668082889e-06, + "loss": 2.9787, + "step": 2850 + }, + { + "epoch": 0.8036079205130012, + "grad_norm": 1776.007080078125, + "learning_rate": 9.726664321303008e-06, + "loss": 2.7793, + "step": 2851 + }, + { + "epoch": 0.8038897893030794, + "grad_norm": 1416.0084228515625, + "learning_rate": 9.699674427498472e-06, + "loss": 2.82, + "step": 2852 + }, + { + "epoch": 0.8041716580931576, + "grad_norm": 896.011962890625, + "learning_rate": 9.672718009088388e-06, + "loss": 2.9779, + "step": 2853 + }, + { + "epoch": 0.8044535268832359, + "grad_norm": 780.0089721679688, + "learning_rate": 9.64579508846405e-06, + "loss": 2.8929, + "step": 2854 + }, + { + "epoch": 0.8047353956733141, + "grad_norm": 764.0076904296875, + "learning_rate": 9.618905687988954e-06, + "loss": 2.8626, + "step": 2855 + }, + { + "epoch": 0.8050172644633923, + "grad_norm": 692.0260009765625, + "learning_rate": 9.592049829998728e-06, + "loss": 2.7654, + "step": 2856 + }, + { + "epoch": 0.8052991332534705, + "grad_norm": 708.0128173828125, + "learning_rate": 9.565227536801135e-06, + "loss": 3.2559, + "step": 2857 + }, + { + "epoch": 0.8055810020435488, + "grad_norm": 1480.0064697265625, + "learning_rate": 9.53843883067606e-06, + "loss": 2.556, + "step": 2858 + }, + { + "epoch": 0.805862870833627, + "grad_norm": 652.0081176757812, + "learning_rate": 9.511683733875498e-06, + "loss": 2.7654, + "step": 2859 + }, + { + "epoch": 0.8061447396237051, + "grad_norm": 436.01220703125, + "learning_rate": 9.484962268623549e-06, + "loss": 2.8135, + "step": 2860 + }, + { + "epoch": 0.8064266084137833, + "grad_norm": 688.0082397460938, + "learning_rate": 9.458274457116324e-06, + "loss": 2.5112, + "step": 2861 + }, + { + "epoch": 0.8067084772038616, + "grad_norm": 992.0109252929688, + "learning_rate": 9.431620321522017e-06, + "loss": 3.3011, + "step": 2862 + }, + { + "epoch": 0.8069903459939398, + "grad_norm": 324.0146179199219, + "learning_rate": 9.40499988398082e-06, + "loss": 1.949, + "step": 2863 + }, + { + "epoch": 0.807272214784018, + "grad_norm": 1112.0084228515625, + "learning_rate": 9.378413166604983e-06, + "loss": 2.4323, + "step": 2864 + }, + { + "epoch": 0.8075540835740963, + "grad_norm": 1704.0135498046875, + "learning_rate": 9.351860191478696e-06, + "loss": 3.0957, + "step": 2865 + }, + { + "epoch": 0.8078359523641745, + "grad_norm": 1248.0059814453125, + "learning_rate": 9.325340980658147e-06, + "loss": 2.6504, + "step": 2866 + }, + { + "epoch": 0.8081178211542527, + "grad_norm": 920.0089111328125, + "learning_rate": 9.298855556171471e-06, + "loss": 3.237, + "step": 2867 + }, + { + "epoch": 0.8083996899443309, + "grad_norm": 1296.009765625, + "learning_rate": 9.272403940018726e-06, + "loss": 2.6077, + "step": 2868 + }, + { + "epoch": 0.8086815587344092, + "grad_norm": 576.0112915039062, + "learning_rate": 9.245986154171915e-06, + "loss": 2.7949, + "step": 2869 + }, + { + "epoch": 0.8089634275244874, + "grad_norm": 1020.0110473632812, + "learning_rate": 9.219602220574936e-06, + "loss": 2.9821, + "step": 2870 + }, + { + "epoch": 0.8092452963145655, + "grad_norm": 1536.0101318359375, + "learning_rate": 9.19325216114354e-06, + "loss": 2.9274, + "step": 2871 + }, + { + "epoch": 0.8095271651046438, + "grad_norm": 752.0095825195312, + "learning_rate": 9.166935997765363e-06, + "loss": 2.7441, + "step": 2872 + }, + { + "epoch": 0.809809033894722, + "grad_norm": 612.0087280273438, + "learning_rate": 9.140653752299871e-06, + "loss": 2.6492, + "step": 2873 + }, + { + "epoch": 0.8100909026848002, + "grad_norm": 1296.0126953125, + "learning_rate": 9.114405446578378e-06, + "loss": 3.4609, + "step": 2874 + }, + { + "epoch": 0.8103727714748784, + "grad_norm": 828.0119018554688, + "learning_rate": 9.088191102403992e-06, + "loss": 2.451, + "step": 2875 + }, + { + "epoch": 0.8106546402649567, + "grad_norm": 1088.00634765625, + "learning_rate": 9.062010741551607e-06, + "loss": 3.0593, + "step": 2876 + }, + { + "epoch": 0.8109365090550349, + "grad_norm": 1120.0087890625, + "learning_rate": 9.035864385767879e-06, + "loss": 3.0713, + "step": 2877 + }, + { + "epoch": 0.8112183778451131, + "grad_norm": 576.010986328125, + "learning_rate": 9.009752056771258e-06, + "loss": 2.4227, + "step": 2878 + }, + { + "epoch": 0.8115002466351913, + "grad_norm": 940.0123901367188, + "learning_rate": 8.983673776251894e-06, + "loss": 2.6035, + "step": 2879 + }, + { + "epoch": 0.8117821154252696, + "grad_norm": 1416.0103759765625, + "learning_rate": 8.957629565871656e-06, + "loss": 2.7344, + "step": 2880 + }, + { + "epoch": 0.8120639842153478, + "grad_norm": 760.0107421875, + "learning_rate": 8.93161944726414e-06, + "loss": 2.5568, + "step": 2881 + }, + { + "epoch": 0.8123458530054259, + "grad_norm": 1760.0174560546875, + "learning_rate": 8.905643442034588e-06, + "loss": 2.9362, + "step": 2882 + }, + { + "epoch": 0.8126277217955042, + "grad_norm": 1020.026123046875, + "learning_rate": 8.879701571759925e-06, + "loss": 2.3418, + "step": 2883 + }, + { + "epoch": 0.8129095905855824, + "grad_norm": 1520.008544921875, + "learning_rate": 8.853793857988735e-06, + "loss": 2.6931, + "step": 2884 + }, + { + "epoch": 0.8131914593756606, + "grad_norm": 294.06402587890625, + "learning_rate": 8.827920322241202e-06, + "loss": 2.987, + "step": 2885 + }, + { + "epoch": 0.8134733281657388, + "grad_norm": 952.0210571289062, + "learning_rate": 8.802080986009136e-06, + "loss": 2.821, + "step": 2886 + }, + { + "epoch": 0.8137551969558171, + "grad_norm": 320.03289794921875, + "learning_rate": 8.776275870755924e-06, + "loss": 2.6506, + "step": 2887 + }, + { + "epoch": 0.8140370657458953, + "grad_norm": 1728.0255126953125, + "learning_rate": 8.750504997916564e-06, + "loss": 3.7181, + "step": 2888 + }, + { + "epoch": 0.8143189345359735, + "grad_norm": 1512.015625, + "learning_rate": 8.724768388897575e-06, + "loss": 2.656, + "step": 2889 + }, + { + "epoch": 0.8146008033260517, + "grad_norm": 1096.0106201171875, + "learning_rate": 8.699066065077006e-06, + "loss": 2.4392, + "step": 2890 + }, + { + "epoch": 0.81488267211613, + "grad_norm": 1120.0157470703125, + "learning_rate": 8.673398047804481e-06, + "loss": 2.7722, + "step": 2891 + }, + { + "epoch": 0.8151645409062082, + "grad_norm": 1208.01171875, + "learning_rate": 8.647764358401062e-06, + "loss": 2.3096, + "step": 2892 + }, + { + "epoch": 0.8154464096962863, + "grad_norm": 844.0252685546875, + "learning_rate": 8.622165018159356e-06, + "loss": 3.0591, + "step": 2893 + }, + { + "epoch": 0.8157282784863646, + "grad_norm": 1296.0164794921875, + "learning_rate": 8.596600048343401e-06, + "loss": 2.7987, + "step": 2894 + }, + { + "epoch": 0.8160101472764428, + "grad_norm": 1608.011474609375, + "learning_rate": 8.571069470188685e-06, + "loss": 2.5372, + "step": 2895 + }, + { + "epoch": 0.816292016066521, + "grad_norm": 510.02911376953125, + "learning_rate": 8.54557330490215e-06, + "loss": 3.292, + "step": 2896 + }, + { + "epoch": 0.8165738848565992, + "grad_norm": 1488.01416015625, + "learning_rate": 8.520111573662115e-06, + "loss": 3.1875, + "step": 2897 + }, + { + "epoch": 0.8168557536466775, + "grad_norm": 1840.016845703125, + "learning_rate": 8.494684297618355e-06, + "loss": 3.3916, + "step": 2898 + }, + { + "epoch": 0.8171376224367557, + "grad_norm": 384.0440979003906, + "learning_rate": 8.46929149789198e-06, + "loss": 2.8358, + "step": 2899 + }, + { + "epoch": 0.8174194912268339, + "grad_norm": 1840.0067138671875, + "learning_rate": 8.443933195575455e-06, + "loss": 3.166, + "step": 2900 + }, + { + "epoch": 0.8177013600169122, + "grad_norm": 832.0128173828125, + "learning_rate": 8.418609411732642e-06, + "loss": 2.5437, + "step": 2901 + }, + { + "epoch": 0.8179832288069904, + "grad_norm": 596.0226440429688, + "learning_rate": 8.393320167398672e-06, + "loss": 2.9557, + "step": 2902 + }, + { + "epoch": 0.8182650975970686, + "grad_norm": 1280.008544921875, + "learning_rate": 8.368065483580034e-06, + "loss": 3.012, + "step": 2903 + }, + { + "epoch": 0.8185469663871467, + "grad_norm": 2240.0146484375, + "learning_rate": 8.342845381254472e-06, + "loss": 3.1816, + "step": 2904 + }, + { + "epoch": 0.818828835177225, + "grad_norm": 1632.0146484375, + "learning_rate": 8.31765988137102e-06, + "loss": 2.5612, + "step": 2905 + }, + { + "epoch": 0.8191107039673032, + "grad_norm": 792.0211181640625, + "learning_rate": 8.292509004849958e-06, + "loss": 2.8257, + "step": 2906 + }, + { + "epoch": 0.8193925727573814, + "grad_norm": 1096.01513671875, + "learning_rate": 8.267392772582844e-06, + "loss": 2.6525, + "step": 2907 + }, + { + "epoch": 0.8196744415474596, + "grad_norm": 334.02984619140625, + "learning_rate": 8.242311205432418e-06, + "loss": 3.4277, + "step": 2908 + }, + { + "epoch": 0.8199563103375379, + "grad_norm": 892.0275268554688, + "learning_rate": 8.217264324232638e-06, + "loss": 2.8649, + "step": 2909 + }, + { + "epoch": 0.8202381791276161, + "grad_norm": 892.011474609375, + "learning_rate": 8.192252149788642e-06, + "loss": 3.0684, + "step": 2910 + }, + { + "epoch": 0.8205200479176943, + "grad_norm": 1392.016845703125, + "learning_rate": 8.167274702876765e-06, + "loss": 2.6554, + "step": 2911 + }, + { + "epoch": 0.8208019167077726, + "grad_norm": 844.0177612304688, + "learning_rate": 8.14233200424448e-06, + "loss": 2.7725, + "step": 2912 + }, + { + "epoch": 0.8210837854978508, + "grad_norm": 952.0107421875, + "learning_rate": 8.117424074610391e-06, + "loss": 3.0163, + "step": 2913 + }, + { + "epoch": 0.821365654287929, + "grad_norm": 1760.0103759765625, + "learning_rate": 8.092550934664227e-06, + "loss": 3.1016, + "step": 2914 + }, + { + "epoch": 0.8216475230780071, + "grad_norm": 580.0126342773438, + "learning_rate": 8.067712605066812e-06, + "loss": 2.4119, + "step": 2915 + }, + { + "epoch": 0.8219293918680854, + "grad_norm": 1376.009765625, + "learning_rate": 8.042909106450058e-06, + "loss": 2.738, + "step": 2916 + }, + { + "epoch": 0.8222112606581636, + "grad_norm": 1272.009033203125, + "learning_rate": 8.018140459416962e-06, + "loss": 2.9544, + "step": 2917 + }, + { + "epoch": 0.8224931294482418, + "grad_norm": 712.0115356445312, + "learning_rate": 7.993406684541548e-06, + "loss": 3.0346, + "step": 2918 + }, + { + "epoch": 0.82277499823832, + "grad_norm": 1200.0115966796875, + "learning_rate": 7.968707802368891e-06, + "loss": 3.1738, + "step": 2919 + }, + { + "epoch": 0.8230568670283983, + "grad_norm": 1960.0111083984375, + "learning_rate": 7.944043833415044e-06, + "loss": 3.4593, + "step": 2920 + }, + { + "epoch": 0.8233387358184765, + "grad_norm": 1088.0162353515625, + "learning_rate": 7.91941479816712e-06, + "loss": 3.0114, + "step": 2921 + }, + { + "epoch": 0.8236206046085547, + "grad_norm": 1296.023193359375, + "learning_rate": 7.894820717083184e-06, + "loss": 2.6706, + "step": 2922 + }, + { + "epoch": 0.823902473398633, + "grad_norm": 1360.0135498046875, + "learning_rate": 7.870261610592255e-06, + "loss": 3.3031, + "step": 2923 + }, + { + "epoch": 0.8241843421887112, + "grad_norm": 1440.0074462890625, + "learning_rate": 7.84573749909432e-06, + "loss": 3.0374, + "step": 2924 + }, + { + "epoch": 0.8244662109787894, + "grad_norm": 1704.0238037109375, + "learning_rate": 7.821248402960268e-06, + "loss": 2.6777, + "step": 2925 + }, + { + "epoch": 0.8247480797688675, + "grad_norm": 1632.00732421875, + "learning_rate": 7.796794342531948e-06, + "loss": 3.1331, + "step": 2926 + }, + { + "epoch": 0.8250299485589458, + "grad_norm": 1144.01513671875, + "learning_rate": 7.772375338122078e-06, + "loss": 3.2585, + "step": 2927 + }, + { + "epoch": 0.825311817349024, + "grad_norm": 1464.0203857421875, + "learning_rate": 7.747991410014255e-06, + "loss": 2.8138, + "step": 2928 + }, + { + "epoch": 0.8255936861391022, + "grad_norm": 928.0152587890625, + "learning_rate": 7.723642578462947e-06, + "loss": 2.9451, + "step": 2929 + }, + { + "epoch": 0.8258755549291804, + "grad_norm": 1328.0216064453125, + "learning_rate": 7.699328863693456e-06, + "loss": 2.6738, + "step": 2930 + }, + { + "epoch": 0.8261574237192587, + "grad_norm": 1712.0321044921875, + "learning_rate": 7.675050285901936e-06, + "loss": 2.9633, + "step": 2931 + }, + { + "epoch": 0.8264392925093369, + "grad_norm": 239.21519470214844, + "learning_rate": 7.650806865255361e-06, + "loss": 3.102, + "step": 2932 + }, + { + "epoch": 0.8267211612994151, + "grad_norm": 1136.0206298828125, + "learning_rate": 7.626598621891473e-06, + "loss": 2.165, + "step": 2933 + }, + { + "epoch": 0.8270030300894934, + "grad_norm": 1936.0135498046875, + "learning_rate": 7.602425575918803e-06, + "loss": 3.0935, + "step": 2934 + }, + { + "epoch": 0.8272848988795716, + "grad_norm": 980.0166015625, + "learning_rate": 7.57828774741664e-06, + "loss": 2.7301, + "step": 2935 + }, + { + "epoch": 0.8275667676696498, + "grad_norm": 656.0333251953125, + "learning_rate": 7.554185156435056e-06, + "loss": 2.4702, + "step": 2936 + }, + { + "epoch": 0.827848636459728, + "grad_norm": 908.0184936523438, + "learning_rate": 7.530117822994809e-06, + "loss": 2.8099, + "step": 2937 + }, + { + "epoch": 0.8281305052498062, + "grad_norm": 1632.0205078125, + "learning_rate": 7.506085767087384e-06, + "loss": 3.4017, + "step": 2938 + }, + { + "epoch": 0.8284123740398844, + "grad_norm": 1440.0118408203125, + "learning_rate": 7.482089008674964e-06, + "loss": 3.0098, + "step": 2939 + }, + { + "epoch": 0.8286942428299626, + "grad_norm": 2224.023193359375, + "learning_rate": 7.458127567690426e-06, + "loss": 3.5223, + "step": 2940 + }, + { + "epoch": 0.8289761116200409, + "grad_norm": 1352.029296875, + "learning_rate": 7.434201464037288e-06, + "loss": 2.5622, + "step": 2941 + }, + { + "epoch": 0.8292579804101191, + "grad_norm": 596.0206298828125, + "learning_rate": 7.4103107175897355e-06, + "loss": 2.1895, + "step": 2942 + }, + { + "epoch": 0.8295398492001973, + "grad_norm": 1000.0214233398438, + "learning_rate": 7.386455348192578e-06, + "loss": 3.5984, + "step": 2943 + }, + { + "epoch": 0.8298217179902755, + "grad_norm": 716.0177001953125, + "learning_rate": 7.362635375661226e-06, + "loss": 2.559, + "step": 2944 + }, + { + "epoch": 0.8301035867803538, + "grad_norm": 3040.035400390625, + "learning_rate": 7.338850819781684e-06, + "loss": 3.0703, + "step": 2945 + }, + { + "epoch": 0.830385455570432, + "grad_norm": 1488.0177001953125, + "learning_rate": 7.3151017003105836e-06, + "loss": 3.3219, + "step": 2946 + }, + { + "epoch": 0.8306673243605102, + "grad_norm": 1768.021240234375, + "learning_rate": 7.291388036975072e-06, + "loss": 3.6953, + "step": 2947 + }, + { + "epoch": 0.8309491931505883, + "grad_norm": 1200.0137939453125, + "learning_rate": 7.267709849472865e-06, + "loss": 2.8799, + "step": 2948 + }, + { + "epoch": 0.8312310619406666, + "grad_norm": 812.0151977539062, + "learning_rate": 7.2440671574721895e-06, + "loss": 2.682, + "step": 2949 + }, + { + "epoch": 0.8315129307307448, + "grad_norm": 1272.0135498046875, + "learning_rate": 7.220459980611838e-06, + "loss": 2.714, + "step": 2950 + }, + { + "epoch": 0.831794799520823, + "grad_norm": 1184.0069580078125, + "learning_rate": 7.19688833850104e-06, + "loss": 2.8268, + "step": 2951 + }, + { + "epoch": 0.8320766683109013, + "grad_norm": 672.0144653320312, + "learning_rate": 7.173352250719561e-06, + "loss": 2.6673, + "step": 2952 + }, + { + "epoch": 0.8323585371009795, + "grad_norm": 884.01025390625, + "learning_rate": 7.149851736817609e-06, + "loss": 2.5391, + "step": 2953 + }, + { + "epoch": 0.8326404058910577, + "grad_norm": 568.0142211914062, + "learning_rate": 7.126386816315822e-06, + "loss": 2.8454, + "step": 2954 + }, + { + "epoch": 0.8329222746811359, + "grad_norm": 1984.0128173828125, + "learning_rate": 7.102957508705327e-06, + "loss": 2.9775, + "step": 2955 + }, + { + "epoch": 0.8332041434712142, + "grad_norm": 1248.0108642578125, + "learning_rate": 7.079563833447617e-06, + "loss": 2.7575, + "step": 2956 + }, + { + "epoch": 0.8334860122612924, + "grad_norm": 948.00830078125, + "learning_rate": 7.0562058099746085e-06, + "loss": 2.7975, + "step": 2957 + }, + { + "epoch": 0.8337678810513706, + "grad_norm": 2040.011474609375, + "learning_rate": 7.032883457688599e-06, + "loss": 2.7272, + "step": 2958 + }, + { + "epoch": 0.8340497498414488, + "grad_norm": 680.0099487304688, + "learning_rate": 7.009596795962275e-06, + "loss": 3.1091, + "step": 2959 + }, + { + "epoch": 0.834331618631527, + "grad_norm": 892.01171875, + "learning_rate": 6.986345844138653e-06, + "loss": 2.4679, + "step": 2960 + }, + { + "epoch": 0.8346134874216052, + "grad_norm": 732.0101318359375, + "learning_rate": 6.963130621531077e-06, + "loss": 2.5895, + "step": 2961 + }, + { + "epoch": 0.8348953562116834, + "grad_norm": 1536.0069580078125, + "learning_rate": 6.939951147423268e-06, + "loss": 2.4103, + "step": 2962 + }, + { + "epoch": 0.8351772250017617, + "grad_norm": 1200.0096435546875, + "learning_rate": 6.91680744106919e-06, + "loss": 2.9011, + "step": 2963 + }, + { + "epoch": 0.8354590937918399, + "grad_norm": 708.0147705078125, + "learning_rate": 6.893699521693114e-06, + "loss": 2.6771, + "step": 2964 + }, + { + "epoch": 0.8357409625819181, + "grad_norm": 1012.0111694335938, + "learning_rate": 6.870627408489616e-06, + "loss": 2.9753, + "step": 2965 + }, + { + "epoch": 0.8360228313719963, + "grad_norm": 1104.009033203125, + "learning_rate": 6.847591120623498e-06, + "loss": 2.6406, + "step": 2966 + }, + { + "epoch": 0.8363047001620746, + "grad_norm": 1400.0079345703125, + "learning_rate": 6.824590677229808e-06, + "loss": 2.7425, + "step": 2967 + }, + { + "epoch": 0.8365865689521528, + "grad_norm": 992.0194091796875, + "learning_rate": 6.801626097413816e-06, + "loss": 2.8125, + "step": 2968 + }, + { + "epoch": 0.836868437742231, + "grad_norm": 2320.007080078125, + "learning_rate": 6.7786974002510325e-06, + "loss": 2.9577, + "step": 2969 + }, + { + "epoch": 0.8371503065323093, + "grad_norm": 748.0097045898438, + "learning_rate": 6.75580460478712e-06, + "loss": 2.8926, + "step": 2970 + }, + { + "epoch": 0.8374321753223875, + "grad_norm": 133.04644775390625, + "learning_rate": 6.7329477300379366e-06, + "loss": 3.1758, + "step": 2971 + }, + { + "epoch": 0.8377140441124656, + "grad_norm": 1152.009765625, + "learning_rate": 6.710126794989524e-06, + "loss": 2.9909, + "step": 2972 + }, + { + "epoch": 0.8379959129025438, + "grad_norm": 1096.0145263671875, + "learning_rate": 6.687341818598031e-06, + "loss": 3.5316, + "step": 2973 + }, + { + "epoch": 0.8382777816926221, + "grad_norm": 1040.0126953125, + "learning_rate": 6.664592819789778e-06, + "loss": 3.0477, + "step": 2974 + }, + { + "epoch": 0.8385596504827003, + "grad_norm": 348.0142822265625, + "learning_rate": 6.641879817461166e-06, + "loss": 2.3895, + "step": 2975 + }, + { + "epoch": 0.8388415192727785, + "grad_norm": 796.0112915039062, + "learning_rate": 6.619202830478721e-06, + "loss": 2.9144, + "step": 2976 + }, + { + "epoch": 0.8391233880628567, + "grad_norm": 876.0084228515625, + "learning_rate": 6.596561877679036e-06, + "loss": 2.6189, + "step": 2977 + }, + { + "epoch": 0.839405256852935, + "grad_norm": 1256.0111083984375, + "learning_rate": 6.573956977868767e-06, + "loss": 2.9535, + "step": 2978 + }, + { + "epoch": 0.8396871256430132, + "grad_norm": 1576.013916015625, + "learning_rate": 6.551388149824656e-06, + "loss": 3.3447, + "step": 2979 + }, + { + "epoch": 0.8399689944330914, + "grad_norm": 988.0109252929688, + "learning_rate": 6.528855412293449e-06, + "loss": 2.3418, + "step": 2980 + }, + { + "epoch": 0.8402508632231697, + "grad_norm": 2064.017578125, + "learning_rate": 6.506358783991922e-06, + "loss": 2.8617, + "step": 2981 + }, + { + "epoch": 0.8405327320132479, + "grad_norm": 636.0432739257812, + "learning_rate": 6.483898283606871e-06, + "loss": 2.7237, + "step": 2982 + }, + { + "epoch": 0.840814600803326, + "grad_norm": 1608.0146484375, + "learning_rate": 6.4614739297950536e-06, + "loss": 3.1521, + "step": 2983 + }, + { + "epoch": 0.8410964695934042, + "grad_norm": 772.0220336914062, + "learning_rate": 6.4390857411832375e-06, + "loss": 3.1776, + "step": 2984 + }, + { + "epoch": 0.8413783383834825, + "grad_norm": 588.0372314453125, + "learning_rate": 6.416733736368124e-06, + "loss": 3.25, + "step": 2985 + }, + { + "epoch": 0.8416602071735607, + "grad_norm": 1440.017578125, + "learning_rate": 6.394417933916374e-06, + "loss": 2.9971, + "step": 2986 + }, + { + "epoch": 0.8419420759636389, + "grad_norm": 772.0330200195312, + "learning_rate": 6.372138352364548e-06, + "loss": 2.3548, + "step": 2987 + }, + { + "epoch": 0.8422239447537171, + "grad_norm": 500.0359191894531, + "learning_rate": 6.349895010219164e-06, + "loss": 3.3474, + "step": 2988 + }, + { + "epoch": 0.8425058135437954, + "grad_norm": 1416.0150146484375, + "learning_rate": 6.327687925956616e-06, + "loss": 2.9017, + "step": 2989 + }, + { + "epoch": 0.8427876823338736, + "grad_norm": 334.0237121582031, + "learning_rate": 6.3055171180231635e-06, + "loss": 2.3721, + "step": 2990 + }, + { + "epoch": 0.8430695511239518, + "grad_norm": 1520.0142822265625, + "learning_rate": 6.283382604834953e-06, + "loss": 2.2159, + "step": 2991 + }, + { + "epoch": 0.8433514199140301, + "grad_norm": 1400.0169677734375, + "learning_rate": 6.261284404777978e-06, + "loss": 3.2147, + "step": 2992 + }, + { + "epoch": 0.8436332887041083, + "grad_norm": 1208.0118408203125, + "learning_rate": 6.239222536208067e-06, + "loss": 2.7969, + "step": 2993 + }, + { + "epoch": 0.8439151574941864, + "grad_norm": 704.0252685546875, + "learning_rate": 6.2171970174508935e-06, + "loss": 2.3799, + "step": 2994 + }, + { + "epoch": 0.8441970262842646, + "grad_norm": 1400.0159912109375, + "learning_rate": 6.1952078668019e-06, + "loss": 3.1953, + "step": 2995 + }, + { + "epoch": 0.8444788950743429, + "grad_norm": 940.0147705078125, + "learning_rate": 6.173255102526338e-06, + "loss": 2.771, + "step": 2996 + }, + { + "epoch": 0.8447607638644211, + "grad_norm": 1336.017333984375, + "learning_rate": 6.151338742859219e-06, + "loss": 2.3434, + "step": 2997 + }, + { + "epoch": 0.8450426326544993, + "grad_norm": 788.0243530273438, + "learning_rate": 6.12945880600535e-06, + "loss": 3.0221, + "step": 2998 + }, + { + "epoch": 0.8453245014445776, + "grad_norm": 892.0155639648438, + "learning_rate": 6.107615310139259e-06, + "loss": 2.3399, + "step": 2999 + }, + { + "epoch": 0.8456063702346558, + "grad_norm": 1232.01123046875, + "learning_rate": 6.085808273405191e-06, + "loss": 2.4714, + "step": 3000 + }, + { + "epoch": 0.845888239024734, + "grad_norm": 1528.0086669921875, + "learning_rate": 6.0640377139171315e-06, + "loss": 3.1243, + "step": 3001 + }, + { + "epoch": 0.8461701078148122, + "grad_norm": 1600.0135498046875, + "learning_rate": 6.0423036497587515e-06, + "loss": 2.5284, + "step": 3002 + }, + { + "epoch": 0.8464519766048905, + "grad_norm": 772.0186767578125, + "learning_rate": 6.020606098983439e-06, + "loss": 2.9658, + "step": 3003 + }, + { + "epoch": 0.8467338453949687, + "grad_norm": 1440.009765625, + "learning_rate": 5.998945079614199e-06, + "loss": 2.6136, + "step": 3004 + }, + { + "epoch": 0.8470157141850468, + "grad_norm": 720.012451171875, + "learning_rate": 5.977320609643733e-06, + "loss": 2.668, + "step": 3005 + }, + { + "epoch": 0.847297582975125, + "grad_norm": 2096.0234375, + "learning_rate": 5.95573270703435e-06, + "loss": 2.4617, + "step": 3006 + }, + { + "epoch": 0.8475794517652033, + "grad_norm": 346.0212707519531, + "learning_rate": 5.93418138971803e-06, + "loss": 2.611, + "step": 3007 + }, + { + "epoch": 0.8478613205552815, + "grad_norm": 976.0142211914062, + "learning_rate": 5.9126666755963236e-06, + "loss": 3.6097, + "step": 3008 + }, + { + "epoch": 0.8481431893453597, + "grad_norm": 764.0107421875, + "learning_rate": 5.891188582540386e-06, + "loss": 2.603, + "step": 3009 + }, + { + "epoch": 0.848425058135438, + "grad_norm": 1752.0196533203125, + "learning_rate": 5.869747128390962e-06, + "loss": 2.8174, + "step": 3010 + }, + { + "epoch": 0.8487069269255162, + "grad_norm": 350.0157470703125, + "learning_rate": 5.8483423309583504e-06, + "loss": 2.5368, + "step": 3011 + }, + { + "epoch": 0.8489887957155944, + "grad_norm": 1120.009033203125, + "learning_rate": 5.8269742080224175e-06, + "loss": 2.6221, + "step": 3012 + }, + { + "epoch": 0.8492706645056726, + "grad_norm": 732.0112915039062, + "learning_rate": 5.805642777332559e-06, + "loss": 2.8037, + "step": 3013 + }, + { + "epoch": 0.8495525332957509, + "grad_norm": 1360.008544921875, + "learning_rate": 5.784348056607692e-06, + "loss": 3.2747, + "step": 3014 + }, + { + "epoch": 0.849834402085829, + "grad_norm": 1904.0384521484375, + "learning_rate": 5.76309006353623e-06, + "loss": 3.2334, + "step": 3015 + }, + { + "epoch": 0.8501162708759072, + "grad_norm": 1160.014404296875, + "learning_rate": 5.7418688157760805e-06, + "loss": 3.7145, + "step": 3016 + }, + { + "epoch": 0.8503981396659854, + "grad_norm": 1520.0218505859375, + "learning_rate": 5.720684330954651e-06, + "loss": 3.0241, + "step": 3017 + }, + { + "epoch": 0.8506800084560637, + "grad_norm": 1248.015625, + "learning_rate": 5.699536626668794e-06, + "loss": 3.5964, + "step": 3018 + }, + { + "epoch": 0.8509618772461419, + "grad_norm": 704.0128784179688, + "learning_rate": 5.678425720484814e-06, + "loss": 3.0732, + "step": 3019 + }, + { + "epoch": 0.8512437460362201, + "grad_norm": 1240.0147705078125, + "learning_rate": 5.657351629938429e-06, + "loss": 3.1091, + "step": 3020 + }, + { + "epoch": 0.8515256148262984, + "grad_norm": 532.0257568359375, + "learning_rate": 5.636314372534823e-06, + "loss": 3.3516, + "step": 3021 + }, + { + "epoch": 0.8518074836163766, + "grad_norm": 1344.0118408203125, + "learning_rate": 5.615313965748531e-06, + "loss": 3.1335, + "step": 3022 + }, + { + "epoch": 0.8520893524064548, + "grad_norm": 1608.011962890625, + "learning_rate": 5.5943504270235314e-06, + "loss": 2.3491, + "step": 3023 + }, + { + "epoch": 0.852371221196533, + "grad_norm": 274.02557373046875, + "learning_rate": 5.573423773773129e-06, + "loss": 2.5501, + "step": 3024 + }, + { + "epoch": 0.8526530899866113, + "grad_norm": 1544.0125732421875, + "learning_rate": 5.552534023380024e-06, + "loss": 2.5203, + "step": 3025 + }, + { + "epoch": 0.8529349587766895, + "grad_norm": 668.0193481445312, + "learning_rate": 5.531681193196231e-06, + "loss": 3.1582, + "step": 3026 + }, + { + "epoch": 0.8532168275667676, + "grad_norm": 444.28143310546875, + "learning_rate": 5.510865300543145e-06, + "loss": 2.8626, + "step": 3027 + }, + { + "epoch": 0.8534986963568459, + "grad_norm": 1344.009033203125, + "learning_rate": 5.490086362711433e-06, + "loss": 2.5225, + "step": 3028 + }, + { + "epoch": 0.8537805651469241, + "grad_norm": 940.0164184570312, + "learning_rate": 5.46934439696109e-06, + "loss": 3.1234, + "step": 3029 + }, + { + "epoch": 0.8540624339370023, + "grad_norm": 516.0139770507812, + "learning_rate": 5.4486394205213785e-06, + "loss": 2.8747, + "step": 3030 + }, + { + "epoch": 0.8543443027270805, + "grad_norm": 700.0249633789062, + "learning_rate": 5.427971450590869e-06, + "loss": 2.9281, + "step": 3031 + }, + { + "epoch": 0.8546261715171588, + "grad_norm": 796.0338134765625, + "learning_rate": 5.407340504337361e-06, + "loss": 2.7612, + "step": 3032 + }, + { + "epoch": 0.854908040307237, + "grad_norm": 1608.0194091796875, + "learning_rate": 5.386746598897929e-06, + "loss": 2.5549, + "step": 3033 + }, + { + "epoch": 0.8551899090973152, + "grad_norm": 964.0198364257812, + "learning_rate": 5.3661897513788585e-06, + "loss": 3.1559, + "step": 3034 + }, + { + "epoch": 0.8554717778873934, + "grad_norm": 720.0303955078125, + "learning_rate": 5.345669978855644e-06, + "loss": 2.655, + "step": 3035 + }, + { + "epoch": 0.8557536466774717, + "grad_norm": 1544.023193359375, + "learning_rate": 5.325187298373019e-06, + "loss": 3.1598, + "step": 3036 + }, + { + "epoch": 0.8560355154675499, + "grad_norm": 1256.02392578125, + "learning_rate": 5.304741726944873e-06, + "loss": 2.7412, + "step": 3037 + }, + { + "epoch": 0.856317384257628, + "grad_norm": 548.02587890625, + "learning_rate": 5.284333281554294e-06, + "loss": 3.1615, + "step": 3038 + }, + { + "epoch": 0.8565992530477063, + "grad_norm": 912.0239868164062, + "learning_rate": 5.263961979153509e-06, + "loss": 2.7707, + "step": 3039 + }, + { + "epoch": 0.8568811218377845, + "grad_norm": 652.0283203125, + "learning_rate": 5.243627836663906e-06, + "loss": 2.6807, + "step": 3040 + }, + { + "epoch": 0.8571629906278627, + "grad_norm": 1160.02392578125, + "learning_rate": 5.2233308709760174e-06, + "loss": 3.1654, + "step": 3041 + }, + { + "epoch": 0.8574448594179409, + "grad_norm": 1264.019287109375, + "learning_rate": 5.203071098949475e-06, + "loss": 2.5635, + "step": 3042 + }, + { + "epoch": 0.8577267282080192, + "grad_norm": 1672.014404296875, + "learning_rate": 5.18284853741301e-06, + "loss": 2.8547, + "step": 3043 + }, + { + "epoch": 0.8580085969980974, + "grad_norm": 1296.019775390625, + "learning_rate": 5.16266320316448e-06, + "loss": 2.6075, + "step": 3044 + }, + { + "epoch": 0.8582904657881756, + "grad_norm": 1984.0157470703125, + "learning_rate": 5.142515112970775e-06, + "loss": 3.0938, + "step": 3045 + }, + { + "epoch": 0.8585723345782538, + "grad_norm": 828.03076171875, + "learning_rate": 5.122404283567889e-06, + "loss": 3.5127, + "step": 3046 + }, + { + "epoch": 0.8588542033683321, + "grad_norm": 1368.0220947265625, + "learning_rate": 5.102330731660848e-06, + "loss": 3.0941, + "step": 3047 + }, + { + "epoch": 0.8591360721584103, + "grad_norm": 848.0164794921875, + "learning_rate": 5.082294473923699e-06, + "loss": 2.9329, + "step": 3048 + }, + { + "epoch": 0.8594179409484884, + "grad_norm": 1496.0150146484375, + "learning_rate": 5.062295526999522e-06, + "loss": 2.5425, + "step": 3049 + }, + { + "epoch": 0.8596998097385667, + "grad_norm": 1520.0130615234375, + "learning_rate": 5.042333907500418e-06, + "loss": 2.6953, + "step": 3050 + }, + { + "epoch": 0.8599816785286449, + "grad_norm": 1560.0155029296875, + "learning_rate": 5.022409632007474e-06, + "loss": 3.166, + "step": 3051 + }, + { + "epoch": 0.8602635473187231, + "grad_norm": 1400.0074462890625, + "learning_rate": 5.002522717070751e-06, + "loss": 2.4252, + "step": 3052 + }, + { + "epoch": 0.8605454161088013, + "grad_norm": 1064.0111083984375, + "learning_rate": 4.982673179209263e-06, + "loss": 2.9971, + "step": 3053 + }, + { + "epoch": 0.8608272848988796, + "grad_norm": 744.01513671875, + "learning_rate": 4.96286103491101e-06, + "loss": 2.9515, + "step": 3054 + }, + { + "epoch": 0.8611091536889578, + "grad_norm": 488.02545166015625, + "learning_rate": 4.943086300632921e-06, + "loss": 2.5772, + "step": 3055 + }, + { + "epoch": 0.861391022479036, + "grad_norm": 680.0216064453125, + "learning_rate": 4.9233489928008356e-06, + "loss": 2.8621, + "step": 3056 + }, + { + "epoch": 0.8616728912691143, + "grad_norm": 776.0157470703125, + "learning_rate": 4.903649127809512e-06, + "loss": 2.4512, + "step": 3057 + }, + { + "epoch": 0.8619547600591925, + "grad_norm": 1792.010986328125, + "learning_rate": 4.88398672202261e-06, + "loss": 3.1442, + "step": 3058 + }, + { + "epoch": 0.8622366288492707, + "grad_norm": 1272.0172119140625, + "learning_rate": 4.864361791772665e-06, + "loss": 2.9053, + "step": 3059 + }, + { + "epoch": 0.8625184976393488, + "grad_norm": 1272.0108642578125, + "learning_rate": 4.844774353361109e-06, + "loss": 3.0544, + "step": 3060 + }, + { + "epoch": 0.8628003664294271, + "grad_norm": 1552.01025390625, + "learning_rate": 4.8252244230582e-06, + "loss": 2.7373, + "step": 3061 + }, + { + "epoch": 0.8630822352195053, + "grad_norm": 264.07861328125, + "learning_rate": 4.805712017103059e-06, + "loss": 2.68, + "step": 3062 + }, + { + "epoch": 0.8633641040095835, + "grad_norm": 720.0108642578125, + "learning_rate": 4.786237151703616e-06, + "loss": 2.3975, + "step": 3063 + }, + { + "epoch": 0.8636459727996617, + "grad_norm": 1768.013916015625, + "learning_rate": 4.766799843036651e-06, + "loss": 3.071, + "step": 3064 + }, + { + "epoch": 0.86392784158974, + "grad_norm": 1072.0098876953125, + "learning_rate": 4.747400107247729e-06, + "loss": 2.7715, + "step": 3065 + }, + { + "epoch": 0.8642097103798182, + "grad_norm": 280.06036376953125, + "learning_rate": 4.728037960451203e-06, + "loss": 3.4447, + "step": 3066 + }, + { + "epoch": 0.8644915791698964, + "grad_norm": 856.0145874023438, + "learning_rate": 4.70871341873021e-06, + "loss": 2.6087, + "step": 3067 + }, + { + "epoch": 0.8647734479599747, + "grad_norm": 1020.0156860351562, + "learning_rate": 4.689426498136629e-06, + "loss": 2.4137, + "step": 3068 + }, + { + "epoch": 0.8650553167500529, + "grad_norm": 1536.0386962890625, + "learning_rate": 4.670177214691129e-06, + "loss": 2.4861, + "step": 3069 + }, + { + "epoch": 0.8653371855401311, + "grad_norm": 988.0120239257812, + "learning_rate": 4.650965584383082e-06, + "loss": 2.9899, + "step": 3070 + }, + { + "epoch": 0.8656190543302092, + "grad_norm": 1432.01806640625, + "learning_rate": 4.6317916231706e-06, + "loss": 2.8535, + "step": 3071 + }, + { + "epoch": 0.8659009231202875, + "grad_norm": 532.0174560546875, + "learning_rate": 4.6126553469804986e-06, + "loss": 2.9248, + "step": 3072 + }, + { + "epoch": 0.8661827919103657, + "grad_norm": 1560.014404296875, + "learning_rate": 4.593556771708279e-06, + "loss": 3.1507, + "step": 3073 + }, + { + "epoch": 0.8664646607004439, + "grad_norm": 1888.0177001953125, + "learning_rate": 4.5744959132181575e-06, + "loss": 3.2178, + "step": 3074 + }, + { + "epoch": 0.8667465294905221, + "grad_norm": 1352.006103515625, + "learning_rate": 4.555472787342996e-06, + "loss": 2.9402, + "step": 3075 + }, + { + "epoch": 0.8670283982806004, + "grad_norm": 1640.0133056640625, + "learning_rate": 4.536487409884327e-06, + "loss": 2.8276, + "step": 3076 + }, + { + "epoch": 0.8673102670706786, + "grad_norm": 2832.0244140625, + "learning_rate": 4.517539796612308e-06, + "loss": 3.0964, + "step": 3077 + }, + { + "epoch": 0.8675921358607568, + "grad_norm": 1312.008056640625, + "learning_rate": 4.498629963265744e-06, + "loss": 2.8727, + "step": 3078 + }, + { + "epoch": 0.8678740046508351, + "grad_norm": 860.01953125, + "learning_rate": 4.479757925552058e-06, + "loss": 2.998, + "step": 3079 + }, + { + "epoch": 0.8681558734409133, + "grad_norm": 632.022216796875, + "learning_rate": 4.460923699147279e-06, + "loss": 2.792, + "step": 3080 + }, + { + "epoch": 0.8684377422309915, + "grad_norm": 536.0162963867188, + "learning_rate": 4.442127299696025e-06, + "loss": 2.9131, + "step": 3081 + }, + { + "epoch": 0.8687196110210696, + "grad_norm": 243.04345703125, + "learning_rate": 4.423368742811468e-06, + "loss": 3.5094, + "step": 3082 + }, + { + "epoch": 0.869001479811148, + "grad_norm": 1584.0133056640625, + "learning_rate": 4.404648044075393e-06, + "loss": 2.9623, + "step": 3083 + }, + { + "epoch": 0.8692833486012261, + "grad_norm": 348.07318115234375, + "learning_rate": 4.385965219038124e-06, + "loss": 2.9342, + "step": 3084 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 436.0369873046875, + "learning_rate": 4.367320283218495e-06, + "loss": 2.6084, + "step": 3085 + }, + { + "epoch": 0.8698470861813825, + "grad_norm": 584.03662109375, + "learning_rate": 4.348713252103903e-06, + "loss": 2.8779, + "step": 3086 + }, + { + "epoch": 0.8701289549714608, + "grad_norm": 812.0233764648438, + "learning_rate": 4.330144141150244e-06, + "loss": 2.4722, + "step": 3087 + }, + { + "epoch": 0.870410823761539, + "grad_norm": 1240.025146484375, + "learning_rate": 4.311612965781903e-06, + "loss": 3.1999, + "step": 3088 + }, + { + "epoch": 0.8706926925516172, + "grad_norm": 1640.0380859375, + "learning_rate": 4.293119741391782e-06, + "loss": 2.9967, + "step": 3089 + }, + { + "epoch": 0.8709745613416955, + "grad_norm": 924.0256958007812, + "learning_rate": 4.274664483341251e-06, + "loss": 2.6404, + "step": 3090 + }, + { + "epoch": 0.8712564301317737, + "grad_norm": 812.0342407226562, + "learning_rate": 4.256247206960123e-06, + "loss": 2.7223, + "step": 3091 + }, + { + "epoch": 0.8715382989218519, + "grad_norm": 1808.025146484375, + "learning_rate": 4.237867927546674e-06, + "loss": 2.7858, + "step": 3092 + }, + { + "epoch": 0.87182016771193, + "grad_norm": 1064.016845703125, + "learning_rate": 4.21952666036764e-06, + "loss": 3.2429, + "step": 3093 + }, + { + "epoch": 0.8721020365020083, + "grad_norm": 1104.0198974609375, + "learning_rate": 4.201223420658135e-06, + "loss": 2.7497, + "step": 3094 + }, + { + "epoch": 0.8723839052920865, + "grad_norm": 450.0430908203125, + "learning_rate": 4.182958223621741e-06, + "loss": 3.1888, + "step": 3095 + }, + { + "epoch": 0.8726657740821647, + "grad_norm": 928.0247802734375, + "learning_rate": 4.164731084430401e-06, + "loss": 2.9086, + "step": 3096 + }, + { + "epoch": 0.872947642872243, + "grad_norm": 928.0385131835938, + "learning_rate": 4.146542018224447e-06, + "loss": 2.1397, + "step": 3097 + }, + { + "epoch": 0.8732295116623212, + "grad_norm": 768.023193359375, + "learning_rate": 4.128391040112606e-06, + "loss": 3.001, + "step": 3098 + }, + { + "epoch": 0.8735113804523994, + "grad_norm": 808.0187377929688, + "learning_rate": 4.110278165171955e-06, + "loss": 2.6443, + "step": 3099 + }, + { + "epoch": 0.8737932492424776, + "grad_norm": 1360.00927734375, + "learning_rate": 4.0922034084479145e-06, + "loss": 2.3591, + "step": 3100 + }, + { + "epoch": 0.8740751180325559, + "grad_norm": 1728.01318359375, + "learning_rate": 4.0741667849542475e-06, + "loss": 2.9889, + "step": 3101 + }, + { + "epoch": 0.8743569868226341, + "grad_norm": 1504.008544921875, + "learning_rate": 4.056168309673059e-06, + "loss": 2.5085, + "step": 3102 + }, + { + "epoch": 0.8746388556127123, + "grad_norm": 732.0138549804688, + "learning_rate": 4.038207997554738e-06, + "loss": 2.7341, + "step": 3103 + }, + { + "epoch": 0.8749207244027905, + "grad_norm": 1848.00927734375, + "learning_rate": 4.020285863517986e-06, + "loss": 2.2842, + "step": 3104 + }, + { + "epoch": 0.8752025931928687, + "grad_norm": 492.0206298828125, + "learning_rate": 4.0024019224498e-06, + "loss": 2.9141, + "step": 3105 + }, + { + "epoch": 0.8754844619829469, + "grad_norm": 1192.009521484375, + "learning_rate": 3.98455618920544e-06, + "loss": 2.7608, + "step": 3106 + }, + { + "epoch": 0.8757663307730251, + "grad_norm": 1216.0074462890625, + "learning_rate": 3.966748678608423e-06, + "loss": 2.7533, + "step": 3107 + }, + { + "epoch": 0.8760481995631034, + "grad_norm": 840.0130004882812, + "learning_rate": 3.948979405450548e-06, + "loss": 3.1859, + "step": 3108 + }, + { + "epoch": 0.8763300683531816, + "grad_norm": 1400.013671875, + "learning_rate": 3.931248384491814e-06, + "loss": 2.723, + "step": 3109 + }, + { + "epoch": 0.8766119371432598, + "grad_norm": 996.0077514648438, + "learning_rate": 3.9135556304604694e-06, + "loss": 2.4699, + "step": 3110 + }, + { + "epoch": 0.876893805933338, + "grad_norm": 828.0100708007812, + "learning_rate": 3.895901158052956e-06, + "loss": 2.449, + "step": 3111 + }, + { + "epoch": 0.8771756747234163, + "grad_norm": 2128.0146484375, + "learning_rate": 3.87828498193395e-06, + "loss": 3.0551, + "step": 3112 + }, + { + "epoch": 0.8774575435134945, + "grad_norm": 2160.006591796875, + "learning_rate": 3.860707116736284e-06, + "loss": 3.2699, + "step": 3113 + }, + { + "epoch": 0.8777394123035727, + "grad_norm": 1240.0126953125, + "learning_rate": 3.84316757706098e-06, + "loss": 2.8135, + "step": 3114 + }, + { + "epoch": 0.8780212810936509, + "grad_norm": 600.0182495117188, + "learning_rate": 3.825666377477239e-06, + "loss": 2.9753, + "step": 3115 + }, + { + "epoch": 0.8783031498837292, + "grad_norm": 760.01416015625, + "learning_rate": 3.808203532522381e-06, + "loss": 2.9844, + "step": 3116 + }, + { + "epoch": 0.8785850186738073, + "grad_norm": 956.0123291015625, + "learning_rate": 3.7907790567019096e-06, + "loss": 2.902, + "step": 3117 + }, + { + "epoch": 0.8788668874638855, + "grad_norm": 1352.01318359375, + "learning_rate": 3.773392964489425e-06, + "loss": 2.5671, + "step": 3118 + }, + { + "epoch": 0.8791487562539638, + "grad_norm": 1416.0364990234375, + "learning_rate": 3.7560452703266526e-06, + "loss": 2.9964, + "step": 3119 + }, + { + "epoch": 0.879430625044042, + "grad_norm": 1512.0047607421875, + "learning_rate": 3.7387359886234264e-06, + "loss": 2.4287, + "step": 3120 + }, + { + "epoch": 0.8797124938341202, + "grad_norm": 912.0108032226562, + "learning_rate": 3.7214651337576624e-06, + "loss": 2.7891, + "step": 3121 + }, + { + "epoch": 0.8799943626241984, + "grad_norm": 928.0166625976562, + "learning_rate": 3.704232720075379e-06, + "loss": 3.4655, + "step": 3122 + }, + { + "epoch": 0.8802762314142767, + "grad_norm": 856.0113525390625, + "learning_rate": 3.687038761890643e-06, + "loss": 2.6631, + "step": 3123 + }, + { + "epoch": 0.8805581002043549, + "grad_norm": 672.0343627929688, + "learning_rate": 3.6698832734855748e-06, + "loss": 3.8803, + "step": 3124 + }, + { + "epoch": 0.8808399689944331, + "grad_norm": 1216.011962890625, + "learning_rate": 3.6527662691103747e-06, + "loss": 2.5296, + "step": 3125 + }, + { + "epoch": 0.8811218377845114, + "grad_norm": 824.0220947265625, + "learning_rate": 3.635687762983231e-06, + "loss": 3.1432, + "step": 3126 + }, + { + "epoch": 0.8814037065745896, + "grad_norm": 1296.0167236328125, + "learning_rate": 3.6186477692903954e-06, + "loss": 3.1849, + "step": 3127 + }, + { + "epoch": 0.8816855753646677, + "grad_norm": 1376.009033203125, + "learning_rate": 3.60164630218609e-06, + "loss": 2.6217, + "step": 3128 + }, + { + "epoch": 0.8819674441547459, + "grad_norm": 1520.0150146484375, + "learning_rate": 3.5846833757925633e-06, + "loss": 2.5622, + "step": 3129 + }, + { + "epoch": 0.8822493129448242, + "grad_norm": 1136.0142822265625, + "learning_rate": 3.5677590042000276e-06, + "loss": 2.8952, + "step": 3130 + }, + { + "epoch": 0.8825311817349024, + "grad_norm": 1536.0155029296875, + "learning_rate": 3.5508732014667036e-06, + "loss": 2.9229, + "step": 3131 + }, + { + "epoch": 0.8828130505249806, + "grad_norm": 988.0132446289062, + "learning_rate": 3.534025981618738e-06, + "loss": 3.1023, + "step": 3132 + }, + { + "epoch": 0.8830949193150588, + "grad_norm": 1120.0277099609375, + "learning_rate": 3.5172173586502545e-06, + "loss": 3.1305, + "step": 3133 + }, + { + "epoch": 0.8833767881051371, + "grad_norm": 996.0294189453125, + "learning_rate": 3.5004473465232834e-06, + "loss": 2.765, + "step": 3134 + }, + { + "epoch": 0.8836586568952153, + "grad_norm": 708.0260009765625, + "learning_rate": 3.483715959167838e-06, + "loss": 2.5102, + "step": 3135 + }, + { + "epoch": 0.8839405256852935, + "grad_norm": 548.0419921875, + "learning_rate": 3.46702321048179e-06, + "loss": 3.3289, + "step": 3136 + }, + { + "epoch": 0.8842223944753718, + "grad_norm": 504.040283203125, + "learning_rate": 3.4503691143309536e-06, + "loss": 2.2738, + "step": 3137 + }, + { + "epoch": 0.88450426326545, + "grad_norm": 210.09120178222656, + "learning_rate": 3.433753684549029e-06, + "loss": 2.6139, + "step": 3138 + }, + { + "epoch": 0.8847861320555281, + "grad_norm": 2176.025146484375, + "learning_rate": 3.417176934937588e-06, + "loss": 3.0745, + "step": 3139 + }, + { + "epoch": 0.8850680008456063, + "grad_norm": 1240.0223388671875, + "learning_rate": 3.400638879266066e-06, + "loss": 2.6759, + "step": 3140 + }, + { + "epoch": 0.8853498696356846, + "grad_norm": 1200.0169677734375, + "learning_rate": 3.3841395312717907e-06, + "loss": 2.6973, + "step": 3141 + }, + { + "epoch": 0.8856317384257628, + "grad_norm": 964.0159301757812, + "learning_rate": 3.3676789046599044e-06, + "loss": 2.2975, + "step": 3142 + }, + { + "epoch": 0.885913607215841, + "grad_norm": 940.0499267578125, + "learning_rate": 3.351257013103404e-06, + "loss": 2.7699, + "step": 3143 + }, + { + "epoch": 0.8861954760059192, + "grad_norm": 1296.0220947265625, + "learning_rate": 3.3348738702430937e-06, + "loss": 3.2005, + "step": 3144 + }, + { + "epoch": 0.8864773447959975, + "grad_norm": 1040.0257568359375, + "learning_rate": 3.318529489687605e-06, + "loss": 2.8018, + "step": 3145 + }, + { + "epoch": 0.8867592135860757, + "grad_norm": 2192.01025390625, + "learning_rate": 3.302223885013389e-06, + "loss": 2.4678, + "step": 3146 + }, + { + "epoch": 0.8870410823761539, + "grad_norm": 1160.0208740234375, + "learning_rate": 3.2859570697646446e-06, + "loss": 2.6842, + "step": 3147 + }, + { + "epoch": 0.8873229511662322, + "grad_norm": 692.02392578125, + "learning_rate": 3.2697290574533855e-06, + "loss": 2.7262, + "step": 3148 + }, + { + "epoch": 0.8876048199563104, + "grad_norm": 1104.0174560546875, + "learning_rate": 3.253539861559368e-06, + "loss": 2.4325, + "step": 3149 + }, + { + "epoch": 0.8878866887463885, + "grad_norm": 1080.0206298828125, + "learning_rate": 3.2373894955301353e-06, + "loss": 2.64, + "step": 3150 + }, + { + "epoch": 0.8881685575364667, + "grad_norm": 1640.006591796875, + "learning_rate": 3.2212779727809505e-06, + "loss": 2.793, + "step": 3151 + }, + { + "epoch": 0.888450426326545, + "grad_norm": 1480.0108642578125, + "learning_rate": 3.2052053066948307e-06, + "loss": 2.6712, + "step": 3152 + }, + { + "epoch": 0.8887322951166232, + "grad_norm": 644.010498046875, + "learning_rate": 3.189171510622496e-06, + "loss": 2.7898, + "step": 3153 + }, + { + "epoch": 0.8890141639067014, + "grad_norm": 1176.010986328125, + "learning_rate": 3.1731765978823878e-06, + "loss": 3.0391, + "step": 3154 + }, + { + "epoch": 0.8892960326967797, + "grad_norm": 1704.0128173828125, + "learning_rate": 3.157220581760667e-06, + "loss": 3.2367, + "step": 3155 + }, + { + "epoch": 0.8895779014868579, + "grad_norm": 1080.0128173828125, + "learning_rate": 3.141303475511165e-06, + "loss": 3.1045, + "step": 3156 + }, + { + "epoch": 0.8898597702769361, + "grad_norm": 632.0204467773438, + "learning_rate": 3.1254252923553994e-06, + "loss": 2.95, + "step": 3157 + }, + { + "epoch": 0.8901416390670143, + "grad_norm": 312.0291442871094, + "learning_rate": 3.1095860454825487e-06, + "loss": 3.1289, + "step": 3158 + }, + { + "epoch": 0.8904235078570926, + "grad_norm": 1504.016845703125, + "learning_rate": 3.0937857480494548e-06, + "loss": 2.5924, + "step": 3159 + }, + { + "epoch": 0.8907053766471708, + "grad_norm": 960.0122680664062, + "learning_rate": 3.078024413180619e-06, + "loss": 3.0387, + "step": 3160 + }, + { + "epoch": 0.8909872454372489, + "grad_norm": 462.0267028808594, + "learning_rate": 3.062302053968158e-06, + "loss": 2.6107, + "step": 3161 + }, + { + "epoch": 0.8912691142273271, + "grad_norm": 1040.014404296875, + "learning_rate": 3.0466186834718248e-06, + "loss": 2.804, + "step": 3162 + }, + { + "epoch": 0.8915509830174054, + "grad_norm": 612.0145874023438, + "learning_rate": 3.0309743147189705e-06, + "loss": 2.9229, + "step": 3163 + }, + { + "epoch": 0.8918328518074836, + "grad_norm": 1272.0115966796875, + "learning_rate": 3.0153689607045845e-06, + "loss": 2.9157, + "step": 3164 + }, + { + "epoch": 0.8921147205975618, + "grad_norm": 1004.0145263671875, + "learning_rate": 2.9998026343912023e-06, + "loss": 2.6667, + "step": 3165 + }, + { + "epoch": 0.8923965893876401, + "grad_norm": 1192.018310546875, + "learning_rate": 2.9842753487089923e-06, + "loss": 2.998, + "step": 3166 + }, + { + "epoch": 0.8926784581777183, + "grad_norm": 1336.00927734375, + "learning_rate": 2.968787116555649e-06, + "loss": 2.6644, + "step": 3167 + }, + { + "epoch": 0.8929603269677965, + "grad_norm": 1344.008544921875, + "learning_rate": 2.9533379507964477e-06, + "loss": 2.7129, + "step": 3168 + }, + { + "epoch": 0.8932421957578747, + "grad_norm": 1600.0079345703125, + "learning_rate": 2.9379278642642062e-06, + "loss": 2.3979, + "step": 3169 + }, + { + "epoch": 0.893524064547953, + "grad_norm": 2288.021484375, + "learning_rate": 2.922556869759302e-06, + "loss": 3.3337, + "step": 3170 + }, + { + "epoch": 0.8938059333380312, + "grad_norm": 732.0186767578125, + "learning_rate": 2.907224980049611e-06, + "loss": 2.615, + "step": 3171 + }, + { + "epoch": 0.8940878021281093, + "grad_norm": 784.05810546875, + "learning_rate": 2.8919322078705456e-06, + "loss": 3.1306, + "step": 3172 + }, + { + "epoch": 0.8943696709181875, + "grad_norm": 1024.0196533203125, + "learning_rate": 2.8766785659250116e-06, + "loss": 3.1405, + "step": 3173 + }, + { + "epoch": 0.8946515397082658, + "grad_norm": 628.01806640625, + "learning_rate": 2.8614640668834345e-06, + "loss": 2.8919, + "step": 3174 + }, + { + "epoch": 0.894933408498344, + "grad_norm": 1120.0255126953125, + "learning_rate": 2.8462887233836945e-06, + "loss": 2.559, + "step": 3175 + }, + { + "epoch": 0.8952152772884222, + "grad_norm": 748.0209350585938, + "learning_rate": 2.831152548031174e-06, + "loss": 3.4688, + "step": 3176 + }, + { + "epoch": 0.8954971460785005, + "grad_norm": 1256.0064697265625, + "learning_rate": 2.8160555533987167e-06, + "loss": 2.7767, + "step": 3177 + }, + { + "epoch": 0.8957790148685787, + "grad_norm": 1272.0106201171875, + "learning_rate": 2.8009977520265963e-06, + "loss": 2.9049, + "step": 3178 + }, + { + "epoch": 0.8960608836586569, + "grad_norm": 1296.01025390625, + "learning_rate": 2.785979156422569e-06, + "loss": 2.7077, + "step": 3179 + }, + { + "epoch": 0.8963427524487351, + "grad_norm": 1336.0206298828125, + "learning_rate": 2.770999779061795e-06, + "loss": 2.8301, + "step": 3180 + }, + { + "epoch": 0.8966246212388134, + "grad_norm": 1656.016845703125, + "learning_rate": 2.7560596323868648e-06, + "loss": 2.5674, + "step": 3181 + }, + { + "epoch": 0.8969064900288916, + "grad_norm": 1176.179443359375, + "learning_rate": 2.741158728807791e-06, + "loss": 3.8962, + "step": 3182 + }, + { + "epoch": 0.8971883588189697, + "grad_norm": 1512.01806640625, + "learning_rate": 2.7262970807019727e-06, + "loss": 2.9814, + "step": 3183 + }, + { + "epoch": 0.897470227609048, + "grad_norm": 1152.0208740234375, + "learning_rate": 2.7114747004142236e-06, + "loss": 2.9004, + "step": 3184 + }, + { + "epoch": 0.8977520963991262, + "grad_norm": 808.0266723632812, + "learning_rate": 2.6966916002567234e-06, + "loss": 2.4873, + "step": 3185 + }, + { + "epoch": 0.8980339651892044, + "grad_norm": 1240.0350341796875, + "learning_rate": 2.6819477925090264e-06, + "loss": 2.8581, + "step": 3186 + }, + { + "epoch": 0.8983158339792826, + "grad_norm": 1336.0186767578125, + "learning_rate": 2.667243289418059e-06, + "loss": 2.7169, + "step": 3187 + }, + { + "epoch": 0.8985977027693609, + "grad_norm": 1696.019287109375, + "learning_rate": 2.6525781031980724e-06, + "loss": 3.6618, + "step": 3188 + }, + { + "epoch": 0.8988795715594391, + "grad_norm": 700.022216796875, + "learning_rate": 2.6379522460307005e-06, + "loss": 2.8115, + "step": 3189 + }, + { + "epoch": 0.8991614403495173, + "grad_norm": 1280.0230712890625, + "learning_rate": 2.6233657300648693e-06, + "loss": 3.0771, + "step": 3190 + }, + { + "epoch": 0.8994433091395955, + "grad_norm": 1848.027099609375, + "learning_rate": 2.6088185674168487e-06, + "loss": 2.9025, + "step": 3191 + }, + { + "epoch": 0.8997251779296738, + "grad_norm": 864.0197143554688, + "learning_rate": 2.5943107701702106e-06, + "loss": 2.5094, + "step": 3192 + }, + { + "epoch": 0.900007046719752, + "grad_norm": 612.0205688476562, + "learning_rate": 2.5798423503758386e-06, + "loss": 2.7516, + "step": 3193 + }, + { + "epoch": 0.9002889155098301, + "grad_norm": 1464.0185546875, + "learning_rate": 2.565413320051896e-06, + "loss": 2.534, + "step": 3194 + }, + { + "epoch": 0.9005707842999084, + "grad_norm": 1600.0201416015625, + "learning_rate": 2.5510236911838293e-06, + "loss": 2.9525, + "step": 3195 + }, + { + "epoch": 0.9008526530899866, + "grad_norm": 1432.0220947265625, + "learning_rate": 2.5366734757243494e-06, + "loss": 2.738, + "step": 3196 + }, + { + "epoch": 0.9011345218800648, + "grad_norm": 2064.022216796875, + "learning_rate": 2.522362685593449e-06, + "loss": 3.057, + "step": 3197 + }, + { + "epoch": 0.901416390670143, + "grad_norm": 1288.01806640625, + "learning_rate": 2.5080913326783693e-06, + "loss": 2.6768, + "step": 3198 + }, + { + "epoch": 0.9016982594602213, + "grad_norm": 860.0183715820312, + "learning_rate": 2.4938594288335725e-06, + "loss": 3.0967, + "step": 3199 + }, + { + "epoch": 0.9019801282502995, + "grad_norm": 556.01953125, + "learning_rate": 2.4796669858807687e-06, + "loss": 2.5485, + "step": 3200 + }, + { + "epoch": 0.9022619970403777, + "grad_norm": 816.0107421875, + "learning_rate": 2.465514015608883e-06, + "loss": 2.8711, + "step": 3201 + }, + { + "epoch": 0.9025438658304559, + "grad_norm": 984.0072021484375, + "learning_rate": 2.451400529774045e-06, + "loss": 2.5944, + "step": 3202 + }, + { + "epoch": 0.9028257346205342, + "grad_norm": 1472.0137939453125, + "learning_rate": 2.4373265400996213e-06, + "loss": 2.7279, + "step": 3203 + }, + { + "epoch": 0.9031076034106124, + "grad_norm": 1592.009033203125, + "learning_rate": 2.4232920582761276e-06, + "loss": 2.6449, + "step": 3204 + }, + { + "epoch": 0.9033894722006905, + "grad_norm": 1408.018798828125, + "learning_rate": 2.409297095961288e-06, + "loss": 3.1901, + "step": 3205 + }, + { + "epoch": 0.9036713409907688, + "grad_norm": 1416.01220703125, + "learning_rate": 2.395341664779982e-06, + "loss": 2.4334, + "step": 3206 + }, + { + "epoch": 0.903953209780847, + "grad_norm": 1004.0116577148438, + "learning_rate": 2.3814257763242807e-06, + "loss": 2.8577, + "step": 3207 + }, + { + "epoch": 0.9042350785709252, + "grad_norm": 1584.0103759765625, + "learning_rate": 2.3675494421533885e-06, + "loss": 2.5138, + "step": 3208 + }, + { + "epoch": 0.9045169473610034, + "grad_norm": 1392.01123046875, + "learning_rate": 2.3537126737936565e-06, + "loss": 2.9668, + "step": 3209 + }, + { + "epoch": 0.9047988161510817, + "grad_norm": 1512.016845703125, + "learning_rate": 2.3399154827385694e-06, + "loss": 2.9802, + "step": 3210 + }, + { + "epoch": 0.9050806849411599, + "grad_norm": 752.0113525390625, + "learning_rate": 2.3261578804487317e-06, + "loss": 2.833, + "step": 3211 + }, + { + "epoch": 0.9053625537312381, + "grad_norm": 1088.0111083984375, + "learning_rate": 2.3124398783518906e-06, + "loss": 2.6361, + "step": 3212 + }, + { + "epoch": 0.9056444225213163, + "grad_norm": 1056.0118408203125, + "learning_rate": 2.298761487842871e-06, + "loss": 2.7816, + "step": 3213 + }, + { + "epoch": 0.9059262913113946, + "grad_norm": 1176.0120849609375, + "learning_rate": 2.2851227202836e-06, + "loss": 3.4141, + "step": 3214 + }, + { + "epoch": 0.9062081601014728, + "grad_norm": 1208.0133056640625, + "learning_rate": 2.2715235870031047e-06, + "loss": 2.6634, + "step": 3215 + }, + { + "epoch": 0.906490028891551, + "grad_norm": 1400.017578125, + "learning_rate": 2.257964099297466e-06, + "loss": 3.3102, + "step": 3216 + }, + { + "epoch": 0.9067718976816292, + "grad_norm": 956.0216064453125, + "learning_rate": 2.244444268429857e-06, + "loss": 3.3704, + "step": 3217 + }, + { + "epoch": 0.9070537664717074, + "grad_norm": 956.0125732421875, + "learning_rate": 2.2309641056305064e-06, + "loss": 2.5723, + "step": 3218 + }, + { + "epoch": 0.9073356352617856, + "grad_norm": 374.0296325683594, + "learning_rate": 2.21752362209669e-06, + "loss": 2.6709, + "step": 3219 + }, + { + "epoch": 0.9076175040518638, + "grad_norm": 592.0213012695312, + "learning_rate": 2.2041228289927108e-06, + "loss": 3.1943, + "step": 3220 + }, + { + "epoch": 0.9078993728419421, + "grad_norm": 410.01995849609375, + "learning_rate": 2.1907617374499145e-06, + "loss": 2.6905, + "step": 3221 + }, + { + "epoch": 0.9081812416320203, + "grad_norm": 1000.01171875, + "learning_rate": 2.177440358566679e-06, + "loss": 2.4268, + "step": 3222 + }, + { + "epoch": 0.9084631104220985, + "grad_norm": 1072.010986328125, + "learning_rate": 2.1641587034083755e-06, + "loss": 2.8867, + "step": 3223 + }, + { + "epoch": 0.9087449792121768, + "grad_norm": 792.0155639648438, + "learning_rate": 2.150916783007395e-06, + "loss": 3.309, + "step": 3224 + }, + { + "epoch": 0.909026848002255, + "grad_norm": 1560.0115966796875, + "learning_rate": 2.1377146083631005e-06, + "loss": 2.8415, + "step": 3225 + }, + { + "epoch": 0.9093087167923332, + "grad_norm": 1168.01513671875, + "learning_rate": 2.124552190441864e-06, + "loss": 2.8523, + "step": 3226 + }, + { + "epoch": 0.9095905855824113, + "grad_norm": 808.0191650390625, + "learning_rate": 2.111429540177029e-06, + "loss": 2.8216, + "step": 3227 + }, + { + "epoch": 0.9098724543724896, + "grad_norm": 1680.0096435546875, + "learning_rate": 2.0983466684689037e-06, + "loss": 2.6361, + "step": 3228 + }, + { + "epoch": 0.9101543231625678, + "grad_norm": 1312.0155029296875, + "learning_rate": 2.085303586184745e-06, + "loss": 2.7614, + "step": 3229 + }, + { + "epoch": 0.910436191952646, + "grad_norm": 456.0726623535156, + "learning_rate": 2.0723003041587694e-06, + "loss": 3.2226, + "step": 3230 + }, + { + "epoch": 0.9107180607427242, + "grad_norm": 600.0193481445312, + "learning_rate": 2.0593368331921215e-06, + "loss": 3.1006, + "step": 3231 + }, + { + "epoch": 0.9109999295328025, + "grad_norm": 1480.0225830078125, + "learning_rate": 2.0464131840528976e-06, + "loss": 2.9525, + "step": 3232 + }, + { + "epoch": 0.9112817983228807, + "grad_norm": 388.04217529296875, + "learning_rate": 2.0335293674761003e-06, + "loss": 2.4605, + "step": 3233 + }, + { + "epoch": 0.9115636671129589, + "grad_norm": 1616.0238037109375, + "learning_rate": 2.0206853941636516e-06, + "loss": 2.328, + "step": 3234 + }, + { + "epoch": 0.9118455359030372, + "grad_norm": 1200.028076171875, + "learning_rate": 2.007881274784362e-06, + "loss": 3.088, + "step": 3235 + }, + { + "epoch": 0.9121274046931154, + "grad_norm": 1352.023681640625, + "learning_rate": 1.995117019973963e-06, + "loss": 2.6654, + "step": 3236 + }, + { + "epoch": 0.9124092734831936, + "grad_norm": 288.0921936035156, + "learning_rate": 1.9823926403350624e-06, + "loss": 2.9563, + "step": 3237 + }, + { + "epoch": 0.9126911422732717, + "grad_norm": 920.0302124023438, + "learning_rate": 1.9697081464371436e-06, + "loss": 2.7861, + "step": 3238 + }, + { + "epoch": 0.91297301106335, + "grad_norm": 1464.0191650390625, + "learning_rate": 1.9570635488165514e-06, + "loss": 2.6862, + "step": 3239 + }, + { + "epoch": 0.9132548798534282, + "grad_norm": 968.0335083007812, + "learning_rate": 1.9444588579765e-06, + "loss": 3.4541, + "step": 3240 + }, + { + "epoch": 0.9135367486435064, + "grad_norm": 1072.022216796875, + "learning_rate": 1.931894084387059e-06, + "loss": 2.6438, + "step": 3241 + }, + { + "epoch": 0.9138186174335846, + "grad_norm": 908.0152587890625, + "learning_rate": 1.9193692384851303e-06, + "loss": 2.1958, + "step": 3242 + }, + { + "epoch": 0.9141004862236629, + "grad_norm": 2192.025146484375, + "learning_rate": 1.9068843306744633e-06, + "loss": 2.9056, + "step": 3243 + }, + { + "epoch": 0.9143823550137411, + "grad_norm": 608.0221557617188, + "learning_rate": 1.894439371325607e-06, + "loss": 2.3384, + "step": 3244 + }, + { + "epoch": 0.9146642238038193, + "grad_norm": 1552.0457763671875, + "learning_rate": 1.8820343707759647e-06, + "loss": 2.5296, + "step": 3245 + }, + { + "epoch": 0.9149460925938976, + "grad_norm": 908.017578125, + "learning_rate": 1.8696693393297215e-06, + "loss": 2.6273, + "step": 3246 + }, + { + "epoch": 0.9152279613839758, + "grad_norm": 1056.0260009765625, + "learning_rate": 1.8573442872578616e-06, + "loss": 3.2152, + "step": 3247 + }, + { + "epoch": 0.915509830174054, + "grad_norm": 1248.0147705078125, + "learning_rate": 1.845059224798179e-06, + "loss": 3.2601, + "step": 3248 + }, + { + "epoch": 0.9157916989641322, + "grad_norm": 1004.0300903320312, + "learning_rate": 1.832814162155233e-06, + "loss": 3.0156, + "step": 3249 + }, + { + "epoch": 0.9160735677542104, + "grad_norm": 192.0559539794922, + "learning_rate": 1.8206091095003541e-06, + "loss": 2.5421, + "step": 3250 + }, + { + "epoch": 0.9163554365442886, + "grad_norm": 438.0137023925781, + "learning_rate": 1.8084440769716714e-06, + "loss": 2.7129, + "step": 3251 + }, + { + "epoch": 0.9166373053343668, + "grad_norm": 1288.008056640625, + "learning_rate": 1.7963190746740299e-06, + "loss": 2.6902, + "step": 3252 + }, + { + "epoch": 0.9169191741244451, + "grad_norm": 1784.013916015625, + "learning_rate": 1.7842341126790507e-06, + "loss": 3.3118, + "step": 3253 + }, + { + "epoch": 0.9172010429145233, + "grad_norm": 1176.014892578125, + "learning_rate": 1.7721892010250707e-06, + "loss": 3.0, + "step": 3254 + }, + { + "epoch": 0.9174829117046015, + "grad_norm": 928.0514526367188, + "learning_rate": 1.760184349717192e-06, + "loss": 3.0393, + "step": 3255 + }, + { + "epoch": 0.9177647804946797, + "grad_norm": 1136.0123291015625, + "learning_rate": 1.7482195687272162e-06, + "loss": 2.6745, + "step": 3256 + }, + { + "epoch": 0.918046649284758, + "grad_norm": 860.0115356445312, + "learning_rate": 1.7362948679936598e-06, + "loss": 2.5986, + "step": 3257 + }, + { + "epoch": 0.9183285180748362, + "grad_norm": 1368.01708984375, + "learning_rate": 1.7244102574217714e-06, + "loss": 2.5693, + "step": 3258 + }, + { + "epoch": 0.9186103868649144, + "grad_norm": 824.014892578125, + "learning_rate": 1.7125657468834655e-06, + "loss": 2.8883, + "step": 3259 + }, + { + "epoch": 0.9188922556549926, + "grad_norm": 852.0115356445312, + "learning_rate": 1.7007613462173777e-06, + "loss": 2.4775, + "step": 3260 + }, + { + "epoch": 0.9191741244450709, + "grad_norm": 800.0108032226562, + "learning_rate": 1.688997065228809e-06, + "loss": 3.126, + "step": 3261 + }, + { + "epoch": 0.919455993235149, + "grad_norm": 1592.01318359375, + "learning_rate": 1.6772729136897314e-06, + "loss": 3.5882, + "step": 3262 + }, + { + "epoch": 0.9197378620252272, + "grad_norm": 1232.00927734375, + "learning_rate": 1.6655889013387993e-06, + "loss": 3.0241, + "step": 3263 + }, + { + "epoch": 0.9200197308153055, + "grad_norm": 1664.0135498046875, + "learning_rate": 1.653945037881305e-06, + "loss": 2.9766, + "step": 3264 + }, + { + "epoch": 0.9203015996053837, + "grad_norm": 1216.0128173828125, + "learning_rate": 1.6423413329892167e-06, + "loss": 2.3311, + "step": 3265 + }, + { + "epoch": 0.9205834683954619, + "grad_norm": 984.0160522460938, + "learning_rate": 1.6307777963011251e-06, + "loss": 3.3984, + "step": 3266 + }, + { + "epoch": 0.9208653371855401, + "grad_norm": 860.0115356445312, + "learning_rate": 1.6192544374222517e-06, + "loss": 2.4137, + "step": 3267 + }, + { + "epoch": 0.9211472059756184, + "grad_norm": 1120.0191650390625, + "learning_rate": 1.6077712659244792e-06, + "loss": 2.6445, + "step": 3268 + }, + { + "epoch": 0.9214290747656966, + "grad_norm": 366.0500183105469, + "learning_rate": 1.5963282913462608e-06, + "loss": 2.7826, + "step": 3269 + }, + { + "epoch": 0.9217109435557748, + "grad_norm": 1136.009521484375, + "learning_rate": 1.584925523192693e-06, + "loss": 2.932, + "step": 3270 + }, + { + "epoch": 0.921992812345853, + "grad_norm": 800.0121459960938, + "learning_rate": 1.573562970935466e-06, + "loss": 2.4473, + "step": 3271 + }, + { + "epoch": 0.9222746811359313, + "grad_norm": 636.0105590820312, + "learning_rate": 1.5622406440128634e-06, + "loss": 2.6707, + "step": 3272 + }, + { + "epoch": 0.9225565499260094, + "grad_norm": 1104.01123046875, + "learning_rate": 1.5509585518297509e-06, + "loss": 2.9044, + "step": 3273 + }, + { + "epoch": 0.9228384187160876, + "grad_norm": 1504.0140380859375, + "learning_rate": 1.5397167037575822e-06, + "loss": 2.8271, + "step": 3274 + }, + { + "epoch": 0.9231202875061659, + "grad_norm": 1200.0135498046875, + "learning_rate": 1.528515109134382e-06, + "loss": 2.7748, + "step": 3275 + }, + { + "epoch": 0.9234021562962441, + "grad_norm": 1600.01171875, + "learning_rate": 1.5173537772647296e-06, + "loss": 2.5437, + "step": 3276 + }, + { + "epoch": 0.9236840250863223, + "grad_norm": 1208.0093994140625, + "learning_rate": 1.5062327174197644e-06, + "loss": 3.292, + "step": 3277 + }, + { + "epoch": 0.9239658938764005, + "grad_norm": 1440.01171875, + "learning_rate": 1.4951519388371805e-06, + "loss": 2.7699, + "step": 3278 + }, + { + "epoch": 0.9242477626664788, + "grad_norm": 1056.017333984375, + "learning_rate": 1.484111450721204e-06, + "loss": 2.8809, + "step": 3279 + }, + { + "epoch": 0.924529631456557, + "grad_norm": 1176.0111083984375, + "learning_rate": 1.473111262242599e-06, + "loss": 2.7598, + "step": 3280 + }, + { + "epoch": 0.9248115002466352, + "grad_norm": 644.0272216796875, + "learning_rate": 1.4621513825386568e-06, + "loss": 2.4513, + "step": 3281 + }, + { + "epoch": 0.9250933690367135, + "grad_norm": 768.0237426757812, + "learning_rate": 1.4512318207131725e-06, + "loss": 2.7952, + "step": 3282 + }, + { + "epoch": 0.9253752378267917, + "grad_norm": 676.0146484375, + "learning_rate": 1.4403525858364576e-06, + "loss": 2.015, + "step": 3283 + }, + { + "epoch": 0.9256571066168698, + "grad_norm": 1320.01708984375, + "learning_rate": 1.4295136869453497e-06, + "loss": 1.996, + "step": 3284 + }, + { + "epoch": 0.925938975406948, + "grad_norm": 1144.034423828125, + "learning_rate": 1.4187151330431413e-06, + "loss": 2.5892, + "step": 3285 + }, + { + "epoch": 0.9262208441970263, + "grad_norm": 416.05303955078125, + "learning_rate": 1.4079569330996412e-06, + "loss": 2.3966, + "step": 3286 + }, + { + "epoch": 0.9265027129871045, + "grad_norm": 386.0554504394531, + "learning_rate": 1.3972390960511228e-06, + "loss": 3.1908, + "step": 3287 + }, + { + "epoch": 0.9267845817771827, + "grad_norm": 552.031982421875, + "learning_rate": 1.3865616308003427e-06, + "loss": 2.8945, + "step": 3288 + }, + { + "epoch": 0.9270664505672609, + "grad_norm": 636.0394287109375, + "learning_rate": 1.3759245462165282e-06, + "loss": 3.2212, + "step": 3289 + }, + { + "epoch": 0.9273483193573392, + "grad_norm": 612.0352783203125, + "learning_rate": 1.3653278511353396e-06, + "loss": 2.8236, + "step": 3290 + }, + { + "epoch": 0.9276301881474174, + "grad_norm": 820.04248046875, + "learning_rate": 1.3547715543589134e-06, + "loss": 3.0652, + "step": 3291 + }, + { + "epoch": 0.9279120569374956, + "grad_norm": 2040.0157470703125, + "learning_rate": 1.344255664655808e-06, + "loss": 3.2253, + "step": 3292 + }, + { + "epoch": 0.9281939257275739, + "grad_norm": 1368.0164794921875, + "learning_rate": 1.3337801907610415e-06, + "loss": 2.6485, + "step": 3293 + }, + { + "epoch": 0.928475794517652, + "grad_norm": 836.024658203125, + "learning_rate": 1.3233451413760422e-06, + "loss": 2.6355, + "step": 3294 + }, + { + "epoch": 0.9287576633077302, + "grad_norm": 896.0211791992188, + "learning_rate": 1.3129505251686603e-06, + "loss": 2.5818, + "step": 3295 + }, + { + "epoch": 0.9290395320978084, + "grad_norm": 1544.0140380859375, + "learning_rate": 1.3025963507731775e-06, + "loss": 2.8268, + "step": 3296 + }, + { + "epoch": 0.9293214008878867, + "grad_norm": 1080.0184326171875, + "learning_rate": 1.2922826267902533e-06, + "loss": 2.752, + "step": 3297 + }, + { + "epoch": 0.9296032696779649, + "grad_norm": 952.0188598632812, + "learning_rate": 1.2820093617869734e-06, + "loss": 2.3451, + "step": 3298 + }, + { + "epoch": 0.9298851384680431, + "grad_norm": 1360.020751953125, + "learning_rate": 1.2717765642968115e-06, + "loss": 3.1725, + "step": 3299 + }, + { + "epoch": 0.9301670072581213, + "grad_norm": 1328.0179443359375, + "learning_rate": 1.2615842428196246e-06, + "loss": 2.7934, + "step": 3300 + }, + { + "epoch": 0.9304488760481996, + "grad_norm": 1072.021240234375, + "learning_rate": 1.2514324058216399e-06, + "loss": 3.4046, + "step": 3301 + }, + { + "epoch": 0.9307307448382778, + "grad_norm": 1088.0128173828125, + "learning_rate": 1.2413210617354564e-06, + "loss": 2.9242, + "step": 3302 + }, + { + "epoch": 0.931012613628356, + "grad_norm": 1168.0184326171875, + "learning_rate": 1.2312502189600617e-06, + "loss": 2.8916, + "step": 3303 + }, + { + "epoch": 0.9312944824184343, + "grad_norm": 1440.014892578125, + "learning_rate": 1.2212198858607694e-06, + "loss": 3.3516, + "step": 3304 + }, + { + "epoch": 0.9315763512085125, + "grad_norm": 840.0111083984375, + "learning_rate": 1.2112300707692647e-06, + "loss": 2.9838, + "step": 3305 + }, + { + "epoch": 0.9318582199985906, + "grad_norm": 1192.0078125, + "learning_rate": 1.2012807819835659e-06, + "loss": 2.6217, + "step": 3306 + }, + { + "epoch": 0.9321400887886688, + "grad_norm": 944.0284423828125, + "learning_rate": 1.191372027768034e-06, + "loss": 2.5622, + "step": 3307 + }, + { + "epoch": 0.9324219575787471, + "grad_norm": 1080.0140380859375, + "learning_rate": 1.1815038163533577e-06, + "loss": 2.7614, + "step": 3308 + }, + { + "epoch": 0.9327038263688253, + "grad_norm": 972.0099487304688, + "learning_rate": 1.171676155936552e-06, + "loss": 2.9443, + "step": 3309 + }, + { + "epoch": 0.9329856951589035, + "grad_norm": 984.0109252929688, + "learning_rate": 1.1618890546809424e-06, + "loss": 3.0719, + "step": 3310 + }, + { + "epoch": 0.9332675639489818, + "grad_norm": 1152.0096435546875, + "learning_rate": 1.1521425207161763e-06, + "loss": 2.6641, + "step": 3311 + }, + { + "epoch": 0.93354943273906, + "grad_norm": 656.040771484375, + "learning_rate": 1.1424365621381773e-06, + "loss": 2.6504, + "step": 3312 + }, + { + "epoch": 0.9338313015291382, + "grad_norm": 868.0209350585938, + "learning_rate": 1.1327711870091962e-06, + "loss": 2.5396, + "step": 3313 + }, + { + "epoch": 0.9341131703192164, + "grad_norm": 1120.0162353515625, + "learning_rate": 1.123146403357761e-06, + "loss": 3.1401, + "step": 3314 + }, + { + "epoch": 0.9343950391092947, + "grad_norm": 1344.011474609375, + "learning_rate": 1.1135622191786766e-06, + "loss": 2.5158, + "step": 3315 + }, + { + "epoch": 0.9346769078993729, + "grad_norm": 628.0128784179688, + "learning_rate": 1.104018642433019e-06, + "loss": 2.7539, + "step": 3316 + }, + { + "epoch": 0.934958776689451, + "grad_norm": 956.01416015625, + "learning_rate": 1.0945156810481639e-06, + "loss": 2.7363, + "step": 3317 + }, + { + "epoch": 0.9352406454795292, + "grad_norm": 1768.0125732421875, + "learning_rate": 1.0850533429177133e-06, + "loss": 2.5752, + "step": 3318 + }, + { + "epoch": 0.9355225142696075, + "grad_norm": 772.013427734375, + "learning_rate": 1.0756316359015529e-06, + "loss": 3.3611, + "step": 3319 + }, + { + "epoch": 0.9358043830596857, + "grad_norm": 1264.012939453125, + "learning_rate": 1.0662505678257995e-06, + "loss": 2.6862, + "step": 3320 + }, + { + "epoch": 0.9360862518497639, + "grad_norm": 716.0166015625, + "learning_rate": 1.0569101464828202e-06, + "loss": 2.4365, + "step": 3321 + }, + { + "epoch": 0.9363681206398422, + "grad_norm": 1584.0101318359375, + "learning_rate": 1.0476103796312253e-06, + "loss": 2.9941, + "step": 3322 + }, + { + "epoch": 0.9366499894299204, + "grad_norm": 330.0744934082031, + "learning_rate": 1.0383512749958413e-06, + "loss": 2.6165, + "step": 3323 + }, + { + "epoch": 0.9369318582199986, + "grad_norm": 676.0198974609375, + "learning_rate": 1.029132840267738e-06, + "loss": 3.0404, + "step": 3324 + }, + { + "epoch": 0.9372137270100768, + "grad_norm": 344.03387451171875, + "learning_rate": 1.0199550831041903e-06, + "loss": 2.6852, + "step": 3325 + }, + { + "epoch": 0.9374955958001551, + "grad_norm": 1144.012451171875, + "learning_rate": 1.0108180111286725e-06, + "loss": 3.0072, + "step": 3326 + }, + { + "epoch": 0.9377774645902333, + "grad_norm": 988.0136108398438, + "learning_rate": 1.0017216319308908e-06, + "loss": 2.9779, + "step": 3327 + }, + { + "epoch": 0.9380593333803114, + "grad_norm": 1088.013671875, + "learning_rate": 9.926659530667294e-07, + "loss": 2.7266, + "step": 3328 + }, + { + "epoch": 0.9383412021703896, + "grad_norm": 684.0222778320312, + "learning_rate": 9.836509820582819e-07, + "loss": 3.3024, + "step": 3329 + }, + { + "epoch": 0.9386230709604679, + "grad_norm": 1784.029052734375, + "learning_rate": 9.746767263938083e-07, + "loss": 2.9578, + "step": 3330 + }, + { + "epoch": 0.9389049397505461, + "grad_norm": 560.0173950195312, + "learning_rate": 9.657431935277627e-07, + "loss": 3.0996, + "step": 3331 + }, + { + "epoch": 0.9391868085406243, + "grad_norm": 2080.021728515625, + "learning_rate": 9.568503908807647e-07, + "loss": 2.7168, + "step": 3332 + }, + { + "epoch": 0.9394686773307026, + "grad_norm": 1296.0283203125, + "learning_rate": 9.479983258396107e-07, + "loss": 2.6398, + "step": 3333 + }, + { + "epoch": 0.9397505461207808, + "grad_norm": 446.0425109863281, + "learning_rate": 9.391870057572527e-07, + "loss": 2.64, + "step": 3334 + }, + { + "epoch": 0.940032414910859, + "grad_norm": 820.0249633789062, + "learning_rate": 9.304164379527858e-07, + "loss": 2.127, + "step": 3335 + }, + { + "epoch": 0.9403142837009372, + "grad_norm": 788.0250854492188, + "learning_rate": 9.216866297114824e-07, + "loss": 2.6384, + "step": 3336 + }, + { + "epoch": 0.9405961524910155, + "grad_norm": 1152.0491943359375, + "learning_rate": 9.129975882847364e-07, + "loss": 2.9994, + "step": 3337 + }, + { + "epoch": 0.9408780212810937, + "grad_norm": 884.0449829101562, + "learning_rate": 9.043493208900856e-07, + "loss": 2.5205, + "step": 3338 + }, + { + "epoch": 0.9411598900711718, + "grad_norm": 624.04052734375, + "learning_rate": 8.957418347111946e-07, + "loss": 2.9701, + "step": 3339 + }, + { + "epoch": 0.94144175886125, + "grad_norm": 1072.0323486328125, + "learning_rate": 8.871751368978553e-07, + "loss": 3.2888, + "step": 3340 + }, + { + "epoch": 0.9417236276513283, + "grad_norm": 812.0240478515625, + "learning_rate": 8.786492345659924e-07, + "loss": 3.2219, + "step": 3341 + }, + { + "epoch": 0.9420054964414065, + "grad_norm": 1672.03564453125, + "learning_rate": 8.701641347976186e-07, + "loss": 2.835, + "step": 3342 + }, + { + "epoch": 0.9422873652314847, + "grad_norm": 1032.0157470703125, + "learning_rate": 8.617198446408736e-07, + "loss": 2.7888, + "step": 3343 + }, + { + "epoch": 0.942569234021563, + "grad_norm": 1248.02392578125, + "learning_rate": 8.533163711099912e-07, + "loss": 2.8854, + "step": 3344 + }, + { + "epoch": 0.9428511028116412, + "grad_norm": 540.0230102539062, + "learning_rate": 8.449537211852987e-07, + "loss": 2.7839, + "step": 3345 + }, + { + "epoch": 0.9431329716017194, + "grad_norm": 506.01788330078125, + "learning_rate": 8.36631901813223e-07, + "loss": 2.8151, + "step": 3346 + }, + { + "epoch": 0.9434148403917976, + "grad_norm": 1752.013671875, + "learning_rate": 8.283509199062567e-07, + "loss": 2.7345, + "step": 3347 + }, + { + "epoch": 0.9436967091818759, + "grad_norm": 1688.0150146484375, + "learning_rate": 8.201107823429977e-07, + "loss": 3.2122, + "step": 3348 + }, + { + "epoch": 0.9439785779719541, + "grad_norm": 796.0221557617188, + "learning_rate": 8.119114959680929e-07, + "loss": 3.055, + "step": 3349 + }, + { + "epoch": 0.9442604467620322, + "grad_norm": 1672.014404296875, + "learning_rate": 8.037530675922611e-07, + "loss": 2.8678, + "step": 3350 + }, + { + "epoch": 0.9445423155521105, + "grad_norm": 1376.0128173828125, + "learning_rate": 7.95635503992298e-07, + "loss": 3.3145, + "step": 3351 + }, + { + "epoch": 0.9448241843421887, + "grad_norm": 660.0154418945312, + "learning_rate": 7.875588119110378e-07, + "loss": 2.6211, + "step": 3352 + }, + { + "epoch": 0.9451060531322669, + "grad_norm": 1024.0093994140625, + "learning_rate": 7.795229980573692e-07, + "loss": 3.084, + "step": 3353 + }, + { + "epoch": 0.9453879219223451, + "grad_norm": 1896.0101318359375, + "learning_rate": 7.715280691062255e-07, + "loss": 3.1361, + "step": 3354 + }, + { + "epoch": 0.9456697907124234, + "grad_norm": 1012.0113525390625, + "learning_rate": 7.635740316985884e-07, + "loss": 2.8617, + "step": 3355 + }, + { + "epoch": 0.9459516595025016, + "grad_norm": 756.0113525390625, + "learning_rate": 7.556608924414566e-07, + "loss": 2.9818, + "step": 3356 + }, + { + "epoch": 0.9462335282925798, + "grad_norm": 948.0112915039062, + "learning_rate": 7.477886579078719e-07, + "loss": 2.7026, + "step": 3357 + }, + { + "epoch": 0.946515397082658, + "grad_norm": 900.0184326171875, + "learning_rate": 7.399573346368871e-07, + "loss": 2.6742, + "step": 3358 + }, + { + "epoch": 0.9467972658727363, + "grad_norm": 656.0155639648438, + "learning_rate": 7.321669291335709e-07, + "loss": 2.6612, + "step": 3359 + }, + { + "epoch": 0.9470791346628145, + "grad_norm": 764.0093383789062, + "learning_rate": 7.244174478690247e-07, + "loss": 2.4359, + "step": 3360 + }, + { + "epoch": 0.9473610034528926, + "grad_norm": 992.0133666992188, + "learning_rate": 7.167088972803327e-07, + "loss": 3.04, + "step": 3361 + }, + { + "epoch": 0.947642872242971, + "grad_norm": 1208.0078125, + "learning_rate": 7.090412837705895e-07, + "loss": 2.3933, + "step": 3362 + }, + { + "epoch": 0.9479247410330491, + "grad_norm": 260.04193115234375, + "learning_rate": 7.014146137088895e-07, + "loss": 2.6992, + "step": 3363 + }, + { + "epoch": 0.9482066098231273, + "grad_norm": 944.0186157226562, + "learning_rate": 6.938288934303039e-07, + "loss": 3.0846, + "step": 3364 + }, + { + "epoch": 0.9484884786132055, + "grad_norm": 1020.0128173828125, + "learning_rate": 6.862841292359035e-07, + "loss": 2.614, + "step": 3365 + }, + { + "epoch": 0.9487703474032838, + "grad_norm": 264.0467834472656, + "learning_rate": 6.787803273927362e-07, + "loss": 2.851, + "step": 3366 + }, + { + "epoch": 0.949052216193362, + "grad_norm": 1528.0107421875, + "learning_rate": 6.713174941338162e-07, + "loss": 2.6618, + "step": 3367 + }, + { + "epoch": 0.9493340849834402, + "grad_norm": 780.0111083984375, + "learning_rate": 6.638956356581349e-07, + "loss": 2.614, + "step": 3368 + }, + { + "epoch": 0.9496159537735184, + "grad_norm": 1336.008056640625, + "learning_rate": 6.565147581306441e-07, + "loss": 2.7234, + "step": 3369 + }, + { + "epoch": 0.9498978225635967, + "grad_norm": 1552.0093994140625, + "learning_rate": 6.491748676822618e-07, + "loss": 2.6224, + "step": 3370 + }, + { + "epoch": 0.9501796913536749, + "grad_norm": 912.029541015625, + "learning_rate": 6.418759704098498e-07, + "loss": 2.5072, + "step": 3371 + }, + { + "epoch": 0.950461560143753, + "grad_norm": 1184.009521484375, + "learning_rate": 6.346180723762307e-07, + "loss": 2.4146, + "step": 3372 + }, + { + "epoch": 0.9507434289338313, + "grad_norm": 664.022216796875, + "learning_rate": 6.274011796101597e-07, + "loss": 2.7181, + "step": 3373 + }, + { + "epoch": 0.9510252977239095, + "grad_norm": 976.0091552734375, + "learning_rate": 6.202252981063306e-07, + "loss": 3.2158, + "step": 3374 + }, + { + "epoch": 0.9513071665139877, + "grad_norm": 840.0238647460938, + "learning_rate": 6.130904338253862e-07, + "loss": 3.1875, + "step": 3375 + }, + { + "epoch": 0.9515890353040659, + "grad_norm": 764.0136108398438, + "learning_rate": 6.05996592693886e-07, + "loss": 2.8294, + "step": 3376 + }, + { + "epoch": 0.9518709040941442, + "grad_norm": 1048.010986328125, + "learning_rate": 5.989437806043219e-07, + "loss": 2.4225, + "step": 3377 + }, + { + "epoch": 0.9521527728842224, + "grad_norm": 1440.0159912109375, + "learning_rate": 5.919320034150855e-07, + "loss": 3.5293, + "step": 3378 + }, + { + "epoch": 0.9524346416743006, + "grad_norm": 1664.0079345703125, + "learning_rate": 5.849612669505067e-07, + "loss": 2.8597, + "step": 3379 + }, + { + "epoch": 0.9527165104643789, + "grad_norm": 1456.0184326171875, + "learning_rate": 5.780315770008148e-07, + "loss": 2.7799, + "step": 3380 + }, + { + "epoch": 0.9529983792544571, + "grad_norm": 1104.0115966796875, + "learning_rate": 5.711429393221502e-07, + "loss": 2.917, + "step": 3381 + }, + { + "epoch": 0.9532802480445353, + "grad_norm": 1136.031494140625, + "learning_rate": 5.642953596365408e-07, + "loss": 2.4136, + "step": 3382 + }, + { + "epoch": 0.9535621168346134, + "grad_norm": 1864.020263671875, + "learning_rate": 5.574888436319093e-07, + "loss": 2.5277, + "step": 3383 + }, + { + "epoch": 0.9538439856246917, + "grad_norm": 712.0355834960938, + "learning_rate": 5.507233969620939e-07, + "loss": 2.4984, + "step": 3384 + }, + { + "epoch": 0.9541258544147699, + "grad_norm": 708.0245361328125, + "learning_rate": 5.439990252467886e-07, + "loss": 2.8515, + "step": 3385 + }, + { + "epoch": 0.9544077232048481, + "grad_norm": 1296.0318603515625, + "learning_rate": 5.373157340715862e-07, + "loss": 2.8394, + "step": 3386 + }, + { + "epoch": 0.9546895919949263, + "grad_norm": 832.0259399414062, + "learning_rate": 5.30673528987946e-07, + "loss": 3.2762, + "step": 3387 + }, + { + "epoch": 0.9549714607850046, + "grad_norm": 592.0363159179688, + "learning_rate": 5.240724155132049e-07, + "loss": 2.578, + "step": 3388 + }, + { + "epoch": 0.9552533295750828, + "grad_norm": 1128.0341796875, + "learning_rate": 5.175123991305653e-07, + "loss": 2.6353, + "step": 3389 + }, + { + "epoch": 0.955535198365161, + "grad_norm": 816.0277709960938, + "learning_rate": 5.109934852891018e-07, + "loss": 2.7559, + "step": 3390 + }, + { + "epoch": 0.9558170671552393, + "grad_norm": 1520.035400390625, + "learning_rate": 5.045156794037331e-07, + "loss": 2.9089, + "step": 3391 + }, + { + "epoch": 0.9560989359453175, + "grad_norm": 1064.0369873046875, + "learning_rate": 4.980789868552327e-07, + "loss": 3.2507, + "step": 3392 + }, + { + "epoch": 0.9563808047353957, + "grad_norm": 1248.0159912109375, + "learning_rate": 4.916834129902348e-07, + "loss": 3.265, + "step": 3393 + }, + { + "epoch": 0.9566626735254739, + "grad_norm": 243.06398010253906, + "learning_rate": 4.853289631212066e-07, + "loss": 2.6501, + "step": 3394 + }, + { + "epoch": 0.9569445423155521, + "grad_norm": 1096.021240234375, + "learning_rate": 4.790156425264647e-07, + "loss": 3.0928, + "step": 3395 + }, + { + "epoch": 0.9572264111056303, + "grad_norm": 288.03668212890625, + "learning_rate": 4.727434564501587e-07, + "loss": 2.63, + "step": 3396 + }, + { + "epoch": 0.9575082798957085, + "grad_norm": 1272.0267333984375, + "learning_rate": 4.6651241010226e-07, + "loss": 2.9639, + "step": 3397 + }, + { + "epoch": 0.9577901486857867, + "grad_norm": 1256.0174560546875, + "learning_rate": 4.603225086585838e-07, + "loss": 2.5147, + "step": 3398 + }, + { + "epoch": 0.958072017475865, + "grad_norm": 932.021728515625, + "learning_rate": 4.541737572607618e-07, + "loss": 2.5355, + "step": 3399 + }, + { + "epoch": 0.9583538862659432, + "grad_norm": 712.0348510742188, + "learning_rate": 4.4806616101624176e-07, + "loss": 2.6955, + "step": 3400 + }, + { + "epoch": 0.9586357550560214, + "grad_norm": 1880.0181884765625, + "learning_rate": 4.4199972499828213e-07, + "loss": 2.9202, + "step": 3401 + }, + { + "epoch": 0.9589176238460997, + "grad_norm": 1216.0162353515625, + "learning_rate": 4.359744542459632e-07, + "loss": 2.5264, + "step": 3402 + }, + { + "epoch": 0.9591994926361779, + "grad_norm": 676.0286254882812, + "learning_rate": 4.299903537641703e-07, + "loss": 3.1501, + "step": 3403 + }, + { + "epoch": 0.9594813614262561, + "grad_norm": 1208.0145263671875, + "learning_rate": 4.2404742852357735e-07, + "loss": 2.7017, + "step": 3404 + }, + { + "epoch": 0.9597632302163343, + "grad_norm": 588.0198364257812, + "learning_rate": 4.1814568346066896e-07, + "loss": 2.9919, + "step": 3405 + }, + { + "epoch": 0.9600450990064126, + "grad_norm": 1136.0164794921875, + "learning_rate": 4.122851234777181e-07, + "loss": 2.5088, + "step": 3406 + }, + { + "epoch": 0.9603269677964907, + "grad_norm": 588.0154418945312, + "learning_rate": 4.064657534427807e-07, + "loss": 3.0649, + "step": 3407 + }, + { + "epoch": 0.9606088365865689, + "grad_norm": 936.0125122070312, + "learning_rate": 4.0068757818972344e-07, + "loss": 3.0114, + "step": 3408 + }, + { + "epoch": 0.9608907053766472, + "grad_norm": 828.009521484375, + "learning_rate": 3.9495060251816265e-07, + "loss": 2.7306, + "step": 3409 + }, + { + "epoch": 0.9611725741667254, + "grad_norm": 692.0127563476562, + "learning_rate": 3.89254831193514e-07, + "loss": 2.5775, + "step": 3410 + }, + { + "epoch": 0.9614544429568036, + "grad_norm": 640.0155639648438, + "learning_rate": 3.836002689469598e-07, + "loss": 2.4731, + "step": 3411 + }, + { + "epoch": 0.9617363117468818, + "grad_norm": 1568.0164794921875, + "learning_rate": 3.779869204754427e-07, + "loss": 2.7764, + "step": 3412 + }, + { + "epoch": 0.9620181805369601, + "grad_norm": 1012.0254516601562, + "learning_rate": 3.7241479044169967e-07, + "loss": 3.4183, + "step": 3413 + }, + { + "epoch": 0.9623000493270383, + "grad_norm": 2512.013427734375, + "learning_rate": 3.66883883474195e-07, + "loss": 3.1501, + "step": 3414 + }, + { + "epoch": 0.9625819181171165, + "grad_norm": 1312.0130615234375, + "learning_rate": 3.613942041671703e-07, + "loss": 2.7087, + "step": 3415 + }, + { + "epoch": 0.9628637869071947, + "grad_norm": 1648.021728515625, + "learning_rate": 3.5594575708062236e-07, + "loss": 3.1523, + "step": 3416 + }, + { + "epoch": 0.963145655697273, + "grad_norm": 696.0241088867188, + "learning_rate": 3.5053854674029217e-07, + "loss": 2.9717, + "step": 3417 + }, + { + "epoch": 0.9634275244873511, + "grad_norm": 980.0131225585938, + "learning_rate": 3.4517257763766466e-07, + "loss": 2.7461, + "step": 3418 + }, + { + "epoch": 0.9637093932774293, + "grad_norm": 1480.0196533203125, + "learning_rate": 3.398478542299799e-07, + "loss": 2.9935, + "step": 3419 + }, + { + "epoch": 0.9639912620675076, + "grad_norm": 1264.0128173828125, + "learning_rate": 3.345643809401999e-07, + "loss": 2.9557, + "step": 3420 + }, + { + "epoch": 0.9642731308575858, + "grad_norm": 215.03524780273438, + "learning_rate": 3.293221621570419e-07, + "loss": 2.6999, + "step": 3421 + }, + { + "epoch": 0.964554999647664, + "grad_norm": 282.0231018066406, + "learning_rate": 3.241212022349449e-07, + "loss": 2.681, + "step": 3422 + }, + { + "epoch": 0.9648368684377422, + "grad_norm": 1144.014892578125, + "learning_rate": 3.189615054940753e-07, + "loss": 2.6953, + "step": 3423 + }, + { + "epoch": 0.9651187372278205, + "grad_norm": 1776.013916015625, + "learning_rate": 3.1384307622032147e-07, + "loss": 2.5604, + "step": 3424 + }, + { + "epoch": 0.9654006060178987, + "grad_norm": 2368.020263671875, + "learning_rate": 3.0876591866531023e-07, + "loss": 2.9661, + "step": 3425 + }, + { + "epoch": 0.9656824748079769, + "grad_norm": 1352.0169677734375, + "learning_rate": 3.0373003704635696e-07, + "loss": 2.7908, + "step": 3426 + }, + { + "epoch": 0.965964343598055, + "grad_norm": 736.0149536132812, + "learning_rate": 2.9873543554652106e-07, + "loss": 3.1914, + "step": 3427 + }, + { + "epoch": 0.9662462123881334, + "grad_norm": 1232.011474609375, + "learning_rate": 2.937821183145506e-07, + "loss": 2.6807, + "step": 3428 + }, + { + "epoch": 0.9665280811782115, + "grad_norm": 1248.0181884765625, + "learning_rate": 2.88870089464921e-07, + "loss": 2.6938, + "step": 3429 + }, + { + "epoch": 0.9668099499682897, + "grad_norm": 1248.011962890625, + "learning_rate": 2.8399935307778514e-07, + "loss": 2.931, + "step": 3430 + }, + { + "epoch": 0.967091818758368, + "grad_norm": 820.0166625976562, + "learning_rate": 2.791699131990233e-07, + "loss": 2.8975, + "step": 3431 + }, + { + "epoch": 0.9673736875484462, + "grad_norm": 1464.0147705078125, + "learning_rate": 2.743817738401988e-07, + "loss": 3.2236, + "step": 3432 + }, + { + "epoch": 0.9676555563385244, + "grad_norm": 2064.0107421875, + "learning_rate": 2.6963493897856904e-07, + "loss": 3.2725, + "step": 3433 + }, + { + "epoch": 0.9679374251286026, + "grad_norm": 1152.11572265625, + "learning_rate": 2.649294125570856e-07, + "loss": 2.8387, + "step": 3434 + }, + { + "epoch": 0.9682192939186809, + "grad_norm": 1056.02197265625, + "learning_rate": 2.602651984843829e-07, + "loss": 2.8926, + "step": 3435 + }, + { + "epoch": 0.9685011627087591, + "grad_norm": 1216.0167236328125, + "learning_rate": 2.5564230063478413e-07, + "loss": 2.3809, + "step": 3436 + }, + { + "epoch": 0.9687830314988373, + "grad_norm": 608.03271484375, + "learning_rate": 2.510607228482953e-07, + "loss": 2.043, + "step": 3437 + }, + { + "epoch": 0.9690649002889156, + "grad_norm": 532.0436401367188, + "learning_rate": 2.465204689305889e-07, + "loss": 3.2631, + "step": 3438 + }, + { + "epoch": 0.9693467690789938, + "grad_norm": 1856.0262451171875, + "learning_rate": 2.420215426530259e-07, + "loss": 2.7771, + "step": 3439 + }, + { + "epoch": 0.9696286378690719, + "grad_norm": 1176.0263671875, + "learning_rate": 2.375639477526226e-07, + "loss": 2.2559, + "step": 3440 + }, + { + "epoch": 0.9699105066591501, + "grad_norm": 1056.027099609375, + "learning_rate": 2.331476879320782e-07, + "loss": 3.2539, + "step": 3441 + }, + { + "epoch": 0.9701923754492284, + "grad_norm": 1248.0286865234375, + "learning_rate": 2.287727668597528e-07, + "loss": 2.8418, + "step": 3442 + }, + { + "epoch": 0.9704742442393066, + "grad_norm": 796.0182495117188, + "learning_rate": 2.244391881696617e-07, + "loss": 2.1772, + "step": 3443 + }, + { + "epoch": 0.9707561130293848, + "grad_norm": 1024.0137939453125, + "learning_rate": 2.2014695546148657e-07, + "loss": 2.5613, + "step": 3444 + }, + { + "epoch": 0.971037981819463, + "grad_norm": 2192.02685546875, + "learning_rate": 2.158960723005643e-07, + "loss": 3.0164, + "step": 3445 + }, + { + "epoch": 0.9713198506095413, + "grad_norm": 564.03857421875, + "learning_rate": 2.116865422178871e-07, + "loss": 2.5641, + "step": 3446 + }, + { + "epoch": 0.9716017193996195, + "grad_norm": 992.0149536132812, + "learning_rate": 2.0751836871008567e-07, + "loss": 2.4103, + "step": 3447 + }, + { + "epoch": 0.9718835881896977, + "grad_norm": 860.0227661132812, + "learning_rate": 2.0339155523945163e-07, + "loss": 2.4403, + "step": 3448 + }, + { + "epoch": 0.972165456979776, + "grad_norm": 928.01953125, + "learning_rate": 1.9930610523390959e-07, + "loss": 2.7028, + "step": 3449 + }, + { + "epoch": 0.9724473257698542, + "grad_norm": 840.017822265625, + "learning_rate": 1.9526202208703938e-07, + "loss": 2.297, + "step": 3450 + }, + { + "epoch": 0.9727291945599323, + "grad_norm": 872.0165405273438, + "learning_rate": 1.912593091580428e-07, + "loss": 2.6563, + "step": 3451 + }, + { + "epoch": 0.9730110633500105, + "grad_norm": 1384.012939453125, + "learning_rate": 1.8729796977177138e-07, + "loss": 2.8201, + "step": 3452 + }, + { + "epoch": 0.9732929321400888, + "grad_norm": 984.0227661132812, + "learning_rate": 1.833780072187097e-07, + "loss": 2.9329, + "step": 3453 + }, + { + "epoch": 0.973574800930167, + "grad_norm": 540.017578125, + "learning_rate": 1.7949942475495863e-07, + "loss": 2.3985, + "step": 3454 + }, + { + "epoch": 0.9738566697202452, + "grad_norm": 1488.0125732421875, + "learning_rate": 1.7566222560225776e-07, + "loss": 2.4906, + "step": 3455 + }, + { + "epoch": 0.9741385385103234, + "grad_norm": 976.0130004882812, + "learning_rate": 1.7186641294797413e-07, + "loss": 3.2275, + "step": 3456 + }, + { + "epoch": 0.9744204073004017, + "grad_norm": 478.0204162597656, + "learning_rate": 1.681119899450856e-07, + "loss": 2.9551, + "step": 3457 + }, + { + "epoch": 0.9747022760904799, + "grad_norm": 1848.1097412109375, + "learning_rate": 1.6439895971220865e-07, + "loss": 3.3324, + "step": 3458 + }, + { + "epoch": 0.9749841448805581, + "grad_norm": 1080.020751953125, + "learning_rate": 1.6072732533355395e-07, + "loss": 3.1706, + "step": 3459 + }, + { + "epoch": 0.9752660136706364, + "grad_norm": 876.0193481445312, + "learning_rate": 1.5709708985895965e-07, + "loss": 2.95, + "step": 3460 + }, + { + "epoch": 0.9755478824607146, + "grad_norm": 700.01806640625, + "learning_rate": 1.5350825630388033e-07, + "loss": 2.9938, + "step": 3461 + }, + { + "epoch": 0.9758297512507927, + "grad_norm": 1032.018798828125, + "learning_rate": 1.4996082764937025e-07, + "loss": 2.2227, + "step": 3462 + }, + { + "epoch": 0.9761116200408709, + "grad_norm": 1096.013671875, + "learning_rate": 1.464548068421001e-07, + "loss": 2.9932, + "step": 3463 + }, + { + "epoch": 0.9763934888309492, + "grad_norm": 760.0137939453125, + "learning_rate": 1.4299019679432924e-07, + "loss": 2.5586, + "step": 3464 + }, + { + "epoch": 0.9766753576210274, + "grad_norm": 1592.023193359375, + "learning_rate": 1.3956700038393888e-07, + "loss": 3.3965, + "step": 3465 + }, + { + "epoch": 0.9769572264111056, + "grad_norm": 1712.009521484375, + "learning_rate": 1.3618522045439895e-07, + "loss": 2.436, + "step": 3466 + }, + { + "epoch": 0.9772390952011838, + "grad_norm": 1088.0133056640625, + "learning_rate": 1.328448598147791e-07, + "loss": 2.6498, + "step": 3467 + }, + { + "epoch": 0.9775209639912621, + "grad_norm": 2048.009765625, + "learning_rate": 1.295459212397432e-07, + "loss": 2.5928, + "step": 3468 + }, + { + "epoch": 0.9778028327813403, + "grad_norm": 1488.0128173828125, + "learning_rate": 1.2628840746954363e-07, + "loss": 2.9056, + "step": 3469 + }, + { + "epoch": 0.9780847015714185, + "grad_norm": 1232.0155029296875, + "learning_rate": 1.230723212100382e-07, + "loss": 3.1149, + "step": 3470 + }, + { + "epoch": 0.9783665703614968, + "grad_norm": 1480.010986328125, + "learning_rate": 1.1989766513265664e-07, + "loss": 2.3528, + "step": 3471 + }, + { + "epoch": 0.978648439151575, + "grad_norm": 1280.0084228515625, + "learning_rate": 1.1676444187442848e-07, + "loss": 2.8994, + "step": 3472 + }, + { + "epoch": 0.9789303079416531, + "grad_norm": 500.0232849121094, + "learning_rate": 1.1367265403794958e-07, + "loss": 2.7995, + "step": 3473 + }, + { + "epoch": 0.9792121767317313, + "grad_norm": 288.0411682128906, + "learning_rate": 1.1062230419141562e-07, + "loss": 3.0609, + "step": 3474 + }, + { + "epoch": 0.9794940455218096, + "grad_norm": 1328.0159912109375, + "learning_rate": 1.0761339486859424e-07, + "loss": 2.5547, + "step": 3475 + }, + { + "epoch": 0.9797759143118878, + "grad_norm": 1600.0081787109375, + "learning_rate": 1.0464592856882504e-07, + "loss": 2.9352, + "step": 3476 + }, + { + "epoch": 0.980057783101966, + "grad_norm": 1512.0120849609375, + "learning_rate": 1.0171990775703633e-07, + "loss": 2.8415, + "step": 3477 + }, + { + "epoch": 0.9803396518920443, + "grad_norm": 528.0144653320312, + "learning_rate": 9.883533486371721e-08, + "loss": 2.61, + "step": 3478 + }, + { + "epoch": 0.9806215206821225, + "grad_norm": 1928.012939453125, + "learning_rate": 9.599221228493438e-08, + "loss": 2.5749, + "step": 3479 + }, + { + "epoch": 0.9809033894722007, + "grad_norm": 222.03543090820312, + "learning_rate": 9.319054238232094e-08, + "loss": 2.397, + "step": 3480 + }, + { + "epoch": 0.9811852582622789, + "grad_norm": 482.0177307128906, + "learning_rate": 9.0430327483082e-08, + "loss": 2.822, + "step": 3481 + }, + { + "epoch": 0.9814671270523572, + "grad_norm": 748.0298461914062, + "learning_rate": 8.771156987997797e-08, + "loss": 2.6097, + "step": 3482 + }, + { + "epoch": 0.9817489958424354, + "grad_norm": 836.024658203125, + "learning_rate": 8.503427183135238e-08, + "loss": 2.7666, + "step": 3483 + }, + { + "epoch": 0.9820308646325135, + "grad_norm": 1752.034423828125, + "learning_rate": 8.239843556108739e-08, + "loss": 3.0326, + "step": 3484 + }, + { + "epoch": 0.9823127334225917, + "grad_norm": 1096.0157470703125, + "learning_rate": 7.980406325864276e-08, + "loss": 2.2061, + "step": 3485 + }, + { + "epoch": 0.98259460221267, + "grad_norm": 1208.0225830078125, + "learning_rate": 7.725115707902797e-08, + "loss": 2.6502, + "step": 3486 + }, + { + "epoch": 0.9828764710027482, + "grad_norm": 1936.013671875, + "learning_rate": 7.473971914280787e-08, + "loss": 2.7484, + "step": 3487 + }, + { + "epoch": 0.9831583397928264, + "grad_norm": 1560.0279541015625, + "learning_rate": 7.22697515361026e-08, + "loss": 2.9928, + "step": 3488 + }, + { + "epoch": 0.9834402085829047, + "grad_norm": 1088.03125, + "learning_rate": 6.984125631058768e-08, + "loss": 2.5985, + "step": 3489 + }, + { + "epoch": 0.9837220773729829, + "grad_norm": 728.02490234375, + "learning_rate": 6.745423548348839e-08, + "loss": 2.9128, + "step": 3490 + }, + { + "epoch": 0.9840039461630611, + "grad_norm": 704.0316772460938, + "learning_rate": 6.510869103757978e-08, + "loss": 2.5728, + "step": 3491 + }, + { + "epoch": 0.9842858149531393, + "grad_norm": 1240.0223388671875, + "learning_rate": 6.280462492118666e-08, + "loss": 3.0566, + "step": 3492 + }, + { + "epoch": 0.9845676837432176, + "grad_norm": 1176.0189208984375, + "learning_rate": 6.054203904817812e-08, + "loss": 3.1856, + "step": 3493 + }, + { + "epoch": 0.9848495525332958, + "grad_norm": 1824.01806640625, + "learning_rate": 5.83209352979619e-08, + "loss": 2.9172, + "step": 3494 + }, + { + "epoch": 0.985131421323374, + "grad_norm": 880.0178833007812, + "learning_rate": 5.614131551549551e-08, + "loss": 2.9346, + "step": 3495 + }, + { + "epoch": 0.9854132901134521, + "grad_norm": 1192.0155029296875, + "learning_rate": 5.400318151127515e-08, + "loss": 2.585, + "step": 3496 + }, + { + "epoch": 0.9856951589035304, + "grad_norm": 1096.0115966796875, + "learning_rate": 5.190653506134124e-08, + "loss": 2.3792, + "step": 3497 + }, + { + "epoch": 0.9859770276936086, + "grad_norm": 1400.0142822265625, + "learning_rate": 4.985137790726735e-08, + "loss": 2.6532, + "step": 3498 + }, + { + "epoch": 0.9862588964836868, + "grad_norm": 1464.01318359375, + "learning_rate": 4.7837711756171245e-08, + "loss": 2.0822, + "step": 3499 + }, + { + "epoch": 0.9865407652737651, + "grad_norm": 1272.0101318359375, + "learning_rate": 4.586553828069273e-08, + "loss": 2.9935, + "step": 3500 + }, + { + "epoch": 0.9868226340638433, + "grad_norm": 556.0151977539062, + "learning_rate": 4.39348591190214e-08, + "loss": 3.0017, + "step": 3501 + }, + { + "epoch": 0.9871045028539215, + "grad_norm": 1560.008056640625, + "learning_rate": 4.204567587486885e-08, + "loss": 2.7712, + "step": 3502 + }, + { + "epoch": 0.9873863716439997, + "grad_norm": 1488.0162353515625, + "learning_rate": 4.019799011747982e-08, + "loss": 3.0596, + "step": 3503 + }, + { + "epoch": 0.987668240434078, + "grad_norm": 1056.0091552734375, + "learning_rate": 3.8391803381637726e-08, + "loss": 2.9541, + "step": 3504 + }, + { + "epoch": 0.9879501092241562, + "grad_norm": 932.0169067382812, + "learning_rate": 3.662711716764245e-08, + "loss": 2.6013, + "step": 3505 + }, + { + "epoch": 0.9882319780142343, + "grad_norm": 1048.0118408203125, + "learning_rate": 3.490393294133254e-08, + "loss": 2.946, + "step": 3506 + }, + { + "epoch": 0.9885138468043126, + "grad_norm": 764.0161743164062, + "learning_rate": 3.322225213406305e-08, + "loss": 3.0189, + "step": 3507 + }, + { + "epoch": 0.9887957155943908, + "grad_norm": 1088.0081787109375, + "learning_rate": 3.158207614272213e-08, + "loss": 2.586, + "step": 3508 + }, + { + "epoch": 0.989077584384469, + "grad_norm": 2256.02392578125, + "learning_rate": 2.998340632971441e-08, + "loss": 3.1888, + "step": 3509 + }, + { + "epoch": 0.9893594531745472, + "grad_norm": 516.0213012695312, + "learning_rate": 2.8426244022983218e-08, + "loss": 2.9255, + "step": 3510 + }, + { + "epoch": 0.9896413219646255, + "grad_norm": 1392.027587890625, + "learning_rate": 2.6910590515966116e-08, + "loss": 3.0641, + "step": 3511 + }, + { + "epoch": 0.9899231907547037, + "grad_norm": 87.12870788574219, + "learning_rate": 2.543644706765047e-08, + "loss": 2.8366, + "step": 3512 + }, + { + "epoch": 0.9902050595447819, + "grad_norm": 1152.0089111328125, + "learning_rate": 2.400381490253456e-08, + "loss": 2.4812, + "step": 3513 + }, + { + "epoch": 0.9904869283348601, + "grad_norm": 446.0273132324219, + "learning_rate": 2.2612695210616487e-08, + "loss": 2.7539, + "step": 3514 + }, + { + "epoch": 0.9907687971249384, + "grad_norm": 1512.009765625, + "learning_rate": 2.1263089147438577e-08, + "loss": 2.7048, + "step": 3515 + }, + { + "epoch": 0.9910506659150166, + "grad_norm": 1200.015625, + "learning_rate": 1.995499783404853e-08, + "loss": 2.8187, + "step": 3516 + }, + { + "epoch": 0.9913325347050947, + "grad_norm": 1400.01171875, + "learning_rate": 1.8688422357004966e-08, + "loss": 3.199, + "step": 3517 + }, + { + "epoch": 0.991614403495173, + "grad_norm": 540.0226440429688, + "learning_rate": 1.746336376838853e-08, + "loss": 3.0693, + "step": 3518 + }, + { + "epoch": 0.9918962722852512, + "grad_norm": 544.0159912109375, + "learning_rate": 1.627982308579079e-08, + "loss": 2.2531, + "step": 3519 + }, + { + "epoch": 0.9921781410753294, + "grad_norm": 1600.00927734375, + "learning_rate": 1.5137801292325336e-08, + "loss": 3.1742, + "step": 3520 + }, + { + "epoch": 0.9924600098654076, + "grad_norm": 1352.0142822265625, + "learning_rate": 1.4037299336600029e-08, + "loss": 2.6038, + "step": 3521 + }, + { + "epoch": 0.9927418786554859, + "grad_norm": 860.0155029296875, + "learning_rate": 1.29783181327503e-08, + "loss": 2.8971, + "step": 3522 + }, + { + "epoch": 0.9930237474455641, + "grad_norm": 708.0150146484375, + "learning_rate": 1.1960858560416954e-08, + "loss": 2.4028, + "step": 3523 + }, + { + "epoch": 0.9933056162356423, + "grad_norm": 580.0316772460938, + "learning_rate": 1.0984921464751718e-08, + "loss": 2.9736, + "step": 3524 + }, + { + "epoch": 0.9935874850257205, + "grad_norm": 592.0153198242188, + "learning_rate": 1.005050765641169e-08, + "loss": 3.0531, + "step": 3525 + }, + { + "epoch": 0.9938693538157988, + "grad_norm": 1544.0152587890625, + "learning_rate": 9.157617911570438e-09, + "loss": 2.7581, + "step": 3526 + }, + { + "epoch": 0.994151222605877, + "grad_norm": 896.0110473632812, + "learning_rate": 8.306252971901351e-09, + "loss": 1.9491, + "step": 3527 + }, + { + "epoch": 0.9944330913959551, + "grad_norm": 1496.0167236328125, + "learning_rate": 7.496413544588743e-09, + "loss": 3.4232, + "step": 3528 + }, + { + "epoch": 0.9947149601860334, + "grad_norm": 792.0169067382812, + "learning_rate": 6.728100302327845e-09, + "loss": 2.902, + "step": 3529 + }, + { + "epoch": 0.9949968289761116, + "grad_norm": 964.0136108398438, + "learning_rate": 6.0013138833137084e-09, + "loss": 3.0117, + "step": 3530 + }, + { + "epoch": 0.9952786977661898, + "grad_norm": 2144.01904296875, + "learning_rate": 5.31605489125786e-09, + "loss": 2.5762, + "step": 3531 + }, + { + "epoch": 0.995560566556268, + "grad_norm": 2256.02783203125, + "learning_rate": 4.672323895354991e-09, + "loss": 2.8086, + "step": 3532 + }, + { + "epoch": 0.9958424353463463, + "grad_norm": 1192.0283203125, + "learning_rate": 4.070121430332918e-09, + "loss": 2.9684, + "step": 3533 + }, + { + "epoch": 0.9961243041364245, + "grad_norm": 1232.0283203125, + "learning_rate": 3.509447996402626e-09, + "loss": 3.2721, + "step": 3534 + }, + { + "epoch": 0.9964061729265027, + "grad_norm": 1440.0252685546875, + "learning_rate": 2.9903040592860197e-09, + "loss": 2.8018, + "step": 3535 + }, + { + "epoch": 0.996688041716581, + "grad_norm": 912.0300903320312, + "learning_rate": 2.5126900502159268e-09, + "loss": 2.5107, + "step": 3536 + }, + { + "epoch": 0.9969699105066592, + "grad_norm": 608.0260009765625, + "learning_rate": 2.0766063659138912e-09, + "loss": 2.2817, + "step": 3537 + }, + { + "epoch": 0.9972517792967374, + "grad_norm": 1368.0216064453125, + "learning_rate": 1.6820533686179306e-09, + "loss": 2.8211, + "step": 3538 + }, + { + "epoch": 0.9975336480868155, + "grad_norm": 1280.0107421875, + "learning_rate": 1.329031386060331e-09, + "loss": 2.5254, + "step": 3539 + }, + { + "epoch": 0.9978155168768938, + "grad_norm": 944.0214233398438, + "learning_rate": 1.0175407114731971e-09, + "loss": 2.6421, + "step": 3540 + }, + { + "epoch": 0.998097385666972, + "grad_norm": 394.041748046875, + "learning_rate": 7.475816036051076e-10, + "loss": 2.9111, + "step": 3541 + }, + { + "epoch": 0.9983792544570502, + "grad_norm": 474.02777099609375, + "learning_rate": 5.191542866878063e-10, + "loss": 2.7517, + "step": 3542 + }, + { + "epoch": 0.9986611232471284, + "grad_norm": 1272.0213623046875, + "learning_rate": 3.3225895047506174e-10, + "loss": 2.614, + "step": 3543 + }, + { + "epoch": 0.9989429920372067, + "grad_norm": 556.025146484375, + "learning_rate": 1.8689575020380824e-10, + "loss": 2.8845, + "step": 3544 + }, + { + "epoch": 0.9992248608272849, + "grad_norm": 1304.029052734375, + "learning_rate": 8.306480661635085e-11, + "loss": 2.9408, + "step": 3545 + }, + { + "epoch": 0.9995067296173631, + "grad_norm": 668.022705078125, + "learning_rate": 2.0766205965916385e-11, + "loss": 2.9515, + "step": 3546 + }, + { + "epoch": 0.9997885984074414, + "grad_norm": 1368.0181884765625, + "learning_rate": 0.0, + "loss": 3.3613, + "step": 3547 + } + ], + "logging_steps": 1, + "max_steps": 3547, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 239, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.968840278120858e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}