diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" deleted file mode 100644--- "a/last-checkpoint/trainer_state.json" +++ /dev/null @@ -1,7416 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 15.27020630048712, - "global_step": 1000000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.02, - "learning_rate": 2.9999999999999997e-06, - "loss": 0.8784, - "step": 1000 - }, - { - "epoch": 0.03, - "learning_rate": 5.999999999999999e-06, - "loss": 0.77, - "step": 2000 - }, - { - "epoch": 0.05, - "learning_rate": 8.999999999999999e-06, - "loss": 0.7664, - "step": 3000 - }, - { - "epoch": 0.06, - "learning_rate": 1.1999999999999999e-05, - "loss": 0.7655, - "step": 4000 - }, - { - "epoch": 0.08, - "learning_rate": 1.4999999999999999e-05, - "loss": 0.765, - "step": 5000 - }, - { - "epoch": 0.08, - "eval_runtime": 1.3797, - "eval_samples_per_second": 724.791, - "eval_steps_per_second": 11.597, - "step": 5000 - }, - { - "epoch": 0.09, - "learning_rate": 1.7999999999999997e-05, - "loss": 0.7647, - "step": 6000 - }, - { - "epoch": 0.11, - "learning_rate": 2.1e-05, - "loss": 0.7644, - "step": 7000 - }, - { - "epoch": 0.12, - "learning_rate": 2.3999999999999997e-05, - "loss": 0.7638, - "step": 8000 - }, - { - "epoch": 0.14, - "learning_rate": 2.6999999999999996e-05, - "loss": 0.7633, - "step": 9000 - }, - { - "epoch": 0.15, - "learning_rate": 2.9999999999999997e-05, - "loss": 0.76, - "step": 10000 - }, - { - "epoch": 0.15, - "eval_runtime": 1.1376, - "eval_samples_per_second": 879.066, - "eval_steps_per_second": 14.065, - "step": 10000 - }, - { - "epoch": 0.17, - "learning_rate": 3.2999999999999996e-05, - "loss": 0.7148, - "step": 11000 - }, - { - "epoch": 0.18, - "learning_rate": 3.5999999999999994e-05, - "loss": 0.6963, - "step": 12000 - }, - { - "epoch": 0.2, - "learning_rate": 3.9e-05, - "loss": 0.6755, - "step": 13000 - }, - { - "epoch": 0.21, - "learning_rate": 4.2e-05, - "loss": 0.6516, - "step": 14000 - }, - { - "epoch": 0.23, - "learning_rate": 4.4999999999999996e-05, - "loss": 0.6412, - "step": 15000 - }, - { - "epoch": 0.23, - "eval_runtime": 1.1689, - "eval_samples_per_second": 855.472, - "eval_steps_per_second": 13.688, - "step": 15000 - }, - { - "epoch": 0.24, - "learning_rate": 4.7999999999999994e-05, - "loss": 0.6348, - "step": 16000 - }, - { - "epoch": 0.26, - "learning_rate": 5.1e-05, - "loss": 0.6295, - "step": 17000 - }, - { - "epoch": 0.27, - "learning_rate": 5.399999999999999e-05, - "loss": 0.6224, - "step": 18000 - }, - { - "epoch": 0.29, - "learning_rate": 5.6999999999999996e-05, - "loss": 0.6169, - "step": 19000 - }, - { - "epoch": 0.31, - "learning_rate": 5.9999999999999995e-05, - "loss": 0.6113, - "step": 20000 - }, - { - "epoch": 0.31, - "eval_runtime": 1.0179, - "eval_samples_per_second": 982.441, - "eval_steps_per_second": 15.719, - "step": 20000 - }, - { - "epoch": 0.32, - "learning_rate": 6.299999999999999e-05, - "loss": 0.6074, - "step": 21000 - }, - { - "epoch": 0.34, - "learning_rate": 6.599999999999999e-05, - "loss": 0.6039, - "step": 22000 - }, - { - "epoch": 0.35, - "learning_rate": 6.9e-05, - "loss": 0.6005, - "step": 23000 - }, - { - "epoch": 0.37, - "learning_rate": 7.199999999999999e-05, - "loss": 0.5968, - "step": 24000 - }, - { - "epoch": 0.38, - "learning_rate": 7.5e-05, - "loss": 0.5932, - "step": 25000 - }, - { - "epoch": 0.38, - "eval_runtime": 1.1249, - "eval_samples_per_second": 888.989, - "eval_steps_per_second": 14.224, - "step": 25000 - }, - { - "epoch": 0.4, - "learning_rate": 7.8e-05, - "loss": 0.5912, - "step": 26000 - }, - { - "epoch": 0.41, - "learning_rate": 8.1e-05, - "loss": 0.58, - "step": 27000 - }, - { - "epoch": 0.43, - "learning_rate": 8.4e-05, - "loss": 0.5698, - "step": 28000 - }, - { - "epoch": 0.44, - "learning_rate": 8.699999999999999e-05, - "loss": 0.5639, - "step": 29000 - }, - { - "epoch": 0.46, - "learning_rate": 8.999999999999999e-05, - "loss": 0.5601, - "step": 30000 - }, - { - "epoch": 0.46, - "eval_runtime": 1.0096, - "eval_samples_per_second": 990.512, - "eval_steps_per_second": 15.848, - "step": 30000 - }, - { - "epoch": 0.47, - "learning_rate": 9.3e-05, - "loss": 0.5536, - "step": 31000 - }, - { - "epoch": 0.49, - "learning_rate": 9.599999999999999e-05, - "loss": 0.5496, - "step": 32000 - }, - { - "epoch": 0.5, - "learning_rate": 9.9e-05, - "loss": 0.5458, - "step": 33000 - }, - { - "epoch": 0.52, - "learning_rate": 0.000102, - "loss": 0.5426, - "step": 34000 - }, - { - "epoch": 0.53, - "learning_rate": 0.00010499999999999999, - "loss": 0.5394, - "step": 35000 - }, - { - "epoch": 0.53, - "eval_runtime": 1.3102, - "eval_samples_per_second": 763.27, - "eval_steps_per_second": 12.212, - "step": 35000 - }, - { - "epoch": 0.55, - "learning_rate": 0.00010799999999999998, - "loss": 0.5345, - "step": 36000 - }, - { - "epoch": 0.56, - "learning_rate": 0.00011099999999999999, - "loss": 0.5302, - "step": 37000 - }, - { - "epoch": 0.58, - "learning_rate": 0.00011399999999999999, - "loss": 0.527, - "step": 38000 - }, - { - "epoch": 0.6, - "learning_rate": 0.000117, - "loss": 0.5232, - "step": 39000 - }, - { - "epoch": 0.61, - "learning_rate": 0.00011999999999999999, - "loss": 0.5202, - "step": 40000 - }, - { - "epoch": 0.61, - "eval_runtime": 1.0146, - "eval_samples_per_second": 985.598, - "eval_steps_per_second": 15.77, - "step": 40000 - }, - { - "epoch": 0.63, - "learning_rate": 0.00012299999999999998, - "loss": 0.5163, - "step": 41000 - }, - { - "epoch": 0.64, - "learning_rate": 0.00012599999999999997, - "loss": 0.5126, - "step": 42000 - }, - { - "epoch": 0.66, - "learning_rate": 0.000129, - "loss": 0.5094, - "step": 43000 - }, - { - "epoch": 0.67, - "learning_rate": 0.00013199999999999998, - "loss": 0.5061, - "step": 44000 - }, - { - "epoch": 0.69, - "learning_rate": 0.000135, - "loss": 0.5036, - "step": 45000 - }, - { - "epoch": 0.69, - "eval_runtime": 1.0362, - "eval_samples_per_second": 965.067, - "eval_steps_per_second": 15.441, - "step": 45000 - }, - { - "epoch": 0.7, - "learning_rate": 0.000138, - "loss": 0.4995, - "step": 46000 - }, - { - "epoch": 0.72, - "learning_rate": 0.00014099999999999998, - "loss": 0.4967, - "step": 47000 - }, - { - "epoch": 0.73, - "learning_rate": 0.00014399999999999998, - "loss": 0.4934, - "step": 48000 - }, - { - "epoch": 0.75, - "learning_rate": 0.000147, - "loss": 0.4898, - "step": 49000 - }, - { - "epoch": 0.76, - "learning_rate": 0.00015, - "loss": 0.4863, - "step": 50000 - }, - { - "epoch": 0.76, - "eval_runtime": 1.0374, - "eval_samples_per_second": 963.96, - "eval_steps_per_second": 15.423, - "step": 50000 - }, - { - "epoch": 0.78, - "learning_rate": 0.0001499996172456075, - "loss": 0.4824, - "step": 51000 - }, - { - "epoch": 0.79, - "learning_rate": 0.00014999846898661572, - "loss": 0.4778, - "step": 52000 - }, - { - "epoch": 0.81, - "learning_rate": 0.00014999655523558183, - "loss": 0.474, - "step": 53000 - }, - { - "epoch": 0.82, - "learning_rate": 0.00014999387601343436, - "loss": 0.4694, - "step": 54000 - }, - { - "epoch": 0.84, - "learning_rate": 0.00014999043134947282, - "loss": 0.4651, - "step": 55000 - }, - { - "epoch": 0.84, - "eval_runtime": 1.0465, - "eval_samples_per_second": 955.566, - "eval_steps_per_second": 15.289, - "step": 55000 - }, - { - "epoch": 0.86, - "learning_rate": 0.00014998622128136748, - "loss": 0.4608, - "step": 56000 - }, - { - "epoch": 0.87, - "learning_rate": 0.000149981245855159, - "loss": 0.4566, - "step": 57000 - }, - { - "epoch": 0.89, - "learning_rate": 0.00014997550512525784, - "loss": 0.4523, - "step": 58000 - }, - { - "epoch": 0.9, - "learning_rate": 0.0001499689991544437, - "loss": 0.4483, - "step": 59000 - }, - { - "epoch": 0.92, - "learning_rate": 0.00014996172801386482, - "loss": 0.4447, - "step": 60000 - }, - { - "epoch": 0.92, - "eval_runtime": 1.2288, - "eval_samples_per_second": 813.826, - "eval_steps_per_second": 13.021, - "step": 60000 - }, - { - "epoch": 0.93, - "learning_rate": 0.00014995369178303722, - "loss": 0.4408, - "step": 61000 - }, - { - "epoch": 0.95, - "learning_rate": 0.0001499448905498439, - "loss": 0.4381, - "step": 62000 - }, - { - "epoch": 0.96, - "learning_rate": 0.00014993532441053364, - "loss": 0.434, - "step": 63000 - }, - { - "epoch": 0.98, - "learning_rate": 0.0001499249934697203, - "loss": 0.4316, - "step": 64000 - }, - { - "epoch": 0.99, - "learning_rate": 0.0001499138978403813, - "loss": 0.4275, - "step": 65000 - }, - { - "epoch": 0.99, - "eval_runtime": 1.0345, - "eval_samples_per_second": 966.655, - "eval_steps_per_second": 15.466, - "step": 65000 - }, - { - "epoch": 1.01, - "learning_rate": 0.00014990203764385677, - "loss": 0.425, - "step": 66000 - }, - { - "epoch": 1.02, - "learning_rate": 0.00014988941300984784, - "loss": 0.422, - "step": 67000 - }, - { - "epoch": 1.04, - "learning_rate": 0.0001498760240764155, - "loss": 0.4191, - "step": 68000 - }, - { - "epoch": 1.05, - "learning_rate": 0.000149861870989979, - "loss": 0.4164, - "step": 69000 - }, - { - "epoch": 1.07, - "learning_rate": 0.0001498469539053142, - "loss": 0.4138, - "step": 70000 - }, - { - "epoch": 1.07, - "eval_runtime": 1.1341, - "eval_samples_per_second": 881.784, - "eval_steps_per_second": 14.109, - "step": 70000 - }, - { - "epoch": 1.08, - "learning_rate": 0.00014983127298555198, - "loss": 0.4114, - "step": 71000 - }, - { - "epoch": 1.1, - "learning_rate": 0.00014981482840217632, - "loss": 0.4086, - "step": 72000 - }, - { - "epoch": 1.11, - "learning_rate": 0.00014979762033502262, - "loss": 0.4066, - "step": 73000 - }, - { - "epoch": 1.13, - "learning_rate": 0.00014977964897227547, - "loss": 0.4042, - "step": 74000 - }, - { - "epoch": 1.15, - "learning_rate": 0.00014976091451046687, - "loss": 0.402, - "step": 75000 - }, - { - "epoch": 1.15, - "eval_runtime": 1.0331, - "eval_samples_per_second": 967.957, - "eval_steps_per_second": 15.487, - "step": 75000 - }, - { - "epoch": 1.16, - "learning_rate": 0.00014974141715447386, - "loss": 0.3999, - "step": 76000 - }, - { - "epoch": 1.18, - "learning_rate": 0.00014972115711751644, - "loss": 0.398, - "step": 77000 - }, - { - "epoch": 1.19, - "learning_rate": 0.00014970013462115505, - "loss": 0.3971, - "step": 78000 - }, - { - "epoch": 1.21, - "learning_rate": 0.00014967834989528843, - "loss": 0.3942, - "step": 79000 - }, - { - "epoch": 1.22, - "learning_rate": 0.00014965580317815078, - "loss": 0.3926, - "step": 80000 - }, - { - "epoch": 1.22, - "eval_runtime": 1.084, - "eval_samples_per_second": 922.521, - "eval_steps_per_second": 14.76, - "step": 80000 - }, - { - "epoch": 1.24, - "learning_rate": 0.00014963249471630944, - "loss": 0.3906, - "step": 81000 - }, - { - "epoch": 1.25, - "learning_rate": 0.000149608424764662, - "loss": 0.391, - "step": 82000 - }, - { - "epoch": 1.27, - "learning_rate": 0.0001495835935864336, - "loss": 0.3875, - "step": 83000 - }, - { - "epoch": 1.28, - "learning_rate": 0.00014955800145317397, - "loss": 0.3861, - "step": 84000 - }, - { - "epoch": 1.3, - "learning_rate": 0.00014953164864475466, - "loss": 0.3844, - "step": 85000 - }, - { - "epoch": 1.3, - "eval_runtime": 1.0992, - "eval_samples_per_second": 909.734, - "eval_steps_per_second": 14.556, - "step": 85000 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001495045354493657, - "loss": 0.3829, - "step": 86000 - }, - { - "epoch": 1.33, - "learning_rate": 0.00014947666216351272, - "loss": 0.3815, - "step": 87000 - }, - { - "epoch": 1.34, - "learning_rate": 0.00014944802909201344, - "loss": 0.38, - "step": 88000 - }, - { - "epoch": 1.36, - "learning_rate": 0.00014941863654799456, - "loss": 0.3789, - "step": 89000 - }, - { - "epoch": 1.37, - "learning_rate": 0.00014938848485288825, - "loss": 0.3785, - "step": 90000 - }, - { - "epoch": 1.37, - "eval_runtime": 0.9266, - "eval_samples_per_second": 1079.167, - "eval_steps_per_second": 17.267, - "step": 90000 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001493575743364286, - "loss": 0.3766, - "step": 91000 - }, - { - "epoch": 1.4, - "learning_rate": 0.00014932590533664808, - "loss": 0.3745, - "step": 92000 - }, - { - "epoch": 1.42, - "learning_rate": 0.0001492934781998738, - "loss": 0.3741, - "step": 93000 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001492602932807237, - "loss": 0.3729, - "step": 94000 - }, - { - "epoch": 1.45, - "learning_rate": 0.00014922635094210277, - "loss": 0.3709, - "step": 95000 - }, - { - "epoch": 1.45, - "eval_runtime": 0.9895, - "eval_samples_per_second": 1010.579, - "eval_steps_per_second": 16.169, - "step": 95000 - }, - { - "epoch": 1.47, - "learning_rate": 0.000149191651555199, - "loss": 0.3699, - "step": 96000 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001491561954994793, - "loss": 0.3688, - "step": 97000 - }, - { - "epoch": 1.5, - "learning_rate": 0.00014911998316268537, - "loss": 0.3678, - "step": 98000 - }, - { - "epoch": 1.51, - "learning_rate": 0.00014908301494082963, - "loss": 0.3666, - "step": 99000 - }, - { - "epoch": 1.53, - "learning_rate": 0.00014904529123819054, - "loss": 0.3654, - "step": 100000 - }, - { - "epoch": 1.53, - "eval_runtime": 1.0046, - "eval_samples_per_second": 995.424, - "eval_steps_per_second": 15.927, - "step": 100000 - }, - { - "epoch": 1.54, - "learning_rate": 0.00014900681246730852, - "loss": 0.3643, - "step": 101000 - }, - { - "epoch": 1.56, - "learning_rate": 0.00014896757904898125, - "loss": 0.3646, - "step": 102000 - }, - { - "epoch": 1.57, - "learning_rate": 0.00014892759141225904, - "loss": 0.3628, - "step": 103000 - }, - { - "epoch": 1.59, - "learning_rate": 0.00014888684999444035, - "loss": 0.3616, - "step": 104000 - }, - { - "epoch": 1.6, - "learning_rate": 0.00014884535524106675, - "loss": 0.3604, - "step": 105000 - }, - { - "epoch": 1.6, - "eval_runtime": 1.0499, - "eval_samples_per_second": 952.499, - "eval_steps_per_second": 15.24, - "step": 105000 - }, - { - "epoch": 1.62, - "learning_rate": 0.00014880310760591824, - "loss": 0.3594, - "step": 106000 - }, - { - "epoch": 1.63, - "learning_rate": 0.0001487601075510082, - "loss": 0.3597, - "step": 107000 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001487163555465783, - "loss": 0.3583, - "step": 108000 - }, - { - "epoch": 1.66, - "learning_rate": 0.0001486718520710935, - "loss": 0.3583, - "step": 109000 - }, - { - "epoch": 1.68, - "learning_rate": 0.00014862659761123663, - "loss": 0.3558, - "step": 110000 - }, - { - "epoch": 1.68, - "eval_runtime": 1.0153, - "eval_samples_per_second": 984.91, - "eval_steps_per_second": 15.759, - "step": 110000 - }, - { - "epoch": 1.69, - "learning_rate": 0.00014858059266190327, - "loss": 0.3552, - "step": 111000 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014853383772619612, - "loss": 0.3544, - "step": 112000 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014848633331541967, - "loss": 0.3537, - "step": 113000 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001484380799490746, - "loss": 0.3524, - "step": 114000 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014838907815485194, - "loss": 0.3519, - "step": 115000 - }, - { - "epoch": 1.76, - "eval_runtime": 1.003, - "eval_samples_per_second": 997.001, - "eval_steps_per_second": 15.952, - "step": 115000 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014833932846862748, - "loss": 0.3511, - "step": 116000 - }, - { - "epoch": 1.79, - "learning_rate": 0.00014828883143445582, - "loss": 0.3502, - "step": 117000 - }, - { - "epoch": 1.8, - "learning_rate": 0.0001482375876045644, - "loss": 0.3493, - "step": 118000 - }, - { - "epoch": 1.82, - "learning_rate": 0.0001481855975393476, - "loss": 0.3489, - "step": 119000 - }, - { - "epoch": 1.83, - "learning_rate": 0.0001481328618073604, - "loss": 0.3482, - "step": 120000 - }, - { - "epoch": 1.83, - "eval_runtime": 1.0596, - "eval_samples_per_second": 943.744, - "eval_steps_per_second": 15.1, - "step": 120000 - }, - { - "epoch": 1.85, - "learning_rate": 0.0001480793809853123, - "loss": 0.3478, - "step": 121000 - }, - { - "epoch": 1.86, - "learning_rate": 0.00014802515565806107, - "loss": 0.3468, - "step": 122000 - }, - { - "epoch": 1.88, - "learning_rate": 0.00014797018641860612, - "loss": 0.346, - "step": 123000 - }, - { - "epoch": 1.89, - "learning_rate": 0.0001479144738680823, - "loss": 0.3474, - "step": 124000 - }, - { - "epoch": 1.91, - "learning_rate": 0.00014785801861575312, - "loss": 0.3447, - "step": 125000 - }, - { - "epoch": 1.91, - "eval_runtime": 0.9375, - "eval_samples_per_second": 1066.699, - "eval_steps_per_second": 17.067, - "step": 125000 - }, - { - "epoch": 1.92, - "learning_rate": 0.00014780082127900416, - "loss": 0.3439, - "step": 126000 - }, - { - "epoch": 1.94, - "learning_rate": 0.00014774288248333635, - "loss": 0.3436, - "step": 127000 - }, - { - "epoch": 1.95, - "learning_rate": 0.00014768420286235908, - "loss": 0.3429, - "step": 128000 - }, - { - "epoch": 1.97, - "learning_rate": 0.00014762478305778328, - "loss": 0.3422, - "step": 129000 - }, - { - "epoch": 1.99, - "learning_rate": 0.0001475646237194144, - "loss": 0.3414, - "step": 130000 - }, - { - "epoch": 1.99, - "eval_runtime": 1.0085, - "eval_samples_per_second": 991.553, - "eval_steps_per_second": 15.865, - "step": 130000 - }, - { - "epoch": 2.0, - "learning_rate": 0.00014750372550514533, - "loss": 0.3409, - "step": 131000 - }, - { - "epoch": 2.02, - "learning_rate": 0.0001474420890809492, - "loss": 0.3401, - "step": 132000 - }, - { - "epoch": 2.03, - "learning_rate": 0.00014737971512087202, - "loss": 0.3396, - "step": 133000 - }, - { - "epoch": 2.05, - "learning_rate": 0.00014731660430702552, - "loss": 0.339, - "step": 134000 - }, - { - "epoch": 2.06, - "learning_rate": 0.00014725275732957937, - "loss": 0.3402, - "step": 135000 - }, - { - "epoch": 2.06, - "eval_runtime": 1.1005, - "eval_samples_per_second": 908.652, - "eval_steps_per_second": 14.538, - "step": 135000 - }, - { - "epoch": 2.08, - "learning_rate": 0.00014718817488675387, - "loss": 0.3379, - "step": 136000 - }, - { - "epoch": 2.09, - "learning_rate": 0.00014712285768481235, - "loss": 0.3371, - "step": 137000 - }, - { - "epoch": 2.11, - "learning_rate": 0.00014705680643805323, - "loss": 0.3368, - "step": 138000 - }, - { - "epoch": 2.12, - "learning_rate": 0.00014699002186880232, - "loss": 0.3363, - "step": 139000 - }, - { - "epoch": 2.14, - "learning_rate": 0.00014692250470740503, - "loss": 0.3361, - "step": 140000 - }, - { - "epoch": 2.14, - "eval_runtime": 1.0104, - "eval_samples_per_second": 989.716, - "eval_steps_per_second": 15.835, - "step": 140000 - }, - { - "epoch": 2.15, - "learning_rate": 0.00014685425569221819, - "loss": 0.3353, - "step": 141000 - }, - { - "epoch": 2.17, - "learning_rate": 0.00014678527556960207, - "loss": 0.3346, - "step": 142000 - }, - { - "epoch": 2.18, - "learning_rate": 0.0001467155650939123, - "loss": 0.3342, - "step": 143000 - }, - { - "epoch": 2.2, - "learning_rate": 0.00014664512502749141, - "loss": 0.3338, - "step": 144000 - }, - { - "epoch": 2.21, - "learning_rate": 0.00014657395614066075, - "loss": 0.3334, - "step": 145000 - }, - { - "epoch": 2.21, - "eval_runtime": 1.0369, - "eval_samples_per_second": 964.439, - "eval_steps_per_second": 15.431, - "step": 145000 - }, - { - "epoch": 2.23, - "learning_rate": 0.0001465020592117118, - "loss": 0.3327, - "step": 146000 - }, - { - "epoch": 2.24, - "learning_rate": 0.0001464294350268979, - "loss": 0.3324, - "step": 147000 - }, - { - "epoch": 2.26, - "learning_rate": 0.00014635608438042546, - "loss": 0.3319, - "step": 148000 - }, - { - "epoch": 2.28, - "learning_rate": 0.00014628200807444543, - "loss": 0.3313, - "step": 149000 - }, - { - "epoch": 2.29, - "learning_rate": 0.0001462072069190444, - "loss": 0.3307, - "step": 150000 - }, - { - "epoch": 2.29, - "eval_runtime": 1.0431, - "eval_samples_per_second": 958.687, - "eval_steps_per_second": 15.339, - "step": 150000 - }, - { - "epoch": 2.31, - "learning_rate": 0.00014613168173223585, - "loss": 0.3308, - "step": 151000 - }, - { - "epoch": 2.32, - "learning_rate": 0.00014605543333995113, - "loss": 0.3302, - "step": 152000 - }, - { - "epoch": 2.34, - "learning_rate": 0.00014597846257603038, - "loss": 0.3294, - "step": 153000 - }, - { - "epoch": 2.35, - "learning_rate": 0.0001459007702822136, - "loss": 0.329, - "step": 154000 - }, - { - "epoch": 2.37, - "learning_rate": 0.00014582235730813128, - "loss": 0.3283, - "step": 155000 - }, - { - "epoch": 2.37, - "eval_runtime": 1.0629, - "eval_samples_per_second": 940.817, - "eval_steps_per_second": 15.053, - "step": 155000 - }, - { - "epoch": 2.38, - "learning_rate": 0.00014574322451129507, - "loss": 0.3281, - "step": 156000 - }, - { - "epoch": 2.4, - "learning_rate": 0.00014566337275708863, - "loss": 0.328, - "step": 157000 - }, - { - "epoch": 2.41, - "learning_rate": 0.0001455828029187579, - "loss": 0.3272, - "step": 158000 - }, - { - "epoch": 2.43, - "learning_rate": 0.00014550151587740178, - "loss": 0.3269, - "step": 159000 - }, - { - "epoch": 2.44, - "learning_rate": 0.00014541951252196225, - "loss": 0.3265, - "step": 160000 - }, - { - "epoch": 2.44, - "eval_runtime": 1.0199, - "eval_samples_per_second": 980.452, - "eval_steps_per_second": 15.687, - "step": 160000 - }, - { - "epoch": 2.46, - "learning_rate": 0.00014533679374921493, - "loss": 0.3259, - "step": 161000 - }, - { - "epoch": 2.47, - "learning_rate": 0.00014525336046375905, - "loss": 0.3254, - "step": 162000 - }, - { - "epoch": 2.49, - "learning_rate": 0.00014516921357800766, - "loss": 0.3251, - "step": 163000 - }, - { - "epoch": 2.5, - "learning_rate": 0.00014508435401217759, - "loss": 0.3244, - "step": 164000 - }, - { - "epoch": 2.52, - "learning_rate": 0.00014499878269427948, - "loss": 0.3243, - "step": 165000 - }, - { - "epoch": 2.52, - "eval_runtime": 1.0655, - "eval_samples_per_second": 938.486, - "eval_steps_per_second": 15.016, - "step": 165000 - }, - { - "epoch": 2.53, - "learning_rate": 0.00014491250056010758, - "loss": 0.3236, - "step": 166000 - }, - { - "epoch": 2.55, - "learning_rate": 0.00014482550855322943, - "loss": 0.3233, - "step": 167000 - }, - { - "epoch": 2.57, - "learning_rate": 0.0001447378076249757, - "loss": 0.3231, - "step": 168000 - }, - { - "epoch": 2.58, - "learning_rate": 0.00014464939873442973, - "loss": 0.3228, - "step": 169000 - }, - { - "epoch": 2.6, - "learning_rate": 0.00014456028284841693, - "loss": 0.3221, - "step": 170000 - }, - { - "epoch": 2.6, - "eval_runtime": 1.1756, - "eval_samples_per_second": 850.656, - "eval_steps_per_second": 13.611, - "step": 170000 - }, - { - "epoch": 2.61, - "learning_rate": 0.00014447046094149437, - "loss": 0.3221, - "step": 171000 - }, - { - "epoch": 2.63, - "learning_rate": 0.00014437993399594003, - "loss": 0.3216, - "step": 172000 - }, - { - "epoch": 2.64, - "learning_rate": 0.0001442887030017421, - "loss": 0.3217, - "step": 173000 - }, - { - "epoch": 2.66, - "learning_rate": 0.00014419676895658807, - "loss": 0.3208, - "step": 174000 - }, - { - "epoch": 2.67, - "learning_rate": 0.000144104132865854, - "loss": 0.3207, - "step": 175000 - }, - { - "epoch": 2.67, - "eval_runtime": 1.0679, - "eval_samples_per_second": 936.423, - "eval_steps_per_second": 14.983, - "step": 175000 - }, - { - "epoch": 2.69, - "learning_rate": 0.0001440107957425933, - "loss": 0.3203, - "step": 176000 - }, - { - "epoch": 2.7, - "learning_rate": 0.0001439167586075258, - "loss": 0.3201, - "step": 177000 - }, - { - "epoch": 2.72, - "learning_rate": 0.0001438220224890265, - "loss": 0.3191, - "step": 178000 - }, - { - "epoch": 2.73, - "learning_rate": 0.00014372658842311449, - "loss": 0.3195, - "step": 179000 - }, - { - "epoch": 2.75, - "learning_rate": 0.00014363045745344137, - "loss": 0.3191, - "step": 180000 - }, - { - "epoch": 2.75, - "eval_runtime": 1.0169, - "eval_samples_per_second": 983.42, - "eval_steps_per_second": 15.735, - "step": 180000 - }, - { - "epoch": 2.76, - "learning_rate": 0.00014353363063128005, - "loss": 0.3183, - "step": 181000 - }, - { - "epoch": 2.78, - "learning_rate": 0.0001434361090155131, - "loss": 0.3177, - "step": 182000 - }, - { - "epoch": 2.79, - "learning_rate": 0.00014333789367262136, - "loss": 0.3178, - "step": 183000 - }, - { - "epoch": 2.81, - "learning_rate": 0.00014323898567667202, - "loss": 0.3177, - "step": 184000 - }, - { - "epoch": 2.82, - "learning_rate": 0.00014313938610930712, - "loss": 0.3171, - "step": 185000 - }, - { - "epoch": 2.82, - "eval_runtime": 1.0441, - "eval_samples_per_second": 957.721, - "eval_steps_per_second": 15.324, - "step": 185000 - }, - { - "epoch": 2.84, - "learning_rate": 0.00014303909605973154, - "loss": 0.3167, - "step": 186000 - }, - { - "epoch": 2.86, - "learning_rate": 0.0001429381166247012, - "loss": 0.3168, - "step": 187000 - }, - { - "epoch": 2.87, - "learning_rate": 0.00014283644890851103, - "loss": 0.3164, - "step": 188000 - }, - { - "epoch": 2.89, - "learning_rate": 0.00014273409402298291, - "loss": 0.3161, - "step": 189000 - }, - { - "epoch": 2.9, - "learning_rate": 0.00014263105308745343, - "loss": 0.3155, - "step": 190000 - }, - { - "epoch": 2.9, - "eval_runtime": 1.0119, - "eval_samples_per_second": 988.212, - "eval_steps_per_second": 15.811, - "step": 190000 - }, - { - "epoch": 2.92, - "learning_rate": 0.00014252732722876176, - "loss": 0.3149, - "step": 191000 - }, - { - "epoch": 2.93, - "learning_rate": 0.0001424229175812373, - "loss": 0.3149, - "step": 192000 - }, - { - "epoch": 2.95, - "learning_rate": 0.00014231782528668717, - "loss": 0.3146, - "step": 193000 - }, - { - "epoch": 2.96, - "learning_rate": 0.00014221205149438394, - "loss": 0.3145, - "step": 194000 - }, - { - "epoch": 2.98, - "learning_rate": 0.0001421055973610528, - "loss": 0.3138, - "step": 195000 - }, - { - "epoch": 2.98, - "eval_runtime": 1.0908, - "eval_samples_per_second": 916.734, - "eval_steps_per_second": 14.668, - "step": 195000 - }, - { - "epoch": 2.99, - "learning_rate": 0.00014199846405085913, - "loss": 0.3137, - "step": 196000 - }, - { - "epoch": 3.01, - "learning_rate": 0.00014189065273539564, - "loss": 0.3135, - "step": 197000 - }, - { - "epoch": 3.02, - "learning_rate": 0.00014178216459366958, - "loss": 0.3137, - "step": 198000 - }, - { - "epoch": 3.04, - "learning_rate": 0.00014167300081208988, - "loss": 0.3131, - "step": 199000 - }, - { - "epoch": 3.05, - "learning_rate": 0.00014156316258445421, - "loss": 0.3125, - "step": 200000 - }, - { - "epoch": 3.05, - "eval_runtime": 1.1346, - "eval_samples_per_second": 881.333, - "eval_steps_per_second": 14.101, - "step": 200000 - }, - { - "epoch": 3.07, - "learning_rate": 0.00014145265111193583, - "loss": 0.3121, - "step": 201000 - }, - { - "epoch": 3.08, - "learning_rate": 0.00014134146760307043, - "loss": 0.3122, - "step": 202000 - }, - { - "epoch": 3.1, - "learning_rate": 0.00014122961327374313, - "loss": 0.3131, - "step": 203000 - }, - { - "epoch": 3.12, - "learning_rate": 0.0001411170893471749, - "loss": 0.3116, - "step": 204000 - }, - { - "epoch": 3.13, - "learning_rate": 0.00014100389705390938, - "loss": 0.311, - "step": 205000 - }, - { - "epoch": 3.13, - "eval_runtime": 1.1239, - "eval_samples_per_second": 889.731, - "eval_steps_per_second": 14.236, - "step": 205000 - }, - { - "epoch": 3.15, - "learning_rate": 0.0001408900376317994, - "loss": 0.311, - "step": 206000 - }, - { - "epoch": 3.16, - "learning_rate": 0.0001407755123259933, - "loss": 0.3108, - "step": 207000 - }, - { - "epoch": 3.18, - "learning_rate": 0.00014066032238892152, - "loss": 0.3104, - "step": 208000 - }, - { - "epoch": 3.19, - "learning_rate": 0.00014054446908028272, - "loss": 0.3102, - "step": 209000 - }, - { - "epoch": 3.21, - "learning_rate": 0.00014042795366703018, - "loss": 0.3097, - "step": 210000 - }, - { - "epoch": 3.21, - "eval_runtime": 1.0233, - "eval_samples_per_second": 977.233, - "eval_steps_per_second": 15.636, - "step": 210000 - }, - { - "epoch": 3.22, - "learning_rate": 0.0001403107774233577, - "loss": 0.3098, - "step": 211000 - }, - { - "epoch": 3.24, - "learning_rate": 0.00014019294163068597, - "loss": 0.3093, - "step": 212000 - }, - { - "epoch": 3.25, - "learning_rate": 0.00014007444757764835, - "loss": 0.3093, - "step": 213000 - }, - { - "epoch": 3.27, - "learning_rate": 0.0001399552965600768, - "loss": 0.3088, - "step": 214000 - }, - { - "epoch": 3.28, - "learning_rate": 0.0001398354898809877, - "loss": 0.3089, - "step": 215000 - }, - { - "epoch": 3.28, - "eval_runtime": 1.0098, - "eval_samples_per_second": 990.287, - "eval_steps_per_second": 15.845, - "step": 215000 - }, - { - "epoch": 3.3, - "learning_rate": 0.0001397150288505678, - "loss": 0.3315, - "step": 216000 - }, - { - "epoch": 3.31, - "learning_rate": 0.00013959391478615959, - "loss": 0.628, - "step": 217000 - }, - { - "epoch": 3.33, - "learning_rate": 0.00013947214901224706, - "loss": 0.3112, - "step": 218000 - }, - { - "epoch": 3.34, - "learning_rate": 0.0001393497328604412, - "loss": 0.3094, - "step": 219000 - }, - { - "epoch": 3.36, - "learning_rate": 0.00013922666766946545, - "loss": 0.3082, - "step": 220000 - }, - { - "epoch": 3.36, - "eval_runtime": 1.0751, - "eval_samples_per_second": 930.139, - "eval_steps_per_second": 14.882, - "step": 220000 - }, - { - "epoch": 3.37, - "learning_rate": 0.00013910295478514106, - "loss": 0.3079, - "step": 221000 - }, - { - "epoch": 3.39, - "learning_rate": 0.0001389785955603722, - "loss": 0.3077, - "step": 222000 - }, - { - "epoch": 3.41, - "learning_rate": 0.00013885359135513154, - "loss": 0.3073, - "step": 223000 - }, - { - "epoch": 3.42, - "learning_rate": 0.000138727943536445, - "loss": 0.3064, - "step": 224000 - }, - { - "epoch": 3.44, - "learning_rate": 0.00013860165347837698, - "loss": 0.3066, - "step": 225000 - }, - { - "epoch": 3.44, - "eval_runtime": 1.0901, - "eval_samples_per_second": 917.309, - "eval_steps_per_second": 14.677, - "step": 225000 - }, - { - "epoch": 3.45, - "learning_rate": 0.00013847472256201535, - "loss": 0.306, - "step": 226000 - }, - { - "epoch": 3.47, - "learning_rate": 0.00013834715217545625, - "loss": 0.3058, - "step": 227000 - }, - { - "epoch": 3.48, - "learning_rate": 0.000138218943713789, - "loss": 0.3056, - "step": 228000 - }, - { - "epoch": 3.5, - "learning_rate": 0.0001380900985790808, - "loss": 0.3054, - "step": 229000 - }, - { - "epoch": 3.51, - "learning_rate": 0.00013796061818036138, - "loss": 0.3051, - "step": 230000 - }, - { - "epoch": 3.51, - "eval_runtime": 1.0217, - "eval_samples_per_second": 978.715, - "eval_steps_per_second": 15.659, - "step": 230000 - }, - { - "epoch": 3.53, - "learning_rate": 0.00013783050393360768, - "loss": 0.3048, - "step": 231000 - }, - { - "epoch": 3.54, - "learning_rate": 0.0001376997572617282, - "loss": 0.305, - "step": 232000 - }, - { - "epoch": 3.56, - "learning_rate": 0.00013756837959454766, - "loss": 0.3042, - "step": 233000 - }, - { - "epoch": 3.57, - "learning_rate": 0.0001374363723687911, - "loss": 0.3042, - "step": 234000 - }, - { - "epoch": 3.59, - "learning_rate": 0.00013730373702806846, - "loss": 0.304, - "step": 235000 - }, - { - "epoch": 3.59, - "eval_runtime": 1.0392, - "eval_samples_per_second": 962.319, - "eval_steps_per_second": 15.397, - "step": 235000 - }, - { - "epoch": 3.6, - "learning_rate": 0.00013717047502285855, - "loss": 0.3036, - "step": 236000 - }, - { - "epoch": 3.62, - "learning_rate": 0.0001370365878104933, - "loss": 0.3036, - "step": 237000 - }, - { - "epoch": 3.63, - "learning_rate": 0.00013690207685514185, - "loss": 0.3031, - "step": 238000 - }, - { - "epoch": 3.65, - "learning_rate": 0.0001367669436277944, - "loss": 0.3032, - "step": 239000 - }, - { - "epoch": 3.66, - "learning_rate": 0.0001366311896062463, - "loss": 0.3036, - "step": 240000 - }, - { - "epoch": 3.66, - "eval_runtime": 1.0097, - "eval_samples_per_second": 990.396, - "eval_steps_per_second": 15.846, - "step": 240000 - }, - { - "epoch": 3.68, - "learning_rate": 0.00013649481627508181, - "loss": 0.3031, - "step": 241000 - }, - { - "epoch": 3.7, - "learning_rate": 0.0001363578251256578, - "loss": 0.3023, - "step": 242000 - }, - { - "epoch": 3.71, - "learning_rate": 0.00013622021765608754, - "loss": 0.3022, - "step": 243000 - }, - { - "epoch": 3.73, - "learning_rate": 0.00013608199537122425, - "loss": 0.3017, - "step": 244000 - }, - { - "epoch": 3.74, - "learning_rate": 0.0001359431597826447, - "loss": 0.3019, - "step": 245000 - }, - { - "epoch": 3.74, - "eval_runtime": 1.0744, - "eval_samples_per_second": 930.717, - "eval_steps_per_second": 14.891, - "step": 245000 - }, - { - "epoch": 3.76, - "learning_rate": 0.0001358037124086327, - "loss": 0.3015, - "step": 246000 - }, - { - "epoch": 3.77, - "learning_rate": 0.00013566365477416233, - "loss": 0.3018, - "step": 247000 - }, - { - "epoch": 3.79, - "learning_rate": 0.00013552298841088144, - "loss": 0.3013, - "step": 248000 - }, - { - "epoch": 3.8, - "learning_rate": 0.00013538171485709486, - "loss": 0.3006, - "step": 249000 - }, - { - "epoch": 3.82, - "learning_rate": 0.00013523983565774753, - "loss": 0.3008, - "step": 250000 - }, - { - "epoch": 3.82, - "eval_runtime": 1.0168, - "eval_samples_per_second": 983.434, - "eval_steps_per_second": 15.735, - "step": 250000 - }, - { - "epoch": 3.83, - "learning_rate": 0.00013509735236440766, - "loss": 0.3003, - "step": 251000 - }, - { - "epoch": 3.85, - "learning_rate": 0.00013495426653524972, - "loss": 0.3, - "step": 252000 - }, - { - "epoch": 3.86, - "learning_rate": 0.00013481057973503742, - "loss": 0.3, - "step": 253000 - }, - { - "epoch": 3.88, - "learning_rate": 0.00013466629353510651, - "loss": 0.2997, - "step": 254000 - }, - { - "epoch": 3.89, - "learning_rate": 0.00013452140951334787, - "loss": 0.2995, - "step": 255000 - }, - { - "epoch": 3.89, - "eval_runtime": 0.8192, - "eval_samples_per_second": 1220.744, - "eval_steps_per_second": 19.532, - "step": 255000 - }, - { - "epoch": 3.91, - "learning_rate": 0.00013437592925418985, - "loss": 0.2996, - "step": 256000 - }, - { - "epoch": 3.92, - "learning_rate": 0.00013422985434858133, - "loss": 0.299, - "step": 257000 - }, - { - "epoch": 3.94, - "learning_rate": 0.00013408318639397405, - "loss": 0.2987, - "step": 258000 - }, - { - "epoch": 3.95, - "learning_rate": 0.00013393592699430525, - "loss": 0.2986, - "step": 259000 - }, - { - "epoch": 3.97, - "learning_rate": 0.00013378807775998012, - "loss": 0.2984, - "step": 260000 - }, - { - "epoch": 3.97, - "eval_runtime": 1.0461, - "eval_samples_per_second": 955.963, - "eval_steps_per_second": 15.295, - "step": 260000 - }, - { - "epoch": 3.99, - "learning_rate": 0.00013363964030785422, - "loss": 0.2983, - "step": 261000 - }, - { - "epoch": 4.0, - "learning_rate": 0.00013349061626121578, - "loss": 0.2982, - "step": 262000 - }, - { - "epoch": 4.02, - "learning_rate": 0.00013334100724976783, - "loss": 0.2977, - "step": 263000 - }, - { - "epoch": 4.03, - "learning_rate": 0.0001331908149096106, - "loss": 0.2976, - "step": 264000 - }, - { - "epoch": 4.05, - "learning_rate": 0.00013304004088322342, - "loss": 0.2978, - "step": 265000 - }, - { - "epoch": 4.05, - "eval_runtime": 1.0225, - "eval_samples_per_second": 978.001, - "eval_steps_per_second": 15.648, - "step": 265000 - }, - { - "epoch": 4.06, - "learning_rate": 0.00013288868681944692, - "loss": 0.2971, - "step": 266000 - }, - { - "epoch": 4.08, - "learning_rate": 0.00013273675437346487, - "loss": 0.2972, - "step": 267000 - }, - { - "epoch": 4.09, - "learning_rate": 0.00013258424520678618, - "loss": 0.2969, - "step": 268000 - }, - { - "epoch": 4.11, - "learning_rate": 0.00013243116098722663, - "loss": 0.2968, - "step": 269000 - }, - { - "epoch": 4.12, - "learning_rate": 0.00013227750338889077, - "loss": 0.2966, - "step": 270000 - }, - { - "epoch": 4.12, - "eval_runtime": 1.1084, - "eval_samples_per_second": 902.192, - "eval_steps_per_second": 14.435, - "step": 270000 - }, - { - "epoch": 4.14, - "learning_rate": 0.00013212327409215343, - "loss": 0.296, - "step": 271000 - }, - { - "epoch": 4.15, - "learning_rate": 0.0001319684747836415, - "loss": 0.2958, - "step": 272000 - }, - { - "epoch": 4.17, - "learning_rate": 0.0001318131071562154, - "loss": 0.2961, - "step": 273000 - }, - { - "epoch": 4.18, - "learning_rate": 0.00013165717290895067, - "loss": 0.2957, - "step": 274000 - }, - { - "epoch": 4.2, - "learning_rate": 0.0001315006737471192, - "loss": 0.2955, - "step": 275000 - }, - { - "epoch": 4.2, - "eval_runtime": 1.0552, - "eval_samples_per_second": 947.654, - "eval_steps_per_second": 15.162, - "step": 275000 - }, - { - "epoch": 4.21, - "learning_rate": 0.0001313436113821708, - "loss": 0.2952, - "step": 276000 - }, - { - "epoch": 4.23, - "learning_rate": 0.00013118598753171425, - "loss": 0.2951, - "step": 277000 - }, - { - "epoch": 4.25, - "learning_rate": 0.0001310278039194988, - "loss": 0.2951, - "step": 278000 - }, - { - "epoch": 4.26, - "learning_rate": 0.00013086906227539506, - "loss": 0.2952, - "step": 279000 - }, - { - "epoch": 4.28, - "learning_rate": 0.00013070976433537623, - "loss": 0.2946, - "step": 280000 - }, - { - "epoch": 4.28, - "eval_runtime": 1.0293, - "eval_samples_per_second": 971.532, - "eval_steps_per_second": 15.545, - "step": 280000 - }, - { - "epoch": 4.29, - "learning_rate": 0.00013054991184149905, - "loss": 0.2946, - "step": 281000 - }, - { - "epoch": 4.31, - "learning_rate": 0.00013038950654188476, - "loss": 0.2942, - "step": 282000 - }, - { - "epoch": 4.32, - "learning_rate": 0.00013022855019070005, - "loss": 0.2941, - "step": 283000 - }, - { - "epoch": 4.34, - "learning_rate": 0.0001300670445481378, - "loss": 0.2937, - "step": 284000 - }, - { - "epoch": 4.35, - "learning_rate": 0.0001299049913803978, - "loss": 0.2937, - "step": 285000 - }, - { - "epoch": 4.35, - "eval_runtime": 1.0469, - "eval_samples_per_second": 955.197, - "eval_steps_per_second": 15.283, - "step": 285000 - }, - { - "epoch": 4.37, - "learning_rate": 0.00012974239245966754, - "loss": 0.2934, - "step": 286000 - }, - { - "epoch": 4.38, - "learning_rate": 0.0001295792495641028, - "loss": 0.2962, - "step": 287000 - }, - { - "epoch": 4.4, - "learning_rate": 0.00012941556447780813, - "loss": 0.2931, - "step": 288000 - }, - { - "epoch": 4.41, - "learning_rate": 0.0001292513389908174, - "loss": 0.2931, - "step": 289000 - }, - { - "epoch": 4.43, - "learning_rate": 0.0001290865748990742, - "loss": 0.2932, - "step": 290000 - }, - { - "epoch": 4.43, - "eval_runtime": 1.0143, - "eval_samples_per_second": 985.898, - "eval_steps_per_second": 15.774, - "step": 290000 - }, - { - "epoch": 4.44, - "learning_rate": 0.00012892127400441228, - "loss": 0.2923, - "step": 291000 - }, - { - "epoch": 4.46, - "learning_rate": 0.00012875543811453576, - "loss": 0.2919, - "step": 292000 - }, - { - "epoch": 4.47, - "learning_rate": 0.0001285890690429993, - "loss": 0.2931, - "step": 293000 - }, - { - "epoch": 4.49, - "learning_rate": 0.00012842216860918846, - "loss": 0.292, - "step": 294000 - }, - { - "epoch": 4.5, - "learning_rate": 0.0001282547386382996, - "loss": 0.2914, - "step": 295000 - }, - { - "epoch": 4.5, - "eval_runtime": 1.0329, - "eval_samples_per_second": 968.135, - "eval_steps_per_second": 15.49, - "step": 295000 - }, - { - "epoch": 4.52, - "learning_rate": 0.0001280867809613201, - "loss": 0.2919, - "step": 296000 - }, - { - "epoch": 4.54, - "learning_rate": 0.0001279182974150082, - "loss": 0.2915, - "step": 297000 - }, - { - "epoch": 4.55, - "learning_rate": 0.00012774928984187297, - "loss": 0.2914, - "step": 298000 - }, - { - "epoch": 4.57, - "learning_rate": 0.00012757976009015413, - "loss": 0.2908, - "step": 299000 - }, - { - "epoch": 4.58, - "learning_rate": 0.0001274097100138019, - "loss": 0.2909, - "step": 300000 - }, - { - "epoch": 4.58, - "eval_runtime": 1.0054, - "eval_samples_per_second": 994.612, - "eval_steps_per_second": 15.914, - "step": 300000 - }, - { - "epoch": 4.6, - "learning_rate": 0.00012723914147245663, - "loss": 0.2906, - "step": 301000 - }, - { - "epoch": 4.61, - "learning_rate": 0.00012706805633142863, - "loss": 0.2906, - "step": 302000 - }, - { - "epoch": 4.63, - "learning_rate": 0.00012689645646167755, - "loss": 0.2902, - "step": 303000 - }, - { - "epoch": 4.64, - "learning_rate": 0.00012672434373979207, - "loss": 0.291, - "step": 304000 - }, - { - "epoch": 4.66, - "learning_rate": 0.00012655172004796936, - "loss": 0.2899, - "step": 305000 - }, - { - "epoch": 4.66, - "eval_runtime": 1.0975, - "eval_samples_per_second": 911.158, - "eval_steps_per_second": 14.579, - "step": 305000 - }, - { - "epoch": 4.67, - "learning_rate": 0.00012637858727399448, - "loss": 0.2898, - "step": 306000 - }, - { - "epoch": 4.69, - "learning_rate": 0.00012620494731121966, - "loss": 0.2896, - "step": 307000 - }, - { - "epoch": 4.7, - "learning_rate": 0.00012603080205854372, - "loss": 0.2894, - "step": 308000 - }, - { - "epoch": 4.72, - "learning_rate": 0.00012585615342039126, - "loss": 0.2894, - "step": 309000 - }, - { - "epoch": 4.73, - "learning_rate": 0.0001256810033066918, - "loss": 0.2894, - "step": 310000 - }, - { - "epoch": 4.73, - "eval_runtime": 1.0481, - "eval_samples_per_second": 954.11, - "eval_steps_per_second": 15.266, - "step": 310000 - }, - { - "epoch": 4.75, - "learning_rate": 0.0001255053536328589, - "loss": 0.2887, - "step": 311000 - }, - { - "epoch": 4.76, - "learning_rate": 0.0001253292063197693, - "loss": 0.2887, - "step": 312000 - }, - { - "epoch": 4.78, - "learning_rate": 0.0001251525632937418, - "loss": 0.2886, - "step": 313000 - }, - { - "epoch": 4.79, - "learning_rate": 0.00012497542648651615, - "loss": 0.2887, - "step": 314000 - }, - { - "epoch": 4.81, - "learning_rate": 0.00012479779783523216, - "loss": 0.2883, - "step": 315000 - }, - { - "epoch": 4.81, - "eval_runtime": 1.0333, - "eval_samples_per_second": 967.804, - "eval_steps_per_second": 15.485, - "step": 315000 - }, - { - "epoch": 4.83, - "learning_rate": 0.00012461967928240828, - "loss": 0.2883, - "step": 316000 - }, - { - "epoch": 4.84, - "learning_rate": 0.00012444107277592047, - "loss": 0.2877, - "step": 317000 - }, - { - "epoch": 4.86, - "learning_rate": 0.0001242619802689809, - "loss": 0.2879, - "step": 318000 - }, - { - "epoch": 4.87, - "learning_rate": 0.00012408240372011647, - "loss": 0.2876, - "step": 319000 - }, - { - "epoch": 4.89, - "learning_rate": 0.0001239023450931476, - "loss": 0.2874, - "step": 320000 - }, - { - "epoch": 4.89, - "eval_runtime": 1.04, - "eval_samples_per_second": 961.537, - "eval_steps_per_second": 15.385, - "step": 320000 - }, - { - "epoch": 4.9, - "learning_rate": 0.00012372180635716656, - "loss": 0.2874, - "step": 321000 - }, - { - "epoch": 4.92, - "learning_rate": 0.00012354078948651604, - "loss": 0.2873, - "step": 322000 - }, - { - "epoch": 4.93, - "learning_rate": 0.00012335929646076758, - "loss": 0.2868, - "step": 323000 - }, - { - "epoch": 4.95, - "learning_rate": 0.00012317732926469976, - "loss": 0.2871, - "step": 324000 - }, - { - "epoch": 4.96, - "learning_rate": 0.00012299488988827675, - "loss": 0.2869, - "step": 325000 - }, - { - "epoch": 4.96, - "eval_runtime": 1.3977, - "eval_samples_per_second": 715.452, - "eval_steps_per_second": 11.447, - "step": 325000 - }, - { - "epoch": 4.98, - "learning_rate": 0.0001228119803266263, - "loss": 0.2867, - "step": 326000 - }, - { - "epoch": 4.99, - "learning_rate": 0.0001226286025800181, - "loss": 0.2866, - "step": 327000 - }, - { - "epoch": 5.01, - "learning_rate": 0.00012244475865384177, - "loss": 0.2862, - "step": 328000 - }, - { - "epoch": 5.02, - "learning_rate": 0.00012226045055858505, - "loss": 0.2858, - "step": 329000 - }, - { - "epoch": 5.04, - "learning_rate": 0.00012207568030981174, - "loss": 0.2859, - "step": 330000 - }, - { - "epoch": 5.04, - "eval_runtime": 1.1314, - "eval_samples_per_second": 883.862, - "eval_steps_per_second": 14.142, - "step": 330000 - }, - { - "epoch": 5.05, - "learning_rate": 0.00012189044992813972, - "loss": 0.2858, - "step": 331000 - }, - { - "epoch": 5.07, - "learning_rate": 0.0001217047614392187, - "loss": 0.2857, - "step": 332000 - }, - { - "epoch": 5.08, - "learning_rate": 0.00012151861687370828, - "loss": 0.2857, - "step": 333000 - }, - { - "epoch": 5.1, - "learning_rate": 0.00012133201826725558, - "loss": 0.2852, - "step": 334000 - }, - { - "epoch": 5.12, - "learning_rate": 0.0001211449676604731, - "loss": 0.2853, - "step": 335000 - }, - { - "epoch": 5.12, - "eval_runtime": 1.3419, - "eval_samples_per_second": 745.216, - "eval_steps_per_second": 11.923, - "step": 335000 - }, - { - "epoch": 5.13, - "learning_rate": 0.00012095746709891632, - "loss": 0.2852, - "step": 336000 - }, - { - "epoch": 5.15, - "learning_rate": 0.00012076951863306127, - "loss": 0.285, - "step": 337000 - }, - { - "epoch": 5.16, - "learning_rate": 0.0001205811243182823, - "loss": 0.2848, - "step": 338000 - }, - { - "epoch": 5.18, - "learning_rate": 0.00012039228621482949, - "loss": 0.2858, - "step": 339000 - }, - { - "epoch": 5.19, - "learning_rate": 0.00012020300638780604, - "loss": 0.2845, - "step": 340000 - }, - { - "epoch": 5.19, - "eval_runtime": 1.2559, - "eval_samples_per_second": 796.26, - "eval_steps_per_second": 12.74, - "step": 340000 - }, - { - "epoch": 5.21, - "learning_rate": 0.00012001328690714582, - "loss": 0.284, - "step": 341000 - }, - { - "epoch": 5.22, - "learning_rate": 0.00011982312984759068, - "loss": 0.2845, - "step": 342000 - }, - { - "epoch": 5.24, - "learning_rate": 0.00011963253728866778, - "loss": 0.2841, - "step": 343000 - }, - { - "epoch": 5.25, - "learning_rate": 0.00011944151131466675, - "loss": 0.284, - "step": 344000 - }, - { - "epoch": 5.27, - "learning_rate": 0.00011925005401461709, - "loss": 0.2836, - "step": 345000 - }, - { - "epoch": 5.27, - "eval_runtime": 1.1037, - "eval_samples_per_second": 906.031, - "eval_steps_per_second": 14.496, - "step": 345000 - }, - { - "epoch": 5.28, - "learning_rate": 0.00011905816748226513, - "loss": 0.2834, - "step": 346000 - }, - { - "epoch": 5.3, - "learning_rate": 0.00011886585381605125, - "loss": 0.2835, - "step": 347000 - }, - { - "epoch": 5.31, - "learning_rate": 0.00011867311511908693, - "loss": 0.2832, - "step": 348000 - }, - { - "epoch": 5.33, - "learning_rate": 0.00011847995349913162, - "loss": 0.2828, - "step": 349000 - }, - { - "epoch": 5.34, - "learning_rate": 0.00011828637106856989, - "loss": 0.2828, - "step": 350000 - }, - { - "epoch": 5.34, - "eval_runtime": 1.0295, - "eval_samples_per_second": 971.32, - "eval_steps_per_second": 15.541, - "step": 350000 - }, - { - "epoch": 5.36, - "learning_rate": 0.00011809236994438816, - "loss": 0.2831, - "step": 351000 - }, - { - "epoch": 5.38, - "learning_rate": 0.00011789795224815164, - "loss": 0.2827, - "step": 352000 - }, - { - "epoch": 5.39, - "learning_rate": 0.00011770312010598116, - "loss": 0.282, - "step": 353000 - }, - { - "epoch": 5.41, - "learning_rate": 0.00011750787564852973, - "loss": 0.2822, - "step": 354000 - }, - { - "epoch": 5.42, - "learning_rate": 0.00011731222101095955, - "loss": 0.2825, - "step": 355000 - }, - { - "epoch": 5.42, - "eval_runtime": 1.0697, - "eval_samples_per_second": 934.885, - "eval_steps_per_second": 14.958, - "step": 355000 - }, - { - "epoch": 5.44, - "learning_rate": 0.00011711615833291833, - "loss": 0.2822, - "step": 356000 - }, - { - "epoch": 5.45, - "learning_rate": 0.0001169196897585161, - "loss": 0.2824, - "step": 357000 - }, - { - "epoch": 5.47, - "learning_rate": 0.00011672281743630175, - "loss": 0.2818, - "step": 358000 - }, - { - "epoch": 5.48, - "learning_rate": 0.0001165255435192394, - "loss": 0.2815, - "step": 359000 - }, - { - "epoch": 5.5, - "learning_rate": 0.00011632787016468506, - "loss": 0.2819, - "step": 360000 - }, - { - "epoch": 5.5, - "eval_runtime": 1.1008, - "eval_samples_per_second": 908.433, - "eval_steps_per_second": 14.535, - "step": 360000 - }, - { - "epoch": 5.51, - "learning_rate": 0.0001161297995343628, - "loss": 0.2815, - "step": 361000 - }, - { - "epoch": 5.53, - "learning_rate": 0.00011593133379434138, - "loss": 0.2815, - "step": 362000 - }, - { - "epoch": 5.54, - "learning_rate": 0.00011573247511501028, - "loss": 0.2811, - "step": 363000 - }, - { - "epoch": 5.56, - "learning_rate": 0.00011553322567105619, - "loss": 0.2807, - "step": 364000 - }, - { - "epoch": 5.57, - "learning_rate": 0.00011533358764143905, - "loss": 0.2808, - "step": 365000 - }, - { - "epoch": 5.57, - "eval_runtime": 1.1301, - "eval_samples_per_second": 884.842, - "eval_steps_per_second": 14.157, - "step": 365000 - }, - { - "epoch": 5.59, - "learning_rate": 0.00011513356320936841, - "loss": 0.2808, - "step": 366000 - }, - { - "epoch": 5.6, - "learning_rate": 0.00011493315456227943, - "loss": 0.2817, - "step": 367000 - }, - { - "epoch": 5.62, - "learning_rate": 0.00011473236389180894, - "loss": 0.2803, - "step": 368000 - }, - { - "epoch": 5.63, - "learning_rate": 0.00011453119339377154, - "loss": 0.2803, - "step": 369000 - }, - { - "epoch": 5.65, - "learning_rate": 0.00011432964526813558, - "loss": 0.2817, - "step": 370000 - }, - { - "epoch": 5.65, - "eval_runtime": 1.2187, - "eval_samples_per_second": 820.56, - "eval_steps_per_second": 13.129, - "step": 370000 - }, - { - "epoch": 5.67, - "learning_rate": 0.00011412772171899904, - "loss": 0.2819, - "step": 371000 - }, - { - "epoch": 5.68, - "learning_rate": 0.00011392542495456556, - "loss": 0.28, - "step": 372000 - }, - { - "epoch": 5.7, - "learning_rate": 0.00011372275718712006, - "loss": 0.2797, - "step": 373000 - }, - { - "epoch": 5.71, - "learning_rate": 0.00011351972063300484, - "loss": 0.2797, - "step": 374000 - }, - { - "epoch": 5.73, - "learning_rate": 0.00011331631751259515, - "loss": 0.2801, - "step": 375000 - }, - { - "epoch": 5.73, - "eval_runtime": 1.0146, - "eval_samples_per_second": 985.631, - "eval_steps_per_second": 15.77, - "step": 375000 - }, - { - "epoch": 5.74, - "learning_rate": 0.00011311255005027487, - "loss": 0.2789, - "step": 376000 - }, - { - "epoch": 5.76, - "learning_rate": 0.00011290842047441232, - "loss": 0.2791, - "step": 377000 - }, - { - "epoch": 5.77, - "learning_rate": 0.00011270393101733585, - "loss": 0.279, - "step": 378000 - }, - { - "epoch": 5.79, - "learning_rate": 0.00011249908391530946, - "loss": 0.279, - "step": 379000 - }, - { - "epoch": 5.8, - "learning_rate": 0.00011229388140850814, - "loss": 0.279, - "step": 380000 - }, - { - "epoch": 5.8, - "eval_runtime": 1.2375, - "eval_samples_per_second": 808.112, - "eval_steps_per_second": 12.93, - "step": 380000 - }, - { - "epoch": 5.82, - "learning_rate": 0.00011208832574099368, - "loss": 0.2788, - "step": 381000 - }, - { - "epoch": 5.83, - "learning_rate": 0.00011188241916068993, - "loss": 0.2785, - "step": 382000 - }, - { - "epoch": 5.85, - "learning_rate": 0.00011167616391935826, - "loss": 0.2783, - "step": 383000 - }, - { - "epoch": 5.86, - "learning_rate": 0.00011146956227257293, - "loss": 0.2785, - "step": 384000 - }, - { - "epoch": 5.88, - "learning_rate": 0.00011126261647969645, - "loss": 0.2781, - "step": 385000 - }, - { - "epoch": 5.88, - "eval_runtime": 1.0191, - "eval_samples_per_second": 981.273, - "eval_steps_per_second": 15.7, - "step": 385000 - }, - { - "epoch": 5.89, - "learning_rate": 0.00011105532880385487, - "loss": 0.2782, - "step": 386000 - }, - { - "epoch": 5.91, - "learning_rate": 0.00011084770151191299, - "loss": 0.2782, - "step": 387000 - }, - { - "epoch": 5.92, - "learning_rate": 0.00011063973687444962, - "loss": 0.2779, - "step": 388000 - }, - { - "epoch": 5.94, - "learning_rate": 0.00011043143716573272, - "loss": 0.2774, - "step": 389000 - }, - { - "epoch": 5.96, - "learning_rate": 0.00011022280466369448, - "loss": 0.2776, - "step": 390000 - }, - { - "epoch": 5.96, - "eval_runtime": 1.0236, - "eval_samples_per_second": 976.954, - "eval_steps_per_second": 15.631, - "step": 390000 - }, - { - "epoch": 5.97, - "learning_rate": 0.00011001384164990662, - "loss": 0.2775, - "step": 391000 - }, - { - "epoch": 5.99, - "learning_rate": 0.00010980455040955506, - "loss": 0.2769, - "step": 392000 - }, - { - "epoch": 6.0, - "learning_rate": 0.00010959493323141538, - "loss": 0.2773, - "step": 393000 - }, - { - "epoch": 6.02, - "learning_rate": 0.00010938499240782739, - "loss": 0.277, - "step": 394000 - }, - { - "epoch": 6.03, - "learning_rate": 0.00010917473023467032, - "loss": 0.277, - "step": 395000 - }, - { - "epoch": 6.03, - "eval_runtime": 1.0769, - "eval_samples_per_second": 928.59, - "eval_steps_per_second": 14.857, - "step": 395000 - }, - { - "epoch": 6.05, - "learning_rate": 0.00010896414901133761, - "loss": 0.2766, - "step": 396000 - }, - { - "epoch": 6.06, - "learning_rate": 0.00010875325104071177, - "loss": 0.2768, - "step": 397000 - }, - { - "epoch": 6.08, - "learning_rate": 0.00010854203862913927, - "loss": 0.2765, - "step": 398000 - }, - { - "epoch": 6.09, - "learning_rate": 0.00010833051408640509, - "loss": 0.2763, - "step": 399000 - }, - { - "epoch": 6.11, - "learning_rate": 0.00010811867972570786, - "loss": 0.2767, - "step": 400000 - }, - { - "epoch": 6.11, - "eval_runtime": 1.1081, - "eval_samples_per_second": 902.417, - "eval_steps_per_second": 14.439, - "step": 400000 - }, - { - "epoch": 6.12, - "learning_rate": 0.00010790653786363416, - "loss": 0.2759, - "step": 401000 - }, - { - "epoch": 6.14, - "learning_rate": 0.00010769409082013337, - "loss": 0.2759, - "step": 402000 - }, - { - "epoch": 6.15, - "learning_rate": 0.00010748134091849238, - "loss": 0.2757, - "step": 403000 - }, - { - "epoch": 6.17, - "learning_rate": 0.00010726829048531, - "loss": 0.2762, - "step": 404000 - }, - { - "epoch": 6.18, - "learning_rate": 0.00010705494185047165, - "loss": 0.276, - "step": 405000 - }, - { - "epoch": 6.18, - "eval_runtime": 1.1676, - "eval_samples_per_second": 856.476, - "eval_steps_per_second": 13.704, - "step": 405000 - }, - { - "epoch": 6.2, - "learning_rate": 0.0001068412973471238, - "loss": 0.2754, - "step": 406000 - }, - { - "epoch": 6.21, - "learning_rate": 0.00010662735931164853, - "loss": 0.2755, - "step": 407000 - }, - { - "epoch": 6.23, - "learning_rate": 0.0001064131300836379, - "loss": 0.2752, - "step": 408000 - }, - { - "epoch": 6.25, - "learning_rate": 0.0001061986120058684, - "loss": 0.2748, - "step": 409000 - }, - { - "epoch": 6.26, - "learning_rate": 0.00010598380742427543, - "loss": 0.2749, - "step": 410000 - }, - { - "epoch": 6.26, - "eval_runtime": 1.0797, - "eval_samples_per_second": 926.22, - "eval_steps_per_second": 14.82, - "step": 410000 - }, - { - "epoch": 6.28, - "learning_rate": 0.00010576871868792746, - "loss": 0.275, - "step": 411000 - }, - { - "epoch": 6.29, - "learning_rate": 0.0001055533481490004, - "loss": 0.2746, - "step": 412000 - }, - { - "epoch": 6.31, - "learning_rate": 0.000105337698162752, - "loss": 0.2741, - "step": 413000 - }, - { - "epoch": 6.32, - "learning_rate": 0.00010512177108749594, - "loss": 0.2746, - "step": 414000 - }, - { - "epoch": 6.34, - "learning_rate": 0.00010490556928457616, - "loss": 0.2743, - "step": 415000 - }, - { - "epoch": 6.34, - "eval_runtime": 1.0107, - "eval_samples_per_second": 989.389, - "eval_steps_per_second": 15.83, - "step": 415000 - }, - { - "epoch": 6.35, - "learning_rate": 0.00010468909511834088, - "loss": 0.2741, - "step": 416000 - }, - { - "epoch": 6.37, - "learning_rate": 0.00010447235095611692, - "loss": 0.2738, - "step": 417000 - }, - { - "epoch": 6.38, - "learning_rate": 0.00010425533916818376, - "loss": 0.2738, - "step": 418000 - }, - { - "epoch": 6.4, - "learning_rate": 0.00010403806212774747, - "loss": 0.2742, - "step": 419000 - }, - { - "epoch": 6.41, - "learning_rate": 0.000103820522210915, - "loss": 0.2737, - "step": 420000 - }, - { - "epoch": 6.41, - "eval_runtime": 1.055, - "eval_samples_per_second": 947.861, - "eval_steps_per_second": 15.166, - "step": 420000 - }, - { - "epoch": 6.43, - "learning_rate": 0.00010360272179666802, - "loss": 0.2742, - "step": 421000 - }, - { - "epoch": 6.44, - "learning_rate": 0.00010338466326683697, - "loss": 0.2733, - "step": 422000 - }, - { - "epoch": 6.46, - "learning_rate": 0.00010316634900607497, - "loss": 0.2737, - "step": 423000 - }, - { - "epoch": 6.47, - "learning_rate": 0.00010294778140183182, - "loss": 0.2732, - "step": 424000 - }, - { - "epoch": 6.49, - "learning_rate": 0.00010272896284432785, - "loss": 0.2733, - "step": 425000 - }, - { - "epoch": 6.49, - "eval_runtime": 1.0035, - "eval_samples_per_second": 996.544, - "eval_steps_per_second": 15.945, - "step": 425000 - }, - { - "epoch": 6.51, - "learning_rate": 0.00010250989572652766, - "loss": 0.2728, - "step": 426000 - }, - { - "epoch": 6.52, - "learning_rate": 0.00010229058244411427, - "loss": 0.2729, - "step": 427000 - }, - { - "epoch": 6.54, - "learning_rate": 0.00010207102539546251, - "loss": 0.2728, - "step": 428000 - }, - { - "epoch": 6.55, - "learning_rate": 0.00010185122698161311, - "loss": 0.2726, - "step": 429000 - }, - { - "epoch": 6.57, - "learning_rate": 0.00010163118960624632, - "loss": 0.2725, - "step": 430000 - }, - { - "epoch": 6.57, - "eval_runtime": 1.0983, - "eval_samples_per_second": 910.508, - "eval_steps_per_second": 14.568, - "step": 430000 - }, - { - "epoch": 6.58, - "learning_rate": 0.00010141091567565561, - "loss": 0.2727, - "step": 431000 - }, - { - "epoch": 6.6, - "learning_rate": 0.00010119040759872142, - "loss": 0.2725, - "step": 432000 - }, - { - "epoch": 6.61, - "learning_rate": 0.00010096966778688472, - "loss": 0.2721, - "step": 433000 - }, - { - "epoch": 6.63, - "learning_rate": 0.00010074869865412074, - "loss": 0.272, - "step": 434000 - }, - { - "epoch": 6.64, - "learning_rate": 0.00010052750261691254, - "loss": 0.2721, - "step": 435000 - }, - { - "epoch": 6.64, - "eval_runtime": 0.9895, - "eval_samples_per_second": 1010.612, - "eval_steps_per_second": 16.17, - "step": 435000 - }, - { - "epoch": 6.66, - "learning_rate": 0.0001003060820942245, - "loss": 0.2716, - "step": 436000 - }, - { - "epoch": 6.67, - "learning_rate": 0.00010008443950747599, - "loss": 0.2716, - "step": 437000 - }, - { - "epoch": 6.69, - "learning_rate": 9.986257728051483e-05, - "loss": 0.2717, - "step": 438000 - }, - { - "epoch": 6.7, - "learning_rate": 9.964049783959082e-05, - "loss": 0.2716, - "step": 439000 - }, - { - "epoch": 6.72, - "learning_rate": 9.94182036133291e-05, - "loss": 0.2715, - "step": 440000 - }, - { - "epoch": 6.72, - "eval_runtime": 1.0245, - "eval_samples_per_second": 976.131, - "eval_steps_per_second": 15.618, - "step": 440000 - }, - { - "epoch": 6.73, - "learning_rate": 9.919569703270376e-05, - "loss": 0.2716, - "step": 441000 - }, - { - "epoch": 6.75, - "learning_rate": 9.89729805310111e-05, - "loss": 0.2711, - "step": 442000 - }, - { - "epoch": 6.76, - "learning_rate": 9.875005654384307e-05, - "loss": 0.2712, - "step": 443000 - }, - { - "epoch": 6.78, - "learning_rate": 9.852692750906071e-05, - "loss": 0.2717, - "step": 444000 - }, - { - "epoch": 6.8, - "learning_rate": 9.830359586676737e-05, - "loss": 0.2722, - "step": 445000 - }, - { - "epoch": 6.8, - "eval_runtime": 1.1145, - "eval_samples_per_second": 897.295, - "eval_steps_per_second": 14.357, - "step": 445000 - }, - { - "epoch": 6.81, - "learning_rate": 9.808006405928215e-05, - "loss": 0.2703, - "step": 446000 - }, - { - "epoch": 6.83, - "learning_rate": 9.785633453111306e-05, - "loss": 0.2705, - "step": 447000 - }, - { - "epoch": 6.84, - "learning_rate": 9.763240972893037e-05, - "loss": 0.27, - "step": 448000 - }, - { - "epoch": 6.86, - "learning_rate": 9.740829210153984e-05, - "loss": 0.2703, - "step": 449000 - }, - { - "epoch": 6.87, - "learning_rate": 9.718398409985593e-05, - "loss": 0.27, - "step": 450000 - }, - { - "epoch": 6.87, - "eval_runtime": 0.9938, - "eval_samples_per_second": 1006.215, - "eval_steps_per_second": 16.099, - "step": 450000 - }, - { - "epoch": 6.89, - "learning_rate": 9.695948817687504e-05, - "loss": 0.2699, - "step": 451000 - }, - { - "epoch": 6.9, - "learning_rate": 9.673480678764858e-05, - "loss": 0.2698, - "step": 452000 - }, - { - "epoch": 6.92, - "learning_rate": 9.650994238925626e-05, - "loss": 0.2699, - "step": 453000 - }, - { - "epoch": 6.93, - "learning_rate": 9.628489744077911e-05, - "loss": 0.2696, - "step": 454000 - }, - { - "epoch": 6.95, - "learning_rate": 9.60596744032726e-05, - "loss": 0.2699, - "step": 455000 - }, - { - "epoch": 6.95, - "eval_runtime": 1.0008, - "eval_samples_per_second": 999.165, - "eval_steps_per_second": 15.987, - "step": 455000 - }, - { - "epoch": 6.96, - "learning_rate": 9.583427573973982e-05, - "loss": 0.2696, - "step": 456000 - }, - { - "epoch": 6.98, - "learning_rate": 9.560870391510441e-05, - "loss": 0.2695, - "step": 457000 - }, - { - "epoch": 6.99, - "learning_rate": 9.538296139618371e-05, - "loss": 0.2691, - "step": 458000 - }, - { - "epoch": 7.01, - "learning_rate": 9.515705065166178e-05, - "loss": 0.2693, - "step": 459000 - }, - { - "epoch": 7.02, - "learning_rate": 9.493097415206228e-05, - "loss": 0.2688, - "step": 460000 - }, - { - "epoch": 7.02, - "eval_runtime": 1.0225, - "eval_samples_per_second": 978.034, - "eval_steps_per_second": 15.649, - "step": 460000 - }, - { - "epoch": 7.04, - "learning_rate": 9.47047343697216e-05, - "loss": 0.269, - "step": 461000 - }, - { - "epoch": 7.05, - "learning_rate": 9.447833377876176e-05, - "loss": 0.269, - "step": 462000 - }, - { - "epoch": 7.07, - "learning_rate": 9.425177485506336e-05, - "loss": 0.2688, - "step": 463000 - }, - { - "epoch": 7.09, - "learning_rate": 9.402506007623848e-05, - "loss": 0.269, - "step": 464000 - }, - { - "epoch": 7.1, - "learning_rate": 9.379819192160362e-05, - "loss": 0.2692, - "step": 465000 - }, - { - "epoch": 7.1, - "eval_runtime": 1.1401, - "eval_samples_per_second": 877.142, - "eval_steps_per_second": 14.034, - "step": 465000 - }, - { - "epoch": 7.12, - "learning_rate": 9.357117287215258e-05, - "loss": 0.2682, - "step": 466000 - }, - { - "epoch": 7.13, - "learning_rate": 9.334400541052928e-05, - "loss": 0.2683, - "step": 467000 - }, - { - "epoch": 7.15, - "learning_rate": 9.311669202100073e-05, - "loss": 0.2693, - "step": 468000 - }, - { - "epoch": 7.16, - "learning_rate": 9.288923518942968e-05, - "loss": 0.2683, - "step": 469000 - }, - { - "epoch": 7.18, - "learning_rate": 9.26616374032477e-05, - "loss": 0.2677, - "step": 470000 - }, - { - "epoch": 7.18, - "eval_runtime": 0.8954, - "eval_samples_per_second": 1116.774, - "eval_steps_per_second": 17.868, - "step": 470000 - }, - { - "epoch": 7.19, - "learning_rate": 9.243390115142761e-05, - "loss": 0.2678, - "step": 471000 - }, - { - "epoch": 7.21, - "learning_rate": 9.220602892445661e-05, - "loss": 0.2678, - "step": 472000 - }, - { - "epoch": 7.22, - "learning_rate": 9.197802321430889e-05, - "loss": 0.2679, - "step": 473000 - }, - { - "epoch": 7.24, - "learning_rate": 9.174988651441833e-05, - "loss": 0.2673, - "step": 474000 - }, - { - "epoch": 7.25, - "learning_rate": 9.152162131965137e-05, - "loss": 0.2675, - "step": 475000 - }, - { - "epoch": 7.25, - "eval_runtime": 1.0353, - "eval_samples_per_second": 965.922, - "eval_steps_per_second": 15.455, - "step": 475000 - }, - { - "epoch": 7.27, - "learning_rate": 9.129323012627956e-05, - "loss": 0.2693, - "step": 476000 - }, - { - "epoch": 7.28, - "learning_rate": 9.106471543195244e-05, - "loss": 0.2675, - "step": 477000 - }, - { - "epoch": 7.3, - "learning_rate": 9.08360797356701e-05, - "loss": 0.2679, - "step": 478000 - }, - { - "epoch": 7.31, - "learning_rate": 9.060732553775582e-05, - "loss": 0.2672, - "step": 479000 - }, - { - "epoch": 7.33, - "learning_rate": 9.037845533982892e-05, - "loss": 0.267, - "step": 480000 - }, - { - "epoch": 7.33, - "eval_runtime": 1.0347, - "eval_samples_per_second": 966.468, - "eval_steps_per_second": 15.463, - "step": 480000 - }, - { - "epoch": 7.34, - "learning_rate": 9.014947164477721e-05, - "loss": 0.2663, - "step": 481000 - }, - { - "epoch": 7.36, - "learning_rate": 8.992037695672967e-05, - "loss": 0.267, - "step": 482000 - }, - { - "epoch": 7.38, - "learning_rate": 8.969117378102912e-05, - "loss": 0.2665, - "step": 483000 - }, - { - "epoch": 7.39, - "learning_rate": 8.946186462420478e-05, - "loss": 0.2662, - "step": 484000 - }, - { - "epoch": 7.41, - "learning_rate": 8.923245199394482e-05, - "loss": 0.2662, - "step": 485000 - }, - { - "epoch": 7.41, - "eval_runtime": 1.0079, - "eval_samples_per_second": 992.191, - "eval_steps_per_second": 15.875, - "step": 485000 - }, - { - "epoch": 7.42, - "learning_rate": 8.900293839906903e-05, - "loss": 0.2664, - "step": 486000 - }, - { - "epoch": 7.44, - "learning_rate": 8.87733263495013e-05, - "loss": 0.2658, - "step": 487000 - }, - { - "epoch": 7.45, - "learning_rate": 8.85436183562422e-05, - "loss": 0.2659, - "step": 488000 - }, - { - "epoch": 7.47, - "learning_rate": 8.83138169313416e-05, - "loss": 0.2663, - "step": 489000 - }, - { - "epoch": 7.48, - "learning_rate": 8.808392458787103e-05, - "loss": 0.2656, - "step": 490000 - }, - { - "epoch": 7.48, - "eval_runtime": 1.075, - "eval_samples_per_second": 930.213, - "eval_steps_per_second": 14.883, - "step": 490000 - }, - { - "epoch": 7.5, - "learning_rate": 8.78539438398963e-05, - "loss": 0.2655, - "step": 491000 - }, - { - "epoch": 7.51, - "learning_rate": 8.762387720245008e-05, - "loss": 0.2656, - "step": 492000 - }, - { - "epoch": 7.53, - "learning_rate": 8.73937271915042e-05, - "loss": 0.2655, - "step": 493000 - }, - { - "epoch": 7.54, - "learning_rate": 8.716349632394235e-05, - "loss": 0.2652, - "step": 494000 - }, - { - "epoch": 7.56, - "learning_rate": 8.69331871175324e-05, - "loss": 0.2651, - "step": 495000 - }, - { - "epoch": 7.56, - "eval_runtime": 1.1978, - "eval_samples_per_second": 834.871, - "eval_steps_per_second": 13.358, - "step": 495000 - }, - { - "epoch": 7.57, - "learning_rate": 8.67028020908989e-05, - "loss": 0.2647, - "step": 496000 - }, - { - "epoch": 7.59, - "learning_rate": 8.647234376349565e-05, - "loss": 0.2653, - "step": 497000 - }, - { - "epoch": 7.6, - "learning_rate": 8.624181465557794e-05, - "loss": 0.2649, - "step": 498000 - }, - { - "epoch": 7.62, - "learning_rate": 8.601121728817519e-05, - "loss": 0.2647, - "step": 499000 - }, - { - "epoch": 7.64, - "learning_rate": 8.578055418306327e-05, - "loss": 0.2654, - "step": 500000 - }, - { - "epoch": 7.64, - "eval_runtime": 1.1022, - "eval_samples_per_second": 907.298, - "eval_steps_per_second": 14.517, - "step": 500000 - }, - { - "epoch": 7.65, - "learning_rate": 8.55498278627369e-05, - "loss": 0.2646, - "step": 501000 - }, - { - "epoch": 7.67, - "learning_rate": 8.531904085038221e-05, - "loss": 0.2646, - "step": 502000 - }, - { - "epoch": 7.68, - "learning_rate": 8.508819566984897e-05, - "loss": 0.2641, - "step": 503000 - }, - { - "epoch": 7.7, - "learning_rate": 8.485729484562307e-05, - "loss": 0.2641, - "step": 504000 - }, - { - "epoch": 7.71, - "learning_rate": 8.462634090279895e-05, - "loss": 0.264, - "step": 505000 - }, - { - "epoch": 7.71, - "eval_runtime": 1.0129, - "eval_samples_per_second": 987.309, - "eval_steps_per_second": 15.797, - "step": 505000 - }, - { - "epoch": 7.73, - "learning_rate": 8.439533636705194e-05, - "loss": 0.2635, - "step": 506000 - }, - { - "epoch": 7.74, - "learning_rate": 8.416428376461061e-05, - "loss": 0.2644, - "step": 507000 - }, - { - "epoch": 7.76, - "learning_rate": 8.393318562222916e-05, - "loss": 0.2642, - "step": 508000 - }, - { - "epoch": 7.77, - "learning_rate": 8.370204446715997e-05, - "loss": 0.2638, - "step": 509000 - }, - { - "epoch": 7.79, - "learning_rate": 8.347086282712556e-05, - "loss": 0.2637, - "step": 510000 - }, - { - "epoch": 7.79, - "eval_runtime": 1.1071, - "eval_samples_per_second": 903.278, - "eval_steps_per_second": 14.452, - "step": 510000 - }, - { - "epoch": 7.8, - "learning_rate": 8.323964323029136e-05, - "loss": 0.2633, - "step": 511000 - }, - { - "epoch": 7.82, - "learning_rate": 8.300838820523784e-05, - "loss": 0.2634, - "step": 512000 - }, - { - "epoch": 7.83, - "learning_rate": 8.277710028093289e-05, - "loss": 0.263, - "step": 513000 - }, - { - "epoch": 7.85, - "learning_rate": 8.254578198670421e-05, - "loss": 0.2632, - "step": 514000 - }, - { - "epoch": 7.86, - "learning_rate": 8.231443585221157e-05, - "loss": 0.2629, - "step": 515000 - }, - { - "epoch": 7.86, - "eval_runtime": 1.0457, - "eval_samples_per_second": 956.256, - "eval_steps_per_second": 15.3, - "step": 515000 - }, - { - "epoch": 7.88, - "learning_rate": 8.208306440741926e-05, - "loss": 0.2626, - "step": 516000 - }, - { - "epoch": 7.89, - "learning_rate": 8.185167018256834e-05, - "loss": 0.2629, - "step": 517000 - }, - { - "epoch": 7.91, - "learning_rate": 8.162025570814896e-05, - "loss": 0.2625, - "step": 518000 - }, - { - "epoch": 7.93, - "learning_rate": 8.138882351487275e-05, - "loss": 0.2623, - "step": 519000 - }, - { - "epoch": 7.94, - "learning_rate": 8.115737613364511e-05, - "loss": 0.2626, - "step": 520000 - }, - { - "epoch": 7.94, - "eval_runtime": 1.0504, - "eval_samples_per_second": 952.036, - "eval_steps_per_second": 15.233, - "step": 520000 - }, - { - "epoch": 7.96, - "learning_rate": 8.092591609553747e-05, - "loss": 0.2623, - "step": 521000 - }, - { - "epoch": 7.97, - "learning_rate": 8.069444593175975e-05, - "loss": 0.2622, - "step": 522000 - }, - { - "epoch": 7.99, - "learning_rate": 8.046296817363259e-05, - "loss": 0.262, - "step": 523000 - }, - { - "epoch": 8.0, - "learning_rate": 8.023148535255965e-05, - "loss": 0.2619, - "step": 524000 - }, - { - "epoch": 8.02, - "learning_rate": 7.999999999999999e-05, - "loss": 0.262, - "step": 525000 - }, - { - "epoch": 8.02, - "eval_runtime": 1.1375, - "eval_samples_per_second": 879.131, - "eval_steps_per_second": 14.066, - "step": 525000 - }, - { - "epoch": 8.03, - "learning_rate": 7.976851464744033e-05, - "loss": 0.2616, - "step": 526000 - }, - { - "epoch": 8.05, - "learning_rate": 7.953703182636741e-05, - "loss": 0.2616, - "step": 527000 - }, - { - "epoch": 8.06, - "learning_rate": 7.930555406824026e-05, - "loss": 0.2617, - "step": 528000 - }, - { - "epoch": 8.08, - "learning_rate": 7.907408390446254e-05, - "loss": 0.2614, - "step": 529000 - }, - { - "epoch": 8.09, - "learning_rate": 7.884262386635489e-05, - "loss": 0.2607, - "step": 530000 - }, - { - "epoch": 8.09, - "eval_runtime": 1.0134, - "eval_samples_per_second": 986.75, - "eval_steps_per_second": 15.788, - "step": 530000 - }, - { - "epoch": 8.11, - "learning_rate": 7.861117648512725e-05, - "loss": 0.2613, - "step": 531000 - }, - { - "epoch": 8.12, - "learning_rate": 7.837974429185103e-05, - "loss": 0.2614, - "step": 532000 - }, - { - "epoch": 8.14, - "learning_rate": 7.814832981743164e-05, - "loss": 0.2614, - "step": 533000 - }, - { - "epoch": 8.15, - "learning_rate": 7.791693559258072e-05, - "loss": 0.2608, - "step": 534000 - }, - { - "epoch": 8.17, - "learning_rate": 7.768556414778842e-05, - "loss": 0.2606, - "step": 535000 - }, - { - "epoch": 8.17, - "eval_runtime": 1.097, - "eval_samples_per_second": 911.552, - "eval_steps_per_second": 14.585, - "step": 535000 - }, - { - "epoch": 8.18, - "learning_rate": 7.74542180132958e-05, - "loss": 0.2606, - "step": 536000 - }, - { - "epoch": 8.2, - "learning_rate": 7.72228997190671e-05, - "loss": 0.2608, - "step": 537000 - }, - { - "epoch": 8.22, - "learning_rate": 7.699161179476217e-05, - "loss": 0.2604, - "step": 538000 - }, - { - "epoch": 8.23, - "learning_rate": 7.676035676970863e-05, - "loss": 0.2606, - "step": 539000 - }, - { - "epoch": 8.25, - "learning_rate": 7.652913717287443e-05, - "loss": 0.2604, - "step": 540000 - }, - { - "epoch": 8.25, - "eval_runtime": 1.1778, - "eval_samples_per_second": 849.063, - "eval_steps_per_second": 13.585, - "step": 540000 - }, - { - "epoch": 8.26, - "learning_rate": 7.629795553284005e-05, - "loss": 0.2602, - "step": 541000 - }, - { - "epoch": 8.28, - "learning_rate": 7.606681437777081e-05, - "loss": 0.2605, - "step": 542000 - }, - { - "epoch": 8.29, - "learning_rate": 7.583571623538939e-05, - "loss": 0.26, - "step": 543000 - }, - { - "epoch": 8.31, - "learning_rate": 7.560466363294806e-05, - "loss": 0.2596, - "step": 544000 - }, - { - "epoch": 8.32, - "learning_rate": 7.537365909720104e-05, - "loss": 0.2595, - "step": 545000 - }, - { - "epoch": 8.32, - "eval_runtime": 1.1629, - "eval_samples_per_second": 859.911, - "eval_steps_per_second": 13.759, - "step": 545000 - }, - { - "epoch": 8.34, - "learning_rate": 7.514270515437691e-05, - "loss": 0.2595, - "step": 546000 - }, - { - "epoch": 8.35, - "learning_rate": 7.491180433015101e-05, - "loss": 0.2594, - "step": 547000 - }, - { - "epoch": 8.37, - "learning_rate": 7.468095914961777e-05, - "loss": 0.2596, - "step": 548000 - }, - { - "epoch": 8.38, - "learning_rate": 7.445017213726307e-05, - "loss": 0.2596, - "step": 549000 - }, - { - "epoch": 8.4, - "learning_rate": 7.421944581693674e-05, - "loss": 0.2594, - "step": 550000 - }, - { - "epoch": 8.4, - "eval_runtime": 0.9899, - "eval_samples_per_second": 1010.184, - "eval_steps_per_second": 16.163, - "step": 550000 - }, - { - "epoch": 8.41, - "learning_rate": 7.39887827118248e-05, - "loss": 0.259, - "step": 551000 - }, - { - "epoch": 8.43, - "learning_rate": 7.375818534442207e-05, - "loss": 0.2588, - "step": 552000 - }, - { - "epoch": 8.44, - "learning_rate": 7.352765623650435e-05, - "loss": 0.259, - "step": 553000 - }, - { - "epoch": 8.46, - "learning_rate": 7.329719790910108e-05, - "loss": 0.2587, - "step": 554000 - }, - { - "epoch": 8.47, - "learning_rate": 7.30668128824676e-05, - "loss": 0.2587, - "step": 555000 - }, - { - "epoch": 8.47, - "eval_runtime": 1.1635, - "eval_samples_per_second": 859.466, - "eval_steps_per_second": 13.751, - "step": 555000 - }, - { - "epoch": 8.49, - "learning_rate": 7.283650367605764e-05, - "loss": 0.2584, - "step": 556000 - }, - { - "epoch": 8.51, - "learning_rate": 7.260627280849581e-05, - "loss": 0.2585, - "step": 557000 - }, - { - "epoch": 8.52, - "learning_rate": 7.23761227975499e-05, - "loss": 0.2584, - "step": 558000 - }, - { - "epoch": 8.54, - "learning_rate": 7.21460561601037e-05, - "loss": 0.2584, - "step": 559000 - }, - { - "epoch": 8.55, - "learning_rate": 7.191607541212897e-05, - "loss": 0.2585, - "step": 560000 - }, - { - "epoch": 8.55, - "eval_runtime": 1.1711, - "eval_samples_per_second": 853.863, - "eval_steps_per_second": 13.662, - "step": 560000 - }, - { - "epoch": 8.57, - "learning_rate": 7.168618306865838e-05, - "loss": 0.2583, - "step": 561000 - }, - { - "epoch": 8.58, - "learning_rate": 7.145638164375779e-05, - "loss": 0.2588, - "step": 562000 - }, - { - "epoch": 8.6, - "learning_rate": 7.122667365049869e-05, - "loss": 0.2578, - "step": 563000 - }, - { - "epoch": 8.61, - "learning_rate": 7.099706160093098e-05, - "loss": 0.2578, - "step": 564000 - }, - { - "epoch": 8.63, - "learning_rate": 7.076754800605516e-05, - "loss": 0.2579, - "step": 565000 - }, - { - "epoch": 8.63, - "eval_runtime": 1.0129, - "eval_samples_per_second": 987.305, - "eval_steps_per_second": 15.797, - "step": 565000 - }, - { - "epoch": 8.64, - "learning_rate": 7.053813537579523e-05, - "loss": 0.2581, - "step": 566000 - }, - { - "epoch": 8.66, - "learning_rate": 7.030882621897088e-05, - "loss": 0.2575, - "step": 567000 - }, - { - "epoch": 8.67, - "learning_rate": 7.00796230432703e-05, - "loss": 0.2574, - "step": 568000 - }, - { - "epoch": 8.69, - "learning_rate": 6.985052835522279e-05, - "loss": 0.2572, - "step": 569000 - }, - { - "epoch": 8.7, - "learning_rate": 6.962154466017105e-05, - "loss": 0.2572, - "step": 570000 - }, - { - "epoch": 8.7, - "eval_runtime": 1.048, - "eval_samples_per_second": 954.187, - "eval_steps_per_second": 15.267, - "step": 570000 - }, - { - "epoch": 8.72, - "learning_rate": 6.939267446224418e-05, - "loss": 0.2569, - "step": 571000 - }, - { - "epoch": 8.73, - "learning_rate": 6.91639202643299e-05, - "loss": 0.2569, - "step": 572000 - }, - { - "epoch": 8.75, - "learning_rate": 6.893528456804756e-05, - "loss": 0.2569, - "step": 573000 - }, - { - "epoch": 8.77, - "learning_rate": 6.870676987372044e-05, - "loss": 0.2568, - "step": 574000 - }, - { - "epoch": 8.78, - "learning_rate": 6.847837868034861e-05, - "loss": 0.257, - "step": 575000 - }, - { - "epoch": 8.78, - "eval_runtime": 1.0002, - "eval_samples_per_second": 999.79, - "eval_steps_per_second": 15.997, - "step": 575000 - }, - { - "epoch": 8.8, - "learning_rate": 6.825011348558167e-05, - "loss": 0.2573, - "step": 576000 - }, - { - "epoch": 8.81, - "learning_rate": 6.802197678569109e-05, - "loss": 0.2566, - "step": 577000 - }, - { - "epoch": 8.83, - "learning_rate": 6.779397107554339e-05, - "loss": 0.2562, - "step": 578000 - }, - { - "epoch": 8.84, - "learning_rate": 6.756609884857239e-05, - "loss": 0.2566, - "step": 579000 - }, - { - "epoch": 8.86, - "learning_rate": 6.733836259675233e-05, - "loss": 0.2564, - "step": 580000 - }, - { - "epoch": 8.86, - "eval_runtime": 1.0727, - "eval_samples_per_second": 932.263, - "eval_steps_per_second": 14.916, - "step": 580000 - }, - { - "epoch": 8.87, - "learning_rate": 6.71107648105703e-05, - "loss": 0.2564, - "step": 581000 - }, - { - "epoch": 8.89, - "learning_rate": 6.688330797899925e-05, - "loss": 0.2562, - "step": 582000 - }, - { - "epoch": 8.9, - "learning_rate": 6.665599458947072e-05, - "loss": 0.2562, - "step": 583000 - }, - { - "epoch": 8.92, - "learning_rate": 6.642882712784742e-05, - "loss": 0.2561, - "step": 584000 - }, - { - "epoch": 8.93, - "learning_rate": 6.620180807839639e-05, - "loss": 0.2561, - "step": 585000 - }, - { - "epoch": 8.93, - "eval_runtime": 0.9936, - "eval_samples_per_second": 1006.405, - "eval_steps_per_second": 16.102, - "step": 585000 - }, - { - "epoch": 8.95, - "learning_rate": 6.597493992376152e-05, - "loss": 0.2557, - "step": 586000 - }, - { - "epoch": 8.96, - "learning_rate": 6.574822514493664e-05, - "loss": 0.2554, - "step": 587000 - }, - { - "epoch": 8.98, - "learning_rate": 6.552166622123824e-05, - "loss": 0.2554, - "step": 588000 - }, - { - "epoch": 8.99, - "learning_rate": 6.52952656302784e-05, - "loss": 0.2556, - "step": 589000 - }, - { - "epoch": 9.01, - "learning_rate": 6.506902584793773e-05, - "loss": 0.2553, - "step": 590000 - }, - { - "epoch": 9.01, - "eval_runtime": 0.9015, - "eval_samples_per_second": 1109.201, - "eval_steps_per_second": 17.747, - "step": 590000 - }, - { - "epoch": 9.02, - "learning_rate": 6.484294934833822e-05, - "loss": 0.2552, - "step": 591000 - }, - { - "epoch": 9.04, - "learning_rate": 6.461703860381628e-05, - "loss": 0.2551, - "step": 592000 - }, - { - "epoch": 9.06, - "learning_rate": 6.439129608489559e-05, - "loss": 0.2555, - "step": 593000 - }, - { - "epoch": 9.07, - "learning_rate": 6.41657242602602e-05, - "loss": 0.2549, - "step": 594000 - }, - { - "epoch": 9.09, - "learning_rate": 6.39403255967274e-05, - "loss": 0.255, - "step": 595000 - }, - { - "epoch": 9.09, - "eval_runtime": 1.1107, - "eval_samples_per_second": 900.319, - "eval_steps_per_second": 14.405, - "step": 595000 - }, - { - "epoch": 9.1, - "learning_rate": 6.371510255922088e-05, - "loss": 0.2545, - "step": 596000 - }, - { - "epoch": 9.12, - "learning_rate": 6.349005761074372e-05, - "loss": 0.2547, - "step": 597000 - }, - { - "epoch": 9.13, - "learning_rate": 6.326519321235139e-05, - "loss": 0.2546, - "step": 598000 - }, - { - "epoch": 9.15, - "learning_rate": 6.304051182312496e-05, - "loss": 0.2549, - "step": 599000 - }, - { - "epoch": 9.16, - "learning_rate": 6.281601590014407e-05, - "loss": 0.2546, - "step": 600000 - }, - { - "epoch": 9.16, - "eval_runtime": 1.0772, - "eval_samples_per_second": 928.316, - "eval_steps_per_second": 14.853, - "step": 600000 - }, - { - "epoch": 9.18, - "learning_rate": 6.259170789846017e-05, - "loss": 0.2546, - "step": 601000 - }, - { - "epoch": 9.19, - "learning_rate": 6.236759027106965e-05, - "loss": 0.2542, - "step": 602000 - }, - { - "epoch": 9.21, - "learning_rate": 6.214366546888694e-05, - "loss": 0.2541, - "step": 603000 - }, - { - "epoch": 9.22, - "learning_rate": 6.191993594071785e-05, - "loss": 0.2541, - "step": 604000 - }, - { - "epoch": 9.24, - "learning_rate": 6.169640413323262e-05, - "loss": 0.254, - "step": 605000 - }, - { - "epoch": 9.24, - "eval_runtime": 1.0913, - "eval_samples_per_second": 916.334, - "eval_steps_per_second": 14.661, - "step": 605000 - }, - { - "epoch": 9.25, - "learning_rate": 6.147307249093929e-05, - "loss": 0.2537, - "step": 606000 - }, - { - "epoch": 9.27, - "learning_rate": 6.124994345615693e-05, - "loss": 0.2532, - "step": 607000 - }, - { - "epoch": 9.28, - "learning_rate": 6.102701946898891e-05, - "loss": 0.2536, - "step": 608000 - }, - { - "epoch": 9.3, - "learning_rate": 6.0804302967296225e-05, - "loss": 0.2545, - "step": 609000 - }, - { - "epoch": 9.31, - "learning_rate": 6.058179638667089e-05, - "loss": 0.2536, - "step": 610000 - }, - { - "epoch": 9.31, - "eval_runtime": 1.0284, - "eval_samples_per_second": 972.365, - "eval_steps_per_second": 15.558, - "step": 610000 - }, - { - "epoch": 9.33, - "learning_rate": 6.035950216040917e-05, - "loss": 0.2533, - "step": 611000 - }, - { - "epoch": 9.35, - "learning_rate": 6.0137422719485145e-05, - "loss": 0.2531, - "step": 612000 - }, - { - "epoch": 9.36, - "learning_rate": 5.991556049252401e-05, - "loss": 0.2532, - "step": 613000 - }, - { - "epoch": 9.38, - "learning_rate": 5.969391790577551e-05, - "loss": 0.2532, - "step": 614000 - }, - { - "epoch": 9.39, - "learning_rate": 5.947249738308747e-05, - "loss": 0.2529, - "step": 615000 - }, - { - "epoch": 9.39, - "eval_runtime": 1.014, - "eval_samples_per_second": 986.174, - "eval_steps_per_second": 15.779, - "step": 615000 - }, - { - "epoch": 9.41, - "learning_rate": 5.925130134587924e-05, - "loss": 0.2527, - "step": 616000 - }, - { - "epoch": 9.42, - "learning_rate": 5.903033221311528e-05, - "loss": 0.2525, - "step": 617000 - }, - { - "epoch": 9.44, - "learning_rate": 5.880959240127858e-05, - "loss": 0.2524, - "step": 618000 - }, - { - "epoch": 9.45, - "learning_rate": 5.858908432434438e-05, - "loss": 0.2525, - "step": 619000 - }, - { - "epoch": 9.47, - "learning_rate": 5.8368810393753684e-05, - "loss": 0.2524, - "step": 620000 - }, - { - "epoch": 9.47, - "eval_runtime": 1.0588, - "eval_samples_per_second": 944.48, - "eval_steps_per_second": 15.112, - "step": 620000 - }, - { - "epoch": 9.48, - "learning_rate": 5.814877301838688e-05, - "loss": 0.2523, - "step": 621000 - }, - { - "epoch": 9.5, - "learning_rate": 5.7928974604537494e-05, - "loss": 0.2522, - "step": 622000 - }, - { - "epoch": 9.51, - "learning_rate": 5.770941755588573e-05, - "loss": 0.2537, - "step": 623000 - }, - { - "epoch": 9.53, - "learning_rate": 5.749010427347233e-05, - "loss": 0.254, - "step": 624000 - }, - { - "epoch": 9.54, - "learning_rate": 5.7271037155672156e-05, - "loss": 0.2522, - "step": 625000 - }, - { - "epoch": 9.54, - "eval_runtime": 1.0707, - "eval_samples_per_second": 934.001, - "eval_steps_per_second": 14.944, - "step": 625000 - }, - { - "epoch": 9.56, - "learning_rate": 5.7052218598168154e-05, - "loss": 0.2524, - "step": 626000 - }, - { - "epoch": 9.57, - "learning_rate": 5.6833650993925016e-05, - "loss": 0.2522, - "step": 627000 - }, - { - "epoch": 9.59, - "learning_rate": 5.661533673316303e-05, - "loss": 0.2522, - "step": 628000 - }, - { - "epoch": 9.6, - "learning_rate": 5.639727820333198e-05, - "loss": 0.2518, - "step": 629000 - }, - { - "epoch": 9.62, - "learning_rate": 5.617947778908498e-05, - "loss": 0.2517, - "step": 630000 - }, - { - "epoch": 9.62, - "eval_runtime": 1.1949, - "eval_samples_per_second": 836.899, - "eval_steps_per_second": 13.39, - "step": 630000 - }, - { - "epoch": 9.64, - "learning_rate": 5.596193787225254e-05, - "loss": 0.2514, - "step": 631000 - }, - { - "epoch": 9.65, - "learning_rate": 5.574466083181624e-05, - "loss": 0.2512, - "step": 632000 - }, - { - "epoch": 9.67, - "learning_rate": 5.552764904388305e-05, - "loss": 0.2511, - "step": 633000 - }, - { - "epoch": 9.68, - "learning_rate": 5.5310904881659116e-05, - "loss": 0.2511, - "step": 634000 - }, - { - "epoch": 9.7, - "learning_rate": 5.5094430715423835e-05, - "loss": 0.2509, - "step": 635000 - }, - { - "epoch": 9.7, - "eval_runtime": 1.0102, - "eval_samples_per_second": 989.889, - "eval_steps_per_second": 15.838, - "step": 635000 - }, - { - "epoch": 9.71, - "learning_rate": 5.487822891250406e-05, - "loss": 0.2511, - "step": 636000 - }, - { - "epoch": 9.73, - "learning_rate": 5.4662301837247985e-05, - "loss": 0.2508, - "step": 637000 - }, - { - "epoch": 9.74, - "learning_rate": 5.4446651850999604e-05, - "loss": 0.2506, - "step": 638000 - }, - { - "epoch": 9.76, - "learning_rate": 5.4231281312072544e-05, - "loss": 0.2505, - "step": 639000 - }, - { - "epoch": 9.77, - "learning_rate": 5.401619257572453e-05, - "loss": 0.2502, - "step": 640000 - }, - { - "epoch": 9.77, - "eval_runtime": 1.0069, - "eval_samples_per_second": 993.184, - "eval_steps_per_second": 15.891, - "step": 640000 - }, - { - "epoch": 9.79, - "learning_rate": 5.3801387994131576e-05, - "loss": 0.2501, - "step": 641000 - }, - { - "epoch": 9.8, - "learning_rate": 5.358686991636209e-05, - "loss": 0.2503, - "step": 642000 - }, - { - "epoch": 9.82, - "learning_rate": 5.3372640688351476e-05, - "loss": 0.2505, - "step": 643000 - }, - { - "epoch": 9.83, - "learning_rate": 5.315870265287618e-05, - "loss": 0.2502, - "step": 644000 - }, - { - "epoch": 9.85, - "learning_rate": 5.294505814952835e-05, - "loss": 0.2501, - "step": 645000 - }, - { - "epoch": 9.85, - "eval_runtime": 1.0688, - "eval_samples_per_second": 935.652, - "eval_steps_per_second": 14.97, - "step": 645000 - }, - { - "epoch": 9.86, - "learning_rate": 5.2731709514689995e-05, - "loss": 0.2502, - "step": 646000 - }, - { - "epoch": 9.88, - "learning_rate": 5.25186590815076e-05, - "loss": 0.2501, - "step": 647000 - }, - { - "epoch": 9.9, - "learning_rate": 5.2305909179866635e-05, - "loss": 0.2495, - "step": 648000 - }, - { - "epoch": 9.91, - "learning_rate": 5.209346213636584e-05, - "loss": 0.2498, - "step": 649000 - }, - { - "epoch": 9.93, - "learning_rate": 5.188132027429215e-05, - "loss": 0.2495, - "step": 650000 - }, - { - "epoch": 9.93, - "eval_runtime": 1.0361, - "eval_samples_per_second": 965.164, - "eval_steps_per_second": 15.443, - "step": 650000 - }, - { - "epoch": 9.94, - "learning_rate": 5.166948591359489e-05, - "loss": 0.2493, - "step": 651000 - }, - { - "epoch": 9.96, - "learning_rate": 5.145796137086076e-05, - "loss": 0.2493, - "step": 652000 - }, - { - "epoch": 9.97, - "learning_rate": 5.124674895928823e-05, - "loss": 0.2493, - "step": 653000 - }, - { - "epoch": 9.99, - "learning_rate": 5.103585098866237e-05, - "loss": 0.2491, - "step": 654000 - }, - { - "epoch": 10.0, - "learning_rate": 5.082526976532968e-05, - "loss": 0.249, - "step": 655000 - }, - { - "epoch": 10.0, - "eval_runtime": 1.0267, - "eval_samples_per_second": 974.027, - "eval_steps_per_second": 15.584, - "step": 655000 - }, - { - "epoch": 10.02, - "learning_rate": 5.061500759217261e-05, - "loss": 0.2494, - "step": 656000 - }, - { - "epoch": 10.03, - "learning_rate": 5.04050667685846e-05, - "loss": 0.2487, - "step": 657000 - }, - { - "epoch": 10.05, - "learning_rate": 5.01954495904449e-05, - "loss": 0.2485, - "step": 658000 - }, - { - "epoch": 10.06, - "learning_rate": 4.998615835009339e-05, - "loss": 0.2488, - "step": 659000 - }, - { - "epoch": 10.08, - "learning_rate": 4.97771953363055e-05, - "loss": 0.2489, - "step": 660000 - }, - { - "epoch": 10.08, - "eval_runtime": 1.0445, - "eval_samples_per_second": 957.361, - "eval_steps_per_second": 15.318, - "step": 660000 - }, - { - "epoch": 10.09, - "learning_rate": 4.956856283426728e-05, - "loss": 0.2487, - "step": 661000 - }, - { - "epoch": 10.11, - "learning_rate": 4.936026312555037e-05, - "loss": 0.248, - "step": 662000 - }, - { - "epoch": 10.12, - "learning_rate": 4.915229848808698e-05, - "loss": 0.2478, - "step": 663000 - }, - { - "epoch": 10.14, - "learning_rate": 4.8944671196145136e-05, - "loss": 0.2484, - "step": 664000 - }, - { - "epoch": 10.15, - "learning_rate": 4.8737383520303546e-05, - "loss": 0.2485, - "step": 665000 - }, - { - "epoch": 10.15, - "eval_runtime": 1.1085, - "eval_samples_per_second": 902.106, - "eval_steps_per_second": 14.434, - "step": 665000 - }, - { - "epoch": 10.17, - "learning_rate": 4.853043772742709e-05, - "loss": 0.248, - "step": 666000 - }, - { - "epoch": 10.19, - "learning_rate": 4.832383608064172e-05, - "loss": 0.2476, - "step": 667000 - }, - { - "epoch": 10.2, - "learning_rate": 4.811758083931005e-05, - "loss": 0.2478, - "step": 668000 - }, - { - "epoch": 10.22, - "learning_rate": 4.791167425900632e-05, - "loss": 0.2481, - "step": 669000 - }, - { - "epoch": 10.23, - "learning_rate": 4.770611859149185e-05, - "loss": 0.2508, - "step": 670000 - }, - { - "epoch": 10.23, - "eval_runtime": 1.1412, - "eval_samples_per_second": 876.243, - "eval_steps_per_second": 14.02, - "step": 670000 - }, - { - "epoch": 10.25, - "learning_rate": 4.7500916084690564e-05, - "loss": 0.2542, - "step": 671000 - }, - { - "epoch": 10.26, - "learning_rate": 4.729606898266411e-05, - "loss": 0.2507, - "step": 672000 - }, - { - "epoch": 10.28, - "learning_rate": 4.709157952558768e-05, - "loss": 0.2478, - "step": 673000 - }, - { - "epoch": 10.29, - "learning_rate": 4.688744994972514e-05, - "loss": 0.2482, - "step": 674000 - }, - { - "epoch": 10.31, - "learning_rate": 4.668368248740485e-05, - "loss": 0.247, - "step": 675000 - }, - { - "epoch": 10.31, - "eval_runtime": 0.9224, - "eval_samples_per_second": 1084.145, - "eval_steps_per_second": 17.346, - "step": 675000 - }, - { - "epoch": 10.32, - "learning_rate": 4.6480279366995116e-05, - "loss": 0.2472, - "step": 676000 - }, - { - "epoch": 10.34, - "learning_rate": 4.6277242812879914e-05, - "loss": 0.2473, - "step": 677000 - }, - { - "epoch": 10.35, - "learning_rate": 4.607457504543447e-05, - "loss": 0.2471, - "step": 678000 - }, - { - "epoch": 10.37, - "learning_rate": 4.5872278281000955e-05, - "loss": 0.2469, - "step": 679000 - }, - { - "epoch": 10.38, - "learning_rate": 4.567035473186444e-05, - "loss": 0.2469, - "step": 680000 - }, - { - "epoch": 10.38, - "eval_runtime": 0.7393, - "eval_samples_per_second": 1352.617, - "eval_steps_per_second": 21.642, - "step": 680000 - }, - { - "epoch": 10.4, - "learning_rate": 4.546880660622845e-05, - "loss": 0.2463, - "step": 681000 - }, - { - "epoch": 10.41, - "learning_rate": 4.5267636108191036e-05, - "loss": 0.2466, - "step": 682000 - }, - { - "epoch": 10.43, - "learning_rate": 4.5066845437720555e-05, - "loss": 0.2462, - "step": 683000 - }, - { - "epoch": 10.44, - "learning_rate": 4.4866436790631564e-05, - "loss": 0.2463, - "step": 684000 - }, - { - "epoch": 10.46, - "learning_rate": 4.4666412358560955e-05, - "loss": 0.2461, - "step": 685000 - }, - { - "epoch": 10.46, - "eval_runtime": 0.7931, - "eval_samples_per_second": 1260.883, - "eval_steps_per_second": 20.174, - "step": 685000 - }, - { - "epoch": 10.48, - "learning_rate": 4.4466774328943796e-05, - "loss": 0.2462, - "step": 686000 - }, - { - "epoch": 10.49, - "learning_rate": 4.426752488498972e-05, - "loss": 0.2462, - "step": 687000 - }, - { - "epoch": 10.51, - "learning_rate": 4.406866620565862e-05, - "loss": 0.2459, - "step": 688000 - }, - { - "epoch": 10.52, - "learning_rate": 4.3870200465637164e-05, - "loss": 0.2471, - "step": 689000 - }, - { - "epoch": 10.54, - "learning_rate": 4.3672129835314955e-05, - "loss": 0.2481, - "step": 690000 - }, - { - "epoch": 10.54, - "eval_runtime": 0.8338, - "eval_samples_per_second": 1199.26, - "eval_steps_per_second": 19.188, - "step": 690000 - }, - { - "epoch": 10.55, - "learning_rate": 4.347445648076057e-05, - "loss": 0.2463, - "step": 691000 - }, - { - "epoch": 10.57, - "learning_rate": 4.327718256369826e-05, - "loss": 0.2458, - "step": 692000 - }, - { - "epoch": 10.58, - "learning_rate": 4.3080310241483885e-05, - "loss": 0.2451, - "step": 693000 - }, - { - "epoch": 10.6, - "learning_rate": 4.2883841667081675e-05, - "loss": 0.2454, - "step": 694000 - }, - { - "epoch": 10.61, - "learning_rate": 4.268777898904044e-05, - "loss": 0.2455, - "step": 695000 - }, - { - "epoch": 10.61, - "eval_runtime": 0.794, - "eval_samples_per_second": 1259.505, - "eval_steps_per_second": 20.152, - "step": 695000 - }, - { - "epoch": 10.63, - "learning_rate": 4.2492124351470214e-05, - "loss": 0.2453, - "step": 696000 - }, - { - "epoch": 10.64, - "learning_rate": 4.2296879894018835e-05, - "loss": 0.2449, - "step": 697000 - }, - { - "epoch": 10.66, - "learning_rate": 4.210204775184834e-05, - "loss": 0.245, - "step": 698000 - }, - { - "epoch": 10.67, - "learning_rate": 4.190763005561186e-05, - "loss": 0.2447, - "step": 699000 - }, - { - "epoch": 10.69, - "learning_rate": 4.171362893143013e-05, - "loss": 0.2444, - "step": 700000 - }, - { - "epoch": 10.69, - "eval_runtime": 0.7798, - "eval_samples_per_second": 1282.444, - "eval_steps_per_second": 20.519, - "step": 700000 - }, - { - "epoch": 10.7, - "learning_rate": 4.1520046500868384e-05, - "loss": 0.2442, - "step": 701000 - }, - { - "epoch": 10.72, - "learning_rate": 4.1326884880913074e-05, - "loss": 0.2454, - "step": 702000 - }, - { - "epoch": 10.73, - "learning_rate": 4.1134146183948724e-05, - "loss": 0.2445, - "step": 703000 - }, - { - "epoch": 10.75, - "learning_rate": 4.0941832517734885e-05, - "loss": 0.2448, - "step": 704000 - }, - { - "epoch": 10.77, - "learning_rate": 4.0749945985382915e-05, - "loss": 0.2445, - "step": 705000 - }, - { - "epoch": 10.77, - "eval_runtime": 0.7458, - "eval_samples_per_second": 1340.853, - "eval_steps_per_second": 21.454, - "step": 705000 - }, - { - "epoch": 10.78, - "learning_rate": 4.0558488685333235e-05, - "loss": 0.253, - "step": 706000 - }, - { - "epoch": 10.8, - "learning_rate": 4.036746271133223e-05, - "loss": 0.2533, - "step": 707000 - }, - { - "epoch": 10.81, - "learning_rate": 4.0176870152409324e-05, - "loss": 0.2547, - "step": 708000 - }, - { - "epoch": 10.83, - "learning_rate": 3.998671309285417e-05, - "loss": 0.2529, - "step": 709000 - }, - { - "epoch": 10.84, - "learning_rate": 3.979699361219395e-05, - "loss": 0.2457, - "step": 710000 - }, - { - "epoch": 10.84, - "eval_runtime": 0.7472, - "eval_samples_per_second": 1338.326, - "eval_steps_per_second": 21.413, - "step": 710000 - }, - { - "epoch": 10.86, - "learning_rate": 3.960771378517049e-05, - "loss": 0.2438, - "step": 711000 - }, - { - "epoch": 10.87, - "learning_rate": 3.941887568171766e-05, - "loss": 0.2464, - "step": 712000 - }, - { - "epoch": 10.89, - "learning_rate": 3.923048136693873e-05, - "loss": 0.2445, - "step": 713000 - }, - { - "epoch": 10.9, - "learning_rate": 3.904253290108369e-05, - "loss": 0.2435, - "step": 714000 - }, - { - "epoch": 10.92, - "learning_rate": 3.885503233952689e-05, - "loss": 0.2446, - "step": 715000 - }, - { - "epoch": 10.92, - "eval_runtime": 0.8432, - "eval_samples_per_second": 1186.017, - "eval_steps_per_second": 18.976, - "step": 715000 - }, - { - "epoch": 10.93, - "learning_rate": 3.86679817327444e-05, - "loss": 0.2432, - "step": 716000 - }, - { - "epoch": 10.95, - "learning_rate": 3.848138312629171e-05, - "loss": 0.2433, - "step": 717000 - }, - { - "epoch": 10.96, - "learning_rate": 3.8295238560781317e-05, - "loss": 0.2436, - "step": 718000 - }, - { - "epoch": 10.98, - "learning_rate": 3.810955007186029e-05, - "loss": 0.2433, - "step": 719000 - }, - { - "epoch": 10.99, - "learning_rate": 3.792431969018824e-05, - "loss": 0.243, - "step": 720000 - }, - { - "epoch": 10.99, - "eval_runtime": 0.7755, - "eval_samples_per_second": 1289.466, - "eval_steps_per_second": 20.631, - "step": 720000 - }, - { - "epoch": 11.01, - "learning_rate": 3.7739549441414945e-05, - "loss": 0.2427, - "step": 721000 - }, - { - "epoch": 11.03, - "learning_rate": 3.755524134615825e-05, - "loss": 0.2429, - "step": 722000 - }, - { - "epoch": 11.04, - "learning_rate": 3.7371397419981925e-05, - "loss": 0.2428, - "step": 723000 - }, - { - "epoch": 11.06, - "learning_rate": 3.7188019673373706e-05, - "loss": 0.2431, - "step": 724000 - }, - { - "epoch": 11.07, - "learning_rate": 3.700511011172325e-05, - "loss": 0.2436, - "step": 725000 - }, - { - "epoch": 11.07, - "eval_runtime": 0.7297, - "eval_samples_per_second": 1370.472, - "eval_steps_per_second": 21.928, - "step": 725000 - }, - { - "epoch": 11.09, - "learning_rate": 3.682267073530023e-05, - "loss": 0.243, - "step": 726000 - }, - { - "epoch": 11.1, - "learning_rate": 3.664070353923245e-05, - "loss": 0.2424, - "step": 727000 - }, - { - "epoch": 11.12, - "learning_rate": 3.645921051348396e-05, - "loss": 0.2423, - "step": 728000 - }, - { - "epoch": 11.13, - "learning_rate": 3.627819364283345e-05, - "loss": 0.2456, - "step": 729000 - }, - { - "epoch": 11.15, - "learning_rate": 3.6097654906852405e-05, - "loss": 0.2431, - "step": 730000 - }, - { - "epoch": 11.15, - "eval_runtime": 0.7906, - "eval_samples_per_second": 1264.795, - "eval_steps_per_second": 20.237, - "step": 730000 - }, - { - "epoch": 11.16, - "learning_rate": 3.591759627988353e-05, - "loss": 0.242, - "step": 731000 - }, - { - "epoch": 11.18, - "learning_rate": 3.573801973101913e-05, - "loss": 0.2418, - "step": 732000 - }, - { - "epoch": 11.19, - "learning_rate": 3.5558927224079534e-05, - "loss": 0.2418, - "step": 733000 - }, - { - "epoch": 11.21, - "learning_rate": 3.5380320717591716e-05, - "loss": 0.2419, - "step": 734000 - }, - { - "epoch": 11.22, - "learning_rate": 3.5202202164767836e-05, - "loss": 0.2418, - "step": 735000 - }, - { - "epoch": 11.22, - "eval_runtime": 0.8971, - "eval_samples_per_second": 1114.723, - "eval_steps_per_second": 17.836, - "step": 735000 - }, - { - "epoch": 11.24, - "learning_rate": 3.5024573513483864e-05, - "loss": 0.2415, - "step": 736000 - }, - { - "epoch": 11.25, - "learning_rate": 3.484743670625822e-05, - "loss": 0.2414, - "step": 737000 - }, - { - "epoch": 11.27, - "learning_rate": 3.467079368023068e-05, - "loss": 0.2413, - "step": 738000 - }, - { - "epoch": 11.28, - "learning_rate": 3.449464636714107e-05, - "loss": 0.2415, - "step": 739000 - }, - { - "epoch": 11.3, - "learning_rate": 3.431899669330819e-05, - "loss": 0.2414, - "step": 740000 - }, - { - "epoch": 11.3, - "eval_runtime": 0.7754, - "eval_samples_per_second": 1289.598, - "eval_steps_per_second": 20.634, - "step": 740000 - }, - { - "epoch": 11.32, - "learning_rate": 3.4143846579608744e-05, - "loss": 0.2411, - "step": 741000 - }, - { - "epoch": 11.33, - "learning_rate": 3.396919794145629e-05, - "loss": 0.2412, - "step": 742000 - }, - { - "epoch": 11.35, - "learning_rate": 3.3795052688780345e-05, - "loss": 0.241, - "step": 743000 - }, - { - "epoch": 11.36, - "learning_rate": 3.362141272600552e-05, - "loss": 0.2413, - "step": 744000 - }, - { - "epoch": 11.38, - "learning_rate": 3.3448279952030615e-05, - "loss": 0.241, - "step": 745000 - }, - { - "epoch": 11.38, - "eval_runtime": 0.937, - "eval_samples_per_second": 1067.221, - "eval_steps_per_second": 17.076, - "step": 745000 - }, - { - "epoch": 11.39, - "learning_rate": 3.327565626020793e-05, - "loss": 0.2408, - "step": 746000 - }, - { - "epoch": 11.41, - "learning_rate": 3.3103543538322455e-05, - "loss": 0.2408, - "step": 747000 - }, - { - "epoch": 11.42, - "learning_rate": 3.293194366857137e-05, - "loss": 0.2407, - "step": 748000 - }, - { - "epoch": 11.44, - "learning_rate": 3.276085852754336e-05, - "loss": 0.2409, - "step": 749000 - }, - { - "epoch": 11.45, - "learning_rate": 3.259028998619814e-05, - "loss": 0.2405, - "step": 750000 - }, - { - "epoch": 11.45, - "eval_runtime": 0.7243, - "eval_samples_per_second": 1380.717, - "eval_steps_per_second": 22.091, - "step": 750000 - }, - { - "epoch": 11.47, - "learning_rate": 3.2420239909845894e-05, - "loss": 0.2403, - "step": 751000 - }, - { - "epoch": 11.48, - "learning_rate": 3.2250710158127045e-05, - "loss": 0.2402, - "step": 752000 - }, - { - "epoch": 11.5, - "learning_rate": 3.2081702584991786e-05, - "loss": 0.2398, - "step": 753000 - }, - { - "epoch": 11.51, - "learning_rate": 3.191321903867988e-05, - "loss": 0.2401, - "step": 754000 - }, - { - "epoch": 11.53, - "learning_rate": 3.174526136170039e-05, - "loss": 0.2403, - "step": 755000 - }, - { - "epoch": 11.53, - "eval_runtime": 0.695, - "eval_samples_per_second": 1438.835, - "eval_steps_per_second": 23.021, - "step": 755000 - }, - { - "epoch": 11.54, - "learning_rate": 3.157783139081155e-05, - "loss": 0.24, - "step": 756000 - }, - { - "epoch": 11.56, - "learning_rate": 3.141093095700072e-05, - "loss": 0.2401, - "step": 757000 - }, - { - "epoch": 11.57, - "learning_rate": 3.1244561885464244e-05, - "loss": 0.252, - "step": 758000 - }, - { - "epoch": 11.59, - "learning_rate": 3.107872599558769e-05, - "loss": 0.24, - "step": 759000 - }, - { - "epoch": 11.61, - "learning_rate": 3.0913425100925795e-05, - "loss": 0.2396, - "step": 760000 - }, - { - "epoch": 11.61, - "eval_runtime": 0.7192, - "eval_samples_per_second": 1390.499, - "eval_steps_per_second": 22.248, - "step": 760000 - }, - { - "epoch": 11.62, - "learning_rate": 3.0748661009182616e-05, - "loss": 0.2396, - "step": 761000 - }, - { - "epoch": 11.64, - "learning_rate": 3.0584435522191896e-05, - "loss": 0.2395, - "step": 762000 - }, - { - "epoch": 11.65, - "learning_rate": 3.0420750435897183e-05, - "loss": 0.2393, - "step": 763000 - }, - { - "epoch": 11.67, - "learning_rate": 3.025760754033246e-05, - "loss": 0.239, - "step": 764000 - }, - { - "epoch": 11.68, - "learning_rate": 3.0095008619602206e-05, - "loss": 0.2392, - "step": 765000 - }, - { - "epoch": 11.68, - "eval_runtime": 0.7905, - "eval_samples_per_second": 1264.968, - "eval_steps_per_second": 20.239, - "step": 765000 - }, - { - "epoch": 11.7, - "learning_rate": 2.993295545186223e-05, - "loss": 0.2393, - "step": 766000 - }, - { - "epoch": 11.71, - "learning_rate": 2.977144980929996e-05, - "loss": 0.2392, - "step": 767000 - }, - { - "epoch": 11.73, - "learning_rate": 2.961049345811523e-05, - "loss": 0.2388, - "step": 768000 - }, - { - "epoch": 11.74, - "learning_rate": 2.945008815850097e-05, - "loss": 0.2392, - "step": 769000 - }, - { - "epoch": 11.76, - "learning_rate": 2.929023566462377e-05, - "loss": 0.2391, - "step": 770000 - }, - { - "epoch": 11.76, - "eval_runtime": 0.8418, - "eval_samples_per_second": 1187.898, - "eval_steps_per_second": 19.006, - "step": 770000 - }, - { - "epoch": 11.77, - "learning_rate": 2.9130937724604947e-05, - "loss": 0.2401, - "step": 771000 - }, - { - "epoch": 11.79, - "learning_rate": 2.8972196080501208e-05, - "loss": 0.2392, - "step": 772000 - }, - { - "epoch": 11.8, - "learning_rate": 2.8814012468285748e-05, - "loss": 0.2395, - "step": 773000 - }, - { - "epoch": 11.82, - "learning_rate": 2.865638861782922e-05, - "loss": 0.2387, - "step": 774000 - }, - { - "epoch": 11.83, - "learning_rate": 2.849932625288079e-05, - "loss": 0.2383, - "step": 775000 - }, - { - "epoch": 11.83, - "eval_runtime": 0.7561, - "eval_samples_per_second": 1322.637, - "eval_steps_per_second": 21.162, - "step": 775000 - }, - { - "epoch": 11.85, - "learning_rate": 2.8342827091049336e-05, - "loss": 0.2383, - "step": 776000 - }, - { - "epoch": 11.86, - "learning_rate": 2.8186892843784587e-05, - "loss": 0.2384, - "step": 777000 - }, - { - "epoch": 11.88, - "learning_rate": 2.803152521635851e-05, - "loss": 0.2382, - "step": 778000 - }, - { - "epoch": 11.9, - "learning_rate": 2.7876725907846578e-05, - "loss": 0.2378, - "step": 779000 - }, - { - "epoch": 11.91, - "learning_rate": 2.7722496611109243e-05, - "loss": 0.2378, - "step": 780000 - }, - { - "epoch": 11.91, - "eval_runtime": 0.7835, - "eval_samples_per_second": 1276.363, - "eval_steps_per_second": 20.422, - "step": 780000 - }, - { - "epoch": 11.93, - "learning_rate": 2.7568839012773365e-05, - "loss": 0.238, - "step": 781000 - }, - { - "epoch": 11.94, - "learning_rate": 2.7415754793213826e-05, - "loss": 0.2375, - "step": 782000 - }, - { - "epoch": 11.96, - "learning_rate": 2.7263245626535116e-05, - "loss": 0.2377, - "step": 783000 - }, - { - "epoch": 11.97, - "learning_rate": 2.7111313180553077e-05, - "loss": 0.2378, - "step": 784000 - }, - { - "epoch": 11.99, - "learning_rate": 2.6959959116776587e-05, - "loss": 0.2376, - "step": 785000 - }, - { - "epoch": 11.99, - "eval_runtime": 0.7664, - "eval_samples_per_second": 1304.853, - "eval_steps_per_second": 20.878, - "step": 785000 - }, - { - "epoch": 12.0, - "learning_rate": 2.6809185090389406e-05, - "loss": 0.2371, - "step": 786000 - }, - { - "epoch": 12.02, - "learning_rate": 2.6658992750232167e-05, - "loss": 0.2373, - "step": 787000 - }, - { - "epoch": 12.03, - "learning_rate": 2.6509383738784218e-05, - "loss": 0.2374, - "step": 788000 - }, - { - "epoch": 12.05, - "learning_rate": 2.6360359692145757e-05, - "loss": 0.237, - "step": 789000 - }, - { - "epoch": 12.06, - "learning_rate": 2.6211922240019883e-05, - "loss": 0.2368, - "step": 790000 - }, - { - "epoch": 12.06, - "eval_runtime": 0.7543, - "eval_samples_per_second": 1325.719, - "eval_steps_per_second": 21.212, - "step": 790000 - }, - { - "epoch": 12.08, - "learning_rate": 2.6064073005694758e-05, - "loss": 0.2381, - "step": 791000 - }, - { - "epoch": 12.09, - "learning_rate": 2.591681360602595e-05, - "loss": 0.2373, - "step": 792000 - }, - { - "epoch": 12.11, - "learning_rate": 2.577014565141866e-05, - "loss": 0.2377, - "step": 793000 - }, - { - "epoch": 12.12, - "learning_rate": 2.562407074581014e-05, - "loss": 0.2382, - "step": 794000 - }, - { - "epoch": 12.14, - "learning_rate": 2.5478590486652137e-05, - "loss": 0.2374, - "step": 795000 - }, - { - "epoch": 12.14, - "eval_runtime": 0.8227, - "eval_samples_per_second": 1215.581, - "eval_steps_per_second": 19.449, - "step": 795000 - }, - { - "epoch": 12.16, - "learning_rate": 2.533370646489347e-05, - "loss": 0.237, - "step": 796000 - }, - { - "epoch": 12.17, - "learning_rate": 2.5189420264962586e-05, - "loss": 0.2367, - "step": 797000 - }, - { - "epoch": 12.19, - "learning_rate": 2.504573346475026e-05, - "loss": 0.2371, - "step": 798000 - }, - { - "epoch": 12.2, - "learning_rate": 2.4902647635592324e-05, - "loss": 0.2372, - "step": 799000 - }, - { - "epoch": 12.22, - "learning_rate": 2.476016434225246e-05, - "loss": 0.2372, - "step": 800000 - }, - { - "epoch": 12.22, - "eval_runtime": 0.741, - "eval_samples_per_second": 1349.61, - "eval_steps_per_second": 21.594, - "step": 800000 - }, - { - "epoch": 12.23, - "learning_rate": 2.461828514290513e-05, - "loss": 0.2364, - "step": 801000 - }, - { - "epoch": 12.25, - "learning_rate": 2.447701158911855e-05, - "loss": 0.2373, - "step": 802000 - }, - { - "epoch": 12.26, - "learning_rate": 2.4336345225837658e-05, - "loss": 0.2369, - "step": 803000 - }, - { - "epoch": 12.28, - "learning_rate": 2.4196287591367296e-05, - "loss": 0.2363, - "step": 804000 - }, - { - "epoch": 12.29, - "learning_rate": 2.405684021735527e-05, - "loss": 0.2366, - "step": 805000 - }, - { - "epoch": 12.29, - "eval_runtime": 0.7797, - "eval_samples_per_second": 1282.575, - "eval_steps_per_second": 20.521, - "step": 805000 - }, - { - "epoch": 12.31, - "learning_rate": 2.3918004628775736e-05, - "loss": 0.2366, - "step": 806000 - }, - { - "epoch": 12.32, - "learning_rate": 2.3779782343912463e-05, - "loss": 0.2367, - "step": 807000 - }, - { - "epoch": 12.34, - "learning_rate": 2.364217487434221e-05, - "loss": 0.24, - "step": 808000 - }, - { - "epoch": 12.35, - "learning_rate": 2.3505183724918196e-05, - "loss": 0.2369, - "step": 809000 - }, - { - "epoch": 12.37, - "learning_rate": 2.3368810393753687e-05, - "loss": 0.2365, - "step": 810000 - }, - { - "epoch": 12.37, - "eval_runtime": 0.7457, - "eval_samples_per_second": 1341.078, - "eval_steps_per_second": 21.457, - "step": 810000 - }, - { - "epoch": 12.38, - "learning_rate": 2.32330563722056e-05, - "loss": 0.2357, - "step": 811000 - }, - { - "epoch": 12.4, - "learning_rate": 2.309792314485815e-05, - "loss": 0.2356, - "step": 812000 - }, - { - "epoch": 12.41, - "learning_rate": 2.2963412189506695e-05, - "loss": 0.2358, - "step": 813000 - }, - { - "epoch": 12.43, - "learning_rate": 2.282952497714145e-05, - "loss": 0.2356, - "step": 814000 - }, - { - "epoch": 12.45, - "learning_rate": 2.2696262971931538e-05, - "loss": 0.2357, - "step": 815000 - }, - { - "epoch": 12.45, - "eval_runtime": 0.7163, - "eval_samples_per_second": 1396.105, - "eval_steps_per_second": 22.338, - "step": 815000 - }, - { - "epoch": 12.46, - "learning_rate": 2.2563627631208887e-05, - "loss": 0.2355, - "step": 816000 - }, - { - "epoch": 12.48, - "learning_rate": 2.2431620405452336e-05, - "loss": 0.2351, - "step": 817000 - }, - { - "epoch": 12.49, - "learning_rate": 2.230024273827179e-05, - "loss": 0.2357, - "step": 818000 - }, - { - "epoch": 12.51, - "learning_rate": 2.216949606639231e-05, - "loss": 0.2353, - "step": 819000 - }, - { - "epoch": 12.52, - "learning_rate": 2.2039381819638596e-05, - "loss": 0.2351, - "step": 820000 - }, - { - "epoch": 12.52, - "eval_runtime": 0.6211, - "eval_samples_per_second": 1609.99, - "eval_steps_per_second": 25.76, - "step": 820000 - }, - { - "epoch": 12.54, - "learning_rate": 2.1909901420919184e-05, - "loss": 0.2351, - "step": 821000 - }, - { - "epoch": 12.55, - "learning_rate": 2.1781056286210997e-05, - "loss": 0.235, - "step": 822000 - }, - { - "epoch": 12.57, - "learning_rate": 2.1652847824543744e-05, - "loss": 0.2347, - "step": 823000 - }, - { - "epoch": 12.58, - "learning_rate": 2.1525277437984636e-05, - "loss": 0.2348, - "step": 824000 - }, - { - "epoch": 12.6, - "learning_rate": 2.1398346521623e-05, - "loss": 0.2345, - "step": 825000 - }, - { - "epoch": 12.6, - "eval_runtime": 0.7605, - "eval_samples_per_second": 1314.934, - "eval_steps_per_second": 21.039, - "step": 825000 - }, - { - "epoch": 12.61, - "learning_rate": 2.1272056463554978e-05, - "loss": 0.2343, - "step": 826000 - }, - { - "epoch": 12.63, - "learning_rate": 2.114640864486845e-05, - "loss": 0.2346, - "step": 827000 - }, - { - "epoch": 12.64, - "learning_rate": 2.1021404439627775e-05, - "loss": 0.2344, - "step": 828000 - }, - { - "epoch": 12.66, - "learning_rate": 2.089704521485896e-05, - "loss": 0.2344, - "step": 829000 - }, - { - "epoch": 12.67, - "learning_rate": 2.0773332330534513e-05, - "loss": 0.2343, - "step": 830000 - }, - { - "epoch": 12.67, - "eval_runtime": 0.7327, - "eval_samples_per_second": 1364.889, - "eval_steps_per_second": 21.838, - "step": 830000 - }, - { - "epoch": 12.69, - "learning_rate": 2.0650267139558772e-05, - "loss": 0.2339, - "step": 831000 - }, - { - "epoch": 12.7, - "learning_rate": 2.052785098775293e-05, - "loss": 0.2339, - "step": 832000 - }, - { - "epoch": 12.72, - "learning_rate": 2.04060852138404e-05, - "loss": 0.234, - "step": 833000 - }, - { - "epoch": 12.74, - "learning_rate": 2.028497114943219e-05, - "loss": 0.234, - "step": 834000 - }, - { - "epoch": 12.75, - "learning_rate": 2.0164510119012263e-05, - "loss": 0.2338, - "step": 835000 - }, - { - "epoch": 12.75, - "eval_runtime": 0.7099, - "eval_samples_per_second": 1408.578, - "eval_steps_per_second": 22.537, - "step": 835000 - }, - { - "epoch": 12.77, - "learning_rate": 2.0044703439923217e-05, - "loss": 0.2336, - "step": 836000 - }, - { - "epoch": 12.78, - "learning_rate": 1.9925552422351654e-05, - "loss": 0.2338, - "step": 837000 - }, - { - "epoch": 12.8, - "learning_rate": 1.9807058369314016e-05, - "loss": 0.2335, - "step": 838000 - }, - { - "epoch": 12.81, - "learning_rate": 1.968922257664231e-05, - "loss": 0.2337, - "step": 839000 - }, - { - "epoch": 12.83, - "learning_rate": 1.9572046332969825e-05, - "loss": 0.2335, - "step": 840000 - }, - { - "epoch": 12.83, - "eval_runtime": 0.7491, - "eval_samples_per_second": 1334.897, - "eval_steps_per_second": 21.358, - "step": 840000 - }, - { - "epoch": 12.84, - "learning_rate": 1.945553091971727e-05, - "loss": 0.2334, - "step": 841000 - }, - { - "epoch": 12.86, - "learning_rate": 1.933967761107847e-05, - "loss": 0.234, - "step": 842000 - }, - { - "epoch": 12.87, - "learning_rate": 1.9224487674006694e-05, - "loss": 0.234, - "step": 843000 - }, - { - "epoch": 12.89, - "learning_rate": 1.9109962368200602e-05, - "loss": 0.2379, - "step": 844000 - }, - { - "epoch": 12.9, - "learning_rate": 1.8996102946090586e-05, - "loss": 0.2335, - "step": 845000 - }, - { - "epoch": 12.9, - "eval_runtime": 0.7039, - "eval_samples_per_second": 1420.612, - "eval_steps_per_second": 22.73, - "step": 845000 - }, - { - "epoch": 12.92, - "learning_rate": 1.888291065282509e-05, - "loss": 0.2338, - "step": 846000 - }, - { - "epoch": 12.93, - "learning_rate": 1.8770386726256865e-05, - "loss": 0.2329, - "step": 847000 - }, - { - "epoch": 12.95, - "learning_rate": 1.8658532396929565e-05, - "loss": 0.2334, - "step": 848000 - }, - { - "epoch": 12.96, - "learning_rate": 1.8547348888064178e-05, - "loss": 0.2341, - "step": 849000 - }, - { - "epoch": 12.98, - "learning_rate": 1.8436837415545772e-05, - "loss": 0.2356, - "step": 850000 - }, - { - "epoch": 12.98, - "eval_runtime": 0.8308, - "eval_samples_per_second": 1203.685, - "eval_steps_per_second": 19.259, - "step": 850000 - }, - { - "epoch": 12.99, - "learning_rate": 1.8326999187910095e-05, - "loss": 0.2342, - "step": 851000 - }, - { - "epoch": 13.01, - "learning_rate": 1.8217835406330415e-05, - "loss": 0.2344, - "step": 852000 - }, - { - "epoch": 13.03, - "learning_rate": 1.810934726460436e-05, - "loss": 0.2328, - "step": 853000 - }, - { - "epoch": 13.04, - "learning_rate": 1.800153594914084e-05, - "loss": 0.2326, - "step": 854000 - }, - { - "epoch": 13.06, - "learning_rate": 1.7894402638947176e-05, - "loss": 0.2325, - "step": 855000 - }, - { - "epoch": 13.06, - "eval_runtime": 0.7234, - "eval_samples_per_second": 1382.419, - "eval_steps_per_second": 22.119, - "step": 855000 - }, - { - "epoch": 13.07, - "learning_rate": 1.778794850561604e-05, - "loss": 0.2327, - "step": 856000 - }, - { - "epoch": 13.09, - "learning_rate": 1.7682174713312805e-05, - "loss": 0.2326, - "step": 857000 - }, - { - "epoch": 13.1, - "learning_rate": 1.75770824187627e-05, - "loss": 0.2325, - "step": 858000 - }, - { - "epoch": 13.12, - "learning_rate": 1.747267277123821e-05, - "loss": 0.2327, - "step": 859000 - }, - { - "epoch": 13.13, - "learning_rate": 1.7368946912546556e-05, - "loss": 0.2329, - "step": 860000 - }, - { - "epoch": 13.13, - "eval_runtime": 0.7568, - "eval_samples_per_second": 1321.327, - "eval_steps_per_second": 21.141, - "step": 860000 - }, - { - "epoch": 13.15, - "learning_rate": 1.726590597701708e-05, - "loss": 0.2322, - "step": 861000 - }, - { - "epoch": 13.16, - "learning_rate": 1.7163551091488952e-05, - "loss": 0.2375, - "step": 862000 - }, - { - "epoch": 13.18, - "learning_rate": 1.7061883375298788e-05, - "loss": 0.2328, - "step": 863000 - }, - { - "epoch": 13.19, - "learning_rate": 1.6960903940268456e-05, - "loss": 0.2323, - "step": 864000 - }, - { - "epoch": 13.21, - "learning_rate": 1.6860613890692876e-05, - "loss": 0.2334, - "step": 865000 - }, - { - "epoch": 13.21, - "eval_runtime": 0.7389, - "eval_samples_per_second": 1353.416, - "eval_steps_per_second": 21.655, - "step": 865000 - }, - { - "epoch": 13.22, - "learning_rate": 1.6761014323327962e-05, - "loss": 0.233, - "step": 866000 - }, - { - "epoch": 13.24, - "learning_rate": 1.6662106327378645e-05, - "loss": 0.2334, - "step": 867000 - }, - { - "epoch": 13.25, - "learning_rate": 1.6563890984486884e-05, - "loss": 0.2333, - "step": 868000 - }, - { - "epoch": 13.27, - "learning_rate": 1.6466369368719955e-05, - "loss": 0.2324, - "step": 869000 - }, - { - "epoch": 13.29, - "learning_rate": 1.6369542546558626e-05, - "loss": 0.2324, - "step": 870000 - }, - { - "epoch": 13.29, - "eval_runtime": 0.8823, - "eval_samples_per_second": 1133.455, - "eval_steps_per_second": 18.135, - "step": 870000 - }, - { - "epoch": 13.3, - "learning_rate": 1.6273411576885517e-05, - "loss": 0.2323, - "step": 871000 - }, - { - "epoch": 13.32, - "learning_rate": 1.617797751097349e-05, - "loss": 0.2322, - "step": 872000 - }, - { - "epoch": 13.33, - "learning_rate": 1.608324139247421e-05, - "loss": 0.2335, - "step": 873000 - }, - { - "epoch": 13.35, - "learning_rate": 1.5989204257406693e-05, - "loss": 0.2329, - "step": 874000 - }, - { - "epoch": 13.36, - "learning_rate": 1.5895867134145974e-05, - "loss": 0.2325, - "step": 875000 - }, - { - "epoch": 13.36, - "eval_runtime": 0.8114, - "eval_samples_per_second": 1232.442, - "eval_steps_per_second": 19.719, - "step": 875000 - }, - { - "epoch": 13.38, - "learning_rate": 1.5803231043411912e-05, - "loss": 0.2322, - "step": 876000 - }, - { - "epoch": 13.39, - "learning_rate": 1.5711296998257902e-05, - "loss": 0.232, - "step": 877000 - }, - { - "epoch": 13.41, - "learning_rate": 1.562006600405996e-05, - "loss": 0.2322, - "step": 878000 - }, - { - "epoch": 13.42, - "learning_rate": 1.5529539058505624e-05, - "loss": 0.2317, - "step": 879000 - }, - { - "epoch": 13.44, - "learning_rate": 1.543971715158307e-05, - "loss": 0.2318, - "step": 880000 - }, - { - "epoch": 13.44, - "eval_runtime": 0.8591, - "eval_samples_per_second": 1164.009, - "eval_steps_per_second": 18.624, - "step": 880000 - }, - { - "epoch": 13.45, - "learning_rate": 1.535060126557028e-05, - "loss": 0.2322, - "step": 881000 - }, - { - "epoch": 13.47, - "learning_rate": 1.5262192375024284e-05, - "loss": 0.232, - "step": 882000 - }, - { - "epoch": 13.48, - "learning_rate": 1.5174491446770566e-05, - "loss": 0.2314, - "step": 883000 - }, - { - "epoch": 13.5, - "learning_rate": 1.508749943989242e-05, - "loss": 0.2312, - "step": 884000 - }, - { - "epoch": 13.51, - "learning_rate": 1.500121730572051e-05, - "loss": 0.2314, - "step": 885000 - }, - { - "epoch": 13.51, - "eval_runtime": 0.7508, - "eval_samples_per_second": 1331.906, - "eval_steps_per_second": 21.31, - "step": 885000 - }, - { - "epoch": 13.53, - "learning_rate": 1.4915645987822406e-05, - "loss": 0.2314, - "step": 886000 - }, - { - "epoch": 13.54, - "learning_rate": 1.4830786421992347e-05, - "loss": 0.2316, - "step": 887000 - }, - { - "epoch": 13.56, - "learning_rate": 1.4746639536240942e-05, - "loss": 0.2312, - "step": 888000 - }, - { - "epoch": 13.58, - "learning_rate": 1.4663206250785055e-05, - "loss": 0.2315, - "step": 889000 - }, - { - "epoch": 13.59, - "learning_rate": 1.4580487478037748e-05, - "loss": 0.2311, - "step": 890000 - }, - { - "epoch": 13.59, - "eval_runtime": 0.7331, - "eval_samples_per_second": 1364.004, - "eval_steps_per_second": 21.824, - "step": 890000 - }, - { - "epoch": 13.61, - "learning_rate": 1.4498484122598232e-05, - "loss": 0.2308, - "step": 891000 - }, - { - "epoch": 13.62, - "learning_rate": 1.4417197081242083e-05, - "loss": 0.2305, - "step": 892000 - }, - { - "epoch": 13.64, - "learning_rate": 1.433662724291136e-05, - "loss": 0.2313, - "step": 893000 - }, - { - "epoch": 13.65, - "learning_rate": 1.4256775488704904e-05, - "loss": 0.2311, - "step": 894000 - }, - { - "epoch": 13.67, - "learning_rate": 1.4177642691868717e-05, - "loss": 0.231, - "step": 895000 - }, - { - "epoch": 13.67, - "eval_runtime": 0.7236, - "eval_samples_per_second": 1382.053, - "eval_steps_per_second": 22.113, - "step": 895000 - }, - { - "epoch": 13.68, - "learning_rate": 1.4099229717786368e-05, - "loss": 0.231, - "step": 896000 - }, - { - "epoch": 13.7, - "learning_rate": 1.4021537423969588e-05, - "loss": 0.2317, - "step": 897000 - }, - { - "epoch": 13.71, - "learning_rate": 1.3944566660048863e-05, - "loss": 0.2308, - "step": 898000 - }, - { - "epoch": 13.73, - "learning_rate": 1.3868318267764128e-05, - "loss": 0.2309, - "step": 899000 - }, - { - "epoch": 13.74, - "learning_rate": 1.3792793080955574e-05, - "loss": 0.2308, - "step": 900000 - }, - { - "epoch": 13.74, - "eval_runtime": 0.7542, - "eval_samples_per_second": 1325.982, - "eval_steps_per_second": 21.216, - "step": 900000 - }, - { - "epoch": 13.76, - "learning_rate": 1.3717991925554562e-05, - "loss": 0.2308, - "step": 901000 - }, - { - "epoch": 13.77, - "learning_rate": 1.3643915619574529e-05, - "loss": 0.2305, - "step": 902000 - }, - { - "epoch": 13.79, - "learning_rate": 1.35705649731021e-05, - "loss": 0.2304, - "step": 903000 - }, - { - "epoch": 13.8, - "learning_rate": 1.3497940788288195e-05, - "loss": 0.2301, - "step": 904000 - }, - { - "epoch": 13.82, - "learning_rate": 1.3426043859339253e-05, - "loss": 0.2304, - "step": 905000 - }, - { - "epoch": 13.82, - "eval_runtime": 0.9802, - "eval_samples_per_second": 1020.243, - "eval_steps_per_second": 16.324, - "step": 905000 - }, - { - "epoch": 13.83, - "learning_rate": 1.3354874972508582e-05, - "loss": 0.2302, - "step": 906000 - }, - { - "epoch": 13.85, - "learning_rate": 1.3284434906087695e-05, - "loss": 0.2303, - "step": 907000 - }, - { - "epoch": 13.87, - "learning_rate": 1.3214724430397915e-05, - "loss": 0.2304, - "step": 908000 - }, - { - "epoch": 13.88, - "learning_rate": 1.314574430778182e-05, - "loss": 0.2302, - "step": 909000 - }, - { - "epoch": 13.9, - "learning_rate": 1.3077495292594966e-05, - "loss": 0.2305, - "step": 910000 - }, - { - "epoch": 13.9, - "eval_runtime": 0.7262, - "eval_samples_per_second": 1377.03, - "eval_steps_per_second": 22.032, - "step": 910000 - }, - { - "epoch": 13.91, - "learning_rate": 1.3009978131197669e-05, - "loss": 0.2314, - "step": 911000 - }, - { - "epoch": 13.93, - "learning_rate": 1.2943193561946762e-05, - "loss": 0.2304, - "step": 912000 - }, - { - "epoch": 13.94, - "learning_rate": 1.2877142315187628e-05, - "loss": 0.2299, - "step": 913000 - }, - { - "epoch": 13.96, - "learning_rate": 1.28118251132461e-05, - "loss": 0.23, - "step": 914000 - }, - { - "epoch": 13.97, - "learning_rate": 1.274724267042063e-05, - "loss": 0.2299, - "step": 915000 - }, - { - "epoch": 13.97, - "eval_runtime": 0.795, - "eval_samples_per_second": 1257.794, - "eval_steps_per_second": 20.125, - "step": 915000 - }, - { - "epoch": 13.99, - "learning_rate": 1.2683395692974472e-05, - "loss": 0.23, - "step": 916000 - }, - { - "epoch": 14.0, - "learning_rate": 1.2620284879127947e-05, - "loss": 0.23, - "step": 917000 - }, - { - "epoch": 14.02, - "learning_rate": 1.2557910919050803e-05, - "loss": 0.2295, - "step": 918000 - }, - { - "epoch": 14.03, - "learning_rate": 1.2496274494854666e-05, - "loss": 0.2296, - "step": 919000 - }, - { - "epoch": 14.05, - "learning_rate": 1.24353762805856e-05, - "loss": 0.2297, - "step": 920000 - }, - { - "epoch": 14.05, - "eval_runtime": 0.7692, - "eval_samples_per_second": 1300.053, - "eval_steps_per_second": 20.801, - "step": 920000 - }, - { - "epoch": 14.06, - "learning_rate": 1.2375216942216713e-05, - "loss": 0.2306, - "step": 921000 - }, - { - "epoch": 14.08, - "learning_rate": 1.2315797137640906e-05, - "loss": 0.2298, - "step": 922000 - }, - { - "epoch": 14.09, - "learning_rate": 1.225711751666363e-05, - "loss": 0.2295, - "step": 923000 - }, - { - "epoch": 14.11, - "learning_rate": 1.2199178720995825e-05, - "loss": 0.2299, - "step": 924000 - }, - { - "epoch": 14.12, - "learning_rate": 1.2141981384246874e-05, - "loss": 0.23, - "step": 925000 - }, - { - "epoch": 14.12, - "eval_runtime": 0.827, - "eval_samples_per_second": 1209.23, - "eval_steps_per_second": 19.348, - "step": 925000 - }, - { - "epoch": 14.14, - "learning_rate": 1.2085526131917685e-05, - "loss": 0.2294, - "step": 926000 - }, - { - "epoch": 14.16, - "learning_rate": 1.2029813581393866e-05, - "loss": 0.2289, - "step": 927000 - }, - { - "epoch": 14.17, - "learning_rate": 1.197484434193893e-05, - "loss": 0.2295, - "step": 928000 - }, - { - "epoch": 14.19, - "learning_rate": 1.192061901468768e-05, - "loss": 0.2293, - "step": 929000 - }, - { - "epoch": 14.2, - "learning_rate": 1.1867138192639601e-05, - "loss": 0.2293, - "step": 930000 - }, - { - "epoch": 14.2, - "eval_runtime": 0.9644, - "eval_samples_per_second": 1036.936, - "eval_steps_per_second": 16.591, - "step": 930000 - }, - { - "epoch": 14.22, - "learning_rate": 1.1814402460652382e-05, - "loss": 0.2291, - "step": 931000 - }, - { - "epoch": 14.23, - "learning_rate": 1.176241239543558e-05, - "loss": 0.229, - "step": 932000 - }, - { - "epoch": 14.25, - "learning_rate": 1.171116856554418e-05, - "loss": 0.2291, - "step": 933000 - }, - { - "epoch": 14.26, - "learning_rate": 1.1660671531372517e-05, - "loss": 0.2301, - "step": 934000 - }, - { - "epoch": 14.28, - "learning_rate": 1.1610921845148052e-05, - "loss": 0.2295, - "step": 935000 - }, - { - "epoch": 14.28, - "eval_runtime": 0.8534, - "eval_samples_per_second": 1171.832, - "eval_steps_per_second": 18.749, - "step": 935000 - }, - { - "epoch": 14.29, - "learning_rate": 1.156192005092539e-05, - "loss": 0.2301, - "step": 936000 - }, - { - "epoch": 14.31, - "learning_rate": 1.1513666684580308e-05, - "loss": 0.2291, - "step": 937000 - }, - { - "epoch": 14.32, - "learning_rate": 1.1466162273803876e-05, - "loss": 0.2292, - "step": 938000 - }, - { - "epoch": 14.34, - "learning_rate": 1.1419407338096732e-05, - "loss": 0.2287, - "step": 939000 - }, - { - "epoch": 14.35, - "learning_rate": 1.1373402388763346e-05, - "loss": 0.2286, - "step": 940000 - }, - { - "epoch": 14.35, - "eval_runtime": 0.7875, - "eval_samples_per_second": 1269.803, - "eval_steps_per_second": 20.317, - "step": 940000 - }, - { - "epoch": 14.37, - "learning_rate": 1.1328147928906494e-05, - "loss": 0.2287, - "step": 941000 - }, - { - "epoch": 14.38, - "learning_rate": 1.1283644453421678e-05, - "loss": 0.2289, - "step": 942000 - }, - { - "epoch": 14.4, - "learning_rate": 1.1239892448991798e-05, - "loss": 0.2284, - "step": 943000 - }, - { - "epoch": 14.42, - "learning_rate": 1.1196892394081743e-05, - "loss": 0.2287, - "step": 944000 - }, - { - "epoch": 14.43, - "learning_rate": 1.1154644758933235e-05, - "loss": 0.2285, - "step": 945000 - }, - { - "epoch": 14.43, - "eval_runtime": 0.7294, - "eval_samples_per_second": 1370.909, - "eval_steps_per_second": 21.935, - "step": 945000 - }, - { - "epoch": 14.45, - "learning_rate": 1.1113150005559644e-05, - "loss": 0.2283, - "step": 946000 - }, - { - "epoch": 14.46, - "learning_rate": 1.1072408587740942e-05, - "loss": 0.2282, - "step": 947000 - }, - { - "epoch": 14.48, - "learning_rate": 1.1032420951018755e-05, - "loss": 0.228, - "step": 948000 - }, - { - "epoch": 14.49, - "learning_rate": 1.0993187532691458e-05, - "loss": 0.2281, - "step": 949000 - }, - { - "epoch": 14.51, - "learning_rate": 1.0954708761809438e-05, - "loss": 0.2281, - "step": 950000 - }, - { - "epoch": 14.51, - "eval_runtime": 0.7692, - "eval_samples_per_second": 1300.007, - "eval_steps_per_second": 20.8, - "step": 950000 - }, - { - "epoch": 14.52, - "learning_rate": 1.091698505917036e-05, - "loss": 0.2281, - "step": 951000 - }, - { - "epoch": 14.54, - "learning_rate": 1.0880016837314599e-05, - "loss": 0.2283, - "step": 952000 - }, - { - "epoch": 14.55, - "learning_rate": 1.084380450052071e-05, - "loss": 0.2281, - "step": 953000 - }, - { - "epoch": 14.57, - "learning_rate": 1.0808348444801e-05, - "loss": 0.2278, - "step": 954000 - }, - { - "epoch": 14.58, - "learning_rate": 1.0773649057897206e-05, - "loss": 0.2283, - "step": 955000 - }, - { - "epoch": 14.58, - "eval_runtime": 0.7689, - "eval_samples_per_second": 1300.511, - "eval_steps_per_second": 20.808, - "step": 955000 - }, - { - "epoch": 14.6, - "learning_rate": 1.073970671927628e-05, - "loss": 0.2277, - "step": 956000 - }, - { - "epoch": 14.61, - "learning_rate": 1.0706521800126198e-05, - "loss": 0.2279, - "step": 957000 - }, - { - "epoch": 14.63, - "learning_rate": 1.0674094663351906e-05, - "loss": 0.2278, - "step": 958000 - }, - { - "epoch": 14.64, - "learning_rate": 1.0642425663571383e-05, - "loss": 0.2279, - "step": 959000 - }, - { - "epoch": 14.66, - "learning_rate": 1.0611515147111736e-05, - "loss": 0.2279, - "step": 960000 - }, - { - "epoch": 14.66, - "eval_runtime": 0.8148, - "eval_samples_per_second": 1227.238, - "eval_steps_per_second": 19.636, - "step": 960000 - }, - { - "epoch": 14.67, - "learning_rate": 1.0581363452005424e-05, - "loss": 0.2279, - "step": 961000 - }, - { - "epoch": 14.69, - "learning_rate": 1.0551970907986557e-05, - "loss": 0.2277, - "step": 962000 - }, - { - "epoch": 14.71, - "learning_rate": 1.0523337836487271e-05, - "loss": 0.2276, - "step": 963000 - }, - { - "epoch": 14.72, - "learning_rate": 1.0495464550634267e-05, - "loss": 0.2278, - "step": 964000 - }, - { - "epoch": 14.74, - "learning_rate": 1.046835135524533e-05, - "loss": 0.2277, - "step": 965000 - }, - { - "epoch": 14.74, - "eval_runtime": 0.7884, - "eval_samples_per_second": 1268.404, - "eval_steps_per_second": 20.294, - "step": 965000 - }, - { - "epoch": 14.75, - "learning_rate": 1.044199854682601e-05, - "loss": 0.2278, - "step": 966000 - }, - { - "epoch": 14.77, - "learning_rate": 1.0416406413566414e-05, - "loss": 0.2279, - "step": 967000 - }, - { - "epoch": 14.78, - "learning_rate": 1.0391575235337991e-05, - "loss": 0.2278, - "step": 968000 - }, - { - "epoch": 14.8, - "learning_rate": 1.0367505283690547e-05, - "loss": 0.2276, - "step": 969000 - }, - { - "epoch": 14.81, - "learning_rate": 1.0344196821849202e-05, - "loss": 0.2279, - "step": 970000 - }, - { - "epoch": 14.81, - "eval_runtime": 0.7534, - "eval_samples_per_second": 1327.252, - "eval_steps_per_second": 21.236, - "step": 970000 - }, - { - "epoch": 14.83, - "learning_rate": 1.032165010471157e-05, - "loss": 0.2277, - "step": 971000 - }, - { - "epoch": 14.84, - "learning_rate": 1.0299865378844936e-05, - "loss": 0.2275, - "step": 972000 - }, - { - "epoch": 14.86, - "learning_rate": 1.0278842882483569e-05, - "loss": 0.2275, - "step": 973000 - }, - { - "epoch": 14.87, - "learning_rate": 1.025858284552612e-05, - "loss": 0.2276, - "step": 974000 - }, - { - "epoch": 14.89, - "learning_rate": 1.023908548953311e-05, - "loss": 0.2275, - "step": 975000 - }, - { - "epoch": 14.89, - "eval_runtime": 0.7861, - "eval_samples_per_second": 1272.066, - "eval_steps_per_second": 20.353, - "step": 975000 - }, - { - "epoch": 14.9, - "learning_rate": 1.02203510277245e-05, - "loss": 0.2276, - "step": 976000 - }, - { - "epoch": 14.92, - "learning_rate": 1.0202379664977364e-05, - "loss": 0.2272, - "step": 977000 - }, - { - "epoch": 14.93, - "learning_rate": 1.018517159782365e-05, - "loss": 0.2274, - "step": 978000 - }, - { - "epoch": 14.95, - "learning_rate": 1.0168727014448004e-05, - "loss": 0.2272, - "step": 979000 - }, - { - "epoch": 14.96, - "learning_rate": 1.0153046094685783e-05, - "loss": 0.227, - "step": 980000 - }, - { - "epoch": 14.96, - "eval_runtime": 0.7489, - "eval_samples_per_second": 1335.226, - "eval_steps_per_second": 21.364, - "step": 980000 - }, - { - "epoch": 14.98, - "learning_rate": 1.0138129010020992e-05, - "loss": 0.2272, - "step": 981000 - }, - { - "epoch": 15.0, - "learning_rate": 1.0123975923584488e-05, - "loss": 0.2273, - "step": 982000 - }, - { - "epoch": 15.01, - "learning_rate": 1.0110586990152152e-05, - "loss": 0.227, - "step": 983000 - }, - { - "epoch": 15.03, - "learning_rate": 1.0097962356143219e-05, - "loss": 0.2273, - "step": 984000 - }, - { - "epoch": 15.04, - "learning_rate": 1.0086102159618668e-05, - "loss": 0.227, - "step": 985000 - }, - { - "epoch": 15.04, - "eval_runtime": 0.7868, - "eval_samples_per_second": 1271.022, - "eval_steps_per_second": 20.336, - "step": 985000 - }, - { - "epoch": 15.06, - "learning_rate": 1.0075006530279694e-05, - "loss": 0.2271, - "step": 986000 - }, - { - "epoch": 15.07, - "learning_rate": 1.0064675589466339e-05, - "loss": 0.2268, - "step": 987000 - }, - { - "epoch": 15.09, - "learning_rate": 1.0055109450156098e-05, - "loss": 0.2272, - "step": 988000 - }, - { - "epoch": 15.1, - "learning_rate": 1.0046308216962759e-05, - "loss": 0.2269, - "step": 989000 - }, - { - "epoch": 15.12, - "learning_rate": 1.0038271986135177e-05, - "loss": 0.2272, - "step": 990000 - }, - { - "epoch": 15.12, - "eval_runtime": 0.7713, - "eval_samples_per_second": 1296.438, - "eval_steps_per_second": 20.743, - "step": 990000 - }, - { - "epoch": 15.13, - "learning_rate": 1.0031000845556304e-05, - "loss": 0.2272, - "step": 991000 - }, - { - "epoch": 15.15, - "learning_rate": 1.0024494874742152e-05, - "loss": 0.2272, - "step": 992000 - }, - { - "epoch": 15.16, - "learning_rate": 1.0018754144840986e-05, - "loss": 0.2272, - "step": 993000 - }, - { - "epoch": 15.18, - "learning_rate": 1.0013778718632507e-05, - "loss": 0.227, - "step": 994000 - }, - { - "epoch": 15.19, - "learning_rate": 1.000956865052717e-05, - "loss": 0.2269, - "step": 995000 - }, - { - "epoch": 15.19, - "eval_runtime": 0.7194, - "eval_samples_per_second": 1390.055, - "eval_steps_per_second": 22.241, - "step": 995000 - }, - { - "epoch": 15.21, - "learning_rate": 1.0006123986565623e-05, - "loss": 0.2267, - "step": 996000 - }, - { - "epoch": 15.22, - "learning_rate": 1.0003444764418138e-05, - "loss": 0.2265, - "step": 997000 - }, - { - "epoch": 15.24, - "learning_rate": 1.000153101338428e-05, - "loss": 0.2268, - "step": 998000 - }, - { - "epoch": 15.25, - "learning_rate": 1.00003827543925e-05, - "loss": 0.2269, - "step": 999000 - }, - { - "epoch": 15.27, - "learning_rate": 1e-05, - "loss": 0.2268, - "step": 1000000 - }, - { - "epoch": 15.27, - "eval_runtime": 0.8245, - "eval_samples_per_second": 1212.903, - "eval_steps_per_second": 19.406, - "step": 1000000 - } - ], - "max_steps": 1000000, - "num_train_epochs": 16, - "total_flos": 7.010016247012483e+22, - "trial_name": null, - "trial_params": null -}