diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,111 +1,4391 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 9.997955010224949, - "global_step": 610, + "epoch": 499.9979550102249, + "global_step": 30500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, - "eval_loss": 39.944950103759766, - "eval_runtime": 1.5442, - "eval_samples_per_second": 79.651, - "eval_steps_per_second": 10.361, + "eval_loss": 39.79280090332031, + "eval_runtime": 1.532, + "eval_samples_per_second": 79.634, + "eval_steps_per_second": 10.444, "step": 61 }, { "epoch": 2.0, - "eval_loss": 39.573768615722656, - "eval_runtime": 1.5441, - "eval_samples_per_second": 79.656, - "eval_steps_per_second": 10.362, + "eval_loss": 39.81950378417969, + "eval_runtime": 1.5354, + "eval_samples_per_second": 79.461, + "eval_steps_per_second": 10.421, "step": 122 }, { "epoch": 3.0, - "eval_loss": 40.006507873535156, - "eval_runtime": 1.5451, - "eval_samples_per_second": 79.606, - "eval_steps_per_second": 10.355, + "eval_loss": 39.82277297973633, + "eval_runtime": 1.532, + "eval_samples_per_second": 79.635, + "eval_steps_per_second": 10.444, "step": 183 }, { "epoch": 4.0, - "eval_loss": 39.271480560302734, - "eval_runtime": 1.5464, - "eval_samples_per_second": 79.54, - "eval_steps_per_second": 10.347, + "eval_loss": 39.07925796508789, + "eval_runtime": 1.5323, + "eval_samples_per_second": 79.619, + "eval_steps_per_second": 10.442, "step": 244 }, { "epoch": 5.0, - "eval_loss": 38.749183654785156, - "eval_runtime": 1.5468, - "eval_samples_per_second": 79.518, - "eval_steps_per_second": 10.344, + "eval_loss": 38.66284942626953, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.671, + "eval_steps_per_second": 10.449, "step": 305 }, { "epoch": 6.0, - "eval_loss": 38.856666564941406, - "eval_runtime": 1.5467, - "eval_samples_per_second": 79.525, - "eval_steps_per_second": 10.345, + "eval_loss": 37.40141296386719, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.615, + "eval_steps_per_second": 10.441, "step": 366 }, { "epoch": 7.0, - "eval_loss": 38.75959396362305, - "eval_runtime": 1.5455, - "eval_samples_per_second": 79.588, - "eval_steps_per_second": 10.353, + "eval_loss": 36.820281982421875, + "eval_runtime": 1.5357, + "eval_samples_per_second": 79.443, + "eval_steps_per_second": 10.419, "step": 427 }, { "epoch": 8.0, - "eval_loss": 38.665225982666016, - "eval_runtime": 1.5457, - "eval_samples_per_second": 79.578, - "eval_steps_per_second": 10.352, + "eval_loss": 36.260650634765625, + "eval_runtime": 1.5321, + "eval_samples_per_second": 79.631, + "eval_steps_per_second": 10.443, "step": 488 }, { "epoch": 8.2, - "learning_rate": 9.01639344262295e-09, - "loss": 42.1342, + "learning_rate": 4.918032786885246e-08, + "loss": 41.0436, "step": 500 }, { "epoch": 9.0, - "eval_loss": 38.66032028198242, - "eval_runtime": 1.551, - "eval_samples_per_second": 79.304, - "eval_steps_per_second": 10.316, + "eval_loss": 35.931610107421875, + "eval_runtime": 1.5478, + "eval_samples_per_second": 78.82, + "eval_steps_per_second": 10.337, "step": 549 }, { "epoch": 10.0, - "eval_loss": 38.66507339477539, - "eval_runtime": 1.5562, - "eval_samples_per_second": 79.038, - "eval_steps_per_second": 10.281, + "eval_loss": 34.32823181152344, + "eval_runtime": 1.5396, + "eval_samples_per_second": 79.242, + "eval_steps_per_second": 10.392, "step": 610 }, { - "epoch": 10.0, - "step": 610, - "total_flos": 322666370343936.0, - "train_loss": 41.80477074795082, - "train_runtime": 454.8629, - "train_samples_per_second": 10.75, - "train_steps_per_second": 1.341 + "epoch": 11.0, + "eval_loss": 33.36042404174805, + "eval_runtime": 1.5352, + "eval_samples_per_second": 79.469, + "eval_steps_per_second": 10.422, + "step": 671 + }, + { + "epoch": 12.0, + "eval_loss": 32.44695281982422, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.623, + "eval_steps_per_second": 10.442, + "step": 732 + }, + { + "epoch": 13.0, + "eval_loss": 31.54026985168457, + "eval_runtime": 1.5308, + "eval_samples_per_second": 79.698, + "eval_steps_per_second": 10.452, + "step": 793 + }, + { + "epoch": 14.0, + "eval_loss": 30.966489791870117, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.689, + "eval_steps_per_second": 10.451, + "step": 854 + }, + { + "epoch": 15.0, + "eval_loss": 29.417085647583008, + "eval_runtime": 1.5409, + "eval_samples_per_second": 79.175, + "eval_steps_per_second": 10.384, + "step": 915 + }, + { + "epoch": 16.0, + "eval_loss": 28.458908081054688, + "eval_runtime": 1.5307, + "eval_samples_per_second": 79.7, + "eval_steps_per_second": 10.452, + "step": 976 + }, + { + "epoch": 16.39, + "learning_rate": 4.8360655737704914e-08, + "loss": 32.5506, + "step": 1000 + }, + { + "epoch": 17.0, + "eval_loss": 27.26991081237793, + "eval_runtime": 1.5386, + "eval_samples_per_second": 79.293, + "eval_steps_per_second": 10.399, + "step": 1037 + }, + { + "epoch": 18.0, + "eval_loss": 26.061372756958008, + "eval_runtime": 1.5472, + "eval_samples_per_second": 78.852, + "eval_steps_per_second": 10.341, + "step": 1098 + }, + { + "epoch": 19.0, + "eval_loss": 25.152942657470703, + "eval_runtime": 1.5408, + "eval_samples_per_second": 79.178, + "eval_steps_per_second": 10.384, + "step": 1159 + }, + { + "epoch": 20.0, + "eval_loss": 24.308048248291016, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.616, + "eval_steps_per_second": 10.441, + "step": 1220 + }, + { + "epoch": 21.0, + "eval_loss": 23.15108871459961, + "eval_runtime": 1.5344, + "eval_samples_per_second": 79.509, + "eval_steps_per_second": 10.427, + "step": 1281 + }, + { + "epoch": 22.0, + "eval_loss": 22.333181381225586, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.691, + "eval_steps_per_second": 10.451, + "step": 1342 + }, + { + "epoch": 23.0, + "eval_loss": 21.71536636352539, + "eval_runtime": 1.5304, + "eval_samples_per_second": 79.718, + "eval_steps_per_second": 10.455, + "step": 1403 + }, + { + "epoch": 24.0, + "eval_loss": 20.736703872680664, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.67, + "eval_steps_per_second": 10.448, + "step": 1464 + }, + { + "epoch": 24.59, + "learning_rate": 4.7540983606557375e-08, + "loss": 24.3212, + "step": 1500 + }, + { + "epoch": 25.0, + "eval_loss": 20.180402755737305, + "eval_runtime": 1.5364, + "eval_samples_per_second": 79.404, + "eval_steps_per_second": 10.414, + "step": 1525 + }, + { + "epoch": 26.0, + "eval_loss": 19.559112548828125, + "eval_runtime": 1.5443, + "eval_samples_per_second": 79.002, + "eval_steps_per_second": 10.361, + "step": 1586 + }, + { + "epoch": 27.0, + "eval_loss": 18.87604522705078, + "eval_runtime": 1.5495, + "eval_samples_per_second": 78.733, + "eval_steps_per_second": 10.326, + "step": 1647 + }, + { + "epoch": 28.0, + "eval_loss": 18.329124450683594, + "eval_runtime": 1.5379, + "eval_samples_per_second": 79.329, + "eval_steps_per_second": 10.404, + "step": 1708 + }, + { + "epoch": 29.0, + "eval_loss": 18.029600143432617, + "eval_runtime": 1.5333, + "eval_samples_per_second": 79.569, + "eval_steps_per_second": 10.435, + "step": 1769 + }, + { + "epoch": 30.0, + "eval_loss": 17.40913200378418, + "eval_runtime": 1.5337, + "eval_samples_per_second": 79.546, + "eval_steps_per_second": 10.432, + "step": 1830 + }, + { + "epoch": 31.0, + "eval_loss": 17.215015411376953, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.552, + "eval_steps_per_second": 10.433, + "step": 1891 + }, + { + "epoch": 32.0, + "eval_loss": 16.648334503173828, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.692, + "eval_steps_per_second": 10.451, + "step": 1952 + }, + { + "epoch": 32.79, + "learning_rate": 4.6721311475409836e-08, + "loss": 18.5662, + "step": 2000 + }, + { + "epoch": 33.0, + "eval_loss": 16.269676208496094, + "eval_runtime": 1.5361, + "eval_samples_per_second": 79.421, + "eval_steps_per_second": 10.416, + "step": 2013 + }, + { + "epoch": 34.0, + "eval_loss": 16.17144775390625, + "eval_runtime": 1.5342, + "eval_samples_per_second": 79.519, + "eval_steps_per_second": 10.429, + "step": 2074 + }, + { + "epoch": 35.0, + "eval_loss": 15.8008394241333, + "eval_runtime": 1.538, + "eval_samples_per_second": 79.323, + "eval_steps_per_second": 10.403, + "step": 2135 + }, + { + "epoch": 36.0, + "eval_loss": 15.264719009399414, + "eval_runtime": 1.5357, + "eval_samples_per_second": 79.444, + "eval_steps_per_second": 10.419, + "step": 2196 + }, + { + "epoch": 37.0, + "eval_loss": 15.41949462890625, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.387, + "eval_steps_per_second": 10.411, + "step": 2257 + }, + { + "epoch": 38.0, + "eval_loss": 14.989760398864746, + "eval_runtime": 1.5351, + "eval_samples_per_second": 79.476, + "eval_steps_per_second": 10.423, + "step": 2318 + }, + { + "epoch": 39.0, + "eval_loss": 14.859911918640137, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.676, + "eval_steps_per_second": 10.449, + "step": 2379 + }, + { + "epoch": 40.0, + "eval_loss": 14.611943244934082, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.677, + "eval_steps_per_second": 10.449, + "step": 2440 + }, + { + "epoch": 40.98, + "learning_rate": 4.590163934426229e-08, + "loss": 15.1141, + "step": 2500 + }, + { + "epoch": 41.0, + "eval_loss": 14.464351654052734, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.682, + "eval_steps_per_second": 10.45, + "step": 2501 + }, + { + "epoch": 42.0, + "eval_loss": 14.339770317077637, + "eval_runtime": 1.5431, + "eval_samples_per_second": 79.064, + "eval_steps_per_second": 10.369, + "step": 2562 + }, + { + "epoch": 43.0, + "eval_loss": 14.410659790039062, + "eval_runtime": 1.5584, + "eval_samples_per_second": 78.285, + "eval_steps_per_second": 10.267, + "step": 2623 + }, + { + "epoch": 44.0, + "eval_loss": 13.903946876525879, + "eval_runtime": 1.5462, + "eval_samples_per_second": 78.902, + "eval_steps_per_second": 10.348, + "step": 2684 + }, + { + "epoch": 45.0, + "eval_loss": 14.004327774047852, + "eval_runtime": 1.5337, + "eval_samples_per_second": 79.545, + "eval_steps_per_second": 10.432, + "step": 2745 + }, + { + "epoch": 46.0, + "eval_loss": 13.953380584716797, + "eval_runtime": 1.5354, + "eval_samples_per_second": 79.459, + "eval_steps_per_second": 10.421, + "step": 2806 + }, + { + "epoch": 47.0, + "eval_loss": 13.926652908325195, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.552, + "eval_steps_per_second": 10.433, + "step": 2867 + }, + { + "epoch": 48.0, + "eval_loss": 13.588834762573242, + "eval_runtime": 1.563, + "eval_samples_per_second": 78.054, + "eval_steps_per_second": 10.237, + "step": 2928 + }, + { + "epoch": 49.0, + "eval_loss": 13.63134479522705, + "eval_runtime": 1.5341, + "eval_samples_per_second": 79.524, + "eval_steps_per_second": 10.429, + "step": 2989 + }, + { + "epoch": 49.18, + "learning_rate": 4.508196721311475e-08, + "loss": 13.182, + "step": 3000 + }, + { + "epoch": 50.0, + "eval_loss": 13.540535926818848, + "eval_runtime": 1.5436, + "eval_samples_per_second": 79.038, + "eval_steps_per_second": 10.366, + "step": 3050 + }, + { + "epoch": 51.0, + "eval_loss": 13.337821960449219, + "eval_runtime": 1.5382, + "eval_samples_per_second": 79.316, + "eval_steps_per_second": 10.402, + "step": 3111 + }, + { + "epoch": 52.0, + "eval_loss": 13.316015243530273, + "eval_runtime": 1.5353, + "eval_samples_per_second": 79.465, + "eval_steps_per_second": 10.422, + "step": 3172 + }, + { + "epoch": 53.0, + "eval_loss": 13.135573387145996, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.692, + "eval_steps_per_second": 10.451, + "step": 3233 + }, + { + "epoch": 54.0, + "eval_loss": 13.248279571533203, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.688, + "eval_steps_per_second": 10.451, + "step": 3294 + }, + { + "epoch": 55.0, + "eval_loss": 13.125914573669434, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.693, + "eval_steps_per_second": 10.452, + "step": 3355 + }, + { + "epoch": 56.0, + "eval_loss": 13.177481651306152, + "eval_runtime": 1.5387, + "eval_samples_per_second": 79.286, + "eval_steps_per_second": 10.398, + "step": 3416 + }, + { + "epoch": 57.0, + "eval_loss": 13.111815452575684, + "eval_runtime": 1.5355, + "eval_samples_per_second": 79.455, + "eval_steps_per_second": 10.42, + "step": 3477 + }, + { + "epoch": 57.38, + "learning_rate": 4.426229508196721e-08, + "loss": 12.1712, + "step": 3500 + }, + { + "epoch": 58.0, + "eval_loss": 12.936331748962402, + "eval_runtime": 1.5524, + "eval_samples_per_second": 78.59, + "eval_steps_per_second": 10.307, + "step": 3538 + }, + { + "epoch": 59.0, + "eval_loss": 12.876540184020996, + "eval_runtime": 1.5376, + "eval_samples_per_second": 79.345, + "eval_steps_per_second": 10.406, + "step": 3599 + }, + { + "epoch": 60.0, + "eval_loss": 12.79231071472168, + "eval_runtime": 1.537, + "eval_samples_per_second": 79.375, + "eval_steps_per_second": 10.41, + "step": 3660 + }, + { + "epoch": 61.0, + "eval_loss": 12.973188400268555, + "eval_runtime": 1.534, + "eval_samples_per_second": 79.532, + "eval_steps_per_second": 10.43, + "step": 3721 + }, + { + "epoch": 62.0, + "eval_loss": 12.860616683959961, + "eval_runtime": 1.5352, + "eval_samples_per_second": 79.468, + "eval_steps_per_second": 10.422, + "step": 3782 + }, + { + "epoch": 63.0, + "eval_loss": 12.789679527282715, + "eval_runtime": 1.5317, + "eval_samples_per_second": 79.652, + "eval_steps_per_second": 10.446, + "step": 3843 + }, + { + "epoch": 64.0, + "eval_loss": 12.651591300964355, + "eval_runtime": 1.5315, + "eval_samples_per_second": 79.662, + "eval_steps_per_second": 10.447, + "step": 3904 + }, + { + "epoch": 65.0, + "eval_loss": 12.624197959899902, + "eval_runtime": 1.5373, + "eval_samples_per_second": 79.36, + "eval_steps_per_second": 10.408, + "step": 3965 + }, + { + "epoch": 65.57, + "learning_rate": 4.344262295081967e-08, + "loss": 11.5853, + "step": 4000 + }, + { + "epoch": 66.0, + "eval_loss": 12.59536361694336, + "eval_runtime": 1.537, + "eval_samples_per_second": 79.375, + "eval_steps_per_second": 10.41, + "step": 4026 + }, + { + "epoch": 67.0, + "eval_loss": 12.450535774230957, + "eval_runtime": 1.5373, + "eval_samples_per_second": 79.357, + "eval_steps_per_second": 10.408, + "step": 4087 + }, + { + "epoch": 68.0, + "eval_loss": 12.559524536132812, + "eval_runtime": 1.5374, + "eval_samples_per_second": 79.356, + "eval_steps_per_second": 10.407, + "step": 4148 + }, + { + "epoch": 69.0, + "eval_loss": 12.428722381591797, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.665, + "eval_steps_per_second": 10.448, + "step": 4209 + }, + { + "epoch": 70.0, + "eval_loss": 12.476819038391113, + "eval_runtime": 1.5329, + "eval_samples_per_second": 79.59, + "eval_steps_per_second": 10.438, + "step": 4270 + }, + { + "epoch": 71.0, + "eval_loss": 12.400250434875488, + "eval_runtime": 1.5417, + "eval_samples_per_second": 79.132, + "eval_steps_per_second": 10.378, + "step": 4331 + }, + { + "epoch": 72.0, + "eval_loss": 12.432337760925293, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.387, + "eval_steps_per_second": 10.411, + "step": 4392 + }, + { + "epoch": 73.0, + "eval_loss": 12.41788101196289, + "eval_runtime": 1.534, + "eval_samples_per_second": 79.531, + "eval_steps_per_second": 10.43, + "step": 4453 + }, + { + "epoch": 73.77, + "learning_rate": 4.262295081967213e-08, + "loss": 11.25, + "step": 4500 + }, + { + "epoch": 74.0, + "eval_loss": 12.376703262329102, + "eval_runtime": 1.5362, + "eval_samples_per_second": 79.418, + "eval_steps_per_second": 10.415, + "step": 4514 + }, + { + "epoch": 75.0, + "eval_loss": 12.434186935424805, + "eval_runtime": 1.543, + "eval_samples_per_second": 79.065, + "eval_steps_per_second": 10.369, + "step": 4575 + }, + { + "epoch": 76.0, + "eval_loss": 12.256501197814941, + "eval_runtime": 1.5414, + "eval_samples_per_second": 79.148, + "eval_steps_per_second": 10.38, + "step": 4636 + }, + { + "epoch": 77.0, + "eval_loss": 12.312841415405273, + "eval_runtime": 1.5542, + "eval_samples_per_second": 78.498, + "eval_steps_per_second": 10.295, + "step": 4697 + }, + { + "epoch": 78.0, + "eval_loss": 12.374871253967285, + "eval_runtime": 1.5323, + "eval_samples_per_second": 79.62, + "eval_steps_per_second": 10.442, + "step": 4758 + }, + { + "epoch": 79.0, + "eval_loss": 12.356457710266113, + "eval_runtime": 1.5352, + "eval_samples_per_second": 79.468, + "eval_steps_per_second": 10.422, + "step": 4819 + }, + { + "epoch": 80.0, + "eval_loss": 12.328751564025879, + "eval_runtime": 1.5365, + "eval_samples_per_second": 79.4, + "eval_steps_per_second": 10.413, + "step": 4880 + }, + { + "epoch": 81.0, + "eval_loss": 12.201430320739746, + "eval_runtime": 1.5343, + "eval_samples_per_second": 79.517, + "eval_steps_per_second": 10.428, + "step": 4941 + }, + { + "epoch": 81.97, + "learning_rate": 4.180327868852459e-08, + "loss": 11.0038, + "step": 5000 + }, + { + "epoch": 82.0, + "eval_loss": 12.177206039428711, + "eval_runtime": 1.5376, + "eval_samples_per_second": 79.345, + "eval_steps_per_second": 10.406, + "step": 5002 + }, + { + "epoch": 83.0, + "eval_loss": 12.2689790725708, + "eval_runtime": 1.5376, + "eval_samples_per_second": 79.346, + "eval_steps_per_second": 10.406, + "step": 5063 + }, + { + "epoch": 84.0, + "eval_loss": 12.198723793029785, + "eval_runtime": 1.5387, + "eval_samples_per_second": 79.286, + "eval_steps_per_second": 10.398, + "step": 5124 + }, + { + "epoch": 85.0, + "eval_loss": 12.0653715133667, + "eval_runtime": 1.5397, + "eval_samples_per_second": 79.234, + "eval_steps_per_second": 10.391, + "step": 5185 + }, + { + "epoch": 86.0, + "eval_loss": 12.12946891784668, + "eval_runtime": 1.5325, + "eval_samples_per_second": 79.61, + "eval_steps_per_second": 10.441, + "step": 5246 + }, + { + "epoch": 87.0, + "eval_loss": 12.156049728393555, + "eval_runtime": 1.5372, + "eval_samples_per_second": 79.367, + "eval_steps_per_second": 10.409, + "step": 5307 + }, + { + "epoch": 88.0, + "eval_loss": 11.96617317199707, + "eval_runtime": 1.5352, + "eval_samples_per_second": 79.471, + "eval_steps_per_second": 10.422, + "step": 5368 + }, + { + "epoch": 89.0, + "eval_loss": 12.113394737243652, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.693, + "eval_steps_per_second": 10.451, + "step": 5429 + }, + { + "epoch": 90.0, + "eval_loss": 12.029434204101562, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.626, + "eval_steps_per_second": 10.443, + "step": 5490 + }, + { + "epoch": 90.16, + "learning_rate": 4.0983606557377046e-08, + "loss": 10.8283, + "step": 5500 + }, + { + "epoch": 91.0, + "eval_loss": 12.111231803894043, + "eval_runtime": 1.5402, + "eval_samples_per_second": 79.212, + "eval_steps_per_second": 10.388, + "step": 5551 + }, + { + "epoch": 92.0, + "eval_loss": 12.091044425964355, + "eval_runtime": 1.5374, + "eval_samples_per_second": 79.357, + "eval_steps_per_second": 10.407, + "step": 5612 + }, + { + "epoch": 93.0, + "eval_loss": 12.008685111999512, + "eval_runtime": 1.5385, + "eval_samples_per_second": 79.298, + "eval_steps_per_second": 10.4, + "step": 5673 + }, + { + "epoch": 94.0, + "eval_loss": 11.980273246765137, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.688, + "eval_steps_per_second": 10.451, + "step": 5734 + }, + { + "epoch": 95.0, + "eval_loss": 11.909398078918457, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.689, + "eval_steps_per_second": 10.451, + "step": 5795 + }, + { + "epoch": 96.0, + "eval_loss": 12.159173965454102, + "eval_runtime": 1.5365, + "eval_samples_per_second": 79.401, + "eval_steps_per_second": 10.413, + "step": 5856 + }, + { + "epoch": 97.0, + "eval_loss": 11.936081886291504, + "eval_runtime": 1.5323, + "eval_samples_per_second": 79.617, + "eval_steps_per_second": 10.442, + "step": 5917 + }, + { + "epoch": 98.0, + "eval_loss": 11.959556579589844, + "eval_runtime": 1.5396, + "eval_samples_per_second": 79.243, + "eval_steps_per_second": 10.393, + "step": 5978 + }, + { + "epoch": 98.36, + "learning_rate": 4.016393442622951e-08, + "loss": 10.693, + "step": 6000 + }, + { + "epoch": 99.0, + "eval_loss": 11.902626037597656, + "eval_runtime": 1.5386, + "eval_samples_per_second": 79.293, + "eval_steps_per_second": 10.399, + "step": 6039 + }, + { + "epoch": 100.0, + "eval_loss": 12.003995895385742, + "eval_runtime": 1.54, + "eval_samples_per_second": 79.222, + "eval_steps_per_second": 10.39, + "step": 6100 + }, + { + "epoch": 101.0, + "eval_loss": 11.886563301086426, + "eval_runtime": 1.5404, + "eval_samples_per_second": 79.201, + "eval_steps_per_second": 10.387, + "step": 6161 + }, + { + "epoch": 102.0, + "eval_loss": 11.953593254089355, + "eval_runtime": 1.5318, + "eval_samples_per_second": 79.643, + "eval_steps_per_second": 10.445, + "step": 6222 + }, + { + "epoch": 103.0, + "eval_loss": 11.803376197814941, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.678, + "eval_steps_per_second": 10.45, + "step": 6283 + }, + { + "epoch": 104.0, + "eval_loss": 11.688481330871582, + "eval_runtime": 1.5386, + "eval_samples_per_second": 79.291, + "eval_steps_per_second": 10.399, + "step": 6344 + }, + { + "epoch": 105.0, + "eval_loss": 11.850458145141602, + "eval_runtime": 1.5318, + "eval_samples_per_second": 79.647, + "eval_steps_per_second": 10.446, + "step": 6405 + }, + { + "epoch": 106.0, + "eval_loss": 11.828042984008789, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.677, + "eval_steps_per_second": 10.449, + "step": 6466 + }, + { + "epoch": 106.56, + "learning_rate": 3.934426229508196e-08, + "loss": 10.5875, + "step": 6500 + }, + { + "epoch": 107.0, + "eval_loss": 11.787361145019531, + "eval_runtime": 1.5414, + "eval_samples_per_second": 79.149, + "eval_steps_per_second": 10.38, + "step": 6527 + }, + { + "epoch": 108.0, + "eval_loss": 11.734838485717773, + "eval_runtime": 1.5374, + "eval_samples_per_second": 79.353, + "eval_steps_per_second": 10.407, + "step": 6588 + }, + { + "epoch": 109.0, + "eval_loss": 11.776476860046387, + "eval_runtime": 1.5397, + "eval_samples_per_second": 79.238, + "eval_steps_per_second": 10.392, + "step": 6649 + }, + { + "epoch": 110.0, + "eval_loss": 11.752671241760254, + "eval_runtime": 1.538, + "eval_samples_per_second": 79.323, + "eval_steps_per_second": 10.403, + "step": 6710 + }, + { + "epoch": 111.0, + "eval_loss": 11.681644439697266, + "eval_runtime": 1.5307, + "eval_samples_per_second": 79.7, + "eval_steps_per_second": 10.452, + "step": 6771 + }, + { + "epoch": 112.0, + "eval_loss": 11.739622116088867, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.674, + "eval_steps_per_second": 10.449, + "step": 6832 + }, + { + "epoch": 113.0, + "eval_loss": 11.647455215454102, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.625, + "eval_steps_per_second": 10.443, + "step": 6893 + }, + { + "epoch": 114.0, + "eval_loss": 11.70103645324707, + "eval_runtime": 1.5307, + "eval_samples_per_second": 79.702, + "eval_steps_per_second": 10.453, + "step": 6954 + }, + { + "epoch": 114.75, + "learning_rate": 3.852459016393442e-08, + "loss": 10.5114, + "step": 7000 + }, + { + "epoch": 115.0, + "eval_loss": 11.704916000366211, + "eval_runtime": 1.543, + "eval_samples_per_second": 79.064, + "eval_steps_per_second": 10.369, + "step": 7015 + }, + { + "epoch": 116.0, + "eval_loss": 11.796719551086426, + "eval_runtime": 1.5451, + "eval_samples_per_second": 78.957, + "eval_steps_per_second": 10.355, + "step": 7076 + }, + { + "epoch": 117.0, + "eval_loss": 11.724781036376953, + "eval_runtime": 1.5406, + "eval_samples_per_second": 79.189, + "eval_steps_per_second": 10.385, + "step": 7137 + }, + { + "epoch": 118.0, + "eval_loss": 11.654914855957031, + "eval_runtime": 1.5376, + "eval_samples_per_second": 79.345, + "eval_steps_per_second": 10.406, + "step": 7198 + }, + { + "epoch": 119.0, + "eval_loss": 11.519423484802246, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.641, + "eval_steps_per_second": 10.445, + "step": 7259 + }, + { + "epoch": 120.0, + "eval_loss": 11.692423820495605, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.691, + "eval_steps_per_second": 10.451, + "step": 7320 + }, + { + "epoch": 121.0, + "eval_loss": 11.519417762756348, + "eval_runtime": 1.5318, + "eval_samples_per_second": 79.643, + "eval_steps_per_second": 10.445, + "step": 7381 + }, + { + "epoch": 122.0, + "eval_loss": 11.660749435424805, + "eval_runtime": 1.5564, + "eval_samples_per_second": 78.387, + "eval_steps_per_second": 10.28, + "step": 7442 + }, + { + "epoch": 122.95, + "learning_rate": 3.770491803278688e-08, + "loss": 10.4791, + "step": 7500 + }, + { + "epoch": 123.0, + "eval_loss": 11.518941879272461, + "eval_runtime": 1.5358, + "eval_samples_per_second": 79.437, + "eval_steps_per_second": 10.418, + "step": 7503 + }, + { + "epoch": 124.0, + "eval_loss": 11.552470207214355, + "eval_runtime": 1.5427, + "eval_samples_per_second": 79.083, + "eval_steps_per_second": 10.372, + "step": 7564 + }, + { + "epoch": 125.0, + "eval_loss": 11.522570610046387, + "eval_runtime": 1.5376, + "eval_samples_per_second": 79.342, + "eval_steps_per_second": 10.406, + "step": 7625 + }, + { + "epoch": 126.0, + "eval_loss": 11.495443344116211, + "eval_runtime": 1.5392, + "eval_samples_per_second": 79.264, + "eval_steps_per_second": 10.395, + "step": 7686 + }, + { + "epoch": 127.0, + "eval_loss": 11.59436321258545, + "eval_runtime": 1.5335, + "eval_samples_per_second": 79.555, + "eval_steps_per_second": 10.433, + "step": 7747 + }, + { + "epoch": 128.0, + "eval_loss": 11.705589294433594, + "eval_runtime": 1.5364, + "eval_samples_per_second": 79.408, + "eval_steps_per_second": 10.414, + "step": 7808 + }, + { + "epoch": 129.0, + "eval_loss": 11.746439933776855, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.665, + "eval_steps_per_second": 10.448, + "step": 7869 + }, + { + "epoch": 130.0, + "eval_loss": 11.48509407043457, + "eval_runtime": 1.5344, + "eval_samples_per_second": 79.51, + "eval_steps_per_second": 10.428, + "step": 7930 + }, + { + "epoch": 131.0, + "eval_loss": 11.310612678527832, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.688, + "eval_steps_per_second": 10.451, + "step": 7991 + }, + { + "epoch": 131.15, + "learning_rate": 3.6885245901639346e-08, + "loss": 10.4223, + "step": 8000 + }, + { + "epoch": 132.0, + "eval_loss": 11.661527633666992, + "eval_runtime": 1.543, + "eval_samples_per_second": 79.067, + "eval_steps_per_second": 10.369, + "step": 8052 + }, + { + "epoch": 133.0, + "eval_loss": 11.512996673583984, + "eval_runtime": 1.5421, + "eval_samples_per_second": 79.112, + "eval_steps_per_second": 10.375, + "step": 8113 + }, + { + "epoch": 134.0, + "eval_loss": 11.586601257324219, + "eval_runtime": 1.5394, + "eval_samples_per_second": 79.249, + "eval_steps_per_second": 10.393, + "step": 8174 + }, + { + "epoch": 135.0, + "eval_loss": 11.499874114990234, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.679, + "eval_steps_per_second": 10.45, + "step": 8235 + }, + { + "epoch": 136.0, + "eval_loss": 11.573627471923828, + "eval_runtime": 1.5328, + "eval_samples_per_second": 79.594, + "eval_steps_per_second": 10.439, + "step": 8296 + }, + { + "epoch": 137.0, + "eval_loss": 11.610065460205078, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.611, + "eval_steps_per_second": 10.441, + "step": 8357 + }, + { + "epoch": 138.0, + "eval_loss": 11.601760864257812, + "eval_runtime": 1.5321, + "eval_samples_per_second": 79.629, + "eval_steps_per_second": 10.443, + "step": 8418 + }, + { + "epoch": 139.0, + "eval_loss": 11.506176948547363, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.672, + "eval_steps_per_second": 10.449, + "step": 8479 + }, + { + "epoch": 139.34, + "learning_rate": 3.60655737704918e-08, + "loss": 10.3725, + "step": 8500 + }, + { + "epoch": 140.0, + "eval_loss": 11.469504356384277, + "eval_runtime": 1.5361, + "eval_samples_per_second": 79.423, + "eval_steps_per_second": 10.416, + "step": 8540 + }, + { + "epoch": 141.0, + "eval_loss": 11.574456214904785, + "eval_runtime": 1.5371, + "eval_samples_per_second": 79.368, + "eval_steps_per_second": 10.409, + "step": 8601 + }, + { + "epoch": 142.0, + "eval_loss": 11.213080406188965, + "eval_runtime": 1.5468, + "eval_samples_per_second": 78.874, + "eval_steps_per_second": 10.344, + "step": 8662 + }, + { + "epoch": 143.0, + "eval_loss": 11.486066818237305, + "eval_runtime": 1.5321, + "eval_samples_per_second": 79.628, + "eval_steps_per_second": 10.443, + "step": 8723 + }, + { + "epoch": 144.0, + "eval_loss": 11.352083206176758, + "eval_runtime": 1.5374, + "eval_samples_per_second": 79.354, + "eval_steps_per_second": 10.407, + "step": 8784 + }, + { + "epoch": 145.0, + "eval_loss": 11.430201530456543, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.692, + "eval_steps_per_second": 10.451, + "step": 8845 + }, + { + "epoch": 146.0, + "eval_loss": 11.389131546020508, + "eval_runtime": 1.532, + "eval_samples_per_second": 79.634, + "eval_steps_per_second": 10.444, + "step": 8906 + }, + { + "epoch": 147.0, + "eval_loss": 11.494805335998535, + "eval_runtime": 1.5431, + "eval_samples_per_second": 79.064, + "eval_steps_per_second": 10.369, + "step": 8967 + }, + { + "epoch": 147.54, + "learning_rate": 3.524590163934426e-08, + "loss": 10.3579, + "step": 9000 + }, + { + "epoch": 148.0, + "eval_loss": 11.413972854614258, + "eval_runtime": 1.5681, + "eval_samples_per_second": 77.803, + "eval_steps_per_second": 10.204, + "step": 9028 + }, + { + "epoch": 149.0, + "eval_loss": 11.28091812133789, + "eval_runtime": 1.5378, + "eval_samples_per_second": 79.334, + "eval_steps_per_second": 10.404, + "step": 9089 + }, + { + "epoch": 150.0, + "eval_loss": 11.508408546447754, + "eval_runtime": 1.5385, + "eval_samples_per_second": 79.299, + "eval_steps_per_second": 10.4, + "step": 9150 + }, + { + "epoch": 151.0, + "eval_loss": 11.494176864624023, + "eval_runtime": 1.5357, + "eval_samples_per_second": 79.441, + "eval_steps_per_second": 10.418, + "step": 9211 + }, + { + "epoch": 152.0, + "eval_loss": 11.431586265563965, + "eval_runtime": 1.535, + "eval_samples_per_second": 79.477, + "eval_steps_per_second": 10.423, + "step": 9272 + }, + { + "epoch": 153.0, + "eval_loss": 11.505435943603516, + "eval_runtime": 1.5329, + "eval_samples_per_second": 79.587, + "eval_steps_per_second": 10.438, + "step": 9333 + }, + { + "epoch": 154.0, + "eval_loss": 11.435380935668945, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.671, + "eval_steps_per_second": 10.449, + "step": 9394 + }, + { + "epoch": 155.0, + "eval_loss": 11.370787620544434, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.677, + "eval_steps_per_second": 10.449, + "step": 9455 + }, + { + "epoch": 155.74, + "learning_rate": 3.442622950819672e-08, + "loss": 10.3411, + "step": 9500 + }, + { + "epoch": 156.0, + "eval_loss": 11.60168170928955, + "eval_runtime": 1.541, + "eval_samples_per_second": 79.168, + "eval_steps_per_second": 10.383, + "step": 9516 + }, + { + "epoch": 157.0, + "eval_loss": 11.441522598266602, + "eval_runtime": 1.5413, + "eval_samples_per_second": 79.155, + "eval_steps_per_second": 10.381, + "step": 9577 + }, + { + "epoch": 158.0, + "eval_loss": 11.614983558654785, + "eval_runtime": 1.5394, + "eval_samples_per_second": 79.253, + "eval_steps_per_second": 10.394, + "step": 9638 + }, + { + "epoch": 159.0, + "eval_loss": 11.37887191772461, + "eval_runtime": 1.536, + "eval_samples_per_second": 79.425, + "eval_steps_per_second": 10.416, + "step": 9699 + }, + { + "epoch": 160.0, + "eval_loss": 11.334174156188965, + "eval_runtime": 1.532, + "eval_samples_per_second": 79.637, + "eval_steps_per_second": 10.444, + "step": 9760 + }, + { + "epoch": 161.0, + "eval_loss": 11.476101875305176, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.684, + "eval_steps_per_second": 10.45, + "step": 9821 + }, + { + "epoch": 162.0, + "eval_loss": 11.300040245056152, + "eval_runtime": 1.5341, + "eval_samples_per_second": 79.525, + "eval_steps_per_second": 10.43, + "step": 9882 + }, + { + "epoch": 163.0, + "eval_loss": 11.410900115966797, + "eval_runtime": 1.534, + "eval_samples_per_second": 79.53, + "eval_steps_per_second": 10.43, + "step": 9943 + }, + { + "epoch": 163.93, + "learning_rate": 3.360655737704918e-08, + "loss": 10.3236, + "step": 10000 + }, + { + "epoch": 164.0, + "eval_loss": 11.425009727478027, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.612, + "eval_steps_per_second": 10.441, + "step": 10004 + }, + { + "epoch": 165.0, + "eval_loss": 11.32499885559082, + "eval_runtime": 1.5438, + "eval_samples_per_second": 79.024, + "eval_steps_per_second": 10.364, + "step": 10065 + }, + { + "epoch": 166.0, + "eval_loss": 11.42471981048584, + "eval_runtime": 1.5488, + "eval_samples_per_second": 78.773, + "eval_steps_per_second": 10.331, + "step": 10126 + }, + { + "epoch": 167.0, + "eval_loss": 11.16617488861084, + "eval_runtime": 1.5363, + "eval_samples_per_second": 79.414, + "eval_steps_per_second": 10.415, + "step": 10187 + }, + { + "epoch": 168.0, + "eval_loss": 11.44943904876709, + "eval_runtime": 1.5344, + "eval_samples_per_second": 79.507, + "eval_steps_per_second": 10.427, + "step": 10248 + }, + { + "epoch": 169.0, + "eval_loss": 11.387123107910156, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.551, + "eval_steps_per_second": 10.433, + "step": 10309 + }, + { + "epoch": 170.0, + "eval_loss": 11.296111106872559, + "eval_runtime": 1.5331, + "eval_samples_per_second": 79.578, + "eval_steps_per_second": 10.436, + "step": 10370 + }, + { + "epoch": 171.0, + "eval_loss": 11.357579231262207, + "eval_runtime": 1.5325, + "eval_samples_per_second": 79.607, + "eval_steps_per_second": 10.44, + "step": 10431 + }, + { + "epoch": 172.0, + "eval_loss": 11.435007095336914, + "eval_runtime": 1.5331, + "eval_samples_per_second": 79.577, + "eval_steps_per_second": 10.436, + "step": 10492 + }, + { + "epoch": 172.13, + "learning_rate": 3.278688524590163e-08, + "loss": 10.3059, + "step": 10500 + }, + { + "epoch": 173.0, + "eval_loss": 11.361226081848145, + "eval_runtime": 1.5571, + "eval_samples_per_second": 78.352, + "eval_steps_per_second": 10.276, + "step": 10553 + }, + { + "epoch": 174.0, + "eval_loss": 11.39792251586914, + "eval_runtime": 1.541, + "eval_samples_per_second": 79.167, + "eval_steps_per_second": 10.383, + "step": 10614 + }, + { + "epoch": 175.0, + "eval_loss": 11.371596336364746, + "eval_runtime": 1.5383, + "eval_samples_per_second": 79.306, + "eval_steps_per_second": 10.401, + "step": 10675 + }, + { + "epoch": 176.0, + "eval_loss": 11.40168571472168, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.669, + "eval_steps_per_second": 10.448, + "step": 10736 + }, + { + "epoch": 177.0, + "eval_loss": 11.534231185913086, + "eval_runtime": 1.533, + "eval_samples_per_second": 79.584, + "eval_steps_per_second": 10.437, + "step": 10797 + }, + { + "epoch": 178.0, + "eval_loss": 11.227432250976562, + "eval_runtime": 1.532, + "eval_samples_per_second": 79.634, + "eval_steps_per_second": 10.444, + "step": 10858 + }, + { + "epoch": 179.0, + "eval_loss": 11.432555198669434, + "eval_runtime": 1.5369, + "eval_samples_per_second": 79.379, + "eval_steps_per_second": 10.41, + "step": 10919 + }, + { + "epoch": 180.0, + "eval_loss": 11.477941513061523, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.671, + "eval_steps_per_second": 10.449, + "step": 10980 + }, + { + "epoch": 180.33, + "learning_rate": 3.19672131147541e-08, + "loss": 10.2637, + "step": 11000 + }, + { + "epoch": 181.0, + "eval_loss": 11.34237003326416, + "eval_runtime": 1.5403, + "eval_samples_per_second": 79.205, + "eval_steps_per_second": 10.388, + "step": 11041 + }, + { + "epoch": 182.0, + "eval_loss": 11.245933532714844, + "eval_runtime": 1.5372, + "eval_samples_per_second": 79.363, + "eval_steps_per_second": 10.408, + "step": 11102 + }, + { + "epoch": 183.0, + "eval_loss": 11.317768096923828, + "eval_runtime": 1.5396, + "eval_samples_per_second": 79.241, + "eval_steps_per_second": 10.392, + "step": 11163 + }, + { + "epoch": 184.0, + "eval_loss": 11.325362205505371, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.67, + "eval_steps_per_second": 10.449, + "step": 11224 + }, + { + "epoch": 185.0, + "eval_loss": 11.263531684875488, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.611, + "eval_steps_per_second": 10.441, + "step": 11285 + }, + { + "epoch": 186.0, + "eval_loss": 11.21447467803955, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.641, + "eval_steps_per_second": 10.445, + "step": 11346 + }, + { + "epoch": 187.0, + "eval_loss": 11.32798957824707, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.67, + "eval_steps_per_second": 10.449, + "step": 11407 + }, + { + "epoch": 188.0, + "eval_loss": 11.337279319763184, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.571, + "eval_steps_per_second": 10.436, + "step": 11468 + }, + { + "epoch": 188.52, + "learning_rate": 3.1147540983606555e-08, + "loss": 10.2837, + "step": 11500 + }, + { + "epoch": 189.0, + "eval_loss": 11.180814743041992, + "eval_runtime": 1.5382, + "eval_samples_per_second": 79.316, + "eval_steps_per_second": 10.402, + "step": 11529 + }, + { + "epoch": 190.0, + "eval_loss": 11.221953392028809, + "eval_runtime": 1.5474, + "eval_samples_per_second": 78.844, + "eval_steps_per_second": 10.34, + "step": 11590 + }, + { + "epoch": 191.0, + "eval_loss": 11.125133514404297, + "eval_runtime": 1.5562, + "eval_samples_per_second": 78.399, + "eval_steps_per_second": 10.282, + "step": 11651 + }, + { + "epoch": 192.0, + "eval_loss": 11.356901168823242, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.64, + "eval_steps_per_second": 10.445, + "step": 11712 + }, + { + "epoch": 193.0, + "eval_loss": 11.188824653625488, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.686, + "eval_steps_per_second": 10.451, + "step": 11773 + }, + { + "epoch": 194.0, + "eval_loss": 11.257243156433105, + "eval_runtime": 1.5371, + "eval_samples_per_second": 79.368, + "eval_steps_per_second": 10.409, + "step": 11834 + }, + { + "epoch": 195.0, + "eval_loss": 11.315752029418945, + "eval_runtime": 1.5316, + "eval_samples_per_second": 79.655, + "eval_steps_per_second": 10.447, + "step": 11895 + }, + { + "epoch": 196.0, + "eval_loss": 11.294753074645996, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.571, + "eval_steps_per_second": 10.436, + "step": 11956 + }, + { + "epoch": 196.72, + "learning_rate": 3.032786885245902e-08, + "loss": 10.2404, + "step": 12000 + }, + { + "epoch": 197.0, + "eval_loss": 11.21860408782959, + "eval_runtime": 1.5446, + "eval_samples_per_second": 78.984, + "eval_steps_per_second": 10.359, + "step": 12017 + }, + { + "epoch": 198.0, + "eval_loss": 11.28231143951416, + "eval_runtime": 1.5623, + "eval_samples_per_second": 78.089, + "eval_steps_per_second": 10.241, + "step": 12078 + }, + { + "epoch": 199.0, + "eval_loss": 11.250580787658691, + "eval_runtime": 1.5444, + "eval_samples_per_second": 78.996, + "eval_steps_per_second": 10.36, + "step": 12139 + }, + { + "epoch": 200.0, + "eval_loss": 11.55415153503418, + "eval_runtime": 1.5349, + "eval_samples_per_second": 79.484, + "eval_steps_per_second": 10.424, + "step": 12200 + }, + { + "epoch": 201.0, + "eval_loss": 11.313996315002441, + "eval_runtime": 1.5323, + "eval_samples_per_second": 79.619, + "eval_steps_per_second": 10.442, + "step": 12261 + }, + { + "epoch": 202.0, + "eval_loss": 11.200800895690918, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.664, + "eval_steps_per_second": 10.448, + "step": 12322 + }, + { + "epoch": 203.0, + "eval_loss": 11.157076835632324, + "eval_runtime": 1.5375, + "eval_samples_per_second": 79.352, + "eval_steps_per_second": 10.407, + "step": 12383 + }, + { + "epoch": 204.0, + "eval_loss": 11.233716011047363, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.671, + "eval_steps_per_second": 10.449, + "step": 12444 + }, + { + "epoch": 204.92, + "learning_rate": 2.9508196721311475e-08, + "loss": 10.2304, + "step": 12500 + }, + { + "epoch": 205.0, + "eval_loss": 11.270787239074707, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.615, + "eval_steps_per_second": 10.441, + "step": 12505 + }, + { + "epoch": 206.0, + "eval_loss": 11.30806827545166, + "eval_runtime": 1.5392, + "eval_samples_per_second": 79.261, + "eval_steps_per_second": 10.395, + "step": 12566 + }, + { + "epoch": 207.0, + "eval_loss": 11.11026382446289, + "eval_runtime": 1.5416, + "eval_samples_per_second": 79.137, + "eval_steps_per_second": 10.379, + "step": 12627 + }, + { + "epoch": 208.0, + "eval_loss": 11.123950958251953, + "eval_runtime": 1.538, + "eval_samples_per_second": 79.323, + "eval_steps_per_second": 10.403, + "step": 12688 + }, + { + "epoch": 209.0, + "eval_loss": 11.3450288772583, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.55, + "eval_steps_per_second": 10.433, + "step": 12749 + }, + { + "epoch": 210.0, + "eval_loss": 11.059000015258789, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.683, + "eval_steps_per_second": 10.45, + "step": 12810 + }, + { + "epoch": 211.0, + "eval_loss": 11.257308006286621, + "eval_runtime": 1.5316, + "eval_samples_per_second": 79.654, + "eval_steps_per_second": 10.446, + "step": 12871 + }, + { + "epoch": 212.0, + "eval_loss": 11.207562446594238, + "eval_runtime": 1.5416, + "eval_samples_per_second": 79.141, + "eval_steps_per_second": 10.379, + "step": 12932 + }, + { + "epoch": 213.0, + "eval_loss": 11.18131160736084, + "eval_runtime": 1.5315, + "eval_samples_per_second": 79.659, + "eval_steps_per_second": 10.447, + "step": 12993 + }, + { + "epoch": 213.11, + "learning_rate": 2.8688524590163933e-08, + "loss": 10.2405, + "step": 13000 + }, + { + "epoch": 214.0, + "eval_loss": 11.338172912597656, + "eval_runtime": 1.5378, + "eval_samples_per_second": 79.334, + "eval_steps_per_second": 10.404, + "step": 13054 + }, + { + "epoch": 215.0, + "eval_loss": 11.409613609313965, + "eval_runtime": 1.5389, + "eval_samples_per_second": 79.277, + "eval_steps_per_second": 10.397, + "step": 13115 + }, + { + "epoch": 216.0, + "eval_loss": 11.152140617370605, + "eval_runtime": 1.5382, + "eval_samples_per_second": 79.313, + "eval_steps_per_second": 10.402, + "step": 13176 + }, + { + "epoch": 217.0, + "eval_loss": 11.312527656555176, + "eval_runtime": 1.5316, + "eval_samples_per_second": 79.653, + "eval_steps_per_second": 10.446, + "step": 13237 + }, + { + "epoch": 218.0, + "eval_loss": 11.191665649414062, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.682, + "eval_steps_per_second": 10.45, + "step": 13298 + }, + { + "epoch": 219.0, + "eval_loss": 11.279202461242676, + "eval_runtime": 1.5384, + "eval_samples_per_second": 79.305, + "eval_steps_per_second": 10.401, + "step": 13359 + }, + { + "epoch": 220.0, + "eval_loss": 11.123604774475098, + "eval_runtime": 1.5361, + "eval_samples_per_second": 79.422, + "eval_steps_per_second": 10.416, + "step": 13420 + }, + { + "epoch": 221.0, + "eval_loss": 11.239697456359863, + "eval_runtime": 1.5358, + "eval_samples_per_second": 79.438, + "eval_steps_per_second": 10.418, + "step": 13481 + }, + { + "epoch": 221.31, + "learning_rate": 2.786885245901639e-08, + "loss": 10.2096, + "step": 13500 + }, + { + "epoch": 222.0, + "eval_loss": 11.187539100646973, + "eval_runtime": 1.5423, + "eval_samples_per_second": 79.104, + "eval_steps_per_second": 10.374, + "step": 13542 + }, + { + "epoch": 223.0, + "eval_loss": 11.311732292175293, + "eval_runtime": 1.5401, + "eval_samples_per_second": 79.213, + "eval_steps_per_second": 10.389, + "step": 13603 + }, + { + "epoch": 224.0, + "eval_loss": 11.156542778015137, + "eval_runtime": 1.5374, + "eval_samples_per_second": 79.353, + "eval_steps_per_second": 10.407, + "step": 13664 + }, + { + "epoch": 225.0, + "eval_loss": 11.416545867919922, + "eval_runtime": 1.533, + "eval_samples_per_second": 79.581, + "eval_steps_per_second": 10.437, + "step": 13725 + }, + { + "epoch": 226.0, + "eval_loss": 11.147370338439941, + "eval_runtime": 1.5328, + "eval_samples_per_second": 79.591, + "eval_steps_per_second": 10.438, + "step": 13786 + }, + { + "epoch": 227.0, + "eval_loss": 10.985366821289062, + "eval_runtime": 1.5331, + "eval_samples_per_second": 79.579, + "eval_steps_per_second": 10.437, + "step": 13847 + }, + { + "epoch": 228.0, + "eval_loss": 11.134580612182617, + "eval_runtime": 1.5339, + "eval_samples_per_second": 79.538, + "eval_steps_per_second": 10.431, + "step": 13908 + }, + { + "epoch": 229.0, + "eval_loss": 11.212328910827637, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.625, + "eval_steps_per_second": 10.443, + "step": 13969 + }, + { + "epoch": 229.51, + "learning_rate": 2.7049180327868852e-08, + "loss": 10.1998, + "step": 14000 + }, + { + "epoch": 230.0, + "eval_loss": 11.24535846710205, + "eval_runtime": 1.5423, + "eval_samples_per_second": 79.101, + "eval_steps_per_second": 10.374, + "step": 14030 + }, + { + "epoch": 231.0, + "eval_loss": 11.335315704345703, + "eval_runtime": 1.5438, + "eval_samples_per_second": 79.027, + "eval_steps_per_second": 10.364, + "step": 14091 + }, + { + "epoch": 232.0, + "eval_loss": 11.305168151855469, + "eval_runtime": 1.5455, + "eval_samples_per_second": 78.937, + "eval_steps_per_second": 10.352, + "step": 14152 + }, + { + "epoch": 233.0, + "eval_loss": 11.177278518676758, + "eval_runtime": 1.5321, + "eval_samples_per_second": 79.631, + "eval_steps_per_second": 10.443, + "step": 14213 + }, + { + "epoch": 234.0, + "eval_loss": 11.132712364196777, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.57, + "eval_steps_per_second": 10.435, + "step": 14274 + }, + { + "epoch": 235.0, + "eval_loss": 11.310881614685059, + "eval_runtime": 1.5363, + "eval_samples_per_second": 79.411, + "eval_steps_per_second": 10.415, + "step": 14335 + }, + { + "epoch": 236.0, + "eval_loss": 11.178828239440918, + "eval_runtime": 1.5317, + "eval_samples_per_second": 79.652, + "eval_steps_per_second": 10.446, + "step": 14396 + }, + { + "epoch": 237.0, + "eval_loss": 11.376667022705078, + "eval_runtime": 1.542, + "eval_samples_per_second": 79.118, + "eval_steps_per_second": 10.376, + "step": 14457 + }, + { + "epoch": 237.7, + "learning_rate": 2.622950819672131e-08, + "loss": 10.1947, + "step": 14500 + }, + { + "epoch": 238.0, + "eval_loss": 11.21568489074707, + "eval_runtime": 1.5435, + "eval_samples_per_second": 79.039, + "eval_steps_per_second": 10.366, + "step": 14518 + }, + { + "epoch": 239.0, + "eval_loss": 11.210213661193848, + "eval_runtime": 1.555, + "eval_samples_per_second": 78.455, + "eval_steps_per_second": 10.289, + "step": 14579 + }, + { + "epoch": 240.0, + "eval_loss": 11.184165954589844, + "eval_runtime": 1.5406, + "eval_samples_per_second": 79.191, + "eval_steps_per_second": 10.386, + "step": 14640 + }, + { + "epoch": 241.0, + "eval_loss": 11.139164924621582, + "eval_runtime": 1.5333, + "eval_samples_per_second": 79.568, + "eval_steps_per_second": 10.435, + "step": 14701 + }, + { + "epoch": 242.0, + "eval_loss": 11.139853477478027, + "eval_runtime": 1.5354, + "eval_samples_per_second": 79.456, + "eval_steps_per_second": 10.42, + "step": 14762 + }, + { + "epoch": 243.0, + "eval_loss": 11.163023948669434, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.55, + "eval_steps_per_second": 10.433, + "step": 14823 + }, + { + "epoch": 244.0, + "eval_loss": 11.19721794128418, + "eval_runtime": 1.5334, + "eval_samples_per_second": 79.56, + "eval_steps_per_second": 10.434, + "step": 14884 + }, + { + "epoch": 245.0, + "eval_loss": 11.054798126220703, + "eval_runtime": 1.5315, + "eval_samples_per_second": 79.661, + "eval_steps_per_second": 10.447, + "step": 14945 + }, + { + "epoch": 245.9, + "learning_rate": 2.5409836065573768e-08, + "loss": 10.1922, + "step": 15000 + }, + { + "epoch": 246.0, + "eval_loss": 11.127946853637695, + "eval_runtime": 1.5532, + "eval_samples_per_second": 78.545, + "eval_steps_per_second": 10.301, + "step": 15006 + }, + { + "epoch": 247.0, + "eval_loss": 11.096878051757812, + "eval_runtime": 1.5376, + "eval_samples_per_second": 79.343, + "eval_steps_per_second": 10.406, + "step": 15067 + }, + { + "epoch": 248.0, + "eval_loss": 11.234783172607422, + "eval_runtime": 1.5405, + "eval_samples_per_second": 79.193, + "eval_steps_per_second": 10.386, + "step": 15128 + }, + { + "epoch": 249.0, + "eval_loss": 11.115097045898438, + "eval_runtime": 1.5469, + "eval_samples_per_second": 78.865, + "eval_steps_per_second": 10.343, + "step": 15189 + }, + { + "epoch": 250.0, + "eval_loss": 11.53684139251709, + "eval_runtime": 1.5325, + "eval_samples_per_second": 79.611, + "eval_steps_per_second": 10.441, + "step": 15250 + }, + { + "epoch": 251.0, + "eval_loss": 11.224416732788086, + "eval_runtime": 1.5321, + "eval_samples_per_second": 79.627, + "eval_steps_per_second": 10.443, + "step": 15311 + }, + { + "epoch": 252.0, + "eval_loss": 11.210186958312988, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.387, + "eval_steps_per_second": 10.411, + "step": 15372 + }, + { + "epoch": 253.0, + "eval_loss": 11.273506164550781, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.622, + "eval_steps_per_second": 10.442, + "step": 15433 + }, + { + "epoch": 254.0, + "eval_loss": 11.3226900100708, + "eval_runtime": 1.5371, + "eval_samples_per_second": 79.371, + "eval_steps_per_second": 10.409, + "step": 15494 + }, + { + "epoch": 254.1, + "learning_rate": 2.459016393442623e-08, + "loss": 10.1994, + "step": 15500 + }, + { + "epoch": 255.0, + "eval_loss": 11.237663269042969, + "eval_runtime": 1.5421, + "eval_samples_per_second": 79.114, + "eval_steps_per_second": 10.376, + "step": 15555 + }, + { + "epoch": 256.0, + "eval_loss": 11.283459663391113, + "eval_runtime": 1.539, + "eval_samples_per_second": 79.272, + "eval_steps_per_second": 10.396, + "step": 15616 + }, + { + "epoch": 257.0, + "eval_loss": 11.347498893737793, + "eval_runtime": 1.5437, + "eval_samples_per_second": 79.032, + "eval_steps_per_second": 10.365, + "step": 15677 + }, + { + "epoch": 258.0, + "eval_loss": 11.209207534790039, + "eval_runtime": 1.5326, + "eval_samples_per_second": 79.605, + "eval_steps_per_second": 10.44, + "step": 15738 + }, + { + "epoch": 259.0, + "eval_loss": 11.19356918334961, + "eval_runtime": 1.5356, + "eval_samples_per_second": 79.448, + "eval_steps_per_second": 10.419, + "step": 15799 + }, + { + "epoch": 260.0, + "eval_loss": 11.0318603515625, + "eval_runtime": 1.5384, + "eval_samples_per_second": 79.303, + "eval_steps_per_second": 10.4, + "step": 15860 + }, + { + "epoch": 261.0, + "eval_loss": 11.191557884216309, + "eval_runtime": 1.5356, + "eval_samples_per_second": 79.447, + "eval_steps_per_second": 10.419, + "step": 15921 + }, + { + "epoch": 262.0, + "eval_loss": 11.135727882385254, + "eval_runtime": 1.5318, + "eval_samples_per_second": 79.643, + "eval_steps_per_second": 10.445, + "step": 15982 + }, + { + "epoch": 262.29, + "learning_rate": 2.3770491803278688e-08, + "loss": 10.1883, + "step": 16000 + }, + { + "epoch": 263.0, + "eval_loss": 10.97315788269043, + "eval_runtime": 1.5381, + "eval_samples_per_second": 79.317, + "eval_steps_per_second": 10.402, + "step": 16043 + }, + { + "epoch": 264.0, + "eval_loss": 11.183859825134277, + "eval_runtime": 1.5418, + "eval_samples_per_second": 79.131, + "eval_steps_per_second": 10.378, + "step": 16104 + }, + { + "epoch": 265.0, + "eval_loss": 11.070130348205566, + "eval_runtime": 1.5422, + "eval_samples_per_second": 79.109, + "eval_steps_per_second": 10.375, + "step": 16165 + }, + { + "epoch": 266.0, + "eval_loss": 11.161293029785156, + "eval_runtime": 1.5365, + "eval_samples_per_second": 79.403, + "eval_steps_per_second": 10.414, + "step": 16226 + }, + { + "epoch": 267.0, + "eval_loss": 11.130182266235352, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.572, + "eval_steps_per_second": 10.436, + "step": 16287 + }, + { + "epoch": 268.0, + "eval_loss": 11.095097541809082, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.642, + "eval_steps_per_second": 10.445, + "step": 16348 + }, + { + "epoch": 269.0, + "eval_loss": 11.05788803100586, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.552, + "eval_steps_per_second": 10.433, + "step": 16409 + }, + { + "epoch": 270.0, + "eval_loss": 11.145880699157715, + "eval_runtime": 1.5341, + "eval_samples_per_second": 79.523, + "eval_steps_per_second": 10.429, + "step": 16470 + }, + { + "epoch": 270.49, + "learning_rate": 2.2950819672131146e-08, + "loss": 10.1863, + "step": 16500 + }, + { + "epoch": 271.0, + "eval_loss": 11.196855545043945, + "eval_runtime": 1.5401, + "eval_samples_per_second": 79.218, + "eval_steps_per_second": 10.389, + "step": 16531 + }, + { + "epoch": 272.0, + "eval_loss": 11.127517700195312, + "eval_runtime": 1.5433, + "eval_samples_per_second": 79.049, + "eval_steps_per_second": 10.367, + "step": 16592 + }, + { + "epoch": 273.0, + "eval_loss": 11.111509323120117, + "eval_runtime": 1.5367, + "eval_samples_per_second": 79.393, + "eval_steps_per_second": 10.412, + "step": 16653 + }, + { + "epoch": 274.0, + "eval_loss": 11.128540992736816, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.616, + "eval_steps_per_second": 10.441, + "step": 16714 + }, + { + "epoch": 275.0, + "eval_loss": 11.105259895324707, + "eval_runtime": 1.5326, + "eval_samples_per_second": 79.601, + "eval_steps_per_second": 10.439, + "step": 16775 + }, + { + "epoch": 276.0, + "eval_loss": 11.010540008544922, + "eval_runtime": 1.5357, + "eval_samples_per_second": 79.442, + "eval_steps_per_second": 10.419, + "step": 16836 + }, + { + "epoch": 277.0, + "eval_loss": 11.137755393981934, + "eval_runtime": 1.534, + "eval_samples_per_second": 79.53, + "eval_steps_per_second": 10.43, + "step": 16897 + }, + { + "epoch": 278.0, + "eval_loss": 11.077096939086914, + "eval_runtime": 1.533, + "eval_samples_per_second": 79.582, + "eval_steps_per_second": 10.437, + "step": 16958 + }, + { + "epoch": 278.69, + "learning_rate": 2.2131147540983604e-08, + "loss": 10.1614, + "step": 17000 + }, + { + "epoch": 279.0, + "eval_loss": 11.061995506286621, + "eval_runtime": 1.5452, + "eval_samples_per_second": 78.952, + "eval_steps_per_second": 10.354, + "step": 17019 + }, + { + "epoch": 280.0, + "eval_loss": 10.990643501281738, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.385, + "eval_steps_per_second": 10.411, + "step": 17080 + }, + { + "epoch": 281.0, + "eval_loss": 11.077146530151367, + "eval_runtime": 1.539, + "eval_samples_per_second": 79.273, + "eval_steps_per_second": 10.396, + "step": 17141 + }, + { + "epoch": 282.0, + "eval_loss": 11.035726547241211, + "eval_runtime": 1.5344, + "eval_samples_per_second": 79.51, + "eval_steps_per_second": 10.428, + "step": 17202 + }, + { + "epoch": 283.0, + "eval_loss": 11.0416841506958, + "eval_runtime": 1.5338, + "eval_samples_per_second": 79.542, + "eval_steps_per_second": 10.432, + "step": 17263 + }, + { + "epoch": 284.0, + "eval_loss": 11.028667449951172, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.612, + "eval_steps_per_second": 10.441, + "step": 17324 + }, + { + "epoch": 285.0, + "eval_loss": 11.117180824279785, + "eval_runtime": 1.5342, + "eval_samples_per_second": 79.518, + "eval_steps_per_second": 10.429, + "step": 17385 + }, + { + "epoch": 286.0, + "eval_loss": 10.925678253173828, + "eval_runtime": 1.5356, + "eval_samples_per_second": 79.449, + "eval_steps_per_second": 10.42, + "step": 17446 + }, + { + "epoch": 286.88, + "learning_rate": 2.1311475409836065e-08, + "loss": 10.1717, + "step": 17500 + }, + { + "epoch": 287.0, + "eval_loss": 11.231226921081543, + "eval_runtime": 1.542, + "eval_samples_per_second": 79.12, + "eval_steps_per_second": 10.376, + "step": 17507 + }, + { + "epoch": 288.0, + "eval_loss": 11.380009651184082, + "eval_runtime": 1.5602, + "eval_samples_per_second": 78.195, + "eval_steps_per_second": 10.255, + "step": 17568 + }, + { + "epoch": 289.0, + "eval_loss": 11.13857364654541, + "eval_runtime": 1.5491, + "eval_samples_per_second": 78.754, + "eval_steps_per_second": 10.328, + "step": 17629 + }, + { + "epoch": 290.0, + "eval_loss": 11.172418594360352, + "eval_runtime": 1.5472, + "eval_samples_per_second": 78.854, + "eval_steps_per_second": 10.341, + "step": 17690 + }, + { + "epoch": 291.0, + "eval_loss": 11.162772178649902, + "eval_runtime": 1.534, + "eval_samples_per_second": 79.529, + "eval_steps_per_second": 10.43, + "step": 17751 + }, + { + "epoch": 292.0, + "eval_loss": 11.122556686401367, + "eval_runtime": 1.5424, + "eval_samples_per_second": 79.097, + "eval_steps_per_second": 10.373, + "step": 17812 + }, + { + "epoch": 293.0, + "eval_loss": 11.19551944732666, + "eval_runtime": 1.5328, + "eval_samples_per_second": 79.592, + "eval_steps_per_second": 10.438, + "step": 17873 + }, + { + "epoch": 294.0, + "eval_loss": 11.138795852661133, + "eval_runtime": 1.537, + "eval_samples_per_second": 79.376, + "eval_steps_per_second": 10.41, + "step": 17934 + }, + { + "epoch": 295.0, + "eval_loss": 11.087359428405762, + "eval_runtime": 1.5338, + "eval_samples_per_second": 79.543, + "eval_steps_per_second": 10.432, + "step": 17995 + }, + { + "epoch": 295.08, + "learning_rate": 2.0491803278688523e-08, + "loss": 10.1806, + "step": 18000 + }, + { + "epoch": 296.0, + "eval_loss": 11.081265449523926, + "eval_runtime": 1.5381, + "eval_samples_per_second": 79.319, + "eval_steps_per_second": 10.403, + "step": 18056 + }, + { + "epoch": 297.0, + "eval_loss": 11.147530555725098, + "eval_runtime": 1.5414, + "eval_samples_per_second": 79.151, + "eval_steps_per_second": 10.381, + "step": 18117 + }, + { + "epoch": 298.0, + "eval_loss": 11.167841911315918, + "eval_runtime": 1.5414, + "eval_samples_per_second": 79.146, + "eval_steps_per_second": 10.38, + "step": 18178 + }, + { + "epoch": 299.0, + "eval_loss": 11.244978904724121, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.623, + "eval_steps_per_second": 10.442, + "step": 18239 + }, + { + "epoch": 300.0, + "eval_loss": 11.193556785583496, + "eval_runtime": 1.5334, + "eval_samples_per_second": 79.563, + "eval_steps_per_second": 10.434, + "step": 18300 + }, + { + "epoch": 301.0, + "eval_loss": 11.102090835571289, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.642, + "eval_steps_per_second": 10.445, + "step": 18361 + }, + { + "epoch": 302.0, + "eval_loss": 11.187400817871094, + "eval_runtime": 1.5378, + "eval_samples_per_second": 79.332, + "eval_steps_per_second": 10.404, + "step": 18422 + }, + { + "epoch": 303.0, + "eval_loss": 11.171931266784668, + "eval_runtime": 1.5351, + "eval_samples_per_second": 79.473, + "eval_steps_per_second": 10.423, + "step": 18483 + }, + { + "epoch": 303.28, + "learning_rate": 1.967213114754098e-08, + "loss": 10.1683, + "step": 18500 + }, + { + "epoch": 304.0, + "eval_loss": 11.155352592468262, + "eval_runtime": 1.5403, + "eval_samples_per_second": 79.204, + "eval_steps_per_second": 10.387, + "step": 18544 + }, + { + "epoch": 305.0, + "eval_loss": 11.0771484375, + "eval_runtime": 1.5429, + "eval_samples_per_second": 79.074, + "eval_steps_per_second": 10.37, + "step": 18605 + }, + { + "epoch": 306.0, + "eval_loss": 11.067580223083496, + "eval_runtime": 1.5407, + "eval_samples_per_second": 79.183, + "eval_steps_per_second": 10.385, + "step": 18666 + }, + { + "epoch": 307.0, + "eval_loss": 11.128029823303223, + "eval_runtime": 1.5321, + "eval_samples_per_second": 79.63, + "eval_steps_per_second": 10.443, + "step": 18727 + }, + { + "epoch": 308.0, + "eval_loss": 11.023638725280762, + "eval_runtime": 1.535, + "eval_samples_per_second": 79.477, + "eval_steps_per_second": 10.423, + "step": 18788 + }, + { + "epoch": 309.0, + "eval_loss": 11.14404582977295, + "eval_runtime": 1.5352, + "eval_samples_per_second": 79.47, + "eval_steps_per_second": 10.422, + "step": 18849 + }, + { + "epoch": 310.0, + "eval_loss": 11.184294700622559, + "eval_runtime": 1.5344, + "eval_samples_per_second": 79.508, + "eval_steps_per_second": 10.427, + "step": 18910 + }, + { + "epoch": 311.0, + "eval_loss": 11.04742431640625, + "eval_runtime": 1.5383, + "eval_samples_per_second": 79.309, + "eval_steps_per_second": 10.401, + "step": 18971 + }, + { + "epoch": 311.47, + "learning_rate": 1.885245901639344e-08, + "loss": 10.1437, + "step": 19000 + }, + { + "epoch": 312.0, + "eval_loss": 11.039079666137695, + "eval_runtime": 1.5399, + "eval_samples_per_second": 79.224, + "eval_steps_per_second": 10.39, + "step": 19032 + }, + { + "epoch": 313.0, + "eval_loss": 10.914813995361328, + "eval_runtime": 1.5428, + "eval_samples_per_second": 79.078, + "eval_steps_per_second": 10.371, + "step": 19093 + }, + { + "epoch": 314.0, + "eval_loss": 11.057476997375488, + "eval_runtime": 1.5481, + "eval_samples_per_second": 78.806, + "eval_steps_per_second": 10.335, + "step": 19154 + }, + { + "epoch": 315.0, + "eval_loss": 11.195489883422852, + "eval_runtime": 1.536, + "eval_samples_per_second": 79.425, + "eval_steps_per_second": 10.416, + "step": 19215 + }, + { + "epoch": 316.0, + "eval_loss": 11.005314826965332, + "eval_runtime": 1.5327, + "eval_samples_per_second": 79.597, + "eval_steps_per_second": 10.439, + "step": 19276 + }, + { + "epoch": 317.0, + "eval_loss": 11.080973625183105, + "eval_runtime": 1.5413, + "eval_samples_per_second": 79.154, + "eval_steps_per_second": 10.381, + "step": 19337 + }, + { + "epoch": 318.0, + "eval_loss": 11.13598346710205, + "eval_runtime": 1.5363, + "eval_samples_per_second": 79.411, + "eval_steps_per_second": 10.415, + "step": 19398 + }, + { + "epoch": 319.0, + "eval_loss": 11.229124069213867, + "eval_runtime": 1.5344, + "eval_samples_per_second": 79.512, + "eval_steps_per_second": 10.428, + "step": 19459 + }, + { + "epoch": 319.67, + "learning_rate": 1.80327868852459e-08, + "loss": 10.1539, + "step": 19500 + }, + { + "epoch": 320.0, + "eval_loss": 11.023887634277344, + "eval_runtime": 1.5371, + "eval_samples_per_second": 79.369, + "eval_steps_per_second": 10.409, + "step": 19520 + }, + { + "epoch": 321.0, + "eval_loss": 11.121563911437988, + "eval_runtime": 1.5411, + "eval_samples_per_second": 79.162, + "eval_steps_per_second": 10.382, + "step": 19581 + }, + { + "epoch": 322.0, + "eval_loss": 11.251620292663574, + "eval_runtime": 1.545, + "eval_samples_per_second": 78.965, + "eval_steps_per_second": 10.356, + "step": 19642 + }, + { + "epoch": 323.0, + "eval_loss": 10.975918769836426, + "eval_runtime": 1.5345, + "eval_samples_per_second": 79.505, + "eval_steps_per_second": 10.427, + "step": 19703 + }, + { + "epoch": 324.0, + "eval_loss": 11.039804458618164, + "eval_runtime": 1.5356, + "eval_samples_per_second": 79.446, + "eval_steps_per_second": 10.419, + "step": 19764 + }, + { + "epoch": 325.0, + "eval_loss": 11.043091773986816, + "eval_runtime": 1.5325, + "eval_samples_per_second": 79.607, + "eval_steps_per_second": 10.44, + "step": 19825 + }, + { + "epoch": 326.0, + "eval_loss": 10.915124893188477, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.612, + "eval_steps_per_second": 10.441, + "step": 19886 + }, + { + "epoch": 327.0, + "eval_loss": 11.090548515319824, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.626, + "eval_steps_per_second": 10.443, + "step": 19947 + }, + { + "epoch": 327.87, + "learning_rate": 1.721311475409836e-08, + "loss": 10.1432, + "step": 20000 + }, + { + "epoch": 328.0, + "eval_loss": 11.009906768798828, + "eval_runtime": 1.539, + "eval_samples_per_second": 79.274, + "eval_steps_per_second": 10.397, + "step": 20008 + }, + { + "epoch": 329.0, + "eval_loss": 11.089301109313965, + "eval_runtime": 1.5522, + "eval_samples_per_second": 78.6, + "eval_steps_per_second": 10.308, + "step": 20069 + }, + { + "epoch": 330.0, + "eval_loss": 11.134428024291992, + "eval_runtime": 1.5358, + "eval_samples_per_second": 79.437, + "eval_steps_per_second": 10.418, + "step": 20130 + }, + { + "epoch": 331.0, + "eval_loss": 11.068151473999023, + "eval_runtime": 1.5446, + "eval_samples_per_second": 78.987, + "eval_steps_per_second": 10.359, + "step": 20191 + }, + { + "epoch": 332.0, + "eval_loss": 10.955802917480469, + "eval_runtime": 1.5328, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 10.438, + "step": 20252 + }, + { + "epoch": 333.0, + "eval_loss": 11.066947937011719, + "eval_runtime": 1.5339, + "eval_samples_per_second": 79.537, + "eval_steps_per_second": 10.431, + "step": 20313 + }, + { + "epoch": 334.0, + "eval_loss": 11.055612564086914, + "eval_runtime": 1.5381, + "eval_samples_per_second": 79.321, + "eval_steps_per_second": 10.403, + "step": 20374 + }, + { + "epoch": 335.0, + "eval_loss": 11.20965576171875, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.666, + "eval_steps_per_second": 10.448, + "step": 20435 + }, + { + "epoch": 336.0, + "eval_loss": 11.019977569580078, + "eval_runtime": 1.5335, + "eval_samples_per_second": 79.554, + "eval_steps_per_second": 10.433, + "step": 20496 + }, + { + "epoch": 336.07, + "learning_rate": 1.6393442622950816e-08, + "loss": 10.1343, + "step": 20500 + }, + { + "epoch": 337.0, + "eval_loss": 10.968345642089844, + "eval_runtime": 1.5381, + "eval_samples_per_second": 79.32, + "eval_steps_per_second": 10.403, + "step": 20557 + }, + { + "epoch": 338.0, + "eval_loss": 10.98238754272461, + "eval_runtime": 1.5536, + "eval_samples_per_second": 78.526, + "eval_steps_per_second": 10.299, + "step": 20618 + }, + { + "epoch": 339.0, + "eval_loss": 11.156255722045898, + "eval_runtime": 1.536, + "eval_samples_per_second": 79.425, + "eval_steps_per_second": 10.416, + "step": 20679 + }, + { + "epoch": 340.0, + "eval_loss": 11.148921012878418, + "eval_runtime": 1.5358, + "eval_samples_per_second": 79.438, + "eval_steps_per_second": 10.418, + "step": 20740 + }, + { + "epoch": 341.0, + "eval_loss": 11.138899803161621, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.686, + "eval_steps_per_second": 10.451, + "step": 20801 + }, + { + "epoch": 342.0, + "eval_loss": 11.112798690795898, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.638, + "eval_steps_per_second": 10.444, + "step": 20862 + }, + { + "epoch": 343.0, + "eval_loss": 11.043689727783203, + "eval_runtime": 1.5362, + "eval_samples_per_second": 79.418, + "eval_steps_per_second": 10.416, + "step": 20923 + }, + { + "epoch": 344.0, + "eval_loss": 11.100526809692383, + "eval_runtime": 1.5335, + "eval_samples_per_second": 79.556, + "eval_steps_per_second": 10.434, + "step": 20984 + }, + { + "epoch": 344.26, + "learning_rate": 1.5573770491803278e-08, + "loss": 10.143, + "step": 21000 + }, + { + "epoch": 345.0, + "eval_loss": 11.16964054107666, + "eval_runtime": 1.5442, + "eval_samples_per_second": 79.003, + "eval_steps_per_second": 10.361, + "step": 21045 + }, + { + "epoch": 346.0, + "eval_loss": 11.135641098022461, + "eval_runtime": 1.5566, + "eval_samples_per_second": 78.375, + "eval_steps_per_second": 10.279, + "step": 21106 + }, + { + "epoch": 347.0, + "eval_loss": 11.079752922058105, + "eval_runtime": 1.5382, + "eval_samples_per_second": 79.311, + "eval_steps_per_second": 10.401, + "step": 21167 + }, + { + "epoch": 348.0, + "eval_loss": 10.918296813964844, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.574, + "eval_steps_per_second": 10.436, + "step": 21228 + }, + { + "epoch": 349.0, + "eval_loss": 11.08788776397705, + "eval_runtime": 1.5381, + "eval_samples_per_second": 79.317, + "eval_steps_per_second": 10.402, + "step": 21289 + }, + { + "epoch": 350.0, + "eval_loss": 10.965096473693848, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.67, + "eval_steps_per_second": 10.449, + "step": 21350 + }, + { + "epoch": 351.0, + "eval_loss": 11.072400093078613, + "eval_runtime": 1.5343, + "eval_samples_per_second": 79.515, + "eval_steps_per_second": 10.428, + "step": 21411 + }, + { + "epoch": 352.0, + "eval_loss": 11.0264253616333, + "eval_runtime": 1.5325, + "eval_samples_per_second": 79.608, + "eval_steps_per_second": 10.44, + "step": 21472 + }, + { + "epoch": 352.46, + "learning_rate": 1.4754098360655737e-08, + "loss": 10.1456, + "step": 21500 + }, + { + "epoch": 353.0, + "eval_loss": 11.1398344039917, + "eval_runtime": 1.5366, + "eval_samples_per_second": 79.396, + "eval_steps_per_second": 10.413, + "step": 21533 + }, + { + "epoch": 354.0, + "eval_loss": 11.249741554260254, + "eval_runtime": 1.5389, + "eval_samples_per_second": 79.278, + "eval_steps_per_second": 10.397, + "step": 21594 + }, + { + "epoch": 355.0, + "eval_loss": 10.889848709106445, + "eval_runtime": 1.5359, + "eval_samples_per_second": 79.432, + "eval_steps_per_second": 10.417, + "step": 21655 + }, + { + "epoch": 356.0, + "eval_loss": 10.963099479675293, + "eval_runtime": 1.5354, + "eval_samples_per_second": 79.457, + "eval_steps_per_second": 10.421, + "step": 21716 + }, + { + "epoch": 357.0, + "eval_loss": 11.073355674743652, + "eval_runtime": 1.5372, + "eval_samples_per_second": 79.365, + "eval_steps_per_second": 10.408, + "step": 21777 + }, + { + "epoch": 358.0, + "eval_loss": 11.122593879699707, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.616, + "eval_steps_per_second": 10.441, + "step": 21838 + }, + { + "epoch": 359.0, + "eval_loss": 11.168622016906738, + "eval_runtime": 1.5326, + "eval_samples_per_second": 79.602, + "eval_steps_per_second": 10.44, + "step": 21899 + }, + { + "epoch": 360.0, + "eval_loss": 11.031410217285156, + "eval_runtime": 1.5309, + "eval_samples_per_second": 79.694, + "eval_steps_per_second": 10.452, + "step": 21960 + }, + { + "epoch": 360.65, + "learning_rate": 1.3934426229508195e-08, + "loss": 10.1345, + "step": 22000 + }, + { + "epoch": 361.0, + "eval_loss": 11.09403133392334, + "eval_runtime": 1.5394, + "eval_samples_per_second": 79.251, + "eval_steps_per_second": 10.394, + "step": 22021 + }, + { + "epoch": 362.0, + "eval_loss": 10.922209739685059, + "eval_runtime": 1.5403, + "eval_samples_per_second": 79.205, + "eval_steps_per_second": 10.388, + "step": 22082 + }, + { + "epoch": 363.0, + "eval_loss": 11.103551864624023, + "eval_runtime": 1.5393, + "eval_samples_per_second": 79.254, + "eval_steps_per_second": 10.394, + "step": 22143 + }, + { + "epoch": 364.0, + "eval_loss": 11.19053840637207, + "eval_runtime": 1.5388, + "eval_samples_per_second": 79.281, + "eval_steps_per_second": 10.398, + "step": 22204 + }, + { + "epoch": 365.0, + "eval_loss": 10.974089622497559, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.689, + "eval_steps_per_second": 10.451, + "step": 22265 + }, + { + "epoch": 366.0, + "eval_loss": 10.909160614013672, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.674, + "eval_steps_per_second": 10.449, + "step": 22326 + }, + { + "epoch": 367.0, + "eval_loss": 11.05643367767334, + "eval_runtime": 1.5337, + "eval_samples_per_second": 79.544, + "eval_steps_per_second": 10.432, + "step": 22387 + }, + { + "epoch": 368.0, + "eval_loss": 11.053413391113281, + "eval_runtime": 1.5382, + "eval_samples_per_second": 79.313, + "eval_steps_per_second": 10.402, + "step": 22448 + }, + { + "epoch": 368.85, + "learning_rate": 1.3114754098360655e-08, + "loss": 10.1354, + "step": 22500 + }, + { + "epoch": 369.0, + "eval_loss": 11.030744552612305, + "eval_runtime": 1.551, + "eval_samples_per_second": 78.66, + "eval_steps_per_second": 10.316, + "step": 22509 + }, + { + "epoch": 370.0, + "eval_loss": 11.14685344696045, + "eval_runtime": 1.5382, + "eval_samples_per_second": 79.313, + "eval_steps_per_second": 10.402, + "step": 22570 + }, + { + "epoch": 371.0, + "eval_loss": 11.056023597717285, + "eval_runtime": 1.5383, + "eval_samples_per_second": 79.307, + "eval_steps_per_second": 10.401, + "step": 22631 + }, + { + "epoch": 372.0, + "eval_loss": 11.023992538452148, + "eval_runtime": 1.5355, + "eval_samples_per_second": 79.454, + "eval_steps_per_second": 10.42, + "step": 22692 + }, + { + "epoch": 373.0, + "eval_loss": 10.986889839172363, + "eval_runtime": 1.5328, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 10.438, + "step": 22753 + }, + { + "epoch": 374.0, + "eval_loss": 11.000397682189941, + "eval_runtime": 1.5337, + "eval_samples_per_second": 79.547, + "eval_steps_per_second": 10.432, + "step": 22814 + }, + { + "epoch": 375.0, + "eval_loss": 11.137325286865234, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.665, + "eval_steps_per_second": 10.448, + "step": 22875 + }, + { + "epoch": 376.0, + "eval_loss": 11.095502853393555, + "eval_runtime": 1.5326, + "eval_samples_per_second": 79.601, + "eval_steps_per_second": 10.439, + "step": 22936 + }, + { + "epoch": 377.0, + "eval_loss": 11.05420207977295, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.667, + "eval_steps_per_second": 10.448, + "step": 22997 + }, + { + "epoch": 377.05, + "learning_rate": 1.2295081967213115e-08, + "loss": 10.1382, + "step": 23000 + }, + { + "epoch": 378.0, + "eval_loss": 10.981268882751465, + "eval_runtime": 1.542, + "eval_samples_per_second": 79.12, + "eval_steps_per_second": 10.376, + "step": 23058 + }, + { + "epoch": 379.0, + "eval_loss": 10.987357139587402, + "eval_runtime": 1.547, + "eval_samples_per_second": 78.861, + "eval_steps_per_second": 10.342, + "step": 23119 + }, + { + "epoch": 380.0, + "eval_loss": 10.973642349243164, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.386, + "eval_steps_per_second": 10.411, + "step": 23180 + }, + { + "epoch": 381.0, + "eval_loss": 11.129460334777832, + "eval_runtime": 1.5333, + "eval_samples_per_second": 79.565, + "eval_steps_per_second": 10.435, + "step": 23241 + }, + { + "epoch": 382.0, + "eval_loss": 10.87239933013916, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.672, + "eval_steps_per_second": 10.449, + "step": 23302 + }, + { + "epoch": 383.0, + "eval_loss": 10.936705589294434, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.573, + "eval_steps_per_second": 10.436, + "step": 23363 + }, + { + "epoch": 384.0, + "eval_loss": 11.051558494567871, + "eval_runtime": 1.5374, + "eval_samples_per_second": 79.353, + "eval_steps_per_second": 10.407, + "step": 23424 + }, + { + "epoch": 385.0, + "eval_loss": 11.027456283569336, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.575, + "eval_steps_per_second": 10.436, + "step": 23485 + }, + { + "epoch": 385.25, + "learning_rate": 1.1475409836065573e-08, + "loss": 10.1246, + "step": 23500 + }, + { + "epoch": 386.0, + "eval_loss": 11.018380165100098, + "eval_runtime": 1.5356, + "eval_samples_per_second": 79.45, + "eval_steps_per_second": 10.42, + "step": 23546 + }, + { + "epoch": 387.0, + "eval_loss": 11.057013511657715, + "eval_runtime": 1.5451, + "eval_samples_per_second": 78.957, + "eval_steps_per_second": 10.355, + "step": 23607 + }, + { + "epoch": 388.0, + "eval_loss": 11.024608612060547, + "eval_runtime": 1.5372, + "eval_samples_per_second": 79.364, + "eval_steps_per_second": 10.408, + "step": 23668 + }, + { + "epoch": 389.0, + "eval_loss": 11.013128280639648, + "eval_runtime": 1.5353, + "eval_samples_per_second": 79.466, + "eval_steps_per_second": 10.422, + "step": 23729 + }, + { + "epoch": 390.0, + "eval_loss": 11.016827583312988, + "eval_runtime": 1.534, + "eval_samples_per_second": 79.53, + "eval_steps_per_second": 10.43, + "step": 23790 + }, + { + "epoch": 391.0, + "eval_loss": 11.08166217803955, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.551, + "eval_steps_per_second": 10.433, + "step": 23851 + }, + { + "epoch": 392.0, + "eval_loss": 10.894896507263184, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.681, + "eval_steps_per_second": 10.45, + "step": 23912 + }, + { + "epoch": 393.0, + "eval_loss": 10.769810676574707, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.625, + "eval_steps_per_second": 10.443, + "step": 23973 + }, + { + "epoch": 393.44, + "learning_rate": 1.0655737704918032e-08, + "loss": 10.1173, + "step": 24000 + }, + { + "epoch": 394.0, + "eval_loss": 11.004133224487305, + "eval_runtime": 1.545, + "eval_samples_per_second": 78.965, + "eval_steps_per_second": 10.356, + "step": 24034 + }, + { + "epoch": 395.0, + "eval_loss": 10.925704956054688, + "eval_runtime": 1.542, + "eval_samples_per_second": 79.12, + "eval_steps_per_second": 10.376, + "step": 24095 + }, + { + "epoch": 396.0, + "eval_loss": 10.9295015335083, + "eval_runtime": 1.5407, + "eval_samples_per_second": 79.187, + "eval_steps_per_second": 10.385, + "step": 24156 + }, + { + "epoch": 397.0, + "eval_loss": 10.947636604309082, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.552, + "eval_steps_per_second": 10.433, + "step": 24217 + }, + { + "epoch": 398.0, + "eval_loss": 11.058280944824219, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.682, + "eval_steps_per_second": 10.45, + "step": 24278 + }, + { + "epoch": 399.0, + "eval_loss": 11.00063419342041, + "eval_runtime": 1.5321, + "eval_samples_per_second": 79.627, + "eval_steps_per_second": 10.443, + "step": 24339 + }, + { + "epoch": 400.0, + "eval_loss": 10.971447944641113, + "eval_runtime": 1.5316, + "eval_samples_per_second": 79.656, + "eval_steps_per_second": 10.447, + "step": 24400 + }, + { + "epoch": 401.0, + "eval_loss": 11.0480318069458, + "eval_runtime": 1.5351, + "eval_samples_per_second": 79.474, + "eval_steps_per_second": 10.423, + "step": 24461 + }, + { + "epoch": 401.64, + "learning_rate": 9.83606557377049e-09, + "loss": 10.1253, + "step": 24500 + }, + { + "epoch": 402.0, + "eval_loss": 11.021344184875488, + "eval_runtime": 1.5405, + "eval_samples_per_second": 79.197, + "eval_steps_per_second": 10.386, + "step": 24522 + }, + { + "epoch": 403.0, + "eval_loss": 10.96359920501709, + "eval_runtime": 1.5391, + "eval_samples_per_second": 79.267, + "eval_steps_per_second": 10.396, + "step": 24583 + }, + { + "epoch": 404.0, + "eval_loss": 10.988608360290527, + "eval_runtime": 1.5469, + "eval_samples_per_second": 78.87, + "eval_steps_per_second": 10.344, + "step": 24644 + }, + { + "epoch": 405.0, + "eval_loss": 11.066388130187988, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.672, + "eval_steps_per_second": 10.449, + "step": 24705 + }, + { + "epoch": 406.0, + "eval_loss": 11.046152114868164, + "eval_runtime": 1.5351, + "eval_samples_per_second": 79.475, + "eval_steps_per_second": 10.423, + "step": 24766 + }, + { + "epoch": 407.0, + "eval_loss": 11.012222290039062, + "eval_runtime": 1.5324, + "eval_samples_per_second": 79.611, + "eval_steps_per_second": 10.441, + "step": 24827 + }, + { + "epoch": 408.0, + "eval_loss": 10.857176780700684, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.673, + "eval_steps_per_second": 10.449, + "step": 24888 + }, + { + "epoch": 409.0, + "eval_loss": 11.138158798217773, + "eval_runtime": 1.5365, + "eval_samples_per_second": 79.399, + "eval_steps_per_second": 10.413, + "step": 24949 + }, + { + "epoch": 409.83, + "learning_rate": 9.01639344262295e-09, + "loss": 10.1386, + "step": 25000 + }, + { + "epoch": 410.0, + "eval_loss": 11.070024490356445, + "eval_runtime": 1.5371, + "eval_samples_per_second": 79.369, + "eval_steps_per_second": 10.409, + "step": 25010 + }, + { + "epoch": 411.0, + "eval_loss": 10.96763801574707, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.384, + "eval_steps_per_second": 10.411, + "step": 25071 + }, + { + "epoch": 412.0, + "eval_loss": 11.1865234375, + "eval_runtime": 1.5408, + "eval_samples_per_second": 79.181, + "eval_steps_per_second": 10.384, + "step": 25132 + }, + { + "epoch": 413.0, + "eval_loss": 11.078502655029297, + "eval_runtime": 1.5486, + "eval_samples_per_second": 78.78, + "eval_steps_per_second": 10.332, + "step": 25193 + }, + { + "epoch": 414.0, + "eval_loss": 11.028984069824219, + "eval_runtime": 1.5323, + "eval_samples_per_second": 79.619, + "eval_steps_per_second": 10.442, + "step": 25254 + }, + { + "epoch": 415.0, + "eval_loss": 11.138273239135742, + "eval_runtime": 1.5322, + "eval_samples_per_second": 79.622, + "eval_steps_per_second": 10.442, + "step": 25315 + }, + { + "epoch": 416.0, + "eval_loss": 11.113880157470703, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.678, + "eval_steps_per_second": 10.45, + "step": 25376 + }, + { + "epoch": 417.0, + "eval_loss": 11.018509864807129, + "eval_runtime": 1.5361, + "eval_samples_per_second": 79.42, + "eval_steps_per_second": 10.416, + "step": 25437 + }, + { + "epoch": 418.0, + "eval_loss": 11.018741607666016, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.664, + "eval_steps_per_second": 10.448, + "step": 25498 + }, + { + "epoch": 418.03, + "learning_rate": 8.196721311475408e-09, + "loss": 10.1491, + "step": 25500 + }, + { + "epoch": 419.0, + "eval_loss": 11.089320182800293, + "eval_runtime": 1.5384, + "eval_samples_per_second": 79.303, + "eval_steps_per_second": 10.4, + "step": 25559 + }, + { + "epoch": 420.0, + "eval_loss": 11.034805297851562, + "eval_runtime": 1.5427, + "eval_samples_per_second": 79.083, + "eval_steps_per_second": 10.372, + "step": 25620 + }, + { + "epoch": 421.0, + "eval_loss": 10.993184089660645, + "eval_runtime": 1.538, + "eval_samples_per_second": 79.323, + "eval_steps_per_second": 10.403, + "step": 25681 + }, + { + "epoch": 422.0, + "eval_loss": 11.076547622680664, + "eval_runtime": 1.5334, + "eval_samples_per_second": 79.562, + "eval_steps_per_second": 10.434, + "step": 25742 + }, + { + "epoch": 423.0, + "eval_loss": 11.048792839050293, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.638, + "eval_steps_per_second": 10.444, + "step": 25803 + }, + { + "epoch": 424.0, + "eval_loss": 11.024137496948242, + "eval_runtime": 1.5337, + "eval_samples_per_second": 79.548, + "eval_steps_per_second": 10.433, + "step": 25864 + }, + { + "epoch": 425.0, + "eval_loss": 11.069534301757812, + "eval_runtime": 1.532, + "eval_samples_per_second": 79.632, + "eval_steps_per_second": 10.444, + "step": 25925 + }, + { + "epoch": 426.0, + "eval_loss": 10.898648262023926, + "eval_runtime": 1.5315, + "eval_samples_per_second": 79.662, + "eval_steps_per_second": 10.448, + "step": 25986 + }, + { + "epoch": 426.23, + "learning_rate": 7.377049180327869e-09, + "loss": 10.1184, + "step": 26000 + }, + { + "epoch": 427.0, + "eval_loss": 10.843330383300781, + "eval_runtime": 1.5388, + "eval_samples_per_second": 79.284, + "eval_steps_per_second": 10.398, + "step": 26047 + }, + { + "epoch": 428.0, + "eval_loss": 10.847617149353027, + "eval_runtime": 1.5395, + "eval_samples_per_second": 79.244, + "eval_steps_per_second": 10.393, + "step": 26108 + }, + { + "epoch": 429.0, + "eval_loss": 10.974745750427246, + "eval_runtime": 1.5392, + "eval_samples_per_second": 79.259, + "eval_steps_per_second": 10.395, + "step": 26169 + }, + { + "epoch": 430.0, + "eval_loss": 10.925885200500488, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.672, + "eval_steps_per_second": 10.449, + "step": 26230 + }, + { + "epoch": 431.0, + "eval_loss": 10.864691734313965, + "eval_runtime": 1.5326, + "eval_samples_per_second": 79.602, + "eval_steps_per_second": 10.44, + "step": 26291 + }, + { + "epoch": 432.0, + "eval_loss": 11.028008460998535, + "eval_runtime": 1.5318, + "eval_samples_per_second": 79.645, + "eval_steps_per_second": 10.445, + "step": 26352 + }, + { + "epoch": 433.0, + "eval_loss": 10.958173751831055, + "eval_runtime": 1.5349, + "eval_samples_per_second": 79.482, + "eval_steps_per_second": 10.424, + "step": 26413 + }, + { + "epoch": 434.0, + "eval_loss": 10.98095989227295, + "eval_runtime": 1.5339, + "eval_samples_per_second": 79.536, + "eval_steps_per_second": 10.431, + "step": 26474 + }, + { + "epoch": 434.43, + "learning_rate": 6.5573770491803275e-09, + "loss": 10.1396, + "step": 26500 + }, + { + "epoch": 435.0, + "eval_loss": 11.049097061157227, + "eval_runtime": 1.5478, + "eval_samples_per_second": 78.819, + "eval_steps_per_second": 10.337, + "step": 26535 + }, + { + "epoch": 436.0, + "eval_loss": 11.069966316223145, + "eval_runtime": 1.5391, + "eval_samples_per_second": 79.265, + "eval_steps_per_second": 10.395, + "step": 26596 + }, + { + "epoch": 437.0, + "eval_loss": 10.987845420837402, + "eval_runtime": 1.5481, + "eval_samples_per_second": 78.804, + "eval_steps_per_second": 10.335, + "step": 26657 + }, + { + "epoch": 438.0, + "eval_loss": 10.939976692199707, + "eval_runtime": 1.5375, + "eval_samples_per_second": 79.351, + "eval_steps_per_second": 10.407, + "step": 26718 + }, + { + "epoch": 439.0, + "eval_loss": 10.868154525756836, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.664, + "eval_steps_per_second": 10.448, + "step": 26779 + }, + { + "epoch": 440.0, + "eval_loss": 10.966716766357422, + "eval_runtime": 1.539, + "eval_samples_per_second": 79.27, + "eval_steps_per_second": 10.396, + "step": 26840 + }, + { + "epoch": 441.0, + "eval_loss": 11.011680603027344, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.674, + "eval_steps_per_second": 10.449, + "step": 26901 + }, + { + "epoch": 442.0, + "eval_loss": 11.037373542785645, + "eval_runtime": 1.5311, + "eval_samples_per_second": 79.68, + "eval_steps_per_second": 10.45, + "step": 26962 + }, + { + "epoch": 442.62, + "learning_rate": 5.737704918032786e-09, + "loss": 10.1337, + "step": 27000 + }, + { + "epoch": 443.0, + "eval_loss": 11.133712768554688, + "eval_runtime": 1.5458, + "eval_samples_per_second": 78.922, + "eval_steps_per_second": 10.35, + "step": 27023 + }, + { + "epoch": 444.0, + "eval_loss": 10.941494941711426, + "eval_runtime": 1.5386, + "eval_samples_per_second": 79.294, + "eval_steps_per_second": 10.399, + "step": 27084 + }, + { + "epoch": 445.0, + "eval_loss": 11.017427444458008, + "eval_runtime": 1.5391, + "eval_samples_per_second": 79.266, + "eval_steps_per_second": 10.396, + "step": 27145 + }, + { + "epoch": 446.0, + "eval_loss": 11.02389144897461, + "eval_runtime": 1.5319, + "eval_samples_per_second": 79.641, + "eval_steps_per_second": 10.445, + "step": 27206 + }, + { + "epoch": 447.0, + "eval_loss": 10.897884368896484, + "eval_runtime": 1.534, + "eval_samples_per_second": 79.53, + "eval_steps_per_second": 10.43, + "step": 27267 + }, + { + "epoch": 448.0, + "eval_loss": 10.921698570251465, + "eval_runtime": 1.5334, + "eval_samples_per_second": 79.564, + "eval_steps_per_second": 10.435, + "step": 27328 + }, + { + "epoch": 449.0, + "eval_loss": 10.892621994018555, + "eval_runtime": 1.5335, + "eval_samples_per_second": 79.557, + "eval_steps_per_second": 10.434, + "step": 27389 + }, + { + "epoch": 450.0, + "eval_loss": 11.121941566467285, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.668, + "eval_steps_per_second": 10.448, + "step": 27450 + }, + { + "epoch": 450.82, + "learning_rate": 4.918032786885245e-09, + "loss": 10.1168, + "step": 27500 + }, + { + "epoch": 451.0, + "eval_loss": 10.893115043640137, + "eval_runtime": 1.5424, + "eval_samples_per_second": 79.097, + "eval_steps_per_second": 10.373, + "step": 27511 + }, + { + "epoch": 452.0, + "eval_loss": 11.011245727539062, + "eval_runtime": 1.5529, + "eval_samples_per_second": 78.564, + "eval_steps_per_second": 10.303, + "step": 27572 + }, + { + "epoch": 453.0, + "eval_loss": 10.982295036315918, + "eval_runtime": 1.5411, + "eval_samples_per_second": 79.163, + "eval_steps_per_second": 10.382, + "step": 27633 + }, + { + "epoch": 454.0, + "eval_loss": 11.109077453613281, + "eval_runtime": 1.5438, + "eval_samples_per_second": 79.028, + "eval_steps_per_second": 10.364, + "step": 27694 + }, + { + "epoch": 455.0, + "eval_loss": 10.869407653808594, + "eval_runtime": 1.5332, + "eval_samples_per_second": 79.575, + "eval_steps_per_second": 10.436, + "step": 27755 + }, + { + "epoch": 456.0, + "eval_loss": 10.962514877319336, + "eval_runtime": 1.5316, + "eval_samples_per_second": 79.656, + "eval_steps_per_second": 10.447, + "step": 27816 + }, + { + "epoch": 457.0, + "eval_loss": 10.855345726013184, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.386, + "eval_steps_per_second": 10.411, + "step": 27877 + }, + { + "epoch": 458.0, + "eval_loss": 10.98884391784668, + "eval_runtime": 1.533, + "eval_samples_per_second": 79.582, + "eval_steps_per_second": 10.437, + "step": 27938 + }, + { + "epoch": 459.0, + "eval_loss": 10.929617881774902, + "eval_runtime": 1.5316, + "eval_samples_per_second": 79.658, + "eval_steps_per_second": 10.447, + "step": 27999 + }, + { + "epoch": 459.02, + "learning_rate": 4.098360655737704e-09, + "loss": 10.1229, + "step": 28000 + }, + { + "epoch": 460.0, + "eval_loss": 10.88948917388916, + "eval_runtime": 1.5368, + "eval_samples_per_second": 79.387, + "eval_steps_per_second": 10.411, + "step": 28060 + }, + { + "epoch": 461.0, + "eval_loss": 10.980300903320312, + "eval_runtime": 1.5401, + "eval_samples_per_second": 79.214, + "eval_steps_per_second": 10.389, + "step": 28121 + }, + { + "epoch": 462.0, + "eval_loss": 11.014444351196289, + "eval_runtime": 1.5678, + "eval_samples_per_second": 77.815, + "eval_steps_per_second": 10.205, + "step": 28182 + }, + { + "epoch": 463.0, + "eval_loss": 11.025659561157227, + "eval_runtime": 1.5329, + "eval_samples_per_second": 79.586, + "eval_steps_per_second": 10.438, + "step": 28243 + }, + { + "epoch": 464.0, + "eval_loss": 10.914053916931152, + "eval_runtime": 1.5336, + "eval_samples_per_second": 79.55, + "eval_steps_per_second": 10.433, + "step": 28304 + }, + { + "epoch": 465.0, + "eval_loss": 11.134551048278809, + "eval_runtime": 1.5313, + "eval_samples_per_second": 79.669, + "eval_steps_per_second": 10.448, + "step": 28365 + }, + { + "epoch": 466.0, + "eval_loss": 11.025362968444824, + "eval_runtime": 1.533, + "eval_samples_per_second": 79.58, + "eval_steps_per_second": 10.437, + "step": 28426 + }, + { + "epoch": 467.0, + "eval_loss": 11.038411140441895, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.664, + "eval_steps_per_second": 10.448, + "step": 28487 + }, + { + "epoch": 467.21, + "learning_rate": 3.2786885245901638e-09, + "loss": 10.1179, + "step": 28500 + }, + { + "epoch": 468.0, + "eval_loss": 10.849050521850586, + "eval_runtime": 1.5609, + "eval_samples_per_second": 78.159, + "eval_steps_per_second": 10.25, + "step": 28548 + }, + { + "epoch": 469.0, + "eval_loss": 11.046910285949707, + "eval_runtime": 1.5417, + "eval_samples_per_second": 79.132, + "eval_steps_per_second": 10.378, + "step": 28609 + }, + { + "epoch": 470.0, + "eval_loss": 10.967819213867188, + "eval_runtime": 1.539, + "eval_samples_per_second": 79.271, + "eval_steps_per_second": 10.396, + "step": 28670 + }, + { + "epoch": 471.0, + "eval_loss": 10.890240669250488, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.687, + "eval_steps_per_second": 10.451, + "step": 28731 + }, + { + "epoch": 472.0, + "eval_loss": 10.964876174926758, + "eval_runtime": 1.5349, + "eval_samples_per_second": 79.483, + "eval_steps_per_second": 10.424, + "step": 28792 + }, + { + "epoch": 473.0, + "eval_loss": 10.925169944763184, + "eval_runtime": 1.5315, + "eval_samples_per_second": 79.659, + "eval_steps_per_second": 10.447, + "step": 28853 + }, + { + "epoch": 474.0, + "eval_loss": 11.113059997558594, + "eval_runtime": 1.5314, + "eval_samples_per_second": 79.666, + "eval_steps_per_second": 10.448, + "step": 28914 + }, + { + "epoch": 475.0, + "eval_loss": 11.026731491088867, + "eval_runtime": 1.5312, + "eval_samples_per_second": 79.677, + "eval_steps_per_second": 10.449, + "step": 28975 + }, + { + "epoch": 475.41, + "learning_rate": 2.4590163934426226e-09, + "loss": 10.1189, + "step": 29000 + }, + { + "epoch": 476.0, + "eval_loss": 10.842777252197266, + "eval_runtime": 1.5457, + "eval_samples_per_second": 78.928, + "eval_steps_per_second": 10.351, + "step": 29036 + }, + { + "epoch": 477.0, + "eval_loss": 11.031394958496094, + "eval_runtime": 1.5433, + "eval_samples_per_second": 79.053, + "eval_steps_per_second": 10.368, + "step": 29097 + }, + { + "epoch": 478.0, + "eval_loss": 11.093620300292969, + "eval_runtime": 1.5431, + "eval_samples_per_second": 79.061, + "eval_steps_per_second": 10.369, + "step": 29158 + }, + { + "epoch": 479.0, + "eval_loss": 10.99679946899414, + "eval_runtime": 1.5405, + "eval_samples_per_second": 79.196, + "eval_steps_per_second": 10.386, + "step": 29219 + }, + { + "epoch": 480.0, + "eval_loss": 10.872136116027832, + "eval_runtime": 1.5317, + "eval_samples_per_second": 79.648, + "eval_steps_per_second": 10.446, + "step": 29280 + }, + { + "epoch": 481.0, + "eval_loss": 11.01526165008545, + "eval_runtime": 1.5338, + "eval_samples_per_second": 79.539, + "eval_steps_per_second": 10.431, + "step": 29341 + }, + { + "epoch": 482.0, + "eval_loss": 11.176105499267578, + "eval_runtime": 1.531, + "eval_samples_per_second": 79.688, + "eval_steps_per_second": 10.451, + "step": 29402 + }, + { + "epoch": 483.0, + "eval_loss": 10.983979225158691, + "eval_runtime": 1.5364, + "eval_samples_per_second": 79.408, + "eval_steps_per_second": 10.414, + "step": 29463 + }, + { + "epoch": 483.61, + "learning_rate": 1.6393442622950819e-09, + "loss": 10.1153, + "step": 29500 + }, + { + "epoch": 484.0, + "eval_loss": 10.964775085449219, + "eval_runtime": 1.5399, + "eval_samples_per_second": 79.225, + "eval_steps_per_second": 10.39, + "step": 29524 + }, + { + "epoch": 485.0, + "eval_loss": 11.114033699035645, + "eval_runtime": 1.5364, + "eval_samples_per_second": 79.409, + "eval_steps_per_second": 10.414, + "step": 29585 + }, + { + "epoch": 486.0, + "eval_loss": 11.021162986755371, + "eval_runtime": 1.5483, + "eval_samples_per_second": 78.798, + "eval_steps_per_second": 10.334, + "step": 29646 + }, + { + "epoch": 487.0, + "eval_loss": 10.919731140136719, + "eval_runtime": 1.5334, + "eval_samples_per_second": 79.563, + "eval_steps_per_second": 10.434, + "step": 29707 + }, + { + "epoch": 488.0, + "eval_loss": 10.979778289794922, + "eval_runtime": 1.5341, + "eval_samples_per_second": 79.523, + "eval_steps_per_second": 10.429, + "step": 29768 + }, + { + "epoch": 489.0, + "eval_loss": 10.904738426208496, + "eval_runtime": 1.5327, + "eval_samples_per_second": 79.596, + "eval_steps_per_second": 10.439, + "step": 29829 + }, + { + "epoch": 490.0, + "eval_loss": 11.014572143554688, + "eval_runtime": 1.5384, + "eval_samples_per_second": 79.303, + "eval_steps_per_second": 10.4, + "step": 29890 + }, + { + "epoch": 491.0, + "eval_loss": 11.057549476623535, + "eval_runtime": 1.5352, + "eval_samples_per_second": 79.469, + "eval_steps_per_second": 10.422, + "step": 29951 + }, + { + "epoch": 491.8, + "learning_rate": 8.196721311475409e-10, + "loss": 10.1141, + "step": 30000 + }, + { + "epoch": 492.0, + "eval_loss": 11.040027618408203, + "eval_runtime": 1.5409, + "eval_samples_per_second": 79.175, + "eval_steps_per_second": 10.384, + "step": 30012 + }, + { + "epoch": 493.0, + "eval_loss": 11.089771270751953, + "eval_runtime": 1.5412, + "eval_samples_per_second": 79.159, + "eval_steps_per_second": 10.382, + "step": 30073 + }, + { + "epoch": 494.0, + "eval_loss": 10.99104118347168, + "eval_runtime": 1.5377, + "eval_samples_per_second": 79.339, + "eval_steps_per_second": 10.405, + "step": 30134 + }, + { + "epoch": 495.0, + "eval_loss": 11.057866096496582, + "eval_runtime": 1.5447, + "eval_samples_per_second": 78.98, + "eval_steps_per_second": 10.358, + "step": 30195 + }, + { + "epoch": 496.0, + "eval_loss": 10.857988357543945, + "eval_runtime": 1.533, + "eval_samples_per_second": 79.581, + "eval_steps_per_second": 10.437, + "step": 30256 + }, + { + "epoch": 497.0, + "eval_loss": 10.944987297058105, + "eval_runtime": 1.5401, + "eval_samples_per_second": 79.216, + "eval_steps_per_second": 10.389, + "step": 30317 + }, + { + "epoch": 498.0, + "eval_loss": 11.052291870117188, + "eval_runtime": 1.5333, + "eval_samples_per_second": 79.568, + "eval_steps_per_second": 10.435, + "step": 30378 + }, + { + "epoch": 499.0, + "eval_loss": 11.122845649719238, + "eval_runtime": 1.5327, + "eval_samples_per_second": 79.598, + "eval_steps_per_second": 10.439, + "step": 30439 + }, + { + "epoch": 500.0, + "learning_rate": 0.0, + "loss": 10.1176, + "step": 30500 + }, + { + "epoch": 500.0, + "eval_loss": 11.049210548400879, + "eval_runtime": 1.5625, + "eval_samples_per_second": 78.08, + "eval_steps_per_second": 10.24, + "step": 30500 + }, + { + "epoch": 500.0, + "step": 30500, + "total_flos": 1.6136552440728576e+16, + "train_loss": 11.67972986039959, + "train_runtime": 24510.5451, + "train_samples_per_second": 9.975, + "train_steps_per_second": 1.244 } ], - "max_steps": 610, - "num_train_epochs": 10, - "total_flos": 322666370343936.0, + "max_steps": 30500, + "num_train_epochs": 500, + "total_flos": 1.6136552440728576e+16, "trial_name": null, "trial_params": null }